diff --git a/Experiments/clip_expt.ipynb b/Experiments/clip_expt.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..25c9252a64d66306b25eb8adfd2227a7761e8416 --- /dev/null +++ b/Experiments/clip_expt.ipynb @@ -0,0 +1,840 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "9fe51ce7-4c87-4186-9fd3-0fb18ac43e56", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + "from transformers import AutoProcessor, CLIPVisionModel" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0f4c21dd-4258-461d-8511-5be089d068a8", + "metadata": {}, + "outputs": [], + "source": [ + "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")\n", + "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "98b9f906-ffaa-4be4-8671-4ecf65f12c49", + "metadata": {}, + "outputs": [], + "source": [ + "# url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "# image = Image.open(requests.get(url, stream=True).raw)\n", + "image = Image.open(\"002579.jpg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "54b2e4ce-b77b-4314-87f6-ca2a1970fc79", + "metadata": {}, + "outputs": [], + "source": [ + "# image" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "cdd65c58-007f-450b-8deb-f8b4f372a823", + "metadata": {}, + "outputs": [], + "source": [ + "# image = None" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e9066c2e-c78b-49d1-979b-10d0f4f09441", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = processor(images=image, return_tensors=\"pt\", device_map=\"cuda:0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e98b211d-29d9-4662-be0b-e011e89b0101", + "metadata": {}, + "outputs": [], + "source": [ + "# inputs" + ] + }, + { + 
"cell_type": "code", + "execution_count": 6, + "id": "b030bd3d-4282-4074-98fe-97e658bd0f50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 3, 224, 224])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs[\"pixel_values\"].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0ce68f11-1c88-4dd7-8b17-0d1de5811fe6", + "metadata": {}, + "outputs": [], + "source": [ + "outputs = model(inputs[\"pixel_values\"].to(\"cuda:0\"))\n", + "last_hidden_state = outputs.last_hidden_state\n", + "pooled_output = outputs.pooler_output # pooled CLS states" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "30cb0918-a30e-4246-b540-6b8e0d876807", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 768])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pooled_output.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6399543a-f23f-426d-8289-3bb52d293ece", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 50, 768])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "last_hidden_state.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "19a70443-5942-4937-b3ea-6a52d76e2b08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 768])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outputs[1].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fa13903f-a94a-4839-ae5a-8df4f55c68b6", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "from transformers import CLIPVisionConfig,CLIPPreTrainedModel" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "b2bd9198-42f0-40c3-80e1-d167c0b038fb", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'Optional' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mCLIPVisionModelWithProjection\u001b[39;00m(CLIPPreTrainedModel):\n\u001b[1;32m 2\u001b[0m config_class \u001b[38;5;241m=\u001b[39m CLIPVisionConfig\n\u001b[1;32m 3\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpixel_values\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "Cell \u001b[0;32mIn[9], line 20\u001b[0m, in \u001b[0;36mCLIPVisionModelWithProjection\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_input_embeddings\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m nn\u001b[38;5;241m.\u001b[39mModule:\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model\u001b[38;5;241m.\u001b[39membeddings\u001b[38;5;241m.\u001b[39mpatch_embedding\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m---> 20\u001b[0m pixel_values: \u001b[43mOptional\u001b[49m[torch\u001b[38;5;241m.\u001b[39mFloatTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 21\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 22\u001b[0m output_hidden_states: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 23\u001b[0m return_dict: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 24\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[Tuple, CLIPVisionModelOutput]:\n\u001b[1;32m 25\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 27\u001b[0m vision_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model(\n\u001b[1;32m 28\u001b[0m pixel_values\u001b[38;5;241m=\u001b[39mpixel_values,\n\u001b[1;32m 29\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 30\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 31\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 32\u001b[0m )\n", + "\u001b[0;31mNameError\u001b[0m: name 'Optional' is not defined" + ] + } + ], + "source": [ + "class CLIPVisionModelWithProjection(CLIPPreTrainedModel):\n", + " config_class = CLIPVisionConfig\n", + " main_input_name = \"pixel_values\"\n", + "\n", + " def __init__(self, config: CLIPVisionConfig):\n", + " super().__init__(config)\n", + "\n", + " self.vision_model = CLIPVisionTransformer(config)\n", + "\n", + " self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)\n", + "\n", + " # Initialize weights and apply final processing\n", + " self.post_init()\n", + "\n", + " def get_input_embeddings(self) -> nn.Module:\n", + " return self.vision_model.embeddings.patch_embedding\n", + "\n", + " def forward(\n", + " self,\n", + " pixel_values: Optional[torch.FloatTensor] = 
None,\n", + " output_attentions: Optional[bool] = None,\n", + " output_hidden_states: Optional[bool] = None,\n", + " return_dict: Optional[bool] = None,\n", + " ) -> Union[Tuple, CLIPVisionModelOutput]:\n", + " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", + "\n", + " vision_outputs = self.vision_model(\n", + " pixel_values=pixel_values,\n", + " output_attentions=output_attentions,\n", + " output_hidden_states=output_hidden_states,\n", + " return_dict=return_dict,\n", + " )\n", + "\n", + " pooled_output = vision_outputs[1] # pooled_output\n", + "\n", + " image_embeds = self.visual_projection(pooled_output)\n", + "\n", + " if not return_dict:\n", + " outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n", + " return tuple(output for output in outputs if output is not None)\n", + "\n", + " return CLIPVisionModelOutput(\n", + " image_embeds=image_embeds,\n", + " last_hidden_state=vision_outputs.last_hidden_state,\n", + " hidden_states=vision_outputs.hidden_states,\n", + " attentions=vision_outputs.attentions,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "68a9ee4a-d977-4725-842d-e64e0dd2f61d", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", + "`vision_config` is `None`. 
initializing the `CLIPVisionConfig` with default values.\n", + "Model config CLIPConfig {\n", + " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", + " \"architectures\": [\n", + " \"CLIPModel\"\n", + " ],\n", + " \"initializer_factor\": 1.0,\n", + " \"logit_scale_init_value\": 2.6592,\n", + " \"model_type\": \"clip\",\n", + " \"projection_dim\": 512,\n", + " \"text_config\": {\n", + " \"bos_token_id\": 0,\n", + " \"dropout\": 0.0,\n", + " \"eos_token_id\": 2,\n", + " \"model_type\": \"clip_text_model\"\n", + " },\n", + " \"transformers_version\": \"4.36.2\",\n", + " \"vision_config\": {\n", + " \"dropout\": 0.0,\n", + " \"model_type\": \"clip_vision_model\"\n", + " }\n", + "}\n", + "\n", + "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing CLIPModel.\n", + "\n", + "All the weights of CLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPModel for predictions without further training.\n", + "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", + "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", + "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "`text_config` is `None`. 
Initializing the `CLIPTextConfig` with default values.\n", + "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", + "Model config CLIPConfig {\n", + " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", + " \"architectures\": [\n", + " \"CLIPModel\"\n", + " ],\n", + " \"initializer_factor\": 1.0,\n", + " \"logit_scale_init_value\": 2.6592,\n", + " \"model_type\": \"clip\",\n", + " \"projection_dim\": 512,\n", + " \"text_config\": {\n", + " \"bos_token_id\": 0,\n", + " \"dropout\": 0.0,\n", + " \"eos_token_id\": 2,\n", + " \"model_type\": \"clip_text_model\"\n", + " },\n", + " \"transformers_version\": \"4.36.2\",\n", + " \"vision_config\": {\n", + " \"dropout\": 0.0,\n", + " \"model_type\": \"clip_vision_model\"\n", + " }\n", + "}\n", + "\n", + "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", + "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n", + "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. 
Converted to {'height': 224, 'width': 224}.\n", + "Image processor CLIPImageProcessor {\n", + " \"crop_size\": {\n", + " \"height\": 224,\n", + " \"width\": 224\n", + " },\n", + " \"do_center_crop\": true,\n", + " \"do_convert_rgb\": true,\n", + " \"do_normalize\": true,\n", + " \"do_rescale\": true,\n", + " \"do_resize\": true,\n", + " \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n", + " \"image_mean\": [\n", + " 0.48145466,\n", + " 0.4578275,\n", + " 0.40821073\n", + " ],\n", + " \"image_processor_type\": \"CLIPImageProcessor\",\n", + " \"image_std\": [\n", + " 0.26862954,\n", + " 0.26130258,\n", + " 0.27577711\n", + " ],\n", + " \"resample\": 3,\n", + " \"rescale_factor\": 0.00392156862745098,\n", + " \"size\": {\n", + " \"shortest_edge\": 224\n", + " }\n", + "}\n", + "\n", + "loading file vocab.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n", + "loading file merges.txt from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n", + "loading file tokenizer.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n", + "loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n", + "loading file tokenizer_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n", + "loading configuration file config.json from cache at 
/home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", + "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", + "Model config CLIPConfig {\n", + " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", + " \"architectures\": [\n", + " \"CLIPModel\"\n", + " ],\n", + " \"initializer_factor\": 1.0,\n", + " \"logit_scale_init_value\": 2.6592,\n", + " \"model_type\": \"clip\",\n", + " \"projection_dim\": 512,\n", + " \"text_config\": {\n", + " \"bos_token_id\": 0,\n", + " \"dropout\": 0.0,\n", + " \"eos_token_id\": 2,\n", + " \"model_type\": \"clip_text_model\"\n", + " },\n", + " \"transformers_version\": \"4.36.2\",\n", + " \"vision_config\": {\n", + " \"dropout\": 0.0,\n", + " \"model_type\": \"clip_vision_model\"\n", + " }\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "from PIL import Image\n", + "import requests\n", + "from transformers import AutoProcessor, CLIPModel\n", + "\n", + "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", + "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n", + "\n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "\n", + "inputs = processor(images=image, return_tensors=\"pt\")\n", + "\n", + "image_features = model.get_image_features(**inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "9ff63766-b706-452b-b735-bf9000fb9c20", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 512])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "image_features.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "82566e7b-3c91-421a-94c5-f1e2b3e91c8c", + 
"metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "Model config CLIPVisionConfig {\n", + " \"attention_dropout\": 0.0,\n", + " \"dropout\": 0.0,\n", + " \"hidden_act\": \"quick_gelu\",\n", + " \"hidden_size\": 768,\n", + " \"image_size\": 224,\n", + " \"initializer_factor\": 1.0,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"model_type\": \"clip_vision_model\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_channels\": 3,\n", + " \"num_hidden_layers\": 12,\n", + " \"patch_size\": 32,\n", + " \"projection_dim\": 512,\n", + " \"transformers_version\": \"4.36.2\"\n", + "}\n", + "\n", + "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n", + "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 
'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 
'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 
'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 
'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 
'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n", + "- This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "All the weights of CLIPVisionModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training.\n", + "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", + "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", + "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", + "`vision_config` is `None`. 
initializing the `CLIPVisionConfig` with default values.\n", + "Model config CLIPConfig {\n", + " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", + " \"architectures\": [\n", + " \"CLIPModel\"\n", + " ],\n", + " \"initializer_factor\": 1.0,\n", + " \"logit_scale_init_value\": 2.6592,\n", + " \"model_type\": \"clip\",\n", + " \"projection_dim\": 512,\n", + " \"text_config\": {\n", + " \"bos_token_id\": 0,\n", + " \"dropout\": 0.0,\n", + " \"eos_token_id\": 2,\n", + " \"model_type\": \"clip_text_model\"\n", + " },\n", + " \"transformers_version\": \"4.36.2\",\n", + " \"vision_config\": {\n", + " \"dropout\": 0.0,\n", + " \"model_type\": \"clip_vision_model\"\n", + " }\n", + "}\n", + "\n", + "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n", + "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n", + "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. 
Converted to {'height': 224, 'width': 224}.\n", + "Image processor CLIPImageProcessor {\n", + " \"crop_size\": {\n", + " \"height\": 224,\n", + " \"width\": 224\n", + " },\n", + " \"do_center_crop\": true,\n", + " \"do_convert_rgb\": true,\n", + " \"do_normalize\": true,\n", + " \"do_rescale\": true,\n", + " \"do_resize\": true,\n", + " \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n", + " \"image_mean\": [\n", + " 0.48145466,\n", + " 0.4578275,\n", + " 0.40821073\n", + " ],\n", + " \"image_processor_type\": \"CLIPImageProcessor\",\n", + " \"image_std\": [\n", + " 0.26862954,\n", + " 0.26130258,\n", + " 0.27577711\n", + " ],\n", + " \"resample\": 3,\n", + " \"rescale_factor\": 0.00392156862745098,\n", + " \"size\": {\n", + " \"shortest_edge\": 224\n", + " }\n", + "}\n", + "\n", + "loading file vocab.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n", + "loading file merges.txt from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n", + "loading file tokenizer.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n", + "loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n", + "loading file tokenizer_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n", + "loading configuration file config.json from cache at 
/home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n", + "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n", + "Model config CLIPConfig {\n", + " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n", + " \"architectures\": [\n", + " \"CLIPModel\"\n", + " ],\n", + " \"initializer_factor\": 1.0,\n", + " \"logit_scale_init_value\": 2.6592,\n", + " \"model_type\": \"clip\",\n", + " \"projection_dim\": 512,\n", + " \"text_config\": {\n", + " \"bos_token_id\": 0,\n", + " \"dropout\": 0.0,\n", + " \"eos_token_id\": 2,\n", + " \"model_type\": \"clip_text_model\"\n", + " },\n", + " \"transformers_version\": \"4.36.2\",\n", + " \"vision_config\": {\n", + " \"dropout\": 0.0,\n", + " \"model_type\": \"clip_vision_model\"\n", + " }\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "from PIL import Image\n", + "import requests\n", + "from transformers import AutoProcessor, CLIPVisionModel\n", + "\n", + "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", + "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n", + "\n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "\n", + "inputs = processor(images=image, return_tensors=\"pt\")\n", + "\n", + "outputs = model(**inputs)\n", + "last_hidden_state = outputs.last_hidden_state\n", + "pooled_output = outputs.pooler_output # pooled CLS states" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "bcf0a7b3-6cbb-492e-bc2c-42e3edbe6a0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 768])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pooled_output.shape" + ] + }, + { + 
"cell_type": "code", + "execution_count": 10, + "id": "67240294-c7a0-4e94-a8c1-86bfe1b21977", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPPreTrainedModel\n", + "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n", + "from typing import Optional, Union, Tuple" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "cc9b20db-7f84-44c3-9c78-e84164ccc192", + "metadata": {}, + "outputs": [], + "source": [ + "class VisionLanguageConnector(nn.Module):\n", + " def __init__(self, hidden_size, projection_dim):\n", + " super().__init__()\n", + " self.mlp = nn.Sequential(\n", + " nn.Linear(hidden_size, hidden_size, bias=False),\n", + " nn.GELU(),\n", + " nn.Linear(hidden_size, projection_dim, bias=False)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.mlp(x)\n", + " \n", + "class ClipWithProjection(CLIPPreTrainedModel):\n", + " config_class = CLIPVisionConfig\n", + " main_input_name = \"pixel_values\"\n", + "\n", + " def __init__(self, config: CLIPVisionConfig):\n", + " super().__init__(config)\n", + "\n", + " self.vision_model = CLIPVisionTransformer(config)\n", + " self.vision_model.\n", + " self.vision_language_connector = VisionLanguageConnector(config.hidden_size, config.projection_dim)\n", + "\n", + " # Initialize weights and apply final processing\n", + " self.post_init()\n", + "\n", + " def forward(\n", + " self,\n", + " pixel_values: Optional[torch.FloatTensor] = None,\n", + " output_attentions: Optional[bool] = None,\n", + " output_hidden_states: Optional[bool] = None,\n", + " return_dict: Optional[bool] = None,\n", + " ) -> Union[Tuple, CLIPVisionModelOutput]:\n", + " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", + "\n", + " vision_outputs = self.vision_model(\n", + " pixel_values=pixel_values,\n", + " output_attentions=output_attentions,\n", + " output_hidden_states=output_hidden_states,\n", + " 
return_dict=return_dict,\n", + " )\n", + "\n", + " pooled_output = vision_outputs[1] # pooled_output\n", + "\n", + " image_embeds = self.vision_language_connector(pooled_output)\n", + "\n", + " if not return_dict:\n", + " outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n", + " return tuple(output for output in outputs if output is not None)\n", + "\n", + " return CLIPVisionModelOutput(\n", + " image_embeds=image_embeds,\n", + " last_hidden_state=vision_outputs.last_hidden_state,\n", + " hidden_states=vision_outputs.hidden_states,\n", + " attentions=vision_outputs.attentions,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "a4892ab8-39d2-41c9-ad2a-04711c22b95f", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n", + "Model config CLIPVisionConfig {\n", + " \"attention_dropout\": 0.0,\n", + " \"dropout\": 0.0,\n", + " \"hidden_act\": \"quick_gelu\",\n", + " \"hidden_size\": 768,\n", + " \"image_size\": 224,\n", + " \"initializer_factor\": 1.0,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"model_type\": \"clip_vision_model\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_channels\": 3,\n", + " \"num_hidden_layers\": 12,\n", + " \"patch_size\": 32,\n", + " \"projection_dim\": 512,\n", + " \"transformers_version\": \"4.36.2\"\n", + "}\n", + "\n", + "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n", + "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when 
initializing ClipWithProjection: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 
'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 
'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 
'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 
'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 
'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n", + "- This IS expected if you are initializing ClipWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing ClipWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of ClipWithProjection were not initialized from the model checkpoint at openai/clip-vit-base-patch32 and are newly initialized: ['vision_language_connector.mlp.2.weight', 'vision_language_connector.mlp.0.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "model = ClipWithProjection.from_pretrained(\"openai/clip-vit-base-patch32\")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "588ef914-5be9-49e1-b68d-b899e0e74edd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "768" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.config.hidden_size" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "05d95b9e-9831-4415-860e-94793e29d210", + "metadata": {}, + "outputs": [], + "source": [ + "outputs = model(**inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "185b1bff-6ffe-4cce-9255-ee7629feba54", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 512])" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outputs[0].shape" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "id": "04414a35-c7b3-4986-a79e-1d363916caa4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "485dbbcb-06df-4926-b257-dfd1a4081d44", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'outputs' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43moutputs\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n", + "\u001b[0;31mNameError\u001b[0m: name 'outputs' is not defined" + ] + } + ], + "source": [ + "outputs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f983313c-8e0f-4805-af14-25bb69afd04c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Experiments/eval.ipynb b/Experiments/eval.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..cea293198184da308ccc5806f7189ed460f13d98 --- /dev/null +++ b/Experiments/eval.ipynb @@ -0,0 +1,782 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "215cfd2f-62b0-4a86-a407-777a1d32597f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-24 15:18:49,948] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + } + ], + "source": [ + "from PIL 
import Image\n", + "import requests\n", + "\n", + "import torch\n", + "from torch import nn\n", + "from transformers import AutoProcessor, CLIPVisionModel, CLIPVisionConfig, CLIPPreTrainedModel\n", + "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n", + "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2244e8f3-fcc7-4309-9d4d-fea557f89f79", + "metadata": {}, + "outputs": [], + "source": [ + "from llava_phi import LlavaPhiForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "587883e1-3419-4b14-b16b-38fabbc8bfaa", + "metadata": {}, + "outputs": [], + "source": [ + "# model = LlavaPhiForCausalLM.from_pretrained(\"./llava-phi/checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0e27a7db-e2ab-4d65-b21d-497222e318ad", + "metadata": {}, + "outputs": [], + "source": [ + "# processor = AutoProcessor.from_pretrained(\"./llava-phi/checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "663efdd8-ea21-4231-a2ae-bcc0fb47b46a", + "metadata": {}, + "outputs": [], + "source": [ + "# prompt = \"\\nUSER: What's the content of the image?\\nASSISTANT:\"\n", + "# url = \"https://www.ilankelman.org/stopsigns/australia.jpg\"\n", + "# image = Image.open(requests.get(url, stream=True).raw)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f622609f-f6a7-4ec1-ac35-c1d33d9436ca", + "metadata": {}, + "outputs": [], + "source": [ + "# # Generate\n", + "# generate_ids = model.generate(**inputs, max_length=30)\n", + "# processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "id": "45f5ba72-2e41-4ccc-84c1-97d542ebee63", + "metadata": {}, + "outputs": [], + "source": [ + "from llava_phi.model.builder import load_pretrained_model\n", + "from llava_phi.mm_utils import tokenizer_image_token, get_model_name_from_path\n", + "from llava_phi.utils import disable_torch_init\n", + "from llava_phi.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN\n", + "from llava_phi.conversation import conv_templates, SeparatorStyle" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b98ac5d3-5503-4430-81d1-19a4f8d6bd75", + "metadata": {}, + "outputs": [], + "source": [ + "model_path = \"checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\"\n", + "model_name = get_model_name_from_path(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "42fd5721-75a7-475b-bd30-5ee23aeaac64", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'llavaPhi-v0-3b-finetune_checkpoint-4000'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_name" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8c2076b5-3bfc-48fd-917b-5dfd06fc532f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The argument `trust_remote_code` is to be used with Auto classes. 
It has no effect here and is ignored.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "load llaVA-Phi MLLM!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "20b86f2c01744081b537620c8780f12e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00\\nWhat's the content of the image? ASSISTANT:\"" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a89cc181-2214-4844-b966-164a41744e54", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://www.ilankelman.org/stopsigns/australia.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].cuda()\n", + "\n", + "input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()\n", + "\n", + "stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0d519851-64d4-4cf5-b2eb-19474f9aa260", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 55])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_ids.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1694ff36-f214-4ed3-b2f3-d3dbd0a1a25b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + } + ], + "source": [ + "from datasets import load_dataset\n", + "audio_ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n", + "audio = audio_ds[0][\"audio\"]\n", + "\n", + "whisper_w_proj = WhisperWithProjection(projection_dim=512)\n", + "audio_embed = whisper_w_proj(audio)[\"input_ids\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9c4a9fae-d6ed-4fc2-ba02-97df64cddd93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(torch.Size([1, 33]), device(type='cpu'))" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "audio_embed.shape, audio_embed.device" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "c3fffe29-98fb-4f4b-ac51-4bdda9e46752", + "metadata": {}, + "outputs": [], + "source": [ + "input_ids = torch.concat([input_ids, audio_embed.to(\"cuda:0\")], dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5dee1ec8-2db2-4f65-99e8-d34bd2735c9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 88])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_ids.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "96033b43-4f57-4f0c-bcf7-37b57ca02e47", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.inference_mode():\n", + " output_ids = model.generate(\n", + " input_ids,\n", + " images=image_tensor,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " max_new_tokens=1024,\n", + " eos_token_id=tokenizer.eos_token_id, # End of sequence token\n", + " pad_token_id=tokenizer.eos_token_id, # Pad token\n", + " use_cache=True,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": 
"741e8da5-0d18-4c11-b559-76054ce4ca3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "is a Japanese character from the story of Jesus, who is a Chinese monk who is also known for his teachings. The story is based on the story of the story of Jesus Christ, and it is a representation of the story of Jesus and the story of Jesus Christ.\n" + ] + } + ], + "source": [ + "input_token_len = input_ids.shape[1]\n", + "n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()\n", + "if n_diff_input_output > 0:\n", + " print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')\n", + "outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]\n", + "outputs = outputs.strip()\n", + "if outputs.endswith(stop_str):\n", + " outputs = outputs[:-len(stop_str)]\n", + "outputs = outputs.strip()\n", + "print(outputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "69d494d4-d768-4645-b4d6-5c455791b50d", + "metadata": {}, + "outputs": [], + "source": [ + "# image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a340856-a13f-4b18-9911-126a4ba37816", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c56fdea-c7a1-4e67-9832-e2ed077d8704", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "89e84d39-8ed8-45db-ae82-27c156ee6dd1", + "metadata": {}, + "outputs": [], + "source": [ + "class AudioLanguageConnector:\n", + " def __init__(self, projection_dim):\n", + " model_name = \"microsoft/phi-2\"\n", + " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n", + " self.phi2_tokenizer.max_length = projection_dim\n", + "\n", + " def __call__(self, text):\n", + " text = f\" {text} 
\"\n", + " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n", + " return tokens\n", + " \n", + "\n", + "class WhisperWithProjection:\n", + " def __init__(self, projection_dim, device):\n", + " self.device = device\n", + " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\", device_map=device)\n", + " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\", device_map=device)\n", + " self.model.config.forced_decoder_ids = None\n", + " self.audio_language_connector = AudioLanguageConnector(projection_dim)\n", + " \n", + " def __call__(self, audio):\n", + " input_features = self.processor(audio[\"array\"],\n", + " sampling_rate=audio[\"sampling_rate\"],\n", + " return_tensors=\"pt\").input_features\n", + " # generate token ids\n", + " predicted_ids = self.model.generate(input_features.to(self.device))\n", + " # decode token ids to text \n", + " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", + "\n", + " audio_embeddings = self.audio_language_connector(transcription)\n", + " return audio_embeddings.to(self.device)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "75e24be0-b236-4047-83ef-5c344e262476", + "metadata": {}, + "outputs": [], + "source": [ + "class MultiModalPhi2:\n", + " def __init__(self, model_path=\"checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\",\n", + " temperature=0.2,\n", + " max_new_tokens=1024,\n", + " device=\"cuda\"):\n", + " self.temperature = temperature\n", + " self.max_new_tokens = max_new_tokens\n", + " self.device = device\n", + " model_name = get_model_name_from_path(model_path)\n", + " self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(model_path, None, model_name, device_map=device)\n", + " self.whisper_w_proj = WhisperWithProjection(projection_dim=512, device=device)\n", + " \n", + " \n", + " def __call__(self, text, audio, image):\n", + " qs 
= DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\\n' + text\n", + " conv = conv_templates[\"default\"].copy()\n", + " conv.append_message(conv.roles[0], qs)\n", + " conv.append_message(conv.roles[1], None)\n", + " prompt = conv.get_prompt()\n", + "\n", + " image_tensor = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'].cuda()\n", + " \n", + " input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()\n", + "\n", + " audio_embed = self.whisper_w_proj(audio)[\"input_ids\"]\n", + " \n", + " stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2\n", + "\n", + " input_ids = torch.concat([input_ids, audio_embed], dim=1)\n", + "\n", + " with torch.inference_mode():\n", + " output_ids = self.model.generate(\n", + " input_ids,\n", + " images=image_tensor,\n", + " do_sample=True,\n", + " temperature=self.temperature,\n", + " max_new_tokens=self.max_new_tokens,\n", + " eos_token_id=tokenizer.eos_token_id, # End of sequence token\n", + " pad_token_id=tokenizer.eos_token_id, # Pad token\n", + " use_cache=True,\n", + " )\n", + "\n", + " input_token_len = input_ids.shape[1]\n", + " n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()\n", + " if n_diff_input_output > 0:\n", + " print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')\n", + " outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]\n", + " outputs = outputs.strip()\n", + " if outputs.endswith(stop_str):\n", + " outputs = outputs[:-len(stop_str)]\n", + " outputs = outputs.strip()\n", + " return outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "4efdbad4-d88a-4477-a3a0-f5591cd0b172", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The argument `trust_remote_code` is to be used with Auto classes. 
It has no effect here and is ignored.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "load llaVA-Phi MLLM!!!\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "492c17cf54f34d4d9e4f288fc9e72e79", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mLlava-Phi-Checkpoint\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/transformers/modeling_utils.py:2376\u001b[0m, in \u001b[0;36mPreTrainedModel.save_pretrained\u001b[0;34m(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)\u001b[0m\n\u001b[1;32m 2372\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m shard_file, shard \u001b[38;5;129;01min\u001b[39;00m shards\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 2373\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m safe_serialization:\n\u001b[1;32m 2374\u001b[0m \u001b[38;5;66;03m# At some point we will need to deal better with save_function (used for TPU and other distributed\u001b[39;00m\n\u001b[1;32m 2375\u001b[0m \u001b[38;5;66;03m# joyfulness), but for now this enough.\u001b[39;00m\n\u001b[0;32m-> 2376\u001b[0m \u001b[43msafe_save_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mshard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43msave_directory\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshard_file\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mformat\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2377\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2378\u001b[0m save_function(shard, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(save_directory, shard_file))\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/safetensors/torch.py:281\u001b[0m, in \u001b[0;36msave_file\u001b[0;34m(tensors, filename, metadata)\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msave_file\u001b[39m(\n\u001b[1;32m 251\u001b[0m tensors: Dict[\u001b[38;5;28mstr\u001b[39m, torch\u001b[38;5;241m.\u001b[39mTensor],\n\u001b[1;32m 252\u001b[0m filename: Union[\u001b[38;5;28mstr\u001b[39m, os\u001b[38;5;241m.\u001b[39mPathLike],\n\u001b[1;32m 253\u001b[0m metadata: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mstr\u001b[39m]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 254\u001b[0m ):\n\u001b[1;32m 255\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 256\u001b[0m \u001b[38;5;124;03m Saves a dictionary of tensors into raw bytes in safetensors format.\u001b[39;00m\n\u001b[1;32m 257\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 
280\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 281\u001b[0m \u001b[43mserialize_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_flatten\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "model.save_pretrained(\"Llava-Phi-Checkpoint\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa0bec34-a148-4340-a30c-6f09dd5e71ca", + "metadata": {}, + "outputs": [], + "source": [ + "model.push_to_hub(\"RaviNaik/Llava-Phi2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "382f74b0-2967-408a-badc-a90918810d74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/RaviNaik/Llava-Phi2/commit/fa8f7240058241243f6bdc3d6ab44bb691f76e39', commit_message='Upload tokenizer', commit_description='', oid='fa8f7240058241243f6bdc3d6ab44bb691f76e39', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.push_to_hub(\"RaviNaik/Llava-Phi2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b851459b-d3ac-4fb8-99b6-17a648adc41f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} 
diff --git a/Experiments/instruct_150k_data.ipynb b/Experiments/instruct_150k_data.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8af4e590a77bf7e3b4bdd09a2e3af6b8cb380a76 --- /dev/null +++ b/Experiments/instruct_150k_data.ipynb @@ -0,0 +1,591 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8047332f-86d6-4812-b271-98786ecd470f", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5aa32b98-e5f2-4ebd-abb6-4f60f1f6f997", + "metadata": {}, + "outputs": [], + "source": [ + "# dataset = load_dataset(\"liuhaotian/LLaVA-Instruct-150K\", split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c82d99ca-9ca6-4643-9611-9ec9996608d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-01-19 17:09:33-- https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json\n", + "Resolving huggingface.co (huggingface.co)... 13.225.131.94, 13.225.131.6, 13.225.131.93, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.225.131.94|:443... connected.\n", + "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://cdn-lfs.huggingface.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1705923573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTkyMzU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5LzZiNjhiYzVjYTJiZmQ4YTcxMTE5YWYwZTg0NTQ5Mjk2NjhjY2RhNmEzMzQ5NTVjY2M5NWQxMTRmYzhkMDgyZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=f59rV0KoQ-7MJrgwl8uYXAJe5XqFuGBU2ukhEcGTXx965zA3Qv5N7Rv6uDoeeqaow7SamaAzy%7E8Ley7-YeLXeJ43ui%7EdzUpQROAhxfxS8IpSQ7IN4eCpgcBCZYfGMOaB1MzmtmTziBZfy9ElgeHU%7EEe7ixiiNnPb2Fm66N-GkKzCIqmmzUBXxz6QBc8z7lFrccpct%7EnqFbZN7rFzo3%7EClPUM6%7EZ1C5RQ2v3pgID9wE0YG82xlaxOaJg6XOSuDpDHXDuY3fKwphEZENwEVJHPX%7EW9yTldb4BNjxY%7EEwCT9KHqLMH-VVXLw5rnuU9OdWYOntEOuTFg-B2Ru3c-EQlNMA__&Key-Pair-Id=KVTP0A1DKRTAX [following]\n", + "--2024-01-19 17:09:33-- 
https://cdn-lfs.huggingface.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1705923573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTkyMzU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5LzZiNjhiYzVjYTJiZmQ4YTcxMTE5YWYwZTg0NTQ5Mjk2NjhjY2RhNmEzMzQ5NTVjY2M5NWQxMTRmYzhkMDgyZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=f59rV0KoQ-7MJrgwl8uYXAJe5XqFuGBU2ukhEcGTXx965zA3Qv5N7Rv6uDoeeqaow7SamaAzy%7E8Ley7-YeLXeJ43ui%7EdzUpQROAhxfxS8IpSQ7IN4eCpgcBCZYfGMOaB1MzmtmTziBZfy9ElgeHU%7EEe7ixiiNnPb2Fm66N-GkKzCIqmmzUBXxz6QBc8z7lFrccpct%7EnqFbZN7rFzo3%7EClPUM6%7EZ1C5RQ2v3pgID9wE0YG82xlaxOaJg6XOSuDpDHXDuY3fKwphEZENwEVJHPX%7EW9yTldb4BNjxY%7EEwCT9KHqLMH-VVXLw5rnuU9OdWYOntEOuTFg-B2Ru3c-EQlNMA__&Key-Pair-Id=KVTP0A1DKRTAX\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 54.230.61.40, 54.230.61.44, 54.230.61.102, ...\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|54.230.61.40|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 228941895 (218M) [application/json]\n", + "Saving to: ‘llava_instruct_150k.json’\n", + "\n", + "llava_instruct_150k 100%[===================>] 218.34M 14.8MB/s in 15s \n", + "\n", + "2024-01-19 17:09:49 (15.0 MB/s) - ‘llava_instruct_150k.json’ saved [228941895/228941895]\n", + "\n" + ] + } + ], + "source": [ + "!wget -c https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d70a4fb6-7e90-4cab-9fe3-2f1f47669fd1", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4e158361-58b0-4205-9da5-4944eee72509", + "metadata": {}, + "outputs": [], + "source": [ + "ds = Dataset.from_json(\"llava_instruct_150k.json\", split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0c7e48fe-b38b-4e78-bdc7-6944fd68574b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'image', 'conversations'],\n", + " num_rows: 157712\n", + "})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f0c1305c-d79c-4762-bfe2-137c8069300d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '000000033471',\n", + " 'image': '000000033471.jpg',\n", + " 'conversations': [{'from': 'human',\n", + " 'value': '\\nWhat are the colors of the bus in the image?'},\n", + " {'from': 'gpt', 'value': 'The bus in the image is white and red.'},\n", + " {'from': 'human',\n", + " 'value': 'What feature can be seen on the back of the bus?'},\n", + " {'from': 'gpt', 'value': 'The back of the bus features an advertisement.'},\n", + " {'from': 'human',\n", + " 'value': 'Is the bus driving down the street or pulled off to the side?'},\n", + " {'from': 
'gpt',\n", + " 'value': 'The bus is driving down the street, which is crowded with people and other vehicles.'}]}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "5a92d586-af5a-4a94-8b46-1570cb04bbfe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '000000033471',\n", + " 'image': '000000033471.jpg',\n", + " 'conversations': [{'from': 'human',\n", + " 'value': '\\nWhat are the colors of the bus in the image?'},\n", + " {'from': 'gpt', 'value': 'The bus in the image is white and red.'},\n", + " {'from': 'human',\n", + " 'value': 'What feature can be seen on the back of the bus?'},\n", + " {'from': 'gpt', 'value': 'The back of the bus features an advertisement.'},\n", + " {'from': 'human',\n", + " 'value': 'Is the bus driving down the street or pulled off to the side?'},\n", + " {'from': 'gpt',\n", + " 'value': 'The bus is driving down the street, which is crowded with people and other vehicles.'}]}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "019d6657-0862-456f-a4d4-fd8cb9550abd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "157712" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d6ab0727-a569-4dd1-a5da-8b45ba41ef08", + "metadata": {}, + "outputs": [], + "source": [ + "df = ds.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "68591d5b-dfd9-482b-afe4-589bcc111efb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(81479,)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"df.image.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "998c8fe8-2dff-4c18-8e08-05a596d0506b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-01-19 18:56:07-- http://images.cocodataset.org/zips/train2014.zip\n", + "Resolving images.cocodataset.org (images.cocodataset.org)... 16.182.65.17, 52.216.208.17, 52.217.227.193, ...\n", + "Connecting to images.cocodataset.org (images.cocodataset.org)|16.182.65.17|:80... connected.\n", + "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", + "\n", + " The file is already fully retrieved; nothing to do.\n", + "\n" + ] + } + ], + "source": [ + "!wget -c http://images.cocodataset.org/zips/train2014.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5ed46437-c059-4283-8619-9d1dd9546509", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# !unzip train2014.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "49bfa1a0-1633-48fc-952c-c07aacab9544", + "metadata": {}, + "outputs": [], + "source": [ + "ds_stream = ds.to_iterable_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b00259e9-87c1-4b86-b43c-96eab4f16c3e", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "0b02934a-6842-46d3-a1fe-3fd3d104f574", + "metadata": {}, + "outputs": [], + "source": [ + "def get_image(image_path):\n", + " image_path = f\"train2014/COCO_train2014_{image_path}\"\n", + " img = Image.open(image_path)\n", + " return img" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c370617f-cbe8-4809-9b76-6fe071940a13", + "metadata": {}, + "outputs": [], + "source": [ + "chat_template = \"\"\"<|im_start|>system\n", + "You are a helpful assistant who always respond to user queries. 
For context you can use image data enclosed between and tags and you can use audio data enclosed between and tags<|im_end|>\n", + "user\n", + "{prompt}<|im_end|>\n", + "<|im_start|>assistant\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d000e78c-737b-45c5-8f7a-7ec933c894ac", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8252cd0e-8e57-4ec4-a557-9a89d3502aa8", + "metadata": {}, + "outputs": [], + "source": [ + "img = Image.open(\"train2014/COCO_train2014_000000334872.jpg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7a2a4dc2-4b7a-4e18-adcb-ac112c12a4f0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "img" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60745c20-7ab3-4e5f-8225-9319fe5e786c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f08eb8b-f02f-4556-8019-7430af46d06c", + "metadata": {}, + "outputs": [], + "source": [ + "{'id': '000000334872',\n", + " 'image': '000000334872.jpg',\n", + " 'conversations': [{'from': 'human',\n", + " 'value': '\\nAre the people in the image skiing downhill or cross-country skiing?'},\n", + " {'from': 'gpt',\n", + " 'value': 'The people in the image are cross-country skiing in the woods, as they are skiing on a trail rather than a steep slope.'},\n", + " {'from': 'human', 'value': 'How many people are in the image?'},\n", + " {'from': 'gpt',\n", + " 'value': 'There are two people in the image, both on skis in the snow.'},\n", + " {'from': 'human', 'value': 'What kind of environment are they skiing in?'},\n", + " {'from': 'gpt',\n", + " 'value': 'They are skiing in a wooded environment, following a trail through the trees while 
# ChatML format
templates = {
    "assistant": "<|im_start|>assistant\n{msg}<|im_end|>",    # message by assistant
    "user": "<|im_start|>user\n{msg}<|im_end|>"   # message by user
}


def get_chatml_text(conversations):
    """Render a LLaVA-style conversation as ChatML text.

    Args:
        conversations: Iterable of turns, each a mapping with a ``"from"``
            speaker tag and a ``"value"`` message string.

    Returns:
        One ChatML block per turn, each followed by a newline, concatenated
        in order. An empty conversation yields ``""``.

    Raises:
        KeyError: If a turn lacks ``"from"`` or ``"value"``.
    """
    rendered_turns = []
    for turn in conversations:
        # Only "human" maps to the user role; every other speaker tag
        # (e.g. "gpt") is rendered as the assistant.
        role = "user" if turn["from"] == "human" else "assistant"
        rendered_turns.append(templates[role].format(msg=turn["value"]) + "\n")
    # Join once instead of growing a string with += inside the loop.
    return "".join(rendered_turns)
def instruct_data_generator():
    """Yield one ``{"text", "image"}`` training example per record of ``ds_stream``.

    For every streamed record the image is opened with ``get_image`` and the
    conversation is rendered with ``get_chatml_text``.
    """
    for record in ds_stream:
        # Read both fields up front so a malformed record fails before the
        # image file is opened.
        file_name, turns = record["image"], record["conversations"]
        picture = get_image(file_name)
        chatml = get_chatml_text(turns)
        yield {"text": chatml, "image": picture}
# Rename "<a>_<b>_<name>" files in ./train2014/ to just "<name>"
# (e.g. "COCO_train2014_000000472880.jpg" -> "000000472880.jpg").
# NOTE(review): get_image() in this notebook builds paths that still include
# the "COCO_train2014_" prefix - confirm which naming downstream code expects.
for image_file in os.listdir("./train2014/"):
    name_parts = image_file.split('_')
    if len(name_parts) < 3:
        # Already renamed (or an unrelated file): skip it instead of raising
        # IndexError, so the cell can safely be re-run.
        continue
    os.rename(f"./train2014/{image_file}", f"./train2014/{name_parts[2]}")
from datasets import Dataset, IterableDataset
from PIL import Image

# ChatML turn templates, keyed by the role name that appears in the rendered text.
templates = {
    "assistant": "<|im_start|>assistant\n{msg}<|im_end|>",
    "user": "<|im_start|>user\n{msg}<|im_end|>",
}

# The instruction JSON is loaded once at import time and exposed as a stream.
ds = Dataset.from_json("llava_instruct_150k.json", split="train")
ds_stream = ds.to_iterable_dataset()


def get_image(image_path):
    """Open the train2014 image whose file name ends with ``image_path``."""
    return Image.open(f"train2014/COCO_train2014_{image_path}")


def get_chatml_text(conversations):
    """Render a list of ``{"from", "value"}`` turns as newline-terminated ChatML blocks.

    ``"human"`` turns use the user template; every other speaker uses the
    assistant template.
    """
    pieces = []
    for turn in conversations:
        speaker = "user" if turn["from"] == "human" else "assistant"
        pieces.append(templates[speaker].format(msg=turn["value"]) + "\n")
    return "".join(pieces)


def instruct_data_generator():
    """Yield ``{"text": <ChatML string>, "image": <opened image>}`` for each streamed record."""
    for record in ds_stream:
        file_name, turns = record["image"], record["conversations"]
        picture = get_image(file_name)
        chatml = get_chatml_text(turns)
        yield {"text": chatml, "image": picture}


instruct_ds = IterableDataset.from_generator(generator=instruct_data_generator)
from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, PhiConfig

# Vision tower: CLIP vision config with library defaults (no arguments passed).
vision_config = CLIPVisionConfig()

# Language model: Phi config with library defaults (this is Phi, not Llama).
text_config = PhiConfig()

# Compose a LLaVA configuration from the CLIP vision config and the Phi text config.
# NOTE(review): the config printed by the next cell shows a top-level
# vocab_size / image_token_index of 32000 next to a text vocab_size of 51200 -
# confirm these are consistent before training with this setup.
configuration = LlavaConfig(vision_config, text_config)

# Build the model from the configuration only (no from_pretrained call here).
model = LlavaForConditionalGeneration(configuration)

# Re-read the configuration from the instantiated model.
configuration = model.config
\"image_size\": 224,\n", + " \"intermediate_size\": 3072,\n", + " \"model_type\": \"clip_vision_model\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"patch_size\": 32,\n", + " \"projection_dim\": 512\n", + " },\n", + " \"vision_feature_layer\": -2,\n", + " \"vision_feature_select_strategy\": \"default\",\n", + " \"vocab_size\": 32000\n", + "}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.config" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "79efbc6b-f005-4a5c-82a1-112fa37f1904", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'llava-phi'...\n", + "remote: Enumerating objects: 151, done.\u001b[K\n", + "remote: Counting objects: 100% (151/151), done.\u001b[K\n", + "remote: Compressing objects: 100% (116/116), done.\u001b[K\n", + "remote: Total 151 (delta 36), reused 133 (delta 25), pack-reused 0\u001b[K\n", + "Receiving objects: 100% (151/151), 333.89 KiB | 112.00 KiB/s, done.\n", + "Resolving deltas: 100% (36/36), done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/zhuyiche/llava-phi.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf827184-f334-4d86-ace1-fe9c92f84d66", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Experiments/multimodal_exp.ipynb b/Experiments/multimodal_exp.ipynb new file mode 100644 index 
class VisionLanguageConnector(nn.Module):
    """Two-layer bias-free MLP mapping ``hidden_size`` features to ``projection_dim``."""

    def __init__(self, hidden_size, projection_dim):
        """Build Linear(hidden, hidden) -> GELU -> Linear(hidden, projection), all without bias."""
        super().__init__()
        # Layers are created in the same order as they are applied.
        stages = [
            nn.Linear(hidden_size, hidden_size, bias=False),
            nn.GELU(),
            nn.Linear(hidden_size, projection_dim, bias=False),
        ]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to ``x`` (last dimension must equal ``hidden_size``)."""
        projected = self.mlp(x)
        return projected
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Without a timeout the request can block the notebook indefinitely, and
# without a status check an HTTP error page would be handed to PIL and fail
# with an unrelated "cannot identify image file" error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n", + " return tokens\n", + " \n", + "\n", + "class WhisperWithProjection:\n", + " def __init__(self, projection_dim):\n", + " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n", + " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n", + " self.model.config.forced_decoder_ids = None\n", + " self.audio_language_connector = AudioLanguageConnector(projection_dim)\n", + " \n", + " def forward(self, audio):\n", + " input_features = self.processor(audio[\"array\"],\n", + " sampling_rate=audio[\"sampling_rate\"],\n", + " return_tensors=\"pt\").input_features\n", + " # generate token ids\n", + " predicted_ids = self.model.generate(input_features)\n", + " # decode token ids to text \n", + " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", + "\n", + " audio_embeddings = self.audio_language_connector(transcription)\n", + " return audio_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "79cc4d98-498b-4042-bd71-143b2477733d", + "metadata": {}, + "outputs": [], + "source": [ + "class TextModality:\n", + " def __init__(self, projection_dim):\n", + " model_name = \"microsoft/phi-2\"\n", + " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n", + " self.phi2_tokenizer.max_length = projection_dim\n", + "\n", + "\n", + " def __call__(self, text):\n", + " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n", + " return tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "ba4c4772-923f-48e8-a4af-b7d9c192dd4b", + "metadata": {}, + "outputs": [], + "source": [ + "class MultiModalPhi2:\n", + " def __init__(self):\n", + " self.text_modality = TextModality(projection_dim=768)\n", + " self.whisper_w_proj = 
WhisperWithProjection(projection_dim=512)\n", + " self.clip_w_proj = ClipWithProjection(hidden_size=768, projection_dim=768)\n", + " self.llm = self.load_llm()\n", + "\n", + " def load_llm(self):\n", + " model_name = \"microsoft/phi-2\"\n", + " \n", + " bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.float16)\n", + " \n", + " model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " trust_remote_code=True,\n", + " device_map=\"cuda:0\"\n", + " )\n", + " model.config.use_cache = False\n", + " return model\n", + "\n", + " def forward(self, audio, image, text):\n", + " if text is not None:\n", + " text_embed = self.text_modality(text)[\"input_ids\"]\n", + " if audio is not None:\n", + " audio_embed = self.whisper_w_proj.forward(audio)[\"input_ids\"]\n", + " if image is not None:\n", + " image_embed = self.clip_w_proj.forward(image)[0]\n", + " print(text_embed.shape, text_embed.dtype)\n", + " print(audio_embed.shape, audio_embed.dtype)\n", + " print(image_embed.shape, image_embed.dtype)\n", + " \n", + " inputs = torch.concat([text_embed, audio_embed, image_embed], dim=1)\n", + " print(inputs.shape, inputs.dtype)\n", + " outputs = self.llm(inputs)\n", + "\n", + " return outputs \n", + " \n", + "\n", + " def generate(self, audio, text):\n", + " text_embeddings = self.text_modality(text)\n", + " audio_embeddings = self.whisper_w_proj.forward(audio)\n", + " inputs = torch.concat([text_embed[\"input_ids\"], audio_embed[\"input_ids\"]], dim=1)\n", + " \n", + " outputs = self.llm.generate(inputs, max_length=200)\n", + " text = self.text_modality.phi2_tokenizer.batch_decode(outputs)[0]\n", + " print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "7ca694eb-8009-4eb9-9a4c-eac406ab9584", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "audio_ds = 
load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n", + "audio = audio_ds[0][\"audio\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "37be28c5-4cc3-4471-b394-032c7602accc", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"explain about the audio\"" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "c0705114-1670-4937-bc3e-3660e5a5d2c5", + "metadata": {}, + "outputs": [], + "source": [ + "# image" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "0d7e5b49-b4bd-477c-87b8-91ef70857677", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "733dc7b2208b4853a89aea49bff9a55c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mimage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[77], line 38\u001b[0m, in \u001b[0;36mMultiModalPhi2.forward\u001b[0;34m(self, audio, image, text)\u001b[0m\n\u001b[1;32m 36\u001b[0m inputs \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mconcat([text_embed, audio_embed, image_embed], dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28mprint\u001b[39m(inputs\u001b[38;5;241m.\u001b[39mshape, inputs\u001b[38;5;241m.\u001b[39mdtype)\n\u001b[0;32m---> 38\u001b[0m outputs 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mllm\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1530\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/accelerate/hooks.py:165\u001b[0m, in \u001b[0;36madd_hook_to_module..new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m output \u001b[38;5;241m=\u001b[39m old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 165\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mold_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n", + "File 
\u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/85d00b03fee509307549d823fdd095473ba5197c/modeling_phi.py:1049\u001b[0m, in \u001b[0;36mPhiForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1046\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 1048\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m-> 1049\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1050\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1051\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1052\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1053\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1054\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1055\u001b[0m \u001b[43m 
\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1056\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1057\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1058\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1059\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1061\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1062\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1530\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/accelerate/hooks.py:165\u001b[0m, in \u001b[0;36madd_hook_to_module..new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m output \u001b[38;5;241m=\u001b[39m 
old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 165\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mold_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n", + "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/85d00b03fee509307549d823fdd095473ba5197c/modeling_phi.py:893\u001b[0m, in \u001b[0;36mPhiModel.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 890\u001b[0m position_ids \u001b[38;5;241m=\u001b[39m position_ids\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 892\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inputs_embeds \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 893\u001b[0m inputs_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membed_tokens\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 895\u001b[0m inputs_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membed_dropout(inputs_embeds)\n\u001b[1;32m 897\u001b[0m \u001b[38;5;66;03m# Attention mask.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, 
**kwargs)\u001b[0m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m 
_global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1530\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/accelerate/hooks.py:165\u001b[0m, in \u001b[0;36madd_hook_to_module..new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m output \u001b[38;5;241m=\u001b[39m old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 165\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mold_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/sparse.py:162\u001b[0m, in \u001b[0;36mEmbedding.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 162\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 164\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/functional.py:2233\u001b[0m, in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m 2227\u001b[0m \u001b[38;5;66;03m# Note [embedding_renorm set_grad_enabled]\u001b[39;00m\n\u001b[1;32m 2228\u001b[0m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[1;32m 2229\u001b[0m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[1;32m 2230\u001b[0m \u001b[38;5;66;03m# torch.embedding_renorm_\u001b[39;00m\n\u001b[1;32m 2231\u001b[0m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[1;32m 2232\u001b[0m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[0;32m-> 2233\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mRuntimeError\u001b[0m: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)" + ] + } + ], + "source": [ + "model.forward(audio, image, text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ca96caf-82e2-4f07-87b3-8654dfdc89aa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Experiments/pretrain_data_check.ipynb b/Experiments/pretrain_data_check.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2cdc9a93c26180847e661498ebdf1241b19aed72 --- /dev/null +++ b/Experiments/pretrain_data_check.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "61c272f2-edbe-4b7d-8fec-3ab431400cd3", + "metadata": {}, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e9dfd7d7-1685-4fc7-bbb9-3905c32d8ba1", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"metadata.json\", \"rb\") as 
f:\n", + " metadata = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "70bdba48-db01-42ac-8d89-edc69d7d7672", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "595375" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "59e193cc-0dd8-4f7e-959a-fbad0133d76c", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"blip_laion_cc_sbu_558k.jsonblip_laion_cc_sbu_558k.json\", \"rb\") as f:\n", + " data = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3157f41-269b-4f7a-b3ba-9be711babe02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '004539375',\n", + " 'image': '00453/004539375.jpg',\n", + " 'conversations': [{'from': 'human',\n", + " 'value': 'Render a clear and concise summary of the photo.\\n'},\n", + " {'from': 'gpt',\n", + " 'value': 'select luxury furniture 3 - inch gel memory foam mattress topper'}]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "50d8a051-1526-47dd-ad71-d3c66f7bd34e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '004374662',\n", + " 'image': '00437/004374662.jpg',\n", + " 'conversations': [{'from': 'human',\n", + " 'value': 'Give a brief description of the image.\\n'},\n", + " {'from': 'gpt', 'value': 'the north face duffel bag camo large'}]}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[234]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2e6d5664-4583-49a6-93cc-079ee2d1ff6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "558128" + ] + }, + "execution_count": 17, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "11ed106d-6bef-482c-a456-5eaaf2025534", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'GCC_train_001749371',\n", + " 'image': 'GCC_train_001749371.jpg',\n", + " 'caption': 'if you are dreaming of simpler or off - the - grid living , a yurt is a fantastic option',\n", + " 'blip_caption': 'a white and tan yurt sitting on a dirt road',\n", + " 'url': 'https://i.pinimg.com/736x/14/7b/64/147b64467ee966d9a578097bb70475ad--yurt-kits-small-space-living.jpg'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metadata[67]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ce8adcec-2499-4be3-be1d-7313fe54e96a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '000466761',\n", + " 'image': '00046/000466761.jpg',\n", + " 'conversations': [{'from': 'human',\n", + " 'value': '\\nProvide a brief description of the given image.'},\n", + " {'from': 'gpt',\n", + " 'value': 'a clipboard and a pen with the words public health emergency next to it on a white table'}]}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[67]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "068313b6-6379-4ca2-892c-682634d3581e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "list" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9ec33b51-4a0b-4a1e-81f7-2fda7cddb25f", + "metadata": {}, + "outputs": [], + "source": [ + "sample_data = data[:200000]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "095685e5-40f1-4d84-8280-ef74fa56c5a2", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "200000" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sample_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ffbad552-23fd-475f-8e9a-7118bcc4f51e", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"llava-phi/pretrain_data/blip_sample.json\", \"w\") as f:\n", + " json.dump(sample_data, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "69a05d25-6f3b-40c0-a3b5-e185ff526471", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"llava-phi/pretrain_data/blip_sample.json\", \"rb\") as f:\n", + " sample = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "200eea06-dfd6-4b3a-bb91-82af7d363951", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200000" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86caa1e-edea-4a9c-934f-5420ede80d0d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Experiments/whispher_exp.ipynb b/Experiments/whispher_exp.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..fbf5a52a2e46bd8b25555c93649d5dc6b11c9701 --- /dev/null +++ b/Experiments/whispher_exp.ipynb @@ -0,0 +1,500 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "bb4dd66b-0c17-48d4-9d34-f48cece2feb5", + 
"metadata": {}, + "outputs": [], + "source": [ + "# !pip install soundfile\n", + "# !pip install librosa" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6e9386ea-4862-4f5b-a02f-d656e1a5ab9e", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n", + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "914ab2b4-389d-4c48-8d1d-1250356646ac", + "metadata": {}, + "outputs": [], + "source": [ + "# load model and processor\n", + "processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n", + "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n", + "model.config.forced_decoder_ids = None\n", + "\n", + "# load dummy dataset and read audio files\n", + "ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n", + "sample = ds[0][\"audio\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2b299bab-1228-48d9-a8a5-3d5b6c52162d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'path': '/home/ravi.naik/.cache/huggingface/datasets/downloads/extracted/431c2c946d216530b2666a0e7ffa5ac3f5b3da89dd28858a9de6c78fae7caa4a/dev_clean/1272/128104/1272-128104-0000.flac',\n", + " 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,\n", + " 0.0010376 ]),\n", + " 'sampling_rate': 16000}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b7e570a1-cf5c-450c-a7b6-49b45a10d2df", + "metadata": {}, + "outputs": [], + "source": [ + "input_features = processor(sample[\"array\"], sampling_rate=sample[\"sampling_rate\"], return_tensors=\"pt\").input_features " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "584e920b-a7fd-402d-95dd-3b9128cd34bb", + 
"metadata": {}, + "outputs": [], + "source": [ + "# generate token ids\n", + "predicted_ids = model.generate(input_features)\n", + "# decode token ids to text\n", + "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)\n", + "\n", + "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b27ab660-861b-49d1-81f9-f51cb7f9d8d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcription" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "eca553b8-68f6-493d-b567-3d526b49ae1b", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c619a4cf-9068-4e4d-8139-e16d15345f4f", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "47d5b1ff-ab0f-4d11-af64-d2fa2be39286", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + } + ], + "source": [ + "model_name = \"microsoft/phi-2\"\n", + "phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "phi2_tokenizer.pad_token = phi2_tokenizer.eos_token" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0b36b3f0-db5b-4029-9072-0a53bcab315a", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'transcription' is not defined", + "output_type": "error", + 
"traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m tokens \u001b[38;5;241m=\u001b[39m phi2_tokenizer(\u001b[38;5;241m*\u001b[39m\u001b[43mtranscription\u001b[49m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m, return_attention_mask\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'transcription' is not defined" + ] + } + ], + "source": [ + "tokens = phi2_tokenizer(*transcription, return_tensors=\"pt\", return_attention_mask=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "91f6d3d3-bb00-434f-a91e-6952375890d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': tensor([[ 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262,\n", + " 3504, 6097, 290, 356, 389, 9675, 284, 7062, 465, 21443,\n", + " 13]])}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "533191d9-4b3b-417a-918d-6fe854f24b50", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:\n", + "- configuration_phi.py\n", + ". Make sure to double-check they do not contain any added malicious code. 
To avoid downloading new versions of the code file, you can pin a revision.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2a65a119388b4cb4b123b532176e786e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "modeling_phi.py: 0%| | 0.00/62.7k [00:00 {text} \"\n", + " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n", + " return tokens\n", + " \n", + "\n", + "class WhisperWithProjection:\n", + " def __init__(self):\n", + " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n", + " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n", + " self.model.config.forced_decoder_ids = None\n", + " self.audio_language_connector = AudioLanguageConnector()\n", + " \n", + " def forward(self, audio):\n", + " input_features = self.processor(audio[\"array\"],\n", + " sampling_rate=audio[\"sampling_rate\"],\n", + " return_tensors=\"pt\").input_features\n", + " # generate token ids\n", + " predicted_ids = self.model.generate(input_features)\n", + " # decode token ids to text \n", + " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", + "\n", + " audio_embeddings = self.audio_language_connector(transcription)\n", + " return audio_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2b1f8f44-bfe6-413c-9e32-c38fa5517981", + "metadata": {}, + "outputs": [], + "source": [ + "class TextModality:\n", + " def __init__(self):\n", + " model_name = \"microsoft/phi-2\"\n", + " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n", + "\n", + " def __call__(self, text):\n", + " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n", + " return tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"21c51648-abb6-4bbd-b4c1-509967a69337", + "metadata": {}, + "outputs": [], + "source": [ + "class MultiModalPhi2:\n", + " def __init__(self):\n", + " self.text_modality = TextModality()\n", + " self.whisper_w_proj = WhisperWithProjection()\n", + " self.llm = self.load_llm()\n", + "\n", + " def load_llm(self):\n", + " bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.float16)\n", + " \n", + " model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " quantization_config=bnb_config,\n", + " trust_remote_code=True,\n", + " device_map=\"cuda:0\"\n", + " )\n", + " model.config.use_cache = False\n", + " return model\n", + "\n", + " def generate(self, audio, text):\n", + " text_embeddings = self.text_modality(text)\n", + " audio_embeddings = self.whisper_w_proj.forward(audio)\n", + " inputs = torch.concat([text_embeddings[\"input_ids\"], audio_embeddings[\"input_ids\"]], dim=1)\n", + " \n", + " # outputs = self.llm.generate(inputs, max_length=200)\n", + " outputs = self.llm(inputs)\n", + " return outputs\n", + " \n", + " # text = self.text_modality.phi2_tokenizer.batch_decode(outputs)[0]\n", + " # print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "472a00cb-bae9-4c09-a0ef-bc57881b5e2c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2236e6b1e26d444fa3d48181ba1a6cf9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00)}, logits=tensor([[[ 6.9531, 9.9375, 7.0234, ..., 2.0020, 2.0020, 2.0000],\n", + " [ 
def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
    """Append the user's submitted inputs to the chat history.

    Each input that is present becomes its own user-side entry: text is added
    as a plain string, files are added as one-element tuples.

    Args:
        chatbot: Current chat history, a list of ``(user, bot)`` pairs. It is
            mutated in place.
        text: Prompt text; ``None`` and ``""`` count as "not provided".
        image: Path of the selected image, or ``None``.
        audio_upload: Path of an uploaded audio file, or ``None``.
        audio_mic: Path of a microphone recording, or ``None``.

    Returns:
        The same ``chatbot`` list, with the new entries appended.

    Raises:
        gr.Error: If no text, image or audio was provided.
    """
    has_input = False

    if text not in ["", None]:
        chatbot.append((text, None))
        has_input = True

    if image is not None:
        chatbot.append(((image,), None))
        has_input = True

    # Only one audio clip is used per turn. Use the same precedence as
    # ``run`` (upload first, microphone second) so the clip shown in the chat
    # is the clip that is actually sent to the model.
    audio = audio_upload if audio_upload is not None else audio_mic
    if audio is not None:
        chatbot.append(((audio,), None))
        has_input = True

    if not has_input:
        # Nothing usable was submitted; surface it in the UI instead of
        # running the model on an empty request.
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot
class SeparatorStyle(Enum):
    """Different separator style."""

    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    ``messages`` is a sequence of ``[role, message]`` pairs. A message is
    either a string, ``None`` (a turn still to be generated) or a 3-tuple
    ``(text, image, image_process_mode)`` for a turn that carries an image.
    """

    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None  # second separator; only read by the TWO and PLAIN styles
    version: str = "Unknown"

    skip_next: bool = False

    @staticmethod
    def _message_text(message):
        """Return the text of a message, dropping the image payload if any."""
        if isinstance(message, tuple):
            text, _, _ = message
            return text
        return message

    @staticmethod
    def _display_size(size, max_len=800, min_len=400):
        """Compute the bounded display size for an image of ``size``.

        Returns ``(width, height, longest_edge)``. The aspect ratio is kept
        (up to integer truncation), the short edge is capped at ``min_len``
        and the long edge at ``max_len``.
        """
        max_hw, min_hw = max(size), min(size)
        aspect_ratio = max_hw / min_hw
        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
        longest_edge = int(shortest_edge * aspect_ratio)
        width, height = size
        if height > width:
            return shortest_edge, longest_edge, longest_edge
        return longest_edge, shortest_edge, longest_edge

    @staticmethod
    def _pad_to_square(pil_img, background_color=(122, 116, 104)):
        """Center ``pil_img`` on a square canvas filled with ``background_color``."""
        from PIL import Image

        width, height = pil_img.size
        if width == height:
            return pil_img
        side = max(width, height)
        canvas = Image.new(pil_img.mode, (side, side), background_color)
        canvas.paste(pil_img, ((side - width) // 2, (side - height) // 2))
        return canvas

    def get_prompt(self):
        """Render the history as one prompt string.

        Returns:
            The system text followed by every turn, joined according to
            ``sep_style``. A turn whose message is empty/``None`` is rendered
            as a bare ``"role:"`` (SINGLE/TWO) or skipped (PLAIN).

        Raises:
            ValueError: If ``sep_style`` is not SINGLE, TWO or PLAIN.
        """
        messages = self.messages
        if len(messages) > 0 and isinstance(messages[0][1], tuple):
            # Rewrite the first turn on a shallow copy so the stored history
            # keeps its image payload.
            messages = list(self.messages)
            init_role, init_msg = messages[0]
            # NOTE(review): the search string is empty here, so this
            # replace() is a no-op; presumably an image placeholder token was
            # meant to be removed - confirm against the intended token.
            init_msg = init_msg[0].replace("", "").strip()
            if "mmtag" in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], ""))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            parts = [self.system + self.sep]
            for role, message in messages:
                if message:
                    parts.append(role + ": " + self._message_text(message) + self.sep)
                else:
                    parts.append(role + ":")
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            parts = [self.system + seps[0]]
            for i, (role, message) in enumerate(messages):
                if message:
                    # Even turns end with ``sep``, odd turns with ``sep2``.
                    parts.append(role + ": " + self._message_text(message) + seps[i % 2])
                else:
                    parts.append(role + ":")
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            parts = [self.system]
            for i, (role, message) in enumerate(messages):
                if message:
                    parts.append(self._message_text(message) + seps[i % 2])
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return "".join(parts)

    def append_message(self, role, message):
        """Append one ``[role, message]`` turn to the history."""
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        """Collect the images attached to even-indexed turns after ``offset``.

        Args:
            return_pil: When true return the image objects themselves,
                otherwise return each image as a base64-encoded PNG string.

        Returns:
            A list with one entry per image-bearing turn.

        Raises:
            ValueError: If a turn has an unknown ``image_process_mode``.
        """
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 != 0 or not isinstance(msg, tuple):
                continue
            # Imported lazily so text-only conversations do not need these.
            import base64
            from io import BytesIO

            msg, image, image_process_mode = msg
            if image_process_mode == "Pad":
                image = self._pad_to_square(image)
            elif image_process_mode in ["Default", "Crop"]:
                pass
            elif image_process_mode == "Resize":
                image = image.resize((336, 336))
            else:
                raise ValueError(f"Invalid image_process_mode: {image_process_mode}")

            width, height, longest_edge = self._display_size(image.size)
            if longest_edge != max(image.size):
                image = image.resize((width, height))

            if return_pil:
                images.append(image)
            else:
                buffered = BytesIO()
                image.save(buffered, format="PNG")
                images.append(base64.b64encode(buffered.getvalue()).decode())
        return images

    def to_gradio_chatbot(self):
        """Convert the history after ``offset`` into ``[user, bot]`` pairs.

        Even-indexed turns open a new pair; odd-indexed turns fill in the
        reply of the most recent pair. An attached image is resized, encoded
        as JPEG and inlined in front of the text as an ``<img>`` data URI.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 != 0:
                ret[-1][-1] = msg
                continue
            if isinstance(msg, tuple):
                import base64
                from io import BytesIO

                msg, image, image_process_mode = msg
                width, height, _ = self._display_size(image.size)
                image = image.resize((width, height))
                buffered = BytesIO()
                image.save(buffered, format="JPEG")
                img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                # The encoded payload was previously built and then dropped;
                # embed it so the uploaded image is actually displayed.
                img_str = (
                    f'<img src="data:image/jpeg;base64,{img_b64_str}" '
                    f'alt="user upload image" />'
                )
                # NOTE(review): empty search string, so replace() is a no-op;
                # presumably an image placeholder token was meant - confirm.
                msg = img_str + msg.replace("", "").strip()
            ret.append([msg, None])
        return ret

    def copy(self):
        """Return a new Conversation whose message list can be edited independently.

        Each ``[role, message]`` pair is rebuilt as a fresh list; the message
        objects themselves are shared. ``skip_next`` is not carried over.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
        )

    def dict(self):
        """Return a plain-dict snapshot of the conversation.

        Image-bearing messages are reduced to their text. This no longer
        renders every image just to find out whether any exist.
        """
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": [
                [role, msg[0] if isinstance(msg, tuple) else msg]
                for role, msg in self.messages
            ],
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
"outputs": [], + "source": [ + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, AutoConfig, CLIPImageProcessor" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1f832710-0e8c-42ec-b581-1b15fd2a6acc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-25 14:31:58,511] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + } + ], + "source": [ + "from model import LlavaPhiForCausalLM" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9e68f1d4-1ae3-4d45-b818-4600218d2215", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e5e13e666e3a43d4ad26cc70904abee8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00 {text} \"\n", + " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n", + " return tokens\n", + " \n", + "\n", + "class WhisperWithProjection:\n", + " def __init__(self, projection_dim, device):\n", + " self.device = device\n", + " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\", device_map=device)\n", + " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\", device_map=device)\n", + " self.model.config.forced_decoder_ids = None\n", + " # self.audio_language_connector = AudioLanguageConnector(projection_dim)\n", + " \n", + " def __call__(self, audio):\n", + " input_features = self.processor(audio[\"array\"],\n", + " sampling_rate=audio[\"sampling_rate\"],\n", + " return_tensors=\"pt\").input_features\n", + " # generate token ids\n", + " predicted_ids = self.model.generate(input_features.to(self.device))\n", + " # decode token ids to text \n", + " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", + "\n", 
+ " # audio_embeddings = self.audio_language_connector(transcription)\n", + " return transcription" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a2757c91-2ec1-4fe7-9216-03740bf80061", + "metadata": {}, + "outputs": [], + "source": [ + "IGNORE_INDEX = -100\n", + "IMAGE_TOKEN_INDEX = -200\n", + "DEFAULT_IMAGE_TOKEN = \"\"\n", + "DEFAULT_IMAGE_PATCH_TOKEN = \"\"\n", + "DEFAULT_IM_START_TOKEN = \"\"\n", + "DEFAULT_IM_END_TOKEN = \"\"\n", + "\n", + "from conversation import conv_templates, SeparatorStyle\n", + "\n", + "class MultiModalPhi2:\n", + " def __init__(self, modelname_or_path=\"RaviNaik/Llava-Phi2\",\n", + " temperature=0.2,\n", + " max_new_tokens=1024,\n", + " device=\"cuda:0\"):\n", + " self.model_name = modelname_or_path\n", + " self.temperature = temperature\n", + " self.max_new_tokens = max_new_tokens\n", + " self.device = device\n", + " self.disable_torch_init()\n", + " self.whisper_w_proj = WhisperWithProjection(projection_dim=512, device=device)\n", + " self.load_pretrained_model()\n", + " \n", + " def disable_torch_init(self):\n", + " \"\"\"\n", + " Disable the redundant torch default initialization to accelerate model creation.\n", + " \"\"\"\n", + " setattr(torch.nn.Linear, \"reset_parameters\", lambda self: None)\n", + " setattr(torch.nn.LayerNorm, \"reset_parameters\", lambda self: None)\n", + " \n", + " def load_pretrained_model(self):\n", + " self.model = LlavaPhiForCausalLM.from_pretrained(self.model_name, device_map=self.device)\n", + " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)\n", + " self.image_processor = CLIPImageProcessor.from_pretrained(self.model_name)\n", + " mm_use_im_start_end = getattr(self.model.config, \"mm_use_im_start_end\", False)\n", + " mm_use_im_patch_token = getattr(self.model.config, \"mm_use_im_patch_token\", True)\n", + " if mm_use_im_patch_token:\n", + " self.tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)\n", + " if mm_use_im_start_end:\n", + " 
self.tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)\n", + " \n", + " def tokenizer_image_token(self, prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):\n", + " prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')]\n", + " \n", + " def insert_separator(X, sep):\n", + " return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]\n", + " \n", + " input_ids = []\n", + " offset = 0\n", + " if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:\n", + " offset = 1\n", + " input_ids.append(prompt_chunks[0][0])\n", + " for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):\n", + " input_ids.extend(x[offset:])\n", + " \n", + " if return_tensors is not None:\n", + " if return_tensors == 'pt':\n", + " return torch.tensor(input_ids, dtype=torch.long)\n", + " raise ValueError(f'Unsupported tensor type: {return_tensors}')\n", + " return input_ids\n", + " \n", + " def __call__(self, text, audio, image):\n", + " qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\\n' + text\n", + " conv = conv_templates[\"phi-2_v0\"].copy()\n", + " conv.append_message(conv.roles[0], qs)\n", + " conv.append_message(conv.roles[1], None)\n", + " prompt = conv.get_prompt()\n", + "\n", + " image_tensor = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'].to(self.device)\n", + " \n", + " input_ids = self.tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)\n", + " if audio is not None:\n", + " audio_transcript = self.whisper_w_proj(audio)\n", + " audio_embed = self.tokenizer(audio_transcript, return_tensors='pt')[\"input_ids\"]\n", + " input_ids = torch.concat([input_ids, audio_embed], dim=1)\n", + " input_ids = input_ids.to(self.device)\n", + " \n", + " stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2\n", + 
"\n", + " with torch.inference_mode():\n", + " output_ids = self.model.generate(\n", + " input_ids,\n", + " images=image_tensor,\n", + " do_sample=True,\n", + " temperature=self.temperature,\n", + " max_new_tokens=self.max_new_tokens,\n", + " eos_token_id=self.tokenizer.eos_token_id, # End of sequence token\n", + " pad_token_id=self.tokenizer.eos_token_id, # Pad token\n", + " use_cache=True,\n", + " )\n", + "\n", + " input_token_len = input_ids.shape[1]\n", + " n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()\n", + " if n_diff_input_output > 0:\n", + " print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')\n", + " outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]\n", + " outputs = outputs.strip()\n", + " if outputs.endswith(stop_str):\n", + " outputs = outputs[:-len(stop_str)]\n", + " outputs = outputs.strip()\n", + " return outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cc47e6a0-3544-4a60-930f-ccae87ef945a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9ef56077307d4cef907e25b092061611", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" + + +class AudioLanguageConnector: + def __init__(self, projection_dim): + model_name = "microsoft/phi-2" + self.phi2_tokenizer = AutoTokenizer.from_pretrained( + model_name, trust_remote_code=True + ) + self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token + self.phi2_tokenizer.max_length = projection_dim + + def __call__(self, text): + text = f" {text} " + tokens = self.phi2_tokenizer( + text, return_tensors="pt", return_attention_mask=False + ) + return tokens + + +class WhisperWithProjection: + def __init__(self, projection_dim, device): + self.device = device + 
self.processor = WhisperProcessor.from_pretrained( + "openai/whisper-tiny", device_map=device + ) + self.model = WhisperForConditionalGeneration.from_pretrained( + "openai/whisper-tiny", device_map=device + ) + self.model.config.forced_decoder_ids = None + # self.audio_language_connector = AudioLanguageConnector(projection_dim) + + def __call__(self, audio): + input_features = self.processor( + audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt" + ).input_features + # generate token ids + predicted_ids = self.model.generate(input_features.to(self.device)) + # decode token ids to text + transcription = self.processor.batch_decode( + predicted_ids, skip_special_tokens=True + ) + + # audio_embeddings = self.audio_language_connector(transcription) + return transcription + + +class MultiModalPhi2: + def __init__( + self, + modelname_or_path="RaviNaik/Llava-Phi2", + temperature=0.2, + max_new_tokens=1024, + device="cuda:0", + ): + self.model_name = modelname_or_path + self.temperature = temperature + self.max_new_tokens = max_new_tokens + self.device = device + self.disable_torch_init() + self.whisper_w_proj = WhisperWithProjection(projection_dim=512, device=device) + self.load_pretrained_model() + + def disable_torch_init(self): + """ + Disable the redundant torch default initialization to accelerate model creation. 
+ """ + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + def load_pretrained_model(self): + self.model = LlavaPhiForCausalLM.from_pretrained( + self.model_name, device_map=self.device + ) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.image_processor = CLIPImageProcessor.from_pretrained(self.model_name) + mm_use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr( + self.model.config, "mm_use_im_patch_token", True + ) + if mm_use_im_patch_token: + self.tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + self.tokenizer.add_tokens( + [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True + ) + + def tokenizer_image_token( + self, + prompt, + tokenizer, + image_token_index=IMAGE_TOKEN_INDEX, + return_tensors=None, + ): + prompt_chunks = [ + tokenizer(chunk).input_ids for chunk in prompt.split("") + ] + + def insert_separator(X, sep): + return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] + + input_ids = [] + offset = 0 + if ( + len(prompt_chunks) > 0 + and len(prompt_chunks[0]) > 0 + and prompt_chunks[0][0] == tokenizer.bos_token_id + ): + offset = 1 + input_ids.append(prompt_chunks[0][0]) + for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): + input_ids.extend(x[offset:]) + + if return_tensors is not None: + if return_tensors == "pt": + return torch.tensor(input_ids, dtype=torch.long) + raise ValueError(f"Unsupported tensor type: {return_tensors}") + return input_ids + + def __call__(self, text, audio, image): + if text is None: + text = "" + if image is not None: + qs = ( + DEFAULT_IM_START_TOKEN + + DEFAULT_IMAGE_TOKEN + + DEFAULT_IM_END_TOKEN + + "\n" + + text + ) + conv = conv_templates["phi-2_v0"].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + 
prompt = conv.get_prompt() + + input_ids = self.tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ).unsqueeze(0) + + image_tensor = self.image_processor.preprocess(image, return_tensors="pt")[ + "pixel_values" + ].to(self.device) + else: + qs = text + conv = conv_templates["phi-2_v0"].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = self.tokenizer(prompt, return_tensors="pt")["input_ids"] + + image_tensor = None + + if audio is not None: + audio_transcript = self.whisper_w_proj(audio) + audio_embed = self.tokenizer(audio_transcript, return_tensors="pt")[ + "input_ids" + ] + input_ids = torch.concat([input_ids, audio_embed], dim=1) + input_ids = input_ids.to(self.device) + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + + with torch.inference_mode(): + if image is not None: + output_ids = self.model.generate( + input_ids, + images=image_tensor, + do_sample=True, + temperature=self.temperature, + max_new_tokens=self.max_new_tokens, + eos_token_id=self.tokenizer.eos_token_id, # End of sequence token + pad_token_id=self.tokenizer.eos_token_id, # Pad token + use_cache=True, + ) + else: + output_ids = self.model.generate( + input_ids, + do_sample=True, + temperature=self.temperature, + max_new_tokens=self.max_new_tokens, + eos_token_id=self.tokenizer.eos_token_id, # End of sequence token + pad_token_id=self.tokenizer.eos_token_id, # Pad token + use_cache=True, + ) + + input_token_len = input_ids.shape[1] + n_diff_input_output = ( + (input_ids != output_ids[:, :input_token_len]).sum().item() + ) + if n_diff_input_output > 0: + print( + f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids" + ) + outputs = self.tokenizer.batch_decode( + output_ids[:, input_token_len:], skip_special_tokens=True + )[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[: 
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="cuda", device="cuda"):
    """Load a LLaVA-Phi checkpoint with its tokenizer and image processor.

    Args:
        model_path: Checkpoint directory or hub id. For LoRA / projector-only
            checkpoints this holds the adapter or ``mm_projector.bin``.
        model_base: Base model to load first, or ``None`` for a full checkpoint.
        model_name: Name used to pick the loading path (``'phi'`` / ``'lora'``
            substrings, case-insensitive).
        load_8bit: Request bitsandbytes 8-bit loading.
        load_4bit: Request bitsandbytes NF4 4-bit loading (ignored when
            ``load_8bit`` is set).
        device_map: Forwarded to ``from_pretrained``.
        device: Device the finished model is moved to. Skipped for quantized
            models, which do not support ``.to()``.

    Returns:
        ``(tokenizer, model, image_processor, context_len)``.

    Raises:
        ValueError: If ``'phi'`` does not occur in ``model_name``.
    """
    kwargs = {"device_map": device_map}
    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    # else:  # TODO: after fine-tuning LLava-Phi, load the model weights with fp16 will pose nan
    #     kwargs['torch_dtype'] = torch.float16

    # bitsandbytes-quantized models are placed by `device_map` and reject `.to()`.
    quantized = load_8bit or load_4bit
    is_phi = 'phi' in model_name.lower()
    is_lora = 'lora' in model_name.lower()

    if is_phi:
        # Load LLaVA-Phi model
        if is_lora and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument.')
        if is_lora and model_base is not None:
            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            print('Loading LLaVA-Phi from base model...')
            model = LlavaPhiForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

            print('Loading additional LLaVA-Phi weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download

                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            # Strip the PEFT wrapper prefixes so keys match the bare model.
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None:
            # this may be mm projector only
            print('Loading LLaVA-Phi from base model...')
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            cfg_pretrained = AutoConfig.from_pretrained(model_path)
            model = LlavaPhiForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            print("load llaVA-Phi MLLM!!!")
            config = LlavaPhiConfig.from_pretrained(model_path, trust_remote_code=True)
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
            # Device placement happens once, at the end of this function.
            model = LlavaPhiForCausalLM.from_pretrained(
                model_path,
                config=config,
                use_safetensors=True,
                **kwargs)
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    image_processor = CLIPImageProcessor.from_pretrained(model_path)

    if is_phi:
        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)

        # TODO: the tokenizer length of phi-2 is 50295, but the output class of lm_head is 51200
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        # model.resize_token_embeddings(len(tokenizer))
    else:
        # NOTE(review): non-phi models are fully loaded above before being
        # rejected here; kept as-is to preserve the existing call contract.
        raise ValueError(f"Unsupported model name: {model_name}")

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    # Honor the caller's `device` instead of a hard-coded "cuda".
    if not quantized:
        model.to(device=device)
    print(kwargs)
    return tokenizer, model, image_processor, context_len
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + mm_vision_select_feature (`str`, *optional*, defaults to `"patch"`): + The feature to select from the vision encoder output. Can be one of `"patch"` or `"cls_patch"`. + mm_vision_select_layer (`int`, *optional*, defaults to `-2`): + The layer to select from the vision encoder output. 
+ + Example: + + ```python + >>> from transformers import CLIPVisionConfig, CLIPVisionModel + + >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPVisionConfig() + + >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava_phi_clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + mm_vision_select_feature="patch", + mm_vision_select_layer=-2, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.mm_vision_select_feature = mm_vision_select_feature + self.mm_vision_select_layer = mm_vision_select_layer + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "llava_phi-phi": + config_dict = 
config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class ProjectorConfig(PretrainedConfig): + model_type = "llava_phi_projector" + + def __init__( + self, + mm_projector_type="linear", + mm_hidden_size=768, + hidden_size=2560, + **kwargs + ): + self.mm_projector_type = mm_projector_type + self.mm_hidden_size = mm_hidden_size + self.hidden_size = hidden_size + super().__init__(**kwargs) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "llava_phi-phi": + config_dict = config_dict["projector_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
class LlavaPhiConfig(PhiConfig):
    """Phi configuration extended with a ``vision_config`` dictionary.

    Args:
        vision_config: Dictionary holding the vision settings. When ``None``
            a private copy of ``DEFAULT_VISUAL_CONFIG`` is used.
        **kwargs: Forwarded unchanged to ``PhiConfig``.
    """

    model_type = "llava_phi"

    def __init__(self, vision_config=None, **kwargs):
        import copy  # local import keeps this block self-contained

        if vision_config is None:
            # Deep-copy the module-level default: storing it by reference
            # would let a mutation on one config leak into every other
            # config created with the default.
            self.vision_config = copy.deepcopy(DEFAULT_VISUAL_CONFIG)
        else:
            self.vision_config = vision_config

        super().__init__(**kwargs)
None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal( + input_ids, attention_mask, past_key_values, labels, images) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model/pipeline parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + 
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """Assemble the keyword arguments for one generation step.

    ``inputs_embeds`` is used only while ``past_key_values`` is ``None``;
    otherwise ``input_ids`` is used, reduced to its last column once
    ``past_key_values`` is truthy. ``use_cache`` and ``images`` are read
    from ``kwargs``.
    """
    embeds_only_step = inputs_embeds is not None and past_key_values is None

    if embeds_only_step:
        step_inputs = {"inputs_embeds": inputs_embeds}
    else:
        # With a populated cache only the newest token needs to be fed.
        ids_for_step = input_ids[:, -1:] if past_key_values else input_ids
        step_inputs = {"input_ids": ids_for_step}

    step_inputs["past_key_values"] = past_key_values
    step_inputs["use_cache"] = kwargs.get("use_cache")
    step_inputs["attention_mask"] = attention_mask
    step_inputs["images"] = kwargs.get("images", None)
    return step_inputs
class LlavaMetaModel:
    """Mixin that attaches a CLIP vision tower and a projector to a model."""

    def __init__(self, config):
        super().__init__(config)

        # Both sub-configs are read from the `vision_config` dictionary.
        tower_cfg = LlavaPhiVisionConfig(**config.vision_config["vision_tower"])
        self.vision_tower = CLIPVisionTower(tower_cfg)

        projector_cfg = ProjectorConfig(**config.vision_config["mm_projector"])
        self.mm_projector = build_vision_projector(projector_cfg)

    def get_vision_tower(self):
        """Return the vision tower, unwrapping a list, or ``None`` if unset."""
        tower = getattr(self, 'vision_tower', None)
        return tower[0] if type(tower) is list else tower
self.encode_images(concat_images) + split_sizes = [image.shape[0] for image in images] + image_features = torch.split(image_features, split_sizes, dim=0) + image_features = [x.flatten(0, 1) for x in image_features] + else: + image_features = self.encode_images(images) + + new_input_embeds = [] + new_labels = [] if labels is not None else None + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0: + # multimodal LLM, but the current sample is not multimodal + # FIXME: this is a hacky fix, for deepspeed zero3 to work + half_len = cur_input_ids.shape[0] // 2 + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len]) + cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:]) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0) + new_input_embeds.append(cur_input_embeds) + if labels is not None: + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0] + cur_new_input_embeds = [] + if labels is not None: + cur_labels = labels[batch_idx] + cur_new_labels = [] + assert cur_labels.shape == cur_input_ids.shape + while image_token_indices.numel() > 0: + cur_image_features = image_features[cur_image_idx] + image_token_start = image_token_indices[0] + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start-1]).detach()) + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start-1:image_token_start])) + cur_new_input_embeds.append(cur_image_features) + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2])) + if labels is not None: + 
cur_new_labels.append(cur_labels[:image_token_start]) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype)) + cur_new_labels.append(cur_labels[image_token_start:image_token_start+1]) + cur_labels = cur_labels[image_token_start+2:] + else: + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start])) + cur_new_input_embeds.append(cur_image_features) + if labels is not None: + cur_new_labels.append(cur_labels[:image_token_start]) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype)) + cur_labels = cur_labels[image_token_start+1:] + cur_image_idx += 1 + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + cur_input_ids = cur_input_ids[image_token_start+2:] + else: + cur_input_ids = cur_input_ids[image_token_start+1:] + image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0] + if cur_input_ids.numel() > 0: + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids).detach()) + else: + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids)) + if labels is not None: + cur_new_labels.append(cur_labels) + cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds] + cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0) + new_input_embeds.append(cur_new_input_embeds) + if labels is not None: + cur_new_labels = torch.cat(cur_new_labels, dim=0) + new_labels.append(cur_new_labels) + + if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds): + max_len = max(x.shape[0] for x in new_input_embeds) + + new_input_embeds_align = [] + for cur_new_embed in new_input_embeds: + cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - 
cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0) + new_input_embeds_align.append(cur_new_embed) + new_input_embeds = torch.stack(new_input_embeds_align, dim=0) + + if labels is not None: + new_labels_align = [] + _new_labels = new_labels + for cur_new_label in new_labels: + cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0) + new_labels_align.append(cur_new_label) + new_labels = torch.stack(new_labels_align, dim=0) + + if attention_mask is not None: + new_attention_mask = [] + for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels): + new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device) + new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device) + cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0) + new_attention_mask.append(cur_new_attention_mask) + attention_mask = torch.stack(new_attention_mask, dim=0) + assert attention_mask.shape == new_labels.shape + else: + new_input_embeds = torch.stack(new_input_embeds, dim=0) + if labels is not None: + new_labels = torch.stack(new_labels, dim=0) + + if attention_mask is not None: + new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device) + attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1) + assert attention_mask.shape == new_input_embeds.shape[:2] + + return None, attention_mask, past_key_values, new_input_embeds, new_labels + + def initialize_vision_tokenizer(self, model_args, tokenizer): + if 
class CLIPVisionTower(CLIPPreTrainedModel):
    """CLIP vision transformer that returns one selected hidden-state layer."""

    config_class = LlavaPhiVisionConfig

    def __init__(self, config):
        super().__init__(config)

        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden layer and optionally drop position 0."""
        layer_output = image_forward_outs.hidden_states[
            self.config.mm_vision_select_layer
        ]
        mode = self.config.mm_vision_select_feature
        if mode == "cls_patch":
            return layer_output
        if mode == "patch":
            # Skip the first position along dim 1.
            return layer_output[:, 1:]
        raise ValueError(
            f"Unexpected select feature: {self.config.mm_vision_select_feature}"
        )

    def forward(self, images):
        """Encode a batch tensor, or a list of single images one at a time."""
        if type(images) is not list:
            batch_outs = self.vision_model(
                images.to(device=self.device, dtype=self.dtype),
                output_hidden_states=True,
            )
            return self.feature_select(batch_outs).to(images.dtype)

        per_image_features = []
        for single in images:
            # Each list item gets a leading batch dimension of size 1.
            batched = single.to(device=self.device, dtype=self.dtype).unsqueeze(0)
            single_outs = self.vision_model(batched, output_hidden_states=True)
            per_image_features.append(self.feature_select(single_outs).to(single.dtype))
        return per_image_features

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        # Taken from the first parameter of the vision model.
        first_param = list(self.vision_model.parameters())[0]
        return first_param.dtype

    @property
    def device(self):
        # Taken from the first parameter of the vision model.
        first_param = list(self.vision_model.parameters())[0]
        return first_param.device

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        patches_per_side = self.config.image_size // self.config.patch_size
        return patches_per_side ** 2
class IdentityMap(nn.Module):
    """Projector that hands its input back unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        # Extra positional and keyword arguments are accepted and ignored.
        return x

    @property
    def config(self):
        return dict(mm_projector_type="identity")


class SimpleResBlock(nn.Module):
    """LayerNorm followed by a two-layer GELU MLP with a residual add."""

    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        first = nn.Linear(channels, channels)
        activation = nn.GELU()
        second = nn.Linear(channels, channels)
        self.proj = nn.Sequential(first, activation, second)

    def forward(self, x):
        normed = self.pre_norm(x)
        # The residual is taken from the normalized tensor, not the raw input.
        return normed + self.proj(normed)


def build_vision_projector(config):
    """Create the projector named by ``config.mm_projector_type``.

    Recognized values are ``"linear"`` (also the fallback when the attribute
    is missing), ``"mlp<N>x_gelu"`` and ``"identity"``.

    Raises:
        ValueError: For any other projector type.
    """
    kind = getattr(config, "mm_projector_type", "linear")

    if kind == "identity":
        return IdentityMap()

    if kind == "linear":
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    depth_match = re.match(r"^mlp(\d+)x_gelu$", kind)
    if depth_match is None:
        raise ValueError(f"Unknown projector type: {kind}")

    depth = int(depth_match.group(1))
    layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
    built = 1
    while built < depth:
        # Every additional level is a GELU followed by a square Linear.
        layers.append(nn.GELU())
        layers.append(nn.Linear(config.hidden_size, config.hidden_size))
        built += 1
    return nn.Sequential(*layers)
+++ b/llava-phi/llava_phi/__init__.py @@ -0,0 +1 @@ +from .model import LlavaPhiForCausalLM diff --git a/llava-phi/llava_phi/constants.py b/llava-phi/llava_phi/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..be8cf0204969a6c973f442b383d8e425d684e826 --- /dev/null +++ b/llava-phi/llava_phi/constants.py @@ -0,0 +1,12 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." + +# Model Constants +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" diff --git a/llava-phi/llava_phi/conversation.py b/llava-phi/llava_phi/conversation.py new file mode 100644 index 0000000000000000000000000000000000000000..be382e5dd601555b67ac59a3952871947037e3ab --- /dev/null +++ b/llava-phi/llava_phi/conversation.py @@ -0,0 +1,224 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Tuple + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + LLAMA_2 = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self): + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = self.messages.copy() + init_role, init_msg = messages[0].copy() + init_msg = init_msg[0].replace("", "").strip() + if 'mmtag' in self.version: + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "")) + messages.insert(1, (self.roles[1], "Received.")) + else: + messages[0] = (init_role, "\n" + init_msg) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message 
in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + import base64 + from io import BytesIO + from PIL import Image + msg, image, image_process_mode = msg + if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 800, 400 + shortest_edge = 
int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if longest_edge != max(image.size): + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + images.append(image) + else: + buffered = BytesIO() + image.save(buffered, format="PNG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + images.append(img_b64_str) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + import base64 + from io import BytesIO + msg, image, image_process_mode = msg + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 800, 400 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + img_str = f'user upload image' + msg = img_str + msg.replace('', '').strip() + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": 
self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_phi_v0 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v0", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="<|endoftext|>", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", +) + +default_conversation = conv_phi_v0 +conv_templates = { + "default": conv_phi_v0, + "v0": conv_phi_v0, + "phi-2_v0": conv_phi_v0, + + "plain": conv_llava_plain, +} + + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/llava-phi/llava_phi/eval/eval_gpt_review.py b/llava-phi/llava_phi/eval/eval_gpt_review.py new file mode 100644 index 0000000000000000000000000000000000000000..8af4559c65fc2728b11fd2097a109981ee1ef686 --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_gpt_review.py @@ -0,0 +1,113 @@ +import argparse +import json +import os + +import openai +import tqdm +import ray +import time + +NUM_SECONDS_TO_SLEEP = 3 + +@ray.remote(num_cpus=4) +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model='gpt-4', + messages=[{ + 'role': 'system', + 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
+ }, { + 'role': 'user', + 'content': content, + }], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + print('success!') + return response['choices'][0]['message']['content'] + + +def parse_score(review): + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print('error', review) + return [-1, -1] + except Exception as e: + print(e) + print('error', review) + return [-1, -1] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-q', '--question') + # parser.add_argument('-a', '--answer') + parser.add_argument('-a', '--answer-list', nargs='+', default=[]) + parser.add_argument('-r', '--rule') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + ray.init() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) + + review_file = open(f'{args.output}', 'w') + + js_list = [] + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + # if idx == 1: + # break + + ques = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + category = json.loads(ques_js)['category'] + if category in rule_dict: + rule = rule_dict[category] + else: + rule = rule_dict['default'] + prompt = rule['prompt'] + role = rule['role'] + content = (f'[Question]\n{ques["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 
2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + js_list.append({ + 'id': idx+1, + 'question_id': ques['question_id'], + 'answer1_id': ans1['answer_id'], + 'answer2_id': ans2['answer_id'], + 'category': category}) + idx += 1 + handles.append(get_eval.remote(content, args.max_tokens)) + # To avoid the rate limit set by OpenAI + time.sleep(NUM_SECONDS_TO_SLEEP) + + reviews = ray.get(handles) + for idx, review in enumerate(reviews): + scores = parse_score(review) + js_list[idx]['content'] = review + js_list[idx]['tuple'] = scores + review_file.write(json.dumps(js_list[idx]) + '\n') + review_file.close() diff --git a/llava-phi/llava_phi/eval/eval_gpt_review_bench.py b/llava-phi/llava_phi/eval/eval_gpt_review_bench.py new file mode 100644 index 0000000000000000000000000000000000000000..06160f2422b5368f30fb967f7cae635208a1dc69 --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_gpt_review_bench.py @@ -0,0 +1,121 @@ +import argparse +import json +import os + +import openai +import time + +NUM_SECONDS_TO_SLEEP = 0.5 + + +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model='gpt-4-0314', + messages=[{ + 'role': 'system', + 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
+ }, { + 'role': 'user', + 'content': content, + }], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + return response['choices'][0]['message']['content'] + + +def parse_score(review): + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print('error', review) + return [-1, -1] + except Exception as e: + print(e) + print('error', review) + return [-1, -1] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-q', '--question') + parser.add_argument('-c', '--context') + parser.add_argument('-a', '--answer-list', nargs='+', default=[]) + parser.add_argument('-r', '--rule') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) + + if os.path.isfile(os.path.expanduser(args.output)): + cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] + else: + cur_reviews = [] + + review_file = open(f'{args.output}', 'a') + + context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] + image_to_context = {context['image']: context for context in context_list} + + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + ques = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + inst = image_to_context[ques['image']] + + if 
isinstance(inst['caption'], list): + cap_str = '\n'.join(inst['caption']) + else: + cap_str = inst['caption'] + + category = 'llava_bench_' + json.loads(ques_js)['category'] + if category in rule_dict: + rule = rule_dict[category] + else: + assert False, f"Visual QA category not found in rule file: {category}." + prompt = rule['prompt'] + role = rule['role'] + content = (f'[Context]\n{cap_str}\n\n' + f'[Question]\n{ques["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + cur_js = { + 'id': idx+1, + 'question_id': ques['question_id'], + 'answer1_id': ans1.get('answer_id', ans1['question_id']), + 'answer2_id': ans2.get('answer_id', ans2['answer_id']), + 'category': category + } + if idx >= len(cur_reviews): + review = get_eval(content, args.max_tokens) + scores = parse_score(review) + cur_js['content'] = review + cur_js['tuple'] = scores + review_file.write(json.dumps(cur_js) + '\n') + review_file.flush() + else: + print(f'Skipping {idx} as we already have it.') + idx += 1 + print(idx) + review_file.close() diff --git a/llava-phi/llava_phi/eval/eval_gpt_review_visual.py b/llava-phi/llava_phi/eval/eval_gpt_review_visual.py new file mode 100644 index 0000000000000000000000000000000000000000..d6e407a400a67020d801e6c27a3c32a2ee38f30c --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_gpt_review_visual.py @@ -0,0 +1,118 @@ +import argparse +import json +import os + +import openai +import time + +NUM_SECONDS_TO_SLEEP = 0.5 + + +def get_eval(content: str, max_tokens: int): + while True: + try: + response = openai.ChatCompletion.create( + model='gpt-4-0314', + messages=[{ + 'role': 'system', + 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
+ }, { + 'role': 'user', + 'content': content, + }], + temperature=0.2, # TODO: figure out which temperature is best for evaluation + max_tokens=max_tokens, + ) + break + except openai.error.RateLimitError: + pass + except Exception as e: + print(e) + time.sleep(NUM_SECONDS_TO_SLEEP) + + return response['choices'][0]['message']['content'] + + +def parse_score(review): + try: + score_pair = review.split('\n')[0] + score_pair = score_pair.replace(',', ' ') + sp = score_pair.split(' ') + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + print('error', review) + return [-1, -1] + except Exception as e: + print(e) + print('error', review) + return [-1, -1] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-q', '--question') + parser.add_argument('-c', '--context') + parser.add_argument('-a', '--answer-list', nargs='+', default=[]) + parser.add_argument('-r', '--rule') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + f_q = open(os.path.expanduser(args.question)) + f_ans1 = open(os.path.expanduser(args.answer_list[0])) + f_ans2 = open(os.path.expanduser(args.answer_list[1])) + rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) + + if os.path.isfile(os.path.expanduser(args.output)): + cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] + else: + cur_reviews = [] + + review_file = open(f'{args.output}', 'a') + + context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] + image_to_context = {context['image']: context for context in context_list} + + handles = [] + idx = 0 + for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): + ques = json.loads(ques_js) + ans1 = json.loads(ans1_js) + ans2 = json.loads(ans2_js) + + inst = image_to_context[ques['image']] + cap_str 
= '\n'.join(inst['captions']) + box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) + + category = json.loads(ques_js)['category'] + if category in rule_dict: + rule = rule_dict[category] + else: + assert False, f"Visual QA category not found in rule file: {category}." + prompt = rule['prompt'] + role = rule['role'] + content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' + f'[Question]\n{ques["text"]}\n\n' + f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' + f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' + f'[System]\n{prompt}\n\n') + cur_js = { + 'id': idx+1, + 'question_id': ques['question_id'], + 'answer1_id': ans1.get('answer_id', ans1['question_id']), + 'answer2_id': ans2.get('answer_id', ans2['answer_id']), + 'category': category + } + if idx >= len(cur_reviews): + review = get_eval(content, args.max_tokens) + scores = parse_score(review) + cur_js['content'] = review + cur_js['tuple'] = scores + review_file.write(json.dumps(cur_js) + '\n') + review_file.flush() + else: + print(f'Skipping {idx} as we already have it.') + idx += 1 + print(idx) + review_file.close() diff --git a/llava-phi/llava_phi/eval/eval_pope.py b/llava-phi/llava_phi/eval/eval_pope.py new file mode 100644 index 0000000000000000000000000000000000000000..b115b8f2327ea9d972f9e41bcbb03c68be6b3508 --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_pope.py @@ -0,0 +1,81 @@ +import os +import json +import argparse + +def eval_pope(answers, label_file): + label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] + + for answer in answers: + text = answer['text'] + + # Only keep the first sentence + if text.find('.') != -1: + text = text.split('.')[0] + + text = text.replace(',', '') + words = text.split(' ') + if 'No' in words or 'not' in words or 'no' in words: + answer['text'] = 'no' + else: + answer['text'] = 'yes' + + for i in range(len(label_list)): + if label_list[i] == 'no': + label_list[i] = 0 + else: + label_list[i] = 
1 + + pred_list = [] + for answer in answers: + if answer['text'] == 'no': + pred_list.append(0) + else: + pred_list.append(1) + + pos = 1 + neg = 0 + yes_ratio = pred_list.count(1) / len(pred_list) + + TP, TN, FP, FN = 0, 0, 0, 0 + for pred, label in zip(pred_list, label_list): + if pred == pos and label == pos: + TP += 1 + elif pred == pos and label == neg: + FP += 1 + elif pred == neg and label == neg: + TN += 1 + elif pred == neg and label == pos: + FN += 1 + + print('TP\tFP\tTN\tFN\t') + print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) + + precision = float(TP) / float(TP + FP) + recall = float(TP) / float(TP + FN) + f1 = 2*precision*recall / (precision + recall) + acc = (TP + TN) / (TP + TN + FP + FN) + print('Accuracy: {}'.format(acc)) + print('Precision: {}'.format(precision)) + print('Recall: {}'.format(recall)) + print('F1 score: {}'.format(f1)) + print('Yes ratio: {}'.format(yes_ratio)) + print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-dir", type=str) + parser.add_argument("--question-file", type=str) + parser.add_argument("--result-file", type=str) + args = parser.parse_args() + + questions = [json.loads(line) for line in open(args.question_file)] + questions = {question['question_id']: question for question in questions} + answers = [json.loads(q) for q in open(args.result_file)] + for file in os.listdir(args.annotation_dir): + assert file.startswith('coco_pope_') + assert file.endswith('.json') + category = file[10:-5] + cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] + print('Category: {}, # samples: {}'.format(category, len(cur_answers))) + eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) + print("====================================") diff --git a/llava-phi/llava_phi/eval/eval_science_qa.py b/llava-phi/llava_phi/eval/eval_science_qa.py new file mode 100644 
index 0000000000000000000000000000000000000000..ccf206bbd7a5d6376eef82d61b3ef8bbe0f71c6c --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_science_qa.py @@ -0,0 +1,114 @@ +import argparse +import json +import os +import re +import random + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--base-dir', type=str) + parser.add_argument('--result-file', type=str) + parser.add_argument('--output-file', type=str) + parser.add_argument('--output-result', type=str) + parser.add_argument('--split', type=str, default='test') + parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result['question_id'] + caption = result['text'] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """ + Get the index (e.g. 2) from the prediction (e.g. 'C') + """ + if prediction in options[:len(choices)]: + return options.index(prediction) + else: + return -1 + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + predictions = [json.loads(line) for line in open(args.result_file)] + predictions = {pred['question_id']: pred for pred in predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + results = {'correct': [], 'incorrect': []} + sqa_results = {} + sqa_results['acc'] = None + sqa_results['correct'] = None + sqa_results['count'] = None + sqa_results['results'] = {} + sqa_results['outputs'] = {} + + for prob_id, prob in split_problems.items(): + if prob_id not in predictions: + pred = {'text': 'FAILED', 'prompt': 'Unknown'} + pred_text = 'FAILED' + else: + pred = predictions[prob_id] + 
pred_text = pred['text'] + + if pred_text in args.options: + answer = pred_text + elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": + answer = pred_text[0] + else: + pattern = re.compile(r'The answer is ([A-Z]).') + res = pattern.findall(pred_text) + if len(res) == 1: + answer = res[0] # 'A', 'B', ... + else: + answer = "FAILED" + + pred_idx = get_pred_idx(answer, prob['choices'], args.options) + + analysis = { + 'question_id': prob_id, + 'parsed_ans': answer, + 'ground_truth': args.options[prob['answer']], + 'question': pred['prompt'], + 'pred': pred_text, + 'is_multimodal': '' in pred['prompt'], + } + + sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) + sqa_results['outputs'][prob_id] = pred_text + + if pred_idx == prob['answer']: + results['correct'].append(analysis) + else: + results['incorrect'].append(analysis) + + correct = len(results['correct']) + total = len(results['correct']) + len(results['incorrect']) + + ###### IMG ###### + multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) + multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) + multimodal_total = multimodal_correct + multimodal_incorrect + ###### IMG ###### + + print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') + + sqa_results['acc'] = correct / total * 100 + sqa_results['correct'] = correct + sqa_results['count'] = total + + with open(args.output_file, 'w') as f: + json.dump(results, f, indent=2) + with open(args.output_result, 'w') as f: + json.dump(sqa_results, f, indent=2) diff --git a/llava-phi/llava_phi/eval/eval_science_qa_gpt4.py b/llava-phi/llava_phi/eval/eval_science_qa_gpt4.py new file mode 100644 index 0000000000000000000000000000000000000000..c2ff17c915481fb556aba6ec816a9e08f519c515 --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_science_qa_gpt4.py @@ 
-0,0 +1,104 @@ +import argparse +import json +import os +import re +import random +from collections import defaultdict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--base-dir', type=str) + parser.add_argument('--gpt4-result', type=str) + parser.add_argument('--our-result', type=str) + parser.add_argument('--split', type=str, default='test') + parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result['question_id'] + caption = result['text'] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """ + Get the index (e.g. 2) from the prediction (e.g. 'C') + """ + if prediction in options[:len(choices)]: + return options.index(prediction) + else: + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + our_predictions = [json.loads(line) for line in open(args.our_result)] + our_predictions = {pred['question_id']: pred for pred in our_predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] + + results = defaultdict(lambda: 0) + + for prob_id, prob in split_problems.items(): + if prob_id not in our_predictions: + continue + if prob_id not in gpt4_predictions: + continue + our_pred = our_predictions[prob_id]['text'] + gpt4_pred = gpt4_predictions[prob_id] + + pattern = re.compile(r'The answer is ([A-Z]).') + our_res = pattern.findall(our_pred) + if len(our_res) == 1: + our_answer = our_res[0] # 'A', 'B', ... 
+ else: + our_answer = "FAILED" + gpt4_res = pattern.findall(gpt4_pred) + if len(gpt4_res) == 1: + gpt4_answer = gpt4_res[0] # 'A', 'B', ... + else: + gpt4_answer = "FAILED" + + our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) + gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) + + if gpt4_answer == 'FAILED': + results['gpt4_failed'] += 1 + # continue + gpt4_pred_idx = our_pred_idx + # if our_pred_idx != prob['answer']: + # print(our_predictions[prob_id]['prompt']) + # print('-----------------') + # print(f'LECTURE: {prob["lecture"]}') + # print(f'SOLUTION: {prob["solution"]}') + # print('=====================') + else: + # continue + pass + # gpt4_pred_idx = our_pred_idx + + if gpt4_pred_idx == prob['answer']: + results['correct'] += 1 + else: + results['incorrect'] += 1 + + + if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: + results['correct_upperbound'] += 1 + + correct = results['correct'] + total = results['correct'] + results['incorrect'] + print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') + print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') + print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') + diff --git a/llava-phi/llava_phi/eval/eval_science_qa_gpt4_requery.py b/llava-phi/llava_phi/eval/eval_science_qa_gpt4_requery.py new file mode 100644 index 0000000000000000000000000000000000000000..698546e995d365d1ccc2c25a87e6c5cd681e6eb6 --- /dev/null +++ b/llava-phi/llava_phi/eval/eval_science_qa_gpt4_requery.py @@ -0,0 +1,149 @@ +import argparse +import json +import os +import re +import random +from collections import defaultdict + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--base-dir', type=str) + parser.add_argument('--gpt4-result', type=str) + 
parser.add_argument('--requery-result', type=str) + parser.add_argument('--our-result', type=str) + parser.add_argument('--output-result', type=str) + parser.add_argument('--split', type=str, default='test') + parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) + return parser.parse_args() + + +def convert_caps(results): + fakecaps = [] + for result in results: + image_id = result['question_id'] + caption = result['text'] + fakecaps.append({"image_id": int(image_id), "caption": caption}) + return fakecaps + + +def get_pred_idx(prediction, choices, options): + """ + Get the index (e.g. 2) from the prediction (e.g. 'C') + """ + if prediction in options[:len(choices)]: + return options.index(prediction) + else: + return random.choice(range(len(choices))) + + +if __name__ == "__main__": + args = get_args() + + base_dir = args.base_dir + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + our_predictions = [json.loads(line) for line in open(args.our_result)] + our_predictions = {pred['question_id']: pred for pred in our_predictions} + split_problems = {idx: problems[idx] for idx in split_indices} + + requery_predictions = [json.loads(line) for line in open(args.requery_result)] + requery_predictions = {pred['question_id']: pred for pred in requery_predictions} + + gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] + + results = defaultdict(lambda: 0) + + sqa_results = {} + sqa_results['acc'] = None + sqa_results['correct'] = None + sqa_results['count'] = None + sqa_results['results'] = {} + sqa_results['outputs'] = {} + + for prob_id, prob in split_problems.items(): + if prob_id not in our_predictions: + assert False + if prob_id not in gpt4_predictions: + assert False + our_pred = our_predictions[prob_id]['text'] + gpt4_pred = gpt4_predictions[prob_id] + if prob_id not in requery_predictions: + results['missing_requery'] += 
1 + requery_pred = "MISSING" + else: + requery_pred = requery_predictions[prob_id]['text'] + + pattern = re.compile(r'The answer is ([A-Z]).') + our_res = pattern.findall(our_pred) + if len(our_res) == 1: + our_answer = our_res[0] # 'A', 'B', ... + else: + our_answer = "FAILED" + + requery_res = pattern.findall(requery_pred) + if len(requery_res) == 1: + requery_answer = requery_res[0] # 'A', 'B', ... + else: + requery_answer = "FAILED" + + gpt4_res = pattern.findall(gpt4_pred) + if len(gpt4_res) == 1: + gpt4_answer = gpt4_res[0] # 'A', 'B', ... + else: + gpt4_answer = "FAILED" + + our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) + gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) + requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options) + + results['total'] += 1 + + if gpt4_answer == 'FAILED': + results['gpt4_failed'] += 1 + if gpt4_pred_idx == prob['answer']: + results['gpt4_correct'] += 1 + if our_pred_idx == prob['answer']: + results['gpt4_ourvisual_correct'] += 1 + elif gpt4_pred_idx == prob['answer']: + results['gpt4_correct'] += 1 + results['gpt4_ourvisual_correct'] += 1 + + if our_pred_idx == prob['answer']: + results['our_correct'] += 1 + + if requery_answer == 'FAILED': + sqa_results['results'][prob_id] = our_pred_idx + if our_pred_idx == prob['answer']: + results['requery_correct'] += 1 + else: + sqa_results['results'][prob_id] = requery_pred_idx + if requery_pred_idx == prob['answer']: + results['requery_correct'] += 1 + else: + print(f""" +Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']} +Our ({our_answer}): {our_pred} +GPT-4 ({gpt4_answer}): {gpt4_pred} +Requery ({requery_answer}): {requery_pred} +print("=====================================") +""") + + if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: + results['correct_upperbound'] += 1 + + total = results['total'] + print(f'Total: {total}, Our-Correct: {results["our_correct"]}, 
def prompt_processor(prompt):
    """Extract the lower-cased question text from an evaluation prompt.

    Three prompt layouts are recognised:

    * ``OCR tokens: ... Question: <question> Short answer:`` -- the question
      is the text between ``Question: `` and `` Short answer:``.
    * three newline-separated lines containing ``Reference OCR token: `` --
      the question is the second line when the prompt starts with the OCR
      line, otherwise the first line.
    * two newline-separated lines -- the question is the first line.

    Raises:
        ValueError: if the prompt matches none of the layouts. The original
            used ``assert False`` / an unchecked ``re.search`` result, which
            either vanishes under ``python -O`` or fails with an opaque
            ``AttributeError``.
    """
    lines = prompt.split('\n')

    if prompt.startswith('OCR tokens: '):
        match = re.search(r"Question: (.*?) Short answer:", prompt, re.DOTALL)
        if match is None:
            raise ValueError(f"Cannot find the question in OCR-style prompt: {prompt!r}")
        question = match.group(1)
    elif 'Reference OCR token: ' in prompt and len(lines) == 3:
        # The OCR reference line may come before or after the question.
        question = lines[1] if prompt.startswith('Reference OCR token:') else lines[0]
    elif len(lines) == 2:
        question = lines[0]
    else:
        raise ValueError(f"Unrecognised prompt format: {prompt!r}")

    return question.lower()
class EvalAIAnswerProcessor:
    """
    Processes an answer similar to Eval AI
        copied from
        https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897

    Calling an instance normalises a free-form answer string: lower-casing,
    punctuation handling, number-word -> digit mapping, article removal and
    contraction repair (see ``__call__`` for the exact order).
    """

    # Apostrophe-less (or misplaced-apostrophe) spellings -> canonical contraction.
    # The table is reproduced verbatim from the upstream evaluator.
    CONTRACTIONS = {
        "aint": "ain't",
        "arent": "aren't",
        "cant": "can't",
        "couldve": "could've",
        "couldnt": "couldn't",
        "couldn'tve": "couldn't've",
        "couldnt've": "couldn't've",
        "didnt": "didn't",
        "doesnt": "doesn't",
        "dont": "don't",
        "hadnt": "hadn't",
        "hadnt've": "hadn't've",
        "hadn'tve": "hadn't've",
        "hasnt": "hasn't",
        "havent": "haven't",
        "hed": "he'd",
        "hed've": "he'd've",
        "he'dve": "he'd've",
        "hes": "he's",
        "howd": "how'd",
        "howll": "how'll",
        "hows": "how's",
        "Id've": "I'd've",
        "I'dve": "I'd've",
        "Im": "I'm",
        "Ive": "I've",
        "isnt": "isn't",
        "itd": "it'd",
        "itd've": "it'd've",
        "it'dve": "it'd've",
        "itll": "it'll",
        "let's": "let's",
        "maam": "ma'am",
        "mightnt": "mightn't",
        "mightnt've": "mightn't've",
        "mightn'tve": "mightn't've",
        "mightve": "might've",
        "mustnt": "mustn't",
        "mustve": "must've",
        "neednt": "needn't",
        "notve": "not've",
        "oclock": "o'clock",
        "oughtnt": "oughtn't",
        "ow's'at": "'ow's'at",
        "'ows'at": "'ow's'at",
        "'ow'sat": "'ow's'at",
        "shant": "shan't",
        "shed've": "she'd've",
        "she'dve": "she'd've",
        "she's": "she's",
        "shouldve": "should've",
        "shouldnt": "shouldn't",
        "shouldnt've": "shouldn't've",
        "shouldn'tve": "shouldn't've",
        "somebody'd": "somebodyd",
        "somebodyd've": "somebody'd've",
        "somebody'dve": "somebody'd've",
        "somebodyll": "somebody'll",
        "somebodys": "somebody's",
        "someoned": "someone'd",
        "someoned've": "someone'd've",
        "someone'dve": "someone'd've",
        "someonell": "someone'll",
        "someones": "someone's",
        "somethingd": "something'd",
        "somethingd've": "something'd've",
        "something'dve": "something'd've",
        "somethingll": "something'll",
        "thats": "that's",
        "thered": "there'd",
        "thered've": "there'd've",
        "there'dve": "there'd've",
        "therere": "there're",
        "theres": "there's",
        "theyd": "they'd",
        "theyd've": "they'd've",
        "they'dve": "they'd've",
        "theyll": "they'll",
        "theyre": "they're",
        "theyve": "they've",
        "twas": "'twas",
        "wasnt": "wasn't",
        "wed've": "we'd've",
        "we'dve": "we'd've",
        "weve": "we've",
        "werent": "weren't",
        "whatll": "what'll",
        "whatre": "what're",
        "whats": "what's",
        "whatve": "what've",
        "whens": "when's",
        "whered": "where'd",
        "wheres": "where's",
        "whereve": "where've",
        "whod": "who'd",
        "whod've": "who'd've",
        "who'dve": "who'd've",
        "wholl": "who'll",
        "whos": "who's",
        "whove": "who've",
        "whyll": "why'll",
        "whyre": "why're",
        "whys": "why's",
        "wont": "won't",
        "wouldve": "would've",
        "wouldnt": "wouldn't",
        "wouldnt've": "wouldn't've",
        "wouldn'tve": "wouldn't've",
        "yall": "y'all",
        "yall'll": "y'all'll",
        "y'allll": "y'all'll",
        "yall'd've": "y'all'd've",
        "y'alld've": "y'all'd've",
        "y'all'dve": "y'all'd've",
        "youd": "you'd",
        "youd've": "you'd've",
        "you'dve": "you'd've",
        "youll": "you'll",
        "youre": "you're",
        "youve": "you've",
    }

    # Number words -> digit strings.
    NUMBER_MAP = {
        "none": "0",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
        "ten": "10",
    }
    ARTICLES = ["a", "an", "the"]
    # NOTE(review): `(?!<=\d)` is a negative look-AHEAD for the literal text
    # "<=" + digit, so it never rejects anything; it looks like the
    # look-behind `(?<!\d)` was intended. Kept byte-identical so scores stay
    # comparable with the upstream evaluator -- confirm before changing.
    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
    PUNCTUATIONS = [
        ";",
        r"/",
        "[",
        "]",
        '"',
        "{",
        "}",
        "(",
        ")",
        "=",
        "+",
        "\\",
        "_",
        "-",
        ">",
        "<",
        "@",
        "`",
        ",",
        "?",
        "!",
    ]

    def __init__(self, *args, **kwargs):
        # Stateless; arguments are accepted and ignored.
        pass

    def word_tokenize(self, word):
        """Lower-case ``word``, drop ',' and '?', and detach possessive 's."""
        word = word.lower()
        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
        return word.strip()

    def process_punctuation(self, in_text):
        """Remove or blank out punctuation, then strip periods.

        A punctuation mark is deleted outright when it touches a space in
        ``in_text`` or when ``in_text`` contains a digit-grouping comma;
        otherwise it is replaced by a space. Finally every period that is
        not followed by a digit is removed.
        """
        # Loop-invariant: depends only on the untouched input text.
        has_digit_comma = re.search(self.COMMA_STRIP, in_text) is not None

        out_text = in_text
        for p in self.PUNCTUATIONS:
            if has_digit_comma or p + " " in in_text or " " + p in in_text:
                out_text = out_text.replace(p, "")
            else:
                out_text = out_text.replace(p, " ")
        # The original passed `re.UNICODE` as the third positional argument,
        # which for Pattern.sub is `count` (so only 32 periods were removed).
        out_text = self.PERIOD_STRIP.sub("", out_text)
        return out_text

    def process_digit_article(self, in_text):
        """Map number words to digits, drop articles, repair contractions."""
        out_text = []
        for word in in_text.lower().split():
            # `.get`, not `.setdefault`: the lookup must not insert every
            # seen word into the shared class-level NUMBER_MAP.
            word = self.NUMBER_MAP.get(word, word)
            if word not in self.ARTICLES:
                out_text.append(word)
        for word_id, word in enumerate(out_text):
            if word in self.CONTRACTIONS:
                out_text[word_id] = self.CONTRACTIONS[word]
        return " ".join(out_text)

    def __call__(self, item):
        """Normalise ``item``: tokenize, flatten whitespace, strip
        punctuation, then apply digit/article/contraction processing."""
        item = self.word_tokenize(item)
        item = item.replace("\n", " ").replace("\t", " ").strip()
        item = self.process_punctuation(item)
        item = self.process_digit_article(item)
        return item
class STVQAANLSEvaluator:
    """Scores predictions with a thresholded normalised edit-distance similarity."""

    def __init__(self):
        import editdistance  # install with `pip install editdistance`

        self.get_edit_distance = editdistance.eval

    def get_anls(self, s1, s2):
        """Return the similarity of ``s1`` and ``s2`` (case/whitespace-insensitive).

        The similarity is ``1 - edit_distance / max(len)``; values below 0.5
        are clamped to 0.0.
        """
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()
        longest = max(len(s1), len(s2))
        if longest == 0:
            # Both strings are empty after stripping, i.e. identical; the
            # original divided by zero here.
            return 1.0
        iou = 1 - self.get_edit_distance(s1, s2) / longest
        anls = iou if iou >= 0.5 else 0.0
        return anls

    def eval_pred_list(self, pred_list):
        """Average, over ``pred_list``, the best score of each prediction
        against its ground-truth answers.

        Each entry is a dict with ``"pred_answer"`` (str) and
        ``"gt_answers"`` (iterable of str).
        """
        pred_scores = []
        for entry in pred_list:
            anls = max(
                self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
            )
            pred_scores.append(anls)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy
# new stopping implementation
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires once any keyword shows up in the text
    decoded from the tokens generated after the prompt."""

    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.tokenizer = tokenizer
        self.input_ids = input_ids
        # Prompt length in tokens; filled in on the first call.
        self.start_len = None

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if self.start_len is None:
            # First invocation only records where generation starts; it
            # never requests a stop.
            self.start_len = self.input_ids.shape[1]
            return False

        generated = output_ids[:, self.start_len:]
        # Only the first sequence of the batch is inspected.
        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        return any(keyword in decoded for keyword in self.keywords)
@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
    """Generate an answer for every question in a JSONL file.

    Args:
        model_name: model identifier or path (``~`` is expanded) passed to
            ``from_pretrained`` for both tokenizer and model.
        questions_file: JSONL file; each line must provide ``question_id``
            and ``text``.
        answers_file: output JSONL file, overwritten. One record per
            question with ``question_id``, ``text``, ``answer_id``,
            ``model_id`` and an empty ``metadata`` dict.
    """
    # Model
    disable_torch_init()
    model_name = os.path.expanduser(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 torch_dtype=torch.float16).cuda()

    # Context managers close both files even if generation raises; the
    # original never closed the question file at all.
    with open(os.path.expanduser(questions_file), "r") as ques_file, \
            open(os.path.expanduser(answers_file), "w") as ans_file:
        for line in tqdm(ques_file):
            # Parse each record once (the original parsed it three times and
            # also read an unused "category" field).
            record = json.loads(line)
            idx = record["question_id"]
            qs = record["text"]

            conv = default_conversation.copy()
            conv.append_message(conv.roles[0], qs)
            prompt = conv.get_prompt()
            inputs = tokenizer([prompt])
            input_ids = torch.as_tensor(inputs.input_ids).cuda()
            output_ids = model.generate(
                input_ids,
                do_sample=True,
                use_cache=True,
                temperature=0.7,
                max_new_tokens=1024,
                eos_token_id=tokenizer.eos_token_id,  # End of sequence token
                pad_token_id=tokenizer.eos_token_id,  # Pad token
            )
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

            # Cut the reply at the first separator after the prompt; if the
            # model never emitted one, append it so the slice below works.
            try:
                index = outputs.index(conv.sep, len(prompt))
            except ValueError:
                outputs += conv.sep
                index = outputs.index(conv.sep, len(prompt))

            # Skip the echoed prompt plus the assistant role tag and the two
            # characters that follow it.
            outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            # Flush per answer so partial results survive an interruption.
            ans_file.flush()
def eval_model(args):
    """Answer one chunk of image questions and write the results as JSONL.

    Reads from ``args``: ``model_path``, ``model_base``, ``question_file``,
    ``num_chunks``, ``chunk_idx``, ``answers_file``, ``conv_mode``,
    ``image_folder``, ``temperature``, ``top_p`` and ``num_beams``.

    Each output record holds ``question_id``, ``image_id``, ``prompt``,
    ``text``, ``answer_id``, ``model_id`` and an empty ``metadata`` dict.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    with open(os.path.expanduser(args.question_file), "r") as ques_file:
        questions = [json.loads(q) for q in ques_file]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.expanduser(args.answers_file)
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        # os.makedirs("") raises, which is what a bare file name such as the
        # default "answer.jsonl" used to trigger.
        os.makedirs(answers_dir, exist_ok=True)

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            idx = line["question_id"]
            image_file = line["image"]
            qs = line["text"]
            cur_prompt = qs
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            # Normalise palette / greyscale / RGBA files to 3-channel RGB
            # before preprocessing.
            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
            image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).cuda(),
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    eos_token_id=tokenizer.eos_token_id,  # End of sequence token
                    pad_token_id=tokenizer.eos_token_id,  # Pad token
                    max_new_tokens=1024,
                    use_cache=True
                )

            # The generated sequence is expected to start with the prompt
            # tokens; warn if it does not, then decode only the continuation.
            input_token_len = input_ids.shape[1]
            n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
            outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
            outputs = outputs.strip()
            if outputs.endswith(stop_str):
                outputs = outputs[:-len(stop_str)]
            outputs = outputs.strip()

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "image_id": image_file,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            # Flush per answer so partial results survive an interruption.
            ans_file.flush()
# Custom dataset class
class CustomDataset(Dataset):
    """Dataset yielding ``(input_ids, image_tensor)`` for each question.

    Args:
        questions: sequence of dicts with at least ``"image"`` and ``"text"``.
        image_folder: directory the ``"image"`` paths are relative to.
        tokenizer: tokenizer handed to ``tokenizer_image_token``.
        image_processor: processor handed to ``process_images``.
        model_config: model config; ``mm_use_im_start_end`` is read from it.
        conv_mode: key into ``conv_templates``. When left as ``None`` the
            module-level ``args.conv_mode`` is used, which is what the
            original code always did (and which only exists when the file
            is run as a script).
    """

    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, conv_mode=None):
        self.questions = questions
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config
        self.conv_mode = conv_mode

    def __getitem__(self, index):
        line = self.questions[index]
        image_file = line["image"]
        qs = line["text"]
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        # Prefer the explicitly supplied mode; fall back to the script-level
        # global for callers that do not pass one.
        conv_mode = self.conv_mode if self.conv_mode is not None else args.conv_mode
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]

        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

        return input_ids, image_tensor

    def __len__(self):
        return len(self.questions)
open(answers_file, "w") + + data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) + + for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): + idx = line["question_id"] + cur_prompt = line["text"] + + stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 + input_ids = input_ids.to(device='cuda', non_blocking=True) + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.to(device='cuda', non_blocking=True), + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + # no_repeat_ngram_size=3, + num_beams=args.num_beams, + max_new_tokens=128, + eos_token_id=tokenizer.eos_token_id, # End of sequence token + pad_token_id=tokenizer.eos_token_id, # Pad token + use_cache=True + ) + + input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + # ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, 
def is_none(value):
    """Return True when ``value`` stands for a missing cell.

    Missing means: ``None``, a float NaN, or the strings ``"nan"`` /
    ``"none"`` in any letter case.

    ``isinstance`` is used instead of an exact ``type(...) is float`` check
    so that float subclasses such as ``numpy.float64`` NaN are recognised
    as well.
    """
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and value.lower() in ('nan', 'none'):
        return True
    return False
def eval_model(args):
    """Answer one chunk of multiple-choice questions and write JSONL results.

    Reads from ``args``: ``model_path``, ``model_base``, ``question_file``,
    ``num_chunks``, ``chunk_idx``, ``answers_file``, ``conv_mode``,
    ``all_rounds``, ``single_pred_prompt``, ``lang``, ``temperature``,
    ``top_p`` and ``num_beams``. ``args.conv_mode`` may be rewritten in
    place (see the "plain model" switch below).

    With ``args.all_rounds`` each question is asked once per option, the
    option order being rotated by one position after every round; otherwise
    it is asked once.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = pd.read_table(os.path.expanduser(args.question_file))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.expanduser(args.answers_file)
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        # os.makedirs("") raises, which is what a bare file name such as the
        # default "answer.jsonl" used to trigger.
        os.makedirs(answers_dir, exist_ok=True)

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    # The context manager closes the answers file even if generation raises.
    with open(answers_file, "w") as ans_file:
        for index, row in tqdm(questions.iterrows(), total=len(questions)):
            options = get_options(row, all_options)
            cur_option_char = all_options[:len(options)]

            num_rounds = len(options) if args.all_rounds else 1

            # Per-question values that do not change between rounds; the
            # image is decoded and preprocessed once instead of every round.
            idx = row['index']
            hint = row['hint']
            image = load_image_from_base64(row['image'])
            image_tensor = process_images([image], image_processor, model.config)[0]

            for round_idx in range(num_rounds):
                question = row['question']
                if not is_none(hint):
                    question = hint + '\n' + question
                # Letters stay A, B, C, ... while `options` is rotated.
                for option_char, option in zip(all_options[:len(options)], options):
                    question = question + '\n' + option_char + '. ' + option
                qs = cur_prompt = question
                if model.config.mm_use_im_start_end:
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

                if args.single_pred_prompt:
                    if args.lang == 'cn':
                        qs = qs + '\n' + "请直接回答选项字母。"
                    else:
                        qs = qs + '\n' + "Answer with the option's letter from the given choices directly."

                conv = conv_templates[args.conv_mode].copy()
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                prompt = conv.get_prompt()

                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

                stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2

                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=image_tensor.unsqueeze(0).cuda(),
                        do_sample=args.temperature > 0,
                        temperature=args.temperature,
                        top_p=args.top_p,
                        num_beams=args.num_beams,
                        # no_repeat_ngram_size=3,
                        eos_token_id=tokenizer.eos_token_id,  # End of sequence token
                        pad_token_id=tokenizer.eos_token_id,  # Pad token
                        max_new_tokens=1024,
                        use_cache=True
                    )

                # The generated sequence is expected to start with the prompt
                # tokens; warn if not, then decode only the continuation.
                input_token_len = input_ids.shape[1]
                n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
                if n_diff_input_output > 0:
                    print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
                outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
                outputs = outputs.strip()
                if outputs.endswith(stop_str):
                    outputs = outputs[:-len(stop_str)]
                outputs = outputs.strip()

                ans_id = shortuuid.uuid()
                ans_file.write(json.dumps({"question_id": idx,
                                           "round_id": round_idx,
                                           "prompt": cur_prompt,
                                           "text": outputs,
                                           "options": options,
                                           "option_char": cur_option_char,
                                           "answer_id": ans_id,
                                           "model_id": model_name,
                                           "metadata": {}}) + "\n")
                # Flush per answer so partial results survive an interruption.
                ans_file.flush()

                # rotate options
                options = options[1:] + options[:1]
                cur_option_char = cur_option_char[1:] + cur_option_char[:1]
import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria + +from PIL import Image +import math + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + print(model) + questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] + questions = get_chunk(questions, args.num_chunks, args.chunk_idx)[:10] + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + for line in tqdm(questions): + idx = line["question_id"] + image_file = line["image"] + qs = line["text"] + cur_prompt = qs + if model.config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + image = Image.open(os.path.join(args.image_folder, image_file)) + image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) + + with torch.inference_mode(): + output_ids = 
model.generate( + input_ids, + images=image_tensor.unsqueeze(0).cuda(), + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + # no_repeat_ngram_size=3, + eos_token_id=tokenizer.eos_token_id, # End of sequence token + pad_token_id=tokenizer.eos_token_id, # Pad token + max_new_tokens=1024, + use_cache=True) + + input_token_len = input_ids.shape[1] + print(output_ids[:, input_token_len:]) + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "image_id": image_file, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + args = parser.parse_args() + print(args) + + 
eval_model(args) diff --git a/llava-phi/llava_phi/eval/model_vqa_science.py b/llava-phi/llava_phi/eval/model_vqa_science.py new file mode 100644 index 0000000000000000000000000000000000000000..1ae2f0426ad8572ec543e2687014bc9dd784367c --- /dev/null +++ b/llava-phi/llava_phi/eval/model_vqa_science.py @@ -0,0 +1,152 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm +import shortuuid + +from llava_phi.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava_phi.conversation import conv_templates, SeparatorStyle +from llava_phi.model.builder import load_pretrained_model +from llava_phi.utils import disable_torch_init +from llava_phi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria + +from PIL import Image +import math + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + questions = json.load(open(os.path.expanduser(args.question_file), "r")) + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + for i, line in enumerate(tqdm(questions)): + idx = line["id"] + question = line['conversations'][0] + qs = question['value'].replace('', '').strip() + cur_prompt = qs + + if 'image' in line: + image_file = line["image"] + image = Image.open(os.path.join(args.image_folder, 
image_file)) + image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + images = image_tensor.unsqueeze(0).cuda() + if getattr(model.config, 'mm_use_im_start_end', False): + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + cur_prompt = '' + '\n' + cur_prompt + else: + images = None + + if args.single_pred_prompt: + qs = qs + '\n' + "Answer with the option's letter from the given choices directly." + cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = [KeywordsStoppingCriteria(keywords, tokenizer, input_ids)] if conv.version == "v0" else None + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=images, + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + max_new_tokens=1024, + eos_token_id=tokenizer.eos_token_id, # End of sequence token + pad_token_id=tokenizer.eos_token_id, # Pad token + use_cache=True + # stopping_criteria=stopping_criteria, + ) + + input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + + # prompt for answer + if 
args.answer_prompter: + outputs_reasoning = outputs + input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=images, + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + max_new_tokens=64, + eos_token_id=tokenizer.eos_token_id, # End of sequence token + pad_token_id=tokenizer.eos_token_id, # Pad token + use_cache=True + # stopping_criteria=[stopping_criteria] + ) + + input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + outputs = outputs_reasoning + '\n The answer is ' + outputs + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.json") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v0") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", 
type=float, default=0.2) + parser.add_argument("--answer-prompter", action="store_true") + parser.add_argument("--single-pred-prompt", action="store_true") + args = parser.parse_args() + + eval_model(args) diff --git a/llava-phi/llava_phi/eval/qa_baseline_gpt35.py b/llava-phi/llava_phi/eval/qa_baseline_gpt35.py new file mode 100644 index 0000000000000000000000000000000000000000..babab6e12b4bb8cfa74a7edfa5e56cd1b3e2bf6c --- /dev/null +++ b/llava-phi/llava_phi/eval/qa_baseline_gpt35.py @@ -0,0 +1,74 @@ +"""Generate answers with GPT-3.5""" +# Note: you need to be using OpenAI Python v0.27.0 for the code below to work +import argparse +import json +import os +import time +import concurrent.futures + +import openai +import tqdm +import shortuuid + +MODEL = 'gpt-3.5-turbo' +MODEL_ID = 'gpt-3.5-turbo:20230327' + +def get_answer(question_id: int, question: str, max_tokens: int): + ans = { + 'answer_id': shortuuid.uuid(), + 'question_id': question_id, + 'model_id': MODEL_ID, + } + for _ in range(3): + try: + response = openai.ChatCompletion.create( + model=MODEL, + messages=[{ + 'role': 'system', + 'content': 'You are a helpful assistant.' 
+ }, { + 'role': 'user', + 'content': question, + }], + max_tokens=max_tokens, + ) + ans['text'] = response['choices'][0]['message']['content'] + return ans + except Exception as e: + print('[ERROR]', e) + ans['text'] = '#ERROR#' + time.sleep(1) + return ans + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ChatGPT answer generation.') + parser.add_argument('-q', '--question') + parser.add_argument('-o', '--output') + parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') + args = parser.parse_args() + + questions_dict = {} + with open(os.path.expanduser(args.question)) as f: + for line in f: + if not line: + continue + q = json.loads(line) + questions_dict[q['question_id']] = q['text'] + + answers = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: + futures = [] + for qid, question in questions_dict.items(): + future = executor.submit(get_answer, qid, question, args.max_tokens) + futures.append(future) + + for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + answers.append(future.result()) + + answers.sort(key=lambda x: x['question_id']) + + with open(os.path.expanduser(args.output), 'w') as f: + table = [json.dumps(ans) for ans in answers] + f.write('\n'.join(table)) diff --git a/llava-phi/llava_phi/eval/run_llava_phi.py b/llava-phi/llava_phi/eval/run_llava_phi.py new file mode 100644 index 0000000000000000000000000000000000000000..165764120bff787267d31b6b19bf8c5a705e81ae --- /dev/null +++ b/llava-phi/llava_phi/eval/run_llava_phi.py @@ -0,0 +1,93 @@ +import argparse +import torch + +from llava_phi.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava_phi.conversation import conv_templates, SeparatorStyle +from llava_phi.model.builder import load_pretrained_model +from llava_phi.utils import disable_torch_init +from llava_phi.mm_utils import 
tokenizer_image_token, get_model_name_from_path + +from PIL import Image + +import requests +from PIL import Image +from io import BytesIO + + +def load_image(image_file): + if image_file.startswith('http') or image_file.startswith('https'): + response = requests.get(image_file) + image = Image.open(BytesIO(response.content)).convert('RGB') + else: + image = Image.open(image_file).convert('RGB') + return image + + +def eval_model(args): + # Model + disable_torch_init() + + model_name = get_model_name_from_path(args.model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name) + + qs = args.query + if model.config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + if 'phi' in model_name.lower(): + conv_mode = "phi-2_v0" + else: + conv_mode = "default" + + if args.conv_mode is not None and conv_mode != args.conv_mode: + print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) + else: + args.conv_mode = conv_mode + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image = load_image(args.image_file) + image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].cuda() + + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor, + do_sample=True, + temperature=0.2, + max_new_tokens=1024, + eos_token_id=tokenizer.eos_token_id, # End of sequence token + pad_token_id=tokenizer.eos_token_id, # Pad token + use_cache=True, + ) + + 
input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + print(outputs) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-file", type=str, required=True) + parser.add_argument("--query", type=str, required=True) + parser.add_argument("--conv-mode", type=str, default=None) + args = parser.parse_args() + + eval_model(args) diff --git a/llava-phi/llava_phi/eval/summarize_gpt_review.py b/llava-phi/llava_phi/eval/summarize_gpt_review.py new file mode 100644 index 0000000000000000000000000000000000000000..0f796a3880341739677a5fe3bfbcc90515a0f324 --- /dev/null +++ b/llava-phi/llava_phi/eval/summarize_gpt_review.py @@ -0,0 +1,60 @@ +import json +import os +from collections import defaultdict + +import numpy as np + +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') + parser.add_argument('-d', '--dir', default=None) + parser.add_argument('-v', '--version', default=None) + parser.add_argument('-s', '--select', nargs='*', default=None) + parser.add_argument('-f', '--files', nargs='*', default=[]) + parser.add_argument('-i', '--ignore', nargs='*', default=[]) + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + + if args.ignore is not None: + args.ignore = [int(x) for x in args.ignore] + + if len(args.files) > 0: + review_files = args.files + else: + review_files = [x for x in os.listdir(args.dir) 
if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] + + for review_file in sorted(review_files): + config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') + if args.select is not None and any(x not in config for x in args.select): + continue + if '0613' in config: + version = '0613' + else: + version = '0314' + if args.version is not None and args.version != version: + continue + scores = defaultdict(list) + print(config) + with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: + for review_str in f: + review = json.loads(review_str) + if review['question_id'] in args.ignore: + continue + if 'category' in review: + scores[review['category']].append(review['tuple']) + scores['all'].append(review['tuple']) + else: + if 'tuple' in review: + scores['all'].append(review['tuple']) + else: + scores['all'].append(review['score']) + for k, v in sorted(scores.items()): + stats = np.asarray(v).mean(0).tolist() + stats = [round(x, 3) for x in stats] + # print(k, stats, round(stats[1]/stats[0]*100, 1)) + print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) + print('=================================') diff --git a/llava-phi/llava_phi/eval/table/rule.json b/llava-phi/llava_phi/eval/table/rule.json new file mode 100644 index 0000000000000000000000000000000000000000..26c7f4e0819bf0bafbace898f5f5a4f052490aa4 --- /dev/null +++ b/llava-phi/llava_phi/eval/table/rule.json @@ -0,0 +1,11 @@ +{ + "coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. 
Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."}, + "math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, + "default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. 
These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."} +} \ No newline at end of file diff --git a/llava-phi/llava_phi/mm_utils.py b/llava-phi/llava_phi/mm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a195b848febcf73106e8c6e62b67ae6f22589def --- /dev/null +++ b/llava-phi/llava_phi/mm_utils.py @@ -0,0 +1,96 @@ +from PIL import Image +from io import BytesIO +import base64 + +import torch +from transformers import StoppingCriteria +from llava_phi.constants import IMAGE_TOKEN_INDEX + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - 
height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_images(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == 'pad': + for image in images: + image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) + image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + new_images.append(image) + else: + return image_processor(images, return_tensors='pt')['pixel_values'] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + +def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): + prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] + + def insert_separator(X, sep): + return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] + + input_ids = [] + offset = 0 + if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: + offset = 1 + input_ids.append(prompt_chunks[0][0]) + for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): + input_ids.extend(x[offset:]) + + if return_tensors is not None: + if return_tensors == 'pt': + return torch.tensor(input_ids, dtype=torch.long) + raise ValueError(f'Unsupported tensor type: {return_tensors}') + return input_ids + + +def get_model_name_from_path(model_path): + model_path = model_path.strip("/") + model_paths = model_path.split("/") + if model_paths[-1].startswith('checkpoint-'): + return model_paths[-2] + "_" + model_paths[-1] + else: + return model_paths[-1] + + +class KeywordsStoppingCriteria(StoppingCriteria): + def __init__(self, keywords, tokenizer, input_ids): + self.keywords = keywords + self.keyword_ids = [] + 
for keyword in keywords: + cur_keyword_ids = tokenizer(keyword).input_ids + if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: + cur_keyword_ids = cur_keyword_ids[1:] + self.keyword_ids.append(torch.tensor(cur_keyword_ids)) + self.tokenizer = tokenizer + self.start_len = input_ids.shape[1] + + def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO + offset = min(output_ids.shape[1] - self.start_len, 3) + self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] + for keyword_id in self.keyword_ids: + if output_ids[0, -keyword_id.shape[0]:] == keyword_id: + return True + outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] + for keyword in self.keywords: + if keyword in outputs: + return True + return False diff --git a/llava-phi/llava_phi/model/__init__.py b/llava-phi/llava_phi/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d17d5fccde9017fa4cbd8cd94d17ef3202017cd --- /dev/null +++ b/llava-phi/llava_phi/model/__init__.py @@ -0,0 +1,2 @@ +from .language_model.llava_phi import LlavaPhiForCausalLM +from .language_model.configuration_llava_phi import LlavaPhiConfig, LlavaPhiVisionConfig, ProjectorConfig diff --git a/llava-phi/llava_phi/model/builder.py b/llava-phi/llava_phi/model/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..d873eaef35af085037a9bc8fefc5d20c49ae4029 --- /dev/null +++ b/llava-phi/llava_phi/model/builder.py @@ -0,0 +1,121 @@ +import os +import warnings +import shutil + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, CLIPImageProcessor +import torch +from llava_phi.model import * +from llava_phi.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + + +def load_pretrained_model(model_path, 
model_base, model_name, load_8bit=False, load_4bit=False, device_map="cuda", device="cuda"): + kwargs = {"device_map": device_map} + if load_8bit: + kwargs['load_in_8bit'] = True + elif load_4bit: + kwargs['load_in_4bit'] = True + kwargs['quantization_config'] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4' + ) + # else: # TODO: after fine-tuning LLava-Phi, load the model weights with fp16 will pose nan + # kwargs['torch_dtype'] = torch.float16 + + if 'phi' in model_name.lower(): + # Load LLaVA-Phi model + if 'lora' in model_name.lower() and model_base is None: + warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument.') + if 'lora' in model_name.lower() and model_base is not None: + lora_cfg_pretrained = AutoConfig.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + print('Loading LLaVA-Phi from base model...') + model = LlavaPhiForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) + token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features + if model.lm_head.weight.shape[0] != token_num: + model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + + print('Loading additional LLaVA-Phi weights...') + if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): + non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') + else: + # this is probably from HF Hub + from huggingface_hub import hf_hub_download + def load_from_hf(repo_id, filename, subfolder=None): + cache_file = hf_hub_download( + repo_id=repo_id, + 
filename=filename, + subfolder=subfolder) + return torch.load(cache_file, map_location='cpu') + non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') + non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} + if any(k.startswith('model.model.') for k in non_lora_trainables): + non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} + model.load_state_dict(non_lora_trainables, strict=False) + + from peft import PeftModel + print('Loading LoRA weights...') + model = PeftModel.from_pretrained(model, model_path) + print('Merging LoRA weights...') + model = model.merge_and_unload() + print('Model is loaded...') + elif model_base is not None: + # this may be mm projector only + print('Loading LLaVA-Phi from base model...') + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + cfg_pretrained = AutoConfig.from_pretrained(model_path) + model = LlavaPhiForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + + mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') + mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} + model.load_state_dict(mm_projector_weights, strict=False) + else: + print("load llaVA-Phi MLLM!!!") + config = LlavaPhiConfig.from_pretrained(model_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = LlavaPhiForCausalLM.from_pretrained( + model_path, + config=config, + use_safetensors=True, + **kwargs).to("cuda") + else: + # Load language model + if model_base is not None: + # PEFT model + from peft import PeftModel + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") + print(f"Loading 
LoRA weights from {model_path}") + model = PeftModel.from_pretrained(model, model_path) + print(f"Merging weights") + model = model.merge_and_unload() + print('Convert to FP16...') + model.to(torch.float16) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + + image_processor = CLIPImageProcessor.from_pretrained(model_path) + + if 'phi' in model_name.lower(): + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + + # TODO: the tokenizer length of phi-2 is 50295, but the output class of lm_head is 51200 + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + # model.resize_token_embeddings(len(tokenizer)) + else: + raise ValueError(f"Unsupported model name: {model_name}") + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + model.to(device="cuda") + print(kwargs) + return tokenizer, model, image_processor, context_len diff --git a/llava-phi/llava_phi/model/language_model/configuration_llava_phi.py b/llava-phi/llava_phi/model/language_model/configuration_llava_phi.py new file mode 100644 index 0000000000000000000000000000000000000000..db16c71955328fdfc0f6ce0ed76f12fde583c034 --- /dev/null +++ b/llava-phi/llava_phi/model/language_model/configuration_llava_phi.py @@ -0,0 +1,179 @@ +import os +from typing import Union +from transformers import PretrainedConfig, PhiConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class LlavaPhiVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPVisionModel`]. 
It is used to instantiate a + CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + mm_vision_select_feature (`str`, *optional*, defaults to `"patch"`): + The feature to select from the vision encoder output. Can be one of `"patch"` or `"cls_patch"`. + mm_vision_select_layer (`int`, *optional*, defaults to `-2`): + The layer to select from the vision encoder output. + + Example: + + ```python + >>> from transformers import CLIPVisionConfig, CLIPVisionModel + + >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPVisionConfig() + + >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava_phi_clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + mm_vision_select_feature="patch", + mm_vision_select_layer=-2, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + 
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
    """Load a vision-tower configuration from a checkpoint directory or hub id.

    When the loaded config is the composite LLaVA-Phi config, the vision part
    is extracted before instantiation.

    Args:
        pretrained_model_name_or_path: Location passed on to ``get_config_dict``.
        **kwargs: Forwarded to ``get_config_dict`` and then ``from_dict``.

    Returns:
        The configuration built by ``cls.from_dict``.
    """
    cls._set_token_in_kwargs(kwargs)

    config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

    # Unwrap the vision section of a composite LLaVA-Phi config. The composite
    # class in this module declares model_type "llava_phi"; the older
    # "llava_phi-phi" spelling is still accepted for backward compatibility.
    if config_dict.get("model_type") in ("llava_phi", "llava_phi-phi"):
        config_dict = config_dict["vision_config"]
        # The default composite layout nests the tower settings one level
        # deeper, under "vision_tower", next to "mm_projector".
        if "vision_tower" in config_dict:
            config_dict = config_dict["vision_tower"]

    if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
        logger.warning(
            f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
            f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
        )

    return cls.from_dict(config_dict, **kwargs)
class LlavaPhiConfig(PhiConfig):
    """Phi language-model configuration extended with a ``vision_config`` mapping.

    ``vision_config`` is a dict with a ``"vision_tower"`` entry and an
    ``"mm_projector"`` entry, each holding the serialized sub-configuration.
    """

    model_type = "llava_phi"

    def __init__(self, vision_config=None, **kwargs):
        """Store the vision settings, then run the Phi configuration init.

        Args:
            vision_config: Mapping with the vision tower and projector
                settings; the module default is used when ``None``.
            **kwargs: Forwarded to ``PhiConfig.__init__``.
        """
        import copy

        if vision_config is None:
            # Deep copy so that editing one instance's vision settings cannot
            # leak into the module-level default shared by other instances.
            self.vision_config = copy.deepcopy(DEFAULT_VISUAL_CONFIG)
        else:
            self.vision_config = vision_config

        super().__init__(**kwargs)
self.post_init() + + def get_model(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal( + input_ids, attention_mask, past_key_values, labels, images) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model/pipeline parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + 
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """Assemble the keyword arguments for one generation step.

    Args:
        input_ids: Token ids produced so far.
        past_key_values: Cache from earlier steps, if any.
        attention_mask: Passed through unchanged.
        inputs_embeds: Precomputed embeddings, used only when no cache exists.
        **kwargs: ``use_cache`` and ``images`` are read from here.

    Returns:
        Dict holding either ``inputs_embeds`` or ``input_ids``, followed by
        ``past_key_values``, ``use_cache``, ``attention_mask`` and ``images``.
    """
    # Once a cache is populated only the newest token has to be fed.
    if past_key_values:
        input_ids = input_ids[:, -1:]

    # Embeddings are honoured on the first step only, i.e. when no cache was
    # handed in at all.
    use_embeddings = inputs_embeds is not None and past_key_values is None
    primary = {"inputs_embeds": inputs_embeds} if use_embeddings else {"input_ids": input_ids}

    return {
        **primary,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "attention_mask": attention_mask,
        "images": kwargs.get("images", None),
    }
class LlavaMetaModel:
    """Mixin that attaches a CLIP vision tower and a multimodal projector to a model.

    It is meant to be combined with a language-model class through multiple
    inheritance; ``__init__`` forwards ``config`` to the next class in the MRO.
    """

    def __init__(self, config):
        """Build the vision tower and projector from ``config.vision_config``.

        Args:
            config: Model configuration whose ``vision_config`` mapping has a
                ``"vision_tower"`` and an ``"mm_projector"`` entry.
        """
        super(LlavaMetaModel, self).__init__(config)
        self.vision_tower = CLIPVisionTower(
            LlavaPhiVisionConfig(**config.vision_config["vision_tower"])
        )
        self.mm_projector = build_vision_projector(
            ProjectorConfig(**config.vision_config["mm_projector"])
        )

    def get_vision_tower(self):
        """Return the vision tower, unwrapping it when it is stored in a list.

        Returns:
            The tower object, or ``None`` when no ``vision_tower`` attribute
            has been set.
        """
        vision_tower = getattr(self, 'vision_tower', None)
        # isinstance also unwraps list subclasses, which an exact type
        # comparison would return still wrapped.
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower
self.encode_images(concat_images) + split_sizes = [image.shape[0] for image in images] + image_features = torch.split(image_features, split_sizes, dim=0) + image_features = [x.flatten(0, 1) for x in image_features] + else: + image_features = self.encode_images(images) + + new_input_embeds = [] + new_labels = [] if labels is not None else None + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0: + # multimodal LLM, but the current sample is not multimodal + # FIXME: this is a hacky fix, for deepspeed zero3 to work + half_len = cur_input_ids.shape[0] // 2 + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len]) + cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:]) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0) + new_input_embeds.append(cur_input_embeds) + if labels is not None: + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0] + cur_new_input_embeds = [] + if labels is not None: + cur_labels = labels[batch_idx] + cur_new_labels = [] + assert cur_labels.shape == cur_input_ids.shape + while image_token_indices.numel() > 0: + cur_image_features = image_features[cur_image_idx] + image_token_start = image_token_indices[0] + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start-1]).detach()) + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start-1:image_token_start])) + cur_new_input_embeds.append(cur_image_features) + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2])) + if labels is not None: + 
cur_new_labels.append(cur_labels[:image_token_start]) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype)) + cur_new_labels.append(cur_labels[image_token_start:image_token_start+1]) + cur_labels = cur_labels[image_token_start+2:] + else: + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start])) + cur_new_input_embeds.append(cur_image_features) + if labels is not None: + cur_new_labels.append(cur_labels[:image_token_start]) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype)) + cur_labels = cur_labels[image_token_start+1:] + cur_image_idx += 1 + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + cur_input_ids = cur_input_ids[image_token_start+2:] + else: + cur_input_ids = cur_input_ids[image_token_start+1:] + image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0] + if cur_input_ids.numel() > 0: + if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids).detach()) + else: + cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids)) + if labels is not None: + cur_new_labels.append(cur_labels) + cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds] + cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0) + new_input_embeds.append(cur_new_input_embeds) + if labels is not None: + cur_new_labels = torch.cat(cur_new_labels, dim=0) + new_labels.append(cur_new_labels) + + if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds): + max_len = max(x.shape[0] for x in new_input_embeds) + + new_input_embeds_align = [] + for cur_new_embed in new_input_embeds: + cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - 
cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0) + new_input_embeds_align.append(cur_new_embed) + new_input_embeds = torch.stack(new_input_embeds_align, dim=0) + + if labels is not None: + new_labels_align = [] + _new_labels = new_labels + for cur_new_label in new_labels: + cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0) + new_labels_align.append(cur_new_label) + new_labels = torch.stack(new_labels_align, dim=0) + + if attention_mask is not None: + new_attention_mask = [] + for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels): + new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device) + new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device) + cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0) + new_attention_mask.append(cur_new_attention_mask) + attention_mask = torch.stack(new_attention_mask, dim=0) + assert attention_mask.shape == new_labels.shape + else: + new_input_embeds = torch.stack(new_input_embeds, dim=0) + if labels is not None: + new_labels = torch.stack(new_labels, dim=0) + + if attention_mask is not None: + new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device) + attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1) + assert attention_mask.shape == new_input_embeds.shape[:2] + + return None, attention_mask, past_key_values, new_input_embeds, new_labels + + def initialize_vision_tokenizer(self, model_args, tokenizer): + if 
model_args.mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if model_args.mm_use_im_start_end: + num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = True + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False + + elif model_args.mm_use_im_patch_token: + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = False + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False diff --git a/llava-phi/llava_phi/model/multimodal_encoder/clip_encoder.py b/llava-phi/llava_phi/model/multimodal_encoder/clip_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5d7939d5ae0674fded20c52278b04dc2ef59936b --- /dev/null +++ b/llava-phi/llava_phi/model/multimodal_encoder/clip_encoder.py @@ -0,0 +1,89 @@ +from abc import ABC + +import torch +import torch.nn as nn + +from transformers import CLIPPreTrainedModel, CLIPVisionConfig +from transformers.models.clip.modeling_clip import CLIPVisionTransformer +from llava_phi.model.language_model.configuration_llava_phi import LlavaPhiVisionConfig + + +class CLIPVisionTower(CLIPPreTrainedModel): + config_class = LlavaPhiVisionConfig + + def __init__(self, config): + 
def feature_select(self, image_forward_outs):
    """Pick the configured hidden-state layer and token subset.

    Args:
        image_forward_outs: Vision model output exposing ``hidden_states``.

    Returns:
        The hidden state at ``config.mm_vision_select_layer``. For ``"patch"``
        the first position along dimension 1 is dropped; for ``"cls_patch"``
        the hidden state is returned unchanged.

    Raises:
        ValueError: If ``config.mm_vision_select_feature`` is neither
            ``"patch"`` nor ``"cls_patch"``.
    """
    selected = image_forward_outs.hidden_states[self.config.mm_vision_select_layer]
    mode = self.config.mm_vision_select_feature

    if mode == "patch":
        # Skip the leading position and keep the remaining tokens.
        return selected[:, 1:]
    if mode == "cls_patch":
        return selected
    raise ValueError(
        f"Unexpected select feature: {mode}"
    )
def build_vision_projector(config):
    """Create the module that maps vision features into the language hidden size.

    Supported ``config.mm_projector_type`` values (default ``"linear"``):
    ``"linear"`` gives a single linear layer, ``"mlp<N>x_gelu"`` gives N linear
    layers separated by GELU, and ``"identity"`` gives a pass-through module.

    Args:
        config: Object providing ``mm_hidden_size`` and ``hidden_size`` and,
            optionally, ``mm_projector_type``.

    Returns:
        The projector module.

    Raises:
        ValueError: If the projector type is not recognised.
    """
    kind = getattr(config, "mm_projector_type", "linear")

    if kind == "linear":
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    if kind == "identity":
        return IdentityMap()

    depth_match = re.match(r"^mlp(\d+)x_gelu$", kind)
    if not depth_match:
        raise ValueError(f"Unknown projector type: {kind}")

    # The first layer changes the width; each further layer keeps the hidden
    # width and is preceded by a GELU.
    extra_layers = range(1, int(depth_match.group(1)))
    layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
    for _ in extra_layers:
        layers.extend((nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)))
    return nn.Sequential(*layers)
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llava-phi/llava_phi/serve/__pycache__/__init__.cpython-310.pyc b/llava-phi/llava_phi/serve/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cde3ffc076a04bda47f526b7c0b4171fe97efa54 Binary files /dev/null and b/llava-phi/llava_phi/serve/__pycache__/__init__.cpython-310.pyc differ diff --git a/llava-phi/llava_phi/serve/__pycache__/cli.cpython-310.pyc b/llava-phi/llava_phi/serve/__pycache__/cli.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a1be045494e4cfd4d831bb6f8ef976b6caf2679 Binary files /dev/null and b/llava-phi/llava_phi/serve/__pycache__/cli.cpython-310.pyc differ diff --git a/llava-phi/llava_phi/serve/app.py b/llava-phi/llava_phi/serve/app.py new file mode 100644 index 0000000000000000000000000000000000000000..9e343464c91fa7b58c07853f97cd2e7853bfb78a --- /dev/null +++ b/llava-phi/llava_phi/serve/app.py @@ -0,0 +1,354 @@ +import argparse +import hashlib +import json +import os +import time +from threading import Thread + +import gradio as gr +import torch +from llava_phi.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from llava_phi.conversation import (SeparatorStyle, conv_templates, + default_conversation) +from llava_phi.mm_utils import (KeywordsStoppingCriteria, load_image_from_base64, + process_images, tokenizer_image_token) +from llava_phi.model.builder import load_pretrained_model +from transformers import TextIteratorStreamer + +print(gr.__version__) + +block_css = """ + +#buttons button { + min-width: min(120px,100%); +} +""" +title_markdown = (""" +# LLaVA-Phi: Efficient Multi-Modal Assistant with Small Language Model +[[Code](https://github.com/zhuyiche/llava-phi)] | 📚 [[Paper](https://arxiv.org/pdf/2401.02330)] +""") +tos_markdown = (""" +### Terms of use +By using this service, users are required to 
def add_text(state, text, image, image_process_mode):
    """Append the user's turn (text plus optional image) to the conversation.

    Args:
        state: current conversation object; a fresh copy of
            ``default_conversation`` replaces it when a new image is sent while
            the conversation already contains one.
        text: raw textbox content (``None`` is treated as empty).
        image: uploaded image, or ``None`` for a text-only turn.
        image_process_mode: preprocessing mode stored alongside the image.

    Returns:
        ``(state, chatbot_history, "", None)`` -- the last two values clear the
        textbox and the image box in the UI.
    """
    text = text or ""
    if not text and image is None:
        # Nothing to send: tell the follow-up generation step to do nothing.
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None)

    text = text[:1536]  # Hard cut-off
    if image is not None:
        text = text[:1200]  # Hard cut-off for images
        # The prompt must contain exactly one image placeholder per attached
        # image. The previous check compared against an empty string, which is
        # always "contained", so the placeholder was never appended.
        if DEFAULT_IMAGE_TOKEN not in text:
            text = text + '\n' + DEFAULT_IMAGE_TOKEN
        text = (text, image, image_process_mode)
        if len(state.get_images(return_pil=True)) > 0:
            # Only one image per conversation: start over when a new one arrives.
            state = default_conversation.copy()
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    return (state, state.to_gradio_chatbot(), "", None)
@torch.inference_mode()
def get_response(params):
    """Stream a completion for ``params`` as JSON-encoded byte chunks.

    Args:
        params: request dict. Reads ``"prompt"`` (required) and the optional
            keys ``"images"``, ``"temperature"``, ``"top_p"``,
            ``"max_new_tokens"`` and ``"stop"``.

    Yields:
        ``bytes``: UTF-8 JSON objects ``{"text": ..., "error_code": 0}`` where
        ``text`` is the original prompt followed by everything generated so far.

    Raises:
        ValueError: if the number of images differs from the number of image
            placeholders in the prompt.

    Relies on the module-level ``model``, ``tokenizer`` and ``image_processor``
    set up by the ``__main__`` section.
    """
    prompt = params["prompt"]
    ori_prompt = prompt
    images = params.get("images", None)
    num_image_tokens = 0
    image_args = {}
    if images:
        if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
            raise ValueError(
                "Number of images does not match number of tokens in prompt")

        images = [load_image_from_base64(image) for image in images]
        images = process_images(images, image_processor, model.config)

        if isinstance(images, list):
            images = [image.to(model.device, dtype=torch.float16)
                      for image in images]
        else:
            images = images.to(model.device, dtype=torch.float16)

        replace_token = DEFAULT_IMAGE_TOKEN
        if getattr(model.config, 'mm_use_im_start_end', False):
            replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

        # Each placeholder expands to one embedding per vision patch.
        num_image_tokens = prompt.count(
            replace_token) * model.get_vision_tower().num_patches
        image_args = {"images": images}

    temperature = float(params.get("temperature", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_context_length = getattr(
        model.config, 'max_position_embeddings', 2048)
    max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
    stop_str = params.get("stop", None)
    do_sample = temperature > 0.001

    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

    # Never ask for more tokens than still fit into the context window.
    max_new_tokens = min(max_new_tokens, max_context_length -
                         input_ids.shape[-1] - num_image_tokens)

    if max_new_tokens < 1:
        # Emitted as plain JSON (no trailing NUL byte) so that the consumer's
        # json.loads() can parse it like every other chunk.
        yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.",
                          "error_code": 0}).encode()
        return

    generate_kwargs = dict(
        inputs=input_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
        use_cache=True,
        **image_args
    )
    if stop_str:
        # Only install a keyword stop when the request actually supplies one.
        generate_kwargs["stopping_criteria"] = [
            KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)]

    # local inference: generate() runs in a worker thread and feeds the streamer
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    generated_text = ori_prompt
    for new_text in streamer:
        generated_text += new_text
        # Guard against None (TypeError) and "" (would slice the text to "").
        if stop_str and generated_text.endswith(stop_str):
            generated_text = generated_text[:-len(stop_str)]
        yield json.dumps({"text": generated_text, "error_code": 0}).encode()
if chunk: + data = json.loads(chunk.decode()) + if data["error_code"] == 0: + output = data["text"][len(prompt):].strip() + state.messages[-1][-1] = output + "▌" + yield (state, state.to_gradio_chatbot()) + else: + output = data["text"] + \ + f" (error_code: {data['error_code']})" + state.messages[-1][-1] = output + yield (state, state.to_gradio_chatbot()) + return + time.sleep(0.03) + + state.messages[-1][-1] = state.messages[-1][-1][:-1] + yield (state, state.to_gradio_chatbot()) + + +def build_demo(): + textbox = gr.Textbox( + show_label=False, placeholder="Enter text and press ENTER", container=False) + with gr.Blocks(title="LLaVA-Phi", theme=gr.themes.Default(), css=block_css) as demo: + state = gr.State() + gr.Markdown(title_markdown) + + with gr.Row(): + with gr.Column(scale=5): + with gr.Row(elem_id="Model ID"): + gr.Dropdown( + choices=['LLaVA-Phi-3B'], + value='LLaVA-Phi-3B', + interactive=True, + label='Model ID', + container=False) + imagebox = gr.Image(type="pil") + image_process_mode = gr.Radio( + ["Crop", "Resize", "Pad", "Default"], + value="Default", + label="Preprocess for non-square image", visible=False) + + cur_dir = os.path.dirname(os.path.abspath(__file__)) + gr.Examples(examples=[ + [f"{cur_dir}/examples/extreme_ironing.jpg", + "What is unusual about this image?"], + [f"{cur_dir}/examples/waterview.jpg", + "What are the things I should be cautious about when I visit here?"], + ], inputs=[imagebox, textbox]) + + with gr.Accordion("Parameters", open=False) as _: + temperature = gr.Slider( + minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature", ) + top_p = gr.Slider( + minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P", ) + max_output_tokens = gr.Slider( + minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens", ) + + with gr.Column(scale=8): + chatbot = gr.Chatbot( + elem_id="chatbot", label="LLaVA-Phi Chatbot", height=550) + with gr.Row(): + with 
def parse_args():
    """Parse the command-line options of the Gradio demo server.

    Returns:
        argparse.Namespace with ``host``, ``port``, ``share``, ``model_path``
        and ``model_name``.

    ``--share`` takes an explicit boolean value (``--share false``). Without a
    converter argparse would store the raw string, and any non-empty string --
    including ``"False"`` -- is truthy.
    """

    def _parse_bool(value):
        """Convert a textual flag value such as ``true``/``0``/``no`` to bool."""
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    parser.add_argument("--share", type=_parse_bool, default=True)
    parser.add_argument("--model-path", type=str,
                        default="checkpoints/llavaPhi-v0-3b-finetune")
    parser.add_argument("--model-name", type=str,
                        default="llavaPhi-v0-3b")
    args = parser.parse_args()
    return args
def main(args):
    """Run an interactive, single-image chat session in the terminal.

    Args:
        args: parsed CLI namespace. Reads ``model_path``, ``model_base``,
            ``load_8bit``, ``load_4bit``, ``conv_mode``, ``image_file`` and
            ``debug``; ``temperature`` and ``max_new_tokens`` are honoured when
            present (falling back to 0.2 and 1024, the values that used to be
            hard-coded in the generate call).

    The loop ends on an empty input line or EOF.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)

    # The former name-based inference (llama-2 / v1 / mpt / v0) was always
    # overwritten by this assignment, so only the effective value is kept.
    conv_mode = "vicuna_v1"
    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    if "mpt" in model_name.lower():
        roles = ('user', 'assistant')
    else:
        roles = conv.roles

    # Sampling settings come from the command line instead of being hard-coded.
    temperature = getattr(args, "temperature", 0.2)
    max_new_tokens = getattr(args, "max_new_tokens", 1024)

    image = load_image(args.image_file)
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].cuda()

    while True:
        try:
            inp = input(f"{roles[0]}: ")
        except EOFError:
            inp = ""
        if not inp:
            print("exit...")
            break

        print(f"{roles[1]}: ", end="")

        if image is not None:
            # first message: prepend the image placeholder exactly once
            if model.config.mm_use_im_start_end:
                inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
            else:
                inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
            conv.append_message(conv.roles[0], inp)
            image = None  # marks the placeholder as already inserted
        else:
            # later messages
            conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                # Temperature 0 means greedy decoding; sampling needs a positive value.
                do_sample=temperature > 0,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                streamer=streamer,
                use_cache=True,
                eos_token_id=tokenizer.eos_token_id,  # End of sequence token
                pad_token_id=tokenizer.eos_token_id,  # Pad token
                stopping_criteria=[stopping_criteria])

        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        conv.messages[-1][-1] = outputs

        if args.debug:
            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +from dataclasses import dataclass, field +import json +import logging +import pathlib +from typing import Dict, Optional, Sequence, List + +import torch + +import transformers + +from llava_phi.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ + DEFAULT_IM_END_TOKEN +from torch.utils.data import Dataset +from llava_phi.train.llava_phi_trainer import LLaVAPhiTrainer + +from llava_phi import conversation as conversation_lib +from llava_phi.model import * +from llava_phi.mm_utils import tokenizer_image_token +from transformers import CLIPVisionConfig, CLIPImageProcessor + +from PIL import Image + +local_rank = None + + +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="facebook/opt-125m") + version: Optional[str] = field(default="v0") + freeze_backbone: bool = field(default=False) + tune_mm_mlp_adapter: bool = field(default=False) + freeze_vision_tower: bool = field(default=False) + vision_tower: Optional[str] = field(default=None) + mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer + mm_vision_select_feature: Optional[str] = field(default="patch") + pretrain_mm_mlp_adapter: Optional[str] = field(default=None) + mm_use_im_start_end: bool = field(default=False) + mm_use_im_patch_token: bool = field(default=True) + + + +@dataclass +class ProjectorArguments: + mm_projector_type: Optional[str] = field(default='linear') + + +@dataclass +class 
DataArguments: + data_path: str = field(default=None, + metadata={"help": "Path to the training data."}) + lazy_preprocess: bool = False + is_multimodal: bool = False + image_folder: Optional[str] = field(default=None) + image_aspect_ratio: str = 'square' + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + adam_beta1: float = field(default=0.9) + adam_beta2: float = field(default=0.98) + adam_epsilon: float = field(default=1e-7) + remove_unused_columns: bool = field(default=False) + + # freeze_mm_mlp_adapter: bool = field(default=False) + model_max_length: int = field( + default=512, + metadata={ + "help": + "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + double_quant: bool = field( + default=True, + metadata={"help": "Compress the quantization statistics through double quantization."} + ) + quant_type: str = field( + default="nf4", + metadata={"help": "Quantization data type to use. 
# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    """Collect the LoRA (and optionally bias) parameters to checkpoint.

    Args:
        named_params: iterable of ``(name, parameter)`` pairs.
        bias: which bias terms accompany the ``lora_`` weights --
            ``"none"`` (no biases), ``"all"`` (every parameter whose name
            contains ``"bias"``) or ``"lora_only"`` (only the biases that
            belong to a module carrying LoRA weights).

    Returns:
        dict mapping parameter name to the value returned by
        ``maybe_zero_3(param, ignore_status=True)``.

    Raises:
        NotImplementedError: for any other ``bias`` value.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # "<module>.lora_A.weight" -> "<module>.bias"
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Iterate over items (not bare keys) and test each candidate's own
        # name; previously the loop unpacked dict keys and compared a stale
        # ``bias_name`` left over from the loop above.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
    return to_return
def find_all_linear_names(model):
    """Return the leaf names of every ``torch.nn.Linear`` usable as a LoRA target.

    Modules whose qualified name contains ``mm_projector``, ``vision_tower`` or
    ``vision_resampler`` are left out, and ``lm_head`` is dropped from the
    result. The returned list holds unique names in no particular order.
    """
    excluded_tags = ('mm_projector', 'vision_tower', 'vision_resampler')

    def _is_multimodal(qualified_name):
        return any(tag in qualified_name for tag in excluded_tags)

    leaf_names = {
        qualified_name.rpartition('.')[2]
        for qualified_name, submodule in model.named_modules()
        if not _is_multimodal(qualified_name) and isinstance(submodule, torch.nn.Linear)
    }
    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)
"freeze_vision_model", False): + # pass + # return + + if trainer.deepspeed: + torch.cuda.synchronize() + trainer.save_model(output_dir) + return + + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = { + key: value.cpu() + for key, value in state_dict.items() + } + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def smart_tokenizer_and_embedding_resize( + special_tokens_dict: Dict, + tokenizer: transformers.PreTrainedTokenizer, + model: transformers.PreTrainedModel, +): + """Resize tokenizer and embedding. + + Note: This is the unoptimized version that may make your embedding size not be divisible by 64. + """ + num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + model.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = model.get_input_embeddings().weight.data + output_embeddings = model.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + +def _tokenize_fn(strings: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer) -> Dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) for text in strings + ] + input_ids = labels = [ + tokenized.input_ids[0] for tokenized in tokenized_list + ] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() + for tokenized in tokenized_list + ] + return dict( + input_ids=input_ids, + labels=labels, + input_ids_lens=input_ids_lens, + labels_lens=labels_lens, + ) + + +def _mask_targets(target, tokenized_lens, speakers): + # cur_idx = 0 + 
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments
) -> Dict:
    """Normalise image placeholders inside every conversation, in place.

    For a sentence that contains ``DEFAULT_IMAGE_TOKEN`` the placeholder is
    moved to the front (followed by a newline) and the text is stripped. When
    the active conversation template's version contains ``"mmtag"`` the
    placeholder is wrapped in ``<Image>...</Image>``. Finally, if
    ``data_args.mm_use_im_start_end`` is set, every placeholder is surrounded
    by the image start/end tokens.

    Args:
        sources: list of conversations; each conversation is a list of dicts
            with a ``"value"`` string. Mutated in place.
        data_args: data configuration; ``is_multimodal`` is required,
            ``mm_use_im_start_end`` is optional and treated as ``False`` when
            the attribute is absent.

    Returns:
        The same ``sources`` object (untouched when not multimodal).
    """
    if not data_args.is_multimodal:
        return sources

    # The flag is attached to data_args at runtime rather than declared on the
    # dataclass, so read it defensively.
    use_im_start_end = getattr(data_args, "mm_use_im_start_end", False)

    for source in sources:
        for sentence in source:
            if DEFAULT_IMAGE_TOKEN in sentence['value']:
                sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
                sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value']
                sentence['value'] = sentence['value'].strip()
                if "mmtag" in conversation_lib.default_conversation.version:
                    # NOTE(review): the wrapper literals were empty strings,
                    # which made this replace a no-op; restored to the tag
                    # pair implied by the "mmtag" name -- confirm against the
                    # conversation templates.
                    sentence['value'] = sentence['value'].replace(
                        DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
            replace_token = DEFAULT_IMAGE_TOKEN
            if use_im_start_end:
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)

    return sources
conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + if has_image: + input_ids = torch.stack( + [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + conversation.count( + conv.sep2) # in phi-2, pad_token_id == eos_token_id + + rounds = conversation.split(conv.sep2) + cur_len = 0 + if cur_len > 0: + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # +1 for <|endoftext|> + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) + else: + round_len = len(tokenizer(rou).input_ids) + 1 # +1 for <|endoftext|> + instruction_len = len(tokenizer(parts[0]).input_ids) + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + 
print(conversation) + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_plain( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> Dict: + # add end signal and concatenate together + conversations = [] + # print(sources) + # time.sleep(5) + for source in sources: + assert len(source) == 2 + assert DEFAULT_IMAGE_TOKEN in source[0]['value'] + source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep + conversations.append(conversation) + # tokenize conversations + # print(conversations) + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) + target[:tokenized_len] = IGNORE_INDEX + return dict(input_ids=input_ids, labels=targets) + + +def preprocess( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + """ + Given a list of sources, each is a conversation list. This transform: + 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; + 2. Concatenate conversations together; + 3. Tokenize the concatenated conversation; + 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. 
+ """ + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: + return preprocess_plain(sources, tokenizer) + if conversation_lib.default_conversation.version.startswith("v0"): + return preprocess_v0(sources, tokenizer, has_image=has_image) + # add end signal and concatenate together + conversations = [] + for source in sources: + header = f"{conversation_lib.default_conversation.system}\n\n" + conversation = _add_speaker_and_signal(header, source) + conversations.append(conversation) + + # tokenize conversations + def get_tokenize_len(prompts): + return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] + + if has_image: + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + else: + conversations_tokenized = _tokenize_fn(conversations, tokenizer) + input_ids = conversations_tokenized["input_ids"] + + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + if has_image: + tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) + else: + tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] + speakers = [sentence["from"] for sentence in source] + _mask_targets(target, tokenized_lens, speakers) + + return dict(input_ids=input_ids, labels=targets) + + +class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments): + super(LazySupervisedDataset, self).__init__() + list_data_dict = json.load(open(data_path, "r")) + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.list_data_dict = list_data_dict + self.data_args = data_args + + def __len__(self): + return len(self.list_data_dict) + + @property + def lengths(self): + length_list = [] + for sample in self.list_data_dict: + img_tokens = 128 if 'image' 
in sample else 0 + length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) + return length_list + + @property + def modality_lengths(self): + length_list = [] + for sample in self.list_data_dict: + cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) + cur_len = cur_len if 'image' in sample else -cur_len + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + sources = self.list_data_dict[i] + if isinstance(i, int): + sources = [sources] + assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'image' in sources[0]: + image_file = self.list_data_dict[i]['image'] + image_folder = self.data_args.image_folder + processor = self.data_args.image_processor + image = Image.open(os.path.join(image_folder, image_file)).convert('RGB') + if self.data_args.image_aspect_ratio == 'pad': + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + sources = preprocess_multimodal( + copy.deepcopy([e["conversations"] for e in sources]), + self.data_args) + else: + sources = copy.deepcopy([e["conversations"] for e in sources]) + data_dict = preprocess( + sources, + self.tokenizer, + has_image=('image' in self.list_data_dict[i])) + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + 
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Batch per-sample dicts into padded tensors for supervised fine-tuning."""

    # Only ``model_max_length`` is read from the tokenizer in this collator.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad, truncate and mask a list of samples.

        Each instance must provide ``"input_ids"`` and ``"labels"``; an
        ``"image"`` entry is collated too when the first instance has one.

        Returns:
            A dict with ``input_ids``, ``labels`` and ``attention_mask``, plus
            ``images`` (a stacked tensor when every image shares one shape,
            otherwise the plain list).
        """
        pad_sequence = torch.nn.utils.rnn.pad_sequence
        # Sentinel id used as padding filler; the attention mask below is
        # derived by comparing against this same value.
        filler_id = 51000

        id_rows = [example["input_ids"] for example in instances]
        label_rows = [example["labels"] for example in instances]

        padded_ids = pad_sequence(id_rows, batch_first=True, padding_value=filler_id)
        padded_labels = pad_sequence(label_rows, batch_first=True, padding_value=IGNORE_INDEX)

        # Clip both tensors to the tokenizer's maximum sequence length.
        max_len = self.tokenizer.model_max_length
        padded_ids = padded_ids[:, :max_len]
        padded_labels = padded_labels[:, :max_len]

        batch = {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(filler_id),
        }

        if 'image' not in instances[0]:
            return batch

        images = [example['image'] for example in instances]
        # Stack only when every image exists and matches the first one's shape.
        stackable = all(img is not None and img.shape == images[0].shape for img in images)
        batch['images'] = torch.stack(images) if stackable else images
        return batch
(ModelArguments, DataArguments, TrainingArguments, ProjectorArguments)) + model_args, data_args, training_args, projector_args = parser.parse_args_into_dataclasses() + local_rank = training_args.local_rank + compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) + + bnb_model_from_pretrained_args = {} + if training_args.bits in [4, 8]: + from transformers import BitsAndBytesConfig + bnb_model_from_pretrained_args.update(dict( + device_map={"": training_args.device}, + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + quantization_config=BitsAndBytesConfig( + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + llm_int8_skip_modules=["mm_projector"], + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=training_args.double_quant, + bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'} + ) + )) + + if model_args.vision_tower is not None: + config = LlavaPhiConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) + clip_config = CLIPVisionConfig.from_pretrained(model_args.vision_tower) + vis_config = LlavaPhiVisionConfig(**clip_config.to_dict()) + config.vision_config["vision_tower"] = vis_config.to_dict() + config.vision_config["vision_tower"]["mm_vision_select_feature"] = model_args.mm_vision_select_feature + config.vision_config["vision_tower"]["mm_vision_select_layer"] = model_args.mm_vision_select_layer + + config.vision_config["mm_projector"]["mm_projector_type"] = projector_args.mm_projector_type + config.vision_config["mm_projector"]["mm_hidden_size"] = vis_config.hidden_size + config.vision_config["mm_projector"]["hidden_size"] = config.hidden_size + + model = LlavaPhiForCausalLM.from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=training_args.cache_dir, + trust_remote_code=True, + 
**bnb_model_from_pretrained_args + ) + rank0_print(model) + clip_model_param = torch.load(os.path.join(model_args.vision_tower, "pytorch_model.bin"), map_location='cpu') + model.get_model().vision_tower.load_state_dict(clip_model_param, strict=False) + else: + model = transformers.PhiForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + **bnb_model_from_pretrained_args + ) + model.config.use_cache = False + + if model_args.freeze_backbone: + model.model.requires_grad_(False) + + if training_args.gradient_checkpointing: + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if 'phi' in model_args.model_name_or_path: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right" + ) + else: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + + tokenizer.pad_token = tokenizer.unk_token + if model_args.version in conversation_lib.conv_templates: + conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version] + else: + conversation_lib.default_conversation = conversation_lib.conv_templates["phi-2_v0"] + + assert model_args.vision_tower is not None, "llava_phi-phi only supports multi-modal models" + if model_args.vision_tower is not None: + + vision_tower = model.get_vision_tower() + vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device) + + data_args.image_processor = CLIPImageProcessor.from_pretrained(model_args.vision_tower) 
+ data_args.is_multimodal = True + + model.config.image_aspect_ratio = data_args.image_aspect_ratio + model.config.tokenizer_padding_side = tokenizer.padding_side + model.config.tokenizer_model_max_length = tokenizer.model_max_length + + model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter + model.requires_grad_(False) + if model_args.tune_mm_mlp_adapter: + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = True + + model.config.freeze_vision_tower = training_args.freeze_vision_tower = model_args.freeze_vision_tower + if not model_args.freeze_vision_tower: + for p in model.get_model().vision_tower.parameters(): + p.requires_grad = True + + if training_args.bits in [4, 8]: + model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) + + model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end + model.config.mm_projector_lr = training_args.mm_projector_lr + training_args.use_im_start_end = model_args.mm_use_im_start_end + model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token + model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer) + + data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args) + + trainer = LLaVAPhiTrainer(model=model, + tokenizer=tokenizer, + args=training_args, + **data_module) + # integrate the MLLM + trainer.save_state() + + model.config.use_cache = True + + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + + +if __name__ == "__main__": + train() diff --git a/llava-phi/llava_phi/train/llava_phi_trainer.py b/llava-phi/llava_phi/train/llava_phi_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f007b65aad9cdad7e5601c21d475defc45884723 --- /dev/null +++ b/llava-phi/llava_phi/train/llava_phi_trainer.py @@ -0,0 +1,156 @@ +import os +import torch + +from torch.utils.data import Sampler + +from 
def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """Return a sample order whose full megabatches each hold a single modality.

    ``lengths`` is signed: a positive entry marks a multimodal sample and a
    negative entry a language-only sample (its magnitude is the length). Each
    modality is length-grouped separately and cut into megabatches of
    ``world_size * batch_size`` indices, and the full megabatches are shuffled
    together. The trailing megabatch of each modality is pooled into a single
    mixed remainder: a full megabatch cut from it goes first, and whatever is
    left over goes last.

    Args:
        lengths: Signed per-sample lengths; zero is not allowed.
        batch_size: Per-device batch size.
        world_size: Number of devices a megabatch is spread over.
        generator: Optional ``torch.Generator`` used for every random
            permutation drawn here and in ``get_length_grouped_indices``.

    Returns:
        A flat list of dataset indices. Empty input gives an empty list. When
        only one modality is present the result is plain length grouping on
        the absolute lengths.

    Raises:
        AssertionError: If any length is zero.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."

    if len(lengths) == 0:
        return []

    mm_pairs = [(i, l) for i, l in enumerate(lengths) if l > 0]
    lang_pairs = [(i, -l) for i, l in enumerate(lengths) if l < 0]
    if not mm_pairs or not lang_pairs:
        # Single-modality data: ``zip(*[])`` yields nothing to unpack, so
        # there is nothing to interleave; fall back to plain length grouping.
        return get_length_grouped_indices(
            [abs(l) for l in lengths], batch_size, world_size, generator=generator)

    mm_indices, mm_lengths = zip(*mm_pairs)
    lang_indices, lang_lengths = zip(*lang_pairs)

    # Forward the caller's generator so the whole ordering is reproducible
    # from it (it used to be dropped for these two shuffles).
    mm_shuffle = [mm_indices[i] for i in
                  get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=generator)]
    lang_shuffle = [lang_indices[i] for i in
                    get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=generator)]

    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i: i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i: i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    # The last megabatch of each modality may be short; pool the two.
    additional_batch = mm_megabatches[-1] + lang_megabatches[-1]
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    megabatch_order = torch.randperm(len(megabatches), generator=generator).tolist()
    megabatches = [megabatches[i] for i in megabatch_order]

    if len(additional_batch) >= megabatch_size:
        megabatches = [additional_batch[:megabatch_size]] + megabatches
        additional_batch = additional_batch[megabatch_size:]

    if len(additional_batch) > 0:
        megabatches.append(additional_batch)

    return [i for megabatch in megabatches for i in megabatch]
class LengthGroupedSampler(Sampler):
    r"""
    Sampler yielding dataset indices so that samples of roughly equal length
    end up together, with some randomness left in the order.

    The ordering itself is delegated to ``get_length_grouped_indices``, or to
    ``get_modality_length_grouped_indices`` when ``group_by_modality`` is set.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        # ``lengths`` only defaults to None for signature reasons; it is required.
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size, self.world_size = batch_size, world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        # One index is produced per entry of ``lengths``.
        return len(self.lengths)

    def __iter__(self):
        # Pick the grouping routine once, then call it with the same arguments.
        grouper = (get_modality_length_grouped_indices
                   if self.group_by_modality
                   else get_length_grouped_indices)
        ordering = grouper(self.lengths, self.batch_size, self.world_size,
                           generator=self.generator)
        return iter(ordering)
lengths=lengths, + group_by_modality=True, + ) + else: + return super()._get_train_sampler() + + def _save_checkpoint(self, model, trial, metrics=None): + super(LLaVAPhiTrainer, self)._save_checkpoint(model, trial, metrics) + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + super(LLaVAPhiTrainer, self)._save(output_dir, state_dict) diff --git a/llava-phi/llava_phi/train/train.py b/llava-phi/llava_phi/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6aaed5b7f2eb0c82d63d6bd2b0be45a3d0cd02c5 --- /dev/null +++ b/llava-phi/llava_phi/train/train.py @@ -0,0 +1,783 @@ +# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: +# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class DataArguments:
    """Command-line options describing the training data."""

    # JSON annotation file handed to the dataset constructor.
    data_path: str = field(metadata={"help": "Path to the training data."},
                           default=None)
    # NOTE(review): no reader of this flag is visible in this part of the file.
    lazy_preprocess: bool = field(default=False)
    # When False, ``preprocess_multimodal`` returns its input untouched.
    is_multimodal: bool = field(default=False)
    # Directory joined with each sample's ``'image'`` entry when loading images.
    image_folder: Optional[str] = None
    # ``'pad'`` makes the dataset pad images to a square before preprocessing.
    image_aspect_ratio: str = field(default='square')
# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    """Collect LoRA (and optionally bias) parameters as detached CPU copies.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs. It is walked
            once, so a generator is fine.
        bias: Which bias parameters to keep next to the ``"lora_"`` ones:
            ``"none"`` keeps no biases, ``"all"`` keeps every parameter whose
            name contains ``"bias"``, and ``"lora_only"`` keeps only the bias
            belonging to a module that also has LoRA weights.

    Returns:
        Dict mapping each selected name to the result of ``maybe_zero_3`` on
        that parameter.

    Raises:
        NotImplementedError: If ``bias`` is not one of the three modes above.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # "<module>.lora_A.weight" -> "<module>.bias"
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Keep a bias only if its own name was derived from a LoRA weight.
        # The previous loop iterated the dict without ``.items()`` and
        # compared a name left over from the loop above.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
    return to_return
def _tokenize_fn(strings: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string on its own and report per-string token counts.

    Returns a dict in which ``input_ids`` and ``labels`` are the *same* list
    object (row 0 of each encoding's ``input_ids``), and ``input_ids_lens``
    and ``labels_lens`` are the same list of counts of ids that differ from
    ``tokenizer.pad_token_id``.
    """
    def encode(text):
        # One call per string, truncated to the tokenizer's maximum length.
        return tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )

    encodings = []
    for text in strings:
        encodings.append(encode(text))

    token_rows = []
    for encoding in encodings:
        token_rows.append(encoding.input_ids[0])

    non_pad_counts = []
    for encoding in encodings:
        kept = encoding.input_ids.ne(tokenizer.pad_token_id)
        non_pad_counts.append(kept.sum().item())

    # Both key pairs deliberately alias one list each.
    return {
        "input_ids": token_rows,
        "labels": token_rows,
        "input_ids_lens": non_pad_counts,
        "labels_lens": non_pad_counts,
    }
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments
) -> Dict:
    """Normalize image placeholders in every turn, editing ``sources`` in place.

    When ``data_args.is_multimodal`` is false the input is returned untouched.
    Otherwise, in each turn that contains ``DEFAULT_IMAGE_TOKEN``, all
    occurrences are removed and a single token followed by a newline is
    prepended to the stripped text. The token is then wrapped in the image
    start/end tokens when ``data_args.mm_use_im_start_end`` is set.

    Returns:
        The same ``sources`` object that was passed in.
    """
    if not data_args.is_multimodal:
        return sources

    for conversation in sources:
        for turn in conversation:
            text = turn['value']
            if DEFAULT_IMAGE_TOKEN in text:
                # Move the placeholder to the front of the turn.
                remainder = text.replace(DEFAULT_IMAGE_TOKEN, '').strip()
                text = (DEFAULT_IMAGE_TOKEN + '\n' + remainder).strip()

            if data_args.mm_use_im_start_end:
                image_markup = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            else:
                image_markup = DEFAULT_IMAGE_TOKEN
            turn["value"] = text.replace(DEFAULT_IMAGE_TOKEN, image_markup)

    return sources
truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + conversation.count( + conv.sep2) # in phi-2, pad_token_id == eos_token_id + + rounds = conversation.split(conv.sep2) + cur_len = 0 + if cur_len > 0: + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 # +1 for <|endoftext|> + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) + else: + round_len = len(tokenizer(rou).input_ids) + 1 # +1 for <|endoftext|> + instruction_len = len(tokenizer(parts[0]).input_ids) + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print(conversation) + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    has_image: bool = False
) -> Dict:
    """Tokenize conversations using the routine that matches the active template.

    The choice is driven by ``conversation_lib.default_conversation``:

    * separator style ``PLAIN`` -> ``preprocess_plain(sources, tokenizer)``
    * version starting with ``"v0"`` -> ``preprocess_v0(sources, tokenizer, has_image=has_image)``

    Any other template is rejected. The older "### "-signal pipeline that used
    to follow this dispatch could never run, because every branch above it
    returns or raises, so it has been removed.

    Args:
        sources: List of conversations, passed through to the chosen routine.
        tokenizer: Tokenizer passed through to the chosen routine.
        has_image: Forwarded to ``preprocess_v0`` only.

    Returns:
        Whatever the selected routine returns.

    Raises:
        ValueError: If the active template is neither ``PLAIN`` nor a ``v0``
            version.
    """
    default_conv = conversation_lib.default_conversation
    if default_conv.sep_style == conversation_lib.SeparatorStyle.PLAIN:
        return preprocess_plain(sources, tokenizer)
    if default_conv.version.startswith("v0"):
        return preprocess_v0(sources, tokenizer, has_image=has_image)
    raise ValueError(f"Invalid version: {conversation_lib.default_conversation.version}")
lengths(self): + length_list = [] + for sample in self.list_data_dict: + img_tokens = 128 if 'image' in sample else 0 + length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) + return length_list + + @property + def modality_lengths(self): + length_list = [] + for sample in self.list_data_dict: + cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) + cur_len = cur_len if 'image' in sample else -cur_len + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + sources = self.list_data_dict[i] + if isinstance(i, int): + sources = [sources] + assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'image' in sources[0]: + image_file = self.list_data_dict[i]['image'] + image_folder = self.data_args.image_folder + processor = self.data_args.image_processor + image = Image.open(os.path.join(image_folder, image_file)).convert('RGB') + if self.data_args.image_aspect_ratio == 'pad': + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + sources = preprocess_multimodal( + copy.deepcopy([e["conversations"] for e in sources]), + self.data_args) + else: + sources = copy.deepcopy([e["conversations"] for e in sources]) + data_dict = preprocess( + sources, + self.tokenizer, + has_image=('image' in self.list_data_dict[i])) + if 
isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + + # image exist in the data + if 'image' in self.list_data_dict[i]: + data_dict['image'] = image + elif self.data_args.is_multimodal: + # image does not exist in the data, but the model is multimodal + crop_size = self.data_args.image_processor.crop_size + data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width']) + return data_dict + + +@dataclass +class DataCollatorForSupervisedDataset(object): + """Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] + for key in ("input_ids", "labels")) + # temp_pad_token_id = 51000 + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.tokenizer.pad_token_id + # padding_value=temp_pad_token_id + ) + labels = torch.nn.utils.rnn.pad_sequence(labels, + batch_first=True, + padding_value=IGNORE_INDEX) + input_ids = input_ids[:, :self.tokenizer.model_max_length] + labels = labels[:, :self.tokenizer.model_max_length] + batch = dict( + input_ids=input_ids, + labels=labels, + attention_mask=input_ids.ne(self.tokenizer.pad_token_id) + # attention_mask=input_ids.ne(temp_pad_token_id), + ) + + if 'image' in instances[0]: + images = [instance['image'] for instance in instances] + if all(x is not None and x.shape == images[0].shape for x in images): + batch['images'] = torch.stack(images) + else: + batch['images'] = images + + return batch + + +def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, + data_args) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) + data_collator = 
def train():
    """Run supervised fine-tuning end to end.

    Parses ``ModelArguments``/``DataArguments``/``TrainingArguments`` from the
    command line, loads the LLaVA-Phi model (optionally 4/8-bit quantised and
    LoRA-wrapped), prepares the tokenizer, vision tower and data module, runs
    ``LLaVAPhiTrainer.train()`` and saves the result to
    ``training_args.output_dir``.
    """
    global local_rank

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    local_rank = training_args.local_rank
    # fp16 takes precedence over bf16; float32 when neither flag is set.
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Extra from_pretrained kwargs, populated only for 4/8-bit loading.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            device_map={"": training_args.device},
            load_in_4bit=training_args.bits == 4,
            load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["mm_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type  # {'fp4', 'nf4'}
            )
        ))

    config = LlavaPhiConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    model = LlavaPhiForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=training_args.cache_dir,
        trust_remote_code=True,
        **bnb_model_from_pretrained_args
    )

    # Disabled for training; switched back on after trainer.train() below.
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)
    else:
        model.model.requires_grad_(True)

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        # NOTE(review): fp16 maps to float32 here, unlike compute_dtype above — confirm this is intended.
        model.config.torch_dtype = (
            torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    # TODO: https://huggingface.co/microsoft/phi-2/discussions/31. But in this code, setting gradient_checkpointing=True, it doesn't raise any error
    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: mark the embedding output as requiring grad via a forward hook.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)

    # The two branches differ only in use_fast=False for non-phi checkpoints.
    if 'phi' in model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            model_max_length=training_args.model_max_length,
            padding_side="right"
        )
    else:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            model_max_length=training_args.model_max_length,
            padding_side="right",
            use_fast=False,
        )

    # Padding reuses the unknown token.
    tokenizer.pad_token = tokenizer.unk_token
    # Unknown version names fall back to the "phi-2_v0" conversation template.
    if model_args.version in conversation_lib.conv_templates:
        conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
    else:
        conversation_lib.default_conversation = conversation_lib.conv_templates["phi-2_v0"]
    rank0_print("default_conversation :")
    rank0_print(conversation_lib.default_conversation)

    vision_tower = model.get_vision_tower()
    vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

    data_args.image_processor = CLIPImageProcessor.from_pretrained(model_args.model_name_or_path)
    data_args.is_multimodal = True

    # Mirror data/tokenizer settings onto the model config.
    model.config.image_aspect_ratio = data_args.image_aspect_ratio
    model.config.tokenizer_padding_side = tokenizer.padding_side
    model.config.tokenizer_model_max_length = tokenizer.model_max_length

    # The projector is trainable only when tune_mm_mlp_adapter is set.
    model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
    if not model_args.tune_mm_mlp_adapter:
        for p in model.get_model().mm_projector.parameters():
            p.requires_grad = False
    else:
        for p in model.get_model().mm_projector.parameters():
            p.requires_grad = True

    # The vision tower is trainable unless freeze_vision_tower is set.
    model.config.freeze_vision_tower = training_args.freeze_vision_tower = model_args.freeze_vision_tower
    if model_args.freeze_vision_tower:
        for p in model.get_model().vision_tower.parameters():
            p.requires_grad = False
    else:
        for p in model.get_model().vision_tower.parameters():
            p.requires_grad = True

    if training_args.bits in [4, 8]:
        model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

    model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
    model.config.mm_projector_lr = training_args.mm_projector_lr
    training_args.use_im_start_end = model_args.mm_use_im_start_end
    model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
    model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)

    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        # Per-module dtype adjustments for quantised training, selected by module type and name.
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_args=data_args)

    trainer = LLaVAPhiTrainer(model=model,
                              tokenizer=tokenizer,
                              args=training_args,
                              **data_module)

    # if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
    #     trainer.train(resume_from_checkpoint=True)
    # else:
    #     trainer.train()

    # TODO I dont like auto resume << REMOVE IT AND UNCOMMENT THE ABOVE CODE
    trainer.train()

    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        # LoRA weights and the remaining trainables are collected separately.
        state_dict = get_peft_state_maybe_zero_3(
            model.named_parameters(), training_args.lora_bias
        )
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
            model.named_parameters()
        )
        # Only local rank 0 (or -1) writes the files.
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer,
                                       output_dir=training_args.output_dir)
def build_logger(logger_name, logger_filename):
    """Create the logger ``logger_name`` and route stdout/stderr through logging.

    The root handler gets a common formatter, ``sys.stdout`` and ``sys.stderr``
    are replaced by ``StreamToLogger`` wrappers, and on the first call a daily
    rotating file handler writing to ``LOGDIR/logger_filename`` is created and
    attached to every logger registered at that moment.
    """
    global handler

    log_format = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Make sure the root logger has a handler, then apply the shared format.
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        logging.basicConfig(level=logging.INFO)
    root_logger.handlers[0].setFormatter(log_format)

    # Redirect stdout first, then stderr: each wrapper captures whatever
    # sys.stdout is at the moment it is constructed.
    for stream_name, level in (("stdout", logging.INFO), ("stderr", logging.ERROR)):
        stream_logger = logging.getLogger(stream_name)
        stream_logger.setLevel(level)
        redirect = StreamToLogger(stream_logger, level)
        setattr(sys, stream_name, redirect)

    named_logger = logging.getLogger(logger_name)
    named_logger.setLevel(logging.INFO)

    # The shared file handler is built once per process.
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        log_path = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            log_path, when='D', utc=True)
        handler.setFormatter(log_format)

        # loggerDict also holds placeholder entries; skip those.
        for registered in logging.root.manager.loggerDict.values():
            if isinstance(registered, logging.Logger):
                registered.addHandler(handler)

    return named_logger
def violates_moderation(text):
    """Check whether ``text`` is flagged by the OpenAI moderation API.

    Best effort: any request failure or unexpected response shape is treated
    as "not flagged".

    Args:
        text: The text to check; newlines are removed before sending.

    Returns:
        The ``flagged`` value of the first moderation result, or ``False``
        when the request fails or the response cannot be interpreted.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    import json  # local import: this module does not import json at file level

    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    # Serialise with json.dumps so quotes, backslashes and control characters
    # in the text are escaped; hand-concatenated JSON breaks on such input.
    data = json.dumps({"input": text}, ensure_ascii=False).encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except (KeyError, IndexError, ValueError):
        # Missing keys, an empty result list, or a non-JSON response body.
        flagged = False

    return flagged
"""Convert a JSONL prediction file into the answer list used for GQA evaluation.

Each input line is a JSON object with ``question_id`` and ``text``; the output
is a JSON list of ``{"questionId", "prediction"}`` records.
"""
import os
import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

all_answers = []
# Read inside a context manager so the source file is closed deterministically.
with open(args.src) as src_file:
    for line in src_file:
        res = json.loads(line)
        question_id = res['question_id']
        # Trailing periods are dropped and the answer is lower-cased.
        text = res['text'].rstrip('.').lower()
        all_answers.append({"questionId": question_id, "prediction": text})

with open(args.dst, 'w') as f:
    json.dump(all_answers, f)
"""Convert a JSONL prediction file into the JSON mapping used for MM-Vet evaluation.

Each input line is a JSON object with ``question_id`` and ``text``; the output
maps ``"v1_<question_id>"`` to the predicted text.
"""
import os
import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

cur_result = {}

# Read inside a context manager so the source file is closed deterministically.
with open(args.src) as src_file:
    for line in src_file:
        data = json.loads(line)
        qid = data['question_id']
        cur_result[f'v1_{qid}'] = data['text']

with open(args.dst, 'w') as f:
    json.dump(cur_result, f, indent=2)
def eval_single(result_file, eval_only_type=None, annotations=None, type_names=None):
    """Score a JSONL prediction file against the annotations and print accuracies.

    Args:
        result_file: Path of a JSONL file; every line is an object with at
            least ``question_id`` and ``text``.
        eval_only_type: When given, only questions whose ``data_type`` equals
            this value are scored and only the total is printed.
        annotations: Annotation dict with a ``questions`` list. Defaults to
            the module-level ``data`` loaded by the script entry point.
        type_names: Mapping from ``question_type_id`` to a display name, used
            for the per-type report. Defaults to the module-level
            ``ques_type_id_to_name``.

    Returns:
        Dict mapping ``question_id`` to its parsed prediction row.
    """
    if annotations is None:
        annotations = data  # module-level annotations set in __main__

    results = {}
    with open(result_file) as f:
        for line in f:
            row = json.loads(line)
            results[row['question_id']] = row

    type_counts = {}
    correct_counts = {}
    for question_data in annotations['questions']:
        if eval_only_type is not None and question_data['data_type'] != eval_only_type:
            continue
        data_type = question_data['question_type_id']
        type_counts[data_type] = type_counts.get(data_type, 0) + 1
        # Register the type even when nothing is correct, so the report below
        # cannot hit a missing key.
        correct_counts.setdefault(data_type, 0)
        # Prediction ids may be ints while annotation ids are strings.
        try:
            question_id = int(question_data['question_id'])
        except (TypeError, ValueError):
            question_id = question_data['question_id']
        row = results.get(question_id)
        if row is not None and row['text'] == question_data['answer']:
            correct_counts[data_type] += 1

    if eval_only_type is None and type_names is None:
        type_names = ques_type_id_to_name  # module-level mapping set in __main__

    total_count = 0
    total_correct = 0
    for data_type in sorted(type_counts.keys()):
        accuracy = correct_counts[data_type] / type_counts[data_type] * 100
        if eval_only_type is None:
            print(f"{type_names[data_type]}: {accuracy:.2f}%")

        total_count += type_counts[data_type]
        total_correct += correct_counts[data_type]

    # No matching question (e.g. no entries of the requested data type):
    # report 0% instead of dividing by zero.
    total_accuracy = total_correct / total_count * 100 if total_count else 0.0
    if eval_only_type is None:
        print(f"Total accuracy: {total_accuracy:.2f}%")
    else:
        print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%")

    return results
def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
    """Write one ScienceQA split as a JSONL instruction file.

    Reads ``pid_splits.json`` and ``problems.json`` from ``base_dir``, builds
    the (input, output) pair of every problem in ``split`` with
    ``build_prompt_chatbot`` and writes
    ``scienceqa_{split}_{prompt_format}.jsonl`` into ``base_dir``.

    Args:
        base_dir: Directory holding the ScienceQA JSON files; also the
            output directory.
        split: Key into ``pid_splits.json``.
        prompt_format: ``"<input>-<output>"`` format code passed to
            ``build_prompt_chatbot``.
    """
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)

    split_problems = build_prompt_chatbot(
        problems, split_indices, prompt_format,
        use_caption=False, is_test=False)

    def strip_prefix(text, prefix):
        # Drop only a leading prefix; str.replace would also delete any later
        # occurrence inside the question or answer text.
        return text[len(prefix):] if text.startswith(prefix) else text

    out_path = os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl")
    # The context manager closes the output file even if a record fails.
    with open(out_path, "w") as writer:
        for prob_id, (question, answer) in split_problems.items():
            question = strip_prefix(question, 'Question: ')
            answer = strip_prefix(answer, 'Answer: ')

            raw_prob_data = problems[prob_id]
            if raw_prob_data['image'] is None:
                record = {
                    "id": prob_id,
                    "instruction": f"{question}",
                    "output": f"{answer}",
                }
            else:
                record = {
                    "id": prob_id,
                    "image": os.path.join(prob_id, raw_prob_data['image']),
                    "instruction": f"{question}\n",
                    "output": f"{answer}",
                }
            writer.write(json.dumps(record) + '\n')
def get_lecture_text(problem):
    """Return the problem's lecture with each newline replaced by a literal ``\\n``."""
    # Escaped newlines keep the lecture on a single line of the prompt
    # (original note: lets GPT-3 generate the lecture with more tokens).
    lecture_lines = problem['lecture'].split("\n")
    return "\\n".join(lecture_lines)
def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Render one ScienceQA example as a single prompt string.

    Args:
        format: ``"<input>-<output>"`` code, e.g. ``"QCM-LEA"``. Input codes:
            CQM, QCM, QCML, QCME, QCMLE, QCLM, QCEM, QCLEM. Output codes:
            A, AL, AE, ALE, AEL, LA, EA, LEA, ELA.
        question, context, choice, answer, lecture, solution: Text pieces
            substituted into the templates.
        test_example: When true the output part is just ``"Answer:"`` and the
            output code is not consulted.

    Returns:
        The input and output parts concatenated, with double spaces collapsed
        and a dangling trailing ``BECAUSE:`` removed.

    Raises:
        ValueError: If ``format`` is not two ``-``-separated parts, or either
            part is not a known code (previously an ``UnboundLocalError``).
    """
    input_format, output_format = format.split("-")

    ## Inputs
    input_templates = {
        "CQM": f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n",
        "QCM": f"Question: {question}\nContext: {context}\nOptions: {choice}\n",
        # upper bound experiment
        "QCML": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
        "QCME": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
        "QCMLE": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
        "QCLM": f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
        "QCEM": f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
        "QCLEM": f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
    }
    if input_format not in input_templates:
        raise ValueError(f"Unknown input format: {input_format!r}")
    prompt = input_templates[input_format]

    # Outputs
    if test_example:
        completion = "Answer:"
    else:
        output_templates = {
            "A": f"Answer: The answer is {answer}.",
            # NOTE(review): 'AL' appends the solution and 'AE' the lecture,
            # the reverse of 'LA'/'EA' below — kept as found; confirm intent.
            "AL": f"Answer: The answer is {answer}. BECAUSE: {solution}",
            "AE": f"Answer: The answer is {answer}. BECAUSE: {lecture}",
            "ALE": f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}",
            "AEL": f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}",
            "LA": f"Answer: {lecture} The answer is {answer}.",
            "EA": f"Answer: {solution} The answer is {answer}.",
            "LEA": f"Answer: {lecture} {solution} The answer is {answer}.",
            "ELA": f"Answer: {solution} {lecture} The answer is {answer}.",
        }
        if output_format not in output_templates:
            raise ValueError(f"Unknown output format: {output_format!r}")
        completion = output_templates[output_format]

    # Empty lecture/solution fields leave two adjacent spaces in the
    # templates; collapse them to one.
    text = (prompt + completion).replace("  ", " ").strip()
    # Drop a dangling "BECAUSE:" at the end only; replacing every occurrence
    # would also strip the marker from the input part.
    marker = "BECAUSE:"
    if text.endswith(marker):
        text = text[:-len(marker)].strip()
    return text
def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=("A", "B", "C", "D", "E"), is_test=False):
    """Build chatbot-style (input, output) pairs for a set of problems.

    Args:
        problems: Mapping from question id to its problem dict.
        shot_qids: Iterable of question ids to convert.
        prompt_format: ``"<input>-<output>"`` code forwarded to
            ``create_one_example_chatbot``.
        use_caption: Whether the image caption is added to the context.
        options: Option labels, indexed by choice position. The default is an
            immutable tuple so it cannot be mutated across calls.
        is_test: Forwarded as ``test_example``.

    Returns:
        Dict mapping each question id to its ``(input, output)`` pair.
    """
    examples = {}

    for qid in shot_qids:
        problem = problems[qid]
        question = get_question_text(problem)
        context = get_context_text(problem, use_caption)
        choice = get_choice_text(problem, options)
        answer = get_answer(problem, options)
        # The helpers escape newlines as literal "\n"; turn them back into
        # real newlines for the chatbot format.
        lecture = get_lecture_text(problem).replace('\\n', '\n')
        solution = get_solution_text(problem).replace('\\n', '\n')

        examples[qid] = create_one_example_chatbot(prompt_format,
                                                   question,
                                                   context,
                                                   choice,
                                                   answer,
                                                   lecture,
                                                   solution,
                                                   test_example=is_test)
    return examples
get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + test_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=True) + examples.append(test_example) + + # create the prompt input + prompt_input = '\n\n'.join(examples) + + return prompt_input + + +def build_prompt_gpt4(problems, shot_qids, test_qid, args): + + prompt_array = [{"role": "system", "content": "You are a helpful assistant."}] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=True) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + return prompt_array \ No newline at end of file diff 
import argparse
import json
import os


def parse_args():
    """Parse the annotation, result and upload file paths (all required)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--annotation-file', type=str, required=True)
    parser.add_argument('--result-file', type=str, required=True)
    parser.add_argument('--result-upload-file', type=str, required=True)
    return parser.parse_args()


def load_results(result_file):
    """Read a JSONL predictions file, tolerating malformed lines.

    Args:
        result_file: Path to a file with one JSON object per line, each
            carrying ``question_id`` and ``text``.

    Returns:
        ``(results, error_line)``: ``results`` maps ``question_id`` to ``text``
        (a later line wins on duplicate ids) and ``error_line`` counts the
        lines that were not valid JSON.

    Raises:
        KeyError: If a parsed record lacks ``question_id`` or ``text``.
    """
    results = {}
    error_line = 0
    with open(result_file) as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: count the bad line and keep going.
                error_line += 1
                continue
            results[record['question_id']] = record['text']
    return results, error_line


if __name__ == '__main__':
    # Imported lazily so the helpers above can be imported (and --help shown)
    # without the llava_phi package being importable.
    from llava_phi.eval.m4c_evaluator import EvalAIAnswerProcessor

    args = parse_args()

    # dirname is '' for a bare filename and os.makedirs('') raises, so only
    # create the directory when the path actually has one.
    upload_dir = os.path.dirname(args.result_upload_file)
    if upload_dir:
        os.makedirs(upload_dir, exist_ok=True)

    results, error_line = load_results(args.result_file)
    with open(args.annotation_file) as f:
        test_split = [json.loads(line) for line in f]

    print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')

    answer_processor = EvalAIAnswerProcessor()

    all_answers = []
    for x in test_split:
        # Explicit check instead of `assert`, which is stripped under -O.
        if x['question_id'] not in results:
            raise KeyError(f"no prediction found for question_id {x['question_id']!r}")
        all_answers.append({
            'image': x['image'],
            'answer': answer_processor(results[x['question_id']])
        })

    with open(args.result_upload_file, 'w') as f:
        json.dump(all_answers, f)
import argparse
import json
import os


def parse_args():
    """Parse ``--dir`` (optional), ``--ckpt`` and ``--split`` (both required)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2")
    parser.add_argument('--ckpt', type=str, required=True)
    parser.add_argument('--split', type=str, required=True)
    return parser.parse_args()


def load_results(result_file):
    """Read a JSONL predictions file, tolerating malformed lines.

    Returns:
        ``(results, error_line)``: ``results`` maps ``question_id`` to ``text``
        (a later line wins on duplicate ids) and ``error_line`` counts the
        lines that were not valid JSON.

    Raises:
        KeyError: If a parsed record lacks ``question_id`` or ``text``.
    """
    results = {}
    error_line = 0
    with open(result_file) as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: count the bad line and keep going.
                error_line += 1
                continue
            results[record['question_id']] = record['text']
    return results, error_line


def build_answers(test_split, results, answer_processor):
    """Produce one upload record per question in *test_split*.

    Questions without a prediction in *results* get an empty answer; all
    others get ``answer_processor(prediction)``.
    """
    all_answers = []
    for x in test_split:
        qid = x['question_id']
        answer = answer_processor(results[qid]) if qid in results else ''
        all_answers.append({'question_id': qid, 'answer': answer})
    return all_answers


if __name__ == '__main__':
    # Imported lazily so the helpers above can be imported without llava_phi.
    from llava_phi.eval.m4c_evaluator import EvalAIAnswerProcessor

    args = parse_args()

    src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl')
    test_split_file = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl')
    dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json')
    os.makedirs(os.path.dirname(dst), exist_ok=True)

    results, error_line = load_results(src)
    with open(test_split_file) as f:
        test_split = [json.loads(line) for line in f]

    print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')

    all_answers = build_answers(test_split, results, EvalAIAnswerProcessor())

    # Write through the handle the `with` owns; reopening dst here would leave
    # a second, never-closed handle truncating the same file.
    with open(dst, 'w') as f:
        json.dump(all_answers, f)
#!/bin/bash
# MME evaluation: generate answers, convert them to the MME layout, then score.

python -m llava_phi.eval.model_vqa_loader \
    --model-path checkpoints/llavaPhi-v0-3b-finetune \
    --question-file ./playground/data/eval/MME/llava_mme.jsonl \
    --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \
    --answers-file ./playground/data/eval/MME/answers/llavaPhi-v0-3b.jsonl \
    --temperature 0 \
    --conv-mode phi-2_v0

# Abort if the directory is missing; otherwise the following commands would
# run relative to the wrong working directory.
cd ./playground/data/eval/MME || exit 1

python convert_answer_to_mme.py --experiment llavaPhi-v0-3b

cd eval_tool || exit 1

python calculation.py --results_dir answers/llavaPhi-v0-3b
#!/bin/bash
# POPE evaluation: generate answers with the fine-tuned model, then score them.

EVAL_DIR=./playground/data/eval/pope
QUESTION_FILE="$EVAL_DIR/llava_pope_test.jsonl"
ANSWERS_FILE="$EVAL_DIR/answers/llavaPhi-v0-3b.jsonl"

# Step 1: inference.
python -m llava_phi.eval.model_vqa_loader \
    --model-path ./checkpoints/llavaPhi-v0-3b-finetune \
    --question-file "$QUESTION_FILE" \
    --image-folder /path/to/data/coco/val2014 \
    --answers-file "$ANSWERS_FILE" \
    --temperature 0 \
    --conv-mode phi-2_v0

# Step 2: scoring against the annotations.
python llava_phi/eval/eval_pope.py \
    --annotation-dir "$EVAL_DIR/coco" \
    --question-file "$QUESTION_FILE" \
    --result-file "$ANSWERS_FILE"
#!/bin/bash
# TextVQA evaluation: generate answers with the fine-tuned model, then score them.

EVAL_DIR=./playground/data/eval/textvqa
ANSWERS_FILE="$EVAL_DIR/answers/llavaPhi-v0-3b.jsonl"

# Step 1: inference.
python -m llava_phi.eval.model_vqa_loader \
    --model-path ./checkpoints/llavaPhi-v0-3b-finetune \
    --question-file "$EVAL_DIR/llava_textvqa_val_v051_ocr.jsonl" \
    --image-folder /path/to/data/textvqa/train_images \
    --answers-file "$ANSWERS_FILE" \
    --temperature 0 \
    --conv-mode phi-2_v0

# Step 2: scoring against the validation annotations.
python -m llava_phi.eval.eval_textvqa \
    --annotation-file "$EVAL_DIR/TextVQA_0.5.1_val.json" \
    --result-file "$ANSWERS_FILE"
#!/bin/bash
# VQAv2 evaluation: shard inference across the visible GPUs (one background
# process per GPU), merge the per-shard answers, then convert for submission.

gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$gpu_list"

CHUNKS=${#GPULIST[@]}

CKPT="llavaPhi-v0-3b-finetune"
SPLIT="llava_vqav2_mscoco_test-dev2015"
ANSWER_DIR="./playground/data/eval/vqav2/answers/$SPLIT/$CKPT"

for IDX in $(seq 0 $((CHUNKS-1))); do
    # The model path is derived from CKPT so the checkpoint name is defined
    # in exactly one place.
    CUDA_VISIBLE_DEVICES="${GPULIST[$IDX]}" python -m llava_phi.eval.model_vqa_loader \
        --model-path "./checkpoints/$CKPT" \
        --question-file "./playground/data/eval/vqav2/$SPLIT.jsonl" \
        --image-folder /path/to/data/coco/test2015 \
        --answers-file "$ANSWER_DIR/${CHUNKS}_${IDX}.jsonl" \
        --num-chunks "$CHUNKS" \
        --chunk-idx "$IDX" \
        --temperature 0 \
        --conv-mode phi-2_v0 &
done

# Block until every shard has finished.
wait

output_file="$ANSWER_DIR/merge.jsonl"

# Clear out the output file if it exists.
: > "$output_file"

# Loop through the indices and concatenate each file.
for IDX in $(seq 0 $((CHUNKS-1))); do
    cat "$ANSWER_DIR/${CHUNKS}_${IDX}.jsonl" >> "$output_file"
done

python scripts/convert_vqav2_for_submission.py --split "$SPLIT" --ckpt "$CKPT"
#!/bin/bash
# Run the base-model conversion script with the options below.

convert_args=(
    --model_name_or_path susnato/phi-2
    --version plain
    --data_path pretrain_data/blip_sample.json
    --image_folder pretrain_data/blip_images
    --vision_tower ./clip-vit-large-patch14-336
    --mm_projector_type mlp2x_gelu
    --tune_mm_mlp_adapter True
    --mm_vision_select_layer -2
    --mm_use_im_start_end False
    --mm_use_im_patch_token False
    --bf16 True
    --output_dir ./base_checkpoints_llava_phi
    --num_train_epochs 1
    --per_device_train_batch_size 4
    --per_device_eval_batch_size 4
    --gradient_accumulation_steps 2
    --evaluation_strategy "no"
    --save_strategy "steps"
    --save_steps 24000
    --save_total_limit 1
    --learning_rate 1e-3
    --weight_decay 0.1
    --warmup_ratio 0.
    --lr_scheduler_type "cosine"
    --logging_steps 10
    --tf32 False
    --model_max_length 2048
    --gradient_checkpointing True
    --dataloader_num_workers 4
    --lazy_preprocess True
    --report_to tensorboard
)

python llava_phi/train/convert_model2base_llava_phi.py "${convert_args[@]}"
import argparse
from llava_phi.model.builder import load_pretrained_model
from llava_phi.mm_utils import get_model_name_from_path


def merge_lora(args):
    """Load a checkpoint on the CPU and re-save its model and tokenizer.

    Args:
        args: Namespace providing ``model_path``, ``model_base`` and
            ``save_model_path``.

    NOTE(review): presumably ``load_pretrained_model`` performs the actual
    LoRA merge when given a base model -- confirm against the builder module.
    """
    name = get_model_name_from_path(args.model_path)
    loaded = load_pretrained_model(args.model_path, args.model_base, name, device_map='cpu')
    # The loader returns four values; only the tokenizer and model are saved.
    tokenizer, model, _image_processor, _context_len = loaded

    # Model first, then tokenizer, both into the same output directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(args.save_model_path)


def _parse_cli():
    """Parse the three required path arguments."""
    cli = argparse.ArgumentParser()
    for flag in ("--model-path", "--model-base", "--save-model-path"):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()


if __name__ == "__main__":
    merge_lora(_parse_cli())
"auto" + } +} \ No newline at end of file diff --git a/llava-phi/scripts/zero3.json b/llava-phi/scripts/zero3.json new file mode 100644 index 0000000000000000000000000000000000000000..6917317af62da757ca759a92b326ddfa65b203cc --- /dev/null +++ b/llava-phi/scripts/zero3.json @@ -0,0 +1,28 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/llava-phi/scripts/zero3_offload.json b/llava-phi/scripts/zero3_offload.json new file mode 100644 index 0000000000000000000000000000000000000000..e0a54c2c2bc10f76458c42a43de0970a9251759f --- /dev/null +++ b/llava-phi/scripts/zero3_offload.json @@ -0,0 +1,56 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + 
"sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "steps_per_print": 1e5, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d26a2bbb0fe61ebe06ccf3046119af0569ff42 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +einops==0.6.1 +einops-exts==0.0.4 +timm==0.6.13 +gradio==3.35.2 +gradio_client==0.2.9 +markdown2[all] +numpy +requests +sentencepiece +tokenizers==0.15.0 +torch==2.0.1 +shortuuid +httpx==0.24.0 +deepspeed==0.9.5 +peft==0.4.0 +transformers==4.36.2 +accelerate==0.21.0 +bitsandbytes==0.41.0 +scikit-learn==1.2.2 +sentencepiece==0.1.99 \ No newline at end of file