import os
import subprocess

import torch
import gradio as gr
import spaces
from transformers import AutoModel, AutoTokenizer, pipeline

# Install flash-attn at startup: InternVL2 needs it, and it cannot be built
# through requirements.txt on a Gradio Space.
# See: https://discuss.huggingface.co/t/how-to-install-flash-attention-on-hf-gradio-space/70698/2
# Keep the existing environment (PATH etc.) and only add the skip flag;
# passing a bare env dict would hide pip from the shell.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

model_name = "OpenGVLab/InternVL2-8B"

try:
    # Model: InternVL2 ships custom modeling code, so trust_remote_code is required.
    model = (
        AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            # low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        .cuda()
        .eval()
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Pipeline: note that the generic visual-question-answering pipeline may not
    # support custom remote-code models such as InternVL2; the model card
    # documents model.chat() instead (see the sketch at the bottom of this file).
    inference = pipeline(
        task="visual-question-answering", model=model, tokenizer=tokenizer
    )
except Exception as error:
    # Raised at import time, this aborts startup and surfaces the error in the Space logs.
    raise gr.Error("❌ " + str(error), duration=30)


@spaces.GPU
def predict(input_img, question):
    try:
        gr.Info("pipeline: " + str(type(inference)))
        gr.Info("model: " + str(type(model)))
        predictions = inference(question=question, image=input_img)
        return str(predictions)
    except Exception as e:
        raise gr.Error("❌ " + str(e), duration=25)


gradio_app = gr.Interface(
    predict,
    inputs=[
        gr.Image(label="Select an Image", sources=["upload", "webcam"], type="pil"),
        "text",
    ],
    outputs="text",
    title="Ask Me Anything",
)

if __name__ == "__main__":
    gradio_app.launch(show_error=True, debug=True)
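
# -----------------------------------------------------------------------------
# Hedged alternative: InternVL2's native chat() API.
# This sketch is NOT part of the original script. It assumes the
# chat(tokenizer, pixel_values, question, generation_config) signature and the
# 448x448 ImageNet-normalized input documented on the InternVL2-8B model card,
# and uses a simplified single-tile preprocessing in place of the model card's
# dynamic tiling. Treat it as a starting point, not a drop-in fix.
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def preprocess_image(pil_image, input_size=448):
    # Resize to a single 448x448 tile and normalize with ImageNet statistics.
    transform = T.Compose(
        [
            T.Lambda(lambda img: img.convert("RGB")),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
    return transform(pil_image).unsqueeze(0)  # shape: (1, 3, 448, 448)


@spaces.GPU
def predict_with_chat(input_img, question):
    pixel_values = preprocess_image(input_img).to(torch.bfloat16).cuda()
    generation_config = dict(max_new_tokens=512, do_sample=False)
    # "<image>" marks where the image tokens are spliced into the prompt.
    return model.chat(
        tokenizer, pixel_values, "<image>\n" + question, generation_config
    )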