|
import importlib
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, TextStreamer

# The model's configuration/modeling/processing code is fetched as plain .py
# files from the Hub repository into the current directory. This has to
# happen BEFORE `processing_llava` / `modeling_llava` are imported: importing
# them first (as the script previously did) raises ModuleNotFoundError on a
# clean working directory, before any download has had a chance to run.
# NOTE(review): the imports below rely on "./" being importable (true when the
# script is started from its own directory) -- confirm for your deployment.
for remote_file in (
    "configuration_llava.py",
    "configuration_phi.py",
    "modeling_llava.py",
    "modeling_phi.py",
    "processing_llava.py",
):
    hf_hub_download(
        repo_id="OEvortex/HelpingAI-Vision",
        filename=remote_file,
        local_dir="./",
        force_download=True,
    )

# The files above were written while the interpreter was already running, so
# clear the import system's finder caches to make sure they are noticed.
importlib.invalidate_caches()

from processing_llava import LlavaProcessor, OpenCLIPImageProcessor  # noqa: E402
from modeling_llava import LlavaForConditionalGeneration  # noqa: E402
|
|
|
|
|
# Load the checkpoint in half precision and move it to the CUDA device.
model = LlavaForConditionalGeneration.from_pretrained(
    "OEvortex/HelpingAI-Vision",
    torch_dtype=torch.float16,
).to("cuda")

# Tokenizer from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("OEvortex/HelpingAI-Vision")

# Image preprocessing is configured from the model's own `preprocess_config`.
image_processor = OpenCLIPImageProcessor(model.config.preprocess_config)

# Single entry point that combines image preprocessing and tokenization.
processor = LlavaProcessor(image_processor, tokenizer)
|
|
|
|
|
def generate_text(image, initial_text):
    """Generate a text response for an image and a text prompt.

    Args:
        image: Image passed through to the module-level ``processor``.
        initial_text: Prompt text passed through to the ``processor``.

    Returns:
        The first generated sequence decoded by the module-level
        ``tokenizer`` with special tokens skipped.
        NOTE(review): the decoded sequence may also contain the prompt
        itself -- confirm against the model's ``generate`` implementation.
    """
    # Sampling configuration for generation; the streamer echoes tokens as
    # they are produced (TextStreamer writes to stdout by default).
    generation_options = {
        "max_new_tokens": 200,
        "do_sample": True,
        "top_p": 0.9,
        "temperature": 1.2,
        "eos_token_id": tokenizer.eos_token_id,
    }

    # NOTE(review): indentation was lost in the source, so the exact extent of
    # the inference_mode block is reconstructed; the whole body is kept under
    # it here -- confirm this matches the intended scope.
    with torch.inference_mode():
        batch = processor(initial_text, image, model, return_tensors='pt')

        # Only these two tensors are moved explicitly to the model's device.
        for key in ('input_ids', 'attention_mask'):
            batch[key] = batch[key].to(model.device)

        token_streamer = TextStreamer(tokenizer)
        generated = model.generate(
            **batch,
            **generation_options,
            streamer=token_streamer,
        )

        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)
|
|
|
|
|
# Gradio UI: an input column (image + prompt) next to an output column
# (generated text), plus a button that runs `generate_text`.
# NOTE(review): indentation was lost in the source; the nesting below is
# reconstructed from the blank-line grouping (button placed under the row,
# still inside the Blocks context) -- confirm the intended layout.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Загрузите изображение")
            text_input = gr.Textbox(label="Введите текст запроса")
        with gr.Column():
            output_text = gr.Textbox(label="Сгенерированный текст")

    generate_button = gr.Button(value="Генерировать текст")
    # Clicking the button sends (image, prompt) to generate_text and writes
    # its return value into the output textbox.
    generate_button.click(
        fn=generate_text,
        inputs=[image_input, text_input],
        outputs=output_text,
    )

# Start the web server for the demo.
demo.launch()