Ketengan-Diffusion-Lab committed
Commit ef394e0
1 Parent(s): df0f804

Update app.py

Files changed (1)
  1. app.py +40 -34
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import torch
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoImageProcessor
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
 import warnings

@@ -10,61 +10,66 @@ transformers.logging.set_verbosity_error()
 transformers.logging.disable_progress_bar()
 warnings.filterwarnings('ignore')

-# Set device to GPU if available, else CPU
+model_name = 'cognitivecomputations/dolphin-vision-72b'
+
+# Set up GPU memory optimization
+torch.cuda.empty_cache()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")

-# Update model path to your local path
-model_name = 'failspy/kappa-3-phi-abliterated'
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

-# create model and load it to the specified device
+# Load model with memory optimizations
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
     device_map="auto",
-    trust_remote_code=True
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-    trust_remote_code=True
+    trust_remote_code=True,
+    offload_folder="offload",  # Offload to disk if necessary
+    offload_state_dict=True,  # Offload state dict to CPU
+    max_memory={0: "40GB"}  # Limit GPU memory usage
 )

 def inference(prompt, image, temperature, beam_size):
-    # Phi-3 uses a chat template
     messages = [
-        {"role": "user", "content": f"Can you describe this image?\n{prompt}"}
+        {"role": "user", "content": f'<image>\n{prompt}'}
     ]
-
-    # Apply chat template and add generation prompt
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(device)
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(device)

-    # Process the image using AutoImageProcessor
-    image_processor = AutoImageProcessor.from_pretrained(model_name)
-    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)
+    image_tensor = model.process_images([image], model.config).to(device)

-    # Add debug prints
-    print(f"Device of model: {next(model.parameters()).device}")
-    print(f"Device of inputs: {inputs.input_ids.device}")
-    print(f"Device of pixel_values: {pixel_values.device}")
+    # Clear GPU memory
+    torch.cuda.empty_cache()

-    # generate
+    # Generate with memory optimization
     with torch.cuda.amp.autocast():
         output_ids = model.generate(
-            inputs.input_ids,
-            pixel_values=pixel_values,
+            input_ids,
+            images=image_tensor,
             max_new_tokens=1024,
             temperature=temperature,
             num_beams=beam_size,
-            use_cache=True
+            use_cache=True,
+            do_sample=True,
+            repetition_penalty=1.1,
+            length_penalty=1.0,
+            no_repeat_ngram_size=3
         )[0]

-    return tokenizer.decode(output_ids[inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
+    # Clear GPU memory again
+    torch.cuda.empty_cache()
+
+    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

+# Create Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
@@ -82,4 +87,5 @@ with gr.Blocks() as demo:
         outputs=output_text
     )

-demo.launch(share=True)
+# Launch the app
+demo.launch()
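
For anyone exercising this revision locally, the sketch below shows one way to call the updated demo programmatically once demo.launch() is running. It is a minimal, hedged example and not part of the commit: it assumes the app is reachable on Gradio's default local address, that the click handler keeps the auto-generated endpoint name derived from the function ("/inference"), and that the inputs are wired in the same order as the inference signature (prompt, image, temperature, beam_size); client.view_api() will show the real names if they differ.

# Hypothetical client-side smoke test; the sample path and settings are placeholders.
from gradio_client import Client, handle_file  # recent gradio_client; older versions accept a plain filepath

# Assumed local address; a deployed Space would use its public URL instead.
client = Client("http://127.0.0.1:7860/")

result = client.predict(
    "Describe this image in detail.",  # prompt
    handle_file("sample.jpg"),         # image (placeholder path or URL)
    0.7,                               # temperature
    1,                                 # beam_size
    api_name="/inference",             # assumed default endpoint name; verify with client.view_api()
)
print(result)

Note that because this commit drops share=True, the app no longer requests a public *.gradio.live link and is only reachable at its local or Space address by default.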