Ketengan-Diffusion-Lab committed on
Commit
85baff2
1 Parent(s): 93f8b15

Update app.py

Files changed (1)
  1. app.py +95 -77
app.py CHANGED
@@ -1,91 +1,109 @@
 
  import gradio as gr
  import torch
  import transformers
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from PIL import Image
  import warnings
- from accelerate import Accelerator, DistributedType
- import os

  # disable some warnings
  transformers.logging.set_verbosity_error()
  transformers.logging.disable_progress_bar()
  warnings.filterwarnings('ignore')

- # Initialize Accelerator
- accelerator = Accelerator()
-
- model_name = 'cognitivecomputations/dolphin-vision-72b'
-
- # Determine the number of GPUs available
- num_gpus = torch.cuda.device_count()
- print(f"Number of GPUs available: {num_gpus}")
-
- # Load model and tokenizer
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     torch_dtype=torch.float16,
-     device_map="auto",
-     trust_remote_code=True
- )
-
- tokenizer = AutoTokenizer.from_pretrained(
-     model_name,
-     trust_remote_code=True
- )
-
- # Prepare model
- model = accelerator.prepare(model)
-
- def inference(prompt, image, temperature, beam_size):
-     messages = [
-         {"role": "user", "content": f'<image>\n{prompt}'}
-     ]
-     text = tokenizer.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )
-
-     text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-     input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
-
-     image_tensor = model.process_images([image], model.config)
-
-     # Move tensors to the appropriate device
-     input_ids = input_ids.to(accelerator.device)
-     image_tensor = image_tensor.to(accelerator.device)
-
-     # generate
-     with torch.cuda.amp.autocast():
-         output_ids = accelerator.unwrap_model(model).generate(
-             input_ids,
-             images=image_tensor,
-             max_new_tokens=1024,
-             temperature=temperature,
-             num_beams=beam_size,
-             use_cache=True
-         )[0]
-
-     return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-
- # Create Gradio interface
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column():
-             prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
-             image_input = gr.Image(label="Image", type="pil")
-             temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
-             beam_size_input = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Beam Size")
-             submit_button = gr.Button("Submit")
-         with gr.Column():
-             output_text = gr.Textbox(label="Output")
-
-     submit_button.click(
-         fn=inference,
-         inputs=[prompt_input, image_input, temperature_input, beam_size_input],
-         outputs=output_text
      )

- # Launch the app
- demo.launch()
+ import os
  import gradio as gr
  import torch
+ import torch.distributed as dist
  import transformers
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from PIL import Image
  import warnings

  # disable some warnings
  transformers.logging.set_verbosity_error()
  transformers.logging.disable_progress_bar()
  warnings.filterwarnings('ignore')

+ def setup(rank, world_size):
+     os.environ['MASTER_ADDR'] = 'localhost'
+     os.environ['MASTER_PORT'] = '12355'
+     dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+ def cleanup():
+     dist.destroy_process_group()
+
+ def load_model_on_gpus(model_name, num_gpus):
+     # Calculate number of layers to assign to each GPU
+     model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, trust_remote_code=True)
+     num_layers = len(model.model.layers)
+     layers_per_gpu = num_layers // num_gpus
+
+     # Assign layers to GPUs
+     device_map = {}
+     for i in range(num_layers):
+         device_map[f'model.layers.{i}'] = i // layers_per_gpu
+
+     # Assign other components
+     device_map['model.embed_tokens'] = 0
+     device_map['model.norm'] = num_gpus - 1
+     device_map['lm_head'] = num_gpus - 1
+
+     return AutoModelForCausalLM.from_pretrained(
+         model_name,
+         device_map=device_map,
+         torch_dtype=torch.float16,
+         trust_remote_code=True
      )

+ def run_distributed(rank, world_size, model_name):
+     setup(rank, world_size)
+
+     if rank == 0:
+         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+     model = load_model_on_gpus(model_name, world_size)
+
+     def inference(prompt, image, temperature, beam_size):
+         if rank == 0:
+             messages = [{"role": "user", "content": f'<image>\n{prompt}'}]
+             text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+             input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0).to(rank)
+             image_tensor = model.process_images([image], model.config).to(rank)
+         else:
+             input_ids = torch.zeros(1, 1, dtype=torch.long).to(rank)
+             image_tensor = torch.zeros(1, 3, 224, 224).to(rank)
+
+         dist.broadcast(input_ids, src=0)
+         dist.broadcast(image_tensor, src=0)
+
+         with torch.cuda.amp.autocast():
+             output_ids = model.generate(
+                 input_ids,
+                 images=image_tensor,
+                 max_new_tokens=1024,
+                 temperature=temperature,
+                 num_beams=beam_size,
+                 use_cache=True
+             )[0]
+
+         if rank == 0:
+             return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+         else:
+             return ""
+
+     if rank == 0:
+         with gr.Blocks() as demo:
+             with gr.Row():
+                 with gr.Column():
+                     prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
+                     image_input = gr.Image(label="Image", type="pil")
+                     temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
+                     beam_size_input = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Beam Size")
+                     submit_button = gr.Button("Submit")
+                 with gr.Column():
+                     output_text = gr.Textbox(label="Output")
+
+             submit_button.click(
+                 fn=inference,
+                 inputs=[prompt_input, image_input, temperature_input, beam_size_input],
+                 outputs=output_text
+             )
+
+         demo.launch(share=True)
+
+     cleanup()
+
+ if __name__ == "__main__":
+     model_name = 'cognitivecomputations/dolphin-vision-72b'
+     world_size = torch.cuda.device_count()
+     print(f"Running on {world_size} GPUs")
+     torch.multiprocessing.spawn(run_distributed, args=(world_size, model_name), nprocs=world_size, join=True)
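
For reference, a minimal standalone sketch of the layer-partitioning idea behind the new load_model_on_gpus: decoder layers are mapped to GPUs in contiguous, even slices, with the embeddings pinned to the first device and the final norm and LM head to the last. The layer and GPU counts below are assumptions chosen for illustration, and the min(...) clamp is an extra guard (not in the committed code) for when the layer count does not divide evenly by the GPU count:

# Illustrative sketch only: counts are assumed, not read from the model
num_layers = 80                              # assumed decoder depth
num_gpus = 4                                 # assumed visible GPUs
layers_per_gpu = num_layers // num_gpus      # 20 layers per device

device_map = {
    f"model.layers.{i}": min(i // layers_per_gpu, num_gpus - 1)  # clamp added here as a safety assumption
    for i in range(num_layers)
}
device_map["model.embed_tokens"] = 0         # input embeddings on the first GPU
device_map["model.norm"] = num_gpus - 1      # final norm on the last GPU
device_map["lm_head"] = num_gpus - 1         # output head on the last GPU

print(device_map["model.layers.0"], device_map["model.layers.79"])  # -> 0 3

A dict like this is what the commit passes as device_map to AutoModelForCausalLM.from_pretrained; compared with the removed device_map="auto" path, the hand-built map trades automatic memory balancing for explicit control over which layer lands on which GPU.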