Device_map Auto fails
Hello, thanks for the awesome model! I'm trying to use device_map="auto" to get pipeline-parallel inference across several GPUs, following the inference code you provide on the model card, but I'm getting the following error:
File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/modeling_cogvlm.py", line 620, in forward
outputs = self.model(
^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/modeling_cogvlm.py", line 389, in forward
images_features = self.encode_images(images)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/modeling_cogvlm.py", line 361, in encode_images
images_features = self.vision(images)
^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/visual.py", line 130, in forward
x = self.transformer(x)
^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/visual.py", line 94, in forward
hidden_states = layer_module(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/visual.py", line 83, in forward
output = mlp_input + mlp_output
~~~~~~~~~~^~~~~~~~~~~~
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:5 and cuda:6!
Seems like the fix could be as simple as making sure mlp_input and mlp_output are on the same device before doing the addition. Happy to open a PR with your permission.
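Roughly what I have in mind in visual.py, around the failing line (a sketch, not an exact diff):

```python
# inside the vision TransformerLayer forward, just before the failing residual add:
mlp_output = mlp_output.to(mlp_input.device)  # keep both operands on one device
output = mlp_input + mlp_output
```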
Do not use device_map="auto"; instead, try the multi-GPU demo here: https://github.com/THUDM/CogVLM2/blob/main/basic_demo/cli_demo_multi_gpus.py
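The difference is that the demo builds the device map explicitly rather than letting "auto" split the vision tower across GPUs. A rough sketch of the same idea (the no-split class names and memory budget below are assumptions, not the exact script):

```python
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/cogvlm2-llama3-chat-19B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# build an empty (meta) copy of the model just to compute the device map
config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
with init_empty_weights():
    meta_model = AutoModelForCausalLM.from_config(
        config, torch_dtype=torch.bfloat16, trust_remote_code=True
    )

# never split a single decoder block or vision transformer block across GPUs;
# check modeling_cogvlm.py / visual.py for the actual class names
device_map = infer_auto_device_map(
    meta_model,
    max_memory={i: "23GiB" for i in range(torch.cuda.device_count())},
    no_split_module_classes=["CogVLMDecoderLayer", "TransformerLayer"],
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=device_map,  # explicit map instead of "auto"
).eval()
```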
Did you manage to resolve this? I'm facing the same issue.
I was able to load the model using the GitHub example, but now I’m facing the following issue:
AttributeError Traceback (most recent call last)
Cell In[6], line 67
61 gen_kwargs = {
62 "max_new_tokens": 2048,
63 "pad_token_id": 128002,
64 "top_k": 1,
65 }
66 with torch.no_grad():
---> 67 outputs = model.generate(**inputs, **gen_kwargs)
68 outputs = outputs[:, inputs['input_ids'].shape[1]:]
69 response = tokenizer.decode(outputs[0])
File /vlm/.venv/lib/python3.8/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File /vlm/.venv/lib/python3.8/site-packages/transformers/generation/utils.py:1914, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1906 input_ids, model_kwargs = self._expand_inputs_for_generation(
1907 input_ids=input_ids,
1908 expand_size=generation_config.num_return_sequences,
1909 is_encoder_decoder=self.config.is_encoder_decoder,
1910 **model_kwargs,
1911 )
1913 # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 1914 result = self._sample(
1915 input_ids,
1916 logits_processor=prepared_logits_processor,
1917 logits_warper=prepared_logits_warper,
1918 stopping_criteria=prepared_stopping_criteria,
1919 generation_config=generation_config,
1920 synced_gpus=synced_gpus,
1921 streamer=streamer,
1922 **model_kwargs,
1923 )
1925 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
1926 # 11. prepare logits warper
1927 prepared_logits_warper = (
1928 self._get_logits_warper(generation_config, device=input_ids.device)
1929 if generation_config.do_sample
1930 else None
1931 )
File /vlm/.venv/lib/python3.8/site-packages/transformers/generation/utils.py:2651, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, logits_warper, **model_kwargs)
2648 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2650 # forward pass to get next token
-> 2651 outputs = self(
2652 **model_inputs,
2653 return_dict=True,
2654 output_attentions=output_attentions,
2655 output_hidden_states=output_hidden_states,
2656 )
2658 if synced_gpus and this_peer_finished:
2659 continue # don't waste resources running the code we don't need
File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File /vlm/.venv/lib/python3.8/site-packages/accelerate/hooks.py:169, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
167 output = module._old_forward(*args, **kwargs)
168 else:
--> 169 output = module._old_forward(*args, **kwargs)
170 return module._hf_hook.post_forward(module, output)
File /.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/2bf7de6892877eb50142395af14847519ba95998/modeling_cogvlm.py:649, in CogVLMForCausalLM.forward(self, input_ids, images, token_type_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, labels)
646 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
648 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
--> 649 outputs = self.model(
650 input_ids=input_ids,
651 images=images,
652 token_type_ids=token_type_ids,
653 attention_mask=attention_mask,
654 position_ids=position_ids,
655 past_key_values=past_key_values,
656 inputs_embeds=inputs_embeds,
657 use_cache=use_cache,
658 output_attentions=output_attentions,
659 output_hidden_states=output_hidden_states,
660 return_dict=return_dict,
661 )
663 hidden_states = outputs[0]
664 logits = self.lm_head(hidden_states)
File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File /.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/2bf7de6892877eb50142395af14847519ba95998/modeling_cogvlm.py:403, in CogVLMModel.forward(self, input_ids, images, token_type_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
401 position_ids = build_position_ids(token_type_ids, attention_mask)
402 input_ids = None
--> 403 return self.llm_forward(
404 input_ids=input_ids,
405 token_type_ids=token_type_ids,
406 attention_mask=attention_mask,
407 position_ids=position_ids,
408 past_key_values=past_key_values,
409 inputs_embeds=inputs_embeds,
410 use_cache=use_cache,
411 output_attentions=output_attentions,
412 output_hidden_states=output_hidden_states,
413 return_dict=return_dict,
414 )
File /.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/2bf7de6892877eb50142395af14847519ba95998/modeling_cogvlm.py:452, in CogVLMModel.llm_forward(self, input_ids, token_type_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
449 past_key_values_length = 0
451 if past_key_values is not None:
--> 452 past_key_values_length = past_key_values[0][0].shape[2]
453 seq_length_with_past = seq_length_with_past + past_key_values_length
455 if position_ids is None:
AttributeError: 'str' object has no attribute 'shape'
transformers==4.40 solved the problem.
past_key_values_length = past_key_values[0][0].shape[2]
This problem is most likely caused by an incorrect transformers version; please change it to 4.40.2.
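For example, after pinning the version in the same environment the model is loaded from, a quick sanity check:

```python
# pip install "transformers==4.40.2"
import transformers
print(transformers.__version__)  # should report 4.40.2 before loading the model
```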
Hello! I used the provided multi-GPU code and it works, thank you! But now I get this error:
inputs_embeds = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features)
RuntimeError: shape mismatch: value tensor of shape [2306, 4096] cannot be broadcast to indexing result of shape [0, 4096]
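That shape mismatch usually means token_type_ids contains no vision-token positions even though image features were computed, i.e., the text side of the inputs was built without the image placeholders. A sketch of building the inputs the way the model card does (DEVICE, dtype, and the file name are assumptions; model and tokenizer are loaded as above):

```python
import torch
from PIL import Image

DEVICE = "cuda:0"          # device holding the input embedding layer
TORCH_TYPE = torch.bfloat16

image = Image.open("example.jpg").convert("RGB")
query = "Describe this image."

# build_conversation_input_ids inserts the vision placeholder tokens and returns
# matching token_type_ids; building inputs without images yields zero vision positions
input_by_model = model.build_conversation_input_ids(
    tokenizer,
    query=query,
    history=[],
    images=[image],
    template_version="chat",
)
inputs = {
    "input_ids": input_by_model["input_ids"].unsqueeze(0).to(DEVICE),
    "token_type_ids": input_by_model["token_type_ids"].unsqueeze(0).to(DEVICE),
    "attention_mask": input_by_model["attention_mask"].unsqueeze(0).to(DEVICE),
    "images": [[input_by_model["images"][0].to(DEVICE).to(TORCH_TYPE)]],
}
```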