Device_map Auto fails

#2
by arnaudstiegler - opened

Hello, thanks for the awesome model! I'm trying to use device_map="auto" to get pipeline parallelism for inference, following the inference code provided on the model card, but generation fails with the error shown further down.
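For context, the load call looks roughly like this (a sketch of the model-card example; my exact arguments may differ slightly):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/cogvlm2-llama3-chat-19B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,   # bfloat16, as in the model card
    trust_remote_code=True,       # CogVLM2 ships custom modeling code
    device_map="auto",            # let accelerate shard layers across GPUs
).eval()

With that setup, the forward pass raises: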

  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/modeling_cogvlm.py", line 620, in forward
    outputs = self.model(
              ^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/modeling_cogvlm.py", line 389, in forward
    images_features = self.encode_images(images)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/modeling_cogvlm.py", line 361, in encode_images
    images_features = self.vision(images)
                      ^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/visual.py", line 130, in forward
    x = self.transformer(x)
        ^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/visual.py", line 94, in forward
    hidden_states = layer_module(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/9f600cfc163c089d188722a615a6150a3340f6ca/visual.py", line 83, in forward
    output = mlp_input + mlp_output
             ~~~~~~~~~~^~~~~~~~~~~~
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:5 and cuda:6!

It seems the fix could be as simple as making sure mlp_input and mlp_output are on the same device before the addition. Happy to open a PR with your permission.
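For reference, a minimal sketch of what that change in visual.py could look like (variable names taken from the traceback; this is not the repository's actual patch):

# visual.py, around the residual add at line 83 (sketch; only this line changes)
# Move the MLP output onto the residual input's device before adding, so the
# residual connection still works when accelerate places layers on different GPUs.
output = mlp_input + mlp_output.to(mlp_input.device)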

Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University org

Did you get to resolve this? I'm facing the same issue.

I was able to load the model using the GitHub example, but now I’m facing the following issue:

AttributeError                            Traceback (most recent call last)
Cell In[6], line 67
     61 gen_kwargs = {
     62     "max_new_tokens": 2048,
     63     "pad_token_id": 128002,
     64     "top_k": 1,
     65 }
     66 with torch.no_grad():
---> 67     outputs = model.generate(**inputs, **gen_kwargs)
     68     outputs = outputs[:, inputs['input_ids'].shape[1]:]
     69     response = tokenizer.decode(outputs[0])

File /vlm/.venv/lib/python3.8/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File /vlm/.venv/lib/python3.8/site-packages/transformers/generation/utils.py:1914, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1906     input_ids, model_kwargs = self._expand_inputs_for_generation(
   1907         input_ids=input_ids,
   1908         expand_size=generation_config.num_return_sequences,
   1909         is_encoder_decoder=self.config.is_encoder_decoder,
   1910         **model_kwargs,
   1911     )
   1913     # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 1914     result = self._sample(
   1915         input_ids,
   1916         logits_processor=prepared_logits_processor,
   1917         logits_warper=prepared_logits_warper,
   1918         stopping_criteria=prepared_stopping_criteria,
   1919         generation_config=generation_config,
   1920         synced_gpus=synced_gpus,
   1921         streamer=streamer,
   1922         **model_kwargs,
   1923     )
   1925 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
   1926     # 11. prepare logits warper
   1927     prepared_logits_warper = (
   1928         self._get_logits_warper(generation_config, device=input_ids.device)
   1929         if generation_config.do_sample
   1930         else None
   1931     )

File /vlm/.venv/lib/python3.8/site-packages/transformers/generation/utils.py:2651, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, logits_warper, **model_kwargs)
   2648 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2650 # forward pass to get next token
-> 2651 outputs = self(
   2652     **model_inputs,
   2653     return_dict=True,
   2654     output_attentions=output_attentions,
   2655     output_hidden_states=output_hidden_states,
   2656 )
   2658 if synced_gpus and this_peer_finished:
   2659     continue  # don't waste resources running the code we don't need

File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File /vlm/.venv/lib/python3.8/site-packages/accelerate/hooks.py:169, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    167         output = module._old_forward(*args, **kwargs)
    168 else:
--> 169     output = module._old_forward(*args, **kwargs)
    170 return module._hf_hook.post_forward(module, output)

File /.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/2bf7de6892877eb50142395af14847519ba95998/modeling_cogvlm.py:649, in CogVLMForCausalLM.forward(self, input_ids, images, token_type_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, labels)
    646 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    648 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
--> 649 outputs = self.model(
    650     input_ids=input_ids,
    651     images=images,
    652     token_type_ids=token_type_ids,
    653     attention_mask=attention_mask,
    654     position_ids=position_ids,
    655     past_key_values=past_key_values,
    656     inputs_embeds=inputs_embeds,
    657     use_cache=use_cache,
    658     output_attentions=output_attentions,
    659     output_hidden_states=output_hidden_states,
    660     return_dict=return_dict,
    661 )
    663 hidden_states = outputs[0]
    664 logits = self.lm_head(hidden_states)

File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File /vlm/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File /.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/2bf7de6892877eb50142395af14847519ba95998/modeling_cogvlm.py:403, in CogVLMModel.forward(self, input_ids, images, token_type_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    401         position_ids = build_position_ids(token_type_ids, attention_mask)
    402     input_ids = None
--> 403 return self.llm_forward(
    404     input_ids=input_ids,
    405     token_type_ids=token_type_ids,
    406     attention_mask=attention_mask,
    407     position_ids=position_ids,
    408     past_key_values=past_key_values,
    409     inputs_embeds=inputs_embeds,
    410     use_cache=use_cache,
    411     output_attentions=output_attentions,
    412     output_hidden_states=output_hidden_states,
    413     return_dict=return_dict,
    414 )

File /.cache/huggingface/modules/transformers_modules/THUDM/cogvlm2-llama3-chat-19B/2bf7de6892877eb50142395af14847519ba95998/modeling_cogvlm.py:452, in CogVLMModel.llm_forward(self, input_ids, token_type_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    449 past_key_values_length = 0
    451 if past_key_values is not None:
--> 452     past_key_values_length = past_key_values[0][0].shape[2]
    453     seq_length_with_past = seq_length_with_past + past_key_values_length
    455 if position_ids is None:

AttributeError: 'str' object has no attribute 'shape'

transformers==4.40 solved the problem.

Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University org

past_key_values_length = past_key_values[0][0].shape[2]
This is most likely caused by a mismatched transformers version; please switch to 4.40.2.
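If it helps, pinning the version is a one-liner (assuming a pip-based environment):

pip install "transformers==4.40.2"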

Hello! I used the provided multi-GPU code and it works, thank you! Now I get this error:

inputs_embeds = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features)

RuntimeError: shape mismatch: value tensor of shape [2306, 4096] cannot be broadcast to indexing result of shape [0, 4096]
