How to deploy to SageMaker
Can anyone help me figure out how to deploy this to SageMaker? I tried the following, but it fails with the error below.
hub = {
'HF_MODEL_ID':'mistral-community/Mixtral-8x22B-v0.1-AWQ',
'SM_NUM_GPUS': json.dumps(8)
}
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.2"),
env=hub,
role=role,
)
# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
initial_instance_count=1,
instance_type="ml.g5.48xlarge",
container_startup_health_check_timeout=1500,
)
Traceback (most recent call last):
File "/opt/conda/bin/text-generation-server", line 8, in <module>
sys.exit(app())
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py", line 89, in serve
server.serve(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 235, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 196, in serve_inner
model = get_model(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/__init__.py", line 390, in get_model
return FlashMixtral(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_mixtral.py", line 21, in __init__
super(FlashMixtral, self).__init__(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_mistral.py", line 335, in __init__
model = model_cls(config, weights)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 812, in __init__
self.model = MixtralModel(config, weights)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 749, in __init__
[
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 750, in <listcomp>
MixtralLayer(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 684, in __init__
self.self_attn = MixtralAttention(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 224, in __init__
self.query_key_value = load_attention(config, prefix, weights)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 117, in load_attention
return _load_gqa(config, prefix, weights)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py", line 132, in _load_gqa
weight = weights.get_multi_weights_col(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/weights.py", line 257, in get_multi_weights_col
w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/weights.py", line 257, in <listcomp>
w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/weights.py", line 112, in get_sharded
filename, tensor_name = self.get_filename(tensor_name)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/weights.py", line 63, in get_filename
raise RuntimeError(f"weight {tensor_name} does not exist")