JiakaiDu committed on
Commit b18fb8c
1 Parent(s): df0b1a0

Upload folder using huggingface_hub

Files changed (1): Test_RAG.py +1 -8
Test_RAG.py CHANGED
@@ -282,13 +282,6 @@ print(f"Loading model from {model_dir}")
 
 ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
 
-if "GPU" in llm_device and "qwen2-7b-instruct" in llm_model_id:
-    ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO"
-
-# On a GPU device a model is executed in FP16 precision. For the red-pajama-3b-chat model there are known accuracy
-# issues caused by this, which we avoid by setting the precision hint to "f32".
-if llm_model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device in ["GPU", "AUTO"]:
-    ov_config["INFERENCE_PRECISION_HINT"] = "f32"
 
 # llm = HuggingFacePipeline.from_model_id(
 #     model_id= "meta-llama/Meta-Llama-3-8B",
@@ -304,7 +297,7 @@ if llm_model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and
 # )
 from optimum.intel.openvino import OVModelForCausalLM
 from transformers import pipeline
-
+print("starting setting llm model")
 
 model_id = "meta-llama/Meta-Llama-3-8B"
 ov_config = {"PERFORMANCE_HINT": "LATENCY"}  # This is an example; check your actual ov_config
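
For context on the code path this commit leaves in place: after the removed device-specific tweaks, the script falls through to loading Llama 3 via optimum-intel's OVModelForCausalLM and a transformers pipeline. Below is a minimal sketch of how that wiring typically looks; export=True, device="CPU", the tokenizer setup, and the generation arguments are illustrative assumptions, not lines from Test_RAG.py.

# Sketch only (assumptions flagged in comments): wiring OVModelForCausalLM
# into a transformers pipeline. None of these lines are part of this commit.
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

model_id = "meta-llama/Meta-Llama-3-8B"
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,       # assumption: convert the checkpoint to OpenVINO IR on the fly
    device="CPU",      # assumption: the script may target GPU or AUTO instead
    ov_config=ov_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
print(pipe("What is retrieval-augmented generation?")[0]["generated_text"])

The commented-out HuggingFacePipeline.from_model_id block in the hunk suggests the pipeline is ultimately wrapped for LangChain; passing the ov_config shown above keeps the LATENCY performance hint in effect when OpenVINO compiles the model.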