Upload folder using huggingface_hub
Test_RAG.py CHANGED (+1, -8)
@@ -282,13 +282,6 @@ print(f"Loading model from {model_dir}")
 
 ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
 
-if "GPU" in llm_device and "qwen2-7b-instruct" in llm_model_id:
-    ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO"
-
-# On a GPU device a model is executed in FP16 precision. For the red-pajama-3b-chat model there are known accuracy
-# issues caused by this, which we avoid by setting the precision hint to "f32".
-if llm_model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device in ["GPU", "AUTO"]:
-    ov_config["INFERENCE_PRECISION_HINT"] = "f32"
 
 # llm = HuggingFacePipeline.from_model_id(
 #     model_id= "meta-llama/Meta-Llama-3-8B",
@@ -304,7 +297,7 @@ if llm_model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and
 # )
 from optimum.intel.openvino import OVModelForCausalLM
 from transformers import pipeline
-
+print("starting setting llm model")
 
 model_id = "meta-llama/Meta-Llama-3-8B"
 ov_config = {"PERFORMANCE_HINT": "LATENCY"}  # This is an example; check your actual ov_config
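
The diff cuts off right after the new print statement, the Llama-3 model_id, and the simplified ov_config, so the code that actually builds the LLM is not visible in this view. The sketch below shows one common way OVModelForCausalLM and a transformers pipeline are combined in a LangChain RAG script like this one; the export/device/max_new_tokens arguments and the HuggingFacePipeline wrapper (echoing the commented-out call above) are assumptions, not lines taken from Test_RAG.py.

    # Sketch only: assumed wiring for the part of Test_RAG.py not shown in the diff.
    from optimum.intel.openvino import OVModelForCausalLM
    from transformers import AutoTokenizer, pipeline
    from langchain_community.llms import HuggingFacePipeline  # assumed import path

    model_id = "meta-llama/Meta-Llama-3-8B"
    ov_config = {"PERFORMANCE_HINT": "LATENCY"}

    # export=True converts the checkpoint to OpenVINO IR on first load; omit it
    # if model_id already points at an exported OpenVINO model directory.
    ov_model = OVModelForCausalLM.from_pretrained(
        model_id,
        export=True,
        ov_config=ov_config,
        device="CPU",  # or "GPU"/"AUTO", matching llm_device elsewhere in the script
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    generator = pipeline(
        "text-generation",
        model=ov_model,
        tokenizer=tokenizer,
        max_new_tokens=256,  # illustrative value
    )

    # Wrapped for LangChain, mirroring the commented-out HuggingFacePipeline call above.
    llm = HuggingFacePipeline(pipeline=generator)

In short, this commit drops the qwen2-7b-instruct and red-pajama-3b-chat device-specific ov_config overrides and adds a startup print before the LLM setup; the loading flow itself is unchanged by the diff shown here.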