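"""Audio-multimodal demo for Rockchip NPUs: a Whisper-style RKNN audio encoder
turns a sound clip into embeddings, which an RKLLM language model then consumes
alongside a chat prompt to describe the sound."""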
import os
import signal
import time

import librosa
import numpy as np
from rkllm_binding import *  # local ctypes wrapper around the RKLLM runtime
from rknnlite.api.rknn_lite import RKNNLite
from transformers import WhisperFeatureExtractor

MODEL_PATH = "qwen.rkllm"
AUDIO_ENCODER_PATH = "audio_encoder.rknn"
handle = None

def signal_handler(sig, frame):
    print("Ctrl-C pressed, exiting...")
    global handle
    if handle:
        # Abort any in-flight generation before releasing the model.
        abort(handle)
        destroy(handle)
    exit(0)

signal.signal(signal.SIGINT, signal_handler)

os.environ["RKLLM_LOG_LEVEL"] = "1"

# Used by the streaming callback to report time-to-first-token.
inference_count = 0
inference_start_time = 0

def result_callback(result, userdata, state):
    global inference_start_time
    global inference_count
    if state == LLMCallState.RKLLM_RUN_NORMAL:
        if inference_count == 0:
            first_token_time = time.time()
            print(f"Time to first token: {first_token_time - inference_start_time:.2f} seconds")
        inference_count += 1
        print(result.contents.text.decode(), end="", flush=True)
    elif state == LLMCallState.RKLLM_RUN_FINISH:
        print("\n\n(finished)")
    elif state == LLMCallState.RKLLM_RUN_ERROR:
        print("\nError occurred during LLM call")

# from_pretrained(".") reads preprocessor_config.json from the current directory.
feature_extractor = WhisperFeatureExtractor.from_pretrained(".")

audio_encoder = RKNNLite(verbose=True)
model_size = os.path.getsize(AUDIO_ENCODER_PATH)
print(f"Start loading audio encoder model (size: {model_size / 1024 / 1024:.2f} MB)")
start_time = time.time()
audio_encoder.load_rknn(AUDIO_ENCODER_PATH)
end_time = time.time()
print(f"Audio encoder loaded in {end_time - start_time:.2f} seconds (speed: {model_size / (end_time - start_time) / 1024 / 1024:.2f} MB/s)")
audio_encoder.init_runtime()
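# Optional tweak (an assumption, not part of the original flow): on a
# multi-core NPU such as the RK3588, rknn-toolkit-lite2 lets you pin the
# encoder to one core, e.g.
#   audio_encoder.init_runtime(core_mask=RKNNLite.NPU_CORE_0)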
param = create_default_param()
param.model_path = MODEL_PATH.encode()
# The RKLLM multimodal interface is image-oriented, so its image token fields
# are reused here for the model's audio markers.
param.img_start = "<|audio_bos|>".encode()
param.img_end = "<|audio_eos|>".encode()
param.img_content = "<|AUDIO|>".encode()
param.max_context_len = 1024
extend_param = RKLLMExtendParam()
extend_param.base_domain_id = 1
param.extend_param = extend_param

model_size = os.path.getsize(MODEL_PATH)
print(f"Start loading language model (size: {model_size / 1024 / 1024:.2f} MB)")
start_time = time.time()
handle = init(param, result_callback)
end_time = time.time()
print(f"Language model loaded in {end_time - start_time:.2f} seconds (speed: {model_size / (end_time - start_time) / 1024 / 1024:.2f} MB/s)")
audio_path = "glass-breaking.mp3"

print("Start inference...") |
|
audio, _ = librosa.load(audio_path, sr=feature_extractor.sampling_rate) |
|
feature_extractor_output = feature_extractor( |
|
audio, |
|
sampling_rate=feature_extractor.sampling_rate, |
|
return_attention_mask=True, |
|
padding="max_length" |
|
) |
|
|
|
print(feature_extractor_output.input_features.shape) |
|
start_time = time.time() |
|
audio_embeddings = audio_encoder.inference(inputs=[ |
|
feature_extractor_output.input_features.astype(np.float32), |
|
feature_extractor_output.attention_mask.astype(np.float32) |
|
], data_format="nhwc")[0].astype(np.float32) |
|
end_time = time.time() |
|
print(f"Audio encoder inference time: {end_time - start_time:.2f} seconds") |
|
print(audio_embeddings.flags) |
|
print(audio_embeddings.shape) |
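# The .flags print is a sanity check that the embeddings are a C-contiguous
# float32 buffer, which the ctypes binding presumably requires before they are
# handed to the LLM.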
# ChatML-style prompt. The user turn asks, in Chinese, "What is this sound?"
prompt = """<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Audio 1: <image>
这是什么声音? <|im_end|>
<|im_start|>assistant
"""

rkllm_input = create_rkllm_input(RKLLMInputType.RKLLM_INPUT_MULTIMODAL, prompt=prompt, image_embed=audio_embeddings)
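# The audio embeddings travel through the image_embed field; the runtime
# splices them in at the prompt's "<image>" tag (an assumption based on the
# image-oriented RKLLM multimodal API).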
infer_param = RKLLMInferParam()
infer_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE.value

inference_start_time = time.time()
run(handle, rkllm_input, infer_param, None)

destroy(handle)
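# Also free the NPU runtime held by the audio encoder (RKNNLite.release() is
# part of the rknn-toolkit-lite2 API).
audio_encoder.release()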