from transformers import PretrainedConfig


class TyphoonAudioConfig(PretrainedConfig):
    """Configuration for Typhoon-Audio: paths to the Whisper speech encoder and
    the Typhoon LLM, speech Q-Former size, audio framing, and LoRA settings."""

    model_type = "typhoonaudio"

    def __init__(
        self,
        whisper_path="biodatlab/whisper-th-large-v3-combined",
        llm_path="scb10x/llama-3-typhoon-v1.5-8b-instruct",
        speech_qformer_token_num=1,
        speech_qformer_layer=2,
        second_per_frame=0.333333,
        second_stride=0.333333,
        lora=True,
        lora_alpha=32,
        lora_rank=8,
        lora_dropout=0.0,
        dtype="float16",
        **kwargs
    ):
        # Hugging Face checkpoints for the speech encoder and the language model.
        self.whisper_path = whisper_path
        self.llm_path = llm_path

        # Speech Q-Former: number of query tokens per window and number of layers.
        self.speech_qformer_token_num = speech_qformer_token_num
        self.speech_qformer_layer = speech_qformer_layer

        # Temporal framing of the speech features, in seconds.
        self.second_per_frame = second_per_frame
        self.second_stride = second_stride

        # LoRA settings for adapting the LLM.
        self.lora = lora
        self.lora_alpha = lora_alpha
        self.lora_rank = lora_rank
        self.lora_dropout = lora_dropout

        # Data type used when loading model weights (e.g. "float16").
        self.dtype = dtype

        super().__init__(**kwargs)
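

# Illustrative usage sketch: building the config, saving it, and reloading it
# through the standard PretrainedConfig API. The overrides and the output
# directory name below are arbitrary examples, not values from the original file.
if __name__ == "__main__":
    config = TyphoonAudioConfig(lora_rank=16, dtype="bfloat16")
    config.save_pretrained("./typhoon_audio_config")  # writes config.json
    reloaded = TyphoonAudioConfig.from_pretrained("./typhoon_audio_config")
    print(reloaded.llm_path, reloaded.speech_qformer_token_num)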