wenmengzhou committed on
Commit
5297156
1 Parent(s): 589560d

update model according to hysts advice

Browse files
Files changed (1) hide show
  1. cosyvoice/cli/model.py +4 -6
cosyvoice/cli/model.py CHANGED
@@ -19,18 +19,17 @@ class CosyVoiceModel:
19
  llm: torch.nn.Module,
20
  flow: torch.nn.Module,
21
  hift: torch.nn.Module):
22
- #self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23
- self.device = 'cpu'
24
  self.llm = llm
25
  self.flow = flow
26
  self.hift = hift
27
 
28
  def load(self, llm_model, flow_model, hift_model):
29
- self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
30
  self.llm.to(self.device).eval()
31
- self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
32
  self.flow.to(self.device).eval()
33
- self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
34
  self.hift.to(self.device).eval()
35
 
36
  def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
@@ -38,7 +37,6 @@ class CosyVoiceModel:
38
  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
39
  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
40
  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
41
- self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
42
  tts_speech_token = self.llm.inference(text=text.to(self.device),
43
  text_len=text_len.to(self.device),
44
  prompt_text=prompt_text.to(self.device),
 
19
  llm: torch.nn.Module,
20
  flow: torch.nn.Module,
21
  hift: torch.nn.Module):
22
+ self.device = torch.device('cuda')
 
23
  self.llm = llm
24
  self.flow = flow
25
  self.hift = hift
26
 
27
  def load(self, llm_model, flow_model, hift_model):
28
+ self.llm.load_state_dict(torch.load(llm_model, map_location='cpu'))
29
  self.llm.to(self.device).eval()
30
+ self.flow.load_state_dict(torch.load(flow_model, map_location='cpu'))
31
  self.flow.to(self.device).eval()
32
+ self.hift.load_state_dict(torch.load(hift_model, map_location='cpu'))
33
  self.hift.to(self.device).eval()
34
 
35
  def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
 
37
  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
38
  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
39
  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
 
40
  tts_speech_token = self.llm.inference(text=text.to(self.device),
41
  text_len=text_len.to(self.device),
42
  prompt_text=prompt_text.to(self.device),