aluminumbox SuperPauly committed on
Commit 73e47b2 · verified · 1 Parent(s): 9951b31

Added English translation, more PEP8 compliant (#1)


- Added English translation, more PEP8 compliant (d1a93cded949c9e46541b1df7d37b51ed6600db8)


Co-authored-by: Paul S <[email protected]>

Files changed (1)
  1. app.py +379 -90
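
The core of the change is a per-language string table plus gr.update relabelling whenever the language radio changes. As a rough illustration of that pattern only (a minimal standalone sketch, not code from this commit; the component names and strings here are made up), the idea looks like this:

# Minimal sketch: per-language string table + gr.update relabelling,
# the same pattern app.py adopts for its En/Zh toggle.
import gradio as gr

TEXT = {
    "En": {"name_label": "Name", "btn": "Greet"},
    "Zh": {"name_label": "姓名", "btn": "问候"},
}

def relabel(lang):
    # Return one gr.update per output component, in the same order as `outputs`.
    return (
        gr.update(label=TEXT[lang]["name_label"]),
        gr.update(value=TEXT[lang]["btn"]),
    )

with gr.Blocks() as demo:
    lang = gr.Radio(choices=["En", "Zh"], value="En", label="Language")
    name = gr.Textbox(label=TEXT["En"]["name_label"])
    btn = gr.Button(TEXT["En"]["btn"])
    # Internal values stay language-independent; only labels/text are swapped.
    lang.change(fn=relabel, inputs=[lang], outputs=[name, btn])

if __name__ == "__main__":
    demo.launch()
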
app.py CHANGED
@@ -32,35 +32,164 @@ from huggingface_hub import snapshot_download as hf_snapshot_download
  hf_snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B')
  snapshot_download('iic/SenseVoiceSmall', local_dir='pretrained_models/SenseVoiceSmall')
  hf_snapshot_download('FunAudioLLM/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
- os.system('cd pretrained_models/CosyVoice-ttsfrd/ && pip install ttsfrd_dependency-0.1-py3-none-any.whl && pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl && apt install -y unzip && rm -rf resource && unzip resource.zip -d .')

  from cosyvoice.cli.cosyvoice import AutoModel as CosyVoiceAutoModel
  from cosyvoice.utils.file_utils import logging, load_wav
  from cosyvoice.utils.common import set_all_random_seed, instruct_list

- inference_mode_list = ['3s极速复刻', '自然语言控制']
- instruct_dict = {'3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
-                  '自然语言控制': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入instruct文本\n3. 点击生成音频按钮'}
- stream_mode_list = [('否', False)]
- max_val = 0.8

- def generate_seed():
-     seed = random.randint(1, 100000000)
-     return {
-         "__type__": "update",
-         "value": seed
-     }

  top_db = 60
  hop_length = 220
  win_length = 440
  def postprocess(wav):
      speech = load_wav(wav, target_sr=target_sr, min_sr=16000)
      speech, _ = librosa.effects.trim(
-         speech, top_db=top_db,
-         frame_length=win_length,
-         hop_length=hop_length
      )
      if speech.abs().max() > max_val:
          speech = speech / speech.abs().max() * max_val
@@ -69,134 +198,294 @@ def postprocess(wav):
      return wav


- def change_instruction(mode_checkbox_group):
-     return instruct_dict[mode_checkbox_group]
-
- @spaces.GPU
  def prompt_wav_recognition(prompt_wav):
-     res = asr_model.generate(input=prompt_wav,
-                              language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
-                              use_itn=True,
      )
-     text = res[0]["text"].split('|>')[-1]
      return text

  @spaces.GPU
- def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                    seed, stream):
      stream = False
      if len(tts_text) > 200:
-         gr.Warning('您输入的文字过长,请限制在200字以内')
          return (target_sr, default_data)
-     sft_dropdown, speed = '', 1.0
      if prompt_wav_upload is not None:
          prompt_wav = prompt_wav_upload
      elif prompt_wav_record is not None:
          prompt_wav = prompt_wav_record
      else:
          prompt_wav = None
-     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
-     if mode_checkbox_group in ['自然语言控制']:
-         if instruct_text == '':
-             gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
              return (target_sr, default_data)
          if prompt_wav is None:
-             gr.Info('您正在使用自然语言控制模式, 请输入prompt音频')
              return (target_sr, default_data)
-     # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
-     if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
          if prompt_wav is None:
-             gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
              return (target_sr, default_data)
          info = torchaudio.info(prompt_wav)
          if info.sample_rate < prompt_sr:
-             gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
              return (target_sr, default_data)
          if info.num_frames / info.sample_rate > 10:
-             gr.Warning('请限制输入音频在10s内,避免推理效果过低')
              return (target_sr, default_data)
-     # zero_shot mode only use prompt_wav prompt text
-     if mode_checkbox_group in ['3s极速复刻']:
-         if prompt_text == '':
-             gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
-             return (target_sr, default_data)
-         if instruct_text != '':
-             gr.Info('您正在使用3s极速复刻模式,instruct文本会被忽略!')
-         info = torchaudio.info(prompt_wav)
-         if info.num_frames / info.sample_rate > 10:
-             gr.Warning('请限制输入音频在10s内,避免推理效果过低')
              return (target_sr, default_data)
-     if mode_checkbox_group == '3s极速复刻':
-         logging.info('get zero_shot inference request')
          set_all_random_seed(seed)
          speech_list = []
-         for i in cosyvoice.inference_zero_shot(tts_text, 'You are a helpful assistant.<|endofprompt|>' + prompt_text, postprocess(prompt_wav), stream=stream, speed=speed):
-             speech_list.append(i['tts_speech'])
          return (target_sr, torch.concat(speech_list, dim=1).numpy().flatten())
-     elif mode_checkbox_group == '自然语言控制':
-         logging.info('get instruct inference request')
          set_all_random_seed(seed)
          speech_list = []
-         for i in cosyvoice.inference_instruct2(tts_text, instruct_text, postprocess(prompt_wav), stream=stream, speed=speed):
-             speech_list.append(i['tts_speech'])
          return (target_sr, torch.concat(speech_list, dim=1).numpy().flatten())
-     else:
-         gr.Warning('无效的模式选择')


  def main():
      with gr.Blocks() as demo:
-         gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
-                     预训练模型 [Fun-CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512) \
-                     [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
-                     [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
-                     [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
-                     [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
-         gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
-
-         tts_text = gr.Textbox(label="输入合成文本", lines=1, value="Her handwriting is [M][AY0][N][UW1][T]并且很整洁,说明她[h][ào]干净。")
          with gr.Row():
-             mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
-             instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
-             stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
              with gr.Column(scale=0.25):
-                 seed_button = gr.Button(value="\U0001F3B2")
-                 seed = gr.Number(value=0, label="随机推理种子")

          with gr.Row():
-             prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz')
-             prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
-         prompt_text = gr.Textbox(label="prompt文本", lines=1, placeholder="请输入prompt文本,支持自动识别,您可以自行修正识别结果...", value='')
-         instruct_text = gr.Dropdown(choices=instruct_list, label='选择instruct文本', value=instruct_list[0])

-         generate_button = gr.Button("生成音频")

-         audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=False)

          seed_button.click(generate_seed, inputs=[], outputs=seed)
-         generate_button.click(generate_audio,
-                               inputs=[tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                                       seed, stream],
-                               outputs=[audio_output])
-         mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
-         prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
-         prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
          demo.queue(default_concurrency_limit=4).launch()


- if __name__ == '__main__':
-     cosyvoice = CosyVoiceAutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=False, fp16=False)
      sft_spk = cosyvoice.list_available_spks()
      for stream in [False]:
-         for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', 'zero_shot_prompt.wav', stream=stream)):
              continue
-     prompt_sr, target_sr = 16000, 24000
      default_data = np.zeros(target_sr)

      model_dir = "pretrained_models/SenseVoiceSmall"
      asr_model = AutoModel(
          model=model_dir,
          disable_update=True,
-         log_level='DEBUG',
-         device="cuda:0")
      main()
 
  hf_snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B')
  snapshot_download('iic/SenseVoiceSmall', local_dir='pretrained_models/SenseVoiceSmall')
  hf_snapshot_download('FunAudioLLM/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
+ os.system(
+     "cd pretrained_models/CosyVoice-ttsfrd/ && "
+     "pip install ttsfrd_dependency-0.1-py3-none-any.whl && "
+     "pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl && "
+     "apt install -y unzip && "
+     "rm -rf resource && "
+     "unzip resource.zip -d ."
+ )

  from cosyvoice.cli.cosyvoice import AutoModel as CosyVoiceAutoModel
  from cosyvoice.utils.file_utils import logging, load_wav
  from cosyvoice.utils.common import set_all_random_seed, instruct_list

+ # -----------------------------
+ # i18n (En: British spelling)
+ # -----------------------------
+ LANG_EN = "En"
+ LANG_ZH = "Zh"
+
+ MODE_ZERO_SHOT = "zero_shot"
+ MODE_INSTRUCT = "instruct"
+
+ UI_TEXT = {
+     LANG_EN: {
+         "lang_label": "Language",
+         "md_links": (
+             "### Repository [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \n"
+             "Pre-trained model [Fun-CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512) \n"
+             "[CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \n"
+             "[CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \n"
+             "[CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \n"
+             "[CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)"
+         ),
+         "md_hint": "#### Enter the text to synthesise, choose an inference mode, and follow the steps.",
+         "tts_label": "Text to synthesise",
+         "tts_default": "Her handwriting is very neat, which suggests she likes things tidy.",
+         "mode_label": "Inference mode",
+         "mode_zero_shot": "3s fast voice cloning",
+         "mode_instruct": "Natural language control",
+         "steps_label": "Steps",
+         "steps_zero_shot": (
+             "1. Choose a prompt audio file, or record prompt audio (≤ 30s). If both are provided, the uploaded file is used.\n"
+             "2. Enter the prompt text.\n"
+             "3. Click Generate audio."
+         ),
+         "steps_instruct": (
+             "1. Choose a prompt audio file, or record prompt audio (≤ 30s). If both are provided, the uploaded file is used.\n"
+             "2. Choose/enter the instruct text.\n"
+             "3. Click Generate audio."
+         ),
+         "stream_label": "Streaming inference",
+         "stream_no": "No",
+         "dice": "🎲",
+         "seed_label": "Random inference seed",
+         "upload_label": "Choose prompt audio file (sample rate ≥ 16 kHz)",
+         "record_label": "Record prompt audio",
+         "prompt_text_label": "Prompt text",
+         "prompt_text_ph": "Enter prompt text (auto recognition supported; you can edit the result)...",
+         "instruct_label": "Choose instruct text",
+         "generate_btn": "Generate audio",
+         "output_label": "Synthesised audio",
+         "warn_too_long": "Your input text is too long; please keep it within 200 characters.",
+         "warn_instruct_empty": "You are using Natural language control; please enter instruct text.",
+         "info_instruct_need_prompt": "You are using Natural language control; please provide prompt audio.",
+         "warn_prompt_missing": "Prompt audio is empty. Did you forget to provide prompt audio?",
+         "warn_prompt_sr_low": "Prompt audio sample rate {} is below {}.",
+         "warn_prompt_too_long_10s": "Please keep the prompt audio within 10 seconds to avoid poor inference quality.",
+         "warn_prompt_text_missing": "Prompt text is empty. Did you forget to enter prompt text?",
+         "info_instruct_ignored": "You are using 3s fast voice cloning; instruct text will be ignored.",
+         "warn_invalid_mode": "Invalid mode selection.",
+     },
+     LANG_ZH: {
+         "lang_label": "语言",
+         "md_links": (
+             "### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \n"
+             "预训练模型 [Fun-CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512) \n"
+             "[CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \n"
+             "[CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \n"
+             "[CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \n"
+             "[CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)"
+         ),
+         "md_hint": "#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作",
+         "tts_label": "输入合成文本",
+         "tts_default": "Her handwriting is [M][AY0][N][UW1][T]并且很整洁,说明她[h][ào]干净。",
+         "mode_label": "选择推理模式",
+         "mode_zero_shot": "3s极速复刻",
+         "mode_instruct": "自然语言控制",
+         "steps_label": "操作步骤",
+         "steps_zero_shot": (
+             "1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n"
+             "2. 输入prompt文本\n"
+             "3. 点击生成音频按钮"
+         ),
+         "steps_instruct": (
+             "1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n"
+             "2. 输入instruct文本\n"
+             "3. 点击生成音频按钮"
+         ),
+         "stream_label": "是否流式推理",
+         "stream_no": "否",
+         "dice": "🎲",
+         "seed_label": "随机推理种子",
+         "upload_label": "选择prompt音频文件,注意采样率不低于16khz",
+         "record_label": "录制prompt音频文件",
+         "prompt_text_label": "prompt文本",
+         "prompt_text_ph": "请输入prompt文本,支持自动识别,您可以自行修正识别结果...",
+         "instruct_label": "选择instruct文本",
+         "generate_btn": "生成音频",
+         "output_label": "合成音频",
+         "warn_too_long": "您输入的文字过长,请限制在200字以内",
+         "warn_instruct_empty": "您正在使用自然语言控制模式, 请输入instruct文本",
+         "info_instruct_need_prompt": "您正在使用自然语言控制模式, 请输入prompt音频",
+         "warn_prompt_missing": "prompt音频为空,您是否忘记输入prompt音频?",
+         "warn_prompt_sr_low": "prompt音频采样率{}低于{}",
+         "warn_prompt_too_long_10s": "请限制输入音频在10s内,避免推理效果过低",
+         "warn_prompt_text_missing": "prompt文本为空,您是否忘记输入prompt文本?",
+         "info_instruct_ignored": "您正在使用3s极速复刻模式,instruct文本会被忽略!",
+         "warn_invalid_mode": "无效的模式选择",
+     },
+ }
+
+
+ def t(lang: str, key: str) -> str:
+     lang = lang if lang in UI_TEXT else LANG_ZH
+     return UI_TEXT[lang][key]
+
+
+ def mode_choices(lang: str):
+     return [
+         (t(lang, "mode_zero_shot"), MODE_ZERO_SHOT),
+         (t(lang, "mode_instruct"), MODE_INSTRUCT),
+     ]

+
+ def steps_for(lang: str, mode_value: str) -> str:
+     if mode_value == MODE_INSTRUCT:
+         return t(lang, "steps_instruct")
+     return t(lang, "steps_zero_shot")
+
+
+ # -----------------------------
+ # Audio post-process
+ # -----------------------------
+ max_val = 0.8
  top_db = 60
  hop_length = 220
  win_length = 440
+
+
+ def generate_seed():
+     seed = random.randint(1, 100000000)
+     return {"__type__": "update", "value": seed}
+
+
  def postprocess(wav):
      speech = load_wav(wav, target_sr=target_sr, min_sr=16000)
      speech, _ = librosa.effects.trim(
+         speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
      )
      if speech.abs().max() > max_val:
          speech = speech / speech.abs().max() * max_val
      return wav


  def prompt_wav_recognition(prompt_wav):
+     res = asr_model.generate(
+         input=prompt_wav,
+         language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+         use_itn=True,
      )
+     text = res[0]["text"].split("|>")[-1]
      return text

+
  @spaces.GPU
+ def generate_audio(
+     tts_text,
+     mode_value,
+     prompt_text,
+     prompt_wav_upload,
+     prompt_wav_record,
+     instruct_text,
+     seed,
+     stream,
+     ui_lang,
+ ):
      stream = False
+
      if len(tts_text) > 200:
+         gr.Warning(t(ui_lang, "warn_too_long"))
          return (target_sr, default_data)
+
+     sft_dropdown, speed = "", 1.0
+
      if prompt_wav_upload is not None:
          prompt_wav = prompt_wav_upload
      elif prompt_wav_record is not None:
          prompt_wav = prompt_wav_record
      else:
          prompt_wav = None
+
+     # instruct mode requirements
+     if mode_value == MODE_INSTRUCT:
+         if instruct_text == "":
+             gr.Warning(t(ui_lang, "warn_instruct_empty"))
              return (target_sr, default_data)
          if prompt_wav is None:
+             gr.Info(t(ui_lang, "info_instruct_need_prompt"))
              return (target_sr, default_data)
+
+     # zero-shot requirements
+     if mode_value == MODE_ZERO_SHOT:
          if prompt_wav is None:
+             gr.Warning(t(ui_lang, "warn_prompt_missing"))
              return (target_sr, default_data)
+
          info = torchaudio.info(prompt_wav)
          if info.sample_rate < prompt_sr:
+             gr.Warning(t(ui_lang, "warn_prompt_sr_low").format(info.sample_rate, prompt_sr))
              return (target_sr, default_data)
+
          if info.num_frames / info.sample_rate > 10:
+             gr.Warning(t(ui_lang, "warn_prompt_too_long_10s"))
              return (target_sr, default_data)
+
+         if prompt_text == "":
+             gr.Warning(t(ui_lang, "warn_prompt_text_missing"))
              return (target_sr, default_data)
+
+         if instruct_text != "":
+             gr.Info(t(ui_lang, "info_instruct_ignored"))
+
+     if mode_value == MODE_ZERO_SHOT:
+         logging.info("get zero_shot inference request")
          set_all_random_seed(seed)
          speech_list = []
+         for i in cosyvoice.inference_zero_shot(
+             tts_text,
+             "You are a helpful assistant.<|endofprompt|>" + prompt_text,
+             postprocess(prompt_wav),
+             stream=stream,
+             speed=speed,
+         ):
+             speech_list.append(i["tts_speech"])
          return (target_sr, torch.concat(speech_list, dim=1).numpy().flatten())
+
+     if mode_value == MODE_INSTRUCT:
+         logging.info("get instruct inference request")
          set_all_random_seed(seed)
          speech_list = []
+         for i in cosyvoice.inference_instruct2(
+             tts_text,
+             instruct_text,
+             postprocess(prompt_wav),
+             stream=stream,
+             speed=speed,
+         ):
+             speech_list.append(i["tts_speech"])
          return (target_sr, torch.concat(speech_list, dim=1).numpy().flatten())
+
+     gr.Warning(t(ui_lang, "warn_invalid_mode"))
+     return (target_sr, default_data)
+
+
+ def on_mode_change(mode_value, ui_lang):
+     return steps_for(ui_lang, mode_value)
+
+
+ def on_language_change(ui_lang, current_mode_value):
+     lang = ui_lang if ui_lang in (LANG_EN, LANG_ZH) else LANG_ZH
+     return (
+         gr.update(value=UI_TEXT[lang]["md_links"]),  # md_links
+         gr.update(value=UI_TEXT[lang]["md_hint"]),  # md_hint
+         gr.update(label=t(lang, "lang_label")),  # lang_radio label
+         gr.update(choices=mode_choices(lang), label=t(lang, "mode_label")),  # mode radio
+         gr.update(value=steps_for(lang, current_mode_value), label=t(lang, "steps_label")),  # steps box
+         gr.update(
+             choices=[(t(lang, "stream_no"), False)],
+             label=t(lang, "stream_label"),
+             value=False,
+         ),  # stream radio
+         gr.update(value=t(lang, "dice")),  # seed button text
+         gr.update(label=t(lang, "seed_label")),  # seed label
+         gr.update(label=t(lang, "tts_label"), value=t(lang, "tts_default")),  # tts textbox
+         gr.update(label=t(lang, "upload_label")),  # upload label
+         gr.update(label=t(lang, "record_label")),  # record label
+         gr.update(label=t(lang, "prompt_text_label"), placeholder=t(lang, "prompt_text_ph")),  # prompt text
+         gr.update(label=t(lang, "instruct_label")),  # instruct dropdown
+         gr.update(value=t(lang, "generate_btn")),  # generate button
+         gr.update(label=t(lang, "output_label")),  # output label
+     )


  def main():
      with gr.Blocks() as demo:
+         md_links = gr.Markdown(UI_TEXT[LANG_ZH]["md_links"])
+         md_hint = gr.Markdown(UI_TEXT[LANG_ZH]["md_hint"])
+
+         lang_radio = gr.Radio(
+             choices=[LANG_EN, LANG_ZH],
+             value=LANG_ZH,
+             label=t(LANG_ZH, "lang_label"),
+         )
+
+         tts_text = gr.Textbox(
+             label=t(LANG_ZH, "tts_label"),
+             lines=1,
+             value=t(LANG_ZH, "tts_default"),
+         )
+
          with gr.Row():
+             mode_radio = gr.Radio(
+                 choices=mode_choices(LANG_ZH),
+                 label=t(LANG_ZH, "mode_label"),
+                 value=MODE_ZERO_SHOT,
+             )
+             steps_box = gr.Textbox(
+                 label=t(LANG_ZH, "steps_label"),
+                 value=steps_for(LANG_ZH, MODE_ZERO_SHOT),
+                 lines=4,
+                 interactive=False,
+                 scale=0.5,
+             )
+             stream = gr.Radio(
+                 choices=[(t(LANG_ZH, "stream_no"), False)],
+                 label=t(LANG_ZH, "stream_label"),
+                 value=False,
+             )
              with gr.Column(scale=0.25):
+                 seed_button = gr.Button(value=t(LANG_ZH, "dice"))
+                 seed = gr.Number(value=0, label=t(LANG_ZH, "seed_label"))

          with gr.Row():
+             prompt_wav_upload = gr.Audio(
+                 sources="upload",
+                 type="filepath",
+                 label=t(LANG_ZH, "upload_label"),
+             )
+             prompt_wav_record = gr.Audio(
+                 sources="microphone",
+                 type="filepath",
+                 label=t(LANG_ZH, "record_label"),
+             )

+         prompt_text = gr.Textbox(
+             label=t(LANG_ZH, "prompt_text_label"),
+             lines=1,
+             placeholder=t(LANG_ZH, "prompt_text_ph"),
+             value="",
+         )
+         instruct_text = gr.Dropdown(
+             choices=instruct_list,
+             label=t(LANG_ZH, "instruct_label"),
+             value=instruct_list[0],
+         )

+         generate_button = gr.Button(t(LANG_ZH, "generate_btn"))
+         audio_output = gr.Audio(
+             label=t(LANG_ZH, "output_label"),
+             autoplay=True,
+             streaming=False,
+         )

          seed_button.click(generate_seed, inputs=[], outputs=seed)
+
+         generate_button.click(
+             generate_audio,
+             inputs=[
+                 tts_text,
+                 mode_radio,
+                 prompt_text,
+                 prompt_wav_upload,
+                 prompt_wav_record,
+                 instruct_text,
+                 seed,
+                 stream,
+                 lang_radio,  # ui_lang
+             ],
+             outputs=[audio_output],
+         )
+
+         mode_radio.change(
+             fn=on_mode_change,
+             inputs=[mode_radio, lang_radio],
+             outputs=[steps_box],
+         )
+
+         prompt_wav_upload.change(
+             fn=prompt_wav_recognition,
+             inputs=[prompt_wav_upload],
+             outputs=[prompt_text],
+         )
+         prompt_wav_record.change(
+             fn=prompt_wav_recognition,
+             inputs=[prompt_wav_record],
+             outputs=[prompt_text],
+         )
+
+         lang_radio.change(
+             fn=on_language_change,
+             inputs=[lang_radio, mode_radio],
+             outputs=[
+                 md_links,
+                 md_hint,
+                 lang_radio,
+                 mode_radio,
+                 steps_box,
+                 stream,
+                 seed_button,
+                 seed,
+                 tts_text,
+                 prompt_wav_upload,
+                 prompt_wav_record,
+                 prompt_text,
+                 instruct_text,
+                 generate_button,
+                 audio_output,
+             ],
+         )
+
          demo.queue(default_concurrency_limit=4).launch()


+ if __name__ == "__main__":
+     cosyvoice = CosyVoiceAutoModel(
+         model_dir="pretrained_models/Fun-CosyVoice3-0.5B",
+         load_trt=False,
+         fp16=False,
+     )
      sft_spk = cosyvoice.list_available_spks()
+
      for stream in [False]:
+         for i, j in enumerate(
+             cosyvoice.inference_zero_shot(
+                 "收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。",
+                 "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
+                 "zero_shot_prompt.wav",
+                 stream=stream,
+             )
+         ):
              continue
+
+     prompt_sr = 16000
+     target_sr = 24000
      default_data = np.zeros(target_sr)

      model_dir = "pretrained_models/SenseVoiceSmall"
      asr_model = AutoModel(
          model=model_dir,
          disable_update=True,
+         log_level="DEBUG",
+         device="cuda:0",
+     )
+
      main()
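
For context on the prompt-audio checks that generate_audio performs before inference, here is a minimal sketch built on torchaudio header metadata only (the thresholds mirror the ones in the diff; the helper name and constants are illustrative, not part of the commit):

# Minimal sketch (assumed names): validate a prompt clip the way generate_audio does.
import torchaudio

PROMPT_SR = 16000        # minimum accepted sample rate, as in the diff
MAX_PROMPT_SECONDS = 10  # clips longer than this are rejected

def prompt_audio_ok(path: str) -> bool:
    info = torchaudio.info(path)  # reads the header only, no full decode
    if info.sample_rate < PROMPT_SR:
        return False
    duration = info.num_frames / info.sample_rate
    return duration <= MAX_PROMPT_SECONDS

Checking the header first keeps the GPU-decorated function from spending time on prompts that would be rejected anyway.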