FFomy committed (verified)
Commit b54b744 · 1 Parent(s): 2a9b280

Update Fun-ASR/model.py

Files changed (1):
  1. Fun-ASR/model.py (+109 -42)
Fun-ASR/model.py CHANGED
@@ -15,6 +15,7 @@ from funasr.register import tables
 from funasr.train_utils.device_funcs import force_gatherable, to_device
 from funasr.utils.datadir_writer import DatadirWriter
 from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video
+from transformers import AutoConfig, AutoModelForCausalLM
 
 dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
 
@@ -37,13 +38,21 @@ class FunASRNano(nn.Module):
 
         # audio encoder
         hub = audio_encoder_conf.get("hub", None)
-        self.audio_encoder_activation_checkpoint = audio_encoder_conf.get("activation_checkpoint", False)
+        self.audio_encoder_activation_checkpoint = audio_encoder_conf.get(
+            "activation_checkpoint", False
+        )
         if hub == "ms":
             model = AutoModel(model=audio_encoder, model_revision="master")
             audio_encoder_output_size = (
-                model.model.encoder_output_size if hasattr(model.model, "encoder_output_size") else -1
+                model.model.encoder_output_size
+                if hasattr(model.model, "encoder_output_size")
+                else -1
+            )
+            audio_encoder = (
+                model.model.model.encoder
+                if hasattr(model.model, "model")
+                else model.model.encoder
             )
-            audio_encoder = model.model.model.encoder if hasattr(model.model, "model") else model.model.encoder
         else:
             encoder_class = tables.encoder_classes.get(audio_encoder)
             audio_encoder = encoder_class(input_size=input_size, **audio_encoder_conf)
@@ -61,16 +70,9 @@ class FunASRNano(nn.Module):
         init_param_path = llm_conf.get("init_param_path", None)
         llm_dim = None
 
-        from transformers import AutoModelForCausalLM
-
         llm_load_kwargs = llm_conf.get("load_kwargs", {})
-        model = AutoModelForCausalLM.from_pretrained(
-            init_param_path,
-            load_in_8bit=None,
-            device_map=None,
-            use_cache=None,
-            **llm_load_kwargs,
-        )
+        config = AutoConfig.from_pretrained(init_param_path)
+        model = AutoModelForCausalLM.from_config(config, **llm_load_kwargs)
 
         freeze = llm_conf.get("freeze", True)
         if freeze:
@@ -110,13 +112,10 @@ class FunASRNano(nn.Module):
         adaptor_class = tables.adaptor_classes.get(audio_adaptor)
         if audio_encoder_output_size > 0:
             audio_adaptor_conf["encoder_dim"] = audio_encoder_output_size
-        audio_adaptor_conf["llm_dim"] = llm_dim if llm_dim is not None else audio_adaptor_conf["llm_dim"]
+        audio_adaptor_conf["llm_dim"] = (
+            llm_dim if llm_dim is not None else audio_adaptor_conf["llm_dim"]
+        )
         audio_adaptor = adaptor_class(**audio_adaptor_conf)
-        init_param_path = audio_adaptor_conf.get("init_param_path", None)
-        if init_param_path is not None:
-            src_state = torch.load(init_param_path, map_location="cpu")
-            flag = audio_adaptor.load_state_dict(src_state, strict=False)
-            logging.info(f"Loading audio_adaptor ckpt: {init_param_path}, status: {flag}")
         freeze = audio_adaptor_conf.get("freeze", False)
         if freeze:
             for name, param in audio_adaptor.named_parameters():
@@ -153,12 +152,16 @@ class FunASRNano(nn.Module):
         if self.audio_encoder_activation_checkpoint:
             from torch.utils.checkpoint import checkpoint
 
-            encoder_out, encoder_out_lens = checkpoint(self.encode, speech, speech_lengths, use_reentrant=False)
+            encoder_out, encoder_out_lens = checkpoint(
+                self.encode, speech, speech_lengths, use_reentrant=False
+            )
         else:
             encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
 
         # audio_adaptor
-        encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
+        encoder_out, encoder_out_lens = self.audio_adaptor(
+            encoder_out, encoder_out_lens
+        )
 
         batch_size, token_num, dims = inputs_embeds.shape
         fake_token_len = kwargs.get("fake_token_len")
@@ -197,7 +200,9 @@ class FunASRNano(nn.Module):
         stats["batch_size_speech"] = batch_size_speech
         stats["batch_size_x_frames"] = frames * batch_size_speech
         stats["batch_size_real_frames"] = speech_lengths.sum().item()
-        stats["padding_frames"] = stats["batch_size_x_frames"] - stats["batch_size_real_frames"]
+        stats["padding_frames"] = (
+            stats["batch_size_x_frames"] - stats["batch_size_real_frames"]
+        )
 
         with torch.cuda.amp.autocast(
             enabled=True if self.llm_dtype != "fp32" else False,
@@ -214,7 +219,9 @@ class FunASRNano(nn.Module):
 
         with torch.no_grad():
             preds = torch.argmax(model_outputs.logits, -1)
-            acc_att = compute_accuracy(preds[:, :-1], labels_ids[:, 1:], ignore_label=-100)
+            acc_att = compute_accuracy(
+                preds[:, :-1], labels_ids[:, 1:], ignore_label=-100
+            )
             stats["acc"] = acc_att
 
         stats["loss"] = torch.clone(loss.detach())
@@ -222,7 +229,9 @@ class FunASRNano(nn.Module):
 
         stats["batch_size_x_tokens"] = token_num * batch_size
         stats["batch_size_real_tokens"] = attention_mask.sum().item()
-        stats["padding_tokens"] = stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
+        stats["padding_tokens"] = (
+            stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
+        )
 
         dialog_turns = (fbank_beg > 0).sum(-1)
         dialog_turns_max = torch.max(dialog_turns).int().item()
@@ -244,7 +253,9 @@ class FunASRNano(nn.Module):
     def encode(self, speech, speech_lengths):
         # audio encoder
         if self.feat_permute:
-            encoder_out, encoder_out_lens = self.audio_encoder(speech.permute(0, 2, 1), speech_lengths)
+            encoder_out, encoder_out_lens = self.audio_encoder(
+                speech.permute(0, 2, 1), speech_lengths
+            )
         else:
             encoder_out, encoder_out_lens = self.audio_encoder(speech, speech_lengths)
 
@@ -275,7 +286,9 @@ class FunASRNano(nn.Module):
 
         return contents
 
-    def data_load_speech(self, contents: dict, tokenizer, frontend, meta_data={}, **kwargs):
+    def data_load_speech(
+        self, contents: dict, tokenizer, frontend, meta_data={}, **kwargs
+    ):
         system = contents["system"]
         user = contents["user"]
         assistant = contents["assistant"]
@@ -296,7 +309,9 @@ class FunASRNano(nn.Module):
             [],
         )
         input_source_ids = []
-        for i, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
+        for i, (system_prompt, user_prompt, target_out) in enumerate(
+            zip(system, user, assistant)
+        ):
             if i >= kwargs.get("multiturn_num_max", 5):
                 break
             if len(input_ids) > kwargs.get("max_token_length", 1500):
@@ -332,18 +347,24 @@ class FunASRNano(nn.Module):
                     source_ids += sub_token
                     fbank_mask_i += [0] * len(sub_token)
                 else:
-                    sub_str = sub_str.replace("<|startofspeech|>", "").replace("<|endofspeech|>", "")
+                    sub_str = sub_str.replace("<|startofspeech|>", "").replace(
+                        "<|endofspeech|>", ""
+                    )
                     if sub_str.startswith("!"):
                         sub_str = sub_str[1:]
                     if sub_str.startswith("!"):  # !!: audio sample point
                         sub_str = audio
                     try:
                         time1 = time.perf_counter()
-                        data_src = load_audio_text_image_video(sub_str, fs=frontend.fs, **kwargs)
+                        data_src = load_audio_text_image_video(
+                            sub_str, fs=frontend.fs, **kwargs
+                        )
                         time2 = time.perf_counter()
                         meta_data["load_data"] = f"{time2 - time1:0.3f}"
                     except Exception as e:
-                        logging.error(f"Loading wav failed! {str(e)}, {traceback.format_exc()}")
+                        logging.error(
+                            f"Loading wav failed! {str(e)}, {traceback.format_exc()}"
+                        )
 
                     speech, speech_lengths = extract_fbank(
                         data_src,
@@ -355,7 +376,10 @@ class FunASRNano(nn.Module):
                     time3 = time.perf_counter()
                     meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
                     meta_data["batch_data_time"] = (
-                        speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
+                        speech_lengths.sum().item()
+                        * frontend.frame_shift
+                        * frontend.lfr_n
+                        / 1000
                     )
 
                     if self.feat_permute:
@@ -382,7 +406,9 @@ class FunASRNano(nn.Module):
                     fbank.append(speech[0, :, :])
                     fbank_lens.append(speech_lengths)
 
-        input_ids = torch.tensor(input_ids, dtype=torch.int64)  # [: self.max_token_length]
+        input_ids = torch.tensor(
+            input_ids, dtype=torch.int64
+        )  # [: self.max_token_length]
         attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
         labels = torch.tensor(labels, dtype=torch.int64)  # [: self.max_token_length]
 
@@ -393,8 +419,12 @@ class FunASRNano(nn.Module):
         target_ids = torch.tensor(target_ids, dtype=torch.int64)
 
         if len(fbank) > 0:
-            speech = torch.nn.utils.rnn.pad_sequence(fbank, batch_first=True, padding_value=0.0)
-            speech_lengths = torch.nn.utils.rnn.pad_sequence(fbank_lens, batch_first=True, padding_value=-1)
+            speech = torch.nn.utils.rnn.pad_sequence(
+                fbank, batch_first=True, padding_value=0.0
+            )
+            speech_lengths = torch.nn.utils.rnn.pad_sequence(
+                fbank_lens, batch_first=True, padding_value=-1
+            )
         else:
             speech = []
             speech_lengths = []
@@ -428,7 +458,9 @@ class FunASRNano(nn.Module):
             raise NotImplementedError("batch decoding is not implemented")
 
         contents = self.data_template(data_in[0])
-        output = self.data_load_speech(contents, tokenizer, frontend, meta_data=meta_data, **kwargs)
+        output = self.data_load_speech(
+            contents, tokenizer, frontend, meta_data=meta_data, **kwargs
+        )
         batch = to_device(output, kwargs["device"])
 
         # audio encoder
@@ -449,7 +481,9 @@ class FunASRNano(nn.Module):
         encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
 
         # audio_adaptor
-        encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
+        encoder_out, encoder_out_lens = self.audio_adaptor(
+            encoder_out, encoder_out_lens
+        )
         meta_data["audio_adaptor_out"] = encoder_out
         meta_data["audio_adaptor_out_lens"] = encoder_out_lens
 
@@ -509,13 +543,36 @@ class FunASRNano(nn.Module):
         frontend=None,
         **kwargs,
     ):
+        hotwords = kwargs.get("hotwords", [])
+        if len(hotwords) > 0:
+            hotwords = ", ".join(hotwords)
+            prompt = f"请结合上下文信息,更加准确地完成语音转写任务。如果没有相关信息,我们会留空。\n\n\n**上下文信息:**\n\n\n"
+            prompt += f"热词列表:[{hotwords}]\n"
+        else:
+            prompt = ""
+        language = kwargs.get("language", "auto")
+        if language not in ("auto", "zh", "en", "ja"):
+            language = "auto"
+        if language == "auto":
+            prompt += "语音转写"
+        else:
+            LANGUAGE_MAP = {"zh": "中文", "en": "英文", "ja": "日文"}
+            prompt += f"语音转写成{LANGUAGE_MAP[language]}"
+        itn = kwargs.get("itn", True)
+        if not itn:
+            prompt += ",不进行文本规整"
+        prompt += ":"
+
         new_data_in = []
         for data in data_in:
             if isinstance(data, str):
                 new_data_in.append(
                     [
                         {"role": "system", "content": "You are a helpful assistant."},
-                        {"role": "user", "content": f"语音转写:<|startofspeech|>!{data}<|endofspeech|>"},
+                        {
+                            "role": "user",
+                            "content": f"{prompt}<|startofspeech|>!{data}<|endofspeech|>",
+                        },
                         {"role": "assistant", "content": "null"},
                     ]
                 )
@@ -523,7 +580,11 @@ class FunASRNano(nn.Module):
                 new_data_in.append(
                     [
                         {"role": "system", "content": "You are a helpful assistant."},
-                        {"role": "user", "content": f"语音转写:<|startofspeech|>!!<|endofspeech|>", "audio": data},
+                        {
+                            "role": "user",
+                            "content": f"{prompt}<|startofspeech|>!!<|endofspeech|>",
+                            "audio": data,
+                        },
                         {"role": "assistant", "content": "null"},
                     ]
                 )
@@ -533,7 +594,9 @@ class FunASRNano(nn.Module):
         key = []
         for _ in data_in:
             chars = string.ascii_letters + string.digits
-            key.append("rand_key_" + "".join(random.choice(chars) for _ in range(13)))
+            key.append(
+                "rand_key_" + "".join(random.choice(chars) for _ in range(13))
+            )
 
         return self.inference_llm(
             data_in,
@@ -561,7 +624,9 @@ class FunASRNano(nn.Module):
        llm_dtype = "fp16" if kwargs.get("fp16", False) else llm_dtype
        llm_dtype = "bf16" if kwargs.get("bf16", False) else llm_dtype
 
-        with torch.cuda.amp.autocast(enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]):
+        with torch.cuda.amp.autocast(
+            enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]
+        ):
             label = contents["assistant"][-1]
             self.llm = self.llm.to(dtype_map[llm_dtype])
             inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
@@ -608,7 +673,7 @@ class FunASRNano(nn.Module):
             response_clean = re.sub(r"[^\w\s\u3000\u4e00-\u9fff]+", "", response)
             result_i = {
                 "key": key[0],
-                "text": response,
+                "text": re.sub(r'\s+', ' ', response.replace("/sil", " ")),
                 "text_tn": response_clean,
                 "label": label,
             }
@@ -627,6 +692,8 @@ class FunASRNano(nn.Module):
     def from_pretrained(model: str = None, **kwargs):
        from funasr import AutoModel
 
-        model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)
+        model, kwargs = AutoModel.build_model(
+            model=model, trust_remote_code=True, **kwargs
+        )
 
-        return model, kwargs
+        return model, kwargs
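Note on the LLM construction change in the `@@ -61,16 +70,9 @@` hunk above: the decoder is now instantiated from its Hugging Face config (`AutoConfig.from_pretrained` followed by `AutoModelForCausalLM.from_config`) instead of `AutoModelForCausalLM.from_pretrained`, so no pretrained LLM weights are downloaded or loaded at construction time; presumably the parameters are filled in later from the Fun-ASR checkpoint that FunASR loads. A minimal sketch of the difference, assuming `init_param_path` points at a local Hugging Face model directory (the path below is a placeholder):

from transformers import AutoConfig, AutoModelForCausalLM

init_param_path = "/path/to/local/llm"  # placeholder for llm_conf["init_param_path"]

# Previous behaviour: instantiate the architecture and load its pretrained weights.
llm_old = AutoModelForCausalLM.from_pretrained(init_param_path)

# New behaviour in this commit: build the architecture from config only; the
# parameters start randomly initialized and are expected to be overwritten
# when the surrounding checkpoint is loaded.
config = AutoConfig.from_pretrained(init_param_path)
llm_new = AutoModelForCausalLM.from_config(config)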
 
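The rewritten `inference` entry point builds the user prompt from three new optional kwargs: `hotwords` (a list of strings joined into the hotword-list (热词列表) block of the prompt), `language` (one of "auto", "zh", "en", "ja"; anything else falls back to "auto"), and `itn` (default True; False requests output without inverse text normalization). The returned "text" field now also replaces `/sil` markers with spaces and collapses repeated whitespace. A minimal usage sketch, assuming these kwargs are forwarded to `FunASRNano.inference` through FunASR's usual `AutoModel.generate` pipeline; the model directory and audio path are placeholders:

from funasr import AutoModel

# Placeholder model directory / audio path; point these at your own checkpoint and wav.
model = AutoModel(model="./Fun-ASR", trust_remote_code=True)

res = model.generate(
    input="example.wav",
    hotwords=["FunASR", "Paraformer"],  # injected into the 热词列表 prompt block
    language="zh",                      # "auto", "zh", "en", or "ja"
    itn=True,                           # set False to disable inverse text normalization
)
print(res[0]["text"])     # whitespace-normalized transcript
print(res[0]["text_tn"])  # variant with punctuation stripped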