update meanaudio_l_full
Browse files
- app.py +2 -21
- meanaudio/eval_utils.py +7 -1
- meanaudio/model/networks.py +12 -1
app.py
CHANGED
|
@@ -127,7 +127,7 @@ def generate_audio_gradio(
|
|
| 127 |
|
| 128 |
net.update_seq_lengths(seq_cfg.latent_seq_len)
|
| 129 |
|
| 130 |
-
if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full':
|
| 131 |
use_meanflow=True
|
| 132 |
elif variant == 'fluxaudio_s_full':
|
| 133 |
use_meanflow=False
|
|
@@ -184,32 +184,13 @@ def generate_audio_gradio(
|
|
| 184 |
|
| 185 |
# Gradio input and output components
|
| 186 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
|
|
|
| 187 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
| 188 |
denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
|
| 189 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
|
| 190 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
| 191 |
seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
|
| 192 |
-
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
| 193 |
-
|
| 194 |
|
| 195 |
-
# description_text = """
|
| 196 |
-
# **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
|
| 197 |
-
|
| 198 |
-
# <p align="center">
|
| 199 |
-
# <a href="https://huggingface.co/AndreasXi/MeanAudio">
|
| 200 |
-
# <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model-HuggingFace-violet" alt="HuggingFace Model">
|
| 201 |
-
# </a>
|
| 202 |
-
# <a href="https://huggingface.co/spaces/chenxie95/MeanAudio">
|
| 203 |
-
# <img src="https://img.shields.io/badge/%F0%9F%9A%80%20Space-HuggingFace-8A2BE2" alt="HuggingFace Space">
|
| 204 |
-
# </a>
|
| 205 |
-
# <a href="https://meanaudio.github.io/">
|
| 206 |
-
# <img src="https://img.shields.io/badge/%F0%9F%93%84%20Project-Page-brightred" alt="Project Page">
|
| 207 |
-
# </a>
|
| 208 |
-
# <a href="https://github.com/xiquan-li/MeanAudio">
|
| 209 |
-
# <img src="https://img.shields.io/badge/%F0%9F%92%BB%20Code-GitHub-black" alt="GitHub">
|
| 210 |
-
# </a>
|
| 211 |
-
# </p>
|
| 212 |
-
# """
|
| 213 |
|
| 214 |
description_text = """
|
| 215 |
### **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
|
|
|
|
| 127 |
|
| 128 |
net.update_seq_lengths(seq_cfg.latent_seq_len)
|
| 129 |
|
| 130 |
+
if variant == 'meanaudio_s_ac' or variant == 'meanaudio_s_full' or variant == 'meanaudio_l_full':
|
| 131 |
use_meanflow=True
|
| 132 |
elif variant == 'fluxaudio_s_full':
|
| 133 |
use_meanflow=False
|
|
|
|
| 184 |
|
| 185 |
# Gradio input and output components
|
| 186 |
input_text = gr.Textbox(lines=2, label="Prompt")
|
| 187 |
+
variant = gr.Dropdown(label="Model Variant", choices=list(all_model_cfg.keys()), value='meanaudio_s_full', interactive=True)
|
| 188 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
| 189 |
denoising_steps = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="Sampling Steps", interactive=True)
|
| 190 |
cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.5, step=0.5, label="Guidance Scale", interactive=True)
|
| 191 |
duration = gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration", interactive=True)
|
| 192 |
seed = gr.Slider(minimum=1, maximum=100, value=42, step=1, label="Seed", interactive=True)
|
|
|
|
|
|
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
description_text = """
|
| 196 |
### **MeanAudio** is a novel text-to-audio generator that uses **MeanFlow** to synthesize realistic and faithful audio in few sampling steps. It achieves state-of-the-art performance in single-step audio generation and delivers strong performance in multi-step audio generation.
|
meanaudio/eval_utils.py
CHANGED
|
@@ -58,11 +58,17 @@ meanaudio_s_ac = ModelConfig(model_name='meanaudio_s_ac',
|
|
| 58 |
vae_path=Path('./weights/v1-16.pth'),
|
| 59 |
bigvgan_16k_path=Path('./weights/best_netG.pt'),
|
| 60 |
mode='16k')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
all_model_cfg: dict[str, ModelConfig] = {
|
| 63 |
-
'
|
| 64 |
'meanaudio_s_full': meanaudio_s_full,
|
| 65 |
'meanaudio_s_ac': meanaudio_s_ac,
|
|
|
|
| 66 |
}
|
| 67 |
|
| 68 |
|
|
|
|
| 58 |
vae_path=Path('./weights/v1-16.pth'),
|
| 59 |
bigvgan_16k_path=Path('./weights/best_netG.pt'),
|
| 60 |
mode='16k')
|
| 61 |
+
meanaudio_l_full = ModelConfig(model_name='meanaudio_l_full',
|
| 62 |
+
model_path=Path('./weights/meanaudio_l_full.pth'), # will be specified later
|
| 63 |
+
vae_path=Path('./weights/v1-16.pth'),
|
| 64 |
+
bigvgan_16k_path=Path('./weights/best_netG.pt'),
|
| 65 |
+
mode='16k')
|
| 66 |
|
| 67 |
all_model_cfg: dict[str, ModelConfig] = {
|
| 68 |
+
'meanaudio_l_full': meanaudio_l_full,
|
| 69 |
'meanaudio_s_full': meanaudio_s_full,
|
| 70 |
'meanaudio_s_ac': meanaudio_s_ac,
|
| 71 |
+
'fluxaudio_s_full': fluxaudio_s_full,
|
| 72 |
}
|
| 73 |
|
| 74 |
|
meanaudio/model/networks.py
CHANGED
|
@@ -597,11 +597,22 @@ def meanaudio_s(**kwargs) -> MeanAudio:
|
|
| 597 |
num_heads=num_heads,
|
| 598 |
latent_seq_len=312, # for 10s audio
|
| 599 |
**kwargs)
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
def get_mean_audio(name: str, **kwargs) -> MeanAudio:
|
| 603 |
if name == 'meanaudio_s_ac' or name == 'meanaudio_s_full':
|
| 604 |
return meanaudio_s(**kwargs)
|
|
|
|
|
|
|
| 605 |
elif name == 'fluxaudio_s_full':
|
| 606 |
return fluxaudio_s(**kwargs)
|
| 607 |
else:
|
|
|
|
| 597 |
num_heads=num_heads,
|
| 598 |
latent_seq_len=312, # for 10s audio
|
| 599 |
**kwargs)
|
| 600 |
+
def meanaudio_l(**kwargs) -> MeanAudio:
|
| 601 |
+
num_heads = 14
|
| 602 |
+
return MeanAudio(latent_dim=20,
|
| 603 |
+
text_dim=1024,
|
| 604 |
+
hidden_dim=64 * num_heads,
|
| 605 |
+
depth=24,
|
| 606 |
+
fused_depth=16,
|
| 607 |
+
num_heads=num_heads,
|
| 608 |
+
latent_seq_len=312, # for 10s audio
|
| 609 |
+
**kwargs)
|
| 610 |
|
| 611 |
def get_mean_audio(name: str, **kwargs) -> MeanAudio:
|
| 612 |
if name == 'meanaudio_s_ac' or name == 'meanaudio_s_full':
|
| 613 |
return meanaudio_s(**kwargs)
|
| 614 |
+
elif name == 'meanaudio_l_full':
|
| 615 |
+
return meanaudio_l(**kwargs)
|
| 616 |
elif name == 'fluxaudio_s_full':
|
| 617 |
return fluxaudio_s(**kwargs)
|
| 618 |
else:
|