"""Helpers for chunking text and producing/merging text-to-speech audio."""

import contextlib
import functools
import logging
import os
import random
import string

try:
    from pydub import AudioSegment
except ImportError:
    # pydub is only needed by the audio helpers; keep the pure-text helper
    # (chunk_text) importable without the audio stack installed.
    AudioSegment = None

logger = logging.getLogger(__name__)

# A sentence that already ends with one of these does not get a "." appended.
_SENTENCE_ENDINGS = (".", "!", "?")


def _require_pydub():
    """Raise a clear ImportError if pydub could not be imported."""
    if AudioSegment is None:
        raise ImportError("pydub is required for audio processing")


def _unique_stem(prefix, length, suffixes):
    """Return prefix + random lowercase letters such that no stem+suffix file exists."""
    while True:
        stem = prefix + "".join(random.choices(string.ascii_lowercase, k=length))
        if not any(os.path.exists(stem + suffix) for suffix in suffixes):
            return stem


@functools.lru_cache(maxsize=2)
def _get_inferencer(device):
    """Build the XTTS inferencer once per device and reuse it across calls."""
    # Imported lazily so the module can be loaded without the TTS stack.
    from inference import Inferencer

    return Inferencer(model_name_or_path="coqui/XTTS-v2", device=device)


def chunk_text(text, max_chars=300):
    """Split *text* into chunks of whole sentences no longer than *max_chars*.

    Sentences are delimited by ". ". Each sentence is stripped and, unless it
    already ends with ".", "!" or "?", terminated with a ".". Sentences are
    then joined with single spaces into chunks of at most *max_chars*
    characters.

    A single sentence longer than *max_chars* is not split; it is returned as
    its own (oversized) chunk. Empty or whitespace-only input yields ``[]``.

    Args:
        text: The text to split.
        max_chars: Maximum length of a chunk built from several sentences.

    Returns:
        A list of non-empty chunk strings, in original order.
    """
    chunks = []
    current = ""
    for raw_sentence in text.split(". "):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        # The split consumed the ". " separator; restore the terminator
        # without doubling punctuation that is already there.
        if not sentence.endswith(_SENTENCE_ENDINGS):
            sentence += "."

        candidate = current + " " + sentence if current else sentence
        if len(candidate) <= max_chars:
            current = candidate
            continue

        # Flush only a non-empty chunk, so an oversized first sentence does
        # not produce an empty leading entry.
        if current:
            chunks.append(current)
        current = sentence

    if current:
        chunks.append(current)
    return chunks


def merge_audios(audio_paths):
    """Concatenate the given audio files, in order, into one new MP3 file.

    Paths that do not exist are skipped (best effort) and logged as warnings.

    Args:
        audio_paths: Iterable of paths to audio files readable by pydub.

    Returns:
        The path of the exported MP3 ("output_<5 random letters>.mp3" in the
        current working directory).

    Raises:
        ImportError: If pydub is not installed.
    """
    _require_pydub()
    combined = AudioSegment.empty()
    for path in audio_paths:
        if not os.path.exists(path):
            logger.warning("Skipping missing audio file: %s", path)
            continue
        combined += AudioSegment.from_file(path)

    out_path = _unique_stem("output_", 5, (".mp3",)) + ".mp3"
    combined.export(out_path, format="mp3")
    return out_path


def generate_tts_xtts(text, speaker_wav, speed=1.0, *, language="en", device="cpu"):
    """Synthesize *text* with XTTS using *speaker_wav* as the voice sample.

    The audio is rendered to a temporary WAV file, converted to MP3, and the
    intermediate WAV is removed.

    Args:
        text: Text to synthesize.
        speaker_wav: Path to the reference speaker recording.
        speed: NOTE(review): accepted for interface compatibility but not
            applied anywhere; confirm whether the inferencer supports a
            speed option before wiring it through.
        language: Language code passed to the inferencer.
        device: Device for the model; use "cuda" when running on a GPU.

    Returns:
        The path of the generated MP3 ("tts_<6 random letters>.mp3" in the
        current working directory).

    Raises:
        ImportError: If pydub is not installed.
    """
    _require_pydub()
    # Initialize XTTS with the pretrained model (cached per device).
    inferencer = _get_inferencer(device)

    stem = _unique_stem("tts_", 6, (".wav", ".mp3"))
    wav_path = stem + ".wav"
    mp3_path = stem + ".mp3"
    try:
        # Generate audio from the text and the voice sample.
        inferencer.tts(
            text,
            speaker_wav=speaker_wav,
            language=language,
            output_path=wav_path,
        )
        AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")
    finally:
        # The WAV is only an intermediate artifact; do not leave it behind.
        with contextlib.suppress(OSError):
            os.remove(wav_path)
    return mp3_path