from pydub import AudioSegment
import random
import string
import os

def chunk_text(text, max_chars=300):
    """Split *text* into chunks of whole sentences of at most *max_chars* characters.

    Sentences are delimited by ``". "``. The period consumed by the split is
    restored on every sentence except the last, which is kept as written, so
    no punctuation is duplicated or invented.

    Args:
        text: The text to split.
        max_chars: Maximum length of a chunk. A single sentence longer than
            this is not split further and becomes an oversized chunk of its
            own.

    Returns:
        A list of non-empty chunk strings in their original order; the list
        is empty when *text* contains no sentence content.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1
    chunks = []
    current = ""
    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            # Blank fragments (empty input, trailing ". ") carry no content.
            continue
        if index < last_index:
            # split(". ") removed this sentence's period; put it back. The
            # last piece never lost one, so it is left untouched.
            sentence += "."
        candidate = current + " " + sentence if current else sentence
        if current and len(candidate) > max_chars:
            # Adding the sentence would overflow: close the current chunk.
            chunks.append(current)
            current = sentence
        else:
            # Either it fits, or the chunk is still empty; an oversized
            # sentence is accepted alone rather than emitting an empty chunk.
            current = candidate
    if current:
        chunks.append(current)
    return chunks

def merge_audios(audio_paths):
    """Concatenate the audio files in *audio_paths* and export the result as MP3.

    Paths for which ``os.path.exists`` is false are skipped; the remaining
    files are appended in the order given.

    Args:
        audio_paths: Iterable of audio file paths.

    Returns:
        The path of the exported file, ``output_<5 random lowercase
        letters>.mp3``, relative to the current working directory.
    """
    merged = AudioSegment.empty()
    for candidate in audio_paths:
        if not os.path.exists(candidate):
            # Missing inputs are skipped rather than treated as an error.
            continue
        merged += AudioSegment.from_file(candidate)

    suffix = ''.join(random.choices(string.ascii_lowercase, k=5))
    destination = f"output_{suffix}.mp3"
    merged.export(destination, format="mp3")
    return destination

def generate_tts_xtts(text, speaker_wav, speed=1.0, language="en"):
    """Synthesize *text* with XTTS using the voice sample *speaker_wav*.

    The model writes a WAV file, which is converted to MP3 and then deleted.

    Args:
        text: Text to synthesize.
        speaker_wav: Voice sample passed to the inferencer's ``tts`` call.
        speed: Currently not applied to the generated audio.
            NOTE(review): accepted but unused; confirm whether the inferencer
            supports a speed option before wiring it through.
        language: Language code passed to the inferencer. Defaults to
            ``"en"``, the value that was previously hard-coded.

    Returns:
        The path of the generated file, ``tts_<6 random lowercase
        letters>.mp3``, relative to the current working directory.
    """
    # Function-scope import: the TTS dependency is only loaded when this
    # function is actually called.
    from inference import Inferencer

    # Initialize XTTS with the pretrained model.
    inf = Inferencer(
        model_name_or_path="coqui/XTTS-v2",
        device="cpu",  # Change to "cuda" when running on a GPU
    )

    base_name = "tts_" + ''.join(random.choices(string.ascii_lowercase, k=6))
    wav_path = base_name + ".wav"
    mp3_path = base_name + ".mp3"

    try:
        # Generate audio from the text and the voice sample.
        inf.tts(
            text,
            speaker_wav=speaker_wav,
            language=language,
            output_path=wav_path,
        )
        sound = AudioSegment.from_wav(wav_path)
        sound.export(mp3_path, format="mp3")
    finally:
        # The WAV is only an intermediate; remove it even on failure so
        # repeated calls do not accumulate files on disk.
        try:
            os.remove(wav_path)
        except FileNotFoundError:
            pass
    return mp3_path