informatica:inteligencia_artificial:tts
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| informatica:inteligencia_artificial:tts [2026/06/26 10:41] – [Entrenar COQUI TTS con XTTS] jose | informatica:inteligencia_artificial:tts [2026/06/26 18:55] (current) – jose | ||
|---|---|---|---|
| Line 326: | Line 326: | ||
| </ | </ | ||
| - | ====== Entrenar COQUI TTS con XTTS ====== | + | ====== Entrenar COQUI TTS con VITS ====== |
| Necesitamos muchísimas horas de audio y preparar un dataset | Necesitamos muchísimas horas de audio y preparar un dataset | ||
| Line 350: | Line 350: | ||
| < | < | ||
| import os | import os | ||
| + | from unicodedata import normalize | ||
| + | |||
| from trainer import Trainer, TrainerArgs | from trainer import Trainer, TrainerArgs | ||
| + | |||
| + | from TTS.config import BaseAudioConfig | ||
| from TTS.tts.configs.shared_configs import BaseDatasetConfig | from TTS.tts.configs.shared_configs import BaseDatasetConfig | ||
| from TTS.tts.configs.vits_config import VitsConfig | from TTS.tts.configs.vits_config import VitsConfig | ||
| - | from TTS.tts.datasets import load_tts_samples | + | from TTS.tts.models.vits import |
| - | from TTS.tts.models.vits import Vits | + | |
| # 1. Rutas de carpetas | # 1. Rutas de carpetas | ||
| - | PATH_DATASET = "/ | + | PATH_DATASET = "/ |
| - | PATH_SALIDA = "/ | + | PATH_SALIDA = "/ |
| + | SPANISH_PUNCTUATIONS = " | ||
| - | # 2. Configurar | + | # Incluimos |
| - | dataset_config | + | BASE_SPANISH_CHARACTERS |
| - | | + | |
| - | | + | |
| - | path=PATH_DATASET | + | def build_characters_config(texts: list[str]) -> CharactersConfig: |
| - | ) | + | |
| + | | ||
| + | { | ||
| + | char | ||
| + | for char in normalized_text | ||
| + | if not char.isspace() and char not in SPANISH_PUNCTUATIONS | ||
| + | } | ||
| + | ) | ||
| + | characters = "" | ||
| + | |||
| + | return CharactersConfig( | ||
| + | characters_class=" | ||
| + | pad="< | ||
| + | eos=None, | ||
| + | bos=None, | ||
| + | blank="< | ||
| + | characters=characters, | ||
| + | punctuations=SPANISH_PUNCTUATIONS, | ||
| + | phonemes=None, | ||
| + | is_unique=False, | ||
| + | is_sorted=True, | ||
| + | ) | ||
| + | |||
| + | |||
| + | def load_samples(dataset_path: | ||
| + | samples = [] | ||
| + | csv_path = os.path.join(dataset_path, | ||
| + | |||
| + | with open(csv_path, " | ||
| + | for line in file_handle: | ||
| + | parts = line.strip().split(" | ||
| + | if len(parts) < 2: | ||
| + | continue | ||
| + | |||
| + | audio_id, text = parts[0], parts[1] | ||
| + | samples.append( | ||
| + | { | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | } | ||
| + | ) | ||
| + | |||
| + | | ||
| + | |||
| + | |||
| + | def main(): | ||
| + | samples = load_samples(PATH_DATASET) | ||
| + | if not samples: | ||
| + | raise RuntimeError(f" | ||
| + | |||
| + | characters_config = build_characters_config([sample[" | ||
| + | |||
| + | # 2. Configurar el Dataset Base | ||
| + | dataset_config = BaseDatasetConfig( | ||
| + | formatter=" | ||
| + | | ||
| + | | ||
| + | ) | ||
| + | |||
| + | # Configuración estándar de audio para VITS | ||
| + | audio_config = BaseAudioConfig( | ||
| + | sample_rate=22050, | ||
| + | resample=True, | ||
| + | | ||
| # 3. Configurar la arquitectura VITS | # 3. Configurar la arquitectura VITS | ||
| - | config = VitsConfig( | + | # 3. Configurar la arquitectura VITS (Modo caracteres puros, sin fonemas externos) |
| - | audio=None, # Coqui calculará los parámetros de audio automáticamente | + | |
| - | run_name=" | + | audio=audio_config, |
| - | batch_size=16, | + | run_name=" |
| - | eval_batch_size=8, | + | batch_size=16, |
| - | num_loader_workers=2, | + | eval_batch_size=8, |
| - | num_eval_loader_workers=2, | + | num_loader_workers=0, |
| - | run_eval=True, | + | num_eval_loader_workers=0, |
| - | test_delay_epochs=5, | + | run_eval=True, |
| - | epochs=100, | + | test_delay_epochs=5, |
| - | text_cleaner=" | + | epochs=100, |
| - | | + | text_cleaner=" |
| - | | + | |
| - | phoneme_cache_path=os.path.join(PATH_SALIDA, | + | datasets=[dataset_config], |
| - | | + | |
| - | output_path=PATH_SALIDA | + | |
| - | ) | + | ) |
| - | # 4. Cargar muestras | + | |
| - | train_samples, | + | |
| - | # 5. Inicializar el modelo VITS | + | train_samples = samples |
| - | model = Vits(config) | + | |
| + | |||
| + | | ||
| + | |||
| + | trainer = Trainer( | ||
| + | TrainerArgs(), | ||
| + | config, | ||
| + | output_path=PATH_SALIDA, | ||
| + | model=model, | ||
| + | train_samples=train_samples, | ||
| + | eval_samples=eval_samples, | ||
| + | ) | ||
| + | |||
| + | print(f" | ||
| + | print(f" | ||
| + | print(" | ||
| + | trainer.fit() | ||
| + | |||
| + | |||
| + | if __name__ == " | ||
| + | main() | ||
| + | </ | ||
| + | |||
| + | Luego lo ejecutamos con (cambiando el directorio de entrenamiento por el que corresponda): | ||
| + | python entrenar_vits.py --device mps --continue_path resultado_entrenamiento/ | ||
| + | |||
| + | En mi caso: | ||
| + | python entrenar_vits.py --device mps --continue_path resultado_entrenamiento/ | ||
| + | |||
| + | MAC: | ||
| + | |||
| + | Las notas de audio están en | ||
| + | open ~/ | ||
| + | |||
| + | |||
| + | |||
| + | ====== Gráficas de rendimiento ====== | ||
| + | Levantamos el servidor de TensorBoard con: | ||
| + | tensorboard --logdir=/ | ||
| + | |||
| + | Entramos en: http:// | ||
| + | |||
| + | Sacado de ChatGPT: | ||
| + | < | ||
| + | loss_disc y loss_gen: Son las gráficas de rendimiento del Discriminador y el Generador de VITS. Verás curvas que van bajando. Cuanto más abajo y estables estén, mejor y más limpia sonará tu voz. | ||
| - | # 6. Configurar | + | loss_mel: Te indica cómo de bien está aprendiendo |
| - | trainer = Trainer( | + | |
| - | TrainerArgs(), | + | |
| - | config, | + | |
| - | output_path=PATH_SALIDA, | + | |
| - | model=model, | + | |
| - | train_samples=train_samples, | + | |
| - | eval_samples=eval_samples, | + | |
| - | ) | + | |
| - | # 7. ¡FUEGO! Lanzar el entrenamiento | + | Rendimiento de tiempo: Te muestra cuántos segundos tarda por cada paso de entrenamiento. |
| - | print(" | + | |
| - | trainer.fit() | + | |
| </ | </ | ||
informatica/inteligencia_artificial/tts.1782470508.txt.gz · Last modified: by jose
