资讯详情

TTS项目--github--soobinseo/Transformer-TTS

文章目录

  • 项目结构
  • 网络训练
  • 测试音频合成
  • 主要代码
    • hyperparams.py
    • prepare_data.py
    • preprocess.py
    • module.py
    • network.py
    • train_transformer.py
    • train_postnet.py
    • synthesis.py
  • 总结
项目链接: https://github.com/soobinseo/Transformer-TTS 论文链接: https://arxiv.org/abs/1809.08895

项目结构

网络训练

  1. 下载LJSpeech在适当的路径下解压数据
  2. 调整hyperparams.py注意文件中的超参数data_path设置为LJSpeech存储的路径
  3. 执行prepare_data.py文件
  4. 执行train_transformer.py文件
  5. 执行train_postnet.py文件

测试音频合成

执行synthesis.py文件

主要代码

hyperparams.py

# All hyperparameters for the Transformer-TTS project.
# (Reconstructed into valid multi-line Python: the pasted version had the
# whole module collapsed onto a single line.)

# Audio
num_mels = 80
# num_freq = 1024
n_fft = 2048
sr = 22050
# frame_length_ms = 50.
# frame_shift_ms = 12.5
preemphasis = 0.97
frame_shift = 0.0125  # seconds
frame_length = 0.05  # seconds
hop_length = int(sr * frame_shift)  # samples.
win_length = int(sr * frame_length)  # samples.
n_mels = 80  # Number of Mel banks to generate
power = 1.2  # Exponent for amplifying the predicted magnitude
min_level_db = -100
ref_level_db = 20
hidden_size = 256
embedding_size = 512
max_db = 100
ref_db = 20

n_iter = 60
# power = 1.5
outputs_per_step = 1

epochs = 10000
lr = 0.001
save_step = 2000
image_step = 500
batch_size = 32

cleaners = 'english_cleaners'

data_path = './data/LJSpeech-1.1/LJSpeech-1.1'
checkpoint_path = './checkpoint'
sample_path = './samples'

prepare_data.py

定义一个数据集类型,从LJSpeech数据集中提取训练所需的文本、语音的mel谱图和mag(线性幅度)谱图特征。该文件主要是从.wav文件中提取出所需的特征数据并进行保存,方便后续使用时调用

import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader  # fixed typo: was "DalaLoader" (ImportError)
import os
from utils import get_spectrograms
import hyperparams as hp
import librosa


# 主要用于从wave文件中提取特征数据
class PrepareDataset(Dataset):
    """LJSpeech dataset: extracts mel/mag spectrograms from each wav and
    caches them next to the audio file for later training use."""

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the csv file with annotations (transcripts).
            root_dir (string): Directory with all the wavs.
        """
        # Each row of the metadata file: wav base name | transcript text,
        # where the transcript matches the spoken audio content.
        self.landmarks_frame = pd.read_csv(csv_file, sep='|', header=None)  # the csv is '|'-separated
        self.root_dir = root_dir

    def load_wav(self, filename):
        # Fixed: hyperparams.py defines `sr`, not `sample_rate`; the original
        # `hp.sample_rate` raised AttributeError when called.
        return librosa.load(filename, sr=hp.sr)

    def __len__(self):
        return len(self.landmarks_frame)  # number of utterances

    def __getitem__(self, idx):
        # Build the wav path for index idx and extract its mel and mag spectrograms.
        wav_name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        mel, mag = get_spectrograms(wav_name)  # (T, n_mels) and (T, 1+n_fft//2)

        # Cache the features beside the wav; np.save appends ".npy", producing
        # "<stem>.pt.npy" / "<stem>.mag.npy", which is exactly what the
        # training datasets load later. T varies per utterance.
        np.save(wav_name[:-4] + '.pt', mel)
        np.save(wav_name[:-4] + '.mag', mag)

        sample = { 
        'mel': mel, 'mag': mag}

        return sample

上述代码中调用了函数get_spectrograms()从’.wav’的语音文件抽取对应的mel和mag(幅度谱图)。代码如下,此代码可认为是抽取语音文件特征的标准代码,在不同场景使用时整个步骤流程基本一致,改变的可能只是部分参数

def get_spectrograms(fpath):
    '''Parse the wave file in `fpath` and return a normalized mel-spectrogram
    and linear (magnitude) spectrogram.

    Args:
        fpath: A string. The full path of a sound file.

    Returns:
        mel: A 2d array of shape (T, n_mels) and dtype of float32.
        mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
    '''
    # Loading sound file
    y, sr = librosa.load(fpath, sr=hp.sr)

    # Trim leading/trailing silence
    y, _ = librosa.effects.trim(y)

    # Pre-emphasis filter: y[t] -> y[t] - preemphasis * y[t-1]
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # Short-time Fourier transform
    linear = librosa.stft(y=y,
                          n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length)

    # Magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # Mel spectrogram.
    # Fixed: librosa >= 0.10 requires keyword arguments here; the positional
    # form `librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels)` raises TypeError.
    mel_basis = librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t)

    # To decibel (floor at 1e-5 to avoid log(0))
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # Normalize into (0, 1] using the reference and max dB levels
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

    # Transpose to time-major layout
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, mag  # mel spectrogram and magnitude spectrogram

preprocess.py

其中主要为模型训练数据的加载与相关预处理代码

import hyperparams as hp
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
import librosa
import numpy as np
from text import text_to_sequence
import collections
from scipy import signal
import torch as t
import math


# 创建模型训练所使用的数据集
class LJDatasets(Dataset):
    """LJSpeech dataset for transformer training: yields the text sequence,
    the cached mel spectrogram, a shifted decoder-input mel, and position
    indices for both."""

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the wavs.
        """
        self.landmarks_frame = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir

    def load_wav(self, filename):
        # Fixed: hyperparams.py defines `sr`, not `sample_rate`; the original
        # `hp.sample_rate` raised AttributeError when called.
        return librosa.load(filename, sr=hp.sr)

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        # Resolve the wav path and transcript for index idx.
        wav_name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        text = self.landmarks_frame.iloc[idx, 1]

        # Convert the transcript to an int sequence (character-level tokens;
        # text_to_sequence appends an end token of 1).
        text = np.asarray(text_to_sequence(text, [hp.cleaners]), dtype=np.int32)
        mel = np.load(wav_name[:-4] + '.pt.npy')  # cached mel spectrogram
        # Decoder input: drop the last mel frame and prepend an all-zero frame
        # (standard teacher-forcing shift).
        mel_input = np.concatenate([np.zeros([1, hp.num_mels], np.float32), mel[:-1, :]], axis=0)
        text_length = len(text)
        # 1-based position indices used for positional encoding lookups
        # (0 is reserved for padding).
        pos_text = np.arange(1, text_length + 1)
        pos_mel = np.arange(1, mel.shape[0] + 1)

        sample = { 
        'text': text, 'mel': mel, 'text_length': text_length, 'mel_input': mel_input, 'pos_mel': pos_mel,
                  'pos_text': pos_text}

        return sample


# 用于后续加载mel图谱和mag谱图数据
class PostDatasets(Dataset):
    """LJSpeech dataset for postnet training: yields the cached mel and
    magnitude spectrogram pair for each utterance."""

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the wavs.
        """
        self.landmarks_frame = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        # Path of the wav for this index; the cached features live beside it.
        wav_path = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        stem = wav_path[:-4]
        return {
            'mel': np.load(stem + '.pt.npy'),   # cached mel spectrogram
            'mag': np.load(stem + '.mag.npy'),  # cached magnitude spectrogram
        }


# 用于对LJDatasets类构建的数据进行batch中的转换处理
def collate_fn_transformer(batch):
    """Collate LJDatasets samples into padded batch tensors.

    Sorts every field by descending text length, zero-pads each field to the
    longest entry in the batch, and returns
    (text, mel, mel_input, pos_text, pos_mel, text_length) tensors.

    Raises:
        TypeError: if the batch elements are not mappings.
    """
    # Fixed: collections.Mapping was removed in Python 3.10; the abc module
    # is imported locally to guarantee collections.abc is available.
    import collections.abc

    if isinstance(batch[0], collections.abc.Mapping):
        text = [d['text'] for d in batch]
        mel = [d['mel'] for d in batch]
        mel_input = [d['mel_input'] for d in batch]
        text_length = [d['text_length'] for d in batch]  # (comment typo "test_length" fixed)
        pos_mel = [d['pos_mel'] for d in batch]
        pos_text = [d['pos_text'] for d in batch]

        # Pair each field with its text length and sort all of them by
        # descending text length so the fields stay aligned.
        text = [i for i, _ in sorted(zip(text, text_length), key=lambda x: x[1], reverse=True)]
        mel = [i for i, _ in sorted(zip(mel, text_length), key=lambda x: x[1], reverse=True)]
        mel_input = [i for i, _ in sorted(zip(mel_input, text_length), key=lambda x: x[1], reverse=True)]
        pos_text = [i for i, _ in sorted(zip(pos_text, text_length), key=lambda x: x[1], reverse=True)]
        pos_mel = [i for i, _ in sorted(zip(pos_mel, text_length), key=lambda x: x[1], reverse=True)]
        text_length = sorted(text_length, reverse=True)

        # PAD sequences with largest length of the batch
        text = _prepare_data(text).astype(np.int32)      # zero-pad to the longest text
        mel = _pad_mel(mel)                              # zero-pad along the time axis
        mel_input = _pad_mel(mel_input)
        pos_mel = _prepare_data(pos_mel).astype(np.int32)
        pos_text = _prepare_data(pos_text).astype(np.int32)

        return t.LongTensor(text), t.FloatTensor(mel), t.FloatTensor(mel_input), t.LongTensor(pos_text), t.LongTensor(
            pos_mel), t.LongTensor(text_length)

    raise TypeError(("batch must contain tensors, numbers, dicts or lists; found {}"
                     .format(type(batch[0]))))


# 用于对PostDatasets类构建的数据进行batch中的转换处理
def collate_fn_postnet(batch):
    """Collate PostDatasets samples into padded (mel, mag) batch tensors.

    Raises:
        TypeError: if the batch elements are not mappings.
    """
    # Fixed: collections.Mapping was removed in Python 3.10; the abc module
    # is imported locally to guarantee collections.abc is available.
    import collections.abc

    if isinstance(batch[0], collections.abc.Mapping):
        mel = [d['mel'] for d in batch]
        mag = [d['mag'] for d in batch]

        # PAD sequences with largest length of the batch
        mel = _pad_mel(mel)
        mag = _pad_mel(mag)

        return t.FloatTensor(mel), t.FloatTensor(mag)

    raise TypeError(("batch must contain tensors, numbers, dicts or lists; found {}"
                     .format(type(batch[0]))))


def _pad_data(x, length):  # 使用0对输出的x进行pad到指定长度length
    _pad = 0
    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)


def _prepare_data(inputs):
    """Zero-pad every sequence in ``inputs`` to the longest one and stack
    them into a single 2-D array."""
    longest = max(len(seq) for seq in inputs)
    padded = [_pad_data(seq, longest) for seq in inputs]
    return np.stack(padded)


def _pad_per_step(inputs):
    """Zero-pad the last (time) axis of a 3-D batch so it aligns with
    hp.outputs_per_step.

    NOTE(review): when timesteps is already an exact multiple, this still
    appends a full extra ``outputs_per_step`` columns — kept unchanged to
    preserve the original behavior; confirm whether that is intended.
    """
    timesteps = inputs.shape[-1]
    tail = hp.outputs_per_step - (timesteps % hp.outputs_per_step)
    pad_spec = [[0, 0], [0, 0], [0, tail]]
    return np.pad(inputs, pad_spec, mode='constant', constant_values=0.0)


def _pad_mel(inputs):  # 将一个batch中所有的mel用0pad到其中最大长度的大小
    _pad = 0

    def _pad_one(x, max_len):
        mel_len = x.shape[0]
        return np.pad(x, [[0, max_len - mel_len], [0, 0]], mode='constant', constant_values=_pad)

    max_len = max((x.shape[0] for x in inputs))
    return np.stack([_pad_one(x, max_len) for x in inputs])


# 计算模型的参数大小
def get_param_size(model):
    """Return the total number of scalar parameters in ``model``."""
    total = 0
    for param in model.parameters():
        # product of the tensor's dimensions = element count
        total += math.prod(param.size())
    return total


def get_dataset():
    """Build the transformer-training dataset rooted at hp.data_path."""
    metadata = os.path.join(hp.data_path, 'metadata.csv')
    wav_dir = os.path.join(hp.data_path, 'wavs')
    return LJDatasets(metadata, wav_dir)


def get_post_dataset():
    """Build the postnet-training dataset rooted at hp.data_path."""
    metadata = os.path.join(hp.data_path, 'metadata.csv')
    wav_dir = os.path.join(hp.data_path, 'wavs')
    return PostDatasets(metadata, wav_dir)

module.py

该文件中包含模型搭建的所有的模块

# 包含所有的模型方法

import torch.nn as nn
import torch as t
import torch.nn.functional as F
import math
import hyperparams as hp
from text.symbols import symbols
import numpy as np
import copy
from collections import OrderedDict


def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of ``module``."""
    copies = [copy.deepcopy(module) for _ in range(N)]
    return nn.ModuleList(copies)


class Linear(nn.Module):
    """ Linear Module,定义线性全连接层,使用xavier_uniform_进行初始化 """

    def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
        """ :param in_dim: dimension of input :param out_dim: dimension of output :param bias: boolean. if True, bias is included. :param w_init: str. weight inits with xavier initialization. """
        super(Linear, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init)
        标签: bsz808a振动传感器变送器

锐单商城拥有海量元器件数据手册IC替代型号,打造 电子元器件IC百科大全!

 锐单商城 - 一站式电子元器件采购平台  

 深圳锐单电子有限公司