文章目录
- 项目结构
- 网络训练
- 测试音频合成
- 主要代码
-
- hyperparams.py
- prepare_data.py
- preprocess.py
- module.py
- network.py
- train_transformer.py
- train_postnet.py
- synthesis.py
- 总结
项目结构
- checkpoint --存储训练有素的模型文件
- data --存储训练数据
- png
- samples --样本音频
- text --处理文本的py文件
- hyperparams.py --存储所有所需的超参数
- prepare_data.py --将音频wav文件处理为mel、linear spectrogram保存方便后续训练
- preprocess.py --包括所有加载数据所需的预处理代码
- module.py --包括注意力、前网络、后网络等所有模块
- network.py --包括编码器、解码器、后处理网络等所有网络结构
- train_transformer.py --训练回归注意网络,将文本转换为mel谱图
- train_postnet.py --训练后处理网络,将mel谱图转换为线性谱图
- synthesis.py --合成TTS样本
网络训练
- 下载LJSpeech在适当的路径下解压数据
- 调整hyperparams.py注意文件中的超参数data_path设置为LJSpeech存储的路径
- 执行prepare_data.py文件
- 执行train_transformer.py文件
- 执行train_postnet.py文件
测试音频合成
执行synthesis.py文件
主要代码
hyperparams.py
# All hyperparameters used by the project.

# Audio
num_mels = 80
# num_freq = 1024
n_fft = 2048
sr = 22050  # sampling rate (Hz)
# Some modules refer to the sampling rate as hp.sample_rate; keep an alias so
# both names resolve to the same value (backward compatible).
sample_rate = sr
# frame_length_ms = 50.
# frame_shift_ms = 12.5
preemphasis = 0.97
frame_shift = 0.0125  # seconds
frame_length = 0.05  # seconds
hop_length = int(sr * frame_shift)  # samples
win_length = int(sr * frame_length)  # samples
n_mels = 80  # Number of Mel banks to generate
power = 1.2  # Exponent for amplifying the predicted magnitude
min_level_db = -100
ref_level_db = 20
hidden_size = 256
embedding_size = 512
max_db = 100
ref_db = 20
n_iter = 60
# power = 1.5
outputs_per_step = 1

# Training
epochs = 10000
lr = 0.001
save_step = 2000
image_step = 500
batch_size = 32

# Text / paths
cleaners = 'english_cleaners'
data_path = './data/LJSpeech-1.1/LJSpeech-1.1'
checkpoint_path = './checkpoint'
sample_path = './samples'
prepare_data.py
定义一个数据类型,从LJSpeech数据集中提取训练所需的文本、语音的mel谱图和mag特征/显性图谱。该文件主要是从.wav文件中提取出所需的特征数据并进行保存,方便后续使用时调用
import os

import librosa
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

import hyperparams as hp
from utils import get_spectrograms
# 主要用于从wave文件中提取特征数据
class PrepareDataset(Dataset):
    """LJSpeech dataset.

    Extracts mel / magnitude spectrograms from the raw .wav files and caches
    them next to the audio as .npy files, so later training runs can load the
    features directly instead of recomputing them.
    """

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the csv file with annotations
                (one row per utterance: wav file name, then transcript).
            root_dir (string): Directory with all the wavs.
        """
        # metadata.csv is '|'-separated: column 0 is the wav file name,
        # the remaining columns hold the transcript text.
        self.landmarks_frame = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir

    def load_wav(self, filename):
        # hyperparams defines the sample rate as `sr` (there is no
        # hp.sample_rate in the config shown); use the existing name.
        return librosa.load(filename, sr=hp.sr)

    def __len__(self):
        # Number of utterances = number of metadata rows.
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        """Extract and cache the spectrograms for utterance `idx`."""
        wav_name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        mel, mag = get_spectrograms(wav_name)
        # Cache features beside the wav.  np.save appends '.npy', so the files
        # on disk are '<name>.pt.npy' ([T, n_mels] mel) and '<name>.mag.npy'
        # ([T, 1 + n_fft//2] magnitude); T varies per utterance.
        np.save(wav_name[:-4] + '.pt', mel)
        np.save(wav_name[:-4] + '.mag', mag)
        sample = {'mel': mel, 'mag': mag}
        return sample
上述代码中调用了函数get_spectrograms()从’.wav’的语音文件抽取对应的mel和mag(幅度谱图)。代码如下,此代码可认为是抽取语音文件特征的标准代码,在不同场景使用时整个步骤流程基本一致,改变的可能只是部分参数
def get_spectrograms(fpath):
    '''Parse the wave file in `fpath` and return its normalized mel
    spectrogram and linear (magnitude) spectrogram.

    Args:
        fpath: A string. The full path of a sound file.

    Returns:
        mel: A 2d array of shape (T, n_mels) and dtype of float32.
        mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
    '''
    # Loading sound file
    y, sr = librosa.load(fpath, sr=hp.sr)
    # Trim leading/trailing silence
    y, _ = librosa.effects.trim(y)
    # Pre-emphasis: y[t] <- y[t] - preemphasis * y[t-1] (boosts high freqs)
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])
    # Short-time Fourier transform
    linear = librosa.stft(y=y,
                          n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length)
    # Magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)
    # Mel filterbank.  Keyword arguments are required: librosa >= 0.10 made
    # filters.mel keyword-only, so the old positional call raises TypeError.
    mel_basis = librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, T)
    # To decibel; floor at 1e-5 to avoid log(0)
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))
    # Normalize into (0, 1]
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    # Transpose to time-major
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)
    return mel, mag
preprocess.py
其中主要为模型训练数据的加载与相关预处理代码
import hyperparams as hp
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
import librosa
import numpy as np
from text import text_to_sequence
import collections
from scipy import signal
import torch as t
import math
# 创建模型训练所使用的数据集
class LJDatasets(Dataset):
    """LJSpeech dataset for training the text-to-mel transformer.

    Each item pairs a character-id sequence with the mel spectrogram cached by
    prepare_data.py, plus the shifted decoder input and 1-based position
    indices for both streams.
    """

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the wavs.
        """
        self.landmarks_frame = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir

    def load_wav(self, filename):
        # hyperparams defines the sample rate as `sr` (there is no
        # hp.sample_rate in the config shown); use the existing name.
        return librosa.load(filename, sr=hp.sr)

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        """Return one training sample by index."""
        # Path of the wav for row idx; the cached features sit next to it.
        wav_name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        text = self.landmarks_frame.iloc[idx, 1]
        # Character-level tokenization; text_to_sequence appends an EOS id (1).
        text = np.asarray(text_to_sequence(text, [hp.cleaners]), dtype=np.int32)
        # Mel cached by prepare_data.py as '<name>.pt.npy'.
        mel = np.load(wav_name[:-4] + '.pt.npy')
        # Teacher-forcing decoder input: drop the last mel frame and prepend
        # an all-zero "go" frame.
        mel_input = np.concatenate([np.zeros([1, hp.num_mels], np.float32), mel[:-1, :]], axis=0)
        text_length = len(text)
        # 1-based position indices for the positional-encoding lookup
        # (0 is reserved for padding).
        pos_text = np.arange(1, text_length + 1)
        pos_mel = np.arange(1, mel.shape[0] + 1)
        sample = {'text': text, 'mel': mel, 'text_length': text_length,
                  'mel_input': mel_input, 'pos_mel': pos_mel,
                  'pos_text': pos_text}
        return sample
# 用于后续加载mel图谱和mag谱图数据
class PostDatasets(Dataset):
    """LJSpeech dataset for the post-net: yields cached (mel, mag) pairs."""

    def __init__(self, csv_file, root_dir):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the wavs.
        """
        self.landmarks_frame = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        """Load the cached mel and magnitude spectrograms for row `idx`."""
        stem = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        # Both arrays were written next to the wav by prepare_data.py.
        mel = np.load(stem[:-4] + '.pt.npy')
        mag = np.load(stem[:-4] + '.mag.npy')
        return {'mel': mel, 'mag': mag}
# 用于对LJDatasets类构建的数据进行batch中的转换处理
def collate_fn_transformer(batch):
    """Collate LJDatasets samples into padded, length-sorted batch tensors.

    Samples are sorted by text length (descending), zero-padded to the longest
    sequence in the batch, and converted to torch tensors.

    Returns:
        (text, mel, mel_input, pos_text, pos_mel, text_length) tensors.

    Raises:
        TypeError: if the batch elements are not mappings.
    """
    # collections.Mapping was removed in Python 3.10; use collections.abc.
    from collections.abc import Mapping

    if isinstance(batch[0], Mapping):
        text = [d['text'] for d in batch]
        mel = [d['mel'] for d in batch]
        mel_input = [d['mel_input'] for d in batch]
        text_length = [d['text_length'] for d in batch]
        pos_mel = [d['pos_mel'] for d in batch]
        pos_text = [d['pos_text'] for d in batch]

        # One stable argsort by descending text length, applied to every field
        # (equivalent to the original per-field sorted(zip(...)) calls, which
        # all used the same key).
        order = sorted(range(len(batch)), key=lambda i: text_length[i], reverse=True)
        text = [text[i] for i in order]
        mel = [mel[i] for i in order]
        mel_input = [mel_input[i] for i in order]
        pos_text = [pos_text[i] for i in order]
        pos_mel = [pos_mel[i] for i in order]
        text_length = [text_length[i] for i in order]

        # Zero-pad every sequence to the batch maximum.
        text = _prepare_data(text).astype(np.int32)
        mel = _pad_mel(mel)
        mel_input = _pad_mel(mel_input)
        pos_mel = _prepare_data(pos_mel).astype(np.int32)
        pos_text = _prepare_data(pos_text).astype(np.int32)

        return t.LongTensor(text), t.FloatTensor(mel), t.FloatTensor(mel_input), t.LongTensor(pos_text), t.LongTensor(
            pos_mel), t.LongTensor(text_length)

    raise TypeError(("batch must contain tensors, numbers, dicts or lists; found {}"
                     .format(type(batch[0]))))
# 用于对PostDatasets类构建的数据进行batch中的转换处理
def collate_fn_postnet(batch):
    """Collate PostDatasets samples: zero-pad mel/mag to the batch max length.

    Returns:
        (mel, mag) FloatTensors, each padded along the time axis.

    Raises:
        TypeError: if the batch elements are not mappings.
    """
    # collections.Mapping was removed in Python 3.10; use collections.abc.
    from collections.abc import Mapping

    if isinstance(batch[0], Mapping):
        mel = _pad_mel([d['mel'] for d in batch])
        mag = _pad_mel([d['mag'] for d in batch])
        return t.FloatTensor(mel), t.FloatTensor(mag)

    raise TypeError(("batch must contain tensors, numbers, dicts or lists; found {}"
                     .format(type(batch[0]))))
def _pad_data(x, length): # 使用0对输出的x进行pad到指定长度length
_pad = 0
return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
def _prepare_data(inputs): # 将inputs中所有的序列用0pad到其中最长序列的长度
max_len = max((len(x) for x in inputs))
return np.stack([_pad_data(x, max_len) for x in inputs])
def _pad_per_step(inputs):
    """Zero-pad the last (time) axis of the [B, D, T] array `inputs` so that T
    becomes a multiple of hp.outputs_per_step.

    Fix: the original computed `step - (T % step)`, which appends a full extra
    step when T is already aligned; `(-T) % step` pads nothing in that case.
    """
    timesteps = inputs.shape[-1]
    pad = (-timesteps) % hp.outputs_per_step  # 0 when already aligned
    return np.pad(inputs, [[0, 0], [0, 0], [0, pad]],
                  mode='constant', constant_values=0.0)
def _pad_mel(inputs): # 将一个batch中所有的mel用0pad到其中最大长度的大小
_pad = 0
def _pad_one(x, max_len):
mel_len = x.shape[0]
return np.pad(x, [[0, max_len - mel_len], [0, 0]], mode='constant', constant_values=_pad)
max_len = max((x.shape[0] for x in inputs))
return np.stack([_pad_one(x, max_len) for x in inputs])
# 计算模型的参数大小
def get_param_size(model):
    """Return the total number of scalar parameters in `model`."""
    total = 0
    for p in model.parameters():
        # numel() == product over p.size(), matching the hand-rolled product.
        total += p.numel()
    return total
def get_dataset():
    """Build the transformer-training dataset from the configured LJSpeech root."""
    metadata = os.path.join(hp.data_path, 'metadata.csv')
    wav_dir = os.path.join(hp.data_path, 'wavs')
    return LJDatasets(metadata, wav_dir)
def get_post_dataset():
    """Build the post-net training dataset from the configured LJSpeech root."""
    metadata = os.path.join(hp.data_path, 'metadata.csv')
    wav_dir = os.path.join(hp.data_path, 'wavs')
    return PostDatasets(metadata, wav_dir)
module.py
该文件中包含模型搭建的所有的模块
# 包含所有的模型方法 import torch.nn as nn import torch as t import torch.nn.functional as F import math import hyperparams as hp from text.symbols import symbols import numpy as np import copy from collections import OrderedDict def clones(module, N): """对传入的module深度复制n份,并放在一个modulelist中""" return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) class Linear(nn.Module): """ Linear Module,定义线性全连接层,使用xavier_uniform_进行初始化 """ def __init__(self, in_dim, out_dim, bias=True, w_init='linear'): """ :param in_dim: dimension of input :param out_dim: dimension of output :param bias: boolean. if True, bias is included. :param w_init: str. weight inits with xavier initialization. """ super(Linear, self).__init__() self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) nn.init.xavier_uniform_( self.linear_layer.weight, gain=nn.init.calculate_gain(w_init) 标签:bsz808a振动传感器变送器