From 9b4aa92667ce7977f30f9b6473b567567a78046f Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 19 Mar 2018 09:27:19 -0700
Subject: [PATCH 01/22] Adding harmonized teacher-forcing

---
 layers/tacotron.py | 12 ++++++++++--
 models/tacotron.py |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/layers/tacotron.py b/layers/tacotron.py
index 38471214..b76c1e9f 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -286,7 +286,15 @@ class Decoder(nn.Module):
         memory_input = initial_memory
         while True:
             if t > 0:
-                memory_input = outputs[-1] if greedy else memory[t - 1]
+                if greedy:
+                    memory_input = outputs[-1]
+                else:
+                    # combine prev. model output and prev. real target
+                    memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
+                    # add a random noise
+                    noise = torch.autograd.Variable(
+                        memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
+                    memory_input = memory_input + noise

             # Prenet
             processed_memory = self.prenet(memory_input)
@@ -338,4 +346,4 @@ class Decoder(nn.Module):

 def is_end_of_frames(output, eps=0.2): #0.2
-    return (output.data <= eps).all()
+    return (output.data <= eps).all()
\ No newline at end of file
diff --git a/models/tacotron.py b/models/tacotron.py
index 0b55b76b..7653f1c3 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -34,7 +34,7 @@ class Tacotron(nn.Module):

         # (B, T', mel_dim*r)
         mel_outputs, alignments = self.decoder(
-            encoder_outputs, mel_specs, input_lengths=input_lengths)
+            encoder_outputs, mel_specs)

         # Post net processing below

From cb4840638345c81e7d49d438b659f18f36d8fa5d Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 19 Mar 2018 10:38:47 -0700
Subject: [PATCH 02/22] Don't use teacher forcing at test time

---
 layers/tacotron.py | 6 ++++--
 train.py           | 3 +--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/layers/tacotron.py b/layers/tacotron.py
index b76c1e9f..c0828d08 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -48,6 +48,7 @@ class BatchNormConv1d(nn.Module):
         - input: batch x dims
         - output: batch x dims
     """
+
     def __init__(self, in_channels, out_channels, kernel_size,
                  stride, padding, activation=None):
         super(BatchNormConv1d, self).__init__()
@@ -241,7 +242,8 @@ class Decoder(nn.Module):
         Args:
             inputs: Encoder outputs.
             memory (None): Decoder memory (autoregression. If None (at eval-time),
-              decoder outputs are used as decoder inputs.
+              decoder outputs are used as decoder inputs, i.e. the decoder
+              feeds its last output back as the next input.
        Shapes:
            - inputs: batch x time x encoder_out_dim
@@ -293,7 +295,7 @@ class Decoder(nn.Module):
                     memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
                     # add a random noise
                     noise = torch.autograd.Variable(
-                        memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
+                        memory_input.data.new(memory_input.size()).normal_(0.0, 0.5))
                     memory_input = memory_input + noise

             # Prenet
diff --git a/train.py b/train.py
index 7b32d74c..53027615 100644
--- a/train.py
+++ b/train.py
@@ -228,8 +228,7 @@ def evaluate(model, criterion, data_loader, current_step):
             linear_spec_var = linear_spec_var.cuda()

         # forward pass
-        mel_output, linear_output, alignments =\
-            model.forward(text_input_var, mel_spec_var)
+        mel_output, linear_output, alignments = model.forward(text_input_var)

         # loss computation
         mel_loss = criterion(mel_output, mel_spec_var)

From 5750090fcd4a6718608c7357e3890f5b2a545ae0 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 12:34:16 -0700
Subject: [PATCH 03/22] Stop token prediction - does not train yet

---
 config.json             |  6 +++---
 datasets/LJSpeech.py    | 23 +++++++++++++++++-----
 layers/.tacotron.py.swo | Bin 28672 -> 0 bytes
 layers/tacotron.py      | 40 ++++++++++++++++++++++++--------------
 models/.tacotron.py.swo | Bin 12288 -> 0 bytes
 models/tacotron.py      |  7 +++++--
 tests/layers_tests.py   | 11 ++++++++---
 tests/loader_tests.py   | 10 +++++++---
 train.py                | 42 +++++++++++++++++++++++++++++-----------
 utils/data.py           | 23 ++++++++++++++++++++++
 10 files changed, 121 insertions(+), 41 deletions(-)
 delete mode 100644 layers/.tacotron.py.swo
 delete mode 100644 models/.tacotron.py.swo

diff --git a/config.json b/config.json
index ffea4466..285e1d8d 100644
--- a/config.json
+++ b/config.json
@@ -12,16 +12,16 @@
     "text_cleaner": "english_cleaners",

     "epochs": 2000,
-    "lr": 0.001,
+    "lr": 0.0003,
     "warmup_steps": 4000,
     "batch_size": 32,
-    "eval_batch_size": 32,
+    "eval_batch_size":32,
     "r": 5,

     "griffin_lim_iters": 60,
     "power": 1.5,

-    "num_loader_workers": 12,
+    "num_loader_workers": 8,

     "checkpoint": false,
     "save_step": 69,
diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py
index 5334e1ca..fb6c9304 100644
--- a/datasets/LJSpeech.py
+++ b/datasets/LJSpeech.py
@@ -7,7 +7,8 @@ from torch.utils.data import Dataset

 from TTS.utils.text import text_to_sequence
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.data import prepare_data, pad_data, pad_per_step
+from TTS.utils.data import (prepare_data, pad_data, pad_per_step,
+                            prepare_tensor, prepare_stop_target)


 class LJSpeechDataset(Dataset):
@@ -93,15 +94,26 @@ class LJSpeechDataset(Dataset):
             text_lenghts = np.array([len(x) for x in text])
             max_text_len = np.max(text_lenghts)

+            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
+            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
+            mel_lengths = [m.shape[1] for m in mel]
+
+            # compute 'stop token' targets
+            stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
+
             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
             wav = prepare_data(wav)

-            linear = np.array([self.ap.spectrogram(w).astype('float32') for w in wav])
-            mel = np.array([self.ap.melspectrogram(w).astype('float32') for w in wav])
+            # PAD features with largest length of the batch
+            linear = prepare_tensor(linear)
+            mel = prepare_tensor(mel)
             assert mel.shape[2] == linear.shape[2]
             timesteps = mel.shape[2]

+            # PAD stop targets
+            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
+
             # PAD with zeros that can be divided by outputs per step
            if (timesteps + 1) % self.outputs_per_step != 0:
                 pad_len = self.outputs_per_step - \
@@ -112,7 +124,7 @@ class LJSpeechDataset(Dataset):
                 linear = pad_per_step(linear, pad_len)
                 mel = pad_per_step(mel, pad_len)

-            # reshape jombo
+            # reshape mojo
             linear = linear.transpose(0, 2, 1)
             mel = mel.transpose(0, 2, 1)

@@ -121,7 +133,8 @@ class LJSpeechDataset(Dataset):
             text = torch.LongTensor(text)
             linear = torch.FloatTensor(linear)
             mel = torch.FloatTensor(mel)
-            return text, text_lenghts, linear, mel, item_idxs[0]
+            stop_targets = torch.FloatTensor(stop_targets)
+            return text, text_lenghts, linear, mel, stop_targets, item_idxs[0]

         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
                          found {}"
diff --git a/layers/.tacotron.py.swo b/layers/.tacotron.py.swo
deleted file mode 100644
index c637f4479218ebeffd233f9b79def064bac02fbd..0000000000000000000000000000000000000000
GIT binary patch
[28 KB binary vim swap-file payload omitted]
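Note: the stop-target construction this patch adds to the collate function reduces to
the following sketch. It mirrors the pad_stop_target / prepare_stop_target helpers the
patch adds to utils/data.py further below; the example values are illustrative.

    import numpy as np

    def pad_stop_target(x, length):
        # pad with 1s: every frame at or past the real end counts as "stop"
        return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=1.)

    def prepare_stop_target(inputs, out_steps):
        # pad all targets to a shared length that is a multiple of out_steps (r)
        max_len = max(x.shape[0] for x in inputs)
        pad_len = max_len + out_steps - (max_len % out_steps)
        return np.stack([pad_stop_target(x, pad_len) for x in inputs])

    # two utterances with 3 and 5 mel frames, r=2
    print(prepare_stop_target([np.zeros(3), np.zeros(5)], 2))
    # [[0. 0. 0. 1. 1. 1.]
    #  [0. 0. 0. 0. 0. 1.]]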
diff --git a/layers/tacotron.py b/layers/tacotron.py
index c0828d08..e9a40b24 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -5,6 +5,7 @@ from torch import nn

 from .attention import AttentionRNN
 from .attention import get_mask_from_lengths
+from .custom_layers import StopProjection

 class Prenet(nn.Module):
     r""" Prenet as explained at https://arxiv.org/abs/1703.10135.
@@ -214,8 +215,9 @@ class Decoder(nn.Module):
         r (int): number of outputs per time step.
         eps (float): threshold for detecting the end of a sentence.
     """
-    def __init__(self, in_features, memory_dim, r, eps=0.05):
+    def __init__(self, in_features, memory_dim, r, eps=0.05, mode='train'):
         super(Decoder, self).__init__()
+        self.mode = mode
         self.max_decoder_steps = 200
         self.memory_dim = memory_dim
         self.eps = eps
@@ -231,6 +233,8 @@ class Decoder(nn.Module):
             [nn.GRUCell(256, 256) for _ in range(2)])
         # RNN_state -> |Linear| -> mel_spec
         self.proj_to_mel = nn.Linear(256, memory_dim * r)
+        # RNN_state | attention_context -> |Linear| -> stop_token
+        self.stop_token = StopProjection(256 + in_features, r)

     def forward(self, inputs, memory=None):
         """
@@ -252,10 +256,9 @@ class Decoder(nn.Module):
         B = inputs.size(0)

         # Run greedy decoding if memory is None
-        greedy = memory is None
+        greedy = ~self.training

         if memory is not None:
-
             # Grouping multiple frames if necessary
             if memory.size(-1) == self.memory_dim:
                 memory = memory.view(B, memory.size(1) // self.r, -1)
@@ -283,6 +286,7 @@ class Decoder(nn.Module):

         outputs = []
         alignments = []
+        stop_outputs = []

         t = 0
         memory_input = initial_memory
@@ -292,11 +296,12 @@ class Decoder(nn.Module):
                     memory_input = outputs[-1]
                 else:
                     # combine prev. model output and prev. real target
-                    memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
+                    # memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
                     # add a random noise
-                    noise = torch.autograd.Variable(
-                        memory_input.data.new(memory_input.size()).normal_(0.0, 0.5))
-                    memory_input = memory_input + noise
+                    # noise = torch.autograd.Variable(
+                    #     memory_input.data.new(memory_input.size()).normal_(0.0, 0.5))
+                    # memory_input = memory_input + noise
+                    memory_input = memory[t-1]

             # Prenet
             processed_memory = self.prenet(memory_input)
@@ -316,35 +321,42 @@ class Decoder(nn.Module):
                     decoder_input, decoder_rnn_hiddens[idx])
                 # Residual connectinon
                 decoder_input = decoder_rnn_hiddens[idx] + decoder_input
-
             output = decoder_input
+            stop_token_input = decoder_input
+
+            # stop token prediction
+            stop_token_input = torch.cat((output, current_context_vec), -1)
+            stop_output = self.stop_token(stop_token_input)

             # predict mel vectors from decoder vectors
             output = self.proj_to_mel(output)

             outputs += [output]
             alignments += [alignment]
+            stop_outputs += [stop_output]

             t += 1

-            if greedy:
+            if (not greedy and self.training) or (greedy and memory is not None):
+                if t >= T_decoder:
+                    break
+            else:
                 if t > 1 and is_end_of_frames(output, self.eps):
                     break
                 elif t > self.max_decoder_steps:
                     print(" !! Decoder stopped with 'max_decoder_steps'. \
                           Something is probably wrong.")
                     break
-            else:
-                if t >= T_decoder:
-                    break
-
+        assert greedy or len(outputs) == T_decoder
         # Back to batch first
         alignments = torch.stack(alignments).transpose(0, 1)
         outputs = torch.stack(outputs).transpose(0, 1).contiguous()
+        stop_outputs = torch.stack(stop_outputs).transpose(0, 1).contiguous()

-        return outputs, alignments
+        return outputs, alignments, stop_outputs


 def is_end_of_frames(output, eps=0.2): #0.2
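Note: in the decoder loop above, the stop head sees the decoder RNN state concatenated
with the current attention context. A minimal shape sketch, assuming in_features=256
and r=5 as in config.json (StopProjection itself is added in PATCH 04 below):

    import torch
    from torch import nn

    # stand-in for StopProjection(256 + 256, 5)
    stop_token = nn.Sequential(nn.Dropout(0.5), nn.Linear(256 + 256, 5), nn.Sigmoid())

    output = torch.rand(32, 256)    # decoder RNN state for one step (batch of 32)
    context = torch.rand(32, 256)   # current attention context vector
    stop_output = stop_token(torch.cat((output, context), -1))
    print(stop_output.shape)        # torch.Size([32, 5]) -> r stop probabilities in [0, 1]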
diff --git a/models/.tacotron.py.swo b/models/.tacotron.py.swo
deleted file mode 100644
index b4cfd7c58104b01c73bb3f9d66bfc10d8e7c0613..0000000000000000000000000000000000000000
GIT binary patch
[12 KB binary vim swap-file payload omitted]
diff --git a/models/tacotron.py b/models/tacotron.py
index 7653f1c3..a8b04fbd 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -11,6 +11,7 @@ class Tacotron(nn.Module):
                  freq_dim=1025, r=5, padding_idx=None):
         super(Tacotron, self).__init__()
+        self.r = r
         self.mel_dim = mel_dim
self.linear_dim = linear_dim self.embedding = nn.Embedding(len(symbols), embedding_dim, @@ -26,6 +27,7 @@ class Tacotron(nn.Module): self.last_linear = nn.Linear(mel_dim * 2, freq_dim) def forward(self, characters, mel_specs=None): + B = characters.size(0) inputs = self.embedding(characters) @@ -33,7 +35,7 @@ class Tacotron(nn.Module): encoder_outputs = self.encoder(inputs) # (B, T', mel_dim*r) - mel_outputs, alignments = self.decoder( + mel_outputs, alignments, stop_outputs = self.decoder( encoder_outputs, mel_specs) # Post net processing below @@ -41,8 +43,9 @@ class Tacotron(nn.Module): # Reshape # (B, T, mel_dim) mel_outputs = mel_outputs.view(B, -1, self.mel_dim) + stop_outputs = stop_outputs.view(B, -1) linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) - return mel_outputs, linear_outputs, alignments + return mel_outputs, linear_outputs, alignments, stop_outputs diff --git a/tests/layers_tests.py b/tests/layers_tests.py index 3fbab022..e8ebba0d 100644 --- a/tests/layers_tests.py +++ b/tests/layers_tests.py @@ -37,18 +37,23 @@ class DecoderTests(unittest.TestCase): dummy_memory = T.autograd.Variable(T.rand(4, 120, 32)) print(layer) - output, alignment = layer(dummy_input, dummy_memory) + output, alignment, stop_output = layer(dummy_input, dummy_memory) print(output.shape) + print(" > Stop ", stop_output.shape) + assert output.shape[0] == 4 assert output.shape[1] == 120 / 5 assert output.shape[2] == 32 * 5 - + assert stop_output.shape[0] == 4 + assert stop_output.shape[1] == 120 / 5 + assert stop_output.shape[2] == 5 + class EncoderTests(unittest.TestCase): def test_in_out(self): layer = Encoder(128) - dummy_input = T.autograd.Variable(T.rand(4, 8, 128)) + dummy_input = T.autograd.Variable(T.rand(4, 8, 128)) print(layer) output = layer(dummy_input) diff --git a/tests/loader_tests.py b/tests/loader_tests.py index fdecd6eb..dc023b60 100644 --- a/tests/loader_tests.py +++ b/tests/loader_tests.py @@ -32,7 +32,7 @@ class TestDataset(unittest.TestCase): c.power ) - dataloader = DataLoader(dataset, batch_size=c.batch_size, + dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn, drop_last=True, num_workers=c.num_loader_workers) @@ -43,7 +43,8 @@ class TestDataset(unittest.TestCase): text_lengths = data[1] linear_input = data[2] mel_input = data[3] - item_idx = data[4] + stop_targets = data[4] + item_idx = data[5] neg_values = text_input[text_input < 0] check_count = len(neg_values) @@ -81,13 +82,16 @@ class TestDataset(unittest.TestCase): text_lengths = data[1] linear_input = data[2] mel_input = data[3] - item_idx = data[4] + stop_target = data[4] + item_idx = data[5] # check the last time step to be zero padded assert mel_input[0, -1].sum() == 0 assert mel_input[0, -2].sum() != 0 assert linear_input[0, -1].sum() == 0 assert linear_input[0, -2].sum() != 0 + assert stop_target[0, -1] == 1 + assert stop_target.sum() == 1 diff --git a/train.py b/train.py index 53027615..3531771f 100644 --- a/train.py +++ b/train.py @@ -63,11 +63,12 @@ def signal_handler(signal, frame): sys.exit(1) -def train(model, criterion, data_loader, optimizer, epoch): +def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): model = model.train() epoch_time = 0 avg_linear_loss = 0 avg_mel_loss = 0 + avg_stop_loss = 0 print(" | > Epoch {}/{}".format(epoch, c.epochs)) progbar = Progbar(len(data_loader.dataset) / c.batch_size) @@ -80,6 +81,7 @@ def train(model, criterion, data_loader, optimizer, epoch): text_lengths = data[1] 
linear_input = data[2] mel_input = data[3] + stop_targets = data[4] current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1 @@ -93,6 +95,7 @@ def train(model, criterion, data_loader, optimizer, epoch): # convert inputs to variables text_input_var = Variable(text_input) mel_spec_var = Variable(mel_input) + stop_targets_var = Variable(stop_targets) linear_spec_var = Variable(linear_input, volatile=True) # sort sequence by length for curriculum learning @@ -109,9 +112,10 @@ def train(model, criterion, data_loader, optimizer, epoch): text_input_var = text_input_var.cuda() mel_spec_var = mel_spec_var.cuda() linear_spec_var = linear_spec_var.cuda() + stop_targets_var = stop_targets_var.cuda() # forward pass - mel_output, linear_output, alignments =\ + mel_output, linear_output, alignments, stop_output =\ model.forward(text_input_var, mel_spec_var) # loss computation @@ -119,7 +123,8 @@ def train(model, criterion, data_loader, optimizer, epoch): linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + 0.5 * criterion(linear_output[:, :, :n_priority_freq], linear_spec_var[: ,: ,:n_priority_freq]) - loss = mel_loss + linear_loss + stop_loss = critetion_stop(stop_output, stop_targets_var) + loss = mel_loss + linear_loss + 0.25*stop_loss # backpass and check the grad norm loss.backward() @@ -136,6 +141,7 @@ def train(model, criterion, data_loader, optimizer, epoch): # update progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), ('linear_loss', linear_loss.data[0]), + ('stop_loss', stop_loss.data[0]), ('mel_loss', mel_loss.data[0]), ('grad_norm', grad_norm)]) @@ -144,6 +150,7 @@ def train(model, criterion, data_loader, optimizer, epoch): tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.data[0], current_step) tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.data[0], current_step) + tb.add_scalar('TrainIterLoss/StopLoss', stop_loss.data[0], current_step) tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'], current_step) tb.add_scalar('Params/GradNorm', grad_norm, current_step) @@ -184,19 +191,21 @@ def train(model, criterion, data_loader, optimizer, epoch): avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) - avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss /= (num_iter + 1) + avg_total_loss = avg_mel_loss + avg_linear_loss + 0.25*avg_stop_loss # Plot Training Epoch Stats tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step) tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step) tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step) + tb.add_scalar('TrainEpochLoss/StopLoss', stop_loss.data[0], current_step) tb.add_scalar('Time/EpochTime', epoch_time, epoch) epoch_time = 0 return avg_linear_loss, current_step -def evaluate(model, criterion, data_loader, current_step): +def evaluate(model, criterion, criterion_stop, data_loader, current_step): model = model.eval() epoch_time = 0 @@ -206,6 +215,7 @@ def evaluate(model, criterion, data_loader, current_step): avg_linear_loss = 0 avg_mel_loss = 0 + avg_stop_loss = 0 for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -215,38 +225,44 @@ def evaluate(model, criterion, data_loader, current_step): text_lengths = data[1] linear_input = data[2] mel_input = data[3] + stop_targets = data[4] # convert inputs to variables text_input_var = Variable(text_input) mel_spec_var = Variable(mel_input) linear_spec_var = Variable(linear_input, volatile=True) + stop_targets_var = Variable(stop_targets) # dispatch 
data to GPU if use_cuda: text_input_var = text_input_var.cuda() mel_spec_var = mel_spec_var.cuda() linear_spec_var = linear_spec_var.cuda() + stop_targets_var = stop_targets_var.cuda() # forward pass - mel_output, linear_output, alignments = model.forward(text_input_var) + mel_output, linear_output, alignments, stop_output = model.forward(text_input_var, mel_spec_var) # loss computation mel_loss = criterion(mel_output, mel_spec_var) linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + 0.5 * criterion(linear_output[:, :, :n_priority_freq], linear_spec_var[: ,: ,:n_priority_freq]) - loss = mel_loss + linear_loss + stop_loss = criterion_stop(stop_output, stop_targets_var) + loss = mel_loss + linear_loss + 0.25*stop_loss step_time = time.time() - start_time epoch_time += step_time # update progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), + ('stop_loss', stop_loss.data[0]), ('linear_loss', linear_loss.data[0]), ('mel_loss', mel_loss.data[0])]) avg_linear_loss += linear_loss.data[0] avg_mel_loss += mel_loss.data[0] + avg_stop_loss += stop_loss.data[0] # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) @@ -278,12 +294,14 @@ def evaluate(model, criterion, data_loader, current_step): # compute average losses avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) - avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss /= (num_iter + 1) + avg_total_loss = avg_mel_loss + avg_linear_loss + 0.25*avg_stop_loss # Plot Learning Stats tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step) tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step) tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step) + tb.add_scalar('ValEpochLoss/StopLoss', avg_stop_loss, current_step) return avg_linear_loss @@ -336,13 +354,15 @@ def main(args): c.num_mels, c.num_freq, c.r) - + optimizer = optim.Adam(model.parameters(), lr=c.lr) if use_cuda: criterion = nn.L1Loss().cuda() + criterion_stop = nn.BCELoss().cuda() else: criterion = nn.L1Loss() + criterion_stop = nn.BCELoss() if args.restore_path: checkpoint = torch.load(args.restore_path) @@ -370,8 +390,8 @@ def main(args): best_loss = float('inf') for epoch in range(0, c.epochs): - train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch) - val_loss = evaluate(model, criterion, val_loader, current_step) + train_loss, current_step = train(model, criterion, criterion_stop, train_loader, optimizer, epoch) + val_loss = evaluate(model, criterion, criterion_stop, val_loader, current_step) best_loss = save_best_model(model, optimizer, val_loss, best_loss, OUT_PATH, current_step, epoch) diff --git a/utils/data.py b/utils/data.py index a38092e9..022fab1e 100644 --- a/utils/data.py +++ b/utils/data.py @@ -14,6 +14,29 @@ def prepare_data(inputs): return np.stack([pad_data(x, max_len) for x in inputs]) +def pad_tensor(x, length): + _pad = 0 + assert x.ndim == 2 + return np.pad(x, [[0, 0], [0, length- x.shape[1]]], mode='constant', constant_values=_pad) + + +def prepare_tensor(inputs): + max_len = max((x.shape[1] for x in inputs)) + return np.stack([pad_tensor(x, max_len) for x in inputs]) + + +def pad_stop_target(x, length): + _pad = 1. 
+    assert x.ndim == 1
+    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
+
+
+def prepare_stop_target(inputs, out_steps):
+    max_len = max((x.shape[0] for x in inputs))
+    remainder = max_len % out_steps
+    return np.stack([pad_stop_target(x, max_len + out_steps - remainder) for x in inputs])
+
+
 def pad_per_step(inputs, pad_len):
     timesteps = inputs.shape[-1]
     return np.pad(inputs, [[0, 0], [0, 0],

From 4e4f876bc4f1ba3087c950e2406cd7cc078aa2f5 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 12:34:31 -0700
Subject: [PATCH 04/22] Stop token prediction - does not train yet

---
 layers/custom_layers.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 layers/custom_layers.py

diff --git a/layers/custom_layers.py b/layers/custom_layers.py
new file mode 100644
index 00000000..802091e8
--- /dev/null
+++ b/layers/custom_layers.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+import torch
+from torch.autograd import Variable
+from torch import nn
+
+
+class StopProjection(nn.Module):
+    r""" Simple projection layer to predict the "stop token"
+
+    Args:
+        in_features (int): size of the input vector
+        out_features (int or list): size of each output vector. aka number
+            of predicted frames.
+    """
+
+    def __init__(self, in_features, out_features):
+        super(StopProjection, self).__init__()
+        self.linear = nn.Linear(in_features, out_features)
+        self.dropout = nn.Dropout(0.5)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, inputs):
+        out = self.dropout(inputs)
+        out = self.linear(out)
+        out = self.sigmoid(out)
+        return out
\ No newline at end of file

From a925c9c75cdc699336b1c1b012db9b2e08bc23da Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 12:47:54 -0700
Subject: [PATCH 05/22] remove stop token prediction

---
 layers/custom_layers.py | 34 +++++++++++++++++-----------------
 layers/tacotron.py      | 12 +-----------
 train.py                | 40 ++++++++++------------------------------
 3 files changed, 28 insertions(+), 58 deletions(-)

diff --git a/layers/custom_layers.py b/layers/custom_layers.py
index 802091e8..d659efb2 100644
--- a/layers/custom_layers.py
+++ b/layers/custom_layers.py
@@ -4,23 +4,23 @@ from torch.autograd import Variable
 from torch import nn


-class StopProjection(nn.Module):
-    r""" Simple projection layer to predict the "stop token"
+# class StopProjection(nn.Module):
+#     r""" Simple projection layer to predict the "stop token"

-    Args:
-        in_features (int): size of the input vector
-        out_features (int or list): size of each output vector. aka number
-            of predicted frames.
+# """ - def __init__(self, in_features, out_features): - super(StopProjection, self).__init__() - self.linear = nn.Linear(in_features, out_features) - self.dropout = nn.Dropout(0.5) - self.sigmoid = nn.Sigmoid() +# def __init__(self, in_features, out_features): +# super(StopProjection, self).__init__() +# self.linear = nn.Linear(in_features, out_features) +# self.dropout = nn.Dropout(0.5) +# self.sigmoid = nn.Sigmoid() - def forward(self, inputs): - out = self.dropout(inputs) - out = self.linear(out) - out = self.sigmoid(out) - return out \ No newline at end of file +# def forward(self, inputs): +# out = self.dropout(inputs) +# out = self.linear(out) +# out = self.sigmoid(out) +# return out \ No newline at end of file diff --git a/layers/tacotron.py b/layers/tacotron.py index e9a40b24..51548287 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -5,7 +5,6 @@ from torch import nn from .attention import AttentionRNN from .attention import get_mask_from_lengths -from .custom_layers import StopProjection class Prenet(nn.Module): r""" Prenet as explained at https://arxiv.org/abs/1703.10135. @@ -233,8 +232,6 @@ class Decoder(nn.Module): [nn.GRUCell(256, 256) for _ in range(2)]) # RNN_state -> |Linear| -> mel_spec self.proj_to_mel = nn.Linear(256, memory_dim * r) - # RNN_state | attention_context -> |Linear| -> stop_token - self.stop_token = StopProjection(256 + in_features, r) def forward(self, inputs, memory=None): """ @@ -286,7 +283,6 @@ class Decoder(nn.Module): outputs = [] alignments = [] - stop_outputs = [] t = 0 memory_input = initial_memory @@ -323,18 +319,13 @@ class Decoder(nn.Module): decoder_input = decoder_rnn_hiddens[idx] + decoder_input output = decoder_input - stop_token_input = decoder_input - # stop token prediction - stop_token_input = torch.cat((output, current_context_vec), -1) - stop_output = self.stop_token(stop_token_input) # predict mel vectors from decoder vectors output = self.proj_to_mel(output) outputs += [output] alignments += [alignment] - stop_outputs += [stop_output] t += 1 @@ -354,9 +345,8 @@ class Decoder(nn.Module): # Back to batch first alignments = torch.stack(alignments).transpose(0, 1) outputs = torch.stack(outputs).transpose(0, 1).contiguous() - stop_outputs = torch.stack(stop_outputs).transpose(0, 1).contiguous() - return outputs, alignments, stop_outputs + return outputs, alignments def is_end_of_frames(output, eps=0.2): #0.2 diff --git a/train.py b/train.py index 3531771f..3e1a75c7 100644 --- a/train.py +++ b/train.py @@ -63,12 +63,11 @@ def signal_handler(signal, frame): sys.exit(1) -def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): +def train(model, criterion, data_loader, optimizer, epoch): model = model.train() epoch_time = 0 avg_linear_loss = 0 avg_mel_loss = 0 - avg_stop_loss = 0 print(" | > Epoch {}/{}".format(epoch, c.epochs)) progbar = Progbar(len(data_loader.dataset) / c.batch_size) @@ -81,7 +80,6 @@ def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): text_lengths = data[1] linear_input = data[2] mel_input = data[3] - stop_targets = data[4] current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1 @@ -95,7 +93,6 @@ def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): # convert inputs to variables text_input_var = Variable(text_input) mel_spec_var = Variable(mel_input) - stop_targets_var = Variable(stop_targets) linear_spec_var = Variable(linear_input, volatile=True) # sort sequence by length for curriculum learning @@ -112,10 +109,9 @@ def 
train(model, criterion, critetion_stop, data_loader, optimizer, epoch): text_input_var = text_input_var.cuda() mel_spec_var = mel_spec_var.cuda() linear_spec_var = linear_spec_var.cuda() - stop_targets_var = stop_targets_var.cuda() # forward pass - mel_output, linear_output, alignments, stop_output =\ + mel_output, linear_output, alignments =\ model.forward(text_input_var, mel_spec_var) # loss computation @@ -123,8 +119,7 @@ def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + 0.5 * criterion(linear_output[:, :, :n_priority_freq], linear_spec_var[: ,: ,:n_priority_freq]) - stop_loss = critetion_stop(stop_output, stop_targets_var) - loss = mel_loss + linear_loss + 0.25*stop_loss + loss = mel_loss + linear_loss # backpass and check the grad norm loss.backward() @@ -141,7 +136,6 @@ def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): # update progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), ('linear_loss', linear_loss.data[0]), - ('stop_loss', stop_loss.data[0]), ('mel_loss', mel_loss.data[0]), ('grad_norm', grad_norm)]) @@ -150,7 +144,6 @@ def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.data[0], current_step) tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.data[0], current_step) - tb.add_scalar('TrainIterLoss/StopLoss', stop_loss.data[0], current_step) tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'], current_step) tb.add_scalar('Params/GradNorm', grad_norm, current_step) @@ -191,21 +184,19 @@ def train(model, criterion, critetion_stop, data_loader, optimizer, epoch): avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) - avg_stop_loss /= (num_iter + 1) - avg_total_loss = avg_mel_loss + avg_linear_loss + 0.25*avg_stop_loss + avg_total_loss = avg_mel_loss + avg_linear_loss # Plot Training Epoch Stats tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step) tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step) tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step) - tb.add_scalar('TrainEpochLoss/StopLoss', stop_loss.data[0], current_step) tb.add_scalar('Time/EpochTime', epoch_time, epoch) epoch_time = 0 return avg_linear_loss, current_step -def evaluate(model, criterion, criterion_stop, data_loader, current_step): +def evaluate(model, criterion, data_loader, current_step): model = model.eval() epoch_time = 0 @@ -215,7 +206,6 @@ def evaluate(model, criterion, criterion_stop, data_loader, current_step): avg_linear_loss = 0 avg_mel_loss = 0 - avg_stop_loss = 0 for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -225,44 +215,38 @@ def evaluate(model, criterion, criterion_stop, data_loader, current_step): text_lengths = data[1] linear_input = data[2] mel_input = data[3] - stop_targets = data[4] # convert inputs to variables text_input_var = Variable(text_input) mel_spec_var = Variable(mel_input) linear_spec_var = Variable(linear_input, volatile=True) - stop_targets_var = Variable(stop_targets) # dispatch data to GPU if use_cuda: text_input_var = text_input_var.cuda() mel_spec_var = mel_spec_var.cuda() linear_spec_var = linear_spec_var.cuda() - stop_targets_var = stop_targets_var.cuda() # forward pass - mel_output, linear_output, alignments, stop_output = model.forward(text_input_var, mel_spec_var) + mel_output, linear_output, alignments = model.forward(text_input_var, mel_spec_var) # 
loss computation
         mel_loss = criterion(mel_output, mel_spec_var)
         linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
             + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                               linear_spec_var[: ,: ,:n_priority_freq])
-        stop_loss = criterion_stop(stop_output, stop_targets_var)
-        loss = mel_loss + linear_loss + 0.25*stop_loss
+        loss = mel_loss + linear_loss

         step_time = time.time() - start_time
         epoch_time += step_time

         # update
         progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
-                                           ('stop_loss', stop_loss.data[0]),
                                            ('linear_loss', linear_loss.data[0]),
                                            ('mel_loss', mel_loss.data[0])])

         avg_linear_loss += linear_loss.data[0]
         avg_mel_loss += mel_loss.data[0]
-        avg_stop_loss += stop_loss.data[0]

         # Diagnostic visualizations
         idx = np.random.randint(mel_input.shape[0])
@@ -294,14 +278,12 @@ def evaluate(model, criterion, data_loader, current_step):
     # compute average losses
     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
-    avg_stop_loss /= (num_iter + 1)
-    avg_total_loss = avg_mel_loss + avg_linear_loss + 0.25*avg_stop_loss
+    avg_total_loss = avg_mel_loss + avg_linear_loss

     # Plot Learning Stats
     tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
     tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
     tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
-    tb.add_scalar('ValEpochLoss/StopLoss', avg_stop_loss, current_step)

     return avg_linear_loss
@@ -359,10 +341,8 @@ def main(args):

     if use_cuda:
         criterion = nn.L1Loss().cuda()
-        criterion_stop = nn.BCELoss().cuda()
     else:
         criterion = nn.L1Loss()
-        criterion_stop = nn.BCELoss()

     if args.restore_path:
         checkpoint = torch.load(args.restore_path)
@@ -390,8 +370,8 @@ def main(args):
     best_loss = float('inf')

     for epoch in range(0, c.epochs):
-        train_loss, current_step = train(model, criterion, criterion_stop, train_loader, optimizer, epoch)
-        val_loss = evaluate(model, criterion, criterion_stop, val_loader, current_step)
+        train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch)
+        val_loss = evaluate(model, criterion, val_loader, current_step)
         best_loss = save_best_model(model, optimizer, val_loss, best_loss,
                                     OUT_PATH,
                                     current_step, epoch)

From 0f3b2ddd7b2ef4e5fdae4c535fa6288de704beff Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 12:50:26 -0700
Subject: [PATCH 06/22] remove stop token prediction

---
 models/tacotron.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/models/tacotron.py b/models/tacotron.py
index a8b04fbd..a485d68e 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -35,7 +35,7 @@ class Tacotron(nn.Module):
         encoder_outputs = self.encoder(inputs)

         # (B, T', mel_dim*r)
-        mel_outputs, alignments, stop_outputs = self.decoder(
+        mel_outputs, alignments = self.decoder(
             encoder_outputs, mel_specs)

         # Post net processing below
@@ -43,9 +43,8 @@ class Tacotron(nn.Module):
         # Reshape
         # (B, T, mel_dim)
         mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
-        stop_outputs = stop_outputs.view(B, -1)

         linear_outputs = self.postnet(mel_outputs)
         linear_outputs = self.last_linear(linear_outputs)

-        return mel_outputs, linear_outputs, alignments, stop_outputs
+        return mel_outputs, linear_outputs, alignments

From 33937f54d07abf6341ffbdbf42b45e77322d779e Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 13:46:52 -0700
Subject: [PATCH 07/22] masked loss

---
 datasets/LJSpeech.py  | 20 ++++++++++++--------
 tests/layers_tests.py |  6 +-----
 tests/loader_tests.py | 16 +++++++++++-----
 train.py              |
21 +++++++++++++++------ 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index fb6c9304..7b50e646 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -98,9 +98,6 @@ class LJSpeechDataset(Dataset): mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] mel_lengths = [m.shape[1] for m in mel] - # compute 'stop token' targets - stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths] - # PAD sequences with largest length of the batch text = prepare_data(text).astype(np.int32) wav = prepare_data(wav) @@ -111,9 +108,6 @@ class LJSpeechDataset(Dataset): assert mel.shape[2] == linear.shape[2] timesteps = mel.shape[2] - # PAD stop targets - stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) - # PAD with zeros that can be divided by outputs per step if (timesteps + 1) % self.outputs_per_step != 0: pad_len = self.outputs_per_step - \ @@ -123,8 +117,17 @@ class LJSpeechDataset(Dataset): pad_len = 1 linear = pad_per_step(linear, pad_len) mel = pad_per_step(mel, pad_len) + + # update mel lengths + mel_lengths = [l+pad_len for l in mel_lengths] + + # compute 'stop token' targets + stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths] + + # PAD stop targets + stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) - # reshape mojo + # B x T x D linear = linear.transpose(0, 2, 1) mel = mel.transpose(0, 2, 1) @@ -133,8 +136,9 @@ class LJSpeechDataset(Dataset): text = torch.LongTensor(text) linear = torch.FloatTensor(linear) mel = torch.FloatTensor(mel) + mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) - return text, text_lenghts, linear, mel, stop_targets, item_idxs[0] + return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0] raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ found {}" diff --git a/tests/layers_tests.py b/tests/layers_tests.py index e8ebba0d..14739bf9 100644 --- a/tests/layers_tests.py +++ b/tests/layers_tests.py @@ -37,16 +37,12 @@ class DecoderTests(unittest.TestCase): dummy_memory = T.autograd.Variable(T.rand(4, 120, 32)) print(layer) - output, alignment, stop_output = layer(dummy_input, dummy_memory) + output, alignment = layer(dummy_input, dummy_memory) print(output.shape) - print(" > Stop ", stop_output.shape) assert output.shape[0] == 4 assert output.shape[1] == 120 / 5 assert output.shape[2] == 32 * 5 - assert stop_output.shape[0] == 4 - assert stop_output.shape[1] == 120 / 5 - assert stop_output.shape[2] == 5 class EncoderTests(unittest.TestCase): diff --git a/tests/loader_tests.py b/tests/loader_tests.py index dc023b60..3b3d017c 100644 --- a/tests/loader_tests.py +++ b/tests/loader_tests.py @@ -43,9 +43,10 @@ class TestDataset(unittest.TestCase): text_lengths = data[1] linear_input = data[2] mel_input = data[3] - stop_targets = data[4] - item_idx = data[5] - + mel_lengths = data[4] + stop_target = data[5] + item_idx = data[6] + neg_values = text_input[text_input < 0] check_count = len(neg_values) assert check_count == 0, \ @@ -82,8 +83,9 @@ class TestDataset(unittest.TestCase): text_lengths = data[1] linear_input = data[2] mel_input = data[3] - stop_target = data[4] - item_idx = data[5] + mel_lengths = data[4] + stop_target = data[5] + item_idx = data[6] # check the last time step to be zero padded assert mel_input[0, -1].sum() == 0 @@ -92,6 +94,10 @@ class TestDataset(unittest.TestCase): assert linear_input[0, -2].sum() != 0 assert stop_target[0, -1] == 
1 assert stop_target.sum() == 1 + assert len(mel_lengths.shape) == 1 + print(mel_lengths) + print(mel_input) + assert mel_lengths[0] == mel_input[0].shape[0] diff --git a/train.py b/train.py index 3e1a75c7..b39b17d9 100644 --- a/train.py +++ b/train.py @@ -26,6 +26,7 @@ from utils.model import get_param_size from utils.visual import plot_alignment, plot_spectrogram from datasets.LJSpeech import LJSpeechDataset from models.tacotron import Tacotron +from losses import use_cuda = torch.cuda.is_available() @@ -80,6 +81,7 @@ def train(model, criterion, data_loader, optimizer, epoch): text_lengths = data[1] linear_input = data[2] mel_input = data[3] + mel_lengths = data[4] current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1 @@ -93,6 +95,7 @@ def train(model, criterion, data_loader, optimizer, epoch): # convert inputs to variables text_input_var = Variable(text_input) mel_spec_var = Variable(mel_input) + mel_length_var = Variable(mel_lengths) linear_spec_var = Variable(linear_input, volatile=True) # sort sequence by length for curriculum learning @@ -108,6 +111,7 @@ def train(model, criterion, data_loader, optimizer, epoch): if use_cuda: text_input_var = text_input_var.cuda() mel_spec_var = mel_spec_var.cuda() + mel_lengths_var = mel_lengths_var.cuda() linear_spec_var = linear_spec_var.cuda() # forward pass @@ -115,10 +119,11 @@ def train(model, criterion, data_loader, optimizer, epoch): model.forward(text_input_var, mel_spec_var) # loss computation - mel_loss = criterion(mel_output, mel_spec_var) - linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + mel_loss = criterion(mel_output, mel_spec_var, mel_lengths) + linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths) \ + 0.5 * criterion(linear_output[:, :, :n_priority_freq], - linear_spec_var[: ,: ,:n_priority_freq]) + linear_spec_var[: ,: ,:n_priority_freq], + mel_lengths) loss = mel_loss + linear_loss # backpass and check the grad norm @@ -215,26 +220,30 @@ def evaluate(model, criterion, data_loader, current_step): text_lengths = data[1] linear_input = data[2] mel_input = data[3] + mel_lengths = data[4] # convert inputs to variables text_input_var = Variable(text_input) mel_spec_var = Variable(mel_input) + mel_lengths_var = Variable(mel_lengths) linear_spec_var = Variable(linear_input, volatile=True) # dispatch data to GPU if use_cuda: text_input_var = text_input_var.cuda() mel_spec_var = mel_spec_var.cuda() + mel_lengths_var = mel_lengths_var.cuda() linear_spec_var = linear_spec_var.cuda() # forward pass mel_output, linear_output, alignments = model.forward(text_input_var, mel_spec_var) # loss computation - mel_loss = criterion(mel_output, mel_spec_var) - linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + mel_loss = criterion(mel_output, mel_spec_var, mel_lengths) + linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths) \ + 0.5 * criterion(linear_output[:, :, :n_priority_freq], - linear_spec_var[: ,: ,:n_priority_freq]) + linear_spec_var[: ,: ,:n_priority_freq], + mel_lengths) loss = mel_loss + linear_loss step_time = time.time() - start_time From b1beb1f876f94c9f529300feb97016250c5b728a Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 22 Mar 2018 14:06:33 -0700 Subject: [PATCH 08/22] data loader fix --- datasets/LJSpeech.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index 7b50e646..5b1fe13e 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -97,6 +97,12 @@ 
class LJSpeechDataset(Dataset):
             linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
             mel_lengths = [m.shape[1] for m in mel]
+
+            # compute 'stop token' targets
+            stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
+
+            # PAD stop targets
+            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)

             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
@@ -106,7 +112,7 @@ class LJSpeechDataset(Dataset):
             linear = prepare_tensor(linear)
             mel = prepare_tensor(mel)
             assert mel.shape[2] == linear.shape[2]
-            timesteps = mel.shape[2]
+            timesteps = mel.shape[2] 
@@ -120,12 +126,6 @@ class LJSpeechDataset(Dataset):

             # update mel lengths
             mel_lengths = [l+pad_len for l in mel_lengths]
-
-            # compute 'stop token' targets
-            stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
-
-            # PAD stop targets
-            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)

             # B x T x D
             linear = linear.transpose(0, 2, 1)

From e4a0eec77eb61a34a9063d92fb92467ba8932c1e Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 14:06:54 -0700
Subject: [PATCH 09/22] masked loss

---
 layers/losses.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 layers/losses.py

diff --git a/layers/losses.py b/layers/losses.py
new file mode 100644
index 00000000..0fdc654e
--- /dev/null
+++ b/layers/losses.py
@@ -0,0 +1,48 @@
+import torch
+from torch import functional
+
+
+# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
+def _sequence_mask(sequence_length, max_len=None):
+    if max_len is None:
+        max_len = sequence_length.data.max()
+    batch_size = sequence_length.size(0)
+    seq_range = torch.range(0, max_len - 1).long()
+    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
+    seq_range_expand = Variable(seq_range_expand)
+    if sequence_length.is_cuda:
+        seq_range_expand = seq_range_expand.cuda()
+    seq_length_expand = (sequence_length.unsqueeze(1)
+                         .expand_as(seq_range_expand))
+    return seq_range_expand < seq_length_expand
+
+
+def L1LossMasked(input, target, length):
+    """
+    Args:
+        logits: A Variable containing a FloatTensor of size
+            (batch, max_len, num_classes) which contains the
+            unnormalized probability for each class.
+        target: A Variable containing a LongTensor of size
+            (batch, max_len) which contains the index of the true
+            class for each corresponding step.
+        length: A Variable containing a LongTensor of size (batch,)
+            which contains the length of each data in a batch.
+    Returns:
+        loss: An average loss value masked by the length.
+    """
+
+    # logits_flat: (batch * max_len, num_classes)
+    input = input.view(-1, input.size(-1))
+    # target_flat: (batch * max_len, 1)
+    target_flat = target.view(-1, 1)
+    # losses_flat: (batch * max_len, 1)
+    losees_flat = functional.l1_loss(input, target, size_average=False,
+                                     reduce=False)
+    # losses: (batch, max_len)
+    losses = losses_flat.view(*target.size())
+    # mask: (batch, max_len)
+    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
+    losses = losses * mask.float()
+    loss = losses.sum() / length.float().sum()
+    return loss
\ No newline at end of file
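Note: a toy check of the masking semantics this file is after -- padded frames must not
contribute to the loss. The sketch uses current tensor syntax and the normalization the
series converges to in PATCH 13 (sum of real lengths times feature dim); the function
as committed here still has bugs that the next patches fix (the deprecated torch.range,
the losees_flat/losses_flat name mismatch, a missing mask dimension).

    import torch

    target = torch.zeros(2, 4, 3)        # (batch, max_len, dim)
    pred = torch.ones(2, 4, 3)           # constant absolute error of 1 per element
    length = torch.LongTensor([2, 4])    # sample 0 has only 2 real frames

    mask = (torch.arange(4).unsqueeze(0) < length.unsqueeze(1)).float()   # (2, 4)
    losses = (pred - target).abs() * mask.unsqueeze(2)
    loss = losses.sum() / (length.float().sum() * target.shape[2])
    print(loss)   # tensor(1.) -> averaged over the 6 real frames only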
From 32d9c734b256540e9d21654304350177d8f47046 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 14:35:02 -0700
Subject: [PATCH 10/22] masked loss

---
 layers/losses.py | 19 +++++++++++--------
 train.py         | 20 +++++++++----------
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/layers/losses.py b/layers/losses.py
index 0fdc654e..29ad7378 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -1,5 +1,6 @@
 import torch
-from torch import functional
+from torch.nn import functional
+from torch.autograd import Variable


 # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
@@ -7,7 +8,7 @@ def _sequence_mask(sequence_length, max_len=None):
     if max_len is None:
         max_len = sequence_length.data.max()
     batch_size = sequence_length.size(0)
-    seq_range = torch.range(0, max_len - 1).long()
+    seq_range = torch.arange(0, max_len).long()
     seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
     seq_range_expand = Variable(seq_range_expand)
     if sequence_length.is_cuda:
@@ -31,18 +32,20 @@ def L1LossMasked(input, target, length):
     Returns:
         loss: An average loss value masked by the length.
     """
+    input = input.contiguous()
+    target = target.contiguous()

-    # logits_flat: (batch * max_len, num_classes)
+    # logits_flat: (batch * max_len, dim)
     input = input.view(-1, input.size(-1))
-    # target_flat: (batch * max_len, 1)
+    # target_flat: (batch * max_len, dim)
     target_flat = target.view(-1, 1)
-    # losses_flat: (batch * max_len, 1)
-    losees_flat = functional.l1_loss(input, target, size_average=False,
+    # losses_flat: (batch * max_len, dim)
+    losses_flat = functional.l1_loss(input, target, size_average=False,
                                      reduce=False)
     # losses: (batch, max_len)
     losses = losses_flat.view(*target.size())
     # mask: (batch, max_len)
-    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
+    mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2)
     losses = losses * mask.float()
-    loss = losses.sum() / length.float().sum()
-    return loss
\ No newline at end of file
+    loss = losses.sum() / length.float().sum()
+    return loss / input.shape[0]
\ No newline at end of file
diff --git a/train.py b/train.py
index b39b17d9..c4d34e2d 100644
--- a/train.py
+++ b/train.py
@@ -26,7 +26,7 @@ from utils.model import get_param_size
 from utils.visual import plot_alignment, plot_spectrogram
 from datasets.LJSpeech import LJSpeechDataset
 from models.tacotron import Tacotron
-from losses import
+from layers.losses import L1LossMasked

 use_cuda = torch.cuda.is_available()
@@ -95,7 +95,7 @@ def train(model, criterion, data_loader, optimizer, epoch):
         # convert inputs to variables
         text_input_var = Variable(text_input)
         mel_spec_var = Variable(mel_input)
-        mel_length_var = Variable(mel_lengths)
+        mel_lengths_var = Variable(mel_lengths)
         linear_spec_var = Variable(linear_input, volatile=True)

         # sort sequence by length for curriculum learning
@@ -105,6 +105,7 @@ def train(model, criterion, data_loader, optimizer, epoch):
         sorted_lengths = sorted_lengths.long().numpy()
         text_input_var = text_input_var[indices]
         mel_spec_var = mel_spec_var[indices]
+        mel_lengths_var = mel_lengths_var[indices]
         linear_spec_var = linear_spec_var[indices]

         # dispatch data to GPU
@@ -119,11 +120,11 @@ def train(model, criterion, data_loader, optimizer, epoch):
             model.forward(text_input_var, mel_spec_var)

         # loss computation
-        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths)
-        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths) \
+        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
+        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
             + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                               linear_spec_var[: ,: ,:n_priority_freq],
-                              mel_lengths)
+                              mel_lengths_var)
         loss = mel_loss + linear_loss
@@ -240,10 +241,10 @@ def evaluate(model, criterion, data_loader, current_step):

         # loss computation
         mel_loss = criterion(mel_output, mel_spec_var, mel_lengths)
-        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths) \
+        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
             + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                               linear_spec_var[: ,: ,:n_priority_freq],
-                              mel_lengths)
+                              mel_lengths_var)
         loss = mel_loss + linear_loss

From 2617518d91bf98081fc92e238849a9d560f7ba49 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 22 Mar 2018 21:13:33 -0700
Subject: [PATCH 11/22] masked loss

---
 layers/losses.py | 4 ++--
 train.py         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/layers/losses.py b/layers/losses.py
index 29ad7378..18f4099a 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -47,5 +47,5 @@ def L1LossMasked(input, target, length):
     # mask: (batch, max_len)
     mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2)
     losses = losses * mask.float()
-    loss = losses.sum() / length.float().sum()
-    return loss / input.shape[0]
\ No newline at end of file
+    loss = losses.sum() / (length.float().sum() * target.shape[2])
+    return loss
\ No newline at end of file
diff --git a/train.py b/train.py
index c4d34e2d..4e132662 100644
--- a/train.py
+++ b/train.py
@@ -240,7 +240,7 @@ def evaluate(model, criterion, data_loader, current_step):
         mel_output, linear_output, alignments = model.forward(text_input_var, mel_spec_var)

         # loss computation
-        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths)
+        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
         linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
             + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                               linear_spec_var[: ,: ,:n_priority_freq],

From df4a6443269edf8e94ffb1fe391bed3c7bc00993 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Fri, 23 Mar 2018 05:18:51 -0700
Subject: [PATCH 12/22] bug fix

---
 layers/tacotron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/layers/tacotron.py b/layers/tacotron.py
index 51548287..983855d4 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -253,7 +253,7 @@ class Decoder(nn.Module):
         B = inputs.size(0)

         # Run greedy decoding if memory is None
-        greedy = ~self.training
+        greedy = not self.training

         if memory is not None:

             # Grouping multiple frames if necessary
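Note on the one-line "bug fix" above: self.training is a plain Python bool, and bitwise
negation treats bools as ints, so ~self.training is always a truthy non-zero int and
greedy decoding was effectively always on. Logical negation behaves as intended:

    >>> ~True, ~False                # bitwise NOT on bools
    (-2, -1)
    >>> bool(~True), bool(~False)
    (True, True)                     # both truthy
    >>> not True, not False
    (False, True)                    # what the fix uses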
From 1dbc51c6b509d8787b2beda7a9d3e65e45b997b8 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Sat, 24 Mar 2018 19:22:45 -0700
Subject: [PATCH 13/22] convert loss to layer and add test

---
 layers/losses.py      | 66 +++++++++++++++++++++++--------------------
 tests/layers_tests.py | 26 +++++++++++++++++
 train.py              |  5 +++-
 3 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/layers/losses.py b/layers/losses.py
index 18f4099a..67bc0f22 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -1,6 +1,7 @@
 import torch
 from torch.nn import functional
 from torch.autograd import Variable
+from torch import nn


 # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
@@ -18,34 +19,39 @@ def _sequence_mask(sequence_length, max_len=None):
     return seq_range_expand < seq_length_expand


-def L1LossMasked(input, target, length):
-    """
-    Args:
-        logits: A Variable containing a FloatTensor of size
-            (batch, max_len, num_classes) which contains the
-            unnormalized probability for each class.
-        target: A Variable containing a LongTensor of size
-            (batch, max_len) which contains the index of the true
-            class for each corresponding step.
-        length: A Variable containing a LongTensor of size (batch,)
-            which contains the length of each data in a batch.
-    Returns:
-        loss: An average loss value masked by the length.
-    """
-    input = input.contiguous()
-    target = target.contiguous()
+class L1LossMasked(nn.Module):
+
+    def __init__(self):
+        super(L1LossMasked, self).__init__()
+
+    def forward(self, input, target, length):
+        """
+        Args:
+            logits: A Variable containing a FloatTensor of size
+                (batch, max_len, num_classes) which contains the
+                unnormalized probability for each class.
+            target: A Variable containing a LongTensor of size
+                (batch, max_len) which contains the index of the true
+                class for each corresponding step.
+            length: A Variable containing a LongTensor of size (batch,)
+                which contains the length of each data in a batch.
+        Returns:
+            loss: An average loss value masked by the length.
+ """ + input = input.contiguous() + target = target.contiguous() - # logits_flat: (batch * max_len, dim) - input = input.view(-1, input.size(-1)) - # target_flat: (batch * max_len, dim) - target_flat = target.view(-1, 1) - # losses_flat: (batch * max_len, dim) - losses_flat = functional.l1_loss(input, target, size_average=False, - reduce=False) - # losses: (batch, max_len) - losses = losses_flat.view(*target.size()) - # mask: (batch, max_len) - mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2) - losses = losses * mask.float() - loss = losses.sum() / (length.float().sum() * target.shape[2]) - return loss \ No newline at end of file + # logits_flat: (batch * max_len, dim) + input = input.view(-1, input.size(-1)) + # target_flat: (batch * max_len, dim) + target_flat = target.view(-1, 1) + # losses_flat: (batch * max_len, dim) + losses_flat = functional.l1_loss(input, target, size_average=False, + reduce=False) + # losses: (batch, max_len, dim) + losses = losses_flat.view(*target.size()) + # mask: (batch, max_len, 1) + mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2) + losses = losses * mask.float() + loss = losses.sum() / (length.float().sum() * float(target.shape[2])) + return loss \ No newline at end of file diff --git a/tests/layers_tests.py b/tests/layers_tests.py index 14739bf9..246fce8c 100644 --- a/tests/layers_tests.py +++ b/tests/layers_tests.py @@ -2,6 +2,7 @@ import unittest import torch as T from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder +from layers.losses import L1LossMasked, _sequence_mask class PrenetTests(unittest.TestCase): @@ -57,4 +58,29 @@ class EncoderTests(unittest.TestCase): assert output.shape[0] == 4 assert output.shape[1] == 8 assert output.shape[2] == 256 # 128 * 2 BiRNN + +class L1LossMaskedTests(unittest.TestCase): + + def test_in_out(self): + layer = L1LossMasked() + dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float()) + dummy_target = T.autograd.Variable(T.ones(4, 8, 128).float()) + dummy_length = T.autograd.Variable((T.ones(4) * 8).long()) + output = layer(dummy_input, dummy_target, dummy_length) + assert output.shape[0] == 1 + assert len(output.shape) == 1 + assert output.data[0] == 0.0 + + dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float()) + dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float()) + dummy_length = T.autograd.Variable((T.ones(4) * 8).long()) + output = layer(dummy_input, dummy_target, dummy_length) + assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0]) + + dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float()) + dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float()) + dummy_length = T.autograd.Variable((T.arange(5,9)).long()) + mask = ((_sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0]) diff --git a/train.py b/train.py index 4e132662..97876036 100644 --- a/train.py +++ b/train.py @@ -349,7 +349,10 @@ def main(args): optimizer = optim.Adam(model.parameters(), lr=c.lr) - criterion = L1LossMasked + if use_cuda: + criterion = L1LossMasked().cuda() + else: + criterion = L1LossMasked() if args.restore_path: checkpoint = torch.load(args.restore_path) From ed35ae7e20df39dbf69758f9cd0c830c677d58f0 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 25 Mar 2018 10:50:19 -0700 Subject: [PATCH 14/22] config change --- config.json | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/config.json b/config.json index 285e1d8d..0bd21b4e 100644 --- a/config.json +++ b/config.json @@ -23,9 +23,9 @@ "num_loader_workers": 8, - "checkpoint": false, - "save_step": 69, + "checkpoint": true, + "save_step": 378, "data_path": "/run/shm/erogol/LJSpeech-1.0", "min_seq_len": 0, - "output_path": "result" + "output_path": "/data/shared/erogol_models/" } From 632c08a6388373941d0b61cf54c76ca59805623b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 25 Mar 2018 12:01:41 -0700 Subject: [PATCH 15/22] normal attention --- layers/attention.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/layers/attention.py b/layers/attention.py index e7385149..1f83c169 100644 --- a/layers/attention.py +++ b/layers/attention.py @@ -57,11 +57,19 @@ class AttentionRNN(nn.Module): if annotations_lengths is not None and mask is None: mask = get_mask_from_lengths(annotations, annotations_lengths) + + # Concat input query and previous context + rnn_input = torch.cat((memory, context), -1) + #rnn_input = rnn_input.unsqueeze(1) + + # Feed it to RNN + # s_i = f(y_{i-1}, c_{i}, s_{i-1}) + rnn_output = self.rnn_cell(rnn_input, rnn_state) # Alignment # (batch, max_time) # e_{ij} = a(s_{i-1}, h_j) - alignment = self.alignment_model(annotations, rnn_state) + alignment = self.alignment_model(annotations, rnn_output) # TODO: needs recheck. if mask is not None: @@ -75,16 +83,6 @@ class AttentionRNN(nn.Module): # (batch, 1, dim) # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j context = torch.bmm(alignment.unsqueeze(1), annotations) - context = context.squeeze(1) - - # Concat input query and previous context context - rnn_input = torch.cat((memory, context), -1) - #rnn_input = rnn_input.unsqueeze(1) - - # Feed it to RNN - # s_i = f(y_{i-1}, c_{i}, s_{i-1}) - rnn_output = self.rnn_cell(rnn_input, rnn_state) - context = context.squeeze(1) return rnn_output, context, alignment From 3c084177c661ec10c5442b4305930bf3057fe97b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 26 Mar 2018 10:43:36 -0700 Subject: [PATCH 16/22] Data loader bug fix and Attention bug fix --- config.json | 2 +- datasets/LJSpeech.py | 25 ++++++----------------- layers/attention.py | 2 +- layers/tacotron.py | 3 +-- tests/layers_tests.py | 12 +++++------ tests/loader_tests.py | 49 ++++++++++++++++++++++++++++++++++++++++--- train.py | 10 --------- utils/data.py | 21 ++++++++++--------- 8 files changed, 71 insertions(+), 53 deletions(-) diff --git a/config.json b/config.json index 0bd21b4e..63f6d372 100644 --- a/config.json +++ b/config.json @@ -12,7 +12,7 @@ "text_cleaner": "english_cleaners", "epochs": 2000, - "lr": 0.0003, + "lr": 0.001, "warmup_steps": 4000, "batch_size": 32, "eval_batch_size":32, diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index 5b1fe13e..d213bd9c 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -7,7 +7,7 @@ from torch.utils.data import Dataset from TTS.utils.text import text_to_sequence from TTS.utils.audio import AudioProcessor -from TTS.utils.data import (prepare_data, pad_data, pad_per_step, +from TTS.utils.data import (prepare_data, pad_per_step, prepare_tensor, prepare_stop_target) @@ -96,10 +96,10 @@ class LJSpeechDataset(Dataset): linear = [self.ap.spectrogram(w).astype('float32') for w in wav] mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] - mel_lengths = [m.shape[1] for m in mel] + mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame # compute 'stop token' targets - stop_targets =
[np.array([0.]*mel_len) for mel_len in mel_lengths] + stop_targets = [np.array([0.]*(mel_len-1)) for mel_len in mel_lengths] # PAD stop targets stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) @@ -108,25 +108,12 @@ class LJSpeechDataset(Dataset): text = prepare_data(text).astype(np.int32) wav = prepare_data(wav) - # PAD features with largest length of the batch - linear = prepare_tensor(linear) - mel = prepare_tensor(mel) + # PAD features with largest length + a zero frame + linear = prepare_tensor(linear, self.outputs_per_step) + mel = prepare_tensor(mel, self.outputs_per_step) assert mel.shape[2] == linear.shape[2] timesteps = mel.shape[2] - # PAD with zeros that can be divided by outputs per step - if (timesteps + 1) % self.outputs_per_step != 0: - pad_len = self.outputs_per_step - \ - ((timesteps + 1) % self.outputs_per_step) - pad_len += 1 - else: - pad_len = 1 - linear = pad_per_step(linear, pad_len) - mel = pad_per_step(mel, pad_len) - - # update mel lengths - mel_lengths = [l+pad_len for l in mel_lengths] - # B x T x D linear = linear.transpose(0, 2, 1) mel = mel.transpose(0, 2, 1) diff --git a/layers/attention.py b/layers/attention.py index 1f83c169..1626e949 100644 --- a/layers/attention.py +++ b/layers/attention.py @@ -48,7 +48,7 @@ class AttentionRNN(nn.Module): def __init__(self, out_dim, annot_dim, memory_dim, score_mask_value=-float("inf")): super(AttentionRNN, self).__init__() - self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim) + self.rnn_cell = nn.GRUCell(out_dim + memory_dim, out_dim) self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim) self.score_mask_value = score_mask_value diff --git a/layers/tacotron.py b/layers/tacotron.py index 983855d4..8433d643 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -304,8 +304,7 @@ class Decoder(nn.Module): # Attention RNN attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn( - processed_memory, current_context_vec, attention_rnn_hidden, - inputs) + processed_memory, current_context_vec, attention_rnn_hidden, inputs) # Concat RNN output and attention context vector decoder_input = self.project_to_decoder_in( diff --git a/tests/layers_tests.py b/tests/layers_tests.py index 246fce8c..570b474c 100644 --- a/tests/layers_tests.py +++ b/tests/layers_tests.py @@ -33,17 +33,15 @@ class CBHGTests(unittest.TestCase): class DecoderTests(unittest.TestCase): def test_in_out(self): - layer = Decoder(in_features=128, memory_dim=32, r=5) - dummy_input = T.autograd.Variable(T.rand(4, 8, 128)) - dummy_memory = T.autograd.Variable(T.rand(4, 120, 32)) + layer = Decoder(in_features=256, memory_dim=80, r=2) + dummy_input = T.autograd.Variable(T.rand(4, 8, 256)) + dummy_memory = T.autograd.Variable(T.rand(4, 2, 80)) - print(layer) output, alignment = layer(dummy_input, dummy_memory) - print(output.shape) assert output.shape[0] == 4 - assert output.shape[1] == 120 / 5 - assert output.shape[2] == 32 * 5 + assert output.shape[1] == 1, "size not {}".format(output.shape[1]) + assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) class EncoderTests(unittest.TestCase): diff --git a/tests/loader_tests.py b/tests/loader_tests.py index 3b3d017c..769fbebe 100644 --- a/tests/loader_tests.py +++ b/tests/loader_tests.py @@ -72,8 +72,9 @@ class TestDataset(unittest.TestCase): c.power ) + # Test for batch size 1 dataloader = DataLoader(dataset, batch_size=1, - shuffle=True, collate_fn=dataset.collate_fn, + shuffle=False, collate_fn=dataset.collate_fn, drop_last=True, 
num_workers=c.num_loader_workers) for i, data in enumerate(dataloader): @@ -93,11 +94,53 @@ class TestDataset(unittest.TestCase): assert linear_input[0, -1].sum() == 0 assert linear_input[0, -2].sum() != 0 assert stop_target[0, -1] == 1 + assert stop_target[0, -2] == 0 assert stop_target.sum() == 1 assert len(mel_lengths.shape) == 1 - print(mel_lengths) - print(mel_input) assert mel_lengths[0] == mel_input[0].shape[0] + + # Test for batch size 2 + dataloader = DataLoader(dataset, batch_size=2, + shuffle=False, collate_fn=dataset.collate_fn, + drop_last=False, num_workers=c.num_loader_workers) + + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + text_input = data[0] + text_lengths = data[1] + linear_input = data[2] + mel_input = data[3] + mel_lengths = data[4] + stop_target = data[5] + item_idx = data[6] + + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the first item in the batch + assert mel_input[idx, -1].sum() == 0 + assert mel_input[idx, -2].sum() != 0, mel_input + assert linear_input[idx, -1].sum() == 0 + assert linear_input[idx, -2].sum() != 0 + assert stop_target[idx, -1] == 1 + assert stop_target[idx, -2] == 0 + assert stop_target[idx].sum() == 1 + assert len(mel_lengths.shape) == 1 + assert mel_lengths[idx] == mel_input[idx].shape[0] + + # check the second item in the batch + assert mel_input[1-idx, -1].sum() == 0 + assert linear_input[1-idx, -1].sum() == 0 + assert stop_target[1-idx, -1] == 1 + assert len(mel_lengths.shape) == 1 + + # check batch conditions + assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 + assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + diff --git a/train.py b/train.py index 97876036..626ff105 100644 --- a/train.py +++ b/train.py @@ -98,16 +98,6 @@ def train(model, criterion, data_loader, optimizer, epoch): mel_lengths_var = Variable(mel_lengths) linear_spec_var = Variable(linear_input, volatile=True) - # sort sequence by length for curriculum learning - # TODO: might be unnecessary - sorted_lengths, indices = torch.sort( text_lengths.view(-1), dim=0, descending=True) - sorted_lengths = sorted_lengths.long().numpy() - text_input_var = text_input_var[indices] - mel_spec_var = mel_spec_var[indices] - mel_lengths_var = mel_lengths_var[indices] - linear_spec_var = linear_spec_var[indices] - # dispatch data to GPU if use_cuda: text_input_var = text_input_var.cuda() diff --git a/utils/data.py b/utils/data.py index 022fab1e..b0bc9588 100644 --- a/utils/data.py +++ b/utils/data.py @@ -1,7 +1,7 @@ import numpy as np -def pad_data(x, length): +def _pad_data(x, length): _pad = 0 assert x.ndim == 1 return np.pad(x, (0, length - x.shape[0]), @@ -11,30 +11,31 @@ def prepare_data(inputs): max_len = max((len(x) for x in inputs)) - return np.stack([pad_data(x, max_len) for x in inputs]) + return np.stack([_pad_data(x, max_len) for x in inputs]) -def pad_tensor(x, length): +def _pad_tensor(x, length): _pad = 0 assert x.ndim == 2 - return np.pad(x, [[0, 0], [0, length- x.shape[1]]], mode='constant', constant_values=_pad) + return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad) -def prepare_tensor(inputs): - max_len = max((x.shape[1] for x in inputs)) - return np.stack([pad_tensor(x, max_len) for x in inputs]) +def prepare_tensor(inputs, out_steps): + max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame + remainder = max_len % out_steps + return np.stack([_pad_tensor(x, max_len + remainder) for x in inputs]) -def
pad_stop_target(x, length): +def _pad_stop_target(x, length): _pad = 1. assert x.ndim == 1 return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) def prepare_stop_target(inputs, out_steps): - max_len = max((x.shape[0] for x in inputs)) + max_len = max((x.shape[0] for x in inputs)) + 1 # zero-frame remainder = max_len % out_steps - return np.stack([pad_stop_target(x, max_len + out_steps - remainder) for x in inputs]) + return np.stack([_pad_stop_target(x, max_len + remainder) for x in inputs]) def pad_per_step(inputs, pad_len): From 1ff8d6d2b7eab377ccd167d1e1a2daed34f1f3e7 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 26 Mar 2018 11:07:15 -0700 Subject: [PATCH 17/22] Data loader bug fix 2 --- datasets/LJSpeech.py | 1 + layers/tacotron.py | 3 +++ train.py | 2 +- utils/data.py | 8 ++++---- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index d213bd9c..a773c661 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -125,6 +125,7 @@ class LJSpeechDataset(Dataset): mel = torch.FloatTensor(mel) mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) + return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0] raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ diff --git a/layers/tacotron.py b/layers/tacotron.py index 8433d643..9ec8bffb 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -256,9 +256,12 @@ class Decoder(nn.Module): greedy = not self.training if memory is not None: + print(memory.shape) + # Grouping multiple frames if necessary if memory.size(-1) == self.memory_dim: memory = memory.view(B, memory.size(1) // self.r, -1) + print(memory.shape) assert memory.size(-1) == self.memory_dim * self.r,\ " !! 
Dimension mismatch {} vs {} * {}".format(memory.size(-1), self.memory_dim, self.r) diff --git a/train.py b/train.py index 626ff105..77288f77 100644 --- a/train.py +++ b/train.py @@ -82,7 +82,7 @@ def train(model, criterion, data_loader, optimizer, epoch): linear_input = data[2] mel_input = data[3] mel_lengths = data[4] - + current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1 # setup lr diff --git a/utils/data.py b/utils/data.py index b0bc9588..4ff3a4c4 100644 --- a/utils/data.py +++ b/utils/data.py @@ -17,13 +17,13 @@ def prepare_data(inputs): def _pad_tensor(x, length): _pad = 0 assert x.ndim == 2 - return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad) - + x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad) + return x def prepare_tensor(inputs, out_steps): max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame remainder = max_len % out_steps - return np.stack([_pad_tensor(x, max_len + remainder) for x in inputs]) + return np.stack([_pad_tensor(x, max_len + (out_steps - remainder)) for x in inputs]) def _pad_stop_target(x, length): @@ -35,7 +35,7 @@ def _pad_stop_target(x, length): def prepare_stop_target(inputs, out_steps): max_len = max((x.shape[0] for x in inputs)) + 1 # zero-frame remainder = max_len % out_steps - return np.stack([_pad_stop_target(x, max_len + remainder) for x in inputs]) + return np.stack([_pad_stop_target(x, max_len + (out_steps - remainder)) for x in inputs]) def pad_per_step(inputs, pad_len): From a68487f0b8fb60605de96b7b0a8b3925958fb83e Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 26 Mar 2018 11:08:15 -0700 Subject: [PATCH 18/22] Data loader bug fix 3 --- layers/tacotron.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 9ec8bffb..916ea677 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -256,13 +256,10 @@ class Decoder(nn.Module): greedy = not self.training if memory is not None: - print(memory.shape) # Grouping multiple frames if necessary if memory.size(-1) == self.memory_dim: memory = memory.view(B, memory.size(1) // self.r, -1) - print(memory.shape) - assert memory.size(-1) == self.memory_dim * self.r,\ " !! 
Dimension mismatch {} vs {} * {}".format(memory.size(-1), self.memory_dim, self.r) T_decoder = memory.size(1) From 3b895ad0115c7577ec87ceb12924111a42a3db2b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 26 Mar 2018 11:12:05 -0700 Subject: [PATCH 19/22] config change --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index 63f6d372..604794ad 100644 --- a/config.json +++ b/config.json @@ -14,7 +14,7 @@ "epochs": 2000, "lr": 0.001, "warmup_steps": 4000, - "batch_size": 32, + "batch_size": 128, "eval_batch_size":32, "r": 5, From e5e51ae35a616a04437e24f139562b48cfa1c2e0 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 27 Mar 2018 09:21:53 -0700 Subject: [PATCH 20/22] bug fix --- utils/data.py | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/utils/data.py b/utils/data.py index 4ff3a4c4..a38092e9 100644 --- a/utils/data.py +++ b/utils/data.py @@ -1,7 +1,7 @@ import numpy as np -def _pad_data(x, length): +def pad_data(x, length): _pad = 0 assert x.ndim == 1 return np.pad(x, (0, length - x.shape[0]), @@ -11,31 +11,7 @@ def _pad_data(x, length): def prepare_data(inputs): max_len = max((len(x) for x in inputs)) - return np.stack([_pad_data(x, max_len) for x in inputs]) - - -def _pad_tensor(x, length): - _pad = 0 - assert x.ndim == 2 - x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad) - return x - -def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame - remainder = max_len % out_steps - return np.stack([_pad_tensor(x, max_len + (out_steps - remainder)) for x in inputs]) - - -def _pad_stop_target(x, length): - _pad = 1. - assert x.ndim == 1 - return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) - - -def prepare_stop_target(inputs, out_steps): - max_len = max((x.shape[0] for x in inputs)) + 1 # zero-frame - remainder = max_len % out_steps - return np.stack([_pad_stop_target(x, max_len + (out_steps - remainder)) for x in inputs]) + return np.stack([pad_data(x, max_len) for x in inputs]) def pad_per_step(inputs, pad_len): From 75dd1bf2add8639a25cdc982e53589c7af948b05 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 27 Mar 2018 09:23:02 -0700 Subject: [PATCH 21/22] bug fix --- utils/data.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/utils/data.py b/utils/data.py index a38092e9..6c47d5eb 100644 --- a/utils/data.py +++ b/utils/data.py @@ -1,7 +1,7 @@ import numpy as np -def pad_data(x, length): +def _pad_data(x, length): _pad = 0 assert x.ndim == 1 return np.pad(x, (0, length - x.shape[0]), @@ -11,7 +11,33 @@ def pad_data(x, length): def prepare_data(inputs): max_len = max((len(x) for x in inputs)) - return np.stack([pad_data(x, max_len) for x in inputs]) + return np.stack([_pad_data(x, max_len) for x in inputs]) + + +def _pad_tensor(x, length): + _pad = 0 + assert x.ndim == 2 + x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad) + return x + +def prepare_tensor(inputs, out_steps): + max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame + remainder = max_len % out_steps + pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len + return np.stack([_pad_tensor(x, pad_len) for x in inputs]) + + +def _pad_stop_target(x, length): + _pad = 1. 
+ assert x.ndim == 1 + return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) + + +def prepare_stop_target(inputs, out_steps): + max_len = max((x.shape[0] for x in inputs)) + 1 # zero-frame + remainder = max_len % out_steps + pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len + return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) def pad_per_step(inputs, pad_len): From f6f1b06b7718bf99c8385c4bc627d1b120d70f28 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 28 Mar 2018 09:43:29 -0700 Subject: [PATCH 22/22] Remove useless config argument --- config.json | 1 - models/tacotron.py | 4 ++-- train.py | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index 604794ad..ed9764b1 100644 --- a/config.json +++ b/config.json @@ -7,7 +7,6 @@ "preemphasis": 0.97, "min_level_db": -100, "ref_level_db": 20, - "hidden_size": 128, "embedding_size": 256, "text_cleaner": "english_cleaners", diff --git a/models/tacotron.py b/models/tacotron.py index a485d68e..05bb1292 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -8,7 +8,7 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG class Tacotron(nn.Module): def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80, - freq_dim=1025, r=5, padding_idx=None): + r=5, padding_idx=None): super(Tacotron, self).__init__() self.r = r @@ -24,7 +24,7 @@ class Tacotron(nn.Module): self.decoder = Decoder(256, mel_dim, r) self.postnet = CBHG(mel_dim, K=8, projections=[256, mel_dim]) - self.last_linear = nn.Linear(mel_dim * 2, freq_dim) + self.last_linear = nn.Linear(mel_dim * 2, linear_dim) def forward(self, characters, mel_specs=None): diff --git a/train.py b/train.py index 77288f77..87908717 100644 --- a/train.py +++ b/train.py @@ -332,9 +332,8 @@ def main(args): pin_memory=True) model = Tacotron(c.embedding_size, - c.hidden_size, - c.num_mels, c.num_freq, + c.num_mels, c.r) optimizer = optim.Adam(model.parameters(), lr=c.lr)
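Editor's note: the padding behavior that patches 16 through 21 converge on is easy to sanity-check in isolation. The snippet below is an illustrative sketch, not part of the patch series: it mirrors the final prepare_tensor / prepare_stop_target helpers from utils/data.py as of [PATCH 21/22] (asserts and the _pad constants are inlined) and runs them on a made-up batch of two 80-bin mel spectrograms of 5 and 7 frames with r=5. Features are zero-padded to one frame past the longest item and rounded up to a multiple of r, while stop targets are padded with ones so that every padded step carries a "stop" label.

import numpy as np

def _pad_tensor(x, length):
    # zero-pad a (dims, T) feature array on the time axis out to T == length
    return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=0.)

def prepare_tensor(inputs, out_steps):
    max_len = max(x.shape[1] for x in inputs) + 1  # +1 for the trailing zero frame
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_tensor(x, pad_len) for x in inputs])

def _pad_stop_target(x, length):
    # pad with 1. so that padded steps are labeled as "stop"
    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=1.)

def prepare_stop_target(inputs, out_steps):
    max_len = max(x.shape[0] for x in inputs) + 1  # +1 for the trailing zero frame
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_stop_target(x, pad_len) for x in inputs])

mel = [np.random.rand(80, 5).astype('float32'),   # made-up 5-frame item
       np.random.rand(80, 7).astype('float32')]   # made-up 7-frame item
stop = [np.zeros(5), np.zeros(7)]
print(prepare_tensor(mel, out_steps=5).shape)     # (2, 80, 10): max_len = 7 + 1 = 8, rounded up to 10
print(prepare_stop_target(stop, out_steps=5)[0])  # [0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]

Padding stop targets with ones rather than zeros is what lets the loader tests in [PATCH 16/22] assert (mel_input * stop_target.unsqueeze(2)).sum() == 0: every frame past the true end of an utterance is both silent and flagged as a stop.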