Skip to content
This repository was archived by the owner on Aug 3, 2021. It is now read-only.

Commit a81babd

Browse files
committed
Added QuartzNet config
Signed-off-by: Vitaly Lavrukhin <[email protected]>
1 parent 620e7e1 commit a81babd

File tree

1 file changed

+256
-0
lines changed

1 file changed

+256
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
# pylint: skip-file
2+
# QuartzNet paper: https://arxiv.org/abs/1910.10261
3+
import tensorflow as tf
4+
from open_seq2seq.models import Speech2Text
5+
from open_seq2seq.encoders import TDNNEncoder
6+
from open_seq2seq.decoders import FullyConnectedCTCDecoder
7+
from open_seq2seq.data.speech2text.speech2text import Speech2TextDataLayer
8+
from open_seq2seq.losses import CTCLoss
9+
from open_seq2seq.optimizers.lr_policies import cosine_decay
10+
from open_seq2seq.optimizers.novograd import NovoGrad
11+
12+
residual_dense = False  # Dense-residual switch; passed as "residual_dense" to every residual block in encoder_params["convnet_layers"]
13+
14+
base_model = Speech2Text  # Model class selected by this config (imported from open_seq2seq.models)
15+
16+
# Base configuration for the QuartzNet model: run control, logging cadence,
# optimizer and learning-rate policy, encoder layer stack, decoder, loss and
# the common data-layer settings.
base_params = {
    # Run control.
    "random_seed": 0,
    "use_horovod": True,
    "num_epochs": 400,

    "num_gpus": 8,
    "batch_size_per_gpu": 32,
    "iter_size": 1,

    # Summary / printing / evaluation / checkpoint intervals, in steps.
    "save_summaries_steps": 100,
    "print_loss_steps": 10,
    "print_samples_steps": 2200,
    "eval_steps": 2200,
    "save_checkpoint_steps": 1100,
    "logdir": "jasper_log_folder",
    "num_checkpoints": 2,

    # NovoGrad optimizer with a cosine-decay learning-rate policy.
    "optimizer": NovoGrad,
    "optimizer_params": {
        "beta1": 0.95,
        "beta2": 0.5,
        "epsilon": 1e-08,
        "weight_decay": 0.001,
        "grad_averaging": False,
    },
    "lr_policy": cosine_decay,
    "lr_policy_params": {
        "learning_rate": 0.01,
        "min_lr": 0.0,
        "warmup_steps": 1000,
    },

    "dtype": tf.float32,
    # "loss_scaling": "Backoff",

    "summaries": [
        "learning_rate",
        "variables",
        "gradients",
        "larc_summaries",
        "variable_norm",
        "gradient_norm",
        "global_gradient_norm",
    ],

    "encoder": TDNNEncoder,
    "encoder_params": {
        "convnet_layers": [
            # Entry layer: the only stride-2 layer, no residual keys.
            {
                "type": "sep_conv1d", "repeat": 1,
                "kernel_size": [33], "stride": [2],
                "num_channels": 256, "padding": "SAME",
                "dilation": [1],
            },
        ] + [
            # Fifteen residual blocks: three identical entries for each
            # (kernel size, channel count) pair, every one with "repeat": 5.
            {
                "type": "sep_conv1d", "repeat": 5,
                "kernel_size": [kernel], "stride": [1],
                "num_channels": channels, "padding": "SAME",
                "dilation": [1],
                "residual": True, "residual_dense": residual_dense,
            }
            for kernel, channels in (
                (33, 256),
                (39, 256),
                (51, 512),
                (63, 512),
                (75, 512),
            )
            for _ in range(3)
        ] + [
            # Single-repeat separable convolution with dilation 2.
            {
                "type": "sep_conv1d", "repeat": 1,
                "kernel_size": [87], "stride": [1],
                "num_channels": 512, "padding": "SAME",
                "dilation": [2],
                "residual": True, "residual_dense": residual_dense,
            },
            # Final kernel-size-1 convolution with 1024 channels.
            {
                "type": "conv1d", "repeat": 1,
                "kernel_size": [1], "stride": [1],
                "num_channels": 1024, "padding": "SAME",
                "dilation": [1],
            },
        ],

        "dropout_keep_prob": 1.0,

        "initializer": tf.contrib.layers.xavier_initializer,
        "initializer_params": {
            "uniform": False,
        },
        "normalization": "batch_norm",
        "activation_fn": tf.nn.relu,
        "data_format": "channels_last",
        "use_conv_mask": True,
    },

    "decoder": FullyConnectedCTCDecoder,
    "decoder_params": {
        "initializer": tf.contrib.layers.xavier_initializer,
        "use_language_model": False,
        "infer_logits_to_pickle": False,
    },
    "loss": CTCLoss,
    "loss_params": {},

    # Data-layer settings; the mode-specific dicts add their own
    # "data_layer_params" entries (dataset files, shuffle, ...).
    "data_layer": Speech2TextDataLayer,
    "data_layer_params": {
        "num_audio_features": 64,
        "input_type": "logfbank",
        "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt",
        "norm_per_feature": True,
        "window": "hanning",
        "precompute_mel_basis": True,
        "sample_freq": 16000,
        "pad_to": 16,
        "dither": 1e-5,
        "backend": "librosa",
    },
}
218+
219+
# Training-mode settings: masking augmentation, the three LibriSpeech
# training manifests, a max_duration cap and shuffling enabled.
train_params = {
    "data_layer": Speech2TextDataLayer,
    "data_layer_params": {
        "augmentation": {
            "n_freq_mask": 2,
            "n_time_mask": 2,
            "width_freq_mask": 6,
            "width_time_mask": 6,
        },
        "dataset_files": [
            "/data/librispeech/librivox-train-clean-100.csv",
            "/data/librispeech/librivox-train-clean-360.csv",
            "/data/librispeech/librivox-train-other-500.csv",
        ],
        "max_duration": 16.7,
        "shuffle": True,
    },
}
237+
238+
# Evaluation-mode settings: the dev-clean manifest, read without shuffling.
eval_params = {
    "data_layer": Speech2TextDataLayer,
    "data_layer_params": {
        "dataset_files": [
            "/data/librispeech/librivox-dev-clean.csv",
        ],
        "shuffle": False,
    },
}
247+
248+
# Inference-mode settings: the test-clean manifest, read without shuffling.
infer_params = {
    "data_layer": Speech2TextDataLayer,
    "data_layer_params": {
        "dataset_files": [
            "/data/librispeech/librivox-test-clean.csv",
        ],
        "shuffle": False,
    },
}

0 commit comments

Comments
 (0)