|
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- name: "Jasper"
- labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
- "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]  # 28 symbols: space, a-z, apostrophe
-
- input_val:
- audio_dataset: &val_dataset  # anchored; merged into the input_train audio_dataset via <<
- sample_rate: &sample_rate 16000  # anchored; aliased by filterbank_features.sample_rate
- trim_silence: true
- normalize_transcripts: true
-
- filterbank_features: &val_features  # anchored; merged into the input_train filterbank_features via <<
- normalize: per_feature
- sample_rate: *sample_rate  # alias of audio_dataset.sample_rate (16000)
- window_size: 0.02  # presumably seconds - TODO confirm against the feature extractor
- window_stride: 0.01  # presumably seconds - TODO confirm against the feature extractor
- window: hann
- n_filt: &n_filt 64  # anchored; aliased by the jasper encoder in_feats
- n_fft: 512
- frame_splicing: &frame_splicing 1  # anchored; aliased by the jasper encoder frame_splicing
- dither: 0.00001
- pad_align: 16
-
- # For training we keep samples < 16.7s and apply augmentation
- input_train:
- audio_dataset:
- <<: *val_dataset  # merge key: start from the input_val audio_dataset settings; keys below add to them
- max_duration: 16.7  # the 16.7 s cap stated in the comment above
- ignore_offline_speed_perturbation: true
-
- filterbank_features:
- <<: *val_features  # merge key: start from the input_val filterbank_features settings
- max_duration: 16.7  # same value as the training audio_dataset max_duration
-
- spec_augment:  # the augmentation mentioned in the input_train comment; no counterpart under input_val
- freq_masks: 2  # presumably the number of frequency masks - TODO confirm
- max_freq: 20  # presumably the maximum width of one frequency mask - TODO confirm
- time_masks: 2  # presumably the number of time masks - TODO confirm
- max_time: 75  # presumably the maximum length of one time mask - TODO confirm
-
- jasper:
- encoder:
- init: xavier_uniform  # weight initializer; same key name (init) as in the decoder section
- in_feats: *n_filt  # alias of input_val filterbank_features.n_filt (64)
- frame_splicing: *frame_splicing  # alias of input_val filterbank_features.frame_splicing (1)
- activation: relu
- use_conv_masks: true
- blocks:  # 13 entries: Conv1, B1-B5 each listed twice (anchor + alias), Conv2, Conv3
- - &Conv1  # first block: the only one with stride 2; no residual
- filters: 256
- repeat: 1
- kernel_size: 11
- stride: 2
- dilation: 1
- dropout: 0.2
- residual: false
- - &B1  # 256 filters, kernel_size 11, repeat 5, dropout 0.2
- filters: 256
- repeat: 5
- kernel_size: 11
- stride: 1
- dilation: 1
- dropout: 0.2
- residual: true
- residual_dense: true
- - *B1  # second copy of B1, reused via alias
- - &B2  # 384 filters, kernel_size 13, repeat 5, dropout 0.2
- filters: 384
- repeat: 5
- kernel_size: 13
- stride: 1
- dilation: 1
- dropout: 0.2
- residual: true
- residual_dense: true
- - *B2  # second copy of B2, reused via alias
- - &B3  # 512 filters, kernel_size 17, repeat 5, dropout 0.2
- filters: 512
- repeat: 5
- kernel_size: 17
- stride: 1
- dilation: 1
- dropout: 0.2
- residual: true
- residual_dense: true
- - *B3  # second copy of B3, reused via alias
- - &B4  # 640 filters, kernel_size 21, repeat 5; dropout rises to 0.3
- filters: 640
- repeat: 5
- kernel_size: 21
- stride: 1
- dilation: 1
- dropout: 0.3
- residual: true
- residual_dense: true
- - *B4  # second copy of B4, reused via alias
- - &B5  # 768 filters, kernel_size 25, repeat 5, dropout 0.3
- filters: 768
- repeat: 5
- kernel_size: 25
- stride: 1
- dilation: 1
- dropout: 0.3
- residual: true
- residual_dense: true
- - *B5  # second copy of B5, reused via alias
- - &Conv2  # the only block with dilation 2; no residual; dropout 0.4
- filters: 896
- repeat: 1
- kernel_size: 29
- stride: 1
- dilation: 2
- dropout: 0.4
- residual: false
- - &Conv3  # last block: kernel_size 1; no residual
- filters: &enc_feats 1024  # anchored; aliased by decoder in_feats
- repeat: 1
- kernel_size: 1
- stride: 1
- dilation: 1
- dropout: 0.4
- residual: false
-
- decoder:
- in_feats: *enc_feats  # alias of the last encoder block's filters (1024)
- init: xavier_uniform
|