|
|
@@ -0,0 +1,176 @@ |
|
|
|
#!/bin/bash |
|
|
|
|
|
|
|
# Created on 2018/12 |
|
|
|
# Author: Kaituo XU |
|
|
|
|
|
|
|
# -- START IMPORTANT |
|
|
|
# * If you have mixture wsj0 audio, modify `data` to your path that including tr, cv and tt. |
|
|
|
# * If you jsut have origin sphere format wsj0 , modify `wsj0_origin` to your path and |
|
|
|
# modify `wsj0_wav` to path that put output wav format wsj0, then read and run stage 1 part. |
|
|
|
# After that, modify `data` and run from stage 2. |
|
|
|
wsj0_origin=E:\PHD\data\WSJ0 |
|
|
|
wsj0_wav=E:\PHD\data\wav |
|
|
|
data=E:\PHD\data\wav8k |
|
|
|
stage=0 # Modify this to control to start from which stage |
|
|
|
# -- END |
|
|
|
:<<! |
|
|
|
dumpdir=data # directory to put generated json file |
|
|
|
|
|
|
|
# -- START Conv-TasNet Config |
|
|
|
train_dir=$dumpdir/tr |
|
|
|
valid_dir=$dumpdir/cv |
|
|
|
evaluate_dir=$dumpdir/tt |
|
|
|
separate_dir=$dumpdir/tt |
|
|
|
sample_rate=8000 |
|
|
|
segment=4 # seconds |
|
|
|
cv_maxlen=6 # seconds |
|
|
|
# Network config |
|
|
|
N=256 |
|
|
|
L=20 |
|
|
|
B=256 |
|
|
|
H=512 |
|
|
|
P=3 |
|
|
|
X=8 |
|
|
|
R=4 |
|
|
|
norm_type=gLN |
|
|
|
causal=0 |
|
|
|
mask_nonlinear='relu' |
|
|
|
C=2 |
|
|
|
# Training config |
|
|
|
use_cuda=1 |
|
|
|
id=0 |
|
|
|
epochs=100 |
|
|
|
half_lr=1 |
|
|
|
early_stop=0 |
|
|
|
max_norm=5 |
|
|
|
# minibatch |
|
|
|
shuffle=1 |
|
|
|
batch_size=3 |
|
|
|
num_workers=4 |
|
|
|
# optimizer |
|
|
|
optimizer=adam |
|
|
|
lr=1e-3 |
|
|
|
momentum=0 |
|
|
|
l2=0 |
|
|
|
# save and visualize |
|
|
|
checkpoint=0 |
|
|
|
continue_from="" |
|
|
|
print_freq=10 |
|
|
|
visdom=0 |
|
|
|
visdom_epoch=0 |
|
|
|
visdom_id="Conv-TasNet Training" |
|
|
|
# evaluate |
|
|
|
ev_use_cuda=0 |
|
|
|
cal_sdr=1 |
|
|
|
# -- END Conv-TasNet Config |
|
|
|
|
|
|
|
# exp tag |
|
|
|
tag="" # tag for managing experiments. |
|
|
|
|
|
|
|
ngpu=1 # always 1 |
|
|
|
! |
|
|
|
. utils/parse_options.sh || exit 1; |
|
|
|
. ./cmd.sh |
|
|
|
. ./path.sh |
|
|
|
|
|
|
|
|
|
|
|
if [ $stage -le 0 ]; then |
|
|
|
echo "Stage 0: Convert sphere format to wav format and generate mixture" |
|
|
|
local/data_prepare.sh --data ${wsj0_origin} --wav_dir ${wsj0_wav} |
|
|
|
|
|
|
|
echo "NOTE: You should generate mixture by yourself now. |
|
|
|
You can use tools/create-speaker-mixtures.zip which is download from |
|
|
|
http://www.merl.com/demos/deep-clustering/create-speaker-mixtures.zip |
|
|
|
If you don't have Matlab and want to use Octave, I suggest to replace |
|
|
|
all mkdir(...) in create_wav_2speakers.m with system(['mkdir -p '...]) |
|
|
|
due to mkdir in Octave can not work in 'mkdir -p' way. |
|
|
|
e.g.: |
|
|
|
mkdir([output_dir16k '/' min_max{i_mm} '/' data_type{i_type}]); |
|
|
|
-> |
|
|
|
system(['mkdir -p ' output_dir16k '/' min_max{i_mm} '/' data_type{i_type}]);" |
|
|
|
exit 1 |
|
|
|
fi |
|
|
|
|
|
|
|
:<<! |
|
|
|
if [ $stage -le 1 ]; then |
|
|
|
echo "Stage 1: Generating json files including wav path and duration" |
|
|
|
[ ! -d $dumpdir ] && mkdir $dumpdir |
|
|
|
preprocess.py --in-dir $data --out-dir $dumpdir --sample-rate $sample_rate |
|
|
|
fi |
|
|
|
|
|
|
|
|
|
|
|
if [ -z ${tag} ]; then |
|
|
|
expdir=exp/train_r${sample_rate}_N${N}_L${L}_B${B}_H${H}_P${P}_X${X}_R${R}_C${C}_${norm_type}_causal${causal}_${mask_nonlinear}_epoch${epochs}_half${half_lr}_norm${max_norm}_bs${batch_size}_worker${num_workers}_${optimizer}_lr${lr}_mmt${momentum}_l2${l2}_`basename $train_dir` |
|
|
|
else |
|
|
|
expdir=exp/train_${tag} |
|
|
|
fi |
|
|
|
|
|
|
|
if [ $stage -le 2 ]; then |
|
|
|
echo "Stage 2: Training" |
|
|
|
${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \ |
|
|
|
CUDA_VISIBLE_DEVICES="$id" \ |
|
|
|
train.py \ |
|
|
|
--train_dir $train_dir \ |
|
|
|
--valid_dir $valid_dir \ |
|
|
|
--sample_rate $sample_rate \ |
|
|
|
--segment $segment \ |
|
|
|
--cv_maxlen $cv_maxlen \ |
|
|
|
--N $N \ |
|
|
|
--L $L \ |
|
|
|
--B $B \ |
|
|
|
--H $H \ |
|
|
|
--P $P \ |
|
|
|
--X $X \ |
|
|
|
--R $R \ |
|
|
|
--C $C \ |
|
|
|
--norm_type $norm_type \ |
|
|
|
--causal $causal \ |
|
|
|
--mask_nonlinear $mask_nonlinear \ |
|
|
|
--use_cuda $use_cuda \ |
|
|
|
--epochs $epochs \ |
|
|
|
--half_lr $half_lr \ |
|
|
|
--early_stop $early_stop \ |
|
|
|
--max_norm $max_norm \ |
|
|
|
--shuffle $shuffle \ |
|
|
|
--batch_size $batch_size \ |
|
|
|
--num_workers $num_workers \ |
|
|
|
--optimizer $optimizer \ |
|
|
|
--lr $lr \ |
|
|
|
--momentum $momentum \ |
|
|
|
--l2 $l2 \ |
|
|
|
--save_folder ${expdir} \ |
|
|
|
--checkpoint $checkpoint \ |
|
|
|
--continue_from "$continue_from" \ |
|
|
|
--print_freq ${print_freq} \ |
|
|
|
--visdom $visdom \ |
|
|
|
--visdom_epoch $visdom_epoch \ |
|
|
|
--visdom_id "$visdom_id" |
|
|
|
fi |
|
|
|
|
|
|
|
|
|
|
|
if [ $stage -le 3 ]; then |
|
|
|
echo "Stage 3: Evaluate separation performance" |
|
|
|
${decode_cmd} --gpu ${ngpu} ${expdir}/evaluate.log \ |
|
|
|
evaluate.py \ |
|
|
|
--model_path ${expdir}/final.pth.tar \ |
|
|
|
--data_dir $evaluate_dir \ |
|
|
|
--cal_sdr $cal_sdr \ |
|
|
|
--use_cuda $ev_use_cuda \ |
|
|
|
--sample_rate $sample_rate \ |
|
|
|
--batch_size $batch_size |
|
|
|
fi |
|
|
|
|
|
|
|
|
|
|
|
if [ $stage -le 4 ]; then |
|
|
|
echo "Stage 4: Separate speech using Conv-TasNet" |
|
|
|
separate_out_dir=${expdir}/separate |
|
|
|
${decode_cmd} --gpu ${ngpu} ${separate_out_dir}/separate.log \ |
|
|
|
separate.py \ |
|
|
|
--model_path ${expdir}/final.pth.tar \ |
|
|
|
--mix_json $separate_dir/mix.json \ |
|
|
|
--out_dir ${separate_out_dir} \ |
|
|
|
--use_cuda $ev_use_cuda \ |
|
|
|
--sample_rate $sample_rate \ |
|
|
|
--batch_size $batch_size |
|
|
|
fi |
|
|
|
! |