|
- # Copyright 2021 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
-
- """preprocess data and convert to mindrecord"""
-
- import os
- import string
- import logging
- import numpy as np
- import scipy.io.wavfile as wavfile
- from python_speech_features import mfcc
- from mindspore.mindrecord import FileWriter
- from src.model_utils.config import config
-
# Configure logging once at import time: every record shows a timestamp, the
# emitting logger's name and the severity before the message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Characters kept when encoding "letter" transcripts: a-z plus the space.
CHARSET = set(string.ascii_lowercase) | {' '}

# Phoneme symbols; a symbol's position in this list is its integer class id.
PHONEME_LIST = [
    'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b',
    'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en',
    'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv',
    'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng',
    'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh',
    't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v', 'w', 'y', 'z',
    'zh',
]

# Symbol -> class id lookup tables (phonemes: 0..60, letters: a=0 .. z=25, ' '=26).
PHONEME_DIC = dict(zip(PHONEME_LIST, range(len(PHONEME_LIST))))
WORD_DIC = {char: index for index, char in enumerate(string.ascii_lowercase + ' ')}
-
-
def read_timit_txt(f):
    '''Read a transcript file and encode its text as letter indices.

    Only the first line of the file is used. Its first two space-separated
    fields are discarded (presumably the start/end sample offsets of a TIMIT
    transcript -- confirm against the dataset), periods are removed, the text
    is lower-cased, and every character outside ``CHARSET`` is dropped.

    Args:
        f: Path of the transcript file.

    Returns:
        A 1-D numpy array holding the ``WORD_DIC`` index of each kept character.

    Raises:
        IndexError: If the file is empty.
    '''
    # The context manager closes the file even when reading fails; readline()
    # avoids loading the whole file just to use its first line.
    with open(f) as handle:
        first_line = handle.readline()
    if not first_line:
        # Same exception type as indexing an empty readlines() result.
        raise IndexError('transcript file is empty: {}'.format(f))

    words = first_line.strip().split(' ')[2:]
    text = ' '.join(words).replace('.', '').lower()
    return np.asarray([WORD_DIC[c] for c in text if c in CHARSET])
-
-
def read_timit_phoneme(f):
    '''Read a phoneme label file and encode it as phoneme indices.

    The last whitespace-separated field of every non-blank line is taken as
    the phoneme symbol and mapped through ``PHONEME_DIC``.

    Args:
        f: Path of the phoneme label file.

    Returns:
        A 1-D numpy array with one phoneme index per non-blank line.

    Raises:
        KeyError: If a symbol is not present in ``PHONEME_DIC``.
    '''
    labels = []
    # The context manager closes the file even if a lookup below raises.
    with open(f) as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at the end of the
                # file) instead of failing on an empty symbol.
                continue
            labels.append(PHONEME_DIC[fields[-1]])
    return np.asarray(labels)
-
-
def diff_feature(feat, nd=1):
    '''Append frame-to-frame differences to a feature matrix.

    Args:
        feat: 2-D array whose rows (axis 0) are consecutive frames.
        nd: ``1`` appends first-order differences only; any other value
            appends first- and second-order differences.

    Returns:
        For ``nd == 1``: ``feat`` without its first row, joined column-wise
        with the first differences (one row fewer, twice the columns).
        Otherwise: ``feat`` without its first two rows, joined with the
        aligned first and second differences (two rows fewer, three times
        the columns).
    '''
    delta = feat[1:] - feat[:-1]
    if nd != 1:
        # Second-order difference; one more leading row is lost, so the other
        # parts are trimmed to stay aligned with it.
        delta2 = delta[1:] - delta[:-1]
        return np.concatenate((feat[2:], delta[1:], delta2), axis=1)
    return np.concatenate((feat[1:], delta), axis=1)
-
-
def read_files(root_path):
    '''Collect the paths of all files below a directory.

    Args:
        root_path: Directory to walk recursively.

    Returns:
        A list with one joined path per file, in the order produced by
        ``os.walk``.
    '''
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_path)
        for name in names
    ]
-
-
def get_feature(f):
    '''Compute MFCC features plus first and second differences for a wav file.

    Args:
        f: Path of the wav file.

    Returns:
        The output of ``diff_feature(..., nd=2)`` applied to the MFCC matrix.
        With 13 cepstral coefficients per frame this should give 39 columns
        (assumes ``mfcc`` returns one row per frame -- confirm against the
        python_speech_features documentation).
    '''
    sample_rate, samples = wavfile.read(f)
    cepstra = mfcc(
        signal=samples.astype('float32'),
        samplerate=sample_rate,
        winlen=0.01,
        winstep=0.005,
        numcep=13,
        nfilt=26,
        lowfreq=0,
        highfreq=6000,
        preemph=0.95,
        appendEnergy=False,
    )
    return diff_feature(cepstra, nd=2)
-
-
class TIMIT_PARSER():
    """
    Parse the dataset, extract MFCC features and convert them to MindRecord.

    Args:
        dirname: Existing directory searched recursively for ``.wav`` files.
        output_path: Path handed to ``FileWriter`` for the MindRecord output.
        label_type: ``'phoneme'`` (default) or ``'letter'``; selects which
            label file is read next to every wav file.
    """

    def __init__(self, dirname, output_path, label_type='phoneme'):
        self.dirname = dirname
        assert os.path.isdir(dirname), dirname
        self.filelists = [k for k in read_files(self.dirname)
                          if k.endswith('.wav')]
        assert label_type in ['phoneme', 'letter'], label_type
        self.label_type = label_type
        self.output_path = output_path

    def getdatas(self):
        '''Load the feature matrix and the label sequence of every wav file.

        Returns:
            A list of ``[feature, label]`` pairs, one per entry of
            ``self.filelists`` and in the same order.
        '''
        data = []
        for f in self.filelists:
            # Dropping the last five characters removes more than ".wav";
            # presumably the wav files are named "<utterance>_.wav" next to
            # "<utterance>.PHN" / "<utterance>.TXT" -- TODO confirm.
            stem = f[:-5]
            feat = get_feature(f)
            if self.label_type == 'phoneme':
                label = read_timit_phoneme(stem + '.PHN')
            elif self.label_type == 'letter':
                label = read_timit_txt(stem + '.TXT')
            data.append([feat, label])
        return data

    def convert_to_mindrecord(self):
        '''Pad every sample to fixed sizes and write all of them to MindRecord.

        Features and masks are zero-padded to ``config.max_sequence_length``
        rows, labels are padded to ``config.max_label_length`` entries. A
        sample longer than those limits makes the numpy slice assignment
        fail with ``ValueError``.
        '''
        # The mask width comes from the configuration; the schema must declare
        # the same width as the arrays built below, so it is derived from the
        # same value instead of a hard-coded 256.
        mask_width = 2 * config.hidden_size
        schema_json = {"id": {"type": "int32"},
                       # 39 columns: 13 MFCCs plus first and second
                       # differences, as produced by get_feature.
                       "feature": {"type": "float32", "shape": [-1, 39]},
                       "masks": {"type": "float32", "shape": [-1, mask_width]},
                       "label": {"type": "int32", "shape": [-1]},
                       "seq_len": {"type": "int32"},
                       }
        data_list = []
        logger.info("write into mindrecord,plaese wait")
        for i, (feature, label) in enumerate(self.getdatas()):
            seq_len = feature.shape[0]
            feature_padding = np.zeros((config.max_sequence_length, feature.shape[1]), dtype=np.float32)
            feature_padding[:seq_len, :] = feature
            # 1 marks real frames, 0 marks padding.
            masks = np.zeros((config.max_sequence_length, mask_width), dtype=np.float32)
            masks[:seq_len, :] = 1
            # 61 is one past the largest phoneme id (PHONEME_LIST has 61
            # entries); presumably the blank/padding class -- TODO confirm,
            # in particular for label_type='letter'.
            label_padding = np.full(config.max_label_length, 61, dtype=np.int32)
            label_padding[:label.shape[0]] = label
            data_list.append({"id": i,
                              "feature": feature_padding.reshape(-1, config.feature_dim),
                              "masks": masks.reshape(-1, mask_width),
                              "label": label_padding.reshape(-1),
                              "seq_len": seq_len,
                              })
        writer = FileWriter(self.output_path, shard_num=4)
        writer.add_schema(schema_json, "nlp_schema")
        writer.add_index(["id"])
        writer.write_raw_data(data_list)
        writer.commit()
        logger.info("writing into record suceesfully")
-
-
if __name__ == '__main__':
    # Create the output directory if it is missing. exist_ok avoids the race
    # between checking for the directory and creating it.
    os.makedirs(config.dataset_dir, exist_ok=True)

    # (log message, source directory, output file name) for each split,
    # processed in this order.
    splits = (
        ("Preparing train dataset:", config.train_dir, config.train_name),
        ("Preparing test dataset:", config.test_dir, config.test_name),
    )
    for message, source_dir, record_name in splits:
        logger.info(message)
        record_path = os.path.join(config.dataset_dir, record_name)
        parser = TIMIT_PARSER(source_dir, record_path)
        parser.convert_to_mindrecord()
|