|
- from __future__ import print_function
-
- import fnmatch
- import io
- import os
- import subprocess
-
- from tqdm import tqdm
-
-
def create_manifest(data_path, output_name, manifest_path, min_duration=None, max_duration=None):
    """Write a CSV manifest pairing each ``.wav`` under *data_path* with its transcript.

    The directory tree rooted at *data_path* is walked for ``*.wav`` files. The
    files are ordered (and optionally pruned) by ``order_and_prune_files`` and
    one ``<absolute wav path>,<absolute transcript path>`` line per file is
    written to ``manifest_path/output_name``.

    The transcript path is derived from the wav path by replacing ``/wav/``
    path components with ``/txt/`` and swapping the file extension for ``.txt``.
    The transcript file itself is not checked for existence.

    Args:
        data_path: Root directory searched recursively for ``*.wav`` files.
        output_name: File name of the manifest to create.
        manifest_path: Directory the manifest is written into; created if missing.
        min_duration: Forwarded to ``order_and_prune_files``.
        max_duration: Forwarded to ``order_and_prune_files``.
    """
    file_paths = [os.path.join(dirpath, f)
                  for dirpath, _dirnames, files in os.walk(data_path)
                  for f in fnmatch.filter(files, '*.wav')]
    file_paths = order_and_prune_files(file_paths, min_duration, max_duration)

    # An empty manifest_path means "current directory"; os.makedirs('') would raise.
    if manifest_path:
        os.makedirs(manifest_path, exist_ok=True)
    # os.path.join inserts the separator when manifest_path lacks a trailing one,
    # so the manifest always lands inside the directory created above.
    output_file = os.path.join(manifest_path, output_name)

    # newline='\n' keeps the on-disk bytes identical on every platform.
    with open(output_file, 'w', encoding='utf-8', newline='\n') as manifest:
        for wav_path in tqdm(file_paths, total=len(file_paths)):
            # Swap only the final extension: a blanket replace('.wav', '.txt')
            # would also rewrite directory names that happen to contain '.wav'.
            stem, _ext = os.path.splitext(wav_path.replace('/wav/', '/txt/'))
            transcript_path = stem + '.txt'
            manifest.write(os.path.abspath(wav_path) + ',' +
                           os.path.abspath(transcript_path) + '\n')
    print('\n')
-
-
def order_and_prune_files(file_paths, min_duration, max_duration):
    """Return *file_paths* sorted by ascending audio duration, optionally pruned.

    Each file's duration in seconds is obtained by running ``soxi -D`` on it.
    Files whose duration falls outside the given bounds are dropped; a bound of
    ``None`` means "no limit on that side". Bounds are inclusive.

    Args:
        file_paths: Iterable of audio file paths (surrounding whitespace is stripped
            before the path is handed to ``soxi``).
        min_duration: Minimum duration in seconds to keep, or ``None``.
        max_duration: Maximum duration in seconds to keep, or ``None``.

    Returns:
        A list of the surviving paths ordered from shortest to longest; files
        with equal durations keep their input order.

    Raises:
        subprocess.CalledProcessError: If ``soxi`` exits with a non-zero status.
        OSError: If the ``soxi`` executable cannot be started.
        ValueError: If ``soxi`` output cannot be parsed as a float.
    """
    print("Sorting manifests...")
    # Pass an argument list without a shell so paths containing quotes, '$',
    # backticks, etc. are handed to soxi verbatim instead of being interpreted.
    duration_file_paths = [
        (path, float(subprocess.check_output(['soxi', '-D', path.strip()])))
        for path in file_paths
    ]

    # Compare against None explicitly: a bound of 0 is a valid limit, not "unset".
    has_min = min_duration is not None
    has_max = max_duration is not None
    if has_min and has_max:
        print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration))
    elif has_min:
        print("Pruning manifests shorter than %d seconds" % min_duration)
    elif has_max:
        print("Pruning manifests longer than %d seconds" % max_duration)

    if has_min or has_max:
        duration_file_paths = [
            (path, duration) for path, duration in duration_file_paths
            if (not has_min or duration >= min_duration)
            and (not has_max or duration <= max_duration)
        ]

    # list.sort is stable, so equal-length files stay in discovery order.
    duration_file_paths.sort(key=lambda pair: pair[1])
    return [path for path, _duration in duration_file_paths]  # Remove durations
|