| """ |
| I didn't extract features from the test set of LibriSpeech, the features extracted |
| from train-100 was split into train and test set into two separate folders. |
| This was again done to read them easily using torch vision's Dataset Folder |
| """ |
|
|
| import os |
| import shutil |
| from pathlib import Path |
|
|
| import numpy as np |
|
|
|
|
def assert_out_dir_exists(root, index):
    """Ensure that the per-label output directory ``root/index`` exists.

    Args:
        root: Parent output directory path (string).
        index: Label / class identifier; converted to ``str`` for the folder name.

    Returns:
        The path of the (possibly newly created) directory as a string.
    """
    # os.path.join instead of manual '/' concatenation so Windows paths work too
    dir_ = os.path.join(root, str(index))

    if not os.path.exists(dir_):
        os.makedirs(dir_)
        print('created dir {}'.format(dir_))  # fixed typo: was 'crated'
    else:
        print('dir {} already exists'.format(dir_))

    return dir_
|
|
|
|
def train_test_split(root, test_size=0.05):
    """Randomly split per-label ``.npy`` feature files under ``root`` into
    ``<root>_train`` and ``<root>_test`` directory trees.

    Each immediate subdirectory of ``root`` is treated as a label; every
    ``.npy`` file below it is independently assigned to the test split with
    probability ``test_size`` and copied into the matching label folder.

    Args:
        root: Directory containing one subdirectory per label.
        test_size: Probability that a given file goes to the test split.
    """
    train_dir = root + '_train'
    test_dir = root + '_test'

    # exist_ok so re-running after a partial earlier run does not crash
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    for label in os.listdir(root):
        files_ = np.array([str(f) for f in Path(root, label).glob('**/*.npy')])

        os.makedirs(os.path.join(train_dir, label), exist_ok=True)
        os.makedirs(os.path.join(test_dir, label), exist_ok=True)

        # Bernoulli draw per file: 0 -> train split, 1 -> test split.
        choices = np.random.choice([0, 1], size=files_.shape[0], p=(1 - test_size, test_size))

        for split_dir, samples in ((train_dir, files_[choices == 0]),
                                   (test_dir, files_[choices == 1])):
            for sample in samples:
                # os.path.basename instead of split('/') so Windows paths work too
                dest = os.path.join(split_dir, label, os.path.basename(sample))
                print('copying file {} to {}'.format(sample, dest))
                shutil.copyfile(sample, dest)

        print('done for label: {}'.format(label))

    print('All done')
|
|
|
|
if __name__ == '__main__':
    # Split the extracted fbank features into train/test folder trees.
    train_test_split(root='fbanks')