Path: blob/master/notebooks/prepare_synpaflex.ipynb
1558 views
Kernel: Python 3
In [ ]:
# --- Standard library ---
import os
from pathlib import Path
from shutil import copyfile

# --- Third-party ---
import numpy as np
import soundfile as sf
from tqdm import tqdm

# Location of the original corpus. The "[your_local_path]" part is a
# placeholder that must be edited before running the notebook.
input_dataset_path = "[your_local_path]/synpaflex-corpus/v0.1/"

# Destination directory that receives the reorganized dataset.
reorganized_dataset_path = "../synpaflex/"

# Upper bound, in seconds, on the duration of an audio file.
maximal_duration = 12
In [ ]:
# Reorganize the corpus into a flat layout:
#   <reorganized_dataset_path>/wavs/<name>.wav   copies of the kept audio files
#   <reorganized_dataset_path>/synpaflex.txt     one "<name>|<text>|<norm_text>" line per kept file
# A wav file is kept when it can be decoded, lasts at most `maximal_duration`
# seconds, has a matching transcript, and that transcript contains no digit.

WAV_SUFFIX = ".wav"
# NOTE(review): assumes the corpus transcripts are UTF-8 encoded - confirm.
# Set explicitly so the result does not depend on the platform locale.
TEXT_ENCODING = "utf-8"


def _read_text(path):
    """Return the full content of the text file at *path* (handle is closed)."""
    with open(path, "r", encoding=TEXT_ENCODING) as handle:
        return handle.read()


def _find_text_file(wav_path, stem):
    """Return the transcript path for a wav file, or None when there is none.

    ``txt/<stem>.txt`` is looked up next to the wav file first, then one
    directory level above it.
    """
    source_dir = Path(wav_path).parent.absolute()
    for base_dir in (source_dir, source_dir.parent):
        candidate = os.path.join(base_dir, "txt", stem + ".txt")
        if os.path.exists(candidate):
            return candidate
    return None


wav_dir = os.path.join(reorganized_dataset_path, "wavs/")
os.makedirs(wav_dir, exist_ok=True)

data = []
total_duration = 0

# Precomputing walk_count for tqdm (os.walk is a generator, so its length
# is only known after a first full traversal).
walk_count = sum(1 for _ in os.walk(input_dataset_path))

# walk through dataset
for subdir, dirs, files in tqdm(os.walk(input_dataset_path), total=walk_count,
                                bar_format='Data Reorganization : {l_bar}{bar}|'):
    for filename in files:
        filepath = os.path.join(subdir, filename)

        # Only wav files are candidates.
        if not filepath.endswith(WAV_SUFFIX):
            continue

        # Guard only the decoding step: soundfile signals unreadable files
        # with a RuntimeError (sub)class.
        try:
            wav, sr = sf.read(filepath)
        except RuntimeError:
            print(filepath + " not recognized and ignored.")
            continue

        # Only keep files with shorter durations than maximal_duration
        duration = len(wav) / sr
        if duration > maximal_duration:
            continue

        # Strip the extension only (not every ".wav" occurrence in the name).
        stem = filename[:-len(WAV_SUFFIX)]

        # find corresponding text file
        txt_file_path = _find_text_file(filepath, stem)
        if txt_file_path is None:
            # Skip just this file. (A `break` here would also drop every
            # remaining file of the current directory.)
            continue

        text = _read_text(txt_file_path)

        # ignore file if text contains digits
        if any(char.isdigit() for char in text):
            continue

        # Use the normalized transcript when one sits beside the raw one,
        # otherwise fall back to the raw text.
        norm_text_file_path = os.path.join(os.path.dirname(txt_file_path),
                                           stem + "_norm.txt")
        if os.path.exists(norm_text_file_path):
            norm_text = _read_text(norm_text_file_path)
        else:
            norm_text = text

        # NOTE(review): text is written verbatim; a transcript containing a
        # newline or '|' would break the one-record-per-line layout - verify
        # against the corpus.
        data.append(stem + '|' + text + '|' + norm_text)
        copyfile(filepath, os.path.join(wav_dir, filename))

        # Count the duration only once the file has actually been kept, so
        # the reported total matches the reorganized dataset.
        total_duration += duration

# save metadata
with open(os.path.join(reorganized_dataset_path, "synpaflex.txt"), 'w',
          encoding=TEXT_ENCODING) as f:
    f.writelines(f"{item}\n" for item in data)

# display reorganized dataset total duration
duration_hours = total_duration / 3600
print(f"total duration = {duration_hours:.2f} hours")