CoCalc -- prepare_synpaflex.ipynb

GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/notebooks/prepare_synpaflex.ipynb
¹⁵⁵⁸ views

Kernel: Python 3

In [ ]:

import os

import numpy as np
import soundfile as sf
from pathlib import Path
from shutil import copyfile
from tqdm import tqdm

# Root directory of the source corpus. Replace "[your_local_path]" with the
# real location before running; this tree is walked recursively for .wav files.
input_dataset_path = "[your_local_path]/synpaflex-corpus/v0.1/"
# Output directory: receives a flat "wavs/" folder and the "synpaflex.txt"
# metadata file.
reorganized_dataset_path = "../synpaflex/"

# Audio files longer than this many seconds are left out of the output.
maximal_duration = 12 # maximal audio file duration in seconds

In [ ]:

# Reorganize the corpus into a flat layout:
#   <reorganized_dataset_path>/wavs/<name>.wav   copies of the kept audio files
#   <reorganized_dataset_path>/synpaflex.txt     one "name|text|norm_text" line each
# A wav file is kept when it is readable, lasts at most `maximal_duration`
# seconds, has a transcript, and that transcript contains no digit.


def _find_text_file(wav_path):
    """Return the transcript path for *wav_path*, or None when there is none.

    The transcript is expected at ``txt/<stem>.txt`` next to the wav file;
    if it is not there, the same relative location one directory up is tried.
    """
    audio_dir = wav_path.parent.absolute()
    for base_dir in (audio_dir, audio_dir.parent):
        candidate = base_dir / "txt" / (wav_path.stem + ".txt")
        if candidate.exists():
            return candidate
    return None


wav_dir = os.path.join(reorganized_dataset_path, "wavs/")
os.makedirs(wav_dir, exist_ok=True)
data = []
total_duration = 0

# Precomputing walk_count so tqdm can display a meaningful progress bar
walk_count = sum(1 for _ in os.walk(input_dataset_path))

# walk through dataset
for subdir, dirs, files in tqdm(os.walk(input_dataset_path), total=walk_count, bar_format='Data Reorganization : {l_bar}{bar}|'):
    for filename in files:
        filepath = os.path.join(subdir, filename)
        if not filepath.endswith(".wav"):
            continue

        # Only the decoding step is guarded: unreadable audio is reported and
        # skipped, while unrelated errors further down are not masked.
        try:
            wav, sr = sf.read(filepath)
        except RuntimeError:
            print(filepath + " not recognized and ignored.")
            continue

        # Only keep files with shorter durations than maximal_duration
        duration = len(wav) / sr
        if duration > maximal_duration:
            continue

        # A missing transcript only disqualifies this file; the remaining
        # files of the directory must still be processed.
        txt_file_path = _find_text_file(Path(filepath))
        if txt_file_path is None:
            continue

        # Strip surrounding whitespace so a trailing newline in a transcript
        # cannot split a record of the line-oriented metadata file.
        text = txt_file_path.read_text(encoding="utf-8").strip()

        # ignore file if text contains digits
        if any(char.isdigit() for char in text):
            continue

        # Use the "<stem>_norm.txt" sibling when present, else the raw text
        norm_text_file_path = txt_file_path.with_name(txt_file_path.stem + "_norm.txt")
        if norm_text_file_path.exists():
            norm_text = norm_text_file_path.read_text(encoding="utf-8").strip()
        else:
            norm_text = text

        # copy wav file and keep metadata in memory; the duration is counted
        # only here so the reported total matches the files actually copied
        data.append(Path(filename).stem + '|' + text + '|' + norm_text)
        copyfile(filepath, os.path.join(wav_dir, filename))
        total_duration += duration

# save metadata
with open(os.path.join(reorganized_dataset_path, "synpaflex.txt"), 'w', encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in data)

# display reorganized dataset total duration
duration_hours = total_duration / 3600
print(f"total duration = {duration_hours:.2f} hours")

Product

Resources

Company