Path: blob/master/examples/mfa_extraction/txt_grid_parser.py
1558 views
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create training file and durations from textgrids."""

import logging
import os
import sys
from dataclasses import dataclass
from pathlib import Path

import click
import numpy as np
import textgrid
import yaml
from tqdm import tqdm


logging.basicConfig(
    level=logging.DEBUG,
    stream=sys.stdout,
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)


@dataclass
class TxtGridParser:
    """Convert TextGrid alignments into a training file and duration arrays.

    For every TextGrid file the parser reads the tier at index 1, collects
    each interval's mark and duration, saves the durations as
    ``<base_name>-durations.npy`` under ``output_durations_path`` and adds one
    ``speaker/base_name|phones|speaker`` line to the training file written
    into ``dataset_path``.
    """

    sample_rate: int
    multi_speaker: bool
    txt_grid_path: str
    hop_size: int
    output_durations_path: str
    dataset_path: str
    training_file: str = "train.txt"

    # Un-annotated on purpose: these are shared class attributes, not
    # dataclass fields.
    #
    # The empty mark "" was the last token in every case the original author
    # encountered, so it is mapped to "END". A safety check in
    # `parse_text_grid` fails whenever an empty mark is not the last interval;
    # if that happens for your dataset, map "" to silence ("SIL") instead.
    phones_mapper = {"sil": "SIL", "sp": "SIL", "spn": "SIL", "": "END"}
    sil_phones = set(phones_mapper.keys())

    def parse(self):
        """Parse every TextGrid under ``txt_grid_path`` and write the training file.

        In multi-speaker mode each sub-directory of ``txt_grid_path`` is
        treated as one speaker. If multi-speaker mode is off, or no
        sub-directory exists, the files directly inside ``txt_grid_path`` are
        parsed with an empty speaker name.
        """
        if self.multi_speaker:
            speakers = [
                entry
                for entry in os.listdir(self.txt_grid_path)
                if os.path.isdir(os.path.join(self.txt_grid_path, entry))
            ]
        else:
            speakers = []

        data = []

        if speakers:
            for speaker in speakers:
                file_list = os.listdir(os.path.join(self.txt_grid_path, speaker))
                self.parse_text_grid(file_list, data, speaker)
        else:
            file_list = os.listdir(self.txt_grid_path)
            self.parse_text_grid(file_list, data, "")

        # Explicit encoding: phone labels may be non-ASCII and the platform
        # default encoding is not guaranteed to be UTF-8.
        train_path = os.path.join(self.dataset_path, self.training_file)
        with open(train_path, "w", encoding="utf-8") as f:
            f.writelines(data)

    def parse_text_grid(self, file_list: list, data: list, speaker_name: str):
        """Parse the given TextGrid files of one speaker.

        Args:
            file_list: File names located in ``txt_grid_path/speaker_name``.
            data: Output list; one training line per file is appended to it.
            speaker_name: Speaker sub-directory name, or ``""`` when the files
                live directly in ``txt_grid_path``.

        Raises:
            AssertionError: If an empty mark is not the last interval of a
                file, or if the number of phones and durations differ.
        """
        logging.info(
            f"\n Parse: {len(file_list)} files, speaker name: {speaker_name} \n"
        )
        # Interval durations are scaled by this factor (sample_rate / hop_size)
        # and then rounded; it does not change between files.
        frames_per_second = self.sample_rate / self.hop_size

        for f_name in tqdm(file_list):
            text_grid = textgrid.TextGrid.fromFile(
                os.path.join(self.txt_grid_path, speaker_name, f_name)
            )
            pha = text_grid[1]
            last_index = len(pha.intervals) - 1
            durations = []
            phs = []
            for index, interval in enumerate(pha.intervals):
                mark = interval.mark

                if mark in self.sil_phones:
                    mark = self.phones_mapper[mark]
                    if mark == "END":
                        # The empty phone must be the last interval; if it is
                        # not, fix the dataset or map "" to silence instead.
                        assert index == last_index, (
                            f"{f_name}: empty phone at interval {index}, "
                            f"expected it only at the last interval {last_index}"
                        )

                dur = interval.duration() * frames_per_second
                durations.append(round(dur))
                phs.append(mark)

            full_ph = " ".join(phs)

            # Safety check: a mark containing a space would desynchronise the
            # phone sequence from the durations.
            assert len(full_ph.split(" ")) == len(durations), (
                f"{f_name}: number of phones and durations differ"
            )

            base_name = f_name.split(".TextGrid")[0]
            np.save(
                os.path.join(self.output_durations_path, f"{base_name}-durations.npy"),
                np.array(durations).astype(np.int32),
                allow_pickle=False,
            )
            data.append(f"{speaker_name}/{base_name}|{full_ph}|{speaker_name}\n")


@click.command()
@click.option(
    "--yaml_path", default="examples/fastspeech2_libritts/conf/fastspeech2libritts.yaml"
)
@click.option("--dataset_path", default="dataset", type=str, help="Dataset directory")
@click.option("--text_grid_path", default="mfa/parsed", type=str)
@click.option("--output_durations_path", default="dataset/durations")
@click.option("--sample_rate", default=24000, type=int)
@click.option("--multi_speakers", default=1, type=int, help="Use multi-speaker version")
@click.option("--train_file", default="train.txt")
def main(
    yaml_path: str,
    dataset_path: str,
    text_grid_path: str,
    output_durations_path: str,
    sample_rate: int,
    multi_speakers: int,
    train_file: str,
):
    """Create the training file and duration arrays from TextGrid files."""
    # yaml.load() without an explicit Loader is unsafe and raises TypeError on
    # PyYAML >= 6; safe_load is sufficient for a plain configuration file.
    with open(yaml_path, encoding="utf-8") as file:
        attrs = yaml.safe_load(file)
    hop_size = attrs["hop_size"]

    Path(output_durations_path).mkdir(parents=True, exist_ok=True)

    txt_grid_parser = TxtGridParser(
        sample_rate=sample_rate,
        multi_speaker=bool(multi_speakers),
        txt_grid_path=text_grid_path,
        hop_size=hop_size,
        output_durations_path=output_durations_path,
        training_file=train_file,
        dataset_path=dataset_path,
    )
    txt_grid_parser.parse()


if __name__ == "__main__":
    main()