CoCalc -- txt_grid

GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/examples/mfa_extraction/txt_grid_parser.py
¹⁵⁵⁸ views
1
# -*- coding: utf-8 -*-
2
# Copyright 2020 TensorFlowTTS Team.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
"""Create training file and durations from textgrids."""
16

17
import os
18
from dataclasses import dataclass
19
from pathlib import Path
20

21
import click
22
import numpy as np
23
import textgrid
24
import yaml
25
from tqdm import tqdm
26

27
import logging
28
import sys
29

30

31
# Root logger setup for this script: DEBUG and above go to stdout, each record
# prefixed with a timestamp and the emitting module:line.
logging.basicConfig(
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    stream=sys.stdout,
    level=logging.DEBUG,
)
36

37

38
@dataclass
class TxtGridParser:
    """Turn TextGrid alignments into per-phone durations and a training list.

    For every file listed under ``txt_grid_path`` (or under each speaker
    sub-directory when ``multi_speaker`` is set) the intervals of tier index 1
    are read, each interval duration is multiplied by
    ``sample_rate / hop_size`` and rounded, and the resulting integer array is
    saved as ``<name>-durations.npy`` in ``output_durations_path``. One
    ``speaker/name|phones|speaker`` line per file is written to
    ``dataset_path/training_file``.

    Attributes:
        sample_rate: Audio sample rate used to scale interval durations.
        multi_speaker: When true, every sub-directory of ``txt_grid_path`` is
            treated as one speaker.
        txt_grid_path: Directory holding the TextGrid files.
        hop_size: Hop size used to scale interval durations.
        output_durations_path: Directory receiving the ``*-durations.npy``
            files; it must already exist.
        dataset_path: Directory receiving the training file.
        training_file: Name of the training file inside ``dataset_path``.
    """

    sample_rate: int
    multi_speaker: bool
    txt_grid_path: str
    hop_size: int
    output_durations_path: str
    dataset_path: str
    training_file: str = "train.txt"

    # Both mappings are intentionally left unannotated so they stay plain class
    # attributes shared by all instances instead of becoming dataclass fields.
    #
    # The empty mark "" was the last interval of every file the original author
    # encountered, hence it maps to "END". parse_text_grid() fails when an empty
    # mark shows up anywhere else; if that is legitimate for your dataset, map
    # "" to silence ("SIL") instead.
    phones_mapper = {"sil": "SIL", "sp": "SIL", "spn": "SIL", "": "END"}
    sil_phones = set(phones_mapper.keys())

    def parse(self):
        """Parse every TextGrid and write the training file.

        With ``multi_speaker`` enabled, each sub-directory of
        ``txt_grid_path`` is parsed with its name as speaker name. If it is
        disabled, or no sub-directory exists, the files directly inside
        ``txt_grid_path`` are parsed with an empty speaker name.
        """
        speakers = []
        if self.multi_speaker:
            # os.listdir() returns entries in arbitrary order; sorting keeps
            # the generated training file reproducible across runs/machines.
            speakers = sorted(
                entry
                for entry in os.listdir(self.txt_grid_path)
                if os.path.isdir(os.path.join(self.txt_grid_path, entry))
            )
        data = []

        if speakers:
            for speaker in speakers:
                file_list = sorted(
                    os.listdir(os.path.join(self.txt_grid_path, speaker))
                )
                self.parse_text_grid(file_list, data, speaker)
        else:
            file_list = sorted(os.listdir(self.txt_grid_path))
            self.parse_text_grid(file_list, data, "")

        # Explicit encoding: phone symbols are not guaranteed to be ASCII and
        # the platform default encoding differs between systems.
        training_path = os.path.join(self.dataset_path, self.training_file)
        with open(training_path, "w", encoding="utf-8") as f:
            f.writelines(data)

    def parse_text_grid(self, file_list: list, data: list, speaker_name: str):
        """Parse the given TextGrid files of one speaker.

        Saves one ``<name>-durations.npy`` (int32) per file and appends one
        ``speaker/name|phones|speaker`` line per file to ``data``.

        Args:
            file_list: File names relative to ``txt_grid_path/speaker_name``.
            data: Output list, extended in place with training-file lines.
            speaker_name: Speaker sub-directory name; ``""`` for a flat
                layout (the line then starts with ``/`` and ends with ``|``).

        Raises:
            AssertionError: If an empty phone mark is not the last interval of
                a file, or if the number of phones and durations differ.
        """
        logging.info(
            f"\n Parse: {len(file_list)} files, speaker name: {speaker_name} \n"
        )
        # Loop-invariant factor applied to every interval duration.
        frames_per_unit = self.sample_rate / self.hop_size

        for f_name in tqdm(file_list):
            grid_path = os.path.join(self.txt_grid_path, speaker_name, f_name)
            text_grid = textgrid.TextGrid.fromFile(grid_path)
            # NOTE(review): tier 1 is presumably the phone tier (tier 0 the
            # word tier) as produced by MFA -- confirm for other aligners.
            pha = text_grid[1]
            last_index = len(pha.intervals) - 1
            durations = []
            phs = []
            for index, interval in enumerate(pha.intervals):
                mark = interval.mark

                if mark in self.sil_phones:
                    mark = self.phones_mapper[mark]
                    # Raised explicitly (not via `assert`) so the check also
                    # runs under `python -O`; the exception type is unchanged.
                    if mark == "END" and index != last_index:
                        raise AssertionError(
                            f"{grid_path}: empty phone mark at interval {index}, "
                            f"but it is only expected as the last interval "
                            f"({last_index}); fix the alignment or map '' to "
                            f"silence in phones_mapper"
                        )

                dur = interval.duration() * frames_per_unit
                durations.append(round(dur))
                phs.append(mark)

            full_ph = " ".join(phs)

            # Safety check: a mark containing a space (or a tier without any
            # interval) would desynchronise phones and durations downstream.
            n_phones = len(full_ph.split(" "))
            if n_phones != len(durations):
                raise AssertionError(
                    f"{grid_path}: {n_phones} phones but {len(durations)} durations"
                )

            base_name = f_name.split(".TextGrid")[0]
            np.save(
                os.path.join(self.output_durations_path, f"{base_name}-durations.npy"),
                np.array(durations).astype(np.int32),
                allow_pickle=False,
            )
            data.append(f"{speaker_name}/{base_name}|{full_ph}|{speaker_name}\n")
111

112

113
@click.command()
@click.option(
    "--yaml_path", default="examples/fastspeech2_libritts/conf/fastspeech2libritts.yaml"
)
@click.option("--dataset_path", default="dataset", type=str, help="Dataset directory")
@click.option("--text_grid_path", default="mfa/parsed", type=str)
@click.option("--output_durations_path", default="dataset/durations")
@click.option("--sample_rate", default=24000, type=int)
@click.option("--multi_speakers", default=1, type=int, help="Use multi-speaker version")
@click.option("--train_file", default="train.txt")
def main(
    yaml_path: str,
    dataset_path: str,
    text_grid_path: str,
    output_durations_path: str,
    sample_rate: int,
    multi_speakers: int,
    train_file: str,
):
    """Create the training file and duration arrays from TextGrid files.

    Reads ``hop_size`` from the YAML config at ``yaml_path``, creates
    ``output_durations_path`` if needed and runs ``TxtGridParser.parse()``
    with the given command-line options.
    """
    # safe_load: a bare yaml.load(file) without a Loader is deprecated since
    # PyYAML 5.1 and raises TypeError on PyYAML 6+; the config only needs
    # plain YAML types, so the safe loader is sufficient.
    with open(yaml_path, encoding="utf-8") as file:
        attrs = yaml.safe_load(file)
    hop_size = attrs["hop_size"]

    Path(output_durations_path).mkdir(parents=True, exist_ok=True)

    txt_grid_parser = TxtGridParser(
        sample_rate=sample_rate,
        multi_speaker=bool(multi_speakers),
        txt_grid_path=text_grid_path,
        hop_size=hop_size,
        output_durations_path=output_durations_path,
        training_file=train_file,
        dataset_path=dataset_path,
    )
    txt_grid_parser.parse()


if __name__ == "__main__":
    main()
153

154
Product

Resources

Company