Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
TensorSpeech
GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/examples/mfa_extraction/txt_grid_parser.py
1558 views
1
# -*- coding: utf-8 -*-
2
# Copyright 2020 TensorFlowTTS Team.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
# http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
"""Create training file and durations from textgrids."""
16
17
import os
18
from dataclasses import dataclass
19
from pathlib import Path
20
21
import click
22
import numpy as np
23
import textgrid
24
import yaml
25
from tqdm import tqdm
26
27
import logging
28
import sys
29
30
31
# Route every log record (DEBUG and above) to stdout, prefixed with the
# timestamp, originating module and line number.
logging.basicConfig(
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    stream=sys.stdout,
    level=logging.DEBUG,
)
36
37
38
@dataclass
class TxtGridParser:
    """Turn TextGrid alignment files into a training file and duration arrays.

    For every ``*.TextGrid`` file under ``txt_grid_path`` the tier at index 1
    (assumed to be the phone tier) is read, silence-like marks are normalised
    through ``phones_mapper``, and each interval length is converted to a
    rounded frame count via ``duration * sample_rate / hop_size``. The frame
    counts are saved as ``<name>-durations.npy`` and one
    ``speaker/name|phones|speaker`` line per file is written to the training
    file.

    Attributes:
        sample_rate: Audio sampling rate used to convert interval durations
            into frames.
        multi_speaker: When true, every sub-directory of ``txt_grid_path`` is
            treated as one speaker.
        txt_grid_path: Directory containing the TextGrid files (or the
            per-speaker sub-directories).
        hop_size: Hop size, in samples, of the feature frames.
        output_durations_path: Directory receiving the ``.npy`` duration files.
        dataset_path: Directory in which the training file is written.
        training_file: Name of the training file inside ``dataset_path``.
    """

    sample_rate: int
    multi_speaker: bool
    txt_grid_path: str
    hop_size: int
    output_durations_path: str
    dataset_path: str
    training_file: str = "train.txt"
    # The two names below are deliberately left unannotated so that they stay
    # class-level constants instead of becoming dataclass fields.
    # Silence-like marks collapse to "SIL". The empty mark maps to "END" and
    # is only accepted as the last interval of a tier; if your dataset has
    # empty marks elsewhere, map "" to "SIL" instead.
    phones_mapper = {"sil": "SIL", "sp": "SIL", "spn": "SIL", "": "END"}
    sil_phones = set(phones_mapper.keys())

    def parse(self):
        """Parse all TextGrid files and write the training file.

        In multi-speaker mode each sub-directory of ``txt_grid_path`` is one
        speaker; if none is found (or in single-speaker mode) the files
        directly inside ``txt_grid_path`` are parsed with an empty speaker
        name.
        """
        speakers = []
        if self.multi_speaker:
            # Sorted so the resulting training file is reproducible.
            speakers = sorted(
                entry
                for entry in os.listdir(self.txt_grid_path)
                if os.path.isdir(os.path.join(self.txt_grid_path, entry))
            )
        data = []

        if speakers:
            for speaker in speakers:
                file_list = os.listdir(os.path.join(self.txt_grid_path, speaker))
                self.parse_text_grid(file_list, data, speaker)
        else:
            file_list = os.listdir(self.txt_grid_path)
            self.parse_text_grid(file_list, data, "")

        training_path = os.path.join(self.dataset_path, self.training_file)
        # Explicit encoding: phone symbols are not guaranteed to be ASCII.
        with open(training_path, "w", encoding="utf-8") as f:
            f.writelines(data)

    def parse_text_grid(self, file_list: list, data: list, speaker_name: str):
        """Parse the TextGrid files of one speaker.

        Args:
            file_list: File names (not paths) found in the speaker directory.
                Entries that do not end with ``.TextGrid`` are skipped.
            data: Output list; one training line per parsed file is appended.
            speaker_name: Sub-directory / speaker label, ``""`` when the files
                live directly in ``txt_grid_path``.

        Raises:
            ValueError: If an empty mark occurs before the last interval, or
                if the number of phones and durations disagree (for example
                an empty tier or a mark containing a space).
        """
        # Only hand real TextGrid files to the parser; stray files such as
        # hidden OS metadata would otherwise abort the whole run.
        grid_files = sorted(
            name for name in file_list if name.endswith(".TextGrid")
        )
        skipped = len(file_list) - len(grid_files)
        if skipped:
            logging.warning(
                "Skipping %d non-TextGrid entries for speaker %r",
                skipped,
                speaker_name,
            )
        logging.info(
            f"\n Parse: {len(grid_files)} files, speaker name: {speaker_name} \n"
        )
        for f_name in tqdm(grid_files):
            text_grid = textgrid.TextGrid.fromFile(
                os.path.join(self.txt_grid_path, speaker_name, f_name)
            )
            pha = text_grid[1]
            phs, durations = self._phones_and_durations(pha.intervals, f_name)

            full_ph = " ".join(phs)

            # Safety check: the phone string must split back into exactly one
            # token per duration.
            if len(full_ph.split(" ")) != len(durations):
                raise ValueError(
                    f"{f_name}: number of phones does not match number of "
                    f"durations ({len(durations)})"
                )

            base_name = f_name.split(".TextGrid")[0]
            np.save(
                os.path.join(self.output_durations_path, f"{base_name}-durations.npy"),
                np.asarray(durations, dtype=np.int32),
                allow_pickle=False,
            )
            data.append(f"{speaker_name}/{base_name}|{full_ph}|{speaker_name}\n")

    def _phones_and_durations(self, intervals, f_name):
        """Return the mapped phones and rounded frame durations of one tier."""
        frames_per_time_unit = self.sample_rate / self.hop_size
        last_index = len(intervals) - 1
        phs = []
        durations = []
        for index, interval in enumerate(intervals):
            mark = interval.mark

            if mark in self.sil_phones:
                mark = self.phones_mapper[mark]
                # An empty mark is expected only as the trailing interval.
                if mark == "END" and index != last_index:
                    raise ValueError(
                        f"{f_name}: empty phone mark at interval {index} is "
                        "not the last interval; map '' to 'SIL' in "
                        "phones_mapper if this is expected"
                    )

            durations.append(round(interval.duration() * frames_per_time_unit))
            phs.append(mark)
        return phs, durations
111
112
113
@click.command()
@click.option(
    "--yaml_path", default="examples/fastspeech2_libritts/conf/fastspeech2libritts.yaml"
)
@click.option("--dataset_path", default="dataset", type=str, help="Dataset directory")
@click.option("--text_grid_path", default="mfa/parsed", type=str)
@click.option("--output_durations_path", default="dataset/durations")
@click.option("--sample_rate", default=24000, type=int)
@click.option("--multi_speakers", default=1, type=int, help="Use multi-speaker version")
@click.option("--train_file", default="train.txt")
def main(
    yaml_path: str,
    dataset_path: str,
    text_grid_path: str,
    output_durations_path: str,
    sample_rate: int,
    multi_speakers: int,
    train_file: str,
):
    """Create the training file and per-utterance durations from TextGrids.

    Reads ``hop_size`` from the YAML config at ``yaml_path``, makes sure
    ``output_durations_path`` exists and runs ``TxtGridParser.parse()``.
    """
    # safe_load: a bare yaml.load(file) raises TypeError on PyYAML >= 6
    # (Loader is mandatory) and can construct arbitrary objects on older
    # versions; the config only needs plain scalars and mappings.
    with open(yaml_path, encoding="utf-8") as file:
        attrs = yaml.safe_load(file)
    hop_size = attrs["hop_size"]

    Path(output_durations_path).mkdir(parents=True, exist_ok=True)

    txt_grid_parser = TxtGridParser(
        sample_rate=sample_rate,
        multi_speaker=bool(multi_speakers),
        txt_grid_path=text_grid_path,
        hop_size=hop_size,
        output_durations_path=output_durations_path,
        training_file=train_file,
        dataset_path=dataset_path,
    )
    txt_grid_parser.parse()
149
150
151
# Invoke the click command only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
153
154