GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/tensorflow_tts/utils/korean.py
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team, Jaehyoung Kim(@crux153) and Taehoon Kim(@carpedm20)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow
"""Korean related helpers."""

import ast
import re

# j2hcj converts a jamo character to Hangul Compatibility Jamo as a string
# (jamo_to_hcj returns a generator in the jamo package).
from jamo import h2j, hangul_to_jamo, j2h, j2hcj

etc_dictionary = {
    "2 30대": "이삼십대",
    "20~30대": "이삼십대",
    "20, 30대": "이십대 삼십대",
    "1+1": "원플러스원",
    "3에서 6개월인": "3개월에서 육개월인",
}

english_dictionary = {
    "Devsisters": "데브시스터즈",
    "track": "트랙",
    # krbook
    "LA": "엘에이",
    "LG": "엘지",
    "KOREA": "코리아",
    "JSA": "제이에스에이",
    "PGA": "피지에이",
    "GA": "지에이",
    "idol": "아이돌",
    "KTX": "케이티엑스",
    "AC": "에이씨",
    "DVD": "디비디",
    "US": "유에스",
    "CNN": "씨엔엔",
    "LPGA": "엘피지에이",
    "P": "피",
    "L": "엘",
    "T": "티",
    "B": "비",
    "C": "씨",
    "BIFF": "비아이에프에프",
    "GV": "지비",
    # JTBC
    "IT": "아이티",
    "IQ": "아이큐",
    "JTBC": "제이티비씨",
    "trickle down effect": "트리클 다운 이펙트",
    "trickle up effect": "트리클 업 이펙트",
    "down": "다운",
    "up": "업",
    "FCK": "에프씨케이",
    "AP": "에이피",
    "WHERETHEWILDTHINGSARE": "",
    "Rashomon Effect": "",
    "O": "오",
    "OO": "오오",
    "GDP": "지디피",
    "CIPA": "씨아이피에이",
    "YS": "와이에스",
    "Y": "와이",
    "S": "에스",
    "PC": "피씨",
    "bill": "빌",
    "Halmuny": "하모니",  #####
    "X": "엑스",
    "SNS": "에스엔에스",
    "ability": "어빌리티",
    "shy": "",
    "CCTV": "씨씨티비",
    "the tenth man": "더 텐쓰 맨",  ####
    "YSDJJPMB": "",  ########
    "Content Attitude Timing": "컨텐트 애티튜드 타이밍",
    "CAT": "캣",
    "IS": "아이에스",
    "K": "케이",
    "KDI": "케이디아이",
    "DOC": "디오씨",
    "CIA": "씨아이에이",
    "PBS": "피비에스",
    "D": "디",
    # NOTE: adjacent string literals are concatenated, so this key is
    # "PPropertyPositionPowerPrisonPS" (kept as in the upstream source).
    "PPropertyPositionPowerPrisonP" "S": "에스",
    "francisco": "프란시스코",
    "I": "아이",
    "III": "아이아이",  ######
    "No joke": "노 조크",
    "BBK": "비비케이",
    # "Don't worry be happy" arrives split at the apostrophe
    "Don": "",
    "t worry be happy": " 워리 비 해피",
    "NO": "엔오",  #####
    "it was our sky": "잇 워즈 아워 스카이",
    "it is our sky": "잇 이즈 아워 스카이",  ####
    "NEIS": "엔이아이에스",  #####
    "IMF": "아이엠에프",
    "apology": "어폴로지",
    "humble": "험블",
    "M": "엠",
    "Nowhere Man": "노웨어 맨",
    "The Tenth Man": "더 텐쓰 맨",
    "BBC": "비비씨",
    "MRJ": "엠알제이",
    "Pick me up": "픽 미 업",
    "DNA": "디엔에이",
    "UN": "유엔",
    "STOP": "스탑",  #####
    "PRESS": "프레스",  #####
    "not to be": "낫 투비",
    "Denial": "디나이얼",
    "G": "지",
    "Time flies like an arrow": "타임 플라이즈 라이크 언 애로우",
    "DDT": "디디티",
    "AI": "에이아이",
    "Z": "제트",
    "OECD": "오이씨디",
    "N": "앤",
    "A": "에이",
    "MB": "엠비",
    "EH": "이에이치",
    "TV": "티비",
    "MIT": "엠아이티",
    "KBO": "케이비오",
    "I love America": "아이 러브 아메리카",
    "SF": "에스에프",
    "Q": "큐",
    "KFX": "케이에프엑스",
    "PM": "피엠",
    "Prime Minister": "프라임 미니스터",
    "Swordline": "스워드라인",
    "TBS": "티비에스",
    "CS": "씨에스",
    "Reflecting Absence": "리플렉팅 앱센스",
    "Drum being beaten by everyone": "드럼 빙 비튼 바이 에브리원",
    "negative pressure": "네거티브 프레셔",
    "F": "에프",
    "KIA": "기아",
    "FTA": "에프티에이",
    "Que sais-je": "",
    "UFC": "유에프씨",
    "DJ": "디제이",
    "Chaebol": "채벌",
    "BC": "삐씨",
    "KY": "케이와이",
    "CEO": "씨이오",
    "YH": "와이에치",
    "who are you": "후 얼 유",
    "The Devils Advocate": "더 데빌즈 어드보카트",
    "so sorry": "쏘 쏘리",
    "Santa": "산타",
    "Big Endian": "빅 엔디안",
    "Small Endian": "스몰 엔디안",
    "Oh Captain My Captain": "오 캡틴 마이 캡틴",
    "AIB": "에이아이비",
    # IU
    "ASMR": "에이에스엠알",
    "V": "브이",
    "PD": "피디",
    "CD": "씨디",
    "ANR": "에이엔알",
    "Twenty Three": "투엔티 쓰리",
    "Through The Night": "쓰루 더 나잇",
    "MD": "엠디",
}

num_to_kor = {
    "0": "영",
    "1": "일",
    "2": "이",
    "3": "삼",
    "4": "사",
    "5": "오",
    "6": "육",
    "7": "칠",
    "8": "팔",
    "9": "구",
}

unit_to_kor1 = {"%": "퍼센트", "cm": "센치미터", "mm": "밀리미터", "km": "킬로미터", "kg": "킬로그람"}
unit_to_kor2 = {"m": "미터"}

upper_to_kor = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}


"""
251
초성과 종성은 같아보이지만, 다른 character이다.
252
253
'_-!'(),-.:;? ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ~'
254
255
'_': 0, '-': 7, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 8, ':': 9, ';': 10,
256
'?': 11, ' ': 12, 'ᄀ': 13, 'ᄁ': 14, 'ᄂ': 15, 'ᄃ': 16, 'ᄄ': 17, 'ᄅ': 18, 'ᄆ': 19, 'ᄇ': 20,
257
'ᄈ': 21, 'ᄉ': 22, 'ᄊ': 23, 'ᄋ': 24, 'ᄌ': 25, 'ᄍ': 26, 'ᄎ': 27, 'ᄏ': 28, 'ᄐ': 29, 'ᄑ': 30,
258
'ᄒ': 31, 'ᅡ': 32, 'ᅢ': 33, 'ᅣ': 34, 'ᅤ': 35, 'ᅥ': 36, 'ᅦ': 37, 'ᅧ': 38, 'ᅨ': 39, 'ᅩ': 40,
259
'ᅪ': 41, 'ᅫ': 42, 'ᅬ': 43, 'ᅭ': 44, 'ᅮ': 45, 'ᅯ': 46, 'ᅰ': 47, 'ᅱ': 48, 'ᅲ': 49, 'ᅳ': 50,
260
'ᅴ': 51, 'ᅵ': 52, 'ᆨ': 53, 'ᆩ': 54, 'ᆪ': 55, 'ᆫ': 56, 'ᆬ': 57, 'ᆭ': 58, 'ᆮ': 59, 'ᆯ': 60,
261
'ᆰ': 61, 'ᆱ': 62, 'ᆲ': 63, 'ᆳ': 64, 'ᆴ': 65, 'ᆵ': 66, 'ᆶ': 67, 'ᆷ': 68, 'ᆸ': 69, 'ᆹ': 70,
262
'ᆺ': 71, 'ᆻ': 72, 'ᆼ': 73, 'ᆽ': 74, 'ᆾ': 75, 'ᆿ': 76, 'ᇀ': 77, 'ᇁ': 78, 'ᇂ': 79, '~': 80
263
"""
264
265
_pad = "pad"
266
_eos = "eos"
267
_punctuation = "!'(),-.:;? "
268
_special = "-"
269
270
_jamo_leads = [chr(_) for _ in range(0x1100, 0x1113)]
271
_jamo_vowels = [chr(_) for _ in range(0x1161, 0x1176)]
272
_jamo_tails = [chr(_) for _ in range(0x11A8, 0x11C3)]
273
274
_letters = _jamo_leads + _jamo_vowels + _jamo_tails
275
276
symbols = [_pad] + list(_special) + list(_punctuation) + _letters + [_eos]
277
278
_symbol_to_id = {c: i for i, c in enumerate(symbols)}
279
_id_to_symbol = {i: c for i, c in enumerate(symbols)}
280
281
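# Illustrative examples (not part of the upstream file) of the id assignment
# above. "-" occurs in both _special and _punctuation, so only its later
# index survives in _symbol_to_id, matching the table in the docstring:
#   _symbol_to_id["pad"]  # -> 0
#   _symbol_to_id["-"]    # -> 7
#   _symbol_to_id["ᄀ"]   # -> 13
#   _id_to_symbol[len(symbols) - 1]  # -> "eos"
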
quote_checker = """([`"'＂“‘])(.+?)([`"'＂”’])"""


def is_lead(char):
    return char in _jamo_leads


def is_vowel(char):
    return char in _jamo_vowels


def is_tail(char):
    return char in _jamo_tails


def get_mode(char):
    if is_lead(char):
        return 0
    elif is_vowel(char):
        return 1
    elif is_tail(char):
        return 2
    else:
        return -1


def _get_text_from_candidates(candidates):
    if len(candidates) == 0:
        return ""
    elif len(candidates) == 1:
        # j2hcj returns a string; jamo_to_hcj would return a generator here
        return j2hcj(candidates[0])
    else:
        return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))


def jamo_to_korean(text):
    text = h2j(text)

    idx = 0
    new_text = ""
    candidates = []

    while True:
        if idx >= len(text):
            new_text += _get_text_from_candidates(candidates)
            break

        char = text[idx]
        mode = get_mode(char)

        if mode == 0:  # lead consonant: flush the pending syllable, start a new one
            new_text += _get_text_from_candidates(candidates)
            candidates = [char]
        elif mode == -1:  # not a jamo: flush and copy the character through
            new_text += _get_text_from_candidates(candidates)
            new_text += char
            candidates = []
        else:  # vowel or tail: extend the current syllable
            candidates.append(char)

        idx += 1
    return new_text


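# Illustrative example (not from the upstream file): jamo_to_korean
# recomposes decomposed jamo back into syllables, e.g.
#   jamo_to_korean("".join(hangul_to_jamo("한국어")))  # -> "한국어"

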
def compare_sentence_with_jamo(text1, text2):
    # True if the two sentences differ after jamo decomposition.
    return h2j(text1) != h2j(text2)


def tokenize(text, as_id=False):
    # Split a Hangul string into lead/vowel/tail jamo using hangul_to_jamo
    # from the jamo package.
    text = normalize(text)
    tokens = list(
        hangul_to_jamo(text)
    )  # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ']

    if as_id:
        return [_symbol_to_id[token] for token in tokens]
    else:
        return tokens


def tokenizer_fn(iterator):
    return (token for x in iterator for token in tokenize(x, as_id=False))


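# Illustrative example (not from the upstream file), using the id table above:
#   tokenize("이십", as_id=False)  # -> ['ᄋ', 'ᅵ', 'ᄉ', 'ᅵ', 'ᆸ']
#   tokenize("이십", as_id=True)   # -> [24, 52, 22, 52, 69]

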
def normalize(text):
    text = text.strip()

    # Drop parenthesized day counts and parenthesized hanja runs.
    text = re.sub(r"\(\d+일\)", "", text)
    text = re.sub(r"\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)", "", text)

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = re.sub("[a-zA-Z]+", normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text


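# Illustrative example (not from the upstream file) of the full pipeline:
#   normalize("20~30대 CEO가 왔다")  # -> "이삼십대 씨이오가 왔다"
#   ("20~30대" via etc_dictionary, "CEO" via english_dictionary)

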
def normalize_with_dictionary(text, dic):
    if any(key in text for key in dic.keys()):
        pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
        return pattern.sub(lambda x: dic[x.group()], text)
    else:
        return text


def normalize_english(text):
    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        else:
            return word

    # NOTE: the pattern only matches contiguous letter runs, so multi-word
    # dictionary keys (e.g. "trickle down effect") are never substituted here.
    text = re.sub("([A-Za-z]+)", fn, text)
    return text


def normalize_upper(text):
    text = text.group(0)

    if all(char.isupper() for char in text):
        return "".join(upper_to_kor[char] for char in text)
    else:
        return text


def normalize_quote(text):
    def fn(found_text):
        # NLTK doesn't play well with multiprocessing, so import lazily here.
        from nltk import sent_tokenize

        found_text = found_text.group()
        unquoted_text = found_text[1:-1]

        sentences = sent_tokenize(unquoted_text)
        return " ".join(["'{}'".format(sent) for sent in sentences])

    return re.sub(quote_checker, fn, text)


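# Illustrative example (not from the upstream file; requires NLTK's punkt
# sentence tokenizer data): each sentence inside a quote is re-quoted.
#   normalize_quote('그가 "원. 투." 라고 했다')  # -> 그가 '원.' '투.' 라고 했다

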
number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"
count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"


def normalize_number(text):
    text = normalize_with_dictionary(text, unit_to_kor1)
    text = normalize_with_dictionary(text, unit_to_kor2)
    text = re.sub(
        number_checker + count_checker, lambda x: number_to_korean(x, True), text
    )
    text = re.sub(number_checker, lambda x: number_to_korean(x, False), text)
    return text


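# Illustrative examples (not from the upstream file):
#   normalize_number("3명")     # -> "세명"     (counting form)
#   normalize_number("20마리")  # -> "스물마리"  (via count_tenth_dict)
#   normalize_number("100원")   # -> "백원"     (leading "일" dropped)

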
num_to_kor1 = [""] + list("일이삼사오육칠팔구")
num_to_kor2 = [""] + list("만억조경해")
num_to_kor3 = [""] + list("십백천")

# count_to_kor1 = [""] + ["하나", "둘", "셋", "넷", "다섯", "여섯", "일곱", "여덟", "아홉"]
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}


def number_to_korean(num_str, is_count=False):
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""

    num_str = num_str.replace(",", "")
    num = ast.literal_eval(num_str)

    if num == 0:
        return "영"

    check_float = num_str.split(".")
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` cannot be used with a float number")

    digit = int(digit_str)

    if digit_str.startswith("-"):
        digit, digit_str = abs(digit), str(abs(digit))

    kor = ""
    size = len(str(digit))
    tmp = []

    # Read digits left to right: 십/백/천 mark positions inside each 4-digit
    # group, and 만/억/조/경/해 are appended at each group boundary.
    for i, v in enumerate(digit_str, start=1):
        v = int(v)

        if v != 0:
            if is_count:
                tmp += count_to_kor1[v]
            else:
                tmp += num_to_kor1[v]

            tmp += num_to_kor3[(size - i) % 4]

        if (size - i) % 4 == 0 and len(tmp) != 0:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[int((size - i) / 4)]

    if is_count:
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        # Rewrite counting tens: e.g. "두십" -> "스물", "세십" -> "서른".
        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                "|".join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()],
                kor,
            )

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]  # drop the leading "일": e.g. "일만" -> "만"

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub(r"\d", lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str
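

# Illustrative examples (not from the upstream file) of plain readings,
# exercised through normalize_number:
#   normalize_number("12,345")  # -> "만이천삼백사십오"
#   normalize_number("-1.5")    # -> "마이너스 일쩜 오"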