# Path: blob/master/tensorflow_tts/utils/korean.py
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team, Jaehyoung Kim(@crux153) and Taehoon Kim(@carpedm20)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow
"""Korean related helpers."""

import ast
import json
import os
import re

from jamo import h2j, hangul_to_jamo, j2h, jamo_to_hcj

# Ad-hoc phrase replacements applied before any other normalization.
etc_dictionary = {
    "2 30대": "이삼십대",
    "20~30대": "이삼십대",
    "20, 30대": "이십대 삼십대",
    "1+1": "원플러스원",
    "3에서 6개월인": "3개월에서 육개월인",
}

# English word -> Korean pronunciation spelling.
# NOTE(review): many keys are repeated ("JTBC", "PBS", "IMF", "B", ...);
# Python keeps the LAST occurrence, so e.g. "B" ultimately maps to "씨",
# not "비".  Also note the implicit string concatenation in the
# "PPropertyPositionPowerPrisonP" "S" key.  Both quirks are kept verbatim
# to preserve upstream behavior.
english_dictionary = {
    "Devsisters": "데브시스터즈",
    "track": "트랙",
    # krbook
    "LA": "엘에이",
    "LG": "엘지",
    "KOREA": "코리아",
    "JSA": "제이에스에이",
    "PGA": "피지에이",
    "GA": "지에이",
    "idol": "아이돌",
    "KTX": "케이티엑스",
    "AC": "에이씨",
    "DVD": "디비디",
    "US": "유에스",
    "CNN": "씨엔엔",
    "LPGA": "엘피지에이",
    "P": "피",
    "L": "엘",
    "T": "티",
    "B": "비",
    "C": "씨",
    "BIFF": "비아이에프에프",
    "GV": "지비",
    # JTBC
    "IT": "아이티",
    "IQ": "아이큐",
    "JTBC": "제이티비씨",
    "trickle down effect": "트리클 다운 이펙트",
    "trickle up effect": "트리클 업 이펙트",
    "down": "다운",
    "up": "업",
    "FCK": "에프씨케이",
    "AP": "에이피",
    "WHERETHEWILDTHINGSARE": "",
    "Rashomon Effect": "",
    "O": "오",
    "OO": "오오",
    "B": "비",
    "GDP": "지디피",
    "CIPA": "씨아이피에이",
    "YS": "와이에스",
    "Y": "와이",
    "S": "에스",
    "JTBC": "제이티비씨",
    "PC": "피씨",
    "bill": "빌",
    "Halmuny": "하모니",  #####
    "X": "엑스",
    "SNS": "에스엔에스",
    "ability": "어빌리티",
    "shy": "",
    "CCTV": "씨씨티비",
    "IT": "아이티",
    "the tenth man": "더 텐쓰 맨",  ####
    "L": "엘",
    "PC": "피씨",
    "YSDJJPMB": "",  ########
    "Content Attitude Timing": "컨텐트 애티튜드 타이밍",
    "CAT": "캣",
    "IS": "아이에스",
    "K": "케이",
    "Y": "와이",
    "KDI": "케이디아이",
    "DOC": "디오씨",
    "CIA": "씨아이에이",
    "PBS": "피비에스",
    "D": "디",
    "PPropertyPositionPowerPrisonP" "S": "에스",
    "francisco": "프란시스코",
    "I": "아이",
    "III": "아이아이",  ######
    "No joke": "노 조크",
    "BBK": "비비케이",
    "LA": "엘에이",
    "Don": "",
    "t worry be happy": " 워리 비 해피",
    "NO": "엔오",  #####
    "it was our sky": "잇 워즈 아워 스카이",
    "it is our sky": "잇 이즈 아워 스카이",  ####
    "NEIS": "엔이아이에스",  #####
    "IMF": "아이엠에프",
    "apology": "어폴로지",
    "humble": "험블",
    "M": "엠",
    "Nowhere Man": "노웨어 맨",
    "The Tenth Man": "더 텐쓰 맨",
    "PBS": "피비에스",
    "BBC": "비비씨",
    "MRJ": "엠알제이",
    "CCTV": "씨씨티비",
    "Pick me up": "픽 미 업",
    "DNA": "디엔에이",
    "UN": "유엔",
    "STOP": "스탑",  #####
    "PRESS": "프레스",  #####
    "not to be": "낫 투비",
    "Denial": "디나이얼",
    "G": "지",
    "IMF": "아이엠에프",
    "GDP": "지디피",
    "JTBC": "제이티비씨",
    "Time flies like an arrow": "타임 플라이즈 라이크 언 애로우",
    "DDT": "디디티",
    "AI": "에이아이",
    "Z": "제트",
    "OECD": "오이씨디",
    "N": "앤",
    "A": "에이",
    "MB": "엠비",
    "EH": "이에이치",
    "IS": "아이에스",
    "TV": "티비",
    "MIT": "엠아이티",
    "KBO": "케이비오",
    "I love America": "아이 러브 아메리카",
    "SF": "에스에프",
    "Q": "큐",
    "KFX": "케이에프엑스",
    "PM": "피엠",
    "Prime Minister": "프라임 미니스터",
    "Swordline": "스워드라인",
    "TBS": "티비에스",
    "DDT": "디디티",
    "CS": "씨에스",
    "Reflecting Absence": "리플렉팅 앱센스",
    "PBS": "피비에스",
    "Drum being beaten by everyone": "드럼 빙 비튼 바이 에브리원",
    "negative pressure": "네거티브 프레셔",
    "F": "에프",
    "KIA": "기아",
    "FTA": "에프티에이",
    "Que sais-je": "",
    "UFC": "유에프씨",
    "P": "피",
    "DJ": "디제이",
    "Chaebol": "채벌",
    "BBC": "비비씨",
    "OECD": "오이씨디",
    "BC": "삐씨",
    "C": "씨",
    "B": "씨",
    "KY": "케이와이",
    "K": "케이",
    "CEO": "씨이오",
    "YH": "와이에치",
    "IS": "아이에스",
    "who are you": "후 얼 유",
    "Y": "와이",
    "The Devils Advocate": "더 데빌즈 어드보카트",
    "YS": "와이에스",
    "so sorry": "쏘 쏘리",
    "Santa": "산타",
    "Big Endian": "빅 엔디안",
    "Small Endian": "스몰 엔디안",
    "Oh Captain My Captain": "오 캡틴 마이 캡틴",
    "AIB": "에이아이비",
    "K": "케이",
    "PBS": "피비에스",
    # IU
    "ASMR": "에이에스엠알",
    "V": "브이",
    "PD": "피디",
    "CD": "씨디",
    "ANR": "에이엔알",
    "Twenty Three": "투엔티 쓰리",
    "Through The Night": "쓰루 더 나잇",
    "MD": "엠디",
}

# Sino-Korean digit names (used for digits after the decimal point).
num_to_kor = {
    "0": "영",
    "1": "일",
    "2": "이",
    "3": "삼",
    "4": "사",
    "5": "오",
    "6": "육",
    "7": "칠",
    "8": "팔",
    "9": "구",
}

# Measurement units spelled out in Korean.  "m" is handled separately so the
# multi-letter units ("cm", "mm", "km") are replaced first.
unit_to_kor1 = {"%": "퍼센트", "cm": "센치미터", "mm": "밀리미터", "km": "킬로미터", "kg": "킬로그람"}
unit_to_kor2 = {"m": "미터"}

# Single uppercase Latin letter -> Korean letter name.
upper_to_kor = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}


"""
Lead (초성) and tail (종성) consonants look identical when rendered, but they
are distinct Unicode characters.

'_-!'(),-.:;? ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ~'

'_': 0, '-': 7, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 8, ':': 9, ';': 10,
'?': 11, ' ': 12, 'ᄀ': 13, 'ᄁ': 14, 'ᄂ': 15, 'ᄃ': 16, 'ᄄ': 17, 'ᄅ': 18, 'ᄆ': 19, 'ᄇ': 20,
'ᄈ': 21, 'ᄉ': 22, 'ᄊ': 23, 'ᄋ': 24, 'ᄌ': 25, 'ᄍ': 26, 'ᄎ': 27, 'ᄏ': 28, 'ᄐ': 29, 'ᄑ': 30,
'ᄒ': 31, 'ᅡ': 32, 'ᅢ': 33, 'ᅣ': 34, 'ᅤ': 35, 'ᅥ': 36, 'ᅦ': 37, 'ᅧ': 38, 'ᅨ': 39, 'ᅩ': 40,
'ᅪ': 41, 'ᅫ': 42, 'ᅬ': 43, 'ᅭ': 44, 'ᅮ': 45, 'ᅯ': 46, 'ᅰ': 47, 'ᅱ': 48, 'ᅲ': 49, 'ᅳ': 50,
'ᅴ': 51, 'ᅵ': 52, 'ᆨ': 53, 'ᆩ': 54, 'ᆪ': 55, 'ᆫ': 56, 'ᆬ': 57, 'ᆭ': 58, 'ᆮ': 59, 'ᆯ': 60,
'ᆰ': 61, 'ᆱ': 62, 'ᆲ': 63, 'ᆳ': 64, 'ᆴ': 65, 'ᆵ': 66, 'ᆶ': 67, 'ᆷ': 68, 'ᆸ': 69, 'ᆹ': 70,
'ᆺ': 71, 'ᆻ': 72, 'ᆼ': 73, 'ᆽ': 74, 'ᆾ': 75, 'ᆿ': 76, 'ᇀ': 77, 'ᇁ': 78, 'ᇂ': 79, '~': 80
"""

_pad = "pad"
_eos = "eos"
_punctuation = "!'(),-.:;? "
_special = "-"

# Unicode ranges for conjoining jamo: leads U+1100-1112, vowels U+1161-1175,
# tails U+11A8-11C2.
_jamo_leads = [chr(_) for _ in range(0x1100, 0x1113)]
_jamo_vowels = [chr(_) for _ in range(0x1161, 0x1176)]
_jamo_tails = [chr(_) for _ in range(0x11A8, 0x11C3)]

_letters = _jamo_leads + _jamo_vowels + _jamo_tails

# Full symbol inventory used by the TTS frontend.
symbols = [_pad] + list(_special) + list(_punctuation) + _letters + [_eos]

_symbol_to_id = {c: i for i, c in enumerate(symbols)}
_id_to_symbol = {i: c for i, c in enumerate(symbols)}

# Matches a quoted span (various ASCII/Unicode quote characters).
quote_checker = """([`"'"“‘])(.+?)([`"'"”’])"""


def is_lead(char):
    """Return True if `char` is a lead consonant (초성) jamo."""
    return char in _jamo_leads


def is_vowel(char):
    """Return True if `char` is a vowel (중성) jamo."""
    return char in _jamo_vowels


def is_tail(char):
    """Return True if `char` is a tail consonant (종성) jamo."""
    return char in _jamo_tails


def get_mode(char):
    """Classify a jamo: 0=lead, 1=vowel, 2=tail, -1=not a jamo."""
    if is_lead(char):
        return 0
    elif is_vowel(char):
        return 1
    elif is_tail(char):
        return 2
    else:
        return -1


def _get_text_from_candidates(candidates):
    """Join up to three accumulated jamo ([lead, vowel, tail]) into a syllable."""
    if len(candidates) == 0:
        return ""
    elif len(candidates) == 1:
        return jamo_to_hcj(candidates[0])
    else:
        return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))


def jamo_to_korean(text):
    """Re-compose a jamo sequence back into Hangul syllables.

    Non-jamo characters are passed through unchanged.
    """
    text = h2j(text)

    idx = 0
    new_text = ""
    candidates = []

    while True:
        if idx >= len(text):
            new_text += _get_text_from_candidates(candidates)
            break

        char = text[idx]
        mode = get_mode(char)

        if mode == 0:
            # A new lead consonant starts a new syllable: flush what we have.
            new_text += _get_text_from_candidates(candidates)
            candidates = [char]
        elif mode == -1:
            # Non-jamo character: flush and emit it verbatim.
            new_text += _get_text_from_candidates(candidates)
            new_text += char
            candidates = []
        else:
            candidates.append(char)

        idx += 1
    return new_text


def compare_sentence_with_jamo(text1, text2):
    """Return True if the two texts DIFFER after jamo decomposition."""
    return h2j(text1) != h2j(text2)


def tokenize(text, as_id=False):
    """Normalize `text` and split Hangul into lead/vowel/tail jamo tokens.

    Uses `hangul_to_jamo` from the jamo package.  With as_id=True, tokens are
    mapped through `_symbol_to_id`.
    """
    text = normalize(text)
    tokens = list(
        hangul_to_jamo(text)
    )  # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~']

    if as_id:
        return [_symbol_to_id[token] for token in tokens]
    else:
        return [token for token in tokens]


def tokenizer_fn(iterator):
    """Yield jamo tokens for every string in `iterator`."""
    return (token for x in iterator for token in tokenize(x, as_id=False))


def normalize(text):
    """Full text-normalization pipeline for Korean TTS input."""
    text = text.strip()

    # Drop parenthesized day-of-month markers and CJK-only parentheticals.
    text = re.sub(r"\(\d+일\)", "", text)
    text = re.sub(r"\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)", "", text)

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = re.sub(r"[a-zA-Z]+", normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text


def normalize_with_dictionary(text, dic):
    """Replace every occurrence of `dic`'s keys in `text` with their values."""
    if any(key in text for key in dic.keys()):
        pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
        return pattern.sub(lambda x: dic[x.group()], text)
    else:
        return text


def normalize_english(text):
    """Replace whole English words found in `english_dictionary`."""

    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        else:
            return word

    text = re.sub("([A-Za-z]+)", fn, text)
    return text


def normalize_upper(text):
    """re.sub callback: spell out all-uppercase runs letter by letter."""
    text = text.group(0)

    if all([char.isupper() for char in text]):
        return "".join(upper_to_kor[char] for char in text)
    else:
        return text


def normalize_quote(text):
    """Split quoted spans into sentences, each re-wrapped in single quotes."""

    def fn(found_text):
        from nltk import sent_tokenize  # NLTK doesn't along with multiprocessing

        found_text = found_text.group()
        unquoted_text = found_text[1:-1]

        sentences = sent_tokenize(unquoted_text)
        return " ".join(["'{}'".format(sent) for sent in sentences])

    return re.sub(quote_checker, fn, text)


# A signed integer (with optional thousands commas) and optional decimals.
number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"
# Korean counter words that trigger native-Korean numerals.
count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"


def normalize_number(text):
    """Spell out units and numbers (with/without counters) in Korean."""
    text = normalize_with_dictionary(text, unit_to_kor1)
    text = normalize_with_dictionary(text, unit_to_kor2)
    text = re.sub(
        number_checker + count_checker, lambda x: number_to_korean(x, True), text
    )
    text = re.sub(number_checker, lambda x: number_to_korean(x, False), text)
    return text


num_to_kor1 = [""] + list("일이삼사오육칠팔구")
num_to_kor2 = [""] + list("만억조경해")
num_to_kor3 = [""] + list("십백천")

# count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"]
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

# Irregular native-Korean tens ("두십" -> "스물", ...).
count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}


def number_to_korean(num_str, is_count=False):
    """Spell out a regex-matched number in Korean.

    Args:
        num_str: an `re.Match` produced by `number_checker` (plus
            `count_checker` when `is_count` is True).
        is_count: use native-Korean counting numerals and append the
            matched counter word.

    Returns:
        Korean spelling of the number, followed by the counter word (if any).

    Raises:
        Exception: on malformed numbers (multiple dots) or a float used
            with a counter word.
    """
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""

    num_str = num_str.replace(",", "")
    num = ast.literal_eval(num_str)

    if num == 0:
        # BUGFIX: keep the counter word ("0개" -> "영개"); previously the
        # unit was silently dropped.
        return "영" + unit_str

    check_float = num_str.split(".")
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")

    digit = int(digit_str)

    if digit_str.startswith("-"):
        digit, digit_str = abs(digit), str(abs(digit))

    kor = ""
    size = len(str(digit))
    tmp = []

    # Walk digits left to right; flush every 4 digits with the myriad unit
    # (만, 억, ...) from num_to_kor2.
    for i, v in enumerate(digit_str, start=1):
        v = int(v)

        if v != 0:
            if is_count:
                tmp += count_to_kor1[v]
            else:
                tmp += num_to_kor1[v]

            tmp += num_to_kor3[(size - i) % 4]

        if (size - i) % 4 == 0 and len(tmp) != 0:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[int((size - i) / 4)]

    if is_count:
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                "|".join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()],
                kor,
            )

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub(r"\d", lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str