# Path: blob/master/tensorflow_tts/utils/korean.py
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team, Jaehyoung Kim(@crux153) and Taehoon Kim(@carpedm20)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow
"""Korean related helpers."""

import ast
import json
import os
import re

from jamo import h2j, hangul_to_jamo, j2h, jamo_to_hcj

# Ad-hoc phrase replacements applied before any other normalization.
etc_dictionary = {
    "2 30대": "이삼십대",
    "20~30대": "이삼십대",
    "20, 30대": "이십대 삼십대",
    "1+1": "원플러스원",
    "3에서 6개월인": "3개월에서 육개월인",
}

# English word -> Korean pronunciation spelling.
# NOTE(review): many keys are repeated ("JTBC", "PBS", "IMF", "B", ...);
# Python keeps the LAST occurrence, so e.g. "B" ultimately maps to "씨",
# not "비".  Also note the implicit string concatenation in the
# "PPropertyPositionPowerPrisonP" "S" key.  Both quirks are kept verbatim
# to preserve upstream behavior.
english_dictionary = {
    "Devsisters": "데브시스터즈",
    "track": "트랙",
    # krbook
    "LA": "엘에이",
    "LG": "엘지",
    "KOREA": "코리아",
    "JSA": "제이에스에이",
    "PGA": "피지에이",
    "GA": "지에이",
    "idol": "아이돌",
    "KTX": "케이티엑스",
    "AC": "에이씨",
    "DVD": "디비디",
    "US": "유에스",
    "CNN": "씨엔엔",
    "LPGA": "엘피지에이",
    "P": "피",
    "L": "엘",
    "T": "티",
    "B": "비",
    "C": "씨",
    "BIFF": "비아이에프에프",
    "GV": "지비",
    # JTBC
    "IT": "아이티",
    "IQ": "아이큐",
    "JTBC": "제이티비씨",
    "trickle down effect": "트리클 다운 이펙트",
    "trickle up effect": "트리클 업 이펙트",
    "down": "다운",
    "up": "업",
    "FCK": "에프씨케이",
    "AP": "에이피",
    "WHERETHEWILDTHINGSARE": "",
    "Rashomon Effect": "",
    "O": "오",
    "OO": "오오",
    "B": "비",
    "GDP": "지디피",
    "CIPA": "씨아이피에이",
    "YS": "와이에스",
    "Y": "와이",
    "S": "에스",
    "JTBC": "제이티비씨",
    "PC": "피씨",
    "bill": "빌",
    "Halmuny": "하모니",  #####
    "X": "엑스",
    "SNS": "에스엔에스",
    "ability": "어빌리티",
    "shy": "",
    "CCTV": "씨씨티비",
    "IT": "아이티",
    "the tenth man": "더 텐쓰 맨",  ####
    "L": "엘",
    "PC": "피씨",
    "YSDJJPMB": "",  ########
    "Content Attitude Timing": "컨텐트 애티튜드 타이밍",
    "CAT": "캣",
    "IS": "아이에스",
    "K": "케이",
    "Y": "와이",
    "KDI": "케이디아이",
    "DOC": "디오씨",
    "CIA": "씨아이에이",
    "PBS": "피비에스",
    "D": "디",
    "PPropertyPositionPowerPrisonP" "S": "에스",
    "francisco": "프란시스코",
    "I": "아이",
    "III": "아이아이",  ######
    "No joke": "노 조크",
    "BBK": "비비케이",
    "LA": "엘에이",
    "Don": "",
    "t worry be happy": " 워리 비 해피",
    "NO": "엔오",  #####
    "it was our sky": "잇 워즈 아워 스카이",
    "it is our sky": "잇 이즈 아워 스카이",  ####
    "NEIS": "엔이아이에스",  #####
    "IMF": "아이엠에프",
    "apology": "어폴로지",
    "humble": "험블",
    "M": "엠",
    "Nowhere Man": "노웨어 맨",
    "The Tenth Man": "더 텐쓰 맨",
    "PBS": "피비에스",
    "BBC": "비비씨",
    "MRJ": "엠알제이",
    "CCTV": "씨씨티비",
    "Pick me up": "픽 미 업",
    "DNA": "디엔에이",
    "UN": "유엔",
    "STOP": "스탑",  #####
    "PRESS": "프레스",  #####
    "not to be": "낫 투비",
    "Denial": "디나이얼",
    "G": "지",
    "IMF": "아이엠에프",
    "GDP": "지디피",
    "JTBC": "제이티비씨",
    "Time flies like an arrow": "타임 플라이즈 라이크 언 애로우",
    "DDT": "디디티",
    "AI": "에이아이",
    "Z": "제트",
    "OECD": "오이씨디",
    "N": "앤",
    "A": "에이",
    "MB": "엠비",
    "EH": "이에이치",
    "IS": "아이에스",
    "TV": "티비",
    "MIT": "엠아이티",
    "KBO": "케이비오",
    "I love America": "아이 러브 아메리카",
    "SF": "에스에프",
    "Q": "큐",
    "KFX": "케이에프엑스",
    "PM": "피엠",
    "Prime Minister": "프라임 미니스터",
    "Swordline": "스워드라인",
    "TBS": "티비에스",
    "DDT": "디디티",
    "CS": "씨에스",
    "Reflecting Absence": "리플렉팅 앱센스",
    "PBS": "피비에스",
    "Drum being beaten by everyone": "드럼 빙 비튼 바이 에브리원",
    "negative pressure": "네거티브 프레셔",
    "F": "에프",
    "KIA": "기아",
    "FTA": "에프티에이",
    "Que sais-je": "",
    "UFC": "유에프씨",
    "P": "피",
    "DJ": "디제이",
    "Chaebol": "채벌",
    "BBC": "비비씨",
    "OECD": "오이씨디",
    "BC": "삐씨",
    "C": "씨",
    "B": "씨",
    "KY": "케이와이",
    "K": "케이",
    "CEO": "씨이오",
    "YH": "와이에치",
    "IS": "아이에스",
    "who are you": "후 얼 유",
    "Y": "와이",
    "The Devils Advocate": "더 데빌즈 어드보카트",
    "YS": "와이에스",
    "so sorry": "쏘 쏘리",
    "Santa": "산타",
    "Big Endian": "빅 엔디안",
    "Small Endian": "스몰 엔디안",
    "Oh Captain My Captain": "오 캡틴 마이 캡틴",
    "AIB": "에이아이비",
    "K": "케이",
    "PBS": "피비에스",
    # IU
    "ASMR": "에이에스엠알",
    "V": "브이",
    "PD": "피디",
    "CD": "씨디",
    "ANR": "에이엔알",
    "Twenty Three": "투엔티 쓰리",
    "Through The Night": "쓰루 더 나잇",
    "MD": "엠디",
}

# Sino-Korean digit names (used for digits after the decimal point).
num_to_kor = {
    "0": "영",
    "1": "일",
    "2": "이",
    "3": "삼",
    "4": "사",
    "5": "오",
    "6": "육",
    "7": "칠",
    "8": "팔",
    "9": "구",
}

# Measurement units spelled out in Korean.  "m" is handled separately so the
# multi-letter units ("cm", "mm", "km") are replaced first.
unit_to_kor1 = {"%": "퍼센트", "cm": "센치미터", "mm": "밀리미터", "km": "킬로미터", "kg": "킬로그람"}
unit_to_kor2 = {"m": "미터"}

# Single uppercase Latin letter -> Korean letter name.
upper_to_kor = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}


"""
Lead (초성) and tail (종성) consonants look identical when rendered, but they
are distinct Unicode characters.

'_-!'(),-.:;? ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ~'

'_': 0, '-': 7, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 8, ':': 9, ';': 10,
'?': 11, ' ': 12, 'ᄀ': 13, 'ᄁ': 14, 'ᄂ': 15, 'ᄃ': 16, 'ᄄ': 17, 'ᄅ': 18, 'ᄆ': 19, 'ᄇ': 20,
'ᄈ': 21, 'ᄉ': 22, 'ᄊ': 23, 'ᄋ': 24, 'ᄌ': 25, 'ᄍ': 26, 'ᄎ': 27, 'ᄏ': 28, 'ᄐ': 29, 'ᄑ': 30,
'ᄒ': 31, 'ᅡ': 32, 'ᅢ': 33, 'ᅣ': 34, 'ᅤ': 35, 'ᅥ': 36, 'ᅦ': 37, 'ᅧ': 38, 'ᅨ': 39, 'ᅩ': 40,
'ᅪ': 41, 'ᅫ': 42, 'ᅬ': 43, 'ᅭ': 44, 'ᅮ': 45, 'ᅯ': 46, 'ᅰ': 47, 'ᅱ': 48, 'ᅲ': 49, 'ᅳ': 50,
'ᅴ': 51, 'ᅵ': 52, 'ᆨ': 53, 'ᆩ': 54, 'ᆪ': 55, 'ᆫ': 56, 'ᆬ': 57, 'ᆭ': 58, 'ᆮ': 59, 'ᆯ': 60,
'ᆰ': 61, 'ᆱ': 62, 'ᆲ': 63, 'ᆳ': 64, 'ᆴ': 65, 'ᆵ': 66, 'ᆶ': 67, 'ᆷ': 68, 'ᆸ': 69, 'ᆹ': 70,
'ᆺ': 71, 'ᆻ': 72, 'ᆼ': 73, 'ᆽ': 74, 'ᆾ': 75, 'ᆿ': 76, 'ᇀ': 77, 'ᇁ': 78, 'ᇂ': 79, '~': 80
"""

_pad = "pad"
_eos = "eos"
_punctuation = "!'(),-.:;? "
_special = "-"

# Unicode ranges for conjoining jamo: leads U+1100-1112, vowels U+1161-1175,
# tails U+11A8-11C2.
_jamo_leads = [chr(_) for _ in range(0x1100, 0x1113)]
_jamo_vowels = [chr(_) for _ in range(0x1161, 0x1176)]
_jamo_tails = [chr(_) for _ in range(0x11A8, 0x11C3)]

_letters = _jamo_leads + _jamo_vowels + _jamo_tails

# Full symbol inventory used by the TTS frontend.
symbols = [_pad] + list(_special) + list(_punctuation) + _letters + [_eos]

_symbol_to_id = {c: i for i, c in enumerate(symbols)}
_id_to_symbol = {i: c for i, c in enumerate(symbols)}

# Matches a quoted span (various ASCII/Unicode quote characters).
quote_checker = """([`"'"“‘])(.+?)([`"'"”’])"""


def is_lead(char):
    """Return True if `char` is a lead consonant (초성) jamo."""
    return char in _jamo_leads


def is_vowel(char):
    """Return True if `char` is a vowel (중성) jamo."""
    return char in _jamo_vowels


def is_tail(char):
    """Return True if `char` is a tail consonant (종성) jamo."""
    return char in _jamo_tails


def get_mode(char):
    """Classify a jamo: 0=lead, 1=vowel, 2=tail, -1=not a jamo."""
    if is_lead(char):
        return 0
    elif is_vowel(char):
        return 1
    elif is_tail(char):
        return 2
    else:
        return -1


def _get_text_from_candidates(candidates):
    """Join up to three accumulated jamo ([lead, vowel, tail]) into a syllable."""
    if len(candidates) == 0:
        return ""
    elif len(candidates) == 1:
        return jamo_to_hcj(candidates[0])
    else:
        return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))


def jamo_to_korean(text):
    """Re-compose a jamo sequence back into Hangul syllables.

    Non-jamo characters are passed through unchanged.
    """
    text = h2j(text)

    idx = 0
    new_text = ""
    candidates = []

    while True:
        if idx >= len(text):
            new_text += _get_text_from_candidates(candidates)
            break

        char = text[idx]
        mode = get_mode(char)

        if mode == 0:
            # A new lead consonant starts a new syllable: flush what we have.
            new_text += _get_text_from_candidates(candidates)
            candidates = [char]
        elif mode == -1:
            # Non-jamo character: flush and emit it verbatim.
            new_text += _get_text_from_candidates(candidates)
            new_text += char
            candidates = []
        else:
            candidates.append(char)

        idx += 1
    return new_text


def compare_sentence_with_jamo(text1, text2):
    """Return True if the two texts DIFFER after jamo decomposition."""
    return h2j(text1) != h2j(text2)


def tokenize(text, as_id=False):
    """Normalize `text` and split Hangul into lead/vowel/tail jamo tokens.

    Uses `hangul_to_jamo` from the jamo package.  With as_id=True, tokens are
    mapped through `_symbol_to_id`.
    """
    text = normalize(text)
    tokens = list(
        hangul_to_jamo(text)
    )  # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~']

    if as_id:
        return [_symbol_to_id[token] for token in tokens]
    else:
        return [token for token in tokens]


def tokenizer_fn(iterator):
    """Yield jamo tokens for every string in `iterator`."""
    return (token for x in iterator for token in tokenize(x, as_id=False))


def normalize(text):
    """Full text-normalization pipeline for Korean TTS input."""
    text = text.strip()

    # Drop parenthesized day-of-month markers and CJK-only parentheticals.
    text = re.sub(r"\(\d+일\)", "", text)
    text = re.sub(r"\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)", "", text)

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    text = re.sub(r"[a-zA-Z]+", normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text


def normalize_with_dictionary(text, dic):
    """Replace every occurrence of `dic`'s keys in `text` with their values."""
    if any(key in text for key in dic.keys()):
        pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
        return pattern.sub(lambda x: dic[x.group()], text)
    else:
        return text


def normalize_english(text):
    """Replace whole English words found in `english_dictionary`."""

    def fn(m):
        word = m.group()
        if word in english_dictionary:
            return english_dictionary.get(word)
        else:
            return word

    text = re.sub("([A-Za-z]+)", fn, text)
    return text


def normalize_upper(text):
    """re.sub callback: spell out all-uppercase runs letter by letter."""
    text = text.group(0)

    if all([char.isupper() for char in text]):
        return "".join(upper_to_kor[char] for char in text)
    else:
        return text


def normalize_quote(text):
    """Split quoted spans into sentences, each re-wrapped in single quotes."""

    def fn(found_text):
        from nltk import sent_tokenize  # NLTK doesn't along with multiprocessing

        found_text = found_text.group()
        unquoted_text = found_text[1:-1]

        sentences = sent_tokenize(unquoted_text)
        return " ".join(["'{}'".format(sent) for sent in sentences])

    return re.sub(quote_checker, fn, text)


# A signed integer (with optional thousands commas) and optional decimals.
number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"
# Korean counter words that trigger native-Korean numerals.
count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"


def normalize_number(text):
    """Spell out units and numbers (with/without counters) in Korean."""
    text = normalize_with_dictionary(text, unit_to_kor1)
    text = normalize_with_dictionary(text, unit_to_kor2)
    text = re.sub(
        number_checker + count_checker, lambda x: number_to_korean(x, True), text
    )
    text = re.sub(number_checker, lambda x: number_to_korean(x, False), text)
    return text


num_to_kor1 = [""] + list("일이삼사오육칠팔구")
num_to_kor2 = [""] + list("만억조경해")
num_to_kor3 = [""] + list("십백천")

# count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"]
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

# Irregular native-Korean tens ("두십" -> "스물", ...).
count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}


def number_to_korean(num_str, is_count=False):
    """Spell out a regex-matched number in Korean.

    Args:
        num_str: an `re.Match` produced by `number_checker` (plus
            `count_checker` when `is_count` is True).
        is_count: use native-Korean counting numerals and append the
            matched counter word.

    Returns:
        Korean spelling of the number, followed by the counter word (if any).

    Raises:
        Exception: on malformed numbers (multiple dots) or a float used
            with a counter word.
    """
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""

    num_str = num_str.replace(",", "")
    num = ast.literal_eval(num_str)

    if num == 0:
        # BUGFIX: keep the counter word ("0개" -> "영개"); previously the
        # unit was silently dropped.
        return "영" + unit_str

    check_float = num_str.split(".")
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")

    digit = int(digit_str)

    if digit_str.startswith("-"):
        digit, digit_str = abs(digit), str(abs(digit))

    kor = ""
    size = len(str(digit))
    tmp = []

    # Walk digits left to right; flush every 4 digits with the myriad unit
    # (만, 억, ...) from num_to_kor2.
    for i, v in enumerate(digit_str, start=1):
        v = int(v)

        if v != 0:
            if is_count:
                tmp += count_to_kor1[v]
            else:
                tmp += num_to_kor1[v]

            tmp += num_to_kor3[(size - i) % 4]

        if (size - i) % 4 == 0 and len(tmp) != 0:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[int((size - i) / 4)]

    if is_count:
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                "|".join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()],
                kor,
            )

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub(r"\d", lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str