Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
prophesier
GitHub Repository: prophesier/diff-svc
Path: blob/main/utils/text_norm.py
694 views
1
# coding=utf-8
2
# Authors:
3
# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
4
# 2019.9 Jiayu DU
5
#
6
# requirements:
7
# - python 3.X
8
# notes: python 2.X WILL fail or produce misleading results
9
10
import sys, os, argparse, codecs, string, re
11
12
# ================================================================================ #
13
# basic constant
14
# ================================================================================ #
15
# Everyday Chinese digits 0-9, indexed by their numeric value.
CHINESE_DIGIS = u'零一二三四五六七八九'
# "Big" (financial) digit forms, simplified and traditional, indexed by value.
BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
# Units of 10^8 and above; their actual power depends on the numbering type.
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
# Units for 10^1 .. 10^4.
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'

# Alternative spoken/written forms of 0, 1 and 2.
ZERO_ALT = u'〇'
ONE_ALT = u'幺'
TWO_ALTS = [u'两', u'兩']

# Math symbols as [simplified, traditional] pairs.
POSITIVE = [u'正', u'正']
NEGATIVE = [u'负', u'負']
POINT = [u'点', u'點']
# PLUS = [u'加', u'加']
# SIL = [u'杠', u'槓']

# Supported Chinese numbering system types.
NUMBERING_TYPES = ['low', 'mid', 'high']

# Regex fragments (alternation groups) used by the NSW normalizer.
CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
                 '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
                  '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
                  '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
                  '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
                  '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
                  '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|pian|张|颗|块)'.replace('pian', '片')

# Punctuation information is based on the Zhon project (https://github.com/tsroten/zhon.git).
# NOTE: these must be the full-width / CJK code points. ASCII punctuation is
# handled separately through string.punctuation, and an ASCII apostrophe here
# would terminate the string literal.
CHINESE_PUNC_STOP = '！？｡。'
CHINESE_PUNC_NON_STOP = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP
52
53
54
# ================================================================================ #
55
# basic class
56
# ================================================================================ #
57
class ChineseChar(object):
    """
    A Chinese character with a simplified and a traditional form,
    e.g. simplified = '负', traditional = '負'.

    Either form can be selected when rendering text.
    """

    def __init__(self, simplified, traditional):
        self.simplified = simplified
        self.traditional = traditional

    def __str__(self):
        # __str__ must return a str: fall back to '' (not None) when neither
        # form is set, otherwise str()/print() would raise TypeError.
        return self.simplified or self.traditional or ''

    def __repr__(self):
        return self.__str__()
75
76
77
class ChineseNumberUnit(ChineseChar):
    """
    A Chinese number unit (a power of ten).

    Besides the simplified/traditional forms, each unit also carries
    "big" forms (big_s / big_t), e.g. '陆' and '陸'.
    """

    def __init__(self, power, simplified, traditional, big_s, big_t):
        super(ChineseNumberUnit, self).__init__(simplified, traditional)
        self.power = power
        self.big_s, self.big_t = big_s, big_t

    def __str__(self):
        return '10^{}'.format(self.power)

    @classmethod
    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
        """
        Build the unit at position `index` of a unit table.

        `value` is a (simplified, traditional) pair. For small units the
        power is index + 1; for larger units it depends on `numbering_type`.
        Raises ValueError for an unknown numbering type.
        """
        if small_unit:
            # small units use the second form for both big_s and big_t
            power, big_simplified = index + 1, value[1]
        elif numbering_type == NUMBERING_TYPES[0]:
            power, big_simplified = index + 8, value[0]
        elif numbering_type == NUMBERING_TYPES[1]:
            power, big_simplified = (index + 2) * 4, value[0]
        elif numbering_type == NUMBERING_TYPES[2]:
            power, big_simplified = pow(2, index + 3), value[0]
        else:
            raise ValueError(
                'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))

        return ChineseNumberUnit(power=power,
                                 simplified=value[0], traditional=value[1],
                                 big_s=big_simplified, big_t=value[1])
111
112
113
class ChineseNumberDigit(ChineseChar):
    """
    A Chinese digit character.

    Holds the numeric value together with the simplified/traditional,
    "big" (big_s / big_t) and optional alternative (alt_s / alt_t) forms.
    """

    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
        super(ChineseNumberDigit, self).__init__(simplified, traditional)
        self.value = value
        self.big_s, self.big_t = big_s, big_t
        self.alt_s, self.alt_t = alt_s, alt_t

    def __str__(self):
        return str(self.value)

    @classmethod
    def create(cls, i, v):
        """Build the digit with value `i` from a 4-item sequence
        (simplified, traditional, big simplified, big traditional)."""
        return ChineseNumberDigit(value=i, simplified=v[0], traditional=v[1],
                                  big_s=v[2], big_t=v[3])
132
133
134
class ChineseMath(ChineseChar):
    """
    A Chinese math character (sign or decimal point).

    `symbol` is the ASCII counterpart and `expression` an optional callable
    implementing the operation. The "big" forms are the same as the plain
    simplified/traditional forms.
    """

    def __init__(self, simplified, traditional, symbol, expression=None):
        super(ChineseMath, self).__init__(simplified, traditional)
        self.symbol, self.expression = symbol, expression
        self.big_s, self.big_t = simplified, traditional
145
146
147
# Short aliases for the four symbol classes, used by the conversion helpers.
CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
148
149
150
class NumberSystem(object):
    """
    Chinese numbering system.

    A plain attribute container; it defines no attributes or methods itself.
    """
155
156
157
class MathSymbol(object):
    """
    Math symbols (traditional/simplified) used by the Chinese number system, e.g.
    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']
    """

    def __init__(self, positive, negative, point):
        self.positive = positive
        self.negative = negative
        self.point = point

    def __iter__(self):
        # iterate over every stored symbol, in attribute-assignment order
        yield from self.__dict__.values()
173
174
175
# class OtherSymbol(object):
176
# """
177
# 其他符号
178
# """
179
#
180
# def __init__(self, sil):
181
# self.sil = sil
182
#
183
# def __iter__(self):
184
# for v in self.__dict__.values():
185
# yield v
186
187
188
# ================================================================================ #
189
# basic utils
190
# ================================================================================ #
191
def create_system(numbering_type=NUMBERING_TYPES[1]):
    """
    Build and return the number system for the given numbering type
    (default: 'mid').

    NUMBERING_TYPES = ['low', 'mid', 'high']:
        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
    """

    # units of '亿' and larger
    big_pairs = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
                    LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
    larger_units = [CNU.create(idx, pair, numbering_type, False)
                    for idx, pair in enumerate(big_pairs)]

    # units '十, 百, 千, 万'
    small_pairs = zip(SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED,
                      SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
    smaller_units = [CNU.create(idx, pair, small_unit=True)
                     for idx, pair in enumerate(small_pairs)]

    # digits: the plain form is used for both simplified and traditional
    digit_forms = zip(CHINESE_DIGIS, CHINESE_DIGIS,
                      BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
    digits = [CND.create(idx, forms) for idx, forms in enumerate(digit_forms)]
    zero, one, two = digits[0], digits[1], digits[2]
    zero.alt_s, zero.alt_t = ZERO_ALT, ZERO_ALT
    one.alt_s, one.alt_t = ONE_ALT, ONE_ALT
    two.alt_s, two.alt_t = TWO_ALTS[0], TWO_ALTS[1]

    # math symbols
    positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
    negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
    point_cn = CM(POINT[0], POINT[1], '.',
                  lambda x, y: float(str(x) + '.' + str(y)))
    # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))

    system = NumberSystem()
    system.units = smaller_units + larger_units
    system.digits = digits
    system.math = MathSymbol(positive_cn, negative_cn, point_cn)
    # system.symbols = OtherSymbol(sil_cn)
    return system
231
232
233
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """
    Convert a Chinese number string into its Arabic-digit string.

    The text before the decimal point character is evaluated with units;
    the text after it is read digit by digit. Returns a str, with a
    '.'-separated fractional part when one is present.
    """

    def get_symbol(char, system):
        # look the character up among units, then digits, then math symbols;
        # falls through (returns None) for unknown characters
        for unit in system.units:
            if char in (unit.traditional, unit.simplified, unit.big_s, unit.big_t):
                return unit
        for digit in system.digits:
            if char in (digit.traditional, digit.simplified, digit.big_s,
                        digit.big_t, digit.alt_s, digit.alt_t):
                return digit
        for math_symbol in system.math:
            if char in (math_symbol.traditional, math_symbol.simplified):
                return math_symbol

    def string2symbols(chinese_string, system):
        # split on the first point character variant found in the string
        int_string, dec_string = chinese_string, ''
        point = system.math.point
        for mark in (point.simplified, point.traditional):
            if mark in chinese_string:
                int_string, dec_string = chinese_string.split(mark)
                break
        int_symbols = [get_symbol(c, system) for c in int_string]
        dec_symbols = [get_symbol(c, system) for c in dec_string]
        return int_symbols, dec_symbols

    def correct_symbols(integer_symbols, system):
        """
        一百八 to 一百八十
        一亿一千三百万 to 一亿 一千万 三百万
        """

        # a leading '十' implies a leading '一'
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # a trailing digit right after a unit gets the next lower unit
        if len(integer_symbols) > 1:
            last, before_last = integer_symbols[-1], integer_symbols[-2]
            if isinstance(last, CND) and isinstance(before_last, CNU):
                integer_symbols.append(
                    CNU(before_last.power - 1, None, None, None, None))

        corrected = []
        consecutive_units = 0
        for symbol in integer_symbols:
            if isinstance(symbol, CND):
                corrected.append(symbol)
                consecutive_units = 0
            elif isinstance(symbol, CNU):
                current_unit = CNU(symbol.power, None, None, None, None)
                consecutive_units += 1

            if consecutive_units == 1:
                corrected.append(current_unit)
            elif consecutive_units > 1:
                # a unit directly following another unit scales every
                # smaller unit collected so far
                for offset in range(1, len(corrected) + 1):
                    earlier = corrected[-offset]
                    if isinstance(earlier, CNU) and earlier.power < current_unit.power:
                        corrected[-offset] = CNU(earlier.power + current_unit.power,
                                                 None, None, None, None)
        return corrected

    def compute_value(integer_symbols):
        """
        Compute the value.
        When the current unit is larger than the previous unit, the current
        unit multiplies everything accumulated before it,
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        terms = [0]
        highest_power = 0
        for symbol in integer_symbols:
            if isinstance(symbol, CND):
                terms[-1] = symbol.value
            elif isinstance(symbol, CNU):
                scale = pow(10, symbol.power)
                terms[-1] *= scale
                if symbol.power > highest_power:
                    terms[:-1] = [term * scale for term in terms[:-1]]
                    highest_power = symbol.power
                terms.append(0)
        return sum(terms)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)
    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    dec_str = ''.join([str(d.value) for d in dec_part])
    if not dec_part:
        return int_str
    return '{0}.{1}'.format(int_str, dec_str)
317
318
319
def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
            traditional=False, alt_zero=False, alt_one=False, alt_two=True,
            use_zeros=True, use_units=True):
    """
    Convert an Arabic-digit number string into Chinese text.

    big / traditional select which character form is emitted; alt_zero and
    alt_one swap in the alternative zero/one characters; alt_two replaces
    '二' by its alternative form in front of units other than '十';
    use_units=False reads the integer part digit by digit.
    Note: `use_zeros` is accepted but is not forwarded to the inner helper.
    Raises ValueError when the input contains more than one '.'.
    """

    def get_value(value_string, use_zeros=True):
        stripped = value_string.lstrip('0')

        # nothing to record if all zeros
        if not stripped:
            return []

        # a single significant digit, optionally preceded by a zero marker
        if len(stripped) == 1:
            had_leading_zeros = len(value_string) != len(stripped)
            if use_zeros and had_leading_zeros:
                return [system.digits[0], system.digits[int(stripped)]]
            return [system.digits[int(stripped)]]

        # several digits: split at the largest fitting unit and recurse
        unit = next(u for u in reversed(system.units) if u.power < len(stripped))
        head = value_string[:-unit.power]
        return get_value(head) + [unit] + get_value(stripped[-unit.power:])

    system = create_system(numbering_type)

    parts = number_string.split('.')
    if len(parts) > 2:
        raise ValueError(
            "invalid input num string with more than one dot: {}".format(number_string))
    int_string = parts[0]
    dec_string = parts[1] if len(parts) == 2 else ""

    if use_units and len(int_string) > 1:
        result_symbols = get_value(int_string)
    else:
        result_symbols = [system.digits[int(c)] for c in int_string]
    dec_symbols = [system.digits[int(c)] for c in dec_string]
    if dec_string:
        result_symbols += [system.math.point] + dec_symbols

    if alt_two:
        two = system.digits[2]
        liang = CND(2, two.alt_s, two.alt_t, two.big_s, two.big_t)
        last_index = len(result_symbols) - 1
        for i, v in enumerate(result_symbols):
            if not (isinstance(v, CND) and v.value == 2):
                continue
            next_symbol = result_symbols[i + 1] if i < last_index else None
            previous_symbol = result_symbols[i - 1] if i > 0 else None
            if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
                if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
                    result_symbols[i] = liang

    # if big is True, '两' will not be used and `alt_two` has no impact on output
    if big:
        attr_name = 'big_t' if traditional else 'big_s'
    else:
        attr_name = 'traditional' if traditional else 'simplified'

    result = ''.join(getattr(s, attr_name) for s in result_symbols)

    # if not use_zeros:
    #     result = result.strip(getattr(system.digits[0], attr_name))

    if alt_zero:
        result = result.replace(
            getattr(system.digits[0], attr_name), system.digits[0].alt_s)

    if alt_one:
        result = result.replace(
            getattr(system.digits[1], attr_name), system.digits[1].alt_s)

    # a bare leading point gets an explicit zero in front
    if result.startswith(tuple(POINT)):
        return CHINESE_DIGIS[0] + result

    # ^10, 11, .., 19: drop the leading 'one' in front of 'ten'
    ten_forms = (SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
                 SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0])
    one_forms = (CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1],
                 BIG_CHINESE_DIGIS_TRADITIONAL[1])
    if len(result) >= 2 and result[1] in ten_forms and result[0] in one_forms:
        result = result[1:]

    return result
414
415
416
# ================================================================================ #
417
# different types of rewriters
418
# ================================================================================ #
419
class Cardinal:
    """
    CARDINAL rewriter: converts between a cardinal number string and its
    Chinese text.
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal, self.chntext = cardinal, chntext

    def chntext2cardinal(self):
        """Return the Arabic-digit string for `self.chntext`."""
        converted = chn2num(self.chntext)
        return converted

    def cardinal2chntext(self):
        """Return the Chinese text for `self.cardinal`."""
        converted = num2chn(self.cardinal)
        return converted
433
434
435
class Digit:
    """
    DIGIT rewriter: reads a digit string one digit at a time.
    """

    def __init__(self, digit=None, chntext=None):
        self.digit, self.chntext = digit, chntext

    # def chntext2digit(self):
    #     return chn2num(self.chntext)

    def digit2chntext(self):
        """Return `self.digit` as Chinese text, without units and without
        the alternative form of two."""
        options = dict(alt_two=False, use_units=False)
        return num2chn(self.digit, **options)
449
450
451
class TelePhone:
    """
    TELEPHONE rewriter: reads a phone number digit by digit.
    """

    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
        self.telephone = telephone
        self.raw_chntext = raw_chntext
        self.chntext = chntext

    # def chntext2telephone(self):
    #     sil_parts = self.raw_chntext.split('<SIL>')
    #     self.telephone = '-'.join([
    #         str(chn2num(p)) for p in sil_parts
    #     ])
    #     return self.telephone

    def telephone2chntext(self, fixed=False):
        """
        Convert `self.telephone` to Chinese text and return it.

        Fixed-line numbers are split on '-' and joined with '<SIL>' in
        `raw_chntext`; other numbers have surrounding '+' stripped, are split
        on whitespace and joined with '<SP>'. `chntext` is `raw_chntext`
        with the marker removed.
        """
        if fixed:
            marker = '<SIL>'
            pieces = self.telephone.split('-')
        else:
            marker = '<SP>'
            pieces = self.telephone.strip('+').split()

        spoken = [num2chn(piece, alt_two=False, use_units=False) for piece in pieces]
        self.raw_chntext = marker.join(spoken)
        self.chntext = self.raw_chntext.replace(marker, '')
        return self.chntext
483
484
485
class Fraction:
    """
    FRACTION rewriter: converts between 'numerator/denominator' and the
    Chinese '<denominator>分之<numerator>' form.
    """

    def __init__(self, fraction=None, chntext=None):
        self.fraction, self.chntext = fraction, chntext

    def chntext2fraction(self):
        """Return `self.chntext` as a 'numerator/denominator' string."""
        # the Chinese form states the denominator first
        denominator, numerator = self.chntext.split('分之')
        return '/'.join([chn2num(numerator), chn2num(denominator)])

    def fraction2chntext(self):
        """Return `self.fraction` as Chinese text."""
        numerator, denominator = self.fraction.split('/')
        return '分之'.join([num2chn(denominator), num2chn(numerator)])
501
502
503
class Date:
    """
    DATE rewriter: converts a date such as '1999年2月20日' to Chinese text.
    """

    def __init__(self, date=None, chntext=None):
        self.date = date
        self.chntext = chntext

    # def chntext2date(self):
    #     chntext = self.chntext
    #     try:
    #         year, other = chntext.strip().split('年', maxsplit=1)
    #         year = Digit(chntext=year).digit2chntext() + '年'
    #     except ValueError:
    #         other = chntext
    #         year = ''
    #     if other:
    #         try:
    #             month, day = other.strip().split('月', maxsplit=1)
    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
    #         except ValueError:
    #             day = chntext
    #             month = ''
    #         if day:
    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
    #     else:
    #         month = ''
    #         day = ''
    #     date = year + month + day
    #     self.date = date
    #     return self.date

    def date2chntext(self):
        """
        Convert `self.date` to Chinese text, store it in `self.chntext` and
        return it.

        The year is read digit by digit; month and day are read as cardinal
        numbers. The last character of the day part (e.g. '日' or '号') is
        kept as is.
        """
        date = self.date
        try:
            year, other = date.strip().split('年', 1)
            year = Digit(digit=year).digit2chntext() + '年'
        except ValueError:
            # no year part (or it could not be converted)
            other = date
            year = ''
        if other:
            try:
                month, day = other.strip().split('月', 1)
                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
            except ValueError:
                # No month part: the day is whatever is left after the year.
                # Using the whole date here would feed the already-converted
                # year back into the day conversion.
                day = other
                month = ''
            if day:
                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
        else:
            month = ''
            day = ''
        chntext = year + month + day
        self.chntext = chntext
        return self.chntext
559
560
561
class Money:
    """
    MONEY rewriter: converts the numbers inside a money expression to
    Chinese text, leaving currency units untouched.
    """

    def __init__(self, money=None, chntext=None):
        self.money = money
        self.chntext = chntext

    # def chntext2money(self):
    #     return self.money

    def money2chntext(self):
        """
        Replace every number in `self.money` by its Chinese reading, store
        the result in `self.chntext` and return it.
        """
        pattern = re.compile(r'(\d+(\.\d+)?)')
        # Substitute each match in place. A plain str.replace per match would
        # also rewrite the same digits inside later, longer numbers
        # (e.g. the '5' inside '125' in '5块125').
        self.chntext = pattern.sub(
            lambda m: Cardinal(cardinal=m.group(1)).cardinal2chntext(),
            self.money)
        return self.chntext
582
583
584
class Percentage:
    """
    PERCENTAGE rewriter: converts between 'N%' and the Chinese '百分之N' form.
    """

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        """Return `self.chntext` (e.g. '百分之五十') as a percentage string."""
        prefix = '百分之'
        text = self.chntext.strip()
        # Remove the literal prefix only. str.strip(prefix) treats its
        # argument as a character set and would also eat a trailing '百' or
        # '分' that belongs to the number (e.g. '百分之三百').
        if text.startswith(prefix):
            text = text[len(prefix):]
        return chn2num(text) + '%'

    def percentage2chntext(self):
        """Return `self.percentage` (e.g. '80.03%') as Chinese text."""
        return '百分之' + num2chn(self.percentage.strip().strip('%'))
598
599
600
# ================================================================================ #
601
# NSW Normalizer
602
# ================================================================================ #
603
class NSWNormalizer:
    """
    Non-Standard-Word normalizer: rewrites dates, money, phone numbers,
    fractions, percentages and plain numbers in a text into Chinese words.
    """

    def __init__(self, raw_text):
        # '^' and '$' are sentinels so that the \D-anchored patterns can also
        # match at the very start and end of the text
        self.raw_text = '^' + raw_text + '$'
        self.norm_text = ''

    def _particular(self):
        """Restore the digit in '<letters>二<letters>' tokens such as O2O / B2C
        and return the updated `norm_text`."""
        text = self.norm_text
        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], matcher[1] + '2' + matcher[2], 1)
        self.norm_text = text
        return self.norm_text

    def normalize(self, remove_punc=True):
        """
        Normalize `raw_text` and return the result.

        The rewriting passes run in a fixed order (date, money, mobile phone,
        fixed-line phone, fraction, percentage, number + quantifier, digit
        sequence, plain number). Each pass collects its matches once and then
        replaces the first remaining occurrence of each match.
        When `remove_punc` is true, every Chinese and ASCII punctuation
        character is replaced by a space.
        """
        text = self.raw_text

        # dates
        pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)

        # money
        pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)

        # fixed-line / mobile phone numbers
        # mobile
        # http://www.jihaoba.com/news/show/13680
        # China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
        # China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
        # China Telecom: 133, 153, 189, 180, 181, 177
        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
        # fixed-line
        pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)

        # fractions (single capture group, so each match is a plain string)
        pattern = re.compile(r"(\d+/\d+)")
        for matcher in pattern.findall(text):
            text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)

        # percentages: first fold the full-width percent sign into ASCII '%'
        text = text.replace('％', '%')
        pattern = re.compile(r"(\d+(\.\d+)?%)")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)

        # plain number followed by a quantifier
        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)

        # digit sequences (IDs, serial numbers): read digit by digit
        pattern = re.compile(r"(\d{4,32})")
        for matcher in pattern.findall(text):
            text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)

        # remaining plain numbers
        pattern = re.compile(r"(\d+(\.\d+)?)")
        for matcher in pattern.findall(text):
            text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)

        self.norm_text = text
        self._particular()

        # drop the sentinels added in __init__
        text = self.norm_text.lstrip('^').rstrip('$')
        if remove_punc:
            # Punctuations removal
            old_chars = CHINESE_PUNC_LIST + string.punctuation  # includes all CN and EN punctuations
            new_chars = ' ' * len(old_chars)
            del_chars = ''
            text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
        return text
710
711
712
def nsw_test_case(raw_text):
    """Print the input, its normalized form and a blank separator line."""
    print('I:' + raw_text)
    normalizer = NSWNormalizer(raw_text)
    print('O:' + normalizer.normalize())
    print('')
716
717
718
def nsw_test():
    """Run the normalizer over a fixed list of sample sentences."""
    samples = (
        '固话:0595-23865596或23880880。',
        '固话:0595-23865596或23880880。',
        '手机:+86 19859213959或15659451527。',
        '分数:32477/76391。',
        '百分数:80.03%。',
        '编号:31520181154418。',
        '纯数:2983.07克或12345.60米。',
        '日期:1999年2月20日或09年3月15号。',
        '金钱:12块5,34.5元,20.1万',
        '特殊:O2O或B2C。',
        '3456万吨',
        '2938个',
        '938',
        '今天吃了115个小笼包231个馒头',
        '有62%的概率',
    )
    for sample in samples:
        nsw_test_case(sample)
734
735
736
if __name__ == '__main__':
    # Command-line entry point: normalize a UTF-8 text file line by line.
    # nsw_test()

    p = argparse.ArgumentParser()
    p.add_argument('ifile', help='input filename, assume utf-8 encoding')
    p.add_argument('ofile', help='output filename')
    p.add_argument('--to_upper', action='store_true', help='convert to upper case')
    p.add_argument('--to_lower', action='store_true', help='convert to lower case')
    p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
    p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines')
    args = p.parse_args()

    # The two case options are mutually exclusive. Check once, before any
    # file is opened, so the output file is not truncated on a usage error.
    if args.to_upper and args.to_lower:
        sys.stderr.write('text norm: to_upper OR to_lower?')
        sys.exit(1)

    n = 0
    # context managers close both files even if normalization raises
    with codecs.open(args.ifile, 'r', 'utf8') as ifile, \
            codecs.open(args.ofile, 'w+', 'utf8') as ofile:
        for line in ifile:
            key = ''
            text = ''
            if args.has_key:
                # first whitespace-separated field is the key, the rest is text
                cols = line.split(maxsplit=1)
                key = cols[0]
                if len(cols) == 2:
                    text = cols[1]
                else:
                    text = ''
            else:
                text = line

            # cases
            if args.to_upper:
                text = text.upper()
            if args.to_lower:
                text = text.lower()

            # NSW(Non-Standard-Word) normalization
            text = NSWNormalizer(text).normalize()

            if args.has_key:
                ofile.write(key + '\t' + text)
            else:
                ofile.write(text)

            n += 1
            if n % args.log_interval == 0:
                sys.stderr.write("text norm: {} lines done.\n".format(n))

    sys.stderr.write("text norm: {} lines done in total.\n".format(n))
791
792