PaddlePaddle · zh794390558 · Jul 22, 2021 · Jun 4, 2021 · Jun 4, 2021 · zh794390558
diff --git a/examples/text_normalization/README.md b/examples/text_normalization/README.md
@@ -0,0 +1,3 @@
+# Regular expression based text normalization for Chinese
+
+For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
diff --git a/examples/text_normalization/data/sentences.txt b/examples/text_normalization/data/sentences.txt
@@ -0,0 +1,26 @@
+今天的最低气温达到-10°C.
+只要有33/4的人同意，就可以通过决议。
+1945年5月2日，苏联士兵在德国国会大厦上升起了胜利旗，象征着攻占柏林并战胜了纳粹德国。
+4月16日，清晨的战斗以炮击揭幕，数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地，炮击持续了数天之久。
+如果剩下的30.6%是过去，那么还有69.4%.
+事情发生在2020/03/31的上午8:00.
+警方正在找一支.22口径的手枪。
+欢迎致电中国联通，北京2022年冬奥会官方合作伙伴为您服务
+充值缴费请按1，查询话费及余量请按2，跳过本次提醒请按井号键。
+快速解除流量封顶请按星号键，腾讯王卡产品介绍、使用说明、特权及活动请按9，查询话费、套餐余量、积分及活动返款请按1，手机上网流量开通及取消请按2，查询本机号码及本号所使用套餐请按4，密码修改及重置请按5，紧急开机请按6，挂失请按7，查询充值记录请按8，其它自助服务及人工服务请按0
+智能客服助理快速查话费、查流量请按9，了解北京联通业务请按1，宽带IPTV新装、查询请按2，障碍报修请按3，充值缴费请按4，投诉建议请按5，政企业务请按7，人工服务请按0，for english severice press star key
+您的帐户当前可用余额为63.89元，本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发，请您注意查收。谢谢使用，再见！。
+您的帐户当前可用余额为负15.5元，本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发，请您注意查收。谢谢使用，再见！。
+尊敬的客户，您目前的话费余额为负14.60元，已低于10元，为保证您的通信畅通，请及时缴纳费用。
+您的流量已用完，为避免您产生额外费用，建议您根据需求开通一个流量包以作补充。
+您可以直接说，查询话费及余量、开通流量包、缴费，您也可以说出其它需求，请问有什么可以帮您？
+您的账户当前可用余额为负36.00元，本月消费36.00元。
+请问你是电话13985608526的机主吗？
+如您对处理结果不满意，可拨打中国联通集团投诉电话10015进行投诉，按本地通话费收费，返回自助服务请按井号键
+“26314”号VIP客服代表为您服务。
+尊敬的5G用户，欢迎您致电中国联通
+首先是应用了M1芯片的iPad Pro，新款的iPad Pro支持5G，这也是苹果的第二款5G产品线。
+除此之外，摄像头方面再次升级，增加了前摄全新超广角摄像头，支持人物居中功能，搭配超广角可实现视频中始终让人物居中效果。
+屏幕方面，iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏，支持HDR10、杜比视界，还支持杜比全景声。
+iPad Pro的秒控键盘这次也推出白色版本。
+售价方面，11英寸版本售价799美元起，12.9英寸售价1099美元起。
diff --git a/examples/text_normalization/local/test_normalization.py b/examples/text_normalization/local/test_normalization.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+from text_processing import normalization
+
+# Command-line interface: two positional arguments, the input and output paths.
+parser = argparse.ArgumentParser(
+    description="Normalize text in Chinese with some rules.")
+parser.add_argument("input", type=str, help="the input sentences")
+parser.add_argument("output", type=str, help="path to save the output file.")
+args = parser.parse_args()
+
+# Open both files explicitly as UTF-8: the sentences are Chinese text and the
+# locale's default encoding is not a safe assumption for them.
+with open(args.input, 'rt', encoding='utf-8') as fin:
+    with open(args.output, 'wt', encoding='utf-8') as fout:
+        # One sentence per input line, one normalized sentence per output line.
+        for sent in fin:
+            sent = normalization.normalize_sentence(sent.strip())
+            fout.write(sent)
+            fout.write('\n')
diff --git a/examples/text_normalization/path.sh b/examples/text_normalization/path.sh
@@ -0,0 +1,7 @@
+# MAIN_ROOT is two directory levels above the current working directory.
+export MAIN_ROOT=${PWD}/../../
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Make MAIN_ROOT and its third_party directory importable. No trailing '#':
+# glued to the value it is not a comment and would end up inside PYTHONPATH.
+export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
diff --git a/examples/text_normalization/run.sh b/examples/text_normalization/run.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Normalize the example sentences and write the result under ${exp_dir}.
+source path.sh
+
+stage=-1
+stop_stage=100
+
+exp_dir=exp
+data_dir=data
+filename="sentences.txt"
+
+# NOTE(review): parse_options.sh presumably lets the variables above be
+# overridden from the command line -- confirm against utils/parse_options.sh.
+# Exit statuses are limited to 0-255, so fail with 1 rather than -1.
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1
+
+mkdir -p "${exp_dir}"
+
+
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+    echo "stage 1: Processing "
+    # Abort on failure instead of falling through to "done" with status 0.
+    python3 local/test_normalization.py "${data_dir}/${filename}" "${exp_dir}/normalized.txt" || exit 1
+    if [ -f "${exp_dir}/normalized.txt" ]; then
+        echo "Normalized text save at ${exp_dir}/normalized.txt"
+    fi
+    # TODO(chenfeiyu): compute edit distance against ground-truth
+fi
+
+echo "done"
+exit 0
diff --git a/third_party/text_processing/__ini__.py b/third_party/text_processing/__ini__.py
@@ -0,0 +1 @@
+
diff --git a/third_party/text_processing/__init__.py b/third_party/text_processing/__init__.py
diff --git a/third_party/text_processing/normalization/__init__.py b/third_party/text_processing/normalization/__init__.py
@@ -0,0 +1,42 @@
+from .sentence_split import split
+from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM
+from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num
+
+from .chronology import RE_TIME, RE_DATE, RE_DATE2
+from .chronology import replace_time, replace_date, replace_date2
+
+from .quantifier import RE_TEMPERATURE
+from .quantifier import replace_temperature
+
+from .phone import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone
+
+from .char_convert import tranditional_to_simplified
+from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
+
+
+def normalize_sentence(sentence):
+    """Verbalize the number-related non-standard words of one sentence.
+
+    The sentence is converted to simplified Chinese, full-width ASCII
+    letters, digits and the ideographic space are folded to half-width, and
+    then every number-related pattern is rewritten by its replacer.
+
+    Parameters
+    ----------
+    sentence : str
+        A single sentence.
+
+    Returns
+    -------
+    str
+        The normalized sentence.
+    """
+    # basic character conversions
+    sentence = tranditional_to_simplified(sentence)
+    # str.translate looks characters up by ordinal. The F2H_* tables are
+    # keyed by one-character strings, so used directly they convert nothing;
+    # str.maketrans turns them into the ordinal-keyed form translate expects.
+    for table in (F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE):
+        sentence = sentence.translate(str.maketrans(table))
+
+    # number related NSW verbalization. The rules run in this fixed order:
+    # specific patterns first, the generic number patterns last.
+    rules = (
+        (RE_DATE, replace_date),
+        (RE_DATE2, replace_date2),
+        (RE_TIME, replace_time),
+        (RE_TEMPERATURE, replace_temperature),
+        (RE_RANGE, replace_range),
+        (RE_FRAC, replace_frac),
+        (RE_PERCENTAGE, replace_percentage),
+        (RE_MOBILE_PHONE, replace_phone),
+        (RE_TELEPHONE, replace_phone),
+        (RE_DEFAULT_NUM, replace_default_num),
+        (RE_NUMBER, replace_number),
+    )
+    for pattern, replacer in rules:
+        sentence = pattern.sub(replacer, sentence)
+
+    return sentence
+
+
+def normalize(text):
+    """Split text into sentences and normalize each of them.
+
+    Parameters
+    ----------
+    text : str
+        The input text.
+
+    Returns
+    -------
+    list
+        The normalized sentences, in their original order.
+    """
+    return [normalize_sentence(piece) for piece in split(text)]
diff --git a/third_party/text_processing/normalization/char_convert.py b/third_party/text_processing/normalization/char_convert.py
@@ -0,0 +1,15 @@
+"""Traditional and simplified Chinese conversion with 
+`opencc <https://github.com/BYVoid/OpenCC>`_.
+"""
+
+
+import opencc
+
+# Module-level converters, built once when the module is imported.
+_t2s_converter = opencc.OpenCC("t2s.json")
+_s2t_converter = opencc.OpenCC('s2t.json')
+
+
+def tranditional_to_simplified(text: str) -> str:
+    """Convert text with the OpenCC ``t2s.json`` configuration."""
+    simplified = _t2s_converter.convert(text)
+    return simplified
+
+
+def simplified_to_traditional(text: str) -> str:
+    """Convert text with the OpenCC ``s2t.json`` configuration."""
+    traditional = _s2t_converter.convert(text)
+    return traditional
diff --git a/third_party/text_processing/normalization/chronology.py b/third_party/text_processing/normalization/chronology.py
@@ -0,0 +1,64 @@
+import re
+from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
+
+
+def _time_num2str(num_string: str) -> str:
+    """Verbalize a number inside a time, reading a leading zero aloud.
+
+    Leading zeros are removed before the conversion; when the input started
+    with a zero, the zero digit word is put in front of the result.
+    """
+    spoken = num2str(num_string.lstrip('0'))
+    return DIGITS['0'] + spoken if num_string.startswith('0') else spoken
+
+# Time-of-day expression: hour, minute and an optional second, split by ':'
+RE_TIME = re.compile(
+    r'([0-1]?[0-9]|2[0-3])'
+    r':([0-5][0-9])'
+    r'(:([0-5][0-9]))?'
+)
+
+
+def replace_time(match: re.Match) -> str:
+    """Verbalize a RE_TIME match.
+
+    The hour is always spoken; the minute and the second are spoken only
+    when they contain a non-zero digit.
+    """
+    hour, minute, second = match.group(1, 2, 4)
+
+    pieces = [f"{num2str(hour)}点"]
+    if minute.lstrip('0'):
+        pieces.append(f"{_time_num2str(minute)}分")
+    if second and second.lstrip('0'):
+        pieces.append(f"{_time_num2str(second)}秒")
+    return ''.join(pieces)
+
+
+# Date written with 年 / 月 / 日 (or 号); the month and the day are optional
+RE_DATE = re.compile(
+    r'(\d{4}|\d{2})年'
+    r'((0?[1-9]|1[0-2])月)?'
+    r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?'
+)
+
+
+def replace_date(match: re.Match) -> str:
+    """Verbalize a RE_DATE match.
+
+    The year is read digit by digit, the month and the day as cardinal
+    numbers; the day keeps whichever of 日 / 号 appeared in the text.
+    """
+    year, month, day = match.group(1, 3, 5)
+
+    pieces = []
+    if year:
+        pieces.append(f"{verbalize_digit(year)}年")
+    if month:
+        pieces.append(f"{verbalize_cardinal(month)}月")
+    if day:
+        # group 9 is the day suffix captured by ([日号])
+        pieces.append(f"{verbalize_cardinal(day)}{match.group(9)}")
+    return ''.join(pieces)
+
+# Dates such as YYYY/MM/DD or YYYY-MM-DD: a 4-digit year, then month and day
+# separated by one of '-', ' ', '/' or '.', the same separator both times
+RE_DATE2 = re.compile(
+    r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])'
+)
+
+
+def replace_date2(match: re.Match) -> str:
+    """Verbalize a RE_DATE2 match as year, month and day.
+
+    The year is read digit by digit, the month and the day as cardinal
+    numbers, each followed by 年, 月 and 日 respectively.
+    """
+    year, month, day = match.group(1, 3, 4)
+
+    pieces = []
+    if year:
+        pieces.append(f"{verbalize_digit(year)}年")
+    if month:
+        pieces.append(f"{verbalize_cardinal(month)}月")
+    if day:
+        pieces.append(f"{verbalize_cardinal(day)}日")
+    return ''.join(pieces)
diff --git a/third_party/text_processing/normalization/constants.py b/third_party/text_processing/normalization/constants.py
@@ -0,0 +1,58 @@
+import string
+import re
+from pypinyin.constants import SUPPORT_UCS4
+
+
+# Full-width <-> half-width conversion tables.
+# Each full-width key is the character 65248 code points above its
+# half-width counterpart. Keys and values are one-character strings.
+
+# English letters, full-width -> half-width (num: 52)
+F2H_ASCII_LETTERS = {
+    chr(code + 65248): chr(code)
+    for code in map(ord, string.ascii_letters)
+}
+
+# English letters, half-width -> full-width
+H2F_ASCII_LETTERS = {half: full for full, half in F2H_ASCII_LETTERS.items()}
+
+# Digits, full-width -> half-width (num: 10)
+F2H_DIGITS = {
+    chr(code + 65248): chr(code)
+    for code in map(ord, string.digits)
+}
+# Digits, half-width -> full-width
+H2F_DIGITS = {half: full for full, half in F2H_DIGITS.items()}
+
+# Punctuation marks, full-width -> half-width (num: 32)
+F2H_PUNCTUATIONS = {
+    chr(code + 65248): chr(code)
+    for code in map(ord, string.punctuation)
+}
+# Punctuation marks, half-width -> full-width
+H2F_PUNCTUATIONS = {half: full for full, half in F2H_PUNCTUATIONS.items()}
+
+# Space (num: 1)
+F2H_SPACE = {'\u3000': ' '}
+H2F_SPACE = {' ': '\u3000'}
+
+# Matches runs of characters that are not "Chinese characters with a pinyin
+# reading"; can be used for NSW extraction.
+# SUPPORT_UCS4 (from pypinyin.constants) selects the pattern: the first one
+# also excludes supplementary-plane CJK ranges, the fallback only excludes
+# ranges up to U+FFFF.
+# NOTE(review): the Extension C and compatibility-supplement ranges start at
+# 2A703 and 2F80A while their labels say 2A700 and 2F800 -- confirm intent.
+if SUPPORT_UCS4:
+    RE_NSW = re.compile(
+        r'(?:[^'
+        r'\u3007'                  # 〇
+        r'\u3400-\u4dbf'           # CJK Extension A: [3400-4DBF]
+        r'\u4e00-\u9fff'           # CJK basic: [4E00-9FFF]
+        r'\uf900-\ufaff'           # CJK compatibility: [F900-FAFF]
+        r'\U00020000-\U0002A6DF'   # CJK Extension B: [20000-2A6DF]
+        r'\U0002A703-\U0002B73F'   # CJK Extension C: [2A700-2B73F]
+        r'\U0002B740-\U0002B81D'   # CJK Extension D: [2B740-2B81D]
+        r'\U0002F80A-\U0002FA1F'   # CJK compatibility supplement: [2F800-2FA1F]
+        r'])+'
+    )
+else:
+    RE_NSW = re.compile(  # pragma: no cover
+        r'(?:[^'
+        r'\u3007'                  # 〇
+        r'\u3400-\u4dbf'           # CJK Extension A: [3400-4DBF]
+        r'\u4e00-\u9fff'           # CJK basic: [4E00-9FFF]
+        r'\uf900-\ufaff'           # CJK compatibility: [F900-FAFF]
+        r'])+'
+    )
diff --git a/third_party/text_processing/normalization/num.py b/third_party/text_processing/normalization/num.py
@@ -0,0 +1,155 @@
+"""
+Rules to verbalize numbers into Chinese characters.
+https://zh.wikipedia.org/wiki/中文数字#現代中文
+"""
+
+import re
+from typing import List
+from collections import OrderedDict
+
+# Digit character -> Chinese numeral
+DIGITS = dict(zip('0123456789', '零一二三四五六七八九'))
+# Power of ten -> unit character, in ascending order of power
+UNITS = OrderedDict([
+    (1, '十'),
+    (2, '百'),
+    (3, '千'),
+    (4, '万'),
+    (8, '亿'),
+])
+
+# Fraction expression: optional minus sign, numerator, '/', denominator
+RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
+
+
+def replace_frac(match: re.Match) -> str:
+    """Verbalize a RE_FRAC match, denominator first then numerator.
+
+    A leading minus sign is read as 负.
+    """
+    minus, numerator, denominator = match.groups()
+    prefix = "负" if minus else ""
+    return f"{prefix}{num2str(denominator)}分之{num2str(numerator)}"
+
+
+# Percentage expression: optional minus sign, integer or decimal, then '%'
+RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
+
+
+def replace_percentage(match: re.Match) -> str:
+    """Verbalize a RE_PERCENTAGE match as 百分之 followed by the number.
+
+    A leading minus sign is read as 负.
+    """
+    minus, percent = match.group(1, 2)
+    prefix = "负" if minus else ""
+    return f"{prefix}百分之{num2str(percent)}"
+
+# Integer expression
+# An integer with or without a minus sign: 12, -10
+RE_INTEGER = re.compile(r'(-?)(\d+)')
+
+# Serial number: an unsigned run of four or more digits
+# 00078
+RE_DEFAULT_NUM = re.compile(r'\d{4}\d*')
+
+
+def replace_default_num(match: re.Match):
+    """Verbalize a RE_DEFAULT_NUM match digit by digit."""
+    return verbalize_digit(match.group(0))
+
+# Number expression
+# 1. integers: -10, 10;
+# 2. floating point numbers: 10.2, -0.3
+# 3. pure decimals without sign and integer part: .22, .38
+RE_NUMBER = re.compile(
+    r'(-?)((\d+)(\.\d+)?)'
+    r'|(\.(\d+))'
+)
+
+
+def replace_number(match: re.Match) -> str:
+    """Verbalize a RE_NUMBER match.
+
+    A pure decimal such as ``.22`` is converted as is; any other number is
+    converted with a leading minus sign read as 负.
+    """
+    pure_decimal = match.group(5)
+    if pure_decimal:
+        return num2str(pure_decimal)
+
+    prefix = "负" if match.group(1) else ""
+    return f"{prefix}{num2str(match.group(2))}"
+
+# Range expression
+# 12-23, 12~23
+RE_RANGE = re.compile(
+    r'(\d+)[-~](\d+)'
+)
+
+
+def replace_range(match: re.Match) -> str:
+    """Verbalize a RE_RANGE match as the two numbers joined by 到."""
+    lower, upper = match.groups()
+    return f"{num2str(lower)}到{num2str(upper)}"
+
+
+def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
+    """Break a digit string into the symbols of its Chinese reading.
+
+    Parameters
+    ----------
+    value_string : str
+        The digits to verbalize; leading zeros are allowed.
+    use_zero : bool
+        Whether a single significant digit that is preceded by zeros is read
+        with a leading zero symbol. The recursive calls use the default.
+
+    Returns
+    -------
+    List[str]
+        Digit and unit symbols in reading order; empty when nothing is left
+        after removing the leading zeros.
+    """
+    stripped = value_string.lstrip('0')
+    if len(stripped) == 0:
+        return []
+    elif len(stripped) == 1:
+        # A lone digit; it had leading zeros when stripping shortened it.
+        if use_zero and len(stripped) < len(value_string):
+            return [DIGITS['0'], DIGITS[stripped]]
+        else:
+            return [DIGITS[stripped]]
+    else:
+        # Largest power in UNITS below the number of significant digits.
+        largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped))
+        # Split the unstripped string so zeros stay visible to the recursion.
+        first_part = value_string[:-largest_unit]
+        second_part = value_string[-largest_unit:]
+        return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)
+
+def verbalize_cardinal(value_string: str) -> str:
+    """Read a string of digits as a cardinal number.
+
+    An empty string gives an empty string, and a string made only of zeros
+    gives the zero digit word.
+    """
+    if not value_string:
+        return ''
+
+    # 000 -> '零' , 0 -> '零'
+    digits = value_string.lstrip('0')
+    if not digits:
+        return DIGITS['0']
+
+    symbols = _get_value(digits)
+    # a reading that starts with '一十*' is abbreviated to '十*'
+    if symbols[:2] == [DIGITS['1'], UNITS[1]]:
+        symbols = symbols[1:]
+    return ''.join(symbols)
+
+def verbalize_digit(value_string: str, alt_one=False) -> str:
+    """Read a string of digits one digit at a time.
+
+    Parameters
+    ----------
+    value_string : str
+        The digits to verbalize.
+    alt_one : bool
+        When true, every 一 in the result is replaced by 幺.
+
+    Returns
+    -------
+    str
+        The digit words, joined without separators.
+
+    Raises
+    ------
+    KeyError
+        If value_string contains a character that is not a key of DIGITS.
+    """
+    result = ''.join(DIGITS[digit] for digit in value_string)
+    if alt_one:
+        # str.replace returns a new string; without rebinding the result the
+        # substitution is silently lost.
+        result = result.replace("一", "幺")
+    return result
+
+def num2str(value_string: str) -> str:
+    """Verbalize an unsigned integer or decimal written with digits.
+
+    Parameters
+    ----------
+    value_string : str
+        Digits with at most one decimal point, e.g. ``'12'``, ``'3.20'``
+        or ``'.22'``.
+
+    Returns
+    -------
+    str
+        The integer part read as a cardinal number, followed by 点 and the
+        decimal digits read one by one when a non-zero decimal part exists.
+
+    Raises
+    ------
+    ValueError
+        If value_string contains more than one decimal point.
+    """
+    parts = value_string.split('.')
+    if len(parts) > 2:
+        # The message used to contain a stray '$' before the placeholder.
+        raise ValueError(f"The value string: '{value_string}' has more than one point in it.")
+    integer = parts[0]
+    decimal = parts[1] if len(parts) == 2 else ''
+
+    result = verbalize_cardinal(integer)
+
+    # Trailing zeros of the decimal part are not read.
+    decimal = decimal.rstrip('0')
+    if decimal:
+        # '.22' is verbalized as '点二二'
+        # '3.20' is verbalized as '三点二'
+        result += '点' + verbalize_digit(decimal)
+    return result
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/third_party/text_processing/normalization/phone.py b/third_party/text_processing/normalization/phone.py
@@ -0,0 +1,31 @@
+import re
+from .num import verbalize_digit
+
+
+# Normalization of landline / mobile phone numbers
+# Mobile number prefixes
+# http://www.jihaoba.com/news/show/13680
+# China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
+# China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
+# China Telecom: 133, 153, 189, 180, 181, 177
+RE_MOBILE_PHONE = re.compile(
+    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)"
+)
+# Landline: optional area code with an optional hyphen, then 7 or 8 digits
+RE_TELEPHONE = re.compile(
+    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)"
+)
+
+
+def phone2str(phone_string: str, mobile=True) -> str:
+    """Read a phone number digit by digit, with 一 spoken as 幺.
+
+    Parameters
+    ----------
+    phone_string : str
+        The phone number as it appears in the text.
+    mobile : bool
+        When true, leading and trailing '+' are stripped and the number is
+        split on whitespace; otherwise it is split on '-'.
+
+    Returns
+    -------
+    str
+        The verbalized parts, joined without separators.
+    """
+    if mobile:
+        parts = phone_string.strip('+').split()
+    else:
+        parts = phone_string.split('-')
+    return ''.join(verbalize_digit(part, alt_one=True) for part in parts)
+
+
+def replace_phone(match: re.Match) -> str:
+    """Verbalize a RE_MOBILE_PHONE or RE_TELEPHONE match.
+
+    Matches containing a hyphen are handled as landline numbers, so the
+    hyphen is used as a separator instead of being looked up as a digit
+    (which would raise KeyError); all other matches take the mobile path.
+    """
+    phone = match.group(0)
+    # Only RE_TELEPHONE can match a '-', between the area code and the number.
+    return phone2str(phone, mobile='-' not in phone)
diff --git a/third_party/text_processing/normalization/quantifier.py b/third_party/text_processing/normalization/quantifier.py
@@ -0,0 +1,18 @@
+import re
+from .num import num2str
+
+
+# Temperature expression; a temperature changes how the minus sign is read
+# -3°C is read as 零下三度
+RE_TEMPERATURE = re.compile(
+    r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
+)
+
+
+def replace_temperature(match: re.Match) -> str:
+    """Verbalize a RE_TEMPERATURE match.
+
+    A minus sign is read as 零下. The unit is spoken as 摄氏度 when the text
+    spelled it that way and as 度 for every other unit form.
+    """
+    # Group 3 is the optional decimal part of the number; the unit is
+    # group 4. Reading group 3 made the 摄氏度 branch unreachable.
+    sign, temperature, unit = match.group(1, 2, 4)
+    sign = "零下" if sign else ""
+    temperature = num2str(temperature)
+    unit = "摄氏度" if unit == "摄氏度" else "度"
+    return f"{sign}{temperature}{unit}"
diff --git a/third_party/text_processing/normalization/sentence_split.py b/third_party/text_processing/normalization/sentence_split.py
@@ -0,0 +1,23 @@
+import re
+from typing import List
+
+
+SENTENCE_SPLITOR = re.compile(r'([。！？][”’]?)')
+
+
+def split(text: str) -> List[str]:
+    """Break text into sentences at sentence-final punctuation.
+
+    A line break is inserted after every 。, ！ or ？ (together with a closing
+    quotation mark that directly follows it), then the text is cut at the
+    line breaks.
+
+    Parameters
+    ----------
+    text : str
+        The text to split.
+
+    Returns
+    -------
+    List[str]
+        The sentences, each stripped of surrounding whitespace.
+    """
+    marked = SENTENCE_SPLITOR.sub(r'\1\n', text).strip()
+    return [piece.strip() for piece in re.split(r'\n+', marked)]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Regular expression based text normalization for Chinese

		For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
Copy link Collaborator zh794390558 Jun 7, 2021 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Jere's