From 896b669a7dcbe6224d28dc42dc84ed21b5e77d2e Mon Sep 17 00:00:00 2001 From: Meng Wei Date: Wed, 26 Jun 2024 11:40:11 +0800 Subject: [PATCH] =?UTF-8?q?[itn]=20fix=20issue#237,=20digit=20+=20union("?= =?UTF-8?q?=E7=99=BE",=20"=E5=8D=83",=20"=E4=B8=87")=20+=20digit=20+=20uni?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- itn/chinese/data/number/digit_zh.tsv | 10 ++++++ itn/chinese/rules/measure.py | 36 +++++++++++++++++-- ...sable_standalone_number_disable_0_to_9.txt | 4 ++- ...isable_standalone_number_enable_0_to_9.txt | 4 ++- ...nable_standalone_number_disable_0_to_9.txt | 4 ++- 5 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 itn/chinese/data/number/digit_zh.tsv diff --git a/itn/chinese/data/number/digit_zh.tsv b/itn/chinese/data/number/digit_zh.tsv new file mode 100644 index 0000000..ac0110c --- /dev/null +++ b/itn/chinese/data/number/digit_zh.tsv @@ -0,0 +1,10 @@ +一 +二 +两 +三 +四 +五 +六 +七 +八 +九 diff --git a/itn/chinese/rules/measure.py b/itn/chinese/rules/measure.py index c48ab92..f8f5370 100644 --- a/itn/chinese/rules/measure.py +++ b/itn/chinese/rules/measure.py @@ -16,7 +16,7 @@ from tn.processor import Processor from tn.utils import get_abs_path -from pynini import string_file, accep, cross +from pynini import string_file, accep, cross, union from pynini.lib.pynutil import delete, insert, add_weight @@ -36,6 +36,11 @@ def build_tagger(self): get_abs_path('../itn/chinese/data/measure/units_zh.tsv')) sign = string_file( get_abs_path('../itn/chinese/data/number/sign.tsv')) # + - + digit = string_file( + get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9 + digit_zh = string_file( + get_abs_path('../itn/chinese/data/number/digit_zh.tsv')) # 1 ~ 9 + addzero = insert('0') to = cross('到', '~') | cross('到百分之', '~') units = add_weight( @@ -55,8 +60,35 @@ def build_tagger(self): # 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h measure = number + (to + number).ques + units - tagger = insert('value: "') + (measure | percent) + insert('"') + # XXX: 特殊case处理, ignore enable_standalone_number + # digit + union("百", "千", "万") + digit + unit + unit_sp_case1 = [ + '年', + '月', + '个月', + '周', + '天', + '位', + '次', + '个', + '顿', + ] + if self.enable_0_to_9: + measure_sp = add_weight( + ((digit + delete('百') + add_weight(addzero**2, 1.0)) | + (digit + delete('千') + add_weight(addzero**3, 1.0)) | + (digit + delete('万') + add_weight(addzero**4, 1.0))) + + insert(' ') + digit + union(*unit_sp_case1), -0.5) + else: + measure_sp = add_weight( + ((digit + delete('百') + add_weight(addzero**2, 1.0)) | + (digit + delete('千') + add_weight(addzero**3, 1.0)) | + (digit + delete('万') + add_weight(addzero**4, 1.0))) + + digit_zh + union(*unit_sp_case1), -0.5) + + tagger = insert('value: "') + (measure | measure_sp + | percent) + insert('"') # 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h tagger |= (insert('denominator: "') + delete('每') + units + insert('" numerator: "') + measure + insert('"')) diff --git a/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt b/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt index e35fecd..53b4bc6 100644 --- a/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt +++ b/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt @@ -36,4 +36,6 @@ 这是九十九九千 => 这是九十九九千 这是十二一千 => 这是十二一千 这是零百 => 这是零百 -这是零千 => 这是零千 \ No newline at end of file +这是零千 => 这是零千 +这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天 +这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年 \ No newline at end of file diff --git a/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt b/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt index 1263cff..af35c1d 100644 --- a/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt +++ b/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt @@ -5,4 +5,6 @@ 这是九十九九千 => 这是九十九九千 这是十二一千 => 这是十二一千 这是零百 => 这是零百 -这是零千 => 这是零千 \ No newline at end of file +这是零千 => 这是零千 +这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天 +这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年 \ No newline at end of file diff --git a/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt b/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt index 8a3af23..1850adf 100644 --- a/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt +++ b/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt @@ -36,4 +36,6 @@ 这是九十九九千 => 这是99 9000 这是十二一千 => 这是12 1000 这是零百 => 这是零百 -这是零千 => 这是零千 \ No newline at end of file +这是零千 => 这是零千 +这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天 +这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年 \ No newline at end of file