Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[itn] fix issue#237, digit + union("百", "千", "万") + digit + unit #255

Merged
merged 1 commit into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions itn/chinese/data/number/digit_zh.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
36 changes: 34 additions & 2 deletions itn/chinese/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from tn.processor import Processor
from tn.utils import get_abs_path

from pynini import string_file, accep, cross
from pynini import string_file, accep, cross, union
from pynini.lib.pynutil import delete, insert, add_weight


Expand All @@ -36,6 +36,11 @@ def build_tagger(self):
get_abs_path('../itn/chinese/data/measure/units_zh.tsv'))
sign = string_file(
get_abs_path('../itn/chinese/data/number/sign.tsv')) # + -
digit = string_file(
get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9
digit_zh = string_file(
get_abs_path('../itn/chinese/data/number/digit_zh.tsv')) # 1 ~ 9
addzero = insert('0')
to = cross('到', '~') | cross('到百分之', '~')

units = add_weight(
Expand All @@ -55,8 +60,35 @@ def build_tagger(self):

# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
measure = number + (to + number).ques + units
tagger = insert('value: "') + (measure | percent) + insert('"')

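# XXX: special-case handling; this rule ignores enable_standalone_number.
# Pattern: digit + union("百", "千", "万") + digit + unit,
# e.g. 一百一个 => "100 1个" when enable_0_to_9 is set, otherwise "100一个".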
unit_sp_case1 = [
'年',
'月',
'个月',
'周',
'天',
'位',
'次',
'个',
'顿',
]
if self.enable_0_to_9:
measure_sp = add_weight(
((digit + delete('百') + add_weight(addzero**2, 1.0)) |
(digit + delete('千') + add_weight(addzero**3, 1.0)) |
(digit + delete('万') + add_weight(addzero**4, 1.0))) +
insert(' ') + digit + union(*unit_sp_case1), -0.5)
else:
measure_sp = add_weight(
((digit + delete('百') + add_weight(addzero**2, 1.0)) |
(digit + delete('千') + add_weight(addzero**3, 1.0)) |
(digit + delete('万') + add_weight(addzero**4, 1.0))) +
digit_zh + union(*unit_sp_case1), -0.5)

tagger = insert('value: "') + (measure | measure_sp
| percent) + insert('"')
# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h
tagger |= (insert('denominator: "') + delete('每') + units +
insert('" numerator: "') + measure + insert('"'))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,6 @@
这是九十九九千 => 这是九十九九千
这是十二一千 => 这是十二一千
这是零百 => 这是零百
这是零千 => 这是零千
这是零千 => 这是零千
这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天
这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@
这是九十九九千 => 这是九十九九千
这是十二一千 => 这是十二一千
这是零百 => 这是零百
这是零千 => 这是零千
这是零千 => 这是零千
这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天
这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,6 @@
这是九十九九千 => 这是99 9000
这是十二一千 => 这是12 1000
这是零百 => 这是零百
这是零千 => 这是零千
这是零千 => 这是零千
这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天
这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年
Loading