diff --git a/include/kiwi/Form.h b/include/kiwi/Form.h index 60acc9cb..335a3138 100644 --- a/include/kiwi/Form.h +++ b/include/kiwi/Form.h @@ -228,17 +228,18 @@ namespace kiwi uint32_t formId = 0; float scoreHash = 0; uint32_t typoId = 0; + uint16_t numSpaces = 0; CondVowel leftCond = CondVowel::none; TypoForm() = default; - TypoForm(const std::tuple& p) - : formId{ std::get<0>(p) }, scoreHash{ std::get<1>(p) }, leftCond{ std::get<2>(p) } + TypoForm(const std::tuple& p) + : formId{ std::get<0>(p) }, scoreHash{ std::get<1>(p) }, numSpaces{ std::get<2>(p)}, leftCond{std::get<3>(p)} { } - TypoForm(uint32_t _formId, float _score = 0, bool _hash = 0, uint32_t _typoId = 0, CondVowel _leftCond = CondVowel::none) - : formId{ _formId }, scoreHash{ _hash ? -_score : _score }, typoId{ _typoId }, leftCond{ _leftCond } + TypoForm(uint32_t _formId, float _score = 0, bool _hash = 0, uint32_t _typoId = 0, uint16_t _numSpaces = 0, CondVowel _leftCond = CondVowel::none) + : formId{ _formId }, scoreHash{ _hash ? 
-_score : _score }, typoId{ _typoId }, numSpaces{ _numSpaces }, leftCond{ _leftCond } { } diff --git a/src/KTrie.cpp b/src/KTrie.cpp index f02e1965..490d5d46 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -50,18 +50,21 @@ namespace kiwi uint32_t start = 0; uint32_t typoId = 0; uint32_t end = 0; // only used in continual typo tolerant mode + uint32_t numSpaces = 0; FormCandidate(const Form* _form = nullptr, float _cost = 0, uint32_t _start = 0, uint32_t _typoId = 0, uint32_t _end = 0, + uint32_t _numSpaces = 0, uint32_t = 0) : form{ _form }, cost{ _cost }, start{ _start }, typoId{ _typoId }, - end{ _end } + end{ _end }, + numSpaces{ _numSpaces } {} size_t getStartPos(size_t ) const @@ -86,7 +89,7 @@ namespace kiwi size_t getFormSizeWithTypos(const size_t* typoPtrs) const { - return typoPtrs[typoId + 1] - typoPtrs[typoId]; + return typoPtrs[typoId + 1] - typoPtrs[typoId] + numSpaces; } bool operator==(const Form* f) const @@ -100,7 +103,7 @@ namespace kiwi { const Form* form = nullptr; - FormCandidate(const Form* _form = nullptr, float = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0) + FormCandidate(const Form* _form = nullptr, float = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0) : form{ _form } {} @@ -146,8 +149,9 @@ namespace kiwi uint32_t _start = 0, uint32_t _typoId = 0, uint32_t _end = 0, + uint32_t _numSpaces = 0, uint32_t _lengthenedSize = 0) - : FormCandidate{ _form, _cost, _start, _typoId, _end, _lengthenedSize }, + : FormCandidate{ _form, _cost, _start, _typoId, _end, _numSpaces, _lengthenedSize }, lengthenedSize{ _lengthenedSize } {} @@ -203,6 +207,7 @@ namespace kiwi startPosition ? 
startPosition : ((nonSpaces.size() - typoFormSize) * posMultiplier), tCand->typoId, endPosition, + tCand->numSpaces, lengthenedSize); } if (tCand[0].hash() != tCand[1].hash()) break; diff --git a/src/KiwiBuilder.cpp b/src/KiwiBuilder.cpp index 26df4ff3..0c370065 100644 --- a/src/KiwiBuilder.cpp +++ b/src/KiwiBuilder.cpp @@ -2028,7 +2028,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c // 오타 교정이 있는 경우 가능한 모든 오타에 대해 Trie 생성 else { - using TypoInfo = tuple; + using TypoInfo = tuple; UnorderedMap> typoGroup; auto ptypos = typos.prepare(); ret.continualTypoCost = ptypos.getContinualTypoCost(); @@ -2043,12 +2043,12 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c for (auto t : ptypos._generate(f->form, typoCostThreshold)) { if (t.leftCond != CondVowel::none && f->vowel != CondVowel::none && t.leftCond != f->vowel) continue; - typoGroup[removeSpace(t.str)].emplace_back(f - ret.forms.data(), t.cost, t.leftCond); + typoGroup[removeSpace(t.str)].emplace_back(f - ret.forms.data(), t.cost, f->numSpaces, t.leftCond); } } else { - typoGroup[removeSpace(f->form)].emplace_back(f - ret.forms.data(), 0, CondVowel::none); + typoGroup[removeSpace(f->form)].emplace_back(f - ret.forms.data(), 0, f->numSpaces, CondVowel::none); } } @@ -2107,7 +2107,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c estimatedNodeSize += f->first.size() - commonPrefix; prevForm = &f->first; } - ret.typoForms.emplace_back(0, 0, hash); + ret.typoForms.emplace_back(0, 0, hash, 0, 0); /* TypoForm(_formId, _score, _hash, _typoId, _numSpaces): hash stays in the _hash slot; typoId and numSpaces are explicitly 0 */ ret.typoPtrs.emplace_back(ret.typoPool.size()); formTrie.reserveMore(estimatedNodeSize); diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 09567ba1..a200125d 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -1657,3 +1657,18 @@ TEST(KiwiCpp, IssueP189) EXPECT_EQ(res[3].str, u"팩"); EXPECT_EQ(res[4].str, u"무료"); } + +TEST(KiwiCpp, Issue205) +{ + KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, }; + 
builder.addWord(u"함박 스테이크"); + auto kiwi1 = builder.build(); + auto res1 = kiwi1.analyze(u"함박 스테이크를 먹었습니다", Match::allWithNormalizing).first; + + EXPECT_EQ(res1[0].str, u"함박 스테이크"); + + auto kiwi2 = builder.build(DefaultTypoSet::basicTypoSetWithContinual); + auto res2 = kiwi2.analyze(u"함박 스테이크를 먹었습니다", Match::allWithNormalizing).first; + + EXPECT_EQ(res2[0].str, u"함박 스테이크"); +}