Skip to content

Commit

Permalink
Replacing trie with DFA
Browse files Browse the repository at this point in the history
But keeping trie code for comparison
  • Loading branch information
gershnik committed Nov 26, 2023
1 parent 0b6011f commit 9a0de83
Show file tree
Hide file tree
Showing 12 changed files with 751 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:

- name: Test
shell: bash
run: xcodebuild test -workspace Translit.xcworkspace -scheme Translit -derivedDataPath DerivedData
run: xcodebuild test -workspace Translit.xcworkspace -scheme Translit -testPlan Main -derivedDataPath DerivedData

- name: Build
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ DerivedData
External
Local.xcconfig
Translit/generated

Translit/tests/PerfData
10 changes: 10 additions & 0 deletions Translit.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
445D1E622AFE364900FA1C07 /* Transliterator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 445D1E602AFE364900FA1C07 /* Transliterator.cpp */; };
445D1E662AFFA1F700FA1C07 /* InputMethodKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 445D1E652AFFA1F700FA1C07 /* InputMethodKit.framework */; };
446DBB912B00BD53000B76EC /* TestMiniTrie.mm in Sources */ = {isa = PBXBuildFile; fileRef = 446DBB902B00BD53000B76EC /* TestMiniTrie.mm */; };
4475AD322B11F97F008DA122 /* TestStateMachine.mm in Sources */ = {isa = PBXBuildFile; fileRef = 4475AD312B11F97F008DA122 /* TestStateMachine.mm */; };
44C675812B021410003A5BDE /* TransliteratorRegistry.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 44C6757F2B021410003A5BDE /* TransliteratorRegistry.cpp */; };
44C675822B021410003A5BDE /* TransliteratorRegistry.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 44C6757F2B021410003A5BDE /* TransliteratorRegistry.cpp */; };
44C675842B02153F003A5BDE /* TestRu.mm in Sources */ = {isa = PBXBuildFile; fileRef = 44C675832B02153F003A5BDE /* TestRu.mm */; };
Expand Down Expand Up @@ -130,6 +131,10 @@
445D1E732AFFFAE600FA1C07 /* Base */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = Base; path = Base.lproj/InfoPlist.strings; sourceTree = "<group>"; };
446DBB8E2B00BD53000B76EC /* TranslitTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = TranslitTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
446DBB902B00BD53000B76EC /* TestMiniTrie.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = TestMiniTrie.mm; sourceTree = "<group>"; };
4475AD302B11EEE4008DA122 /* StateMachine.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = StateMachine.h; sourceTree = "<group>"; };
4475AD312B11F97F008DA122 /* TestStateMachine.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = TestStateMachine.mm; sourceTree = "<group>"; };
4475AD332B131F80008DA122 /* Main.xctestplan */ = {isa = PBXFileReference; lastKnownFileType = text; path = Main.xctestplan; sourceTree = "<group>"; };
4475AD342B132242008DA122 /* Perf.xctestplan */ = {isa = PBXFileReference; lastKnownFileType = text; path = Perf.xctestplan; sourceTree = "<group>"; };
44C6757E2B01B4FD003A5BDE /* TableRU.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = TableRU.h; sourceTree = "<group>"; };
44C6757F2B021410003A5BDE /* TransliteratorRegistry.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = TransliteratorRegistry.cpp; sourceTree = "<group>"; };
44C675802B021410003A5BDE /* TransliteratorRegistry.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = TransliteratorRegistry.h; sourceTree = "<group>"; };
Expand Down Expand Up @@ -335,6 +340,7 @@
445D1E4B2AFE2C6C00FA1C07 /* AppDelegate.mm */,
44C675C62B034E17003A5BDE /* MenuProtocol.h */,
445D1E5E2AFE35AA00FA1C07 /* InputController.mm */,
4475AD302B11EEE4008DA122 /* StateMachine.h */,
445D1E632AFEE9A000FA1C07 /* MiniTrie.h */,
445D1E602AFE364900FA1C07 /* Transliterator.cpp */,
445D1E612AFE364900FA1C07 /* Transliterator.h */,
Expand All @@ -355,9 +361,12 @@
isa = PBXGroup;
children = (
446DBB902B00BD53000B76EC /* TestMiniTrie.mm */,
4475AD312B11F97F008DA122 /* TestStateMachine.mm */,
44C675832B02153F003A5BDE /* TestRu.mm */,
442D25BD2B0B2D2C00204800 /* TestCommon.h */,
442D25BE2B0B2D6600204800 /* TestCommon.mm */,
4475AD332B131F80008DA122 /* Main.xctestplan */,
4475AD342B132242008DA122 /* Perf.xctestplan */,
);
path = tests;
sourceTree = "<group>";
Expand Down Expand Up @@ -693,6 +702,7 @@
buildActionMask = 2147483647;
files = (
44C675822B021410003A5BDE /* TransliteratorRegistry.cpp in Sources */,
4475AD322B11F97F008DA122 /* TestStateMachine.mm in Sources */,
442D25B42B0A2E1B00204800 /* Transliterator.cpp in Sources */,
44C675842B02153F003A5BDE /* TestRu.mm in Sources */,
446DBB912B00BD53000B76EC /* TestMiniTrie.mm in Sources */,
Expand Down
12 changes: 10 additions & 2 deletions Translit.xcodeproj/xcshareddata/xcschemes/Translit.xcscheme
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,16 @@
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
shouldUseLaunchSchemeArgsEnv = "YES"
shouldAutocreateTestPlan = "YES">
shouldUseLaunchSchemeArgsEnv = "YES">
<TestPlans>
<TestPlanReference
reference = "container:Translit/tests/Main.xctestplan"
default = "YES">
</TestPlanReference>
<TestPlanReference
reference = "container:Translit/tests/Perf.xctestplan">
</TestPlanReference>
</TestPlans>
<Testables>
<TestableReference
skipped = "NO"
Expand Down
286 changes: 286 additions & 0 deletions Translit/src/StateMachine.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
// Copyright (c) 2023, Eugene Gershnik
// SPDX-License-Identifier: GPL-3.0-or-later

#ifndef TRANSLIT_HEADER_STATE_MACHINE_H_INCLUDED
#define TRANSLIT_HEADER_STATE_MACHINE_H_INCLUDED

#include <algorithm>
#include <cassert>
#include <concepts>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <iterator>
#include <limits>
#include <map>
#include <optional>
#include <ranges>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>


/**
 A table driven deterministic finite automaton that maps strings of Char to payloads.

 The machine is built from (payload, source string) pairs. Every distinct prefix of every
 source string becomes a state. States at which some source string ends ("terminal" states)
 carry an Outcome; the remaining ones are "intermediate".

 State numbering:
  - state 0 is always the root (the empty prefix) and always has an Outcome slot
  - states [0, outcomesSize()) are the root and the terminal states
  - states [outcomesSize(), stateCount) are intermediate states

 After construction everything is stored in one contiguous byte buffer laid out as
 [sorted input characters][transition table: stateCount x inputCount][outcomes].

 @tparam Char character type of the source strings
 @tparam LengthType unsigned type used to store state indices. Its maximum value is reserved
         to mean "no transition" so it bounds the number of states
 @tparam PayloadType value associated with each source string
*/
template<class Char, std::unsigned_integral LengthType = uint8_t, class PayloadType = Char>
class StateMachine {

private:
    /** Transition table value meaning "no transition for this input from this state" */
    static inline constexpr LengthType noTransition = LengthType(-1);

private:
    /** What is known upon reaching the root or a terminal state */
    struct Outcome {
        PayloadType payload;
        /** Some source string ends exactly at this state */
        bool successful: 1;
        /** No longer source string continues through this state */
        bool final: 1;
    };

    // Char and Outcome values are copied into and read back from a raw byte buffer
    static_assert(std::is_trivially_copyable_v<Char> && std::is_trivially_copyable_v<Outcome>,
                  "Char and PayloadType must be trivially copyable");

    /** Build-time classification of a prefix */
    enum class OutcomeType {
        final,          // a source string ends here and nothing continues past it
        nonfinal,       // a source string ends here but longer ones continue past it
        intermediate    // no source string ends here
    };
    /** Build-time record for a prefix */
    struct OutcomeDescriptor {
        OutcomeDescriptor(LengthType idx_, OutcomeType type_, PayloadType payload_):
            idx(idx_), type(type_), payload(payload_)
        {}
        LengthType idx;
        OutcomeType type;
        PayloadType payload;
    };

    /** Build-time tables, compacted into m_data at the end of construction */
    struct Expanded {
        std::vector<Char> inputs;
        std::vector<Outcome> outcomes;
        std::vector<LengthType> transitions;
    };

public:
    /**
     Builds the machine from a range of (payload, null terminated source string) pairs.

     The range is traversed exactly once so single pass ranges are fine. The source strings
     are copied; they only need to stay valid while the constructor runs.
     An empty source string attaches its payload to the root state and may appear anywhere
     in the range. Duplicate source strings are a programming error (asserted in debug builds).
     The number of states must be less than the maximum value of LengthType minus 1
     (asserted in debug builds).
    */
    template<std::ranges::range Range, class Extractor = std::identity>
    requires(std::is_convertible_v<std::tuple_element_t<0, std::ranges::range_value_t<Range>>, PayloadType> &&
             std::is_convertible_v<std::tuple_element_t<1, std::ranges::range_value_t<Range>>, const Char *>)
    StateMachine(Range && range) {

        using Key = std::basic_string<Char>;

        Expanded expanded;
        std::map<Key, OutcomeDescriptor> outcomesMap;
        // number of states that get an Outcome slot: the root plus all terminal states
        size_t terminalCount = 0;

        for(auto [dst, src]: range) {

            // The root exists as soon as there is at least one entry
            auto [rootIt, rootInserted] = outcomesMap.try_emplace(Key{}, noTransition,
                                                                  OutcomeType::intermediate, PayloadType{});
            if (rootInserted)
                ++terminalCount;

            if (!*src) {
                // Empty source string: the payload belongs to the root itself
                auto & root = rootIt->second;
                assert(root.type == OutcomeType::intermediate); //duplicate empty source string
                root.payload = dst;
                // anything else already in the map is a continuation of the root
                root.type = outcomesMap.size() > 1 ? OutcomeType::nonfinal : OutcomeType::final;
                continue;
            }

            // A non-empty source string continues past the root
            if (rootIt->second.type == OutcomeType::final)
                rootIt->second.type = OutcomeType::nonfinal;

            for(const Char * p = src; *p; ++p) {

                // Keep the set of known input characters sorted and unique
                {
                    const Char input = *p;
                    auto it = std::lower_bound(expanded.inputs.begin(), expanded.inputs.end(), input);
                    if (it == expanded.inputs.end() || *it != input) {
                        expanded.inputs.insert(it, input);
                    }
                }

                // Register the prefix [src, p + 1)
                const bool endsHere = !p[1];
                const OutcomeType newType = endsHere ? OutcomeType::final : OutcomeType::intermediate;
                auto [it, inserted] = outcomesMap.try_emplace(Key(src, p + 1), noTransition, newType, dst);
                if (inserted) {
                    terminalCount += endsHere;
                    continue;
                }

                auto & existing = it->second;
                if (endsHere) {
                    assert(existing.type != OutcomeType::final); //if it already exists it must be non final!
                    if (existing.type == OutcomeType::intermediate) {
                        // it was only a prefix of something longer so far; now it is also a match
                        existing.type = OutcomeType::nonfinal;
                        ++terminalCount;
                    }
                    existing.payload = dst;
                } else if (existing.type == OutcomeType::final) {
                    // a previously complete match turns out to be a prefix of a longer one
                    existing.type = OutcomeType::nonfinal;
                }
            }
        }

        assert(terminalCount < size_t(std::numeric_limits<LengthType>::max() - 1));

        // Number the states: root and terminals first (in map order, the empty key sorts first
        // so the root gets 0), intermediates after them.
        expanded.outcomes.reserve(terminalCount);
        size_t stateCount = terminalCount;
        for(auto & [prefix, descriptor]: outcomesMap) {
            if (prefix.empty() || descriptor.type != OutcomeType::intermediate) {
                descriptor.idx = static_cast<LengthType>(expanded.outcomes.size());
                expanded.outcomes.push_back({
                    descriptor.payload,
                    descriptor.type != OutcomeType::intermediate,
                    descriptor.type == OutcomeType::final
                });
            } else {
                descriptor.idx = static_cast<LengthType>(stateCount++);
                assert(stateCount < size_t(std::numeric_limits<LengthType>::max() - 1));
            }
        }
        assert(stateCount == outcomesMap.size());

        // Every non-empty prefix is reached from its parent (the prefix without its last character)
        // on that last character. The map holds all prefixes so no second pass over range is needed.
        const size_t inputCount = expanded.inputs.size();
        expanded.transitions.resize(stateCount * inputCount, noTransition);
        for(const auto & [prefix, descriptor]: outcomesMap) {
            if (prefix.empty())
                continue;

            const auto parentIt = outcomesMap.find(prefix.substr(0, prefix.size() - 1));
            assert(parentIt != outcomesMap.end());

            const auto inputIt = std::lower_bound(expanded.inputs.begin(), expanded.inputs.end(), prefix.back());
            assert(inputIt != expanded.inputs.end() && *inputIt == prefix.back());
            const size_t inputIdx = size_t(inputIt - expanded.inputs.begin());

            expanded.transitions[size_t(parentIt->second.idx) * inputCount + inputIdx] = descriptor.idx;
        }

        // Compact everything into a single buffer, keeping each section properly aligned
        m_inputsEnd = expanded.inputs.size() * sizeof(Char);
        m_transitionsStart = alignSize(m_inputsEnd, alignof(LengthType));
        m_transitionsEnd = m_transitionsStart + expanded.transitions.size() * sizeof(LengthType);
        m_outcomesStart = alignSize(m_transitionsEnd, alignof(Outcome));
        const size_t compactSize = m_outcomesStart + expanded.outcomes.size() * sizeof(Outcome);
        m_data.resize(compactSize);
        std::copy(expanded.inputs.begin(), expanded.inputs.end(), inputsBegin());
        std::copy(expanded.transitions.begin(), expanded.transitions.end(), transitionsBegin());
        std::copy(expanded.outcomes.begin(), expanded.outcomes.end(), outcomesBegin());
    }

    /** Builds the machine from an iterator range of (payload, source string) pairs */
    template<class ItF, class ItL>
    requires(std::is_convertible_v<typename std::iterator_traits<ItF>::iterator_category, std::input_iterator_tag> &&
             std::is_convertible_v<std::tuple_element_t<0, typename std::iterator_traits<ItF>::value_type>, PayloadType> &&
             std::is_convertible_v<std::tuple_element_t<1, typename std::iterator_traits<ItF>::value_type>, const Char *>)
    StateMachine(ItF first, ItL last):
        StateMachine(std::ranges::subrange(first, last))
    {}

    /** Builds the machine from a braced list of (payload, source string) pairs */
    StateMachine(std::initializer_list<std::pair<PayloadType, const Char *>> init):
        StateMachine(init.begin(), init.end())
    {}

    /** Creates an empty machine that matches nothing */
    StateMachine() = default;

    template<class It>
    struct PrefixMatchResult {
        /**
         End of match
         If !successful always stays at the start of input
         */
        It next;
        /** The payload of the successful match if successful. Undefined otherwise */
        PayloadType payload;
        /** Whether the match was successful. */
        bool successful;
        /** Whether the answer is definite and won't change with larger input */
        bool definite;
    };

    /**
     Finds the longest source string that is a prefix of [first, last).

     If the walk gets stuck (or the input ends) in an intermediate state, the result falls back
     to the last terminal state passed on the way, if any.

     @returns see PrefixMatchResult. The result is definite when the walk got stuck on a
     character, since more input cannot change it. When the input simply ran out it is
     definite only if nothing longer could match.
    */
    template<class ItF, class ItL>
    requires(std::is_convertible_v<typename std::iterator_traits<ItF>::iterator_category, std::forward_iterator_tag> &&
             std::is_same_v<typename std::iterator_traits<ItF>::value_type, Char> &&
             std::equality_comparable_with<ItF, ItL>)
    auto prefixMatch(ItF first, ItL last) const noexcept -> PrefixMatchResult<ItF> {

        const LengthType terminals = outcomesSize();

        LengthType currentState = 0;
        LengthType lastMatchedState = 0; // 0 doubles as "no terminal state passed yet"
        auto current = first;
        auto consumed = first;           // where the input stood when lastMatchedState was reached
        bool stuck = false;              // stopped on a character rather than on end of input

        while(current != last) {

            const Char c = *current;
            const auto it = std::lower_bound(inputsBegin(), inputsEnd(), c);

            LengthType nextState = noTransition;
            if (it != inputsEnd() && *it == c)
                nextState = transitionFor(size_t(it - inputsBegin()), currentState);

            if (nextState == noTransition) {
                stuck = true;
                break;
            }

            if (currentState < terminals) {
                consumed = current;
                lastMatchedState = currentState;
            }
            currentState = nextState;
            ++current;
        }

        if (currentState < terminals) {
            // Stopped on the root or a terminal state: report it as is
            const auto & outcome = outcomesBegin()[currentState];
            return {current, outcome.payload, outcome.successful, stuck || outcome.final};
        }

        // Stopped on an intermediate state (or the machine is empty)
        if (lastMatchedState == 0)
            return {first, {}, false, stuck || inputsSize() == 0};

        // Fall back to the longest match seen on the way here
        const auto & outcome = outcomesBegin()[lastMatchedState];
        return {consumed, outcome.payload, outcome.successful, stuck || outcome.final};
    }

private:
    // Round size up to next multiple of alignment. alignment must be a power of 2
    static constexpr auto alignSize(size_t s, size_t alignment) noexcept -> size_t {
        assert(s + alignment > s);
        return (s + alignment - 1) & ~(alignment - 1);
    }

    // Typed views into the sections of m_data

    auto inputsBegin() const
        { return reinterpret_cast<const Char *>(m_data.data()); }
    auto inputsBegin()
        { return reinterpret_cast<Char *>(m_data.data()); }
    auto inputsEnd() const
        { return reinterpret_cast<const Char *>(m_data.data() + m_inputsEnd); }
    auto inputsEnd()
        { return reinterpret_cast<Char *>(m_data.data() + m_inputsEnd); }

    auto transitionsBegin() const
        { return reinterpret_cast<const LengthType *>(m_data.data() + m_transitionsStart); }
    auto transitionsBegin()
        { return reinterpret_cast<LengthType *>(m_data.data() + m_transitionsStart); }

    auto outcomesBegin() const
        { return reinterpret_cast<const Outcome *>(m_data.data() + m_outcomesStart); }
    auto outcomesBegin()
        { return reinterpret_cast<Outcome *>(m_data.data() + m_outcomesStart); }


    // Number of distinct input characters. Not limited by LengthType so kept as size_t
    auto inputsSize() const -> size_t
        { return m_inputsEnd / sizeof(Char); }
    // Row-major lookup: one row per state, one column per input character
    auto transitionFor(size_t inputIdx, LengthType state) const
        { return transitionsBegin()[size_t(state) * inputsSize() + inputIdx]; }

    // Number of states that have an Outcome (the root and the terminal states)
    auto outcomesSize() const -> LengthType
        { return LengthType((m_data.size() - m_outcomesStart) / sizeof(Outcome)); }

private:
    std::vector<uint8_t> m_data;
    // Byte offsets of the sections inside m_data
    size_t m_inputsEnd = 0;
    size_t m_transitionsStart = 0;
    size_t m_transitionsEnd = 0;
    size_t m_outcomesStart = 0;
};

#endif


12 changes: 10 additions & 2 deletions Translit/src/Transliterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@


void Transliterator::append(const sys_string & str) {

sys_string::char_access strAccess(str);
m_prefix.append(strAccess.begin(), strAccess.end());
m_translit.erase(m_translit.begin() + m_translitCompletedSize, m_translit.end());
Expand All @@ -14,10 +13,19 @@ void Transliterator::append(const sys_string & str) {
const auto end = m_prefix.end();
auto completed = begin;
for (auto start = begin ; start != end; ) {
#ifdef TRANSLIT_USE_TRIE
auto res = m_trie.prefixMatch(start, end);
if (res.index != Trie::noMatch) {
if (res.index != m_trie.noMatch) {
#else
auto res = m_sm.prefixMatch(start, end);
if (res.successful) {
#endif
m_matchedSomething = true;
#ifdef TRANSLIT_USE_TRIE
m_translit += m_replacements[res.index];
#else
m_translit += res.payload;
#endif
//if the result is not definite we don't know if a longer match is possible so bail out
if (!res.definite)
break;
Expand Down
Loading

0 comments on commit 9a0de83

Please sign in to comment.