From 4275bcc4299881b3a7a7a3c6b9e26de55788c1cf Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 24 Jul 2016 00:23:22 +0300 Subject: [PATCH 01/33] Added TODO list --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 000000000..0212ff9af --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# algorithm +Boost.org algorithm module + +## TODO (Aho-Corasick) +* Solve problem with const char* +* std::map or std::unordered_map or something else? +* internal container - std::vector or maybe something else? +* Add choosing of internal container for Node and Tree From 697e81675d76419f734dba665016250f375cb6bd Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 26 Jul 2016 12:55:54 +0300 Subject: [PATCH 02/33] Initialize Aho-Corasik --- .../boost/algorithm/searching/aho_corasik.hpp | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 include/boost/algorithm/searching/aho_corasik.hpp diff --git a/include/boost/algorithm/searching/aho_corasik.hpp b/include/boost/algorithm/searching/aho_corasik.hpp new file mode 100644 index 000000000..a8be6f48f --- /dev/null +++ b/include/boost/algorithm/searching/aho_corasik.hpp @@ -0,0 +1,193 @@ +// +// Created by zamazan4ik on 17.07.16. 
+// + +#ifndef AHO_CORASIK_AHO_CORASIK_HPP +#define AHO_CORASIK_AHO_CORASIK_HPP + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace boost { namespace algorithm +{ + +template +class Base_Aho_Corasik +{ +public: + Base_Aho_Corasik() + { + tree.emplace_back(Node(0, 0, 0)); + } + + + template + void insert(const R &range) + { + insert(boost::begin(range), boost::end(range)); + } + + template + void insert(ForwardIterator begin, ForwardIterator end) + { + used.push_back(false); + + buffer.emplace_back(std::vector(std::distance(begin, end))); + size_t v = 0; + for (auto it = begin; it != end; ++it) + { + buffer.back().push_back(*it); + if (tree[v].next_.find(*it) == tree[v].next_.end()) + { + Node node(-1, -1, v); + node.prevValue_ = *it; + tree.push_back(node); + tree[v].next_[*it] = sz++; + } + v = tree[v].next_[*it]; + } + tree[v].isLeaf_ = true; + tree[v].pat_.push_back(countStrings++); + } + + //Search + template + std::vector> find(const R &range) + { + return find(boost::begin(range), boost::end(range)); + } + + template + std::vector> find(ForwardIterator begin, ForwardIterator end) + { + std::vector> result; + size_t v = 0; + std::fill(used.begin(), used.end(), false); + + while (begin != end) + { + v = go(v, *begin); + check(v); + ++begin; + } + + for (size_t i = 0; i < used.size(); ++i) + { + if (used[i]) + { + result.push_back(buffer[i]); + } + } + return result; + } + +private: + size_t getlink(const size_t v) + { + if (tree[v].suffLink_ == -1) + { + if (v == 0 || tree[v].prevNode_ == 0) + { + tree[v].suffLink_ = 0; + } + else + { + tree[v].suffLink_ = go(getlink(tree[v].prevNode_), tree[v].prevValue_); + } + } + return tree[v].suffLink_; + } + + size_t go(const size_t v, const T c) + { + if (tree[v].go_.find(c) == tree[v].go_.end()) + { + if (tree[v].next_.find(c) != tree[v].next_.end()) + { + tree[v].go_[c] = tree[v].next_[c]; + } + else + { + if (v == 0) + { + tree[v].go_[c] = 0; + } + 
else + { + tree[v].go_[c] = go(getlink(v), c); + } + } + } + return tree[v].go_[c]; + } + + size_t getgood(const size_t v) + { + if (tree[v].goodSuffLink_ == -1) + { + size_t u = getlink(v); + if (u == 0) + { + tree[v].goodSuffLink_ = 0; + } + else + { + tree[v].goodSuffLink_ = tree[u].isLeaf_ ? u : getgood(u); + } + } + return tree[v].goodSuffLink_; + } + + void check(const size_t v) + { + for (size_t i = v; i != 0; i = getgood(i)) + { + if (tree[i].isLeaf_) + { + for (const auto& x: tree[i].pat_) + { + used[x] = true; + } + } + } + } + +private: + struct Node + { + int suffLink_, goodSuffLink_; + size_t prevNode_; + bool isLeaf_; + T prevValue_; + std::vector pat_; + Map next_, go_; + Node(const int suffLink = -1, const int goodSuffLink = -1, const size_t prevNode = 0, + const bool isLeaf = false) : suffLink_(suffLink), goodSuffLink_(goodSuffLink), + prevNode_(prevNode), isLeaf_(isLeaf) + { + } + }; + std::vector tree; + std::vector> buffer; + size_t sz = 1, countStrings = 0; + std::vector used; +}; + +template, typename Alloc = std::allocator>> +using Aho_Corasik = Base_Aho_Corasik>; + +template, typename Pred = std::equal_to, + typename Alloc = std::allocator>> +using Aho_Corasik_Hash = Base_Aho_Corasik>; +}} +#endif //AHO_CORASIK_AHO_CORASIK_HPP From f5681d8298e3d3ff795d0336ec3a01b2463c835a Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Fri, 29 Jul 2016 04:14:45 +0300 Subject: [PATCH 03/33] Added return-reference find --- .../boost/algorithm/searching/aho_corasik.hpp | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasik.hpp b/include/boost/algorithm/searching/aho_corasik.hpp index a8be6f48f..f650e2b73 100644 --- a/include/boost/algorithm/searching/aho_corasik.hpp +++ b/include/boost/algorithm/searching/aho_corasik.hpp @@ -21,7 +21,7 @@ namespace boost { namespace algorithm { -template +template>> class Base_Aho_Corasik { public: @@ -30,6 +30,10 @@ class Base_Aho_Corasik 
tree.emplace_back(Node(0, 0, 0)); } + ~Base_Aho_Corasik() + { + + } template void insert(const R &range) @@ -49,7 +53,7 @@ class Base_Aho_Corasik buffer.back().push_back(*it); if (tree[v].next_.find(*it) == tree[v].next_.end()) { - Node node(-1, -1, v); + Node node(Node::Sentinel, Node::Sentinel, v); node.prevValue_ = *it; tree.push_back(node); tree[v].next_[*it] = sz++; @@ -62,15 +66,15 @@ class Base_Aho_Corasik //Search template - std::vector> find(const R &range) + ResultCont& find(const R &range) { return find(boost::begin(range), boost::end(range)); } template - std::vector> find(ForwardIterator begin, ForwardIterator end) + ResultCont& find(ForwardIterator begin, ForwardIterator end) { - std::vector> result; + result.clear(); size_t v = 0; std::fill(used.begin(), used.end(), false); @@ -94,7 +98,7 @@ class Base_Aho_Corasik private: size_t getlink(const size_t v) { - if (tree[v].suffLink_ == -1) + if (tree[v].suffLink_ == Node::Sentinel) { if (v == 0 || tree[v].prevNode_ == 0) { @@ -133,7 +137,7 @@ class Base_Aho_Corasik size_t getgood(const size_t v) { - if (tree[v].goodSuffLink_ == -1) + if (tree[v].goodSuffLink_ == Node::Sentinel) { size_t u = getlink(v); if (u == 0) @@ -165,15 +169,16 @@ class Base_Aho_Corasik private: struct Node { - int suffLink_, goodSuffLink_; - size_t prevNode_; + static const size_t Sentinel = static_cast(-1); + + size_t suffLink_, goodSuffLink_, prevNode_; bool isLeaf_; T prevValue_; std::vector pat_; Map next_, go_; - Node(const int suffLink = -1, const int goodSuffLink = -1, const size_t prevNode = 0, - const bool isLeaf = false) : suffLink_(suffLink), goodSuffLink_(goodSuffLink), - prevNode_(prevNode), isLeaf_(isLeaf) + Node(const size_t suffLink = Sentinel, const size_t goodSuffLink = Sentinel, + const size_t prevNode = 0, const bool isLeaf = false) : suffLink_(suffLink), goodSuffLink_(goodSuffLink), + prevNode_(prevNode), isLeaf_(isLeaf) { } }; @@ -181,13 +186,16 @@ class Base_Aho_Corasik std::vector> buffer; size_t sz = 1, 
countStrings = 0; std::vector used; + ResultCont result; }; -template, typename Alloc = std::allocator>> -using Aho_Corasik = Base_Aho_Corasik>; +template, typename Alloc = std::allocator>, + typename ResultCont = std::vector>> +using Aho_Corasik = Base_Aho_Corasik, ResultCont>; template, typename Pred = std::equal_to, - typename Alloc = std::allocator>> -using Aho_Corasik_Hash = Base_Aho_Corasik>; + typename Alloc = std::allocator>, + typename ResultCont = std::vector>> +using Aho_Corasik_Hash = Base_Aho_Corasik, ResultCont>; }} #endif //AHO_CORASIK_AHO_CORASIK_HPP From 47778dd50d50981d6a862257eb166eb1bebfb035 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sat, 13 Aug 2016 22:44:51 +0300 Subject: [PATCH 04/33] New version of Aho-Corasick --- .../algorithm/searching/aho_corasick.hpp | 232 ++++++++++++++++++ .../boost/algorithm/searching/aho_corasik.hpp | 201 --------------- 2 files changed, 232 insertions(+), 201 deletions(-) create mode 100644 include/boost/algorithm/searching/aho_corasick.hpp delete mode 100644 include/boost/algorithm/searching/aho_corasik.hpp diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp new file mode 100644 index 000000000..bbad739eb --- /dev/null +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include + +#include +#include + + +namespace boost { namespace algorithm { + +template typename Container, typename ...Args> +class AhoCorasick +{ +private: + class MapBorNode + { + public: + Container, Args...> links; + std::shared_ptr fail, term; + std::vector pat; + + MapBorNode(std::shared_ptr fail_node = nullptr) + : fail(fail_node), term(nullptr) + { } + + std::shared_ptr getLink(const T& c) const + { + const auto iter = links.find(c); + return iter != links.cend() ? 
iter->second : nullptr; + } + + bool isTerminal() const + { + return !pat.empty(); + } + }; +public: + using value_type = T; + using node_type = MapBorNode; +private: + std::shared_ptr root, current_state; + size_t countStrings = 0; +public: + AhoCorasick() : root(std::make_shared()) {} + + template + AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(std::make_shared()) + { + while(patBegin != patEnd) + { + insert(*patBegin); + ++patBegin; + } + } + + template + void insert(const R& range) + { + insert(boost::begin(range), boost::end(range)); + } + + template + void insert(ForwardIterator begin, ForwardIterator end) + { + size_t patLen = 0; + std::shared_ptr current_node = root; + for(auto it = begin; it != end; ++it) + { + ++patLen; + std::shared_ptr child_node = current_node->getLink(*it); + if (!child_node) + { + child_node = std::make_shared(root); + current_node->links[*it] = child_node; + } + current_node = child_node; + } + current_node->pat.push_back(patLen); + } + + void init() + { + std::queue> q; + q.push(root); + while (!q.empty()) + { + std::shared_ptr current_node = q.front(); + q.pop(); + for (auto iter = current_node->links.cbegin(); + iter != current_node->links.cend(); ++iter) + { + const value_type& symbol = iter->first; + std::shared_ptr child = iter->second; + + // Defining .fail for the childnode + std::shared_ptr temp_node = current_node->fail; + while (temp_node) + { + std::shared_ptr fail_candidate = temp_node->getLink(symbol); + if (fail_candidate) + { + child->fail = fail_candidate; + break; + } + temp_node = temp_node->fail; + } + + // Defining .term for the childnode using .term of current node + child->term = (child->fail == nullptr || child->fail->isTerminal()) ? 
child->fail : child->fail->term; + q.push(child); + } + } + } + + void step(const value_type& c) + { + while (current_state) + { + std::shared_ptr candidate = current_state->getLink(c); + if (candidate) + { + current_state = candidate; + return; + } + current_state = current_state->fail; + } + current_state = root; + } + + template + void getTermsForCurrentState(Out& cont, ForwardIterator pos) + { + if (current_state->isTerminal()) + { + for (const auto value : current_state->pat) + { + cont.push_back({1 + pos - value, pos + 1}); + } + } + std::shared_ptr temp_node = current_state->term; + while (temp_node) + { + for (const auto value : temp_node->pat) + { + cont.push_back({1 + pos - value, pos + 1}); + } + temp_node = temp_node->term; + } + } + + //Find methods + template + void find(ForwardIterator begin, ForwardIterator end, Out& cont) + { + init(); + current_state = root; + for(auto it = begin; it != end; ++it) + { + step(*it); + getTermsForCurrentState(cont, it); + } + } + + template + void find(const Range& range, Out& cont) + { + return find(boost::begin(range), boost::end(range), cont); + } + + template + void operator()(const Range& range, Out& cont) + { + return find(range, cont); + } + + template + void operator()(ForwardIterator begin, ForwardIterator end, Out& cont) + { + return find(begin, end, cont); + } +}; + +//Object interface +template > +using Aho_Corasick_Map = AhoCorasick; + +template , typename Comp = std::equal_to> +using Aho_Corasick_HashMap = AhoCorasick; + + +//Functional interface +template , typename ForwardIterator1, + typename ForwardIterator2, typename ResultCont> +void aho_corasick_map ( ForwardIterator1 corpus_first, ForwardIterator1 corpus_last, + ForwardIterator2 pat_first, ForwardIterator2 pat_last, + ResultCont &out) +{ + AhoCorasick obj(pat_first, pat_last); + obj.find(corpus_first, corpus_last, out); +} + +template , typename Range1, + typename Range2, typename ResultCont> +void aho_corasick_map ( Range1 corpus_range, Range2 
pat_range, ResultCont &out) +{ + AhoCorasick obj(boost::begin(pat_range), boost::end(pat_range)); + obj.find(boost::begin(corpus_range), boost::end(corpus_range), out); +} + +template , typename Comp = std::equal_to, typename ForwardIterator1, + typename ForwardIterator2, typename ResultCont> +void aho_corasick_hashmap ( ForwardIterator1 corpus_first, ForwardIterator1 corpus_last, + ForwardIterator2 pat_first, ForwardIterator2 pat_last, + ResultCont &out) +{ + AhoCorasick obj(pat_first, pat_last); + obj.find(corpus_first, corpus_last, out); +} + +template , typename Comp = std::equal_to, typename Range1, + typename Range2, typename ResultCont> +void aho_corasick_hashmap ( Range1 corpus_range, Range2 pat_range, ResultCont &out) +{ + AhoCorasick obj(boost::begin(pat_range), boost::end(pat_range)); + obj.find(boost::begin(corpus_range), boost::end(corpus_range), out); +} + +}} + + diff --git a/include/boost/algorithm/searching/aho_corasik.hpp b/include/boost/algorithm/searching/aho_corasik.hpp deleted file mode 100644 index f650e2b73..000000000 --- a/include/boost/algorithm/searching/aho_corasik.hpp +++ /dev/null @@ -1,201 +0,0 @@ -// -// Created by zamazan4ik on 17.07.16. 
-// - -#ifndef AHO_CORASIK_AHO_CORASIK_HPP -#define AHO_CORASIK_AHO_CORASIK_HPP - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace boost { namespace algorithm -{ - -template>> -class Base_Aho_Corasik -{ -public: - Base_Aho_Corasik() - { - tree.emplace_back(Node(0, 0, 0)); - } - - ~Base_Aho_Corasik() - { - - } - - template - void insert(const R &range) - { - insert(boost::begin(range), boost::end(range)); - } - - template - void insert(ForwardIterator begin, ForwardIterator end) - { - used.push_back(false); - - buffer.emplace_back(std::vector(std::distance(begin, end))); - size_t v = 0; - for (auto it = begin; it != end; ++it) - { - buffer.back().push_back(*it); - if (tree[v].next_.find(*it) == tree[v].next_.end()) - { - Node node(Node::Sentinel, Node::Sentinel, v); - node.prevValue_ = *it; - tree.push_back(node); - tree[v].next_[*it] = sz++; - } - v = tree[v].next_[*it]; - } - tree[v].isLeaf_ = true; - tree[v].pat_.push_back(countStrings++); - } - - //Search - template - ResultCont& find(const R &range) - { - return find(boost::begin(range), boost::end(range)); - } - - template - ResultCont& find(ForwardIterator begin, ForwardIterator end) - { - result.clear(); - size_t v = 0; - std::fill(used.begin(), used.end(), false); - - while (begin != end) - { - v = go(v, *begin); - check(v); - ++begin; - } - - for (size_t i = 0; i < used.size(); ++i) - { - if (used[i]) - { - result.push_back(buffer[i]); - } - } - return result; - } - -private: - size_t getlink(const size_t v) - { - if (tree[v].suffLink_ == Node::Sentinel) - { - if (v == 0 || tree[v].prevNode_ == 0) - { - tree[v].suffLink_ = 0; - } - else - { - tree[v].suffLink_ = go(getlink(tree[v].prevNode_), tree[v].prevValue_); - } - } - return tree[v].suffLink_; - } - - size_t go(const size_t v, const T c) - { - if (tree[v].go_.find(c) == tree[v].go_.end()) - { - if (tree[v].next_.find(c) != tree[v].next_.end()) - { - tree[v].go_[c] = tree[v].next_[c]; 
- } - else - { - if (v == 0) - { - tree[v].go_[c] = 0; - } - else - { - tree[v].go_[c] = go(getlink(v), c); - } - } - } - return tree[v].go_[c]; - } - - size_t getgood(const size_t v) - { - if (tree[v].goodSuffLink_ == Node::Sentinel) - { - size_t u = getlink(v); - if (u == 0) - { - tree[v].goodSuffLink_ = 0; - } - else - { - tree[v].goodSuffLink_ = tree[u].isLeaf_ ? u : getgood(u); - } - } - return tree[v].goodSuffLink_; - } - - void check(const size_t v) - { - for (size_t i = v; i != 0; i = getgood(i)) - { - if (tree[i].isLeaf_) - { - for (const auto& x: tree[i].pat_) - { - used[x] = true; - } - } - } - } - -private: - struct Node - { - static const size_t Sentinel = static_cast(-1); - - size_t suffLink_, goodSuffLink_, prevNode_; - bool isLeaf_; - T prevValue_; - std::vector pat_; - Map next_, go_; - Node(const size_t suffLink = Sentinel, const size_t goodSuffLink = Sentinel, - const size_t prevNode = 0, const bool isLeaf = false) : suffLink_(suffLink), goodSuffLink_(goodSuffLink), - prevNode_(prevNode), isLeaf_(isLeaf) - { - } - }; - std::vector tree; - std::vector> buffer; - size_t sz = 1, countStrings = 0; - std::vector used; - ResultCont result; -}; - -template, typename Alloc = std::allocator>, - typename ResultCont = std::vector>> -using Aho_Corasik = Base_Aho_Corasik, ResultCont>; - -template, typename Pred = std::equal_to, - typename Alloc = std::allocator>, - typename ResultCont = std::vector>> -using Aho_Corasik_Hash = Base_Aho_Corasik, ResultCont>; -}} -#endif //AHO_CORASIK_AHO_CORASIK_HPP From 559ad908ee6e03b4f1e1a67f1b2122ff747f67ba Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sat, 13 Aug 2016 22:50:59 +0300 Subject: [PATCH 05/33] Added guards and copyright --- include/boost/algorithm/searching/aho_corasick.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index bbad739eb..a692ba7fd 100644 --- 
a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -1,3 +1,14 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef AHO_CORASIK_AHO_CORASIK_HPP +#define AHO_CORASIK_AHO_CORASIK_HPP + #include #include #include @@ -229,4 +240,4 @@ void aho_corasick_hashmap ( Range1 corpus_range, Range2 pat_range, ResultCont &o }} - +#endif //AHO_CORASIK_AHO_CORASIK_HPP From 1c64e198f4d63f000ee032e30d4f5a192e8cd28e Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 14 Aug 2016 00:44:46 +0300 Subject: [PATCH 06/33] Added example for Aho-Corasick Also fixed #include in aho-corasick implementation. --- example/Jamfile.v2 | 3 +- example/aho_corasick_example.cpp | 38 +++++++++++++++++++ .../algorithm/searching/aho_corasick.hpp | 1 + 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 example/aho_corasick_example.cpp diff --git a/example/Jamfile.v2 b/example/Jamfile.v2 index ce067cfeb..b1d937d8f 100644 --- a/example/Jamfile.v2 +++ b/example/Jamfile.v2 @@ -20,5 +20,6 @@ project /boost/algorithm/example exe clamp_example : clamp_example.cpp ; exe search_example : search_example.cpp ; -exe is_palindrome_example : is_palindrome_example.cpp; +exe is_palindrome_example : is_palindrome_example.cpp ; +exe aho_corasick_example : aho_corasick_example.cpp ; diff --git a/example/aho_corasick_example.cpp b/example/aho_corasick_example.cpp new file mode 100644 index 000000000..c77b6c6a9 --- /dev/null +++ b/example/aho_corasick_example.cpp @@ -0,0 +1,38 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + + Distributed under the Boost Software License, Version 1.0. 
(See accompanying + file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + + For more information, see http://www.boost.org +*/ + +#include +#include +#include + +#include + + +int main() +{ + std::vector pat({"he", "is", "she", "his", "her", + "h", "hishera", "azaza"}); + std::string corp = "hisher"; + std::vector> out; + + boost::algorithm::aho_corasick_map(corp.begin(), corp.end(), pat.begin(), pat.end(), out); + + for(const auto val: out) + { + auto begin = val.first; + auto end = val.second; + while (begin != end) + { + std::cout << *begin; + ++begin; + } + std::cout << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index a692ba7fd..0b94e3b3f 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include From 19bc4004ff5836f027f37f4cb2927ef6f2b3756c Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Mon, 15 Aug 2016 00:02:09 +0300 Subject: [PATCH 07/33] Added doxygen and documentation --- doc/aho_corasick.qbk | 140 +++++++++++++++++ doc/algorithm.qbk | 1 + include/boost/algorithm/is_palindrome.hpp | 2 +- .../algorithm/searching/aho_corasick.hpp | 143 ++++++++++++------ 4 files changed, 236 insertions(+), 50 deletions(-) create mode 100644 doc/aho_corasick.qbk diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk new file mode 100644 index 000000000..c08c4b34d --- /dev/null +++ b/doc/aho_corasick.qbk @@ -0,0 +1,140 @@ +[/ QuickBook Document version 1.5 ] + +[section:AhoCorasick Aho-Corasick Search] + +[/license + +Copyright (c) 2016 Alexander Zaitsev + +Distributed under the Boost Software License, Version 1.0. 
+(See accompanying file LICENSE_1_0.txt or copy at +http://www.boost.org/LICENSE_1_0.txt) +] + + +[heading Overview] + +The header file 'aho_corasick.hpp' contains an implementation of the Aho-Corasick algorithm for searching sequences of values. It is primarily used to search for multiple patterns within a corpus. + +The Aho-Corasick algorithm works by building a trie (a tree with each node corresponding to an object) of the pattern sequences and traversing the trie to search for the patterns in a given corpus sequence. Additionally, Aho-Corasick introduced the concept of a "failure pointer/failure node", which is the node to be traversed when there is a mismatch. + +The algorithm was conceived in 1975 by Alfred V. Aho and Margaret J. Corasick. Their paper "Efficient string matching: An aid to bibliographic search" was published in the Communications of the ACM. + +Nomenclature: The nomenclature is similar to that of the Knuth Morris Pratt implementation in Boost.Algorithm. The sequence being searched for is referred to as the "pattern", and the sequence being searched in is referred to as the "corpus". + +[heading Interface] + +For flexibility, the Aho-Corasick algorithm has two interfaces: an object-based interface and a procedural one. The object-based interface builds the trie in the constructor, and uses operator () to make suffix links and perform the search. The procedural interface builds the trie (including the suffix links) and does the search all in one step. If you are going to be searching for the same pattern in multiple corpora, then you should use the object interface, and only build the tries once. + +The header file 'aho_corasick.hpp' contains two versions of Aho-Corasick: based on std::map and std::unordered_map. Also there is class AhoCorasick, which you can customize. For every version this header file provides functional and object-based interfaces. 
+ +Procedural interfaces: + +Procedural interfaces provide interfaces based on iterators and Boost.Range. + +For Aho-Corasick based on std::map: + +`` +template , typename RAIterator, + typename ForwardIterator, typename ResultCont> +void aho_corasick_map ( RAIterator corpus_first, RAIterator corpus_last, + ForwardIterator pat_first, ForwardIterator pat_last, + ResultCont &out); + +template , typename Range1, + typename Range2, typename ResultCont> +void aho_corasick_map ( Range1 corpus_range, Range2 pat_range, ResultCont &out); +`` + +For Aho-Corasick based on std::unordered_map: +`` +template , typename Comp = std::equal_to, typename RAIterator, + typename ForwardIterator, typename ResultCont> +void aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, + ForwardIterator pat_first, ForwardIterator pat_last, + ResultCont &out); + +template , typename Comp = std::equal_to, typename Range1, + typename Range2, typename ResultCont> +void aho_corasick_hashmap ( Range1 corpus_range, Range2 pat_range, ResultCont &out); +`` + + + +Object interface (typedefs): +`` +template > +using Aho_Corasick_Map = AhoCorasick; + +template , typename Comp = std::equal_to> +using Aho_Corasick_HashMap = AhoCorasick; +`` + +Interface (constructors, operator(), etc.) are equal for Aho_Corasick_Map, Aho_Corasick_HashMap and basical AhoCorasick: +`` +AhoCorasick(); + +template +AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); + + +template +void operator()(const Range& range, Out& cont); + +template +void operator()(ForwardIterator begin, ForwardIterator end, Out& cont); +`` + +[heading Return value] + +You must give your own container to all algorithms (Out parameter). This container must consist of pairs of iterators to the corpus sequence. Also this container must support 'push_back' method. + +[heading Requirements] + +For Aho_Corasick_HashMap and aho_corasick_hashmap: by default use std::hash for Hash and std::equal_to as Comparator. 
If your type doesn't support them, you must supply your own functions. Without a Hash and a Comparator the algorithm doesn't work. + +For Aho_Corasick_Map and aho_corasick_map: by default use std::less as Predicate. If your type doesn't support it, you must supply your own function. Without a Predicate the algorithm doesn't work. + +[heading Performance] + +The performance of Aho_Corasick_Map and Aho_Corasick_HashMap is similar on small alphabets. On large alphabets Aho_Corasick_HashMap is faster than Aho_Corasick_Map. Remember that computing the hash of an element is a slow operation. Also, if you use Aho_Corasick_HashMap, std::unordered_map can sometimes rehash in O(Alphabet). + +[heading Memory Use] + +Every node of the trie consists of a container of std::shared_ptr to trie nodes, of the type you choose (std::map, std::unordered_map or maybe something else), two std::shared_ptr to trie nodes, and a std::vector of the lengths of the patterns that end in this node. The number of nodes is linear in the sum of the lengths of the patterns. + +[heading Complexity] + +Nomenclature: M - sum of the pattern lengths, N - length of the corpus, K - alphabet size, T - number of matches + +std::unordered_map-based version: +Time: O(M + N + T), Memory: O(M) +std::map-based version: +Time: O((M + N)log(K) + T), Memory: O(M). + +[heading Exception Safety] + +Both the object-oriented and procedural versions of the Aho-Corasick algorithm take all their parameters by value (except the output container, which is taken by non-const reference). Therefore, both interfaces provide the strong exception guarantee. + +[heading Notes] + +* When using the object-based interface, the pattern must remain unchanged while it is being inserted. + +* The Aho-Corasick algorithm requires forward iterators for patterns and random-access iterators for the corpus. + +[heading Customization points] + +In Aho_Corasick_HashMap and aho_corasick_hashmap() you can customize: value type, hash and compare functions. 
+ +In Aho_Corasick_Map and aho_corasick_map() you can customize: value type and predicate. + +In AhoCorasick you can customize: value type, type of container and any other template parameters. This container will be used in the nodes of the trie. The container is defined as: Container, Args...>. So your other template parameters will be used as Args... . Also, your container must support the 'find' method. + +[endsect] + +[/ File aho_corasick.qbk +Copyright 2016 Alexander Zaitsev +Distributed under the Boost Software License, Version 1.0. +(See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt). +] + diff --git a/doc/algorithm.qbk b/doc/algorithm.qbk index 1568fb50e..014740ec8 100644 --- a/doc/algorithm.qbk +++ b/doc/algorithm.qbk @@ -41,6 +41,7 @@ Thanks to all the people who have reviewed this library and made suggestions for [section:Searching Searching Algorithms] +[include aho_corasick.qbk] [include boyer_moore.qbk] [include boyer_moore_horspool.qbk] [include knuth_morris_pratt.qbk] diff --git a/include/boost/algorithm/is_palindrome.hpp b/include/boost/algorithm/is_palindrome.hpp index 61acbae2e..217657013 100644 --- a/include/boost/algorithm/is_palindrome.hpp +++ b/include/boost/algorithm/is_palindrome.hpp @@ -61,7 +61,7 @@ bool is_palindrome(BidirectionalIterator begin, BidirectionalIterator end, Predi /// \return true if the entire sequence is palindrome /// /// \param begin The start of the input sequence -/// \param end One past the end of the input sequence +/// \param end One past the end of the input sequence /// /// \note This function will return true for empty sequences and for palindromes. /// For other sequences function will return false. 
diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 0b94e3b3f..1e8a1379d 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -67,12 +67,23 @@ class AhoCorasick } } - template - void insert(const R& range) + /// \fn insert(const Range& range) + /// \brief Insert pattern in trie + /// + /// \param range The pattern range + /// + template + void insert(const Range& range) { insert(boost::begin(range), boost::end(range)); } + /// \fn insert(ForwardIterator begin, ForwardIterator end) + /// \brief Insert pattern in trie + /// + /// \param begin The start of the pattern + /// \param end One past the end of the pattern + /// template void insert(ForwardIterator begin, ForwardIterator end) { @@ -92,6 +103,38 @@ class AhoCorasick current_node->pat.push_back(patLen); } + /// \fn operator ( RAIterator begin, RAIterator end, Out& cont) + /// \brief Searches patterns in the corpus + /// + /// \param begin The start of the data to search (Random Access Iterator) + /// \param end One past the end of the data to search (Random Access Iterator) + /// \param cont Output container of pairs of iterators to corpus sequence + /// + template + void operator()(RAIterator begin, RAIterator end, Out& cont) + { + init(); + current_state = root; + for(auto it = begin; it != end; ++it) + { + step(*it); + getTermsForCurrentState(it, cont); + } + } + + /// \fn operator (const Range& range, Out& cont) + /// \brief Searches patterns in the corpus + /// + /// \param range The corpus range + /// \param cont Output container of pairs of iterators to corpus sequence + /// + template + void operator()(const Range& range, Out& cont) + { + operator()(boost::begin(range), boost::end(range), cont); + } + +private: void init() { std::queue> q; @@ -141,8 +184,8 @@ class AhoCorasick current_state = root; } - template - void getTermsForCurrentState(Out& cont, 
ForwardIterator pos) + template + void getTermsForCurrentState(RAIterator pos, Out& cont) { if (current_state->isTerminal()) { @@ -161,37 +204,6 @@ class AhoCorasick temp_node = temp_node->term; } } - - //Find methods - template - void find(ForwardIterator begin, ForwardIterator end, Out& cont) - { - init(); - current_state = root; - for(auto it = begin; it != end; ++it) - { - step(*it); - getTermsForCurrentState(cont, it); - } - } - - template - void find(const Range& range, Out& cont) - { - return find(boost::begin(range), boost::end(range), cont); - } - - template - void operator()(const Range& range, Out& cont) - { - return find(range, cont); - } - - template - void operator()(ForwardIterator begin, ForwardIterator end, Out& cont) - { - return find(begin, end, cont); - } }; //Object interface @@ -203,40 +215,73 @@ using Aho_Corasick_HashMap = AhoCorasick; //Functional interface -template , typename ForwardIterator1, - typename ForwardIterator2, typename ResultCont> -void aho_corasick_map ( ForwardIterator1 corpus_first, ForwardIterator1 corpus_last, - ForwardIterator2 pat_first, ForwardIterator2 pat_last, + +/// \fn aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, +/// ForwardIterator pat_begin, ForwardIterator pat_end, +/// ResultCont &out) +/// +/// \param corpus_begin The start of the corpus sequence +/// \param corpus_end One past the end of the corpus sequence +/// \param pat_begin The start of the patterns sequence +/// \param pat_end One past the end of the patterns sequence +/// \param out Container for results +/// +template , typename RAIterator, + typename ForwardIterator, typename ResultCont> +void aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, + ForwardIterator pat_begin, ForwardIterator pat_end, ResultCont &out) { - AhoCorasick obj(pat_first, pat_last); - obj.find(corpus_first, corpus_last, out); + AhoCorasick obj(pat_begin, pat_end); + obj(corpus_begin, corpus_end, out); } +/// \fn aho_corasick_map (Range1 
corpus_range, Range2 pat_range, ResultCont &out) +/// +/// \param corpus_range The corpus range +/// \param pat_range Patterns range +/// \param out Container for results +/// template , typename Range1, typename Range2, typename ResultCont> void aho_corasick_map ( Range1 corpus_range, Range2 pat_range, ResultCont &out) { AhoCorasick obj(boost::begin(pat_range), boost::end(pat_range)); - obj.find(boost::begin(corpus_range), boost::end(corpus_range), out); + obj(boost::begin(corpus_range), boost::end(corpus_range), out); } -template , typename Comp = std::equal_to, typename ForwardIterator1, - typename ForwardIterator2, typename ResultCont> -void aho_corasick_hashmap ( ForwardIterator1 corpus_first, ForwardIterator1 corpus_last, - ForwardIterator2 pat_first, ForwardIterator2 pat_last, - ResultCont &out) +/// \fn aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, +/// ForwardIterator pat_begin, ForwardIterator pat_end, +/// ResultCont &out) +/// +/// \param corpus_begin The start of the corpus sequence +/// \param corpus_end One past the end of the corpus sequence +/// \param pat_begin The start of the patterns sequence +/// \param pat_end One past the end of the patterns sequence +/// \param out Container for results +/// +template , typename Comp = std::equal_to, typename RAIterator, + typename ForwardIterator, typename ResultCont> +void aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, + ForwardIterator pat_first, ForwardIterator pat_last, + ResultCont &out) { AhoCorasick obj(pat_first, pat_last); - obj.find(corpus_first, corpus_last, out); + obj(corpus_first, corpus_last, out); } +/// \fn aho_corasick_hashmap (Range1 corpus_range, Range2 pat_range, ResultCont &out) +/// +/// \param corpus_range The corpus range +/// \param pat_range Patterns range +/// \param out Container for results +/// template , typename Comp = std::equal_to, typename Range1, typename Range2, typename ResultCont> void aho_corasick_hashmap ( Range1 
corpus_range, Range2 pat_range, ResultCont &out) { AhoCorasick obj(boost::begin(pat_range), boost::end(pat_range)); - obj.find(boost::begin(corpus_range), boost::end(corpus_range), out); + obj(boost::begin(corpus_range), boost::end(corpus_range), out); } }} From df8321a205ce30e852116cf4e9a2faa47e85dc1b Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Mon, 15 Aug 2016 00:28:10 +0300 Subject: [PATCH 08/33] Removed range interface for operator() --- doc/aho_corasick.qbk | 13 +- .../algorithm/searching/aho_corasick.hpp | 320 ++++++++---------- 2 files changed, 140 insertions(+), 193 deletions(-) diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk index c08c4b34d..aa7827267 100644 --- a/doc/aho_corasick.qbk +++ b/doc/aho_corasick.qbk @@ -30,7 +30,7 @@ The header file 'aho_corasick.hpp' contains two versions of Aho-Corasick: based Procedural interfaces: -Procedural interfaces provide interfaces based on iterators and Boost.Range. +Procedural interfaces provide interfaces based on iterators. 
For Aho-Corasick based on std::map: @@ -40,10 +40,6 @@ template , typename RAIterator, void aho_corasick_map ( RAIterator corpus_first, RAIterator corpus_last, ForwardIterator pat_first, ForwardIterator pat_last, ResultCont &out); - -template , typename Range1, - typename Range2, typename ResultCont> -void aho_corasick_map ( Range1 corpus_range, Range2 pat_range, ResultCont &out); `` For Aho-Corasick based on std::unordered_map: @@ -53,10 +49,6 @@ template , typename Comp = std::equal_t void aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, ForwardIterator pat_first, ForwardIterator pat_last, ResultCont &out); - -template , typename Comp = std::equal_to, typename Range1, - typename Range2, typename ResultCont> -void aho_corasick_hashmap ( Range1 corpus_range, Range2 pat_range, ResultCont &out); `` @@ -78,9 +70,6 @@ template AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); -template -void operator()(const Range& range, Out& cont); - template void operator()(ForwardIterator begin, ForwardIterator end, Out& cont); `` diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 1e8a1379d..3862f96bb 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -22,189 +22,176 @@ namespace boost { namespace algorithm { -template typename Container, typename ...Args> -class AhoCorasick -{ -private: - class MapBorNode + template typename Container, typename ...Args> + class AhoCorasick { - public: - Container, Args...> links; - std::shared_ptr fail, term; - std::vector pat; + private: + class MapBorNode + { + public: + Container, Args...> links; + std::shared_ptr fail, term; + std::vector pat; - MapBorNode(std::shared_ptr fail_node = nullptr) - : fail(fail_node), term(nullptr) - { } + MapBorNode(std::shared_ptr fail_node = nullptr) + : fail(fail_node), term(nullptr) + { } - std::shared_ptr getLink(const T& c) const 
- { - const auto iter = links.find(c); - return iter != links.cend() ? iter->second : nullptr; - } + std::shared_ptr getLink(const T& c) const + { + const auto iter = links.find(c); + return iter != links.cend() ? iter->second : nullptr; + } - bool isTerminal() const + bool isTerminal() const + { + return !pat.empty(); + } + }; + public: + using value_type = T; + using node_type = MapBorNode; + private: + std::shared_ptr root, current_state; + size_t countStrings = 0; + public: + AhoCorasick() : root(std::make_shared()) {} + + template + AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(std::make_shared()) { - return !pat.empty(); + while(patBegin != patEnd) + { + insert(*patBegin); + ++patBegin; + } } - }; -public: - using value_type = T; - using node_type = MapBorNode; -private: - std::shared_ptr root, current_state; - size_t countStrings = 0; -public: - AhoCorasick() : root(std::make_shared()) {} - template - AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(std::make_shared()) - { - while(patBegin != patEnd) + /// \fn insert(const Range& range) + /// \brief Insert pattern in trie + /// + /// \param range The pattern range + /// + template + void insert(const Range& range) { - insert(*patBegin); - ++patBegin; + insert(boost::begin(range), boost::end(range)); } - } - - /// \fn insert(const Range& range) - /// \brief Insert pattern in trie - /// - /// \param range The pattern range - /// - template - void insert(const Range& range) - { - insert(boost::begin(range), boost::end(range)); - } - /// \fn insert(ForwardIterator begin, ForwardIterator end) - /// \brief Insert pattern in trie - /// - /// \param begin The start of the pattern - /// \param end One past the end of the pattern - /// - template - void insert(ForwardIterator begin, ForwardIterator end) - { - size_t patLen = 0; - std::shared_ptr current_node = root; - for(auto it = begin; it != end; ++it) + /// \fn insert(ForwardIterator begin, ForwardIterator end) + /// 
\brief Insert pattern in trie + /// + /// \param begin The start of the pattern + /// \param end One past the end of the pattern + /// + template + void insert(ForwardIterator begin, ForwardIterator end) { - ++patLen; - std::shared_ptr child_node = current_node->getLink(*it); - if (!child_node) + size_t patLen = 0; + std::shared_ptr current_node = root; + for(auto it = begin; it != end; ++it) { - child_node = std::make_shared(root); - current_node->links[*it] = child_node; + ++patLen; + std::shared_ptr child_node = current_node->getLink(*it); + if (!child_node) + { + child_node = std::make_shared(root); + current_node->links[*it] = child_node; + } + current_node = child_node; } - current_node = child_node; + current_node->pat.push_back(patLen); } - current_node->pat.push_back(patLen); - } - /// \fn operator ( RAIterator begin, RAIterator end, Out& cont) - /// \brief Searches patterns in the corpus - /// - /// \param begin The start of the data to search (Random Access Iterator) - /// \param end One past the end of the data to search (Random Access Iterator) - /// \param cont Output container of pairs of iterators to corpus sequence - /// - template - void operator()(RAIterator begin, RAIterator end, Out& cont) - { - init(); - current_state = root; - for(auto it = begin; it != end; ++it) + /// \fn operator (RAIterator begin, RAIterator end, Out& cont) + /// \brief Searches patterns in the corpus + /// + /// \param begin The start of the data to search (Random Access Iterator) + /// \param end One past the end of the data to search (Random Access Iterator) + /// \param cont Output container of pairs of iterators to corpus sequence + /// + template + void operator()(RAIterator begin, RAIterator end, Out& cont) { - step(*it); - getTermsForCurrentState(it, cont); + init(); + current_state = root; + for(auto it = begin; it != end; ++it) + { + step(*it); + getTermsForCurrentState(it, cont); + } } - } - - /// \fn operator (const Range& range, Out& cont) - /// \brief 
Searches patterns in the corpus - /// - /// \param range The corpus range - /// \param cont Output container of pairs of iterators to corpus sequence - /// - template - void operator()(const Range& range, Out& cont) - { - operator()(boost::begin(range), boost::end(range), cont); - } - -private: - void init() - { - std::queue> q; - q.push(root); - while (!q.empty()) + private: + void init() { - std::shared_ptr current_node = q.front(); - q.pop(); - for (auto iter = current_node->links.cbegin(); - iter != current_node->links.cend(); ++iter) + std::queue> q; + q.push(root); + while (!q.empty()) { - const value_type& symbol = iter->first; - std::shared_ptr child = iter->second; - - // Defining .fail for the childnode - std::shared_ptr temp_node = current_node->fail; - while (temp_node) + std::shared_ptr current_node = q.front(); + q.pop(); + for (auto iter = current_node->links.cbegin(); + iter != current_node->links.cend(); ++iter) { - std::shared_ptr fail_candidate = temp_node->getLink(symbol); - if (fail_candidate) + const value_type& symbol = iter->first; + std::shared_ptr child = iter->second; + + // Defining .fail for the childnode + std::shared_ptr temp_node = current_node->fail; + while (temp_node) { - child->fail = fail_candidate; - break; + std::shared_ptr fail_candidate = temp_node->getLink(symbol); + if (fail_candidate) + { + child->fail = fail_candidate; + break; + } + temp_node = temp_node->fail; } - temp_node = temp_node->fail; - } - // Defining .term for the childnode using .term of current node - child->term = (child->fail == nullptr || child->fail->isTerminal()) ? child->fail : child->fail->term; - q.push(child); + // Defining .term for the childnode using .term of current node + child->term = (child->fail == nullptr || child->fail->isTerminal()) ? 
child->fail : child->fail->term; + q.push(child); + } } } - } - void step(const value_type& c) - { - while (current_state) + void step(const value_type& c) { - std::shared_ptr candidate = current_state->getLink(c); - if (candidate) + while (current_state) { - current_state = candidate; - return; + std::shared_ptr candidate = current_state->getLink(c); + if (candidate) + { + current_state = candidate; + return; + } + current_state = current_state->fail; } - current_state = current_state->fail; + current_state = root; } - current_state = root; - } - template - void getTermsForCurrentState(RAIterator pos, Out& cont) - { - if (current_state->isTerminal()) + template + void getTermsForCurrentState(RAIterator pos, Out& cont) { - for (const auto value : current_state->pat) + if (current_state->isTerminal()) { - cont.push_back({1 + pos - value, pos + 1}); + for (const auto value : current_state->pat) + { + cont.push_back({1 + pos - value, pos + 1}); + } } - } - std::shared_ptr temp_node = current_state->term; - while (temp_node) - { - for (const auto value : temp_node->pat) + std::shared_ptr temp_node = current_state->term; + while (temp_node) { - cont.push_back({1 + pos - value, pos + 1}); + for (const auto value : temp_node->pat) + { + cont.push_back({1 + pos - value, pos + 1}); + } + temp_node = temp_node->term; } - temp_node = temp_node->term; } - } -}; + }; //Object interface template > @@ -225,9 +212,9 @@ using Aho_Corasick_HashMap = AhoCorasick; /// \param pat_begin The start of the patterns sequence /// \param pat_end One past the end of the patterns sequence /// \param out Container for results -/// +/// template , typename RAIterator, - typename ForwardIterator, typename ResultCont> + typename ForwardIterator, typename ResultCont> void aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, ResultCont &out) @@ -236,20 +223,6 @@ void aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, 
obj(corpus_begin, corpus_end, out); } -/// \fn aho_corasick_map (Range1 corpus_range, Range2 pat_range, ResultCont &out) -/// -/// \param corpus_range The corpus range -/// \param pat_range Patterns range -/// \param out Container for results -/// -template , typename Range1, - typename Range2, typename ResultCont> -void aho_corasick_map ( Range1 corpus_range, Range2 pat_range, ResultCont &out) -{ - AhoCorasick obj(boost::begin(pat_range), boost::end(pat_range)); - obj(boost::begin(corpus_range), boost::end(corpus_range), out); -} - /// \fn aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, /// ForwardIterator pat_begin, ForwardIterator pat_end, /// ResultCont &out) @@ -259,31 +232,16 @@ void aho_corasick_map ( Range1 corpus_range, Range2 pat_range, ResultCont &out) /// \param pat_begin The start of the patterns sequence /// \param pat_end One past the end of the patterns sequence /// \param out Container for results -/// +/// template , typename Comp = std::equal_to, typename RAIterator, - typename ForwardIterator, typename ResultCont> + typename ForwardIterator, typename ResultCont> void aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, - ForwardIterator pat_first, ForwardIterator pat_last, - ResultCont &out) + ForwardIterator pat_first, ForwardIterator pat_last, + ResultCont &out) { AhoCorasick obj(pat_first, pat_last); obj(corpus_first, corpus_last, out); } - -/// \fn aho_corasick_hashmap (Range1 corpus_range, Range2 pat_range, ResultCont &out) -/// -/// \param corpus_range The corpus range -/// \param pat_range Patterns range -/// \param out Container for results -/// -template , typename Comp = std::equal_to, typename Range1, - typename Range2, typename ResultCont> -void aho_corasick_hashmap ( Range1 corpus_range, Range2 pat_range, ResultCont &out) -{ - AhoCorasick obj(boost::begin(pat_range), boost::end(pat_range)); - obj(boost::begin(corpus_range), boost::end(corpus_range), out); -} - }} #endif 
//AHO_CORASIK_AHO_CORASIK_HPP From 14392f87721b8b1c4ba8565d07faa384effb23f8 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Mon, 15 Aug 2016 03:32:51 +0300 Subject: [PATCH 09/33] Added in doc 'insert' method --- doc/aho_corasick.qbk | 5 + .../algorithm/searching/aho_corasick.hpp | 272 +++++++++--------- 2 files changed, 141 insertions(+), 136 deletions(-) diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk index aa7827267..9fedd9cc8 100644 --- a/doc/aho_corasick.qbk +++ b/doc/aho_corasick.qbk @@ -69,6 +69,11 @@ AhoCorasick(); template AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); +template +void insert(ForwardIterator begin, ForwardIterator end); + +template +void insert(const Range& range); template void operator()(ForwardIterator begin, ForwardIterator end, Out& cont); diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 3862f96bb..a17aef893 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -22,176 +22,176 @@ namespace boost { namespace algorithm { - template typename Container, typename ...Args> - class AhoCorasick +template typename Container, typename ...Args> +class AhoCorasick +{ +private: + class MapBorNode { - private: - class MapBorNode - { - public: - Container, Args...> links; - std::shared_ptr fail, term; - std::vector pat; - - MapBorNode(std::shared_ptr fail_node = nullptr) - : fail(fail_node), term(nullptr) - { } - - std::shared_ptr getLink(const T& c) const - { - const auto iter = links.find(c); - return iter != links.cend() ? 
iter->second : nullptr; - } - - bool isTerminal() const - { - return !pat.empty(); - } - }; public: - using value_type = T; - using node_type = MapBorNode; - private: - std::shared_ptr root, current_state; - size_t countStrings = 0; - public: - AhoCorasick() : root(std::make_shared()) {} + Container, Args...> links; + std::shared_ptr fail, term; + std::vector pat; + + MapBorNode(std::shared_ptr fail_node = nullptr) + : fail(fail_node), term(nullptr) + { } - template - AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(std::make_shared()) + std::shared_ptr getLink(const T& c) const { - while(patBegin != patEnd) - { - insert(*patBegin); - ++patBegin; - } + const auto iter = links.find(c); + return iter != links.cend() ? iter->second : nullptr; } - /// \fn insert(const Range& range) - /// \brief Insert pattern in trie - /// - /// \param range The pattern range - /// - template - void insert(const Range& range) + bool isTerminal() const { - insert(boost::begin(range), boost::end(range)); + return !pat.empty(); } - - /// \fn insert(ForwardIterator begin, ForwardIterator end) - /// \brief Insert pattern in trie - /// - /// \param begin The start of the pattern - /// \param end One past the end of the pattern - /// - template - void insert(ForwardIterator begin, ForwardIterator end) + }; +public: + using value_type = T; + using node_type = MapBorNode; +private: + std::shared_ptr root, current_state; + size_t countStrings = 0; +public: + AhoCorasick() : root(std::make_shared()) {} + + template + AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(std::make_shared()) + { + while(patBegin != patEnd) { - size_t patLen = 0; - std::shared_ptr current_node = root; - for(auto it = begin; it != end; ++it) - { - ++patLen; - std::shared_ptr child_node = current_node->getLink(*it); - if (!child_node) - { - child_node = std::make_shared(root); - current_node->links[*it] = child_node; - } - current_node = child_node; - } - 
current_node->pat.push_back(patLen); + insert(*patBegin); + ++patBegin; } - - /// \fn operator (RAIterator begin, RAIterator end, Out& cont) - /// \brief Searches patterns in the corpus - /// - /// \param begin The start of the data to search (Random Access Iterator) - /// \param end One past the end of the data to search (Random Access Iterator) - /// \param cont Output container of pairs of iterators to corpus sequence - /// - template - void operator()(RAIterator begin, RAIterator end, Out& cont) + } + + /// \fn insert(const Range& range) + /// \brief Insert pattern in trie + /// + /// \param range The pattern range + /// + template + void insert(const Range& range) + { + insert(boost::begin(range), boost::end(range)); + } + + /// \fn insert(ForwardIterator begin, ForwardIterator end) + /// \brief Insert pattern in trie + /// + /// \param begin The start of the pattern + /// \param end One past the end of the pattern + /// + template + void insert(ForwardIterator begin, ForwardIterator end) + { + size_t patLen = 0; + std::shared_ptr current_node = root; + for(auto it = begin; it != end; ++it) { - init(); - current_state = root; - for(auto it = begin; it != end; ++it) + ++patLen; + std::shared_ptr child_node = current_node->getLink(*it); + if (!child_node) { - step(*it); - getTermsForCurrentState(it, cont); + child_node = std::make_shared(root); + current_node->links[*it] = child_node; } + current_node = child_node; + } + current_node->pat.push_back(patLen); + } + + /// \fn operator ( RAIterator begin, RAIterator end, Out& cont) + /// \brief Searches patterns in the corpus + /// + /// \param begin The start of the data to search (Random Access Iterator) + /// \param end One past the end of the data to search (Random Access Iterator) + /// \param cont Output container of pairs of iterators to corpus sequence + /// + template + void operator()(RAIterator begin, RAIterator end, Out& cont) + { + init(); + current_state = root; + for(auto it = begin; it != end; ++it) 
+ { + step(*it); + getTermsForCurrentState(it, cont); } - private: - void init() + } +private: + void init() + { + std::queue> q; + q.push(root); + while (!q.empty()) { - std::queue> q; - q.push(root); - while (!q.empty()) + std::shared_ptr current_node = q.front(); + q.pop(); + for (auto iter = current_node->links.cbegin(); + iter != current_node->links.cend(); ++iter) { - std::shared_ptr current_node = q.front(); - q.pop(); - for (auto iter = current_node->links.cbegin(); - iter != current_node->links.cend(); ++iter) - { - const value_type& symbol = iter->first; - std::shared_ptr child = iter->second; + const value_type& symbol = iter->first; + std::shared_ptr child = iter->second; - // Defining .fail for the childnode - std::shared_ptr temp_node = current_node->fail; - while (temp_node) + // Defining .fail for the childnode + std::shared_ptr temp_node = current_node->fail; + while (temp_node) + { + std::shared_ptr fail_candidate = temp_node->getLink(symbol); + if (fail_candidate) { - std::shared_ptr fail_candidate = temp_node->getLink(symbol); - if (fail_candidate) - { - child->fail = fail_candidate; - break; - } - temp_node = temp_node->fail; + child->fail = fail_candidate; + break; } - - // Defining .term for the childnode using .term of current node - child->term = (child->fail == nullptr || child->fail->isTerminal()) ? child->fail : child->fail->term; - q.push(child); + temp_node = temp_node->fail; } + + // Defining .term for the childnode using .term of current node + child->term = (child->fail == nullptr || child->fail->isTerminal()) ? 
child->fail : child->fail->term; + q.push(child); } } + } - void step(const value_type& c) + void step(const value_type& c) + { + while (current_state) { - while (current_state) + std::shared_ptr candidate = current_state->getLink(c); + if (candidate) { - std::shared_ptr candidate = current_state->getLink(c); - if (candidate) - { - current_state = candidate; - return; - } - current_state = current_state->fail; + current_state = candidate; + return; } - current_state = root; + current_state = current_state->fail; } + current_state = root; + } - template - void getTermsForCurrentState(RAIterator pos, Out& cont) + template + void getTermsForCurrentState(RAIterator pos, Out& cont) + { + if (current_state->isTerminal()) { - if (current_state->isTerminal()) + for (const auto value : current_state->pat) { - for (const auto value : current_state->pat) - { - cont.push_back({1 + pos - value, pos + 1}); - } + cont.push_back({1 + pos - value, pos + 1}); } - std::shared_ptr temp_node = current_state->term; - while (temp_node) + } + std::shared_ptr temp_node = current_state->term; + while (temp_node) + { + for (const auto value : temp_node->pat) { - for (const auto value : temp_node->pat) - { - cont.push_back({1 + pos - value, pos + 1}); - } - temp_node = temp_node->term; + cont.push_back({1 + pos - value, pos + 1}); } + temp_node = temp_node->term; } - }; + } +}; //Object interface template > From 4f325aec65614f35e031934d9d1ceba6c0e2f3ec Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 16 Aug 2016 21:23:54 +0300 Subject: [PATCH 10/33] Added tests. 
--- test/Jamfile.v2 | 1 + test/aho_corasick_test.cpp | 112 +++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 test/aho_corasick_test.cpp diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index fb00843ad..8360cabdb 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -26,6 +26,7 @@ alias unit_test_framework [ compile-fail search_fail1.cpp : : : : ] [ compile-fail search_fail2.cpp : : : : ] [ compile-fail search_fail3.cpp : : : : ] + [ run aho_corasick_test.cpp aho_corasick_test : : : : aho_corasick_test ] # Misc tests [ run clamp_test.cpp unit_test_framework : : : : clamp_test ] diff --git a/test/aho_corasick_test.cpp b/test/aho_corasick_test.cpp new file mode 100644 index 000000000..545ce30e2 --- /dev/null +++ b/test/aho_corasick_test.cpp @@ -0,0 +1,112 @@ +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. 
+*/ + +#include +#include + +#define BOOST_TEST_MAIN +#include + +#include +#include +#include +#include +#include + + +namespace ba = boost::algorithm; +const std::vector> patterns({std::vector({"he", "is", "she", "his", "her", "h", "hishera", "azaza"}), + std::vector({"he", "she", "his", "her", "he", "usher", "d sh", "she hi"}), + std::vector({"he", "she", "his", "her", "he", "usher", "d sh", "she hi"})}); + +const std::vector corpus({"hisher", + "usher and she he her", + ""}); + +const std::vector> rightResults({std::vector({"h", "his", "is", "h", "she", "he", "her"}), + std::vector({"she", "he", "he", "usher", "her", "d sh", "she", + "he", "he", "he", "he", "he", "he", "her"}), + std::vector()});; +template +void fromIteratorsToContainer(const Cont1& from, Cont2& to) +{ + for (const auto &val: from) + { + T str; + auto begin = val.first; + auto end = val.second; + while (begin != end) + { + str += *begin; + ++begin; + } + to.push_back(std::move(str)); + } +} + +void test_aho_corasick() +{ + BOOST_CHECK(patterns.size() == corpus.size()); + //aho_corasick_map + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::aho_corasick_map(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), res); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //aho_corasick_hashmap + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::aho_corasick_hashmap(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), res); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //Aho_Corasick_Map + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::Aho_Corasick_Map obj(patterns[i].begin(), patterns[i].end()); + obj(corpus[i].begin(), corpus[i].end(), res); + fromIteratorsToContainer(res, localResult); + 
BOOST_CHECK(localResult == rightResults[i]); + } + + //Aho_Corasick_HashMap + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::Aho_Corasick_HashMap obj(patterns[i].begin(), patterns[i].end()); + obj(corpus[i].begin(), corpus[i].end(), res); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } + + //General AhoCorasick + for(size_t i = 0; i < patterns.size(); ++i) + { + std::vector> res; + std::vector localResult; + ba::AhoCorasick, std::equal_to> obj(patterns[i].begin(), patterns[i].end()); + obj(corpus[i].begin(), corpus[i].end(), res); + fromIteratorsToContainer(res, localResult); + BOOST_CHECK(localResult == rightResults[i]); + } +} + +BOOST_AUTO_TEST_CASE( test_main ) +{ + test_aho_corasick(); +} \ No newline at end of file From a40fe59e9667e795db73151f68f5064b81f62d98 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Wed, 24 Aug 2016 15:59:24 +0300 Subject: [PATCH 11/33] New version 'Aho-Corasick' Now Aho-Corasick uses callback instead of out container. Updated algorithm, documentation, example, tests. --- doc/aho_corasick.qbk | 33 ++++--- example/aho_corasick_example.cpp | 9 +- .../algorithm/searching/aho_corasick.hpp | 96 +++++++++++-------- test/aho_corasick_test.cpp | 20 +++- 4 files changed, 95 insertions(+), 63 deletions(-) diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk index 9fedd9cc8..6f7031aa1 100644 --- a/doc/aho_corasick.qbk +++ b/doc/aho_corasick.qbk @@ -24,7 +24,7 @@ Nomenclature: The nomenclature is similar to that of the Knuth Morris Pratt impl [heading Interface] -For flexibility, the Aho-Corasick algorithm has two interfaces; an object-based interface and a procedural one. The object-based interface builds the trie in the constructor, and uses operator () to make suffix links and perform the search. The procedural interface builds the trie(with building suffix links) and does the search all in one step. 
If you are going to be searching for the same pattern in multiple corpora, then you should use the object interface, and only build the tries once. +For flexibility, the Aho-Corasick algorithm has two interfaces; an object-based interface and a procedural one. The object-based interface builds the trie in the constructor, and uses 'find()' to make suffix links and perform the search. The procedural interface builds the trie(with building suffix links) and does the search all in one step. If you are going to be searching for the same pattern in multiple corpora, then you should use the object interface, and only build the tries once. The header file 'aho_corasick.hpp' contains two versions of Aho-Corasick: based on std::map and std::unordered_map. Also there is class AhoCorasick, which you can customize. For every version this header file provide functional and object-based interfaces. @@ -36,19 +36,19 @@ For Aho-Corasick based on std::map: `` template , typename RAIterator, - typename ForwardIterator, typename ResultCont> -void aho_corasick_map ( RAIterator corpus_first, RAIterator corpus_last, + typename ForwardIterator, typename Callback> +bool aho_corasick_map ( RAIterator corpus_first, RAIterator corpus_last, ForwardIterator pat_first, ForwardIterator pat_last, - ResultCont &out); + Callback cb); `` For Aho-Corasick based on std::unordered_map: `` template , typename Comp = std::equal_to, typename RAIterator, - typename ForwardIterator, typename ResultCont> -void aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, - ForwardIterator pat_first, ForwardIterator pat_last, - ResultCont &out); + typename ForwardIterator, typename Callback> +bool aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, + ForwardIterator pat_first, ForwardIterator pat_last, + Callback cb); `` @@ -62,12 +62,15 @@ template , typename Comp = std::equal_t using Aho_Corasick_HashMap = AhoCorasick; `` -Interface (constructors, operator(), etc.) 
are equal for Aho_Corasick_Map, Aho_Corasick_HashMap and basical AhoCorasick: +Interface (constructors, etc.) are equal for Aho_Corasick_Map, Aho_Corasick_HashMap and basical AhoCorasick: `` AhoCorasick(); template -AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); +explicit AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); + +template +explicit AhoCorasick(const Range& range); template void insert(ForwardIterator begin, ForwardIterator end); @@ -75,16 +78,18 @@ void insert(ForwardIterator begin, ForwardIterator end); template void insert(const Range& range); -template -void operator()(ForwardIterator begin, ForwardIterator end, Out& cont); +template +bool find(RAIterator begin, RAIterator end, Callback cb); `` [heading Return value] -You must give your own container to all algorithms (Out parameter). This container must consist of pairs of iterators to the corpus sequence. Also this container must support 'push_back' method. +The 'find' method returns true, if all Callback callings return true, otherwise returns false. [heading Requirements] +C++11-compatible compiler required. + For Aho_Corasick_HashMap and aho_corasick_hashmap: by default use std::hash for Hash and std::equal_to as Comparator. If you type doesn't support it, you must use your own functions for this. Without Hash and Comparator algorithm doesn't work. For Aho_Corasick_Map and aho_corasick_map: by default use std::less as Predicate. If you type doesn't support it, you must use your own functions for this. Without Predicate algorithm doesn't work. @@ -118,6 +123,8 @@ Both the object-oriented and procedural versions of the Aho-Corasick algorithm t [heading Customization points] +For using Aho-Corasick algorithms you must use your own Callback(RAIterator, RAIterator) -> bool. This Callback must returns true if all is fine, otherwise false. + In Aho_Corasick_HashMap and aho_corasick_hashmap() you can customize: value type, hash and compare functions. 
In Aho_Corasick_Map and aho_corasick_map() you can customize: value type and predicate. diff --git a/example/aho_corasick_example.cpp b/example/aho_corasick_example.cpp index c77b6c6a9..d2bc6e283 100644 --- a/example/aho_corasick_example.cpp +++ b/example/aho_corasick_example.cpp @@ -16,14 +16,17 @@ int main() { - std::vector pat({"he", "is", "she", "his", "her", + std::vector pat({"228", "he", "is", "1488", "she", "his", "322", "her", "h", "hishera", "azaza"}); std::string corp = "hisher"; std::vector> out; - boost::algorithm::aho_corasick_map(corp.begin(), corp.end(), pat.begin(), pat.end(), out); + bool result = boost::algorithm::aho_corasick_map(corp.begin(), corp.end(), pat.begin(), pat.end(), + [&out](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { out.push_back({begin, end}); return true; }); - for(const auto val: out) + std::cout << result << std::endl; + for(const auto& val: out) { auto begin = val.first; auto end = val.second; diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index a17aef893..1fd8df5a9 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -18,7 +18,7 @@ #include #include - +#include namespace boost { namespace algorithm { @@ -29,18 +29,18 @@ class AhoCorasick class MapBorNode { public: - Container, Args...> links; - std::shared_ptr fail, term; + Container, Args...> links; + MapBorNode *fail, *term; std::vector pat; - MapBorNode(std::shared_ptr fail_node = nullptr) + MapBorNode(MapBorNode* fail_node = nullptr) : fail(fail_node), term(nullptr) { } - std::shared_ptr getLink(const T& c) const + MapBorNode* getLink(const T& c) const { const auto iter = links.find(c); - return iter != links.cend() ? iter->second : nullptr; + return iter != links.cend() ? 
(iter->second).get() : nullptr; } bool isTerminal() const @@ -52,13 +52,14 @@ class AhoCorasick using value_type = T; using node_type = MapBorNode; private: - std::shared_ptr root, current_state; + std::unique_ptr root; + node_type* current_state; size_t countStrings = 0; public: - AhoCorasick() : root(std::make_shared()) {} + AhoCorasick() : root(boost::make_unique()) {} template - AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(std::make_shared()) + explicit AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(boost::make_unique()) { while(patBegin != patEnd) { @@ -67,6 +68,9 @@ class AhoCorasick } } + template + explicit AhoCorasick(const Range& range) : AhoCorasick(boost::begin(range), boost::end(range)) {} + /// \fn insert(const Range& range) /// \brief Insert pattern in trie /// @@ -88,59 +92,63 @@ class AhoCorasick void insert(ForwardIterator begin, ForwardIterator end) { size_t patLen = 0; - std::shared_ptr current_node = root; + node_type* current_node = root.get(); for(auto it = begin; it != end; ++it) { ++patLen; - std::shared_ptr child_node = current_node->getLink(*it); + node_type* child_node = current_node->getLink(*it); if (!child_node) { - child_node = std::make_shared(root); - current_node->links[*it] = child_node; + std::unique_ptr new_node = boost::make_unique(root.get()); + child_node = new_node.get(); + current_node->links[*it] = std::move(new_node); } current_node = child_node; } current_node->pat.push_back(patLen); } - /// \fn operator ( RAIterator begin, RAIterator end, Out& cont) + /// \fn find ( RAIterator begin, RAIterator end, Callback cb) /// \brief Searches patterns in the corpus + /// \return true if all callback callings return true, else false /// /// \param begin The start of the data to search (Random Access Iterator) /// \param end One past the end of the data to search (Random Access Iterator) - /// \param cont Output container of pairs of iterators to corpus sequence + /// \param cb Callback 
for matches /// - template - void operator()(RAIterator begin, RAIterator end, Out& cont) + template + bool find(RAIterator begin, RAIterator end, Callback cb) { + bool result = true; init(); - current_state = root; + current_state = root.get(); for(auto it = begin; it != end; ++it) { step(*it); - getTermsForCurrentState(it, cont); + result &= getTermsForCurrentState(it, cb); } + return result; } private: void init() { - std::queue> q; - q.push(root); + std::queue q; + q.push(root.get()); while (!q.empty()) { - std::shared_ptr current_node = q.front(); + node_type* current_node = q.front(); q.pop(); for (auto iter = current_node->links.cbegin(); iter != current_node->links.cend(); ++iter) { const value_type& symbol = iter->first; - std::shared_ptr child = iter->second; + node_type* child = (iter->second).get(); // Defining .fail for the childnode - std::shared_ptr temp_node = current_node->fail; + node_type* temp_node = current_node->fail; while (temp_node) { - std::shared_ptr fail_candidate = temp_node->getLink(symbol); + node_type* fail_candidate = temp_node->getLink(symbol); if (fail_candidate) { child->fail = fail_candidate; @@ -160,7 +168,7 @@ class AhoCorasick { while (current_state) { - std::shared_ptr candidate = current_state->getLink(c); + node_type* candidate = current_state->getLink(c); if (candidate) { current_state = candidate; @@ -168,28 +176,30 @@ class AhoCorasick } current_state = current_state->fail; } - current_state = root; + current_state = root.get(); } - template - void getTermsForCurrentState(RAIterator pos, Out& cont) + template + bool getTermsForCurrentState(RAIterator pos, Callback cb) { + bool result = true; if (current_state->isTerminal()) { for (const auto value : current_state->pat) { - cont.push_back({1 + pos - value, pos + 1}); + result &= cb(1 + pos - value, pos + 1); } } - std::shared_ptr temp_node = current_state->term; + node_type* temp_node = current_state->term; while (temp_node) { for (const auto value : temp_node->pat) { - 
cont.push_back({1 + pos - value, pos + 1}); + result &= cb(1 + pos - value, pos + 1); } temp_node = temp_node->term; } + return result; } }; @@ -205,7 +215,8 @@ using Aho_Corasick_HashMap = AhoCorasick; /// \fn aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, /// ForwardIterator pat_begin, ForwardIterator pat_end, -/// ResultCont &out) +/// Callback &out) +/// \return true if all callback callings return true, else false /// /// \param corpus_begin The start of the corpus sequence /// \param corpus_end One past the end of the corpus sequence @@ -214,18 +225,19 @@ using Aho_Corasick_HashMap = AhoCorasick; /// \param out Container for results /// template , typename RAIterator, - typename ForwardIterator, typename ResultCont> -void aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, + typename ForwardIterator, typename Callback> +bool aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, - ResultCont &out) + Callback cb) { AhoCorasick obj(pat_begin, pat_end); - obj(corpus_begin, corpus_end, out); + return obj.find(corpus_begin, corpus_end, cb); } /// \fn aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, /// ForwardIterator pat_begin, ForwardIterator pat_end, -/// ResultCont &out) +/// Callback &out) +/// \return true if all callback callings return true, else false /// /// \param corpus_begin The start of the corpus sequence /// \param corpus_end One past the end of the corpus sequence @@ -234,13 +246,13 @@ void aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, /// \param out Container for results /// template , typename Comp = std::equal_to, typename RAIterator, - typename ForwardIterator, typename ResultCont> -void aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, + typename ForwardIterator, typename Callback> +bool aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, ForwardIterator 
pat_first, ForwardIterator pat_last, - ResultCont &out) + Callback cb) { AhoCorasick obj(pat_first, pat_last); - obj(corpus_first, corpus_last, out); + return obj.find(corpus_first, corpus_last, cb); } }} diff --git a/test/aho_corasick_test.cpp b/test/aho_corasick_test.cpp index 545ce30e2..98c92cc59 100644 --- a/test/aho_corasick_test.cpp +++ b/test/aho_corasick_test.cpp @@ -57,7 +57,9 @@ void test_aho_corasick() { std::vector> res; std::vector localResult; - ba::aho_corasick_map(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), res); + ba::aho_corasick_map(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); fromIteratorsToContainer(res, localResult); BOOST_CHECK(localResult == rightResults[i]); } @@ -67,7 +69,9 @@ void test_aho_corasick() { std::vector> res; std::vector localResult; - ba::aho_corasick_hashmap(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), res); + ba::aho_corasick_hashmap(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); fromIteratorsToContainer(res, localResult); BOOST_CHECK(localResult == rightResults[i]); } @@ -78,7 +82,9 @@ void test_aho_corasick() std::vector> res; std::vector localResult; ba::Aho_Corasick_Map obj(patterns[i].begin(), patterns[i].end()); - obj(corpus[i].begin(), corpus[i].end(), res); + obj(corpus[i].begin(), corpus[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); fromIteratorsToContainer(res, localResult); BOOST_CHECK(localResult == rightResults[i]); } @@ -89,7 +95,9 @@ void test_aho_corasick() std::vector> res; std::vector localResult; ba::Aho_Corasick_HashMap 
obj(patterns[i].begin(), patterns[i].end()); - obj(corpus[i].begin(), corpus[i].end(), res); + obj(corpus[i].begin(), corpus[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); fromIteratorsToContainer(res, localResult); BOOST_CHECK(localResult == rightResults[i]); } @@ -100,7 +108,9 @@ void test_aho_corasick() std::vector> res; std::vector localResult; ba::AhoCorasick, std::equal_to> obj(patterns[i].begin(), patterns[i].end()); - obj(corpus[i].begin(), corpus[i].end(), res); + obj(corpus[i].begin(), corpus[i].end(), + [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool + { res.push_back({begin, end}); return true; }); fromIteratorsToContainer(res, localResult); BOOST_CHECK(localResult == rightResults[i]); } From a4dcb837b5441cf3825aacbe8f21b277ca220c1e Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Wed, 24 Aug 2016 16:15:29 +0300 Subject: [PATCH 12/33] [micro] Fix doxygen comment --- .../boost/algorithm/searching/aho_corasick.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 1fd8df5a9..bead293c0 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -40,7 +40,7 @@ class AhoCorasick MapBorNode* getLink(const T& c) const { const auto iter = links.find(c); - return iter != links.cend() ? (iter->second).get() : nullptr; + return iter != links.cend() ? 
iter->second.get() : nullptr; } bool isTerminal() const @@ -142,7 +142,7 @@ class AhoCorasick iter != current_node->links.cend(); ++iter) { const value_type& symbol = iter->first; - node_type* child = (iter->second).get(); + node_type* child = iter->second.get(); // Defining .fail for the childnode node_type* temp_node = current_node->fail; @@ -215,14 +215,14 @@ using Aho_Corasick_HashMap = AhoCorasick; /// \fn aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, /// ForwardIterator pat_begin, ForwardIterator pat_end, -/// Callback &out) +/// Callback cb) /// \return true if all callback callings return true, else false /// /// \param corpus_begin The start of the corpus sequence /// \param corpus_end One past the end of the corpus sequence /// \param pat_begin The start of the patterns sequence -/// \param pat_end One past the end of the patterns sequence -/// \param out Container for results +/// \param pat_end One past the end of the patterns sequence +/// \param cb Callback for matches /// template , typename RAIterator, typename ForwardIterator, typename Callback> @@ -236,14 +236,14 @@ bool aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, /// \fn aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, /// ForwardIterator pat_begin, ForwardIterator pat_end, -/// Callback &out) +/// Callback cb) /// \return true if all callback callings return true, else false /// /// \param corpus_begin The start of the corpus sequence /// \param corpus_end One past the end of the corpus sequence /// \param pat_begin The start of the patterns sequence -/// \param pat_end One past the end of the patterns sequence -/// \param out Container for results +/// \param pat_end One past the end of the patterns sequence +/// \param cb Callback for matches /// template , typename Comp = std::equal_to, typename RAIterator, typename ForwardIterator, typename Callback> From 98ec38a81a10ca67ee6489a5f4d09fc2b342922f Mon Sep 17 00:00:00 2001 From: 
Alexander Date: Wed, 24 Aug 2016 21:40:31 +0300 Subject: [PATCH 13/33] Delete useless README --- README.md | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 0212ff9af..000000000 --- a/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# algorithm -Boost.org algorithm module - -##TODO (Aho-Corasik) -* Solve problem with const char* -* std::map or std::unordered map or smth else? -* internal container - std::vector or maybe smth else? -* Add choosing of insternal container for Node and Tree From ea9f65afc2e018e964ff728e4536232b0ad57da7 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sat, 27 Aug 2016 13:17:30 +0300 Subject: [PATCH 14/33] [micro] Fixed doxygen doc --- include/boost/algorithm/searching/aho_corasick.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index bead293c0..a0ede34f0 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -247,12 +247,12 @@ bool aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, /// template , typename Comp = std::equal_to, typename RAIterator, typename ForwardIterator, typename Callback> -bool aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, - ForwardIterator pat_first, ForwardIterator pat_last, +bool aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, + ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - AhoCorasick obj(pat_first, pat_last); - return obj.find(corpus_first, corpus_last, cb); + AhoCorasick obj(pat_begin, pat_end); + return obj.find(corpus_begin, corpus_end, cb); } }} From a638394a72a4246449430d697b129d57c672b18e Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 28 Aug 2016 19:49:48 +0300 Subject: [PATCH 15/33] [micro] Fixed C++11 compatibility and 
include guard name --- include/boost/algorithm/searching/aho_corasick.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index a0ede34f0..fb66fc2fd 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -6,8 +6,8 @@ See http://www.boost.org/ for latest version. */ -#ifndef AHO_CORASIK_AHO_CORASIK_HPP -#define AHO_CORASIK_AHO_CORASIK_HPP +#ifndef BOOST_ALGORITHM_AHO_CORASICK_HPP +#define BOOST_ALGORITHM_AHO_CORASICK_HPP #include #include @@ -22,7 +22,7 @@ namespace boost { namespace algorithm { -template typename Container, typename ...Args> +template class Container, typename ...Args> class AhoCorasick { private: From 63c7077924edd4f49fdfaeb7dfdda570ecbeac5f Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 28 Aug 2016 19:51:51 +0300 Subject: [PATCH 16/33] [micro] Fixed comment --- include/boost/algorithm/searching/aho_corasick.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index fb66fc2fd..861c1f550 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -256,4 +256,4 @@ bool aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, } }} -#endif //AHO_CORASIK_AHO_CORASIK_HPP +#endif //BOOST_ALGORITHM_AHO_CORASICK_HPP From 517b6d546ce400c3fbc1ba37bc043252a6274469 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 28 Aug 2016 20:30:17 +0300 Subject: [PATCH 17/33] Fix in matching patterns Now if callback returns false for match, we cancel searching. 
--- .../algorithm/searching/aho_corasick.hpp | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 861c1f550..1cc67e795 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -119,15 +119,17 @@ class AhoCorasick template bool find(RAIterator begin, RAIterator end, Callback cb) { - bool result = true; init(); current_state = root.get(); for(auto it = begin; it != end; ++it) { step(*it); - result &= getTermsForCurrentState(it, cb); + if(!getTermsForCurrentState(it, cb)) + { + return false; + } } - return result; + return true; } private: void init() @@ -182,12 +184,14 @@ class AhoCorasick template bool getTermsForCurrentState(RAIterator pos, Callback cb) { - bool result = true; if (current_state->isTerminal()) { for (const auto value : current_state->pat) { - result &= cb(1 + pos - value, pos + 1); + if(!cb(1 + pos - value, pos + 1)) + { + return false; + } } } node_type* temp_node = current_state->term; @@ -195,11 +199,14 @@ class AhoCorasick { for (const auto value : temp_node->pat) { - result &= cb(1 + pos - value, pos + 1); + if(!cb(1 + pos - value, pos + 1)) + { + return false; + } } temp_node = temp_node->term; } - return result; + return true; } }; From dbd435b69918ffea2ca78c74a596ed7d35f57e1b Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Wed, 31 Aug 2016 22:55:11 +0300 Subject: [PATCH 18/33] Fixed multiple init in 'find', renamed to aho_corasick --- .../algorithm/searching/aho_corasick.hpp | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 1cc67e795..b52a0ac02 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -23,21 +23,21 @@ 
namespace boost { namespace algorithm { template class Container, typename ...Args> -class AhoCorasick +class aho_corasick { private: - class MapBorNode + class node { public: - Container, Args...> links; - MapBorNode *fail, *term; + Container, Args...> links; + node *fail, *term; std::vector pat; - MapBorNode(MapBorNode* fail_node = nullptr) + node(node* fail_node = nullptr) : fail(fail_node), term(nullptr) { } - MapBorNode* getLink(const T& c) const + node* getLink(const T& c) const { const auto iter = links.find(c); return iter != links.cend() ? iter->second.get() : nullptr; @@ -50,16 +50,17 @@ class AhoCorasick }; public: using value_type = T; - using node_type = MapBorNode; + using node_type = node; private: std::unique_ptr root; node_type* current_state; size_t countStrings = 0; + bool isInited = false; public: - AhoCorasick() : root(boost::make_unique()) {} + aho_corasick() : root(boost::make_unique()) {} template - explicit AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(boost::make_unique()) + explicit aho_corasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(boost::make_unique()) { while(patBegin != patEnd) { @@ -69,7 +70,7 @@ class AhoCorasick } template - explicit AhoCorasick(const Range& range) : AhoCorasick(boost::begin(range), boost::end(range)) {} + explicit aho_corasick(const Range& range) : aho_corasick(boost::begin(range), boost::end(range)) {} /// \fn insert(const Range& range) /// \brief Insert pattern in trie @@ -91,6 +92,7 @@ class AhoCorasick template void insert(ForwardIterator begin, ForwardIterator end) { + isInited = false; size_t patLen = 0; node_type* current_node = root.get(); for(auto it = begin; it != end; ++it) @@ -119,15 +121,18 @@ class AhoCorasick template bool find(RAIterator begin, RAIterator end, Callback cb) { - init(); + if(!isInited) + { + init(); + } current_state = root.get(); for(auto it = begin; it != end; ++it) { step(*it); if(!getTermsForCurrentState(it, cb)) - { - return false; - } + 
{ + return false; + } } return true; } @@ -164,6 +169,7 @@ class AhoCorasick q.push(child); } } + isInited = true; } void step(const value_type& c) @@ -189,9 +195,9 @@ class AhoCorasick for (const auto value : current_state->pat) { if(!cb(1 + pos - value, pos + 1)) - { - return false; - } + { + return false; + } } } node_type* temp_node = current_state->term; @@ -200,9 +206,9 @@ class AhoCorasick for (const auto value : temp_node->pat) { if(!cb(1 + pos - value, pos + 1)) - { - return false; - } + { + return false; + } } temp_node = temp_node->term; } @@ -212,10 +218,10 @@ class AhoCorasick //Object interface template > -using Aho_Corasick_Map = AhoCorasick; +using aho_corasick_map_obj = aho_corasick; template , typename Comp = std::equal_to> -using Aho_Corasick_HashMap = AhoCorasick; +using aho_corasick_hashmap_obj = aho_corasick; //Functional interface @@ -237,7 +243,7 @@ bool aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - AhoCorasick obj(pat_begin, pat_end); + aho_corasick obj(pat_begin, pat_end); return obj.find(corpus_begin, corpus_end, cb); } @@ -258,7 +264,7 @@ bool aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - AhoCorasick obj(pat_begin, pat_end); + aho_corasick obj(pat_begin, pat_end); return obj.find(corpus_begin, corpus_end, cb); } }} From 3212f73a17efd2c946cdc0098a8b3f608518d106 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sat, 3 Sep 2016 21:16:58 +0300 Subject: [PATCH 19/33] Added range-based 'find' . 
--- include/boost/algorithm/searching/aho_corasick.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index b52a0ac02..81cf67698 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -110,6 +110,19 @@ class aho_corasick current_node->pat.push_back(patLen); } + /// \fn find ( const Range& range, Callback cb) + /// \brief Searches patterns in the corpus + /// \return true if all callback callings return true, else false + /// + /// \param range The range of the data to search + /// \param cb Callback for matches + /// + template + bool find(const Range& range, Callback cb) + { + return find(boost::begin(range), boost::end(range), cb); + } + /// \fn find ( RAIterator begin, RAIterator end, Callback cb) /// \brief Searches patterns in the corpus /// \return true if all callback callings return true, else false From 86e451408e0ad5195ad62dd9bf2d3f9c434ec18c Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 4 Sep 2016 02:20:14 +0300 Subject: [PATCH 20/33] Changed type of root std::unique_ptr -> node_type --- .../boost/algorithm/searching/aho_corasick.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 81cf67698..87b402ff9 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -52,15 +52,15 @@ class aho_corasick using value_type = T; using node_type = node; private: - std::unique_ptr root; + node_type root; node_type* current_state; size_t countStrings = 0; bool isInited = false; public: - aho_corasick() : root(boost::make_unique()) {} + aho_corasick(){} template - explicit aho_corasick(ForwardIterator patBegin, ForwardIterator patEnd) : root(boost::make_unique()) 
+ explicit aho_corasick(ForwardIterator patBegin, ForwardIterator patEnd) { while(patBegin != patEnd) { @@ -94,14 +94,14 @@ class aho_corasick { isInited = false; size_t patLen = 0; - node_type* current_node = root.get(); + node_type* current_node = &root; for(auto it = begin; it != end; ++it) { ++patLen; node_type* child_node = current_node->getLink(*it); if (!child_node) { - std::unique_ptr new_node = boost::make_unique(root.get()); + std::unique_ptr new_node = boost::make_unique(&root); child_node = new_node.get(); current_node->links[*it] = std::move(new_node); } @@ -138,7 +138,7 @@ class aho_corasick { init(); } - current_state = root.get(); + current_state = &root; for(auto it = begin; it != end; ++it) { step(*it); @@ -153,7 +153,7 @@ class aho_corasick void init() { std::queue q; - q.push(root.get()); + q.push(&root); while (!q.empty()) { node_type* current_node = q.front(); @@ -197,7 +197,7 @@ class aho_corasick } current_state = current_state->fail; } - current_state = root.get(); + current_state = &root; } template From 3b37858fb20b7a5857ce8dba7724fac82b277770 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 6 Sep 2016 21:11:51 +0300 Subject: [PATCH 21/33] Deleted memory allocations. 
--- .../algorithm/searching/aho_corasick.hpp | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 87b402ff9..647123696 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -11,14 +11,12 @@ #include #include -#include -#include #include -#include #include #include -#include +#include +#include namespace boost { namespace algorithm { @@ -29,7 +27,7 @@ class aho_corasick class node { public: - Container, Args...> links; + Container links; node *fail, *term; std::vector pat; @@ -37,10 +35,10 @@ class aho_corasick : fail(fail_node), term(nullptr) { } - node* getLink(const T& c) const + node* getLink(const T& c) { - const auto iter = links.find(c); - return iter != links.cend() ? iter->second.get() : nullptr; + auto iter = links.find(c); + return iter != links.end() ? &iter->second : nullptr; } bool isTerminal() const @@ -101,9 +99,9 @@ class aho_corasick node_type* child_node = current_node->getLink(*it); if (!child_node) { - std::unique_ptr new_node = boost::make_unique(&root); - child_node = new_node.get(); + node new_node; current_node->links[*it] = std::move(new_node); + child_node = ¤t_node->links[*it]; } current_node = child_node; } @@ -158,11 +156,11 @@ class aho_corasick { node_type* current_node = q.front(); q.pop(); - for (auto iter = current_node->links.cbegin(); - iter != current_node->links.cend(); ++iter) + for (auto iter = current_node->links.begin(); + iter != current_node->links.end(); ++iter) { const value_type& symbol = iter->first; - node_type* child = iter->second.get(); + node_type* child = &iter->second; // Defining .fail for the childnode node_type* temp_node = current_node->fail; @@ -231,10 +229,10 @@ class aho_corasick //Object interface template > -using aho_corasick_map_obj = aho_corasick; +using aho_corasick_map_obj = 
aho_corasick; template , typename Comp = std::equal_to> -using aho_corasick_hashmap_obj = aho_corasick; +using aho_corasick_hashmap_obj = aho_corasick; //Functional interface @@ -256,7 +254,7 @@ bool aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - aho_corasick obj(pat_begin, pat_end); + aho_corasick obj(pat_begin, pat_end); return obj.find(corpus_begin, corpus_end, cb); } @@ -277,7 +275,7 @@ bool aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - aho_corasick obj(pat_begin, pat_end); + aho_corasick obj(pat_begin, pat_end); return obj.find(corpus_begin, corpus_end, cb); } }} From b6382e6efa6c28ab0caf855dc02441f4a74817ba Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 13 Sep 2016 20:26:14 +0300 Subject: [PATCH 22/33] [micro] Refactoring --- include/boost/algorithm/searching/aho_corasick.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 647123696..ccc60dafd 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -99,8 +99,7 @@ class aho_corasick node_type* child_node = current_node->getLink(*it); if (!child_node) { - node new_node; - current_node->links[*it] = std::move(new_node); + current_node->links[*it] = node(); child_node = ¤t_node->links[*it]; } current_node = child_node; From ae8c3c3ce4166298eca2230b949e19c75c937fc7 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 13 Sep 2016 21:47:11 +0300 Subject: [PATCH 23/33] Fix serious bug in searching. 
--- include/boost/algorithm/searching/aho_corasick.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index ccc60dafd..5309f3e15 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -99,7 +99,7 @@ class aho_corasick node_type* child_node = current_node->getLink(*it); if (!child_node) { - current_node->links[*it] = node(); + current_node->links[*it] = node(&root); child_node = ¤t_node->links[*it]; } current_node = child_node; From 87117ce66752b86918596f24c1ee9d2735d82635 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Thu, 15 Sep 2016 22:24:36 +0300 Subject: [PATCH 24/33] Deleted std::map and std::unordered_map versions. --- .../algorithm/searching/aho_corasick.hpp | 43 +++++-------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 5309f3e15..fc46708cd 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -15,13 +15,13 @@ #include #include -#include -#include +#include + namespace boost { namespace algorithm { template class Container, typename ...Args> -class aho_corasick +class aho_corasick_base { private: class node @@ -55,10 +55,10 @@ class aho_corasick size_t countStrings = 0; bool isInited = false; public: - aho_corasick(){} + aho_corasick_base(){} template - explicit aho_corasick(ForwardIterator patBegin, ForwardIterator patEnd) + explicit aho_corasick_base(ForwardIterator patBegin, ForwardIterator patEnd) { while(patBegin != patEnd) { @@ -68,7 +68,7 @@ class aho_corasick } template - explicit aho_corasick(const Range& range) : aho_corasick(boost::begin(range), boost::end(range)) {} + explicit aho_corasick_base(const Range& range) : 
aho_corasick_base(boost::begin(range), boost::end(range)) {} /// \fn insert(const Range& range) /// \brief Insert pattern in trie @@ -228,15 +228,12 @@ class aho_corasick //Object interface template > -using aho_corasick_map_obj = aho_corasick; - -template , typename Comp = std::equal_to> -using aho_corasick_hashmap_obj = aho_corasick; +using aho_corasick = aho_corasick_base; //Functional interface -/// \fn aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, +/// \fn aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, /// ForwardIterator pat_begin, ForwardIterator pat_end, /// Callback cb) /// \return true if all callback callings return true, else false @@ -249,34 +246,14 @@ using aho_corasick_hashmap_obj = aho_corasick, typename RAIterator, typename ForwardIterator, typename Callback> -bool aho_corasick_map ( RAIterator corpus_begin, RAIterator corpus_end, +bool aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - aho_corasick obj(pat_begin, pat_end); + aho_corasick_base obj(pat_begin, pat_end); return obj.find(corpus_begin, corpus_end, cb); } -/// \fn aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, -/// ForwardIterator pat_begin, ForwardIterator pat_end, -/// Callback cb) -/// \return true if all callback callings return true, else false -/// -/// \param corpus_begin The start of the corpus sequence -/// \param corpus_end One past the end of the corpus sequence -/// \param pat_begin The start of the patterns sequence -/// \param pat_end One past the end of the patterns sequence -/// \param cb Callback for matches -/// -template , typename Comp = std::equal_to, typename RAIterator, - typename ForwardIterator, typename Callback> -bool aho_corasick_hashmap ( RAIterator corpus_begin, RAIterator corpus_end, - ForwardIterator pat_begin, ForwardIterator pat_end, - Callback cb) -{ - aho_corasick obj(pat_begin, pat_end); - return 
obj.find(corpus_begin, corpus_end, cb); -} }} #endif //BOOST_ALGORITHM_AHO_CORASICK_HPP From 48b2bc105394e09ef6ec91a3eaf8593b49a81e4f Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sat, 17 Sep 2016 10:48:58 +0300 Subject: [PATCH 25/33] Fixed compile error --- include/boost/algorithm/searching/aho_corasick.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index fc46708cd..5ee5b6d93 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -250,7 +250,7 @@ bool aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, ForwardIterator pat_begin, ForwardIterator pat_end, Callback cb) { - aho_corasick_base obj(pat_begin, pat_end); + aho_corasick_base obj(pat_begin, pat_end); return obj.find(corpus_begin, corpus_end, cb); } From 47e6b973dfbcd7563131fbb16b003dc5d2fcf37a Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Fri, 7 Oct 2016 08:41:15 +0300 Subject: [PATCH 26/33] Added source of implementation, refactoring --- doc/aho_corasick.qbk | 2 ++ include/boost/algorithm/searching/aho_corasick.hpp | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk index 6f7031aa1..a7c0e83cb 100644 --- a/doc/aho_corasick.qbk +++ b/doc/aho_corasick.qbk @@ -22,6 +22,8 @@ The algorithm was conceived in 1975 by Alfred V. Aho and Margaret J. Corasick. Nomenclature: The nomenclature is similar to that of the Knuth Morris Pratt implementation in Boost.Algorithm. The sequence being searched for is referred to as the "pattern", and the sequence being searched in is referred to as the "corpus". +See more in "Set Matching and Aho–Corasick Algorithm", lecture slides by Pekka Kilpeläinen(http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf). 
+ [heading Interface] For flexibility, the Aho-Corasick algorithm has two interfaces; an object-based interface and a procedural one. The object-based interface builds the trie in the constructor, and uses 'find()' to make suffix links and perform the search. The procedural interface builds the trie(with building suffix links) and does the search all in one step. If you are going to be searching for the same pattern in multiple corpora, then you should use the object interface, and only build the tries once. diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp index 5ee5b6d93..fb0b7ef19 100644 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ b/include/boost/algorithm/searching/aho_corasick.hpp @@ -52,7 +52,6 @@ class aho_corasick_base private: node_type root; node_type* current_state; - size_t countStrings = 0; bool isInited = false; public: aho_corasick_base(){} From 903cb3468cf5bc8809b9c727815ced99f23c0209 Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 11 Oct 2016 11:23:34 +0300 Subject: [PATCH 27/33] Init for Manacker's algorithm --- include/boost/algorithm/manacker.hpp | 89 ++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 include/boost/algorithm/manacker.hpp diff --git a/include/boost/algorithm/manacker.hpp b/include/boost/algorithm/manacker.hpp new file mode 100644 index 000000000..a841da7ec --- /dev/null +++ b/include/boost/algorithm/manacker.hpp @@ -0,0 +1,89 @@ +#ifndef MANACKER_H +#define MANACKER_H + +#include +#include + +#include +#include + +namespace boost { namespace algorithm { + +template +std::vector> manacker(RAIterator begin, RAIterator end) +{ + size_t length = std::distance(begin, end); + std::vector ansPalN2(length), ansPal2(length); + + int leftBorder = 0, rightBorder = -1, tempMirror;//start digits for algortihm + for (int i = 0; i < length; ++i) + { + tempMirror = (i > rightBorder ? 
0 : std::min(ansPalN2[leftBorder + rightBorder - i], rightBorder - i)) + + 1;//find mirror of current index + while (i + tempMirror < length && i - tempMirror >= 0 && + begin[i - tempMirror] == begin[i + tempMirror])//increase our index + { + ++tempMirror; + } + ansPalN2[i] = --tempMirror; + if (i + tempMirror > rightBorder)//try to increase our right border of palindrom + { + leftBorder = i - tempMirror; + rightBorder = i + tempMirror; + } + } + + //--------------------------------------------------------------- + + leftBorder = 0, rightBorder = -1, tempMirror = 0; + for (int i = 0; i < length; ++i) + { + tempMirror = + (i > rightBorder ? 0 : std::min(ansPal2[leftBorder + rightBorder - i + 1], rightBorder - i + 1)) + + 1; + while (i + tempMirror - 1 < length && i - tempMirror >= 0 && + begin[i - tempMirror] == begin[i + tempMirror - 1]) + { + ++tempMirror; + } + ansPal2[i] = --tempMirror; + if (i + tempMirror - 1 > rightBorder) + { + leftBorder = i - tempMirror; + rightBorder = i + tempMirror - 1; + } + } + + //------------------------------------------------------ + std::cout << std::endl; + + std::vector> result; + for(size_t i = 0; i < length; ++i) + { + result.push_back({begin + i - ansPalN2[i], begin + i + ansPalN2[i] + 1}); + } + for(size_t i = 0; i < length; ++i) + { + if(ansPal2[i] != 0) + result.push_back({begin + i - ansPal2[i], begin + i + ansPal2[i]}); + } + return result; +} + +template +auto manacker(Range range) +{ + return manacker(boost::begin(range), boost::end(range)); +}; + +}} + +//Find palindroms like 2*N+1 + + +//Find palindroms like 2*N +//See PalN2. +//P.S. About magic numbers : you can read about this in the description of the algorithm of Manacker. 
+//These numbers need for finding palindroms like 2*N because not allowed to find centre of these palindrom + +#endif // MANACKER_H From 19c8292cdbb21baf1baad95bc1ab33c0bc42e05f Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 11 Oct 2016 11:28:59 +0300 Subject: [PATCH 28/33] Deleted trash --- .../algorithm/searching/aho_corasick.hpp | 258 ------------------ 1 file changed, 258 deletions(-) delete mode 100644 include/boost/algorithm/searching/aho_corasick.hpp diff --git a/include/boost/algorithm/searching/aho_corasick.hpp b/include/boost/algorithm/searching/aho_corasick.hpp deleted file mode 100644 index fb0b7ef19..000000000 --- a/include/boost/algorithm/searching/aho_corasick.hpp +++ /dev/null @@ -1,258 +0,0 @@ -/* - Copyright (c) Alexander Zaitsev , 2016 - Distributed under the Boost Software License, Version 1.0. (See - accompanying file LICENSE_1_0.txt or copy at - http://www.boost.org/LICENSE_1_0.txt) - See http://www.boost.org/ for latest version. -*/ - -#ifndef BOOST_ALGORITHM_AHO_CORASICK_HPP -#define BOOST_ALGORITHM_AHO_CORASICK_HPP - -#include -#include -#include - -#include -#include -#include - - -namespace boost { namespace algorithm { - -template class Container, typename ...Args> -class aho_corasick_base -{ -private: - class node - { - public: - Container links; - node *fail, *term; - std::vector pat; - - node(node* fail_node = nullptr) - : fail(fail_node), term(nullptr) - { } - - node* getLink(const T& c) - { - auto iter = links.find(c); - return iter != links.end() ? 
&iter->second : nullptr; - } - - bool isTerminal() const - { - return !pat.empty(); - } - }; -public: - using value_type = T; - using node_type = node; -private: - node_type root; - node_type* current_state; - bool isInited = false; -public: - aho_corasick_base(){} - - template - explicit aho_corasick_base(ForwardIterator patBegin, ForwardIterator patEnd) - { - while(patBegin != patEnd) - { - insert(*patBegin); - ++patBegin; - } - } - - template - explicit aho_corasick_base(const Range& range) : aho_corasick_base(boost::begin(range), boost::end(range)) {} - - /// \fn insert(const Range& range) - /// \brief Insert pattern in trie - /// - /// \param range The pattern range - /// - template - void insert(const Range& range) - { - insert(boost::begin(range), boost::end(range)); - } - - /// \fn insert(ForwardIterator begin, ForwardIterator end) - /// \brief Insert pattern in trie - /// - /// \param begin The start of the pattern - /// \param end One past the end of the pattern - /// - template - void insert(ForwardIterator begin, ForwardIterator end) - { - isInited = false; - size_t patLen = 0; - node_type* current_node = &root; - for(auto it = begin; it != end; ++it) - { - ++patLen; - node_type* child_node = current_node->getLink(*it); - if (!child_node) - { - current_node->links[*it] = node(&root); - child_node = ¤t_node->links[*it]; - } - current_node = child_node; - } - current_node->pat.push_back(patLen); - } - - /// \fn find ( const Range& range, Callback cb) - /// \brief Searches patterns in the corpus - /// \return true if all callback callings return true, else false - /// - /// \param range The range of the data to search - /// \param cb Callback for matches - /// - template - bool find(const Range& range, Callback cb) - { - return find(boost::begin(range), boost::end(range), cb); - } - - /// \fn find ( RAIterator begin, RAIterator end, Callback cb) - /// \brief Searches patterns in the corpus - /// \return true if all callback callings return true, else false 
- /// - /// \param begin The start of the data to search (Random Access Iterator) - /// \param end One past the end of the data to search (Random Access Iterator) - /// \param cb Callback for matches - /// - template - bool find(RAIterator begin, RAIterator end, Callback cb) - { - if(!isInited) - { - init(); - } - current_state = &root; - for(auto it = begin; it != end; ++it) - { - step(*it); - if(!getTermsForCurrentState(it, cb)) - { - return false; - } - } - return true; - } -private: - void init() - { - std::queue q; - q.push(&root); - while (!q.empty()) - { - node_type* current_node = q.front(); - q.pop(); - for (auto iter = current_node->links.begin(); - iter != current_node->links.end(); ++iter) - { - const value_type& symbol = iter->first; - node_type* child = &iter->second; - - // Defining .fail for the childnode - node_type* temp_node = current_node->fail; - while (temp_node) - { - node_type* fail_candidate = temp_node->getLink(symbol); - if (fail_candidate) - { - child->fail = fail_candidate; - break; - } - temp_node = temp_node->fail; - } - - // Defining .term for the childnode using .term of current node - child->term = (child->fail == nullptr || child->fail->isTerminal()) ? 
child->fail : child->fail->term; - q.push(child); - } - } - isInited = true; - } - - void step(const value_type& c) - { - while (current_state) - { - node_type* candidate = current_state->getLink(c); - if (candidate) - { - current_state = candidate; - return; - } - current_state = current_state->fail; - } - current_state = &root; - } - - template - bool getTermsForCurrentState(RAIterator pos, Callback cb) - { - if (current_state->isTerminal()) - { - for (const auto value : current_state->pat) - { - if(!cb(1 + pos - value, pos + 1)) - { - return false; - } - } - } - node_type* temp_node = current_state->term; - while (temp_node) - { - for (const auto value : temp_node->pat) - { - if(!cb(1 + pos - value, pos + 1)) - { - return false; - } - } - temp_node = temp_node->term; - } - return true; - } -}; - -//Object interface -template > -using aho_corasick = aho_corasick_base; - - -//Functional interface - -/// \fn aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, -/// ForwardIterator pat_begin, ForwardIterator pat_end, -/// Callback cb) -/// \return true if all callback callings return true, else false -/// -/// \param corpus_begin The start of the corpus sequence -/// \param corpus_end One past the end of the corpus sequence -/// \param pat_begin The start of the patterns sequence -/// \param pat_end One past the end of the patterns sequence -/// \param cb Callback for matches -/// -template , typename RAIterator, - typename ForwardIterator, typename Callback> -bool aho_corasick_search ( RAIterator corpus_begin, RAIterator corpus_end, - ForwardIterator pat_begin, ForwardIterator pat_end, - Callback cb) -{ - aho_corasick_base obj(pat_begin, pat_end); - return obj.find(corpus_begin, corpus_end, cb); -} - -}} - -#endif //BOOST_ALGORITHM_AHO_CORASICK_HPP From fbe07645142bff8e4b0d427602f811bfb375bd9d Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 11 Oct 2016 11:33:09 +0300 Subject: [PATCH 29/33] Deleted some more trash --- 
doc/aho_corasick.qbk | 143 ------------------------------- doc/algorithm.qbk | 1 - example/Jamfile.v2 | 2 +- example/aho_corasick_example.cpp | 41 --------- test/Jamfile.v2 | 1 - test/aho_corasick_test.cpp | 122 -------------------------- 6 files changed, 1 insertion(+), 309 deletions(-) delete mode 100644 doc/aho_corasick.qbk delete mode 100644 example/aho_corasick_example.cpp delete mode 100644 test/aho_corasick_test.cpp diff --git a/doc/aho_corasick.qbk b/doc/aho_corasick.qbk deleted file mode 100644 index a7c0e83cb..000000000 --- a/doc/aho_corasick.qbk +++ /dev/null @@ -1,143 +0,0 @@ -[/ QuickBook Document version 1.5 ] - -[section:AhoCorasick Aho-Corasick Search] - -[/license - -Copyright (c) 2016 Alexander Zaitsev - -Distributed under the Boost Software License, Version 1.0. -(See accompanying file LICENSE_1_0.txt or copy at -http://www.boost.org/LICENSE_1_0.txt) -] - - -[heading Overview] - -The header file 'aho_corasick.hpp' contains an implementation of the Aho-Corasick algorithm for searching sequences of values. It is primarily used to search for multiple patterns within a corpus. - -The Aho-Corasick algorithm works by building a trie (a tree with each node corresponding to an object) of the patterns sequences and traversing the trie to search for the pattern in a given corpus sequence. Additionally, the Aho-Corasick introduced the concept of "failure pointer/failure node" which is the node to be traversed when there is a mismatch. - -The algorithm was conceived in 1975 by Alfred V. Aho and Margaret J. Corasick. Their paper "Efficient string matching: An aid to bibliographic search" was published in the Communications of the ACM. - -Nomenclature: The nomenclature is similar to that of the Knuth Morris Pratt implementation in Boost.Algorithm. The sequence being searched for is referred to as the "pattern", and the sequence being searched in is referred to as the "corpus". 
- -See more in "Set Matching and Aho–Corasick Algorithm", lecture slides by Pekka Kilpeläinen(http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf). - -[heading Interface] - -For flexibility, the Aho-Corasick algorithm has two interfaces; an object-based interface and a procedural one. The object-based interface builds the trie in the constructor, and uses 'find()' to make suffix links and perform the search. The procedural interface builds the trie(with building suffix links) and does the search all in one step. If you are going to be searching for the same pattern in multiple corpora, then you should use the object interface, and only build the tries once. - -The header file 'aho_corasick.hpp' contains two versions of Aho-Corasick: based on std::map and std::unordered_map. Also there is class AhoCorasick, which you can customize. For every version this header file provide functional and object-based interfaces. - -Procedural interfaces: - -Procedural interfaces provide interfaces based on iterators. - -For Aho-Corasick based on std::map: - -`` -template , typename RAIterator, - typename ForwardIterator, typename Callback> -bool aho_corasick_map ( RAIterator corpus_first, RAIterator corpus_last, - ForwardIterator pat_first, ForwardIterator pat_last, - Callback cb); -`` - -For Aho-Corasick based on std::unordered_map: -`` -template , typename Comp = std::equal_to, typename RAIterator, - typename ForwardIterator, typename Callback> -bool aho_corasick_hashmap ( RAIterator corpus_first, RAIterator corpus_last, - ForwardIterator pat_first, ForwardIterator pat_last, - Callback cb); -`` - - - -Object interface (typedefs): -`` -template > -using Aho_Corasick_Map = AhoCorasick; - -template , typename Comp = std::equal_to> -using Aho_Corasick_HashMap = AhoCorasick; -`` - -Interface (constructors, etc.) 
are equal for Aho_Corasick_Map, Aho_Corasick_HashMap and basical AhoCorasick: -`` -AhoCorasick(); - -template -explicit AhoCorasick(ForwardIterator patBegin, ForwardIterator patEnd); - -template -explicit AhoCorasick(const Range& range); - -template -void insert(ForwardIterator begin, ForwardIterator end); - -template -void insert(const Range& range); - -template -bool find(RAIterator begin, RAIterator end, Callback cb); -`` - -[heading Return value] - -The 'find' method returns true, if all Callback callings return true, otherwise returns false. - -[heading Requirements] - -C++11-compatible compiler required. - -For Aho_Corasick_HashMap and aho_corasick_hashmap: by default use std::hash for Hash and std::equal_to as Comparator. If you type doesn't support it, you must use your own functions for this. Without Hash and Comparator algorithm doesn't work. - -For Aho_Corasick_Map and aho_corasick_map: by default use std::less as Predicate. If you type doesn't support it, you must use your own functions for this. Without Predicate algorithm doesn't work. - -[heading Performance] - -Performance of Aho_Corasick_Map and Aho_Corasick_HashMap is similar on small alphabets. On large alphabets Aho_Corasick_HashMap is faster than Aho_Corasick_Map. Remember, that getting hash of element is slow operation. Also if you use Aho_Corasick_HashMap, std::unordered_map can sometimes do rehash with O(Alphabet). - -[heading Memory Use] - -Every node of trie consist of container of std::shared_ptr to trie nodes, which you choose(std::map, std::unordered_map or maybe something else), two std::shared_ptr to trie nodes and std:vector of length of patterns, which that ends in this node. Count of nodes is linear in the sum of the length of the patterns. 
- -[heading Complexity] - -Nomenclature: M - sum of the patterns length, N - length of the corpus, K - alphabet size, T - number of coincidences - -std::unordered_map-based version: -Time: O(M + N + T), Memory: O(M) -std::map-based version: -Time: O((M + N)log(K) + T), Memory: O(M). - -[heading Exception Safety] - -Both the object-oriented and procedural versions of the Aho-Corasick algorithm take all their parameters by value(exclude output container, taked by non-const reference). Therefore, both interfaces provide the strong exception guarantee. - -[heading Notes] - -* When using the object-based interface, the pattern must remain unchanged for during the inserting. - -* The Aho-Corasick algorithm requires forward iterators for patterns and random-access iterators for the corpus. - -[heading Customization points] - -For using Aho-Corasick algorithms you must use your own Callback(RAIterator, RAIterator) -> bool. This Callback must returns true if all is fine, otherwise false. - -In Aho_Corasick_HashMap and aho_corasick_hashmap() you can customize: value type, hash and compare functions. - -In Aho_Corasick_Map and aho_corasick_map() you can customize: value type and predicate. - -In AhoCorasick you can customize: value type, type of container and any other template parameters. It container will be used in nodes of the trie. Defining of the container: Container, Args...>. So your other template parameters will be used as Args... . Also your container must support 'find' method. - -[endsect] - -[/ File aho_corasick.qbk -Copyright 2016 Alexander Zaitsev -Distributed under the Boost Software License, Version 1.0. -(See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt). 
-] - diff --git a/doc/algorithm.qbk b/doc/algorithm.qbk index 014740ec8..1568fb50e 100644 --- a/doc/algorithm.qbk +++ b/doc/algorithm.qbk @@ -41,7 +41,6 @@ Thanks to all the people who have reviewed this library and made suggestions for [section:Searching Searching Algorithms] -[include aho_corasick.qbk] [include boyer_moore.qbk] [include boyer_moore_horspool.qbk] [include knuth_morris_pratt.qbk] diff --git a/example/Jamfile.v2 b/example/Jamfile.v2 index b1d937d8f..f81e01f26 100644 --- a/example/Jamfile.v2 +++ b/example/Jamfile.v2 @@ -21,5 +21,5 @@ project /boost/algorithm/example exe clamp_example : clamp_example.cpp ; exe search_example : search_example.cpp ; exe is_palindrome_example : is_palindrome_example.cpp ; -exe aho_corasick_example : aho_corasick_example.cpp ; + diff --git a/example/aho_corasick_example.cpp b/example/aho_corasick_example.cpp deleted file mode 100644 index d2bc6e283..000000000 --- a/example/aho_corasick_example.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - Copyright (c) Alexander Zaitsev , 2016 - - Distributed under the Boost Software License, Version 1.0. 
(See accompanying - file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - - For more information, see http://www.boost.org -*/ - -#include -#include -#include - -#include - - -int main() -{ - std::vector pat({"228", "he", "is", "1488", "she", "his", "322", "her", - "h", "hishera", "azaza"}); - std::string corp = "hisher"; - std::vector> out; - - bool result = boost::algorithm::aho_corasick_map(corp.begin(), corp.end(), pat.begin(), pat.end(), - [&out](std::string::const_iterator begin, std::string::const_iterator end) -> bool - { out.push_back({begin, end}); return true; }); - - std::cout << result << std::endl; - for(const auto& val: out) - { - auto begin = val.first; - auto end = val.second; - while (begin != end) - { - std::cout << *begin; - ++begin; - } - std::cout << std::endl; - } - return 0; -} \ No newline at end of file diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 8360cabdb..fb00843ad 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -26,7 +26,6 @@ alias unit_test_framework [ compile-fail search_fail1.cpp : : : : ] [ compile-fail search_fail2.cpp : : : : ] [ compile-fail search_fail3.cpp : : : : ] - [ run aho_corasick_test.cpp aho_corasick_test : : : : aho_corasick_test ] # Misc tests [ run clamp_test.cpp unit_test_framework : : : : clamp_test ] diff --git a/test/aho_corasick_test.cpp b/test/aho_corasick_test.cpp deleted file mode 100644 index 98c92cc59..000000000 --- a/test/aho_corasick_test.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - Copyright (c) Alexander Zaitsev , 2016 - Distributed under the Boost Software License, Version 1.0. (See - accompanying file LICENSE_1_0.txt or copy at - http://www.boost.org/LICENSE_1_0.txt) - See http://www.boost.org/ for latest version. 
-*/ - -#include -#include - -#define BOOST_TEST_MAIN -#include - -#include -#include -#include -#include -#include - - -namespace ba = boost::algorithm; -const std::vector> patterns({std::vector({"he", "is", "she", "his", "her", "h", "hishera", "azaza"}), - std::vector({"he", "she", "his", "her", "he", "usher", "d sh", "she hi"}), - std::vector({"he", "she", "his", "her", "he", "usher", "d sh", "she hi"})}); - -const std::vector corpus({"hisher", - "usher and she he her", - ""}); - -const std::vector> rightResults({std::vector({"h", "his", "is", "h", "she", "he", "her"}), - std::vector({"she", "he", "he", "usher", "her", "d sh", "she", - "he", "he", "he", "he", "he", "he", "her"}), - std::vector()});; -template -void fromIteratorsToContainer(const Cont1& from, Cont2& to) -{ - for (const auto &val: from) - { - T str; - auto begin = val.first; - auto end = val.second; - while (begin != end) - { - str += *begin; - ++begin; - } - to.push_back(std::move(str)); - } -} - -void test_aho_corasick() -{ - BOOST_CHECK(patterns.size() == corpus.size()); - //aho_corasick_map - for(size_t i = 0; i < patterns.size(); ++i) - { - std::vector> res; - std::vector localResult; - ba::aho_corasick_map(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), - [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool - { res.push_back({begin, end}); return true; }); - fromIteratorsToContainer(res, localResult); - BOOST_CHECK(localResult == rightResults[i]); - } - - //aho_corasick_hashmap - for(size_t i = 0; i < patterns.size(); ++i) - { - std::vector> res; - std::vector localResult; - ba::aho_corasick_hashmap(corpus[i].begin(), corpus[i].end(), patterns[i].begin(), patterns[i].end(), - [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool - { res.push_back({begin, end}); return true; }); - fromIteratorsToContainer(res, localResult); - BOOST_CHECK(localResult == rightResults[i]); - } - - //Aho_Corasick_Map - 
for(size_t i = 0; i < patterns.size(); ++i) - { - std::vector> res; - std::vector localResult; - ba::Aho_Corasick_Map obj(patterns[i].begin(), patterns[i].end()); - obj(corpus[i].begin(), corpus[i].end(), - [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool - { res.push_back({begin, end}); return true; }); - fromIteratorsToContainer(res, localResult); - BOOST_CHECK(localResult == rightResults[i]); - } - - //Aho_Corasick_HashMap - for(size_t i = 0; i < patterns.size(); ++i) - { - std::vector> res; - std::vector localResult; - ba::Aho_Corasick_HashMap obj(patterns[i].begin(), patterns[i].end()); - obj(corpus[i].begin(), corpus[i].end(), - [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool - { res.push_back({begin, end}); return true; }); - fromIteratorsToContainer(res, localResult); - BOOST_CHECK(localResult == rightResults[i]); - } - - //General AhoCorasick - for(size_t i = 0; i < patterns.size(); ++i) - { - std::vector> res; - std::vector localResult; - ba::AhoCorasick, std::equal_to> obj(patterns[i].begin(), patterns[i].end()); - obj(corpus[i].begin(), corpus[i].end(), - [&res](std::string::const_iterator begin, std::string::const_iterator end) -> bool - { res.push_back({begin, end}); return true; }); - fromIteratorsToContainer(res, localResult); - BOOST_CHECK(localResult == rightResults[i]); - } -} - -BOOST_AUTO_TEST_CASE( test_main ) -{ - test_aho_corasick(); -} \ No newline at end of file From b558132a3ca08992304b97b0ce93f99e68e4f7aa Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Tue, 11 Oct 2016 11:58:45 +0300 Subject: [PATCH 30/33] Refactoring --- include/boost/algorithm/manacker.hpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/include/boost/algorithm/manacker.hpp b/include/boost/algorithm/manacker.hpp index a841da7ec..64d6d1aa0 100644 --- a/include/boost/algorithm/manacker.hpp +++ b/include/boost/algorithm/manacker.hpp @@ -1,5 +1,13 @@ 
-#ifndef MANACKER_H -#define MANACKER_H +/* + Copyright (c) Alexander Zaitsev , 2016 + Distributed under the Boost Software License, Version 1.0. (See + accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) + See http://www.boost.org/ for latest version. +*/ + +#ifndef BOOST_ALGORITHM_MANACKER_HPP +#define BOOST_ALGORITHM_MANACKER_HPP #include #include @@ -15,6 +23,7 @@ std::vector> manacker(RAIterator begin, RAIter size_t length = std::distance(begin, end); std::vector ansPalN2(length), ansPal2(length); + //Find palindroms like 2*N+1 int leftBorder = 0, rightBorder = -1, tempMirror;//start digits for algortihm for (int i = 0; i < length; ++i) { @@ -35,6 +44,10 @@ std::vector> manacker(RAIterator begin, RAIter //--------------------------------------------------------------- + //Find palindroms like 2*N + //See PalN2. + //P.S. About magic numbers : you can read about this in the description of the algorithm of Manacker. + //These numbers need for finding palindroms like 2*N because not allowed to find centre of these palindrom leftBorder = 0, rightBorder = -1, tempMirror = 0; for (int i = 0; i < length; ++i) { @@ -55,7 +68,6 @@ std::vector> manacker(RAIterator begin, RAIter } //------------------------------------------------------ - std::cout << std::endl; std::vector> result; for(size_t i = 0; i < length; ++i) @@ -78,12 +90,4 @@ auto manacker(Range range) }} -//Find palindroms like 2*N+1 - - -//Find palindroms like 2*N -//See PalN2. -//P.S. About magic numbers : you can read about this in the description of the algorithm of Manacker. 
-//These numbers need for finding palindroms like 2*N because not allowed to find centre of these palindrom - -#endif // MANACKER_H +#endif // BOOST_ALGORITHM_MANACKER_HPP From 02d05af92927220b2e313e8cb80849eff86e47db Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 20 Nov 2016 03:07:10 +0300 Subject: [PATCH 31/33] [micro] Added reserving memory --- include/boost/algorithm/manacker.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/boost/algorithm/manacker.hpp b/include/boost/algorithm/manacker.hpp index 64d6d1aa0..af53bba54 100644 --- a/include/boost/algorithm/manacker.hpp +++ b/include/boost/algorithm/manacker.hpp @@ -70,6 +70,7 @@ std::vector> manacker(RAIterator begin, RAIter //------------------------------------------------------ std::vector> result; + result.reserve(2 * length); for(size_t i = 0; i < length; ++i) { result.push_back({begin + i - ansPalN2[i], begin + i + ansPalN2[i] + 1}); From ddd3b616841237f2d5d03af9e5e07876519e2f3b Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sat, 26 Nov 2016 00:16:56 +0300 Subject: [PATCH 32/33] Lazy Manacker --- include/boost/algorithm/manacker.hpp | 136 ++++++++++++++++++--------- 1 file changed, 89 insertions(+), 47 deletions(-) diff --git a/include/boost/algorithm/manacker.hpp b/include/boost/algorithm/manacker.hpp index 64d6d1aa0..6ce3d5c90 100644 --- a/include/boost/algorithm/manacker.hpp +++ b/include/boost/algorithm/manacker.hpp @@ -6,88 +6,130 @@ See http://www.boost.org/ for latest version. */ +/// \file manacker.hpp +/// \brief Finds all palindromes in a sequence. 
+/// \author Alexander Zaitsev + #ifndef BOOST_ALGORITHM_MANACKER_HPP #define BOOST_ALGORITHM_MANACKER_HPP #include #include +#include +#include #include #include namespace boost { namespace algorithm { -template -std::vector> manacker(RAIterator begin, RAIterator end) + + +template ::value_type>> +class manacker_class { - size_t length = std::distance(begin, end); - std::vector ansPalN2(length), ansPal2(length); +public: + manacker_class(Iter begin, Iter end, BinaryPredicate p = BinaryPredicate()) + : begin_(begin), end_(end), p_(p) + { + length_ = std::distance(begin_, end_); + answer_.resize(length_); + } + + template + manacker_class(const Range& r, BinaryPredicate p = BinaryPredicate()) + : manacker_class(boost::begin(r), boost::end(r), p) + { + } - //Find palindroms like 2*N+1 - int leftBorder = 0, rightBorder = -1, tempMirror;//start digits for algortihm - for (int i = 0; i < length; ++i) + std::pair next() { - tempMirror = (i > rightBorder ? 0 : std::min(ansPalN2[leftBorder + rightBorder - i], rightBorder - i)) + - 1;//find mirror of current index - while (i + tempMirror < length && i - tempMirror >= 0 && - begin[i - tempMirror] == begin[i + tempMirror])//increase our index + //if cannot find palindrome, returns {corp_end, corp_end} + std::pair ans; + if(flag_ == 0) { - ++tempMirror; + ans = calcOdd(); } - ansPalN2[i] = --tempMirror; - if (i + tempMirror > rightBorder)//try to increase our right border of palindrom + else if(flag_ == 1) { - leftBorder = i - tempMirror; - rightBorder = i + tempMirror; + ans = calcEven(); } + else + { + return std::pair(end_, end_); + } + + if(i == length_) + { + restoreToDefault(); + } + + + return ans; + } +private: + void restoreToDefault() + { + ++flag_; + leftBorder = 0, rightBorder = -1, tempMirror = 0, i = 0; + std::fill(answer_.begin(), answer_.end(), 0); } - //--------------------------------------------------------------- - //Find palindroms like 2*N - //See PalN2. - //P.S. 
About magic numbers : you can read about this in the description of the algorithm of Manacker. - //These numbers need for finding palindroms like 2*N because not allowed to find centre of these palindrom - leftBorder = 0, rightBorder = -1, tempMirror = 0; - for (int i = 0; i < length; ++i) + std::pair calcOdd() { - tempMirror = - (i > rightBorder ? 0 : std::min(ansPal2[leftBorder + rightBorder - i + 1], rightBorder - i + 1)) + - 1; - while (i + tempMirror - 1 < length && i - tempMirror >= 0 && - begin[i - tempMirror] == begin[i + tempMirror - 1]) + tempMirror = (i > rightBorder ? 0 : std::min(answer_[leftBorder + rightBorder - i], + rightBorder - i)) + 1;//find mirror of current index + while (i + tempMirror < length_ && i - tempMirror >= 0 && + p_(begin_[i - tempMirror], begin_[i + tempMirror]))//increase our index { ++tempMirror; } - ansPal2[i] = --tempMirror; - if (i + tempMirror - 1 > rightBorder) + answer_[i] = --tempMirror; + if (i + tempMirror > rightBorder)//try to increase our right border of palindrom { leftBorder = i - tempMirror; - rightBorder = i + tempMirror - 1; + rightBorder = i + tempMirror; } + int pos = i++; + return std::pair(begin_ + pos - answer_[pos], begin_ + pos + answer_[pos] + 1); } - //------------------------------------------------------ - - std::vector> result; - for(size_t i = 0; i < length; ++i) + std::pair calcEven() { - result.push_back({begin + i - ansPalN2[i], begin + i + ansPalN2[i] + 1}); - } - for(size_t i = 0; i < length; ++i) - { - if(ansPal2[i] != 0) - result.push_back({begin + i - ansPal2[i], begin + i + ansPal2[i]}); + for (; i < length_; ++i) + { + tempMirror = + (i > rightBorder ? 
0 : std::min(answer_[leftBorder + rightBorder - i + 1], + rightBorder - i + 1)) + 1; + while (i + tempMirror - 1 < length_ && i - tempMirror >= 0 && + p_(begin_[i - tempMirror], begin_[i + tempMirror - 1])) + { + ++tempMirror; + } + answer_[i] = --tempMirror; + if (i + tempMirror - 1 > rightBorder) + { + leftBorder = i - tempMirror; + rightBorder = i + tempMirror - 1; + } + + if(answer_[i] != 0) + break; + } + int pos = i++; + return std::pair(begin_ + pos - answer_[pos], begin_ + pos + answer_[pos]); } - return result; -} +private: + Iter begin_, end_; + BinaryPredicate p_; + int length_, i = 0; + int leftBorder = 0, rightBorder = -1, tempMirror = 0, flag_ = 0; -template -auto manacker(Range range) -{ - return manacker(boost::begin(range), boost::end(range)); + std::vector answer_; }; + }} #endif // BOOST_ALGORITHM_MANACKER_HPP From 754f025b0be2200e75daff465e2001a15bb5bb2e Mon Sep 17 00:00:00 2001 From: Alexander Zaitsev Date: Sun, 27 Nov 2016 20:25:32 +0300 Subject: [PATCH 33/33] Fixed potential erros --- include/boost/algorithm/manacker.hpp | 29 +++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/include/boost/algorithm/manacker.hpp b/include/boost/algorithm/manacker.hpp index 6ce3d5c90..17ffe0496 100644 --- a/include/boost/algorithm/manacker.hpp +++ b/include/boost/algorithm/manacker.hpp @@ -46,25 +46,22 @@ class manacker_class { //if cannot find palindrome, returns {corp_end, corp_end} std::pair ans; - if(flag_ == 0) + switch (flag_) { - ans = calcOdd(); - } - else if(flag_ == 1) - { - ans = calcEven(); - } - else - { - return std::pair(end_, end_); + case 0: + ans = calcOdd(); break; + case 1: + ans = calcEven(); break; + default: + return std::pair(end_, end_); } + ++i; if(i == length_) { restoreToDefault(); } - return ans; } private: @@ -91,8 +88,7 @@ class manacker_class leftBorder = i - tempMirror; rightBorder = i + tempMirror; } - int pos = i++; - return std::pair(begin_ + pos - answer_[pos], begin_ + pos + 
answer_[pos] + 1); + return std::pair(begin_ + i - answer_[i], begin_ + i + answer_[i] + 1); } std::pair calcEven() @@ -117,8 +113,9 @@ class manacker_class if(answer_[i] != 0) break; } - int pos = i++; - return std::pair(begin_ + pos - answer_[pos], begin_ + pos + answer_[pos]); + if(i == length_) + return std::pair(end_, end_); + return std::pair(begin_ + i - answer_[i], begin_ + i + answer_[i]); } private: Iter begin_, end_; @@ -132,4 +129,4 @@ class manacker_class }} -#endif // BOOST_ALGORITHM_MANACKER_HPP +#endif // BOOST_ALGORITHM_ \ No newline at end of file