C++ Tokenizer implementation #26

Draft · wants to merge 5 commits into base: dev
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
from pybind11.setup_helpers import build_ext, intree_extensions

if __name__ == "__main__":
    ext_modules = intree_extensions(["stringcompare/distance/_distance.cpp"])
    ext_modules = intree_extensions(["stringcompare/distance/_distance.cpp", "stringcompare/preprocessing/_preprocessing.cpp"])

    setup(
        ext_modules=ext_modules,
2 changes: 2 additions & 0 deletions stringcompare/distance/__init__.py
@@ -8,6 +8,7 @@
    LCSDistance,
    CharacterDifference,
    Hamming,
    Jaccard,
)

__all__ = [
@@ -20,4 +21,5 @@
"LCSDistance",
"CharacterDifference",
"Hamming",
"Jaccard",
]
6 changes: 6 additions & 0 deletions stringcompare/distance/_distance.cpp
@@ -11,6 +11,7 @@
#include "lcs.hpp"
#include "characterdifference.hpp"
#include "hamming.hpp"
#include "jaccard.hpp"

PYBIND11_MODULE(_distance, m) {

@@ -196,4 +197,9 @@ Pairwise comparison between two lists.
        )
        .def(py::init<bool, bool>(), py::arg("normalize")=true, py::arg("similarity")=false)
        .def("compare", &Hamming::compare);

    py::class_<Jaccard, StringComparator>(m, "Jaccard")
        .def(py::init<Tokenizer, bool, bool>(), py::arg("tokenizer"), py::arg("normalize")=true, py::arg("similarity")=false)
        .def("compare", &Jaccard::compare);

}
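
One caveat on the Jaccard binding above, as an observation on the draft: py::init<Tokenizer, bool, bool> takes the tokenizer by value, so a DelimTokenizer or NGramTokenizer passed from Python is sliced down to the Tokenizer base class before Jaccard sees it. A minimal sketch of a holder-based alternative, assuming Jaccard stored a shared_ptr<Tokenizer> and the tokenizer classes were bound with a shared_ptr holder (neither of which is in this diff):

    // Hypothetical alternative binding; assumes:
    //   - Jaccard::tokenizer is declared as shared_ptr<Tokenizer>
    //   - Tokenizer is bound as py::class_<Tokenizer, shared_ptr<Tokenizer>>
    py::class_<Jaccard, StringComparator>(m, "Jaccard")
        .def(py::init<shared_ptr<Tokenizer>, bool, bool>(),
             py::arg("tokenizer"), py::arg("normalize")=true, py::arg("similarity")=false)
        .def("compare", &Jaccard::compare);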
30 changes: 30 additions & 0 deletions stringcompare/distance/jaccard.hpp
@@ -0,0 +1,30 @@
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

#include "comparator.hpp"
#include "../preprocessing/tokenizer.hpp"

namespace py = pybind11;
using namespace std;

class Jaccard: public StringComparator {
public:

    // Note: stored by value, so a derived tokenizer (DelimTokenizer,
    // NGramTokenizer, ...) passed in is sliced down to the Tokenizer base.
    Tokenizer tokenizer;
    bool normalize;
    bool similarity;

    Jaccard(Tokenizer tokenizer, bool normalize=true, bool similarity=false) {
        this->tokenizer = tokenizer;
        this->normalize = normalize;
        this->similarity = similarity;
    }

    // Placeholder in this draft: always returns 0 until the Jaccard
    // computation is implemented.
    double compare(const string &s, const string &t) {
        return 0;
    }

};
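
Since compare() is still a stub, here is a minimal sketch of what a set-based Jaccard computation could look like. jaccard_compare is a hypothetical free function for illustration only, and how the normalize flag should interact with the score is left open in this draft:

    #include <algorithm>
    #include <iterator>
    #include <set>
    #include <string>
    #include <vector>
    using namespace std;

    // Hypothetical sketch: Jaccard similarity/distance over two token lists.
    double jaccard_compare(const vector<string> &s_tokens,
                           const vector<string> &t_tokens,
                           bool similarity) {
        set<string> a(s_tokens.begin(), s_tokens.end());
        set<string> b(t_tokens.begin(), t_tokens.end());

        // |A ∩ B| via set_intersection over the two sorted sets.
        vector<string> common;
        set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                         back_inserter(common));

        size_t union_size = a.size() + b.size() - common.size();
        if (union_size == 0) {
            return similarity ? 1.0 : 0.0;  // both token sets empty
        }
        double sim = (double)common.size() / (double)union_size;
        return similarity ? sim : 1.0 - sim;
    }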
2 changes: 1 addition & 1 deletion stringcompare/preprocessing/__init__.py
@@ -1,4 +1,4 @@
from .tokenizer import Tokenizer, DelimTokenizer, WhitespaceTokenizer, NGramTokenizer
from stringcompare.preprocessing._preprocessing import Tokenizer, DelimTokenizer, WhitespaceTokenizer, NGramTokenizer
from .tagger import Tagger, DeepparseAddressTagger

__all__ = [
25 changes: 25 additions & 0 deletions stringcompare/preprocessing/_preprocessing.cpp
@@ -0,0 +1,25 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "tokenizer.hpp"

PYBIND11_MODULE(_preprocessing, m) {

    m.attr("__name__") = "stringcompare.preprocessing._preprocessing";

    py::class_<Tokenizer>(m, "Tokenizer")
        .def("__call__", &Tokenizer::operator())
        .def("batch_tokenize", &Tokenizer::batch_tokenize);

    py::class_<DelimTokenizer, Tokenizer>(m, "DelimTokenizer")
        .def(py::init<string>(), py::arg("delim"))
        .def("tokenize", &DelimTokenizer::tokenize);

    py::class_<WhitespaceTokenizer, DelimTokenizer>(m, "WhitespaceTokenizer")
        .def(py::init<>());

    py::class_<NGramTokenizer, Tokenizer>(m, "NGramTokenizer")
        .def(py::init<int>(), py::arg("n"))
        .def("tokenize", &NGramTokenizer::tokenize);

}
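
A side note on the Tokenizer binding: because Tokenizer::tokenize is virtual (see tokenizer.hpp below), Python code could in principle subclass Tokenizer too, but that would require a pybind11 trampoline class. A sketch of the standard pattern, hypothetical and not part of this PR:

    // Hypothetical trampoline so Python subclasses can override tokenize().
    class PyTokenizer : public Tokenizer {
    public:
        using Tokenizer::Tokenizer;
        vector<string> tokenize(const string &sentence) override {
            PYBIND11_OVERRIDE(vector<string>, Tokenizer, tokenize, sentence);
        }
    };

    // The binding would then name the trampoline as the helper type:
    // py::class_<Tokenizer, PyTokenizer>(m, "Tokenizer") ...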
99 changes: 99 additions & 0 deletions stringcompare/preprocessing/tokenizer.hpp
@@ -0,0 +1,99 @@
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>
#include <sstream>

namespace py = pybind11;
using namespace std;

class Tokenizer {
public:

    virtual ~Tokenizer() = default;

    // Virtual so that operator() and batch_tokenize dispatch to the
    // derived class's implementation; the base produces no tokens.
    virtual vector<string> tokenize(const string &sentence) {
        vector<string> result;
        return result;
    }

    vector<string> operator()(const string &sentence) {
        return this->tokenize(sentence);
    }

    vector<vector<string>> batch_tokenize(const vector<string> &sentences) {
        vector<vector<string>> result(sentences.size());
        for (size_t i = 0; i < sentences.size(); i++) {
            result[i] = this->tokenize(sentences[i]);
        }

        return result;
    }

};

class DelimTokenizer: public Tokenizer {
public:

    string delim;

    DelimTokenizer(const string &delim) {
        this->delim = delim;
        if (this->delim.size() == 0) {
            throw pybind11::value_error("Empty delimiter");
        }
    }

    // Splits on each occurrence of delim; adjacent delimiters yield no
    // empty tokens, and a trailing delimiter adds no trailing empty token.
    vector<string> tokenize(const string &sentence) override {
        vector<string> result;

        if (sentence.size() == 0) {
            return result;
        }

        size_t k = this->delim.size();
        size_t pos = 0;
        size_t match = 0;

        while ((match = sentence.find(this->delim, pos)) != string::npos) {
            if (match != pos) {
                result.push_back(sentence.substr(pos, match - pos));
            }
            pos = match + k;
        }
        if (pos < sentence.size()) {
            result.push_back(sentence.substr(pos));
        }

        return result;
    }
};

class WhitespaceTokenizer: public DelimTokenizer {
public:
    WhitespaceTokenizer(): DelimTokenizer(" ") {}
};

class NGramTokenizer: public Tokenizer {
public:

    int n;

    NGramTokenizer(int n) {
        this->n = n;
    }

    // Emits all sentence.size() - n + 1 contiguous substrings of length n.
    // Returning early when the sentence is shorter than n also prevents
    // size_t underflow in the loop bound.
    vector<string> tokenize(const string &sentence) override {
        vector<string> result;

        if (this->n <= 0 || sentence.size() < (size_t)this->n) {
            return result;
        }

        for (size_t i = 0; i + this->n <= sentence.size(); i++) {
            result.push_back(sentence.substr(i, this->n));
        }

        return result;
    }
};
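
To make the expected behavior concrete, a small illustration of the tokenizers above (a sketch; it assumes pybind11 and the Python headers are on the include path, since tokenizer.hpp pulls them in):

    #include <cassert>
    #include "tokenizer.hpp"

    int main() {
        WhitespaceTokenizer ws;
        vector<string> words = ws.tokenize("a  b c");
        assert(words.size() == 3);   // {"a", "b", "c"}: repeated spaces yield no empty token

        NGramTokenizer bigrams(2);
        vector<string> grams = bigrams.tokenize("abcd");
        assert(grams.size() == 3);   // {"ab", "bc", "cd"}, including the final bigram

        vector<string> none = bigrams.tokenize("a");
        assert(none.empty());        // shorter than n: empty result, not an out-of-range throw
        return 0;
    }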
2 changes: 1 addition & 1 deletion stringcompare/preprocessing/tokenizer.py
@@ -15,7 +15,7 @@ def batch_tokenize(self, sentences):


class DelimTokenizer(Tokenizer):
    def __init__(self, delim = " "):
    def __init__(self, delim):
        self.delim = delim

    def tokenize(self, sentence):
Empty file added tests/distance/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added tests/preprocessing/__init__.py
Empty file.