C++ Tokenizer implementation #26

Draft · wants to merge 5 commits into base: dev
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
from pybind11.setup_helpers import build_ext, intree_extensions

if __name__ == "__main__":
    ext_modules = intree_extensions(["stringcompare/distance/_distance.cpp"])
    ext_modules = intree_extensions(["stringcompare/distance/_distance.cpp", "stringcompare/preprocessing/_preprocessing.cpp"])

    setup(
        ext_modules=ext_modules,
2 changes: 2 additions & 0 deletions stringcompare/distance/__init__.py
@@ -8,6 +8,7 @@
    LCSDistance,
    CharacterDifference,
    Hamming,
    Jaccard,
)

__all__ = [
@@ -20,4 +21,5 @@
"LCSDistance",
"CharacterDifference",
"Hamming",
"Jaccard",
]
6 changes: 6 additions & 0 deletions stringcompare/distance/_distance.cpp
@@ -11,6 +11,7 @@
#include "lcs.hpp"
#include "characterdifference.hpp"
#include "hamming.hpp"
#include "jaccard.hpp"

PYBIND11_MODULE(_distance, m) {

@@ -196,4 +197,9 @@ Pairwise comparison between two lists.
        )
        .def(py::init<bool, bool>(), py::arg("normalize")=true, py::arg("similarity")=false)
        .def("compare", &Hamming::compare);

    py::class_<Jaccard, StringComparator>(m, "Jaccard")
        .def(py::init<Tokenizer, bool, bool>(), py::arg("tokenizer"), py::arg("normalize")=true, py::arg("similarity")=false)
        .def("compare", &Jaccard::compare);

}
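
One caveat on the Jaccard binding above, as an observation on the draft: py::init<Tokenizer, bool, bool> takes the tokenizer by value, so a DelimTokenizer or NGramTokenizer passed from Python is sliced down to the Tokenizer base class before Jaccard sees it. A minimal sketch of a holder-based alternative, assuming Jaccard stored a shared_ptr<Tokenizer> and the tokenizer classes were bound with a shared_ptr holder (neither of which is in this diff):

    // Hypothetical alternative binding; assumes:
    //   - Jaccard::tokenizer is declared as shared_ptr<Tokenizer>
    //   - Tokenizer is bound as py::class_<Tokenizer, shared_ptr<Tokenizer>>
    py::class_<Jaccard, StringComparator>(m, "Jaccard")
        .def(py::init<shared_ptr<Tokenizer>, bool, bool>(),
             py::arg("tokenizer"), py::arg("normalize")=true, py::arg("similarity")=false)
        .def("compare", &Jaccard::compare);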
30 changes: 30 additions & 0 deletions stringcompare/distance/jaccard.hpp
@@ -0,0 +1,30 @@
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

#include "comparator.hpp"
#include "../preprocessing/tokenizer.hpp"

namespace py = pybind11;
using namespace std;

class Jaccard: public StringComparator {
public:

    // Note: stored by value, so a derived tokenizer (DelimTokenizer,
    // NGramTokenizer, ...) passed in is sliced down to the Tokenizer base.
    Tokenizer tokenizer;
    bool normalize;
    bool similarity;

    Jaccard(Tokenizer tokenizer, bool normalize=true, bool similarity=false) {
        this->tokenizer = tokenizer;
        this->normalize = normalize;
        this->similarity = similarity;
    }

    // Placeholder in this draft: always returns 0 until the Jaccard
    // computation is implemented.
    double compare(const string &s, const string &t) {
        return 0;
    }

};
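
Since compare() is still a stub, here is a minimal sketch of what a set-based Jaccard computation could look like. jaccard_compare is a hypothetical free function for illustration only, and how the normalize flag should interact with the score is left open in this draft:

    #include <algorithm>
    #include <iterator>
    #include <set>
    #include <string>
    #include <vector>
    using namespace std;

    // Hypothetical sketch: Jaccard similarity/distance over two token lists.
    double jaccard_compare(const vector<string> &s_tokens,
                           const vector<string> &t_tokens,
                           bool similarity) {
        set<string> a(s_tokens.begin(), s_tokens.end());
        set<string> b(t_tokens.begin(), t_tokens.end());

        // |A ∩ B| via set_intersection over the two sorted sets.
        vector<string> common;
        set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                         back_inserter(common));

        size_t union_size = a.size() + b.size() - common.size();
        if (union_size == 0) {
            return similarity ? 1.0 : 0.0;  // both token sets empty
        }
        double sim = (double)common.size() / (double)union_size;
        return similarity ? sim : 1.0 - sim;
    }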
2 changes: 1 addition & 1 deletion stringcompare/preprocessing/__init__.py
@@ -1,4 +1,4 @@
from .tokenizer import Tokenizer, DelimTokenizer, WhitespaceTokenizer, NGramTokenizer
from stringcompare.preprocessing._preprocessing import Tokenizer, DelimTokenizer, WhitespaceTokenizer, NGramTokenizer
from .tagger import Tagger, DeepparseAddressTagger

__all__ = [
25 changes: 25 additions & 0 deletions stringcompare/preprocessing/_preprocessing.cpp
@@ -0,0 +1,25 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "tokenizer.hpp"

PYBIND11_MODULE(_preprocessing, m) {

    m.attr("__name__") = "stringcompare.preprocessing._preprocessing";

    py::class_<Tokenizer>(m, "Tokenizer")
        .def("__call__", &Tokenizer::operator())
        .def("batch_tokenize", &Tokenizer::batch_tokenize);

    py::class_<DelimTokenizer, Tokenizer>(m, "DelimTokenizer")
        .def(py::init<string>(), py::arg("delim"))
        .def("tokenize", &DelimTokenizer::tokenize);

    py::class_<WhitespaceTokenizer, DelimTokenizer>(m, "WhitespaceTokenizer")
        .def(py::init<>());

    py::class_<NGramTokenizer, Tokenizer>(m, "NGramTokenizer")
        .def(py::init<int>(), py::arg("n"))
        .def("tokenize", &NGramTokenizer::tokenize);

}
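
A side note on the Tokenizer binding: because Tokenizer::tokenize is virtual (see tokenizer.hpp below), Python code could in principle subclass Tokenizer too, but that would require a pybind11 trampoline class. A sketch of the standard pattern, hypothetical and not part of this PR:

    // Hypothetical trampoline so Python subclasses can override tokenize().
    class PyTokenizer : public Tokenizer {
    public:
        using Tokenizer::Tokenizer;
        vector<string> tokenize(const string &sentence) override {
            PYBIND11_OVERRIDE(vector<string>, Tokenizer, tokenize, sentence);
        }
    };

    // The binding would then name the trampoline as the helper type:
    // py::class_<Tokenizer, PyTokenizer>(m, "Tokenizer") ...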
99 changes: 99 additions & 0 deletions stringcompare/preprocessing/tokenizer.hpp
@@ -0,0 +1,99 @@
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>
#include <sstream>

namespace py = pybind11;
using namespace std;

class Tokenizer {
public:

    virtual ~Tokenizer() = default;

    // Virtual so that operator() and batch_tokenize dispatch to the
    // derived class's implementation; the base produces no tokens.
    virtual vector<string> tokenize(const string &sentence) {
        vector<string> result;
        return result;
    }

    vector<string> operator()(const string &sentence) {
        return this->tokenize(sentence);
    }

    vector<vector<string>> batch_tokenize(const vector<string> &sentences) {
        vector<vector<string>> result(sentences.size());
        for (size_t i = 0; i < sentences.size(); i++) {
            result[i] = this->tokenize(sentences[i]);
        }

        return result;
    }

};

class DelimTokenizer: public Tokenizer {
public:

    string delim;

    DelimTokenizer(const string &delim) {
        this->delim = delim;
        if (this->delim.size() == 0) {
            throw pybind11::value_error("Empty delimiter");
        }
    }

    // Splits on each occurrence of delim; adjacent delimiters yield no
    // empty tokens, and a trailing delimiter adds no trailing empty token.
    vector<string> tokenize(const string &sentence) override {
        vector<string> result;

        if (sentence.size() == 0) {
            return result;
        }

        size_t k = this->delim.size();
        size_t pos = 0;
        size_t match = 0;

        while ((match = sentence.find(this->delim, pos)) != string::npos) {
            if (match != pos) {
                result.push_back(sentence.substr(pos, match - pos));
            }
            pos = match + k;
        }
        if (pos < sentence.size()) {
            result.push_back(sentence.substr(pos));
        }

        return result;
    }
};

class WhitespaceTokenizer: public DelimTokenizer {
public:
    WhitespaceTokenizer(): DelimTokenizer(" ") {}
};

class NGramTokenizer: public Tokenizer {
public:

    int n;

    NGramTokenizer(int n) {
        this->n = n;
    }

    // Emits all sentence.size() - n + 1 contiguous substrings of length n.
    // Returning early when the sentence is shorter than n also prevents
    // size_t underflow in the loop bound.
    vector<string> tokenize(const string &sentence) override {
        vector<string> result;

        if (this->n <= 0 || sentence.size() < (size_t)this->n) {
            return result;
        }

        for (size_t i = 0; i + this->n <= sentence.size(); i++) {
            result.push_back(sentence.substr(i, this->n));
        }

        return result;
    }
};
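
To make the expected behavior concrete, a small illustration of the tokenizers above (a sketch; it assumes pybind11 and the Python headers are on the include path, since tokenizer.hpp pulls them in):

    #include <cassert>
    #include "tokenizer.hpp"

    int main() {
        WhitespaceTokenizer ws;
        vector<string> words = ws.tokenize("a  b c");
        assert(words.size() == 3);   // {"a", "b", "c"}: repeated spaces yield no empty token

        NGramTokenizer bigrams(2);
        vector<string> grams = bigrams.tokenize("abcd");
        assert(grams.size() == 3);   // {"ab", "bc", "cd"}, including the final bigram

        vector<string> none = bigrams.tokenize("a");
        assert(none.empty());        // shorter than n: empty result, not an out-of-range throw
        return 0;
    }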
2 changes: 1 addition & 1 deletion stringcompare/preprocessing/tokenizer.py
@@ -15,7 +15,7 @@ def batch_tokenize(self, sentences):


class DelimTokenizer(Tokenizer):
    def __init__(self, delim = " "):
    def __init__(self, delim):
        self.delim = delim

    def tokenize(self, sentence):
Empty file added tests/distance/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added tests/preprocessing/__init__.py
Empty file.