From e36676adae51c134c568aadecafa9755f801214a Mon Sep 17 00:00:00 2001
From: vlemonidis
Date: Mon, 22 Jan 2024 15:35:57 +0000
Subject: [PATCH] Added ability to prematurely terminate search if edit distance above a given threshold

Add eval_criterion(a, b, thr) and its alias distance_le_than(a, b, thr),
which report whether the edit distance between a and b is <= thr.  The
dynamic-programming loop returns as soon as a whole row exceeds thr.

The two-row DP reuses its rows, so column 0 is now refreshed on every
outer iteration (in both edit_distance_dp and edit_distancec_dp); without
this, e.g. ("bb", "a") was scored as distance 1 instead of 2.

---
 editdistance/__init__.pxd      |  2 +-
 editdistance/__init__.py       |  9 ++++++--
 editdistance/_editdistance.cpp | 55 ++++++++++++++++++++++++++++++++++++++---
 editdistance/_editdistance.h   |  1 +
 editdistance/bycython.pxd      |  1 +
 editdistance/bycython.pyx      | 23 +++++++++++++++++++
 setup.py                       |  2 +-
 test/test_editdistance.py      |  8 +++++++-
 8 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/editdistance/__init__.pxd b/editdistance/__init__.pxd
index 1fcf681..2f0683b 100644
--- a/editdistance/__init__.pxd
+++ b/editdistance/__init__.pxd
@@ -1,2 +1,2 @@
 # cython: language_level=3
-from editdistance.bycython cimport eval
+from editdistance.bycython cimport eval, eval_criterion
diff --git a/editdistance/__init__.py b/editdistance/__init__.py
index 7ddd27e..ce3a0f6 100644
--- a/editdistance/__init__.py
+++ b/editdistance/__init__.py
@@ -1,9 +1,14 @@
-from .bycython import eval
+from .bycython import eval, eval_criterion
 
 
 def distance(*args, **kwargs):
     """"An alias to eval"""
     return eval(*args, **kwargs)
 
 
+def distance_le_than(*args, **kwargs):
+    """An alias to eval_criterion"""
+    return eval_criterion(*args, **kwargs)
+
+
-__all__ = ('eval', 'distance')
+__all__ = ('eval', 'distance', 'eval_criterion', 'distance_le_than')
diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp
index 220ddb7..e9c2bf1 100644
--- a/editdistance/_editdistance.cpp
+++ b/editdistance/_editdistance.cpp
@@ -41,7 +41,7 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz
     for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1LL << i);
     for(size_t i = 0; i < vecsize; ++i) {
         TVALUE &PM = cmap[vec[i]];
-        for(int r = 0; r <= tmax; ++r) {
+        for(unsigned int r = 0; r <= tmax; ++r) {
             uint64_t X = PM[r];
             if(r > 0 && (HN[r - 1] & lmb)) X |= 1LL;
             D0[r] = (((X & VP[r]) + VP[r]) ^ VP[r]) | X | VN[r];
@@ -67,15 +67,41 @@ unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2,
     vector< vector<uint32_t> > d(2, vector<uint32_t>(size2 + 1));
     d[0][0] = 0;
     d[1][0] = 1;
-    for (int i = 0; i < size2 + 1; i++) d[0][i] = i;
-    for (int i = 1; i < size1 + 1; i++) {
-        for (int j = 1; j < size2 + 1; j++) {
+    for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i;
+    for (size_t i = 1; i < size1 + 1; i++) {
+        d[i&1][0] = i;  // the two rows are reused, so column 0 must be refreshed for every i
+        for (size_t j = 1; j < size2 + 1; j++) {
             d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1));
         }
     }
     return d[size1&1][size2];
 }
 
+template<typename T>
+bool edit_distancec_dp(T const *str1, size_t const size1, T const *str2, size_t const size2, unsigned int const thr) {
+    // Two-row Levenshtein DP that stops early: every cell of a row is >= the
+    // minimum of the previous row, so once a whole row exceeds thr the final
+    // distance cannot be <= thr any more.
+    vector< vector<uint32_t> > d(2, vector<uint32_t>(size2 + 1));
+    d[0][0] = 0;
+    d[1][0] = 1;
+    for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i;
+    for (size_t i = 1; i < size1 + 1; i++) {
+        d[i&1][0] = i;  // the two rows are reused, so column 0 must be refreshed for every i
+        bool below_thr = d[i&1][0] <= thr;
+        for (size_t j = 1; j < size2 + 1; j++) {
+            d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1));
+            if (d[i&1][j] <= thr) {
+                below_thr = true;
+            }
+        }
+        if (!below_thr) {
+            return false;
+        }
+    }
+    return d[size1&1][size2] <= thr;
+}
+
 template <size_t N>
 struct varr {
     uint64_t arr_[N];
@@ -129,3 +155,24 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int
     else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep);
     return edit_distance_dp<int64_t>(ap, *asizep, bp, *bsizep);  // dynamic programmingに任せる
 }
+
+bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) {
+    if(asize == 0) return bsize <= thr;
+    if(bsize == 0) return asize <= thr;
+    // Make a the longer of the two sequences
+    int64_t const *ap, *bp;
+    unsigned int const *asizep, *bsizep;
+    if(asize < bsize) ap = b, bp = a, asizep = &bsize, bsizep = &asize;
+    else ap = a, bp = b, asizep = &asize, bsizep = &bsize;
+    // Work out the required array size
+    size_t vsize = ((*asizep - 1) >> 6) + 1;  // 1 for up to 64 elements, 2 for up to 128, ...
+    // Beyond the limit the bit-parallel algorithm could handle, make a the shorter sequence instead
+    if(vsize > 10) {
+        int64_t const *_ = ap;
+        unsigned int const *__ = asizep;
+        ap = bp, bp = _, asizep = bsizep, bsizep = __;
+        vsize = ((*asizep - 1) >> 6) + 1;
+    }
+
+    return edit_distancec_dp<int64_t>(ap, *asizep, bp, *bsizep, thr);  // delegate to dynamic programming
+}
diff --git a/editdistance/_editdistance.h b/editdistance/_editdistance.h
index 2671d00..6cf03ff 100644
--- a/editdistance/_editdistance.h
+++ b/editdistance/_editdistance.h
@@ -8,6 +8,7 @@ extern "C" {
 #endif
 
 unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize);
+bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr);
 
 #ifdef __cplusplus
 }
diff --git a/editdistance/bycython.pxd b/editdistance/bycython.pxd
index 9d0758e..4734ce8 100644
--- a/editdistance/bycython.pxd
+++ b/editdistance/bycython.pxd
@@ -1,2 +1,3 @@
 # cython: language_level=3
 cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff
+cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff
diff --git a/editdistance/bycython.pyx b/editdistance/bycython.pyx
index d64a67a..d49ed91 100644
--- a/editdistance/bycython.pyx
+++ b/editdistance/bycython.pyx
@@ -2,11 +2,14 @@
 # distutils: sources = editdistance/_editdistance.cpp
 
 from libc.stdlib cimport malloc, free
+from libcpp cimport bool
 # from libc.stdint cimport int64_t
 
 cdef extern from "./_editdistance.h":
     ctypedef int int64_t
     unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize)
+    bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr)
+
 
 cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
     cdef unsigned int i, dist
@@ -20,3 +23,23 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
     free(al)
     free(bl)
     return dist
+
+cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff:
+    """Return True when the edit distance between a and b is at most thr."""
+    cdef unsigned int i
+    cdef bint ret
+    cdef int64_t *al = NULL
+    cdef int64_t *bl = NULL
+    try:
+        al = <int64_t *>malloc(len(a) * sizeof(int64_t))
+        bl = <int64_t *>malloc(len(b) * sizeof(int64_t))
+        for i in range(len(a)):
+            al[i] = hash(a[i])
+        for i in range(len(b)):
+            bl[i] = hash(b[i])
+        ret = edit_distance_criterion(al, len(a), bl, len(b), thr)
+    finally:
+        # free(NULL) is a no-op, so this is safe even if an allocation never ran
+        free(al)
+        free(bl)
+    return ret
diff --git a/setup.py b/setup.py
index 2e3471a..78d2444 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 
 setup(
     name="editdistance",
-    version="0.6.2",
+    version="0.7.0",
     python_requires=">=3.6",
     description="Fast implementation of the edit distance(Levenshtein distance)",
     long_description=readme,
diff --git a/test/test_editdistance.py b/test/test_editdistance.py
index 57ed1f3..0128feb 100644
--- a/test/test_editdistance.py
+++ b/test/test_editdistance.py
@@ -4,7 +4,13 @@ class TestEditDistance(unittest.TestCase):
     def test_editdistance(self):
         import editdistance
         self.assertEqual(1, editdistance.eval('abc', 'aec'))
-        
+
+    def test_editdistance_criterion(self):
+        import editdistance
+        self.assertFalse(editdistance.eval_criterion('abcb', 'aeca', 1))
+        self.assertTrue(editdistance.eval_criterion('abc', 'aec', 1))
+        self.assertTrue(editdistance.eval_criterion('abc', 'abc', 0))
+        self.assertFalse(editdistance.eval_criterion('bb', 'a', 1))
 
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file