Skip to content

Commit

Permalink
Merge pull request #107 from VasLem/master
Browse files Browse the repository at this point in the history
Added ability to prematurely terminate search
  • Loading branch information
roy-ht authored Feb 10, 2024
2 parents 84396f3 + 759dec0 commit 8b61734
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 9 deletions.
2 changes: 1 addition & 1 deletion editdistance/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# cython: language_level=3
from editdistance.bycython cimport eval
from editdistance.bycython cimport eval, eval_criterion
7 changes: 5 additions & 2 deletions editdistance/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from .bycython import eval
from .bycython import eval, eval_criterion


def distance(*args, **kwargs):
    """Return the edit distance; an alias of ``eval``.

    All positional and keyword arguments are forwarded unchanged to ``eval``
    and its result is returned as-is.
    """
    return eval(*args, **kwargs)

def distance_le_than(*args, **kwargs):
    """Return whether the edit distance is within a threshold.

    An alias of ``eval_criterion``: all positional and keyword arguments are
    forwarded unchanged and its result is returned as-is.
    """
    return eval_criterion(*args, **kwargs)

__all__ = ('eval', 'distance')
__all__ = ('eval', 'distance', "eval_criterion", "distance_le_than")
49 changes: 48 additions & 1 deletion editdistance/_editdistance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz
for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1LL << i);
for(size_t i = 0; i < vecsize; ++i) {
TVALUE &PM = cmap[vec[i]];
for(int r = 0; r <= tmax; ++r) {
for(unsigned int r = 0; r <= tmax; ++r) {
uint64_t X = PM[r];
if(r > 0 && (HN[r - 1] & lmb)) X |= 1LL;
D0[r] = (((X & VP[r]) + VP[r]) ^ VP[r]) | X | VN[r];
Expand Down Expand Up @@ -76,6 +76,28 @@ unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t c
return d[size1&1][size2];
}

/**
 * Decide whether the edit (Levenshtein) distance between two sequences is at
 * most `thr`, using a two-row dynamic-programming table.
 *
 * The search stops early: the smallest value in a DP row never decreases from
 * one row to the next, so once a whole row exceeds `thr` the final distance
 * must exceed it as well.
 *
 * @param str1  first sequence (`size1` elements)
 * @param size1 length of `str1`
 * @param str2  second sequence (`size2` elements)
 * @param size2 length of `str2`
 * @param thr   inclusive distance threshold
 * @return true when the edit distance is <= `thr`, false otherwise
 */
template<typename T>
bool edit_distancec_dp(T const *str1, size_t const size1, T const *str2, size_t const size2, unsigned int const thr) {
    // Two rolling rows: row (i & 1) is the current one, ((i - 1) & 1) the previous one.
    std::vector< std::vector<uint32_t> > d(2, std::vector<uint32_t>(size2 + 1));
    // Row 0: turning the empty prefix of str1 into str2[0..j) costs j insertions.
    for (size_t j = 0; j < size2 + 1; j++) d[0][j] = j;
    for (size_t i = 1; i < size1 + 1; i++) {
        std::vector<uint32_t> &cur = d[i & 1];
        std::vector<uint32_t> const &prev = d[(i - 1) & 1];
        cur[0] = prev[0] + 1;
        // Column 0 belongs to the row too, so it takes part in the early-exit test.
        bool below_thr = cur[0] <= thr;
        for (size_t j = 1; j < size2 + 1; j++) {
            cur[j] = std::min(std::min(prev[j], cur[j - 1]) + 1,
                              prev[j - 1] + (str1[i - 1] == str2[j - 1] ? 0 : 1));
            // Inspect the row just written (the original indexed with i % 1, i.e. always row 0).
            if (cur[j] <= thr) {
                below_thr = true;
            }
        }
        if (!below_thr) {
            return false;
        }
    }
    return d[size1 & 1][size2] <= thr;
}

template <size_t N>
struct varr {
uint64_t arr_[N];
Expand Down Expand Up @@ -129,3 +151,28 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int
else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep);
return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる
}

/**
 * Report whether the edit distance between `a` and `b` is at most `thr`.
 *
 * Handles empty inputs directly, orders the two sequences by length and
 * delegates the comparison to the dynamic-programming routine
 * `edit_distancec_dp`.
 */
bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) {
    // With one side empty the distance is simply the other side's length.
    if (asize == 0) return bsize <= thr;
    if (bsize == 0) return asize <= thr;

    // Order the inputs by length; on a tie `a` counts as the longer one.
    const bool b_is_longer = asize < bsize;
    const int64_t *const longer = b_is_longer ? b : a;
    const int64_t *const shorter = b_is_longer ? a : b;
    const unsigned int longer_size = b_is_longer ? bsize : asize;
    const unsigned int shorter_size = b_is_longer ? asize : bsize;

    // Number of 64-bit words covering the longer input: 1 up to 64 elements, 2 up to 128, ...
    const size_t words = ((longer_size - 1) >> 6) + 1;

    // Past the bit-parallel limit (more than 10 words) the shorter input is passed first;
    // otherwise the longer one is. Either way the work is left to dynamic programming.
    if (words > 10) {
        return edit_distancec_dp<int64_t>(shorter, shorter_size, longer, longer_size, thr);
    }
    return edit_distancec_dp<int64_t>(longer, longer_size, shorter, shorter_size, thr);
}




1 change: 1 addition & 0 deletions editdistance/_editdistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ extern "C" {
#endif

unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize);
bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr);
unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2);

#ifdef __cplusplus
Expand Down
1 change: 1 addition & 0 deletions editdistance/bycython.pxd
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# cython: language_level=3
cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff
cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff
21 changes: 18 additions & 3 deletions editdistance/bycython.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
# distutils: sources = editdistance/_editdistance.cpp

from libc.stdlib cimport malloc, free
from libcpp cimport bool
# from libc.stdint cimport int64_t

cdef extern from "./_editdistance.h":
ctypedef int int64_t
unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize)
bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr)
unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2)


Expand All @@ -22,9 +24,22 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
free(al)
free(bl)
return dist

cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff:
    """Return whether the edit distance between ``a`` and ``b`` is at most ``thr``.

    Every element of ``a`` and ``b`` is reduced to its ``hash()``; the two hash
    arrays are then passed to the C++ ``edit_distance_criterion`` routine.

    Raises:
        MemoryError: if a scratch buffer cannot be allocated.

    Any exception raised by ``len``, indexing or ``hash`` propagates to the
    caller after both scratch buffers have been released.
    """
    cdef unsigned int i
    cdef int64_t *al = NULL
    cdef int64_t *bl = NULL
    try:
        al = <int64_t *>malloc(len(a) * sizeof(int64_t))
        # malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure.
        if al == NULL and len(a) > 0:
            raise MemoryError()
        for i in range(len(a)):
            al[i] = hash(a[i])
        bl = <int64_t *>malloc(len(b) * sizeof(int64_t))
        if bl == NULL and len(b) > 0:
            raise MemoryError()
        for i in range(len(b)):
            bl[i] = hash(b[i])
        return edit_distance_criterion(al, len(a), bl, len(b), thr)
    finally:
        # free(NULL) is a no-op, so this is safe on every exit path,
        # including an exception raised while hashing an element.
        free(al)
        free(bl)


cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff:
cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff:
cdef unsigned int i, dist
cdef int64_t *al = <int64_t *>malloc(len(a) * sizeof(int64_t))
for i in range(len(a)):
Expand All @@ -35,4 +50,4 @@ cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff:
dist = edit_distance_dp(al, len(a), bl, len(b))
free(al)
free(bl)
return dist
return dist
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

setup(
name="editdistance",
version="0.6.2",
version="0.7.0",
python_requires=">=3.6",
description="Fast implementation of the edit distance(Levenshtein distance)",
long_description=readme,
Expand Down
7 changes: 6 additions & 1 deletion test/test_editdistance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ def test_editdistance(self):
import editdistance
self.assertEqual(1, editdistance.eval('abc', 'aec'))

def test_editdistance_criterion(self):
    """Check eval_criterion on one pair above and one pair within a threshold of 1."""
    import editdistance
    within_threshold = editdistance.eval_criterion
    # 'abcb' vs 'aeca' is expected to exceed the threshold of 1.
    self.assertEqual(False, within_threshold('abcb', 'aeca', 1))
    # 'abc' vs 'aec' is expected to be within the threshold of 1.
    self.assertEqual(True, within_threshold('abc', 'aec', 1))

def test_dp_editdistance(self):
from editdistance.bycython import eval_dp
self.assertEqual(3, eval_dp('bbb', 'a'))
Expand All @@ -20,7 +25,7 @@ def test_dp_vs_default(self):
seq2 = random.choices([0, 1, 2], k=random.randint(10, 50))

self.assertEqual(editdistance.eval(seq1, seq2), eval_dp(seq1, seq2))


if __name__ == '__main__':
unittest.main()

0 comments on commit 8b61734

Please sign in to comment.