// Patch overview. Everything from the first "diff --git" onward is the patch
// payload and is left byte-for-byte unchanged so the hunks keep matching their
// targets; only this leading description has been added.
//
// editdistance/_editdistance.cpp
//   * edit_distance_dp: the `template` line is removed and the signature takes
//     `int64_t const *` directly instead of `T const *`.
//   * edit_distance_dp and edit_distancec_dp: every outer-loop iteration now
//     starts with `d[i&1][0] = d[(i-1)&1][0] + 1`. The table `d` has only two
//     rows, reused through `i&1`, and column 0 was previously written once
//     (`d[0][0] = 0; d[1][0] = 1`), so from i = 2 onward the inner loop read a
//     stale column-0 value.
//   * edit_distance: the fallback `return edit_distance_dp(...)` line is
//     replaced; the removed and added lines read identically in this copy.
//
// editdistance/_editdistance.h
//   * Declares edit_distance_dp after edit_distance and edit_distance_criterion
//     (the hunk header shows an enclosing `extern "C" {`).
//
// editdistance/bycython.pyx
//   * Adds the matching extern declaration and `cpdef eval_dp(a, b)`, which
//     stores hash(a[i]) / hash(b[i]) into two malloc'd int64_t buffers, calls
//     edit_distance_dp on them, frees both buffers and returns the result.
//
// test/test_editdistance.py
//   * Imports `random`; adds test_dp_editdistance (eval_dp('bbb', 'a') and
//     eval_dp('a', 'bbb') both expected to be 3) and test_dp_vs_default (10
//     rounds comparing editdistance.eval with eval_dp on random sequences over
//     [0, 1, 2] with lengths drawn from 10..50).
//
// English translations of the Japanese comments carried in the C++ lines:
//   * "A fixed-length array would be faster than vector, but this is only
//     called as a fallback when the strings are long, so the size cannot be
//     hard-coded."
//   * "Leave it to dynamic programming."
//
// NOTE(review): line breaks in this copy appear collapsed and some `<...>`
//   spans look stripped (bare `template`, `vector< vector > d(2, vector(...))`,
//   malloc results assigned to `int64_t *` without a visible cast) - confirm
//   against the original commit before applying.
// NOTE(review): making edit_distance_dp non-template is presumably what allows
//   it to be declared inside the extern "C" header - confirm.
// NOTE(review): eval_dp returns `unsigned int` but declares
//   `except 0xffffffffffffffff`; that sentinel looks wider than the return
//   type - verify how Cython handles it.
// NOTE(review): eval_dp does not check the malloc results, and the buffers are
//   presumably leaked if hash() raises - confirm; consider try/finally.
// NOTE(review): with the new per-row initialisation, `d[1][0] = 1` looks
//   redundant - confirm before removing.
// NOTE(review): test_dp_vs_default draws unseeded random input, so a failing
//   case may not be reproducible.
diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp index e9c2bf1..db5e710 100644 --- a/editdistance/_editdistance.cpp +++ b/editdistance/_editdistance.cpp @@ -61,14 +61,14 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz /// c.f. http://handasse.blogspot.com/2009/04/c_29.html -template -unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) { +unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2) { // vectorより固定長配列の方が速いが、文字列が長い時の保険でのみ呼ばれるのでサイズを決め打ちできない vector< vector > d(2, vector(size2 + 1)); d[0][0] = 0; d[1][0] = 1; for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; for (size_t i = 1; i < size1 + 1; i++) { + d[i&1][0] = d[(i-1)&1][0] + 1; for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1)); } @@ -83,6 +83,7 @@ bool edit_distancec_dp(T const *str1, size_t const size1, T const *str2, size_t d[1][0] = 1; for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; for (size_t i = 1; i < size1 + 1; i++) { + d[i&1][0] = d[(i-1)&1][0] + 1; bool below_thr = false; for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 
0 : 1)); @@ -148,7 +149,7 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int else if(vsize == 8) return edit_distance_map_<8>(ap, *asizep, bp, *bsizep); else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep); else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep); - return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる + return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる } bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) { diff --git a/editdistance/_editdistance.h b/editdistance/_editdistance.h index 6cf03ff..d590b7e 100644 --- a/editdistance/_editdistance.h +++ b/editdistance/_editdistance.h @@ -9,6 +9,7 @@ extern "C" { unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize); bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr); +unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2); #ifdef __cplusplus } diff --git a/editdistance/bycython.pyx b/editdistance/bycython.pyx index d49ed91..9008a32 100644 --- a/editdistance/bycython.pyx +++ b/editdistance/bycython.pyx @@ -9,6 +9,7 @@ cdef extern from "./_editdistance.h": ctypedef int int64_t unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize) bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) + unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2) cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: @@ -23,7 +24,7 @@ cpdef unsigned int eval(object a, object b) except 
0xffffffffffffffff: free(al) free(bl) return dist - + cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff: cdef unsigned int i cdef bint ret @@ -37,3 +38,16 @@ cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xf free(al) free(bl) return ret + + cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff: + cdef unsigned int i, dist + cdef int64_t *al = malloc(len(a) * sizeof(int64_t)) + for i in range(len(a)): + al[i] = hash(a[i]) + cdef int64_t *bl = malloc(len(b) * sizeof(int64_t)) + for i in range(len(b)): + bl[i] = hash(b[i]) + dist = edit_distance_dp(al, len(a), bl, len(b)) + free(al) + free(bl) + return dist \ No newline at end of file diff --git a/test/test_editdistance.py b/test/test_editdistance.py index 0128feb..ec2244c 100644 --- a/test/test_editdistance.py +++ b/test/test_editdistance.py @@ -1,4 +1,6 @@ import unittest +import random + class TestEditDistance(unittest.TestCase): def test_editdistance(self): @@ -9,6 +11,21 @@ def test_editdistance_criterion(self): import editdistance self.assertEqual(False, editdistance.eval_criterion('abcb', 'aeca', 1)) self.assertEqual(True, editdistance.eval_criterion('abc', 'aec', 1)) + + def test_dp_editdistance(self): + from editdistance.bycython import eval_dp + self.assertEqual(3, eval_dp('bbb', 'a')) + self.assertEqual(3, eval_dp('a', 'bbb')) + + def test_dp_vs_default(self): + for _ in range(10): + import editdistance + from editdistance.bycython import eval_dp + seq1 = random.choices([0, 1, 2], k=random.randint(10, 50)) + seq2 = random.choices([0, 1, 2], k=random.randint(10, 50)) + + self.assertEqual(editdistance.eval(seq1, seq2), eval_dp(seq1, seq2)) + if __name__ == '__main__': unittest.main() \ No newline at end of file