Skip to content

Commit

Permalink
Merge pull request #103 from boeddeker/master
Browse files Browse the repository at this point in the history
Fix bug for long sequences (> 640)
  • Loading branch information
roy-ht authored Feb 10, 2024
2 parents 3f5a5b0 + 6f3fbe4 commit 84396f3
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 6 deletions.
12 changes: 6 additions & 6 deletions editdistance/_editdistance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,15 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz


/// c.f. http://handasse.blogspot.com/2009/04/c_29.html
template<typename T>
unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) {
unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2) {
// A fixed-size array would be faster than vector, but this function is only called as a fallback for long inputs, so the size cannot be hard-coded
vector< vector<uint32_t> > d(2, vector<uint32_t>(size2 + 1));
d[0][0] = 0;
d[1][0] = 1;
for (int i = 0; i < size2 + 1; i++) d[0][i] = i;
for (int i = 1; i < size1 + 1; i++) {
for (int j = 1; j < size2 + 1; j++) {
for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i;
for (size_t i = 1; i < size1 + 1; i++) {
d[i&1][0] = d[(i-1)&1][0] + 1;
for (size_t j = 1; j < size2 + 1; j++) {
d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1));
}
}
Expand Down Expand Up @@ -127,5 +127,5 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int
else if(vsize == 8) return edit_distance_map_<8>(ap, *asizep, bp, *bsizep);
else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep);
else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep);
return edit_distance_dp<int64_t>(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる
return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる
}
1 change: 1 addition & 0 deletions editdistance/_editdistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ extern "C" {
#endif

/* Edit distance between two sequences of 64-bit integers.
 * a/asize and b/bsize are presumably (pointer, element count) pairs -- confirm against the implementation. */
unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize);
/* Dynamic-programming variant of the edit distance; lengths are passed as size_t.
 * NOTE(review): exported separately, apparently so the DP path can be called directly -- confirm. */
unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2);

#ifdef __cplusplus
}
Expand Down
16 changes: 16 additions & 0 deletions editdistance/bycython.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ from libc.stdlib cimport malloc, free
# C-level declarations made visible to Cython from _editdistance.h.
cdef extern from "./_editdistance.h":
    # Tells Cython that int64_t is an integer type.
    # NOTE(review): presumably the actual width comes from the C header at compile time -- confirm.
    ctypedef int int64_t
    # Edit distance entry point; sizes are passed as unsigned int.
    unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize)
    # Dynamic-programming edit distance; sizes are passed as size_t.
    unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2)


cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
cdef unsigned int i, dist
Expand All @@ -20,3 +22,17 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
free(al)
free(bl)
return dist


cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff:
    """Return the edit distance between ``a`` and ``b`` via ``edit_distance_dp``.

    Each element of ``a`` and ``b`` is reduced to its ``hash()`` value and the
    two resulting integer arrays are handed to the C routine
    ``edit_distance_dp``.

    Args:
        a: Sequence supporting ``len()`` and integer indexing, with hashable
            elements.
        b: Sequence with the same requirements as ``a``.

    Returns:
        The value returned by ``edit_distance_dp`` for the hashed sequences.

    Raises:
        MemoryError: If a temporary buffer cannot be allocated.
        TypeError: Propagated from ``hash()`` when an element is unhashable.
    """
    cdef size_t i
    cdef size_t asize = len(a)
    cdef size_t bsize = len(b)
    cdef unsigned int dist
    cdef int64_t *al = NULL
    cdef int64_t *bl = NULL
    try:
        # Request at least one element: malloc(0) may legitimately return
        # NULL, which would otherwise be indistinguishable from a failure.
        al = <int64_t *>malloc((asize if asize > 0 else 1) * sizeof(int64_t))
        bl = <int64_t *>malloc((bsize if bsize > 0 else 1) * sizeof(int64_t))
        if al == NULL or bl == NULL:
            raise MemoryError()
        for i in range(asize):
            al[i] = hash(a[i])
        for i in range(bsize):
            bl[i] = hash(b[i])
        dist = edit_distance_dp(al, asize, bl, bsize)
    finally:
        # Runs on every exit path (including a hash() failure), so the
        # buffers are never leaked; free(NULL) is a no-op.
        free(al)
        free(bl)
    return dist
16 changes: 16 additions & 0 deletions test/test_editdistance.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,25 @@
import unittest
import random


class TestEditDistance(unittest.TestCase):
def test_editdistance(self):
import editdistance
self.assertEqual(1, editdistance.eval('abc', 'aec'))

def test_dp_editdistance(self):
from editdistance.bycython import eval_dp
self.assertEqual(3, eval_dp('bbb', 'a'))
self.assertEqual(3, eval_dp('a', 'bbb'))

def test_dp_vs_default(self):
for _ in range(10):
import editdistance
from editdistance.bycython import eval_dp
seq1 = random.choices([0, 1, 2], k=random.randint(10, 50))
seq2 = random.choices([0, 1, 2], k=random.randint(10, 50))

self.assertEqual(editdistance.eval(seq1, seq2), eval_dp(seq1, seq2))


if __name__ == '__main__':
Expand Down

0 comments on commit 84396f3

Please sign in to comment.