From 627292744727beebe563cbe8fad5066c016136de Mon Sep 17 00:00:00 2001 From: LEFTazs Date: Mon, 3 Feb 2020 17:43:01 +0100 Subject: [PATCH 1/6] Add unicode character support for levenshtein and osa --- test/test.py | 44 ++++++++++++++++++++++++ weighted_levenshtein/clev.pxd | 10 +++--- weighted_levenshtein/clev.pyx | 65 +++++++++++++++++++++-------------- 3 files changed, 88 insertions(+), 31 deletions(-) diff --git a/test/test.py b/test/test.py index cc80238..d806776 100644 --- a/test/test.py +++ b/test/test.py @@ -183,3 +183,47 @@ def test_dl(self): self.assertEqual(dam_lev('bca', 'ab'), 2) self.assertEqual(dam_lev('ab', 'bdca'), 3) self.assertEqual(dam_lev('bdca', 'ab'), 3) + + +class TestClevWithUnicode(unittest.TestCase): + + def setUp(self): + self.iw = np.ones(10001, dtype=np.float64) + self.dw = np.ones(10001, dtype=np.float64) + self.sw = np.ones((10001, 10001), dtype=np.float64) + self.tw = np.ones((10001, 10001), dtype=np.float64) + self.iw[ord("á")] = 2.0 + self.dw[ord("á")] = 2.0 + self.iw[ord("ő")] = 9.0 + self.dw[ord("ő")] = 9.0 + self.iw[ord("Ұ")] = 10.0 + self.dw[ord("Ұ")] = 10.0 + + + def _lev(self, x, y): + return lev(x, y, self.iw, self.dw, self.sw) + + def _osa(self, x, y): + return osa(x, y, self.iw, self.dw, self.sw, self.tw) + + def test_lev(self): + try: + self.assertEqual(self._lev('átívelődök', 'átívelődök'), 0.0) + self.assertEqual(self._lev('', 'átívelődök'), 19.0) + self.assertEqual(self._lev('átívelődök', ''), 19.0) + self.assertEqual(self._lev('', ''), 0.0) + self.assertEqual(self._lev('átívelődök', 'átívelőd'), 2.0) + self.assertEqual(self._lev('', 'ҰǴʚΏ¤☣✐'), 16.0) + except UnicodeEncodeError: + self.fail("Could not handle special characters") + + def test_osa(self): + try: + self.assertEqual(self._osa('átívelődök', 'átívelődök'), 0.0) + self.assertEqual(self._osa('', 'átívelődök'), 19.0) + self.assertEqual(self._osa('átívelődök', ''), 19.0) + self.assertEqual(self._osa('', ''), 0.0) + self.assertEqual(self._osa('átívelődök', 
'átívelőd'), 2.0) + self.assertEqual(self._osa('', 'ҰǴʚΏ¤☣✐'), 16.0) + except UnicodeEncodeError: + self.fail("Could not handle special characters") diff --git a/weighted_levenshtein/clev.pxd b/weighted_levenshtein/clev.pxd index 9ebf499..234aae3 100644 --- a/weighted_levenshtein/clev.pxd +++ b/weighted_levenshtein/clev.pxd @@ -3,7 +3,7 @@ from libc.float cimport DBL_MAX as DTYPE_MAX ctypedef double DTYPE_t cdef enum: - ALPHABET_SIZE = 128 + ALPHABET_SIZE = 512 cdef DTYPE_t c_damerau_levenshtein( @@ -18,9 +18,9 @@ cdef DTYPE_t c_damerau_levenshtein( cdef DTYPE_t c_optimal_string_alignment( - unsigned char* word_m, + int[:] word_m, Py_ssize_t m, - unsigned char* word_n, + int[:] word_n, Py_ssize_t n, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, @@ -29,9 +29,9 @@ cdef DTYPE_t c_optimal_string_alignment( cdef DTYPE_t c_levenshtein( - unsigned char* word_m, + int[:] word_m, Py_ssize_t m, - unsigned char* word_n, + int[:] word_n, Py_ssize_t n, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, diff --git a/weighted_levenshtein/clev.pyx b/weighted_levenshtein/clev.pyx index 9df89cd..c210ff2 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -1,10 +1,10 @@ #!python -# cython: language_level=3, boundscheck=False, wraparound=False, embedsignature=True, linetrace=True, c_string_type=str, c_string_encoding=ascii -# distutils: define_macros=CYTHON_TRACE_NOGIL=1 +# cython: language_level=3, boundscheck=False, wraparound=False, embedsignature=True, linetrace=True, c_string_type=unicode, c_string_encoding=utf8 from libc.stdlib cimport malloc, free from cython.view cimport array as cvarray from .clev cimport DTYPE_t, DTYPE_MAX, ALPHABET_SIZE +import numpy as np cyarr = cvarray(shape=(ALPHABET_SIZE,), itemsize=sizeof(double), format="d") @@ -132,6 +132,12 @@ cdef inline DTYPE_t row_insert_range_cost( # End Array2D +def convert_string_to_int_array(char* str): + s = str.encode('utf-8').decode('utf-8') + intarr = np.empty([len(s), ], 
dtype=int) + for i, c in enumerate(s): + intarr[i] = ord(c) + return intarr cdef inline unsigned char str_1_get(unsigned char* s, Py_ssize_t i) nogil: """ @@ -140,6 +146,13 @@ cdef inline unsigned char str_1_get(unsigned char* s, Py_ssize_t i) nogil: """ return s[i - 1] +cdef inline int int_array_1_get(int[:] s, Py_ssize_t i) nogil: + """ + Takes an index of a 1-indexed int array + and returns that number + """ + return s[i - 1] + # End helper functions @@ -307,12 +320,12 @@ def optimal_string_alignment( if transpose_costs is None: transpose_costs = unit_matrix - s1 = str(str1).encode() - s2 = str(str2).encode() + intarr1 = convert_string_to_int_array(str1) + intarr2 = convert_string_to_int_array(str2) return c_optimal_string_alignment( - s1, len(s1), - s2, len(s2), + intarr1, intarr1.size, + intarr2, intarr2.size, insert_costs, delete_costs, substitute_costs, @@ -323,8 +336,8 @@ osa = optimal_string_alignment cdef DTYPE_t c_optimal_string_alignment( - unsigned char* str1, Py_ssize_t len1, - unsigned char* str2, Py_ssize_t len2, + int[:] str1, Py_ssize_t len1, + int[:] str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -334,7 +347,7 @@ cdef DTYPE_t c_optimal_string_alignment( """ cdef: Py_ssize_t i, j - unsigned char char_i, char_j, prev_char_i, prev_char_j + unsigned int char_i, char_j, prev_char_i, prev_char_j DTYPE_t ret_val Array2D d @@ -343,17 +356,17 @@ cdef DTYPE_t c_optimal_string_alignment( # fill row 0 and column 0 with insertion and deletion costs Array2D_0_at(d, 0, 0)[0] = 0 for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] # fill DP array for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + 
char_i = int_array_1_get(str1, i) for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) if char_i == char_j: # match Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) else: @@ -364,8 +377,8 @@ cdef DTYPE_t c_optimal_string_alignment( ) if i > 1 and j > 1: - prev_char_i = str_1_get(str1, i - 1) - prev_char_j = str_1_get(str2, j - 1) + prev_char_i = int_array_1_get(str1, i - 1) + prev_char_j = int_array_1_get(str2, j - 1) if char_i == prev_char_j and prev_char_i == char_j: # transpose Array2D_0_at(d, i, j)[0] = min( Array2D_0_get(d, i, j), @@ -408,12 +421,12 @@ def levenshtein( if substitute_costs is None: substitute_costs = unit_matrix - s1 = str(str1).encode() - s2 = str(str2).encode() + intarr1 = convert_string_to_int_array(str1) + intarr2 = convert_string_to_int_array(str2) return c_levenshtein( - s1, len(s1), - s2, len(s2), + intarr1, intarr1.size, + intarr2, intarr2.size, insert_costs, delete_costs, substitute_costs @@ -423,8 +436,8 @@ lev = levenshtein cdef DTYPE_t c_levenshtein( - unsigned char* str1, Py_ssize_t len1, - unsigned char* str2, Py_ssize_t len2, + int[:] str1, Py_ssize_t len1, + int[:] str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil: @@ -433,7 +446,7 @@ cdef DTYPE_t c_levenshtein( """ cdef: Py_ssize_t i, j - unsigned char char_i, char_j + unsigned int char_i, char_j DTYPE_t ret_val Array2D d @@ -441,16 +454,16 @@ cdef DTYPE_t c_levenshtein( Array2D_0_at(d, 0, 0)[0] = 0 for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) for j in range(1, len2 + 1): - 
char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) if char_i == char_j: # match Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) else: From 9d64fb89cf38bb0cc079e82c156e4fd45725a5a1 Mon Sep 17 00:00:00 2001 From: LEFTazs Date: Mon, 3 Feb 2020 22:35:43 +0100 Subject: [PATCH 2/6] Add Windows and unicode support for damerau --- test/test.py | 14 ++++++++++++++ weighted_levenshtein/clev.pxd | 4 ++-- weighted_levenshtein/clev.pyx | 29 ++++++++++++++++------------- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/test/test.py b/test/test.py index d806776..084606a 100644 --- a/test/test.py +++ b/test/test.py @@ -206,6 +206,9 @@ def _lev(self, x, y): def _osa(self, x, y): return osa(x, y, self.iw, self.dw, self.sw, self.tw) + def _dl(self, x, y): + return dam_lev(x, y, self.iw, self.dw, self.sw, self.tw) + def test_lev(self): try: self.assertEqual(self._lev('átívelődök', 'átívelődök'), 0.0) @@ -227,3 +230,14 @@ def test_osa(self): self.assertEqual(self._osa('', 'ҰǴʚΏ¤☣✐'), 16.0) except UnicodeEncodeError: self.fail("Could not handle special characters") + + def test_dl(self): + try: + self.assertEqual(self._dl('átívelődök', 'átívelődök'), 0.0) + self.assertEqual(self._dl('', 'átívelődök'), 19.0) + self.assertEqual(self._dl('átívelődök', ''), 19.0) + self.assertEqual(self._dl('', ''), 0.0) + self.assertEqual(self._dl('átívelődök', 'átívelőd'), 2.0) + self.assertEqual(self._dl('', 'ҰǴʚΏ¤☣✐'), 16.0) + except UnicodeEncodeError: + self.fail("Could not handle special characters") diff --git a/weighted_levenshtein/clev.pxd b/weighted_levenshtein/clev.pxd index 234aae3..f2021af 100644 --- a/weighted_levenshtein/clev.pxd +++ b/weighted_levenshtein/clev.pxd @@ -7,9 +7,9 @@ cdef enum: cdef DTYPE_t c_damerau_levenshtein( - unsigned char* str_a, + int[:] str_a, Py_ssize_t len_a, - unsigned char* str_b, + int[:] str_b, Py_ssize_t len_b, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, diff --git a/weighted_levenshtein/clev.pyx 
b/weighted_levenshtein/clev.pyx index c210ff2..7b5fa54 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -144,6 +144,8 @@ cdef inline unsigned char str_1_get(unsigned char* s, Py_ssize_t i) nogil: Takes an index of a 1-indexed string and returns that character """ + if i - 1 < 0: + return 0 return s[i - 1] cdef inline int int_array_1_get(int[:] s, Py_ssize_t i) nogil: @@ -151,6 +153,8 @@ cdef inline int int_array_1_get(int[:] s, Py_ssize_t i) nogil: Takes an index of a 1-indexed int array and returns that number """ + if i - 1 < 0: + return 0 return s[i - 1] # End helper functions @@ -192,12 +196,12 @@ def damerau_levenshtein( if transpose_costs is None: transpose_costs = unit_matrix - s1 = str(str1).encode() - s2 = str(str2).encode() + intarr1 = convert_string_to_int_array(str1) + intarr2 = convert_string_to_int_array(str2) return c_damerau_levenshtein( - s1, len(s1), - s2, len(s2), + intarr1, intarr1.size, + intarr2, intarr2.size, insert_costs, delete_costs, substitute_costs, @@ -208,8 +212,8 @@ dam_lev = damerau_levenshtein cdef DTYPE_t c_damerau_levenshtein( - unsigned char* str1, Py_ssize_t len1, - unsigned char* str2, Py_ssize_t len2, + int[:] str1, Py_ssize_t len1, + int[:] str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -221,7 +225,7 @@ cdef DTYPE_t c_damerau_levenshtein( Py_ssize_t[ALPHABET_SIZE] da Py_ssize_t i, j - unsigned char char_i, char_j + unsigned int char_i, char_j DTYPE_t cost, ret_val Py_ssize_t db, k, l @@ -243,21 +247,20 @@ cdef DTYPE_t c_damerau_levenshtein( # fill row 0 and column 0 with insertion and deletion costs Array2D_n1_at(d, 0, 0)[0] = 0 for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) + char_i = int_array_1_get(str1, i) cost = delete_costs[char_i] Array2D_n1_at(d, i, 0)[0] = Array2D_n1_get(d, i - 1, 0) + cost for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) cost = 
insert_costs[char_j] Array2D_n1_at(d, 0, j)[0] = Array2D_n1_get(d, 0, j - 1) + cost # fill DP array for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - + char_i = int_array_1_get(str1, i) db = 0 for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) + char_j = int_array_1_get(str2, j) k = da[char_j] l = db @@ -273,7 +276,7 @@ cdef DTYPE_t c_damerau_levenshtein( Array2D_n1_get(d, i - 1, j) + delete_costs[char_i], # delete Array2D_n1_get(d, k - 1, l - 1) + # transpose col_delete_range_cost(d, k + 1, i - 1) + # delete chars in between - transpose_costs[str_1_get(str1, k), str_1_get(str1, i)] + # transpose chars + transpose_costs[int_array_1_get(str1, k), int_array_1_get(str1, i)] + # transpose chars row_insert_range_cost(d, l + 1, j - 1) # insert chars in between ) From 58a00ef9674e451cd7eb05f3014845b9ae7c42e7 Mon Sep 17 00:00:00 2001 From: LEFTazs Date: Tue, 4 Feb 2020 11:37:22 +0100 Subject: [PATCH 3/6] Remove length arguments from the C methods --- weighted_levenshtein/clev.pxd | 6 ------ weighted_levenshtein/clev.pyx | 31 ++++++++++++++++++------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/weighted_levenshtein/clev.pxd b/weighted_levenshtein/clev.pxd index f2021af..4c35521 100644 --- a/weighted_levenshtein/clev.pxd +++ b/weighted_levenshtein/clev.pxd @@ -8,9 +8,7 @@ cdef enum: cdef DTYPE_t c_damerau_levenshtein( int[:] str_a, - Py_ssize_t len_a, int[:] str_b, - Py_ssize_t len_b, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -19,9 +17,7 @@ cdef DTYPE_t c_damerau_levenshtein( cdef DTYPE_t c_optimal_string_alignment( int[:] word_m, - Py_ssize_t m, int[:] word_n, - Py_ssize_t n, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -30,9 +26,7 @@ cdef DTYPE_t c_optimal_string_alignment( cdef DTYPE_t c_levenshtein( int[:] word_m, - Py_ssize_t m, int[:] word_n, - Py_ssize_t n, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] 
substitute_costs) nogil diff --git a/weighted_levenshtein/clev.pyx b/weighted_levenshtein/clev.pyx index 7b5fa54..f217f60 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -200,8 +200,7 @@ def damerau_levenshtein( intarr2 = convert_string_to_int_array(str2) return c_damerau_levenshtein( - intarr1, intarr1.size, - intarr2, intarr2.size, + intarr1, intarr2, insert_costs, delete_costs, substitute_costs, @@ -212,8 +211,7 @@ dam_lev = damerau_levenshtein cdef DTYPE_t c_damerau_levenshtein( - int[:] str1, Py_ssize_t len1, - int[:] str2, Py_ssize_t len2, + int[:] str1, int[:] str2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -228,8 +226,11 @@ cdef DTYPE_t c_damerau_levenshtein( unsigned int char_i, char_j DTYPE_t cost, ret_val Py_ssize_t db, k, l - Array2D d + Py_ssize_t len1, len2 + + len1 = str1.shape[0] + len2 = str2.shape[0] Array2D_init(&d, len1 + 2, len2 + 2) @@ -327,8 +328,7 @@ def optimal_string_alignment( intarr2 = convert_string_to_int_array(str2) return c_optimal_string_alignment( - intarr1, intarr1.size, - intarr2, intarr2.size, + intarr1, intarr2, insert_costs, delete_costs, substitute_costs, @@ -339,8 +339,7 @@ osa = optimal_string_alignment cdef DTYPE_t c_optimal_string_alignment( - int[:] str1, Py_ssize_t len1, - int[:] str2, Py_ssize_t len2, + int[:] str1, int[:] str2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -353,6 +352,10 @@ cdef DTYPE_t c_optimal_string_alignment( unsigned int char_i, char_j, prev_char_i, prev_char_j DTYPE_t ret_val Array2D d + Py_ssize_t len1, len2 + + len1 = str1.shape[0] + len2 = str2.shape[0] Array2D_init(&d, len1 + 1, len2 + 1) @@ -428,8 +431,7 @@ def levenshtein( intarr2 = convert_string_to_int_array(str2) return c_levenshtein( - intarr1, intarr1.size, - intarr2, intarr2.size, + intarr1, intarr2, insert_costs, delete_costs, substitute_costs @@ -439,8 +441,7 @@ lev = levenshtein cdef DTYPE_t c_levenshtein( 
- int[:] str1, Py_ssize_t len1, - int[:] str2, Py_ssize_t len2, + int[:] str1, int[:] str2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil: @@ -452,6 +453,10 @@ cdef DTYPE_t c_levenshtein( unsigned int char_i, char_j DTYPE_t ret_val Array2D d + Py_ssize_t len1, len2 + + len1 = str1.shape[0] + len2 = str2.shape[0] Array2D_init(&d, len1 + 1, len2 + 1) From 530042bf67cd8de5171b94068d3dda79b9e17052 Mon Sep 17 00:00:00 2001 From: LEFTazs Date: Wed, 5 Feb 2020 10:44:42 +0100 Subject: [PATCH 4/6] Fix negative indexing error --- weighted_levenshtein/clev.pyx | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/weighted_levenshtein/clev.pyx b/weighted_levenshtein/clev.pyx index f217f60..f423b5c 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -144,8 +144,6 @@ cdef inline unsigned char str_1_get(unsigned char* s, Py_ssize_t i) nogil: Takes an index of a 1-indexed string and returns that character """ - if i - 1 < 0: - return 0 return s[i - 1] cdef inline int int_array_1_get(int[:] s, Py_ssize_t i) nogil: @@ -153,8 +151,6 @@ cdef inline int int_array_1_get(int[:] s, Py_ssize_t i) nogil: Takes an index of a 1-indexed int array and returns that number """ - if i - 1 < 0: - return 0 return s[i - 1] # End helper functions @@ -229,6 +225,11 @@ cdef DTYPE_t c_damerau_levenshtein( Array2D d Py_ssize_t len1, len2 + DTYPE_t substitute_cost + DTYPE_t insert_cost + DTYPE_t delete_cost + DTYPE_t transpose_cost + len1 = str1.shape[0] len2 = str2.shape[0] @@ -271,14 +272,25 @@ cdef DTYPE_t c_damerau_levenshtein( else: cost = substitute_costs[char_i, char_j] + substitute_cost = Array2D_n1_get(d, i - 1, j - 1) + cost + insert_cost = Array2D_n1_get(d, i, j - 1) + insert_costs[char_j] + delete_cost = Array2D_n1_get(d, i - 1, j) + delete_costs[char_i] + if k <= 0: + # char_j hasn't been seen yet, so nothing to swap + transpose_cost = DTYPE_MAX + else: + # char_j has 
been seen, swap with char_i + transpose_cost = \ + Array2D_n1_get(d, k - 1, l - 1) + \ + col_delete_range_cost(d, k + 1, i - 1) + \ + transpose_costs[char_j, char_i] + \ + row_insert_range_cost(d, l + 1, j - 1) + Array2D_n1_at(d, i, j)[0] = min( - Array2D_n1_get(d, i - 1, j - 1) + cost, # equal/substitute - Array2D_n1_get(d, i, j - 1) + insert_costs[char_j], # insert - Array2D_n1_get(d, i - 1, j) + delete_costs[char_i], # delete - Array2D_n1_get(d, k - 1, l - 1) + # transpose - col_delete_range_cost(d, k + 1, i - 1) + # delete chars in between - transpose_costs[int_array_1_get(str1, k), int_array_1_get(str1, i)] + # transpose chars - row_insert_range_cost(d, l + 1, j - 1) # insert chars in between + substitute_cost, + insert_cost, + delete_cost, + transpose_cost ) da[char_i] = i From c0d9b336d2c149326787103a098be8c13bb5bd7d Mon Sep 17 00:00:00 2001 From: LEFTazs Date: Thu, 6 Feb 2020 19:21:49 +0100 Subject: [PATCH 5/6] Add legacy code for backwards compatibility --- weighted_levenshtein/clev.pxd | 37 ++++++- weighted_levenshtein/clev.pyx | 186 ++++++++++++++++++++++++++++++++-- 2 files changed, 214 insertions(+), 9 deletions(-) diff --git a/weighted_levenshtein/clev.pxd b/weighted_levenshtein/clev.pxd index 4c35521..ac2ad7e 100644 --- a/weighted_levenshtein/clev.pxd +++ b/weighted_levenshtein/clev.pxd @@ -6,7 +6,7 @@ cdef enum: ALPHABET_SIZE = 512 -cdef DTYPE_t c_damerau_levenshtein( +cdef DTYPE_t c_damerau_levenshtein_unicode( int[:] str_a, int[:] str_b, DTYPE_t[::1] insert_costs, @@ -15,7 +15,7 @@ cdef DTYPE_t c_damerau_levenshtein( DTYPE_t[:,::1] transpose_costs) nogil -cdef DTYPE_t c_optimal_string_alignment( +cdef DTYPE_t c_optimal_string_alignment_unicode( int[:] word_m, int[:] word_n, DTYPE_t[::1] insert_costs, @@ -24,10 +24,41 @@ cdef DTYPE_t c_optimal_string_alignment( DTYPE_t[:,::1] transpose_costs) nogil -cdef DTYPE_t c_levenshtein( +cdef DTYPE_t c_levenshtein_unicode( int[:] word_m, int[:] word_n, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, 
DTYPE_t[:,::1] substitute_costs) nogil + +cdef DTYPE_t c_damerau_levenshtein( + unsigned char* str_a, + Py_ssize_t len_a, + unsigned char* str_b, + Py_ssize_t len_b, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil + + +cdef DTYPE_t c_optimal_string_alignment( + unsigned char* word_m, + Py_ssize_t m, + unsigned char* word_n, + Py_ssize_t n, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil + + +cdef DTYPE_t c_levenshtein( + unsigned char* word_m, + Py_ssize_t m, + unsigned char* word_n, + Py_ssize_t n, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs) nogil diff --git a/weighted_levenshtein/clev.pyx b/weighted_levenshtein/clev.pyx index f423b5c..65ffb74 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -195,7 +195,7 @@ def damerau_levenshtein( intarr1 = convert_string_to_int_array(str1) intarr2 = convert_string_to_int_array(str2) - return c_damerau_levenshtein( + return c_damerau_levenshtein_unicode( intarr1, intarr2, insert_costs, delete_costs, @@ -206,7 +206,7 @@ def damerau_levenshtein( dam_lev = damerau_levenshtein -cdef DTYPE_t c_damerau_levenshtein( +cdef DTYPE_t c_damerau_levenshtein_unicode( int[:] str1, int[:] str2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, @@ -339,7 +339,7 @@ def optimal_string_alignment( intarr1 = convert_string_to_int_array(str1) intarr2 = convert_string_to_int_array(str2) - return c_optimal_string_alignment( + return c_optimal_string_alignment_unicode( intarr1, intarr2, insert_costs, delete_costs, @@ -350,7 +350,7 @@ def optimal_string_alignment( osa = optimal_string_alignment -cdef DTYPE_t c_optimal_string_alignment( +cdef DTYPE_t c_optimal_string_alignment_unicode( int[:] str1, int[:] str2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, @@ -442,7 +442,7 @@ def levenshtein( 
intarr1 = convert_string_to_int_array(str1) intarr2 = convert_string_to_int_array(str2) - return c_levenshtein( + return c_levenshtein_unicode( intarr1, intarr2, insert_costs, delete_costs, @@ -452,7 +452,7 @@ def levenshtein( lev = levenshtein -cdef DTYPE_t c_levenshtein( +cdef DTYPE_t c_levenshtein_unicode( int[:] str1, int[:] str2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, @@ -496,3 +496,177 @@ cdef DTYPE_t c_levenshtein( ret_val = Array2D_0_get(d, len1, len2) Array2D_del(d) return ret_val + +# Legacy code + +cdef DTYPE_t c_damerau_levenshtein( + unsigned char* str1, Py_ssize_t len1, + unsigned char* str2, Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil: + """ + https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions + """ + cdef: + Py_ssize_t[ALPHABET_SIZE] da + + Py_ssize_t i, j + unsigned char char_i, char_j + DTYPE_t cost, ret_val + Py_ssize_t db, k, l + + Array2D d + + Array2D_init(&d, len1 + 2, len2 + 2) + + # initialize 'da' to all 0 + for i in range(ALPHABET_SIZE): + da[i] = 0 + + # fill row (-1) and column (-1) with 'DTYPE_MAX' + Array2D_n1_at(d, -1, -1)[0] = DTYPE_MAX + for i in range(0, len1 + 1): + Array2D_n1_at(d, i, -1)[0] = DTYPE_MAX + for j in range(0, len2 + 1): + Array2D_n1_at(d, -1, j)[0] = DTYPE_MAX + + # fill row 0 and column 0 with insertion and deletion costs + Array2D_n1_at(d, 0, 0)[0] = 0 + for i in range(1, len1 + 1): + char_i = str_1_get(str1, i) + cost = delete_costs[char_i] + Array2D_n1_at(d, i, 0)[0] = Array2D_n1_get(d, i - 1, 0) + cost + for j in range(1, len2 + 1): + char_j = str_1_get(str2, j) + cost = insert_costs[char_j] + Array2D_n1_at(d, 0, j)[0] = Array2D_n1_get(d, 0, j - 1) + cost + + # fill DP array + for i in range(1, len1 + 1): + char_i = str_1_get(str1, i) + + db = 0 + for j in range(1, len2 + 1): + char_j = str_1_get(str2, j) + + k = da[char_j] + l = db + if 
char_i == char_j: + cost = 0 + db = j + else: + cost = substitute_costs[char_i, char_j] + + Array2D_n1_at(d, i, j)[0] = min( + Array2D_n1_get(d, i - 1, j - 1) + cost, # equal/substitute + Array2D_n1_get(d, i, j - 1) + insert_costs[char_j], # insert + Array2D_n1_get(d, i - 1, j) + delete_costs[char_i], # delete + Array2D_n1_get(d, k - 1, l - 1) + # transpose + col_delete_range_cost(d, k + 1, i - 1) + # delete chars in between + transpose_costs[str_1_get(str1, k), str_1_get(str1, i)] + # transpose chars + row_insert_range_cost(d, l + 1, j - 1) # insert chars in between + ) + + da[char_i] = i + + ret_val = Array2D_n1_get(d, len1, len2) + Array2D_del(d) + return ret_val + +cdef DTYPE_t c_optimal_string_alignment( + unsigned char* str1, Py_ssize_t len1, + unsigned char* str2, Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs, + DTYPE_t[:,::1] transpose_costs) nogil: + """ + https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance + """ + cdef: + Py_ssize_t i, j + unsigned char char_i, char_j, prev_char_i, prev_char_j + DTYPE_t ret_val + Array2D d + + Array2D_init(&d, len1 + 1, len2 + 1) + + # fill row 0 and column 0 with insertion and deletion costs + Array2D_0_at(d, 0, 0)[0] = 0 + for i in range(1, len1 + 1): + char_i = str_1_get(str1, i) + Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] + for j in range(1, len2 + 1): + char_j = str_1_get(str2, j) + Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] + + # fill DP array + for i in range(1, len1 + 1): + char_i = str_1_get(str1, i) + for j in range(1, len2 + 1): + char_j = str_1_get(str2, j) + if char_i == char_j: # match + Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) + else: + Array2D_0_at(d, i, j)[0] = min( + Array2D_0_get(d, i - 1, j) + delete_costs[char_i], # deletion + Array2D_0_get(d, i, j - 1) + insert_costs[char_j], # insertion + Array2D_0_get(d, 
i - 1, j - 1) + substitute_costs[char_i, char_j] # substitution + ) + + if i > 1 and j > 1: + prev_char_i = str_1_get(str1, i - 1) + prev_char_j = str_1_get(str2, j - 1) + if char_i == prev_char_j and prev_char_i == char_j: # transpose + Array2D_0_at(d, i, j)[0] = min( + Array2D_0_get(d, i, j), + Array2D_0_get(d, i - 2, j - 2) + transpose_costs[prev_char_i, char_i] + ) + + ret_val = Array2D_0_get(d, len1, len2) + Array2D_del(d) + return ret_val + +cdef DTYPE_t c_levenshtein( + unsigned char* str1, Py_ssize_t len1, + unsigned char* str2, Py_ssize_t len2, + DTYPE_t[::1] insert_costs, + DTYPE_t[::1] delete_costs, + DTYPE_t[:,::1] substitute_costs) nogil: + """ + https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm + """ + cdef: + Py_ssize_t i, j + unsigned char char_i, char_j + DTYPE_t ret_val + Array2D d + + Array2D_init(&d, len1 + 1, len2 + 1) + + Array2D_0_at(d, 0, 0)[0] = 0 + for i in range(1, len1 + 1): + char_i = str_1_get(str1, i) + Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] + for j in range(1, len2 + 1): + char_j = str_1_get(str2, j) + Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] + + for i in range(1, len1 + 1): + char_i = str_1_get(str1, i) + for j in range(1, len2 + 1): + char_j = str_1_get(str2, j) + if char_i == char_j: # match + Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) + else: + Array2D_0_at(d, i, j)[0] = min( + Array2D_0_get(d, i - 1, j) + delete_costs[char_i], + Array2D_0_get(d, i, j - 1) + insert_costs[char_j], + Array2D_0_get(d, i - 1, j - 1) + substitute_costs[char_i, char_j] + ) + + ret_val = Array2D_0_get(d, len1, len2) + Array2D_del(d) + return ret_val From 454af44c1e14ffcb44a2f84514ade82e02ad9b3d Mon Sep 17 00:00:00 2001 From: LEFTazs Date: Fri, 7 Feb 2020 22:18:37 +0100 Subject: [PATCH 6/6] Refactor legacy code --- weighted_levenshtein/clev.pxd | 18 ++- weighted_levenshtein/clev.pyx | 288 ++++++++++++---------------------- 2 files changed, 109 
insertions(+), 197 deletions(-) diff --git a/weighted_levenshtein/clev.pxd b/weighted_levenshtein/clev.pxd index ac2ad7e..9c19f29 100644 --- a/weighted_levenshtein/clev.pxd +++ b/weighted_levenshtein/clev.pxd @@ -7,8 +7,10 @@ cdef enum: cdef DTYPE_t c_damerau_levenshtein_unicode( - int[:] str_a, - int[:] str_b, + unsigned int* word_m, + Py_ssize_t len1, + unsigned int* word_n, + Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -16,8 +18,10 @@ cdef DTYPE_t c_damerau_levenshtein_unicode( cdef DTYPE_t c_optimal_string_alignment_unicode( - int[:] word_m, - int[:] word_n, + unsigned int* word_m, + Py_ssize_t len1, + unsigned int* word_n, + Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -25,8 +29,10 @@ cdef DTYPE_t c_optimal_string_alignment_unicode( cdef DTYPE_t c_levenshtein_unicode( - int[:] word_m, - int[:] word_n, + unsigned int* word_m, + Py_ssize_t len1, + unsigned int* word_n, + Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil diff --git a/weighted_levenshtein/clev.pyx b/weighted_levenshtein/clev.pyx index 65ffb74..fba8129 100644 --- a/weighted_levenshtein/clev.pyx +++ b/weighted_levenshtein/clev.pyx @@ -132,21 +132,17 @@ cdef inline DTYPE_t row_insert_range_cost( # End Array2D -def convert_string_to_int_array(char* str): - s = str.encode('utf-8').decode('utf-8') - intarr = np.empty([len(s), ], dtype=int) - for i, c in enumerate(s): +cdef unsigned int* convert_string_to_int_array(unsigned char* str, Py_ssize_t size): + cdef unsigned int* intarr = malloc(size * sizeof(unsigned int)) + for i, c in enumerate(str): intarr[i] = ord(c) return intarr -cdef inline unsigned char str_1_get(unsigned char* s, Py_ssize_t i) nogil: - """ - Takes an index of a 1-indexed string - and returns that character - """ - return s[i - 1] +cdef void copy_str_to_int_arr(unsigned char* str, Py_ssize_t len, 
unsigned int* int_arr) nogil: + for i in range(len): + int_arr[i] = str[i] -cdef inline int int_array_1_get(int[:] s, Py_ssize_t i) nogil: +cdef inline unsigned int int_array_1_get(unsigned int* s, Py_ssize_t i) nogil: """ Takes an index of a 1-indexed int array and returns that number @@ -192,22 +188,33 @@ def damerau_levenshtein( if transpose_costs is None: transpose_costs = unit_matrix - intarr1 = convert_string_to_int_array(str1) - intarr2 = convert_string_to_int_array(str2) + s1 = str1.encode('utf-8').decode('utf-8') + len1 = len(s1) + intarr1 = convert_string_to_int_array(str1, len1) + + s2 = str2.encode('utf-8').decode('utf-8') + len2 = len(s2) + intarr2 = convert_string_to_int_array(str2, len2) - return c_damerau_levenshtein_unicode( - intarr1, intarr2, - insert_costs, - delete_costs, - substitute_costs, - transpose_costs - ) + cdef DTYPE_t result = c_damerau_levenshtein_unicode( + intarr1, len1, + intarr2, len2, + insert_costs, + delete_costs, + substitute_costs, + transpose_costs + ) + + free(intarr1) + free(intarr2) + return result dam_lev = damerau_levenshtein cdef DTYPE_t c_damerau_levenshtein_unicode( - int[:] str1, int[:] str2, + unsigned int* str1, Py_ssize_t len1, + unsigned int* str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -223,16 +230,12 @@ cdef DTYPE_t c_damerau_levenshtein_unicode( DTYPE_t cost, ret_val Py_ssize_t db, k, l Array2D d - Py_ssize_t len1, len2 DTYPE_t substitute_cost DTYPE_t insert_cost DTYPE_t delete_cost DTYPE_t transpose_cost - len1 = str1.shape[0] - len2 = str2.shape[0] - Array2D_init(&d, len1 + 2, len2 + 2) # initialize 'da' to all 0 @@ -336,22 +339,33 @@ def optimal_string_alignment( if transpose_costs is None: transpose_costs = unit_matrix - intarr1 = convert_string_to_int_array(str1) - intarr2 = convert_string_to_int_array(str2) + s1 = str1.encode('utf-8').decode('utf-8') + len1 = len(s1) + intarr1 = convert_string_to_int_array(str1, len1) + + s2 = 
str2.encode('utf-8').decode('utf-8') + len2 = len(s2) + intarr2 = convert_string_to_int_array(str2, len2) + + cdef DTYPE_t result = c_optimal_string_alignment_unicode( + intarr1, len1, + intarr2, len2, + insert_costs, + delete_costs, + substitute_costs, + transpose_costs + ) - return c_optimal_string_alignment_unicode( - intarr1, intarr2, - insert_costs, - delete_costs, - substitute_costs, - transpose_costs - ) + free(intarr1) + free(intarr2) + return result osa = optimal_string_alignment cdef DTYPE_t c_optimal_string_alignment_unicode( - int[:] str1, int[:] str2, + unsigned int* str1, Py_ssize_t len1, + unsigned int* str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, @@ -364,10 +378,6 @@ cdef DTYPE_t c_optimal_string_alignment_unicode( unsigned int char_i, char_j, prev_char_i, prev_char_j DTYPE_t ret_val Array2D d - Py_ssize_t len1, len2 - - len1 = str1.shape[0] - len2 = str2.shape[0] Array2D_init(&d, len1 + 1, len2 + 1) @@ -439,21 +449,33 @@ def levenshtein( if substitute_costs is None: substitute_costs = unit_matrix - intarr1 = convert_string_to_int_array(str1) - intarr2 = convert_string_to_int_array(str2) + s1 = str1.encode('utf-8').decode('utf-8') + len1 = len(s1) + intarr1 = convert_string_to_int_array(str1, len1) + + s2 = str2.encode('utf-8').decode('utf-8') + len2 = len(s2) + intarr2 = convert_string_to_int_array(str2, len2) + + cdef DTYPE_t result = c_levenshtein_unicode( + intarr1, len1, + intarr2, len2, + insert_costs, + delete_costs, + substitute_costs + ) + + free(intarr1) + free(intarr2) + return result - return c_levenshtein_unicode( - intarr1, intarr2, - insert_costs, - delete_costs, - substitute_costs - ) lev = levenshtein cdef DTYPE_t c_levenshtein_unicode( - int[:] str1, int[:] str2, + unsigned int* str1, Py_ssize_t len1, + unsigned int* str2, Py_ssize_t len2, DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil: @@ -465,10 +487,6 @@ cdef DTYPE_t 
c_levenshtein_unicode( unsigned int char_i, char_j DTYPE_t ret_val Array2D d - Py_ssize_t len1, len2 - - len1 = str1.shape[0] - len2 = str2.shape[0] Array2D_init(&d, len1 + 1, len2 + 1) @@ -506,74 +524,19 @@ cdef DTYPE_t c_damerau_levenshtein( DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, DTYPE_t[:,::1] transpose_costs) nogil: - """ - https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions - """ - cdef: - Py_ssize_t[ALPHABET_SIZE] da - - Py_ssize_t i, j - unsigned char char_i, char_j - DTYPE_t cost, ret_val - Py_ssize_t db, k, l - - Array2D d - - Array2D_init(&d, len1 + 2, len2 + 2) - - # initialize 'da' to all 0 - for i in range(ALPHABET_SIZE): - da[i] = 0 - - # fill row (-1) and column (-1) with 'DTYPE_MAX' - Array2D_n1_at(d, -1, -1)[0] = DTYPE_MAX - for i in range(0, len1 + 1): - Array2D_n1_at(d, i, -1)[0] = DTYPE_MAX - for j in range(0, len2 + 1): - Array2D_n1_at(d, -1, j)[0] = DTYPE_MAX - - # fill row 0 and column 0 with insertion and deletion costs - Array2D_n1_at(d, 0, 0)[0] = 0 - for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - cost = delete_costs[char_i] - Array2D_n1_at(d, i, 0)[0] = Array2D_n1_get(d, i - 1, 0) + cost - for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) - cost = insert_costs[char_j] - Array2D_n1_at(d, 0, j)[0] = Array2D_n1_get(d, 0, j - 1) + cost - - # fill DP array - for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - - db = 0 - for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) - k = da[char_j] - l = db - if char_i == char_j: - cost = 0 - db = j - else: - cost = substitute_costs[char_i, char_j] + cdef unsigned int* int_arr1 = malloc(len1 * sizeof(unsigned int)) + copy_str_to_int_arr(str1, len1, int_arr1) + cdef unsigned int* int_arr2 = malloc(len2 * sizeof(unsigned int)) + copy_str_to_int_arr(str2, len2, int_arr2) - Array2D_n1_at(d, i, j)[0] = min( - Array2D_n1_get(d, i - 1, j - 1) + cost, # equal/substitute - Array2D_n1_get(d, i, j - 1) + 
insert_costs[char_j], # insert - Array2D_n1_get(d, i - 1, j) + delete_costs[char_i], # delete - Array2D_n1_get(d, k - 1, l - 1) + # transpose - col_delete_range_cost(d, k + 1, i - 1) + # delete chars in between - transpose_costs[str_1_get(str1, k), str_1_get(str1, i)] + # transpose chars - row_insert_range_cost(d, l + 1, j - 1) # insert chars in between - ) + cdef DTYPE_t result = c_damerau_levenshtein_unicode(int_arr1, len1, int_arr2, len2, + insert_costs, delete_costs, substitute_costs, transpose_costs) - da[char_i] = i + free(int_arr1) + free(int_arr2) + return result - ret_val = Array2D_n1_get(d, len1, len2) - Array2D_del(d) - return ret_val cdef DTYPE_t c_optimal_string_alignment( unsigned char* str1, Py_ssize_t len1, @@ -582,52 +545,18 @@ cdef DTYPE_t c_optimal_string_alignment( DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs, DTYPE_t[:,::1] transpose_costs) nogil: - """ - https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance - """ - cdef: - Py_ssize_t i, j - unsigned char char_i, char_j, prev_char_i, prev_char_j - DTYPE_t ret_val - Array2D d - - Array2D_init(&d, len1 + 1, len2 + 1) - - # fill row 0 and column 0 with insertion and deletion costs - Array2D_0_at(d, 0, 0)[0] = 0 - for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] - for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) - Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] - # fill DP array - for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) - if char_i == char_j: # match - Array2D_0_at(d, i, j)[0] = Array2D_0_get(d, i - 1, j - 1) - else: - Array2D_0_at(d, i, j)[0] = min( - Array2D_0_get(d, i - 1, j) + delete_costs[char_i], # deletion - Array2D_0_get(d, i, j - 1) + insert_costs[char_j], # insertion - Array2D_0_get(d, i - 1, j - 1) + 
substitute_costs[char_i, char_j] # substitution - ) + cdef unsigned int* int_arr1 = malloc(len1 * sizeof(unsigned int)) + copy_str_to_int_arr(str1, len1, int_arr1) + cdef unsigned int* int_arr2 = malloc(len2 * sizeof(unsigned int)) + copy_str_to_int_arr(str2, len2, int_arr2) - if i > 1 and j > 1: - prev_char_i = str_1_get(str1, i - 1) - prev_char_j = str_1_get(str2, j - 1) - if char_i == prev_char_j and prev_char_i == char_j: # transpose - Array2D_0_at(d, i, j)[0] = min( - Array2D_0_get(d, i, j), - Array2D_0_get(d, i - 2, j - 2) + transpose_costs[prev_char_i, char_i] - ) + cdef DTYPE_t result = c_optimal_string_alignment_unicode(int_arr1, len1, int_arr2, len2, + insert_costs, delete_costs, substitute_costs, transpose_costs) - ret_val = Array2D_0_get(d, len1, len2) - Array2D_del(d) - return ret_val + free(int_arr1) + free(int_arr2) + return result cdef DTYPE_t c_levenshtein( unsigned char* str1, Py_ssize_t len1, @@ -635,38 +564,15 @@ cdef DTYPE_t c_levenshtein( DTYPE_t[::1] insert_costs, DTYPE_t[::1] delete_costs, DTYPE_t[:,::1] substitute_costs) nogil: - """ - https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm - """ - cdef: - Py_ssize_t i, j - unsigned char char_i, char_j - DTYPE_t ret_val - Array2D d - Array2D_init(&d, len1 + 1, len2 + 1) - - Array2D_0_at(d, 0, 0)[0] = 0 - for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - Array2D_0_at(d, i, 0)[0] = Array2D_0_get(d, i - 1, 0) + delete_costs[char_i] - for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) - Array2D_0_at(d, 0, j)[0] = Array2D_0_get(d, 0, j - 1) + insert_costs[char_j] + cdef unsigned int* int_arr1 = malloc(len1 * sizeof(unsigned int)) + copy_str_to_int_arr(str1, len1, int_arr1) + cdef unsigned int* int_arr2 = malloc(len2 * sizeof(unsigned int)) + copy_str_to_int_arr(str2, len2, int_arr2) - for i in range(1, len1 + 1): - char_i = str_1_get(str1, i) - for j in range(1, len2 + 1): - char_j = str_1_get(str2, j) - if char_i == char_j: # match - Array2D_0_at(d, i, j)[0] = 
Array2D_0_get(d, i - 1, j - 1) - else: - Array2D_0_at(d, i, j)[0] = min( - Array2D_0_get(d, i - 1, j) + delete_costs[char_i], - Array2D_0_get(d, i, j - 1) + insert_costs[char_j], - Array2D_0_get(d, i - 1, j - 1) + substitute_costs[char_i, char_j] - ) + cdef DTYPE_t result = c_levenshtein_unicode(int_arr1, len1, int_arr2, len2, + insert_costs, delete_costs, substitute_costs) - ret_val = Array2D_0_get(d, len1, len2) - Array2D_del(d) - return ret_val + free(int_arr1) + free(int_arr2) + return result