Skip to content

Commit

Permalink
improve hash function
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann authored Jun 29, 2022
1 parent c3e40a3 commit c56f411
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Changelog

### [1.0.5] - 2022-06-29
#### Fixed
- treat the elements -1 and -2 as different when hashing sequences (CPython's `hash()` returns the same value for both)

### [1.0.4] - 2022-06-23
#### Changed
- add fallback implementations of `jarowinkler-cpp` back to wheel,
Expand Down
2 changes: 1 addition & 1 deletion jarowinkler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__author__ = "Max Bachmann"
__license__ = "MIT"
__version__ = "1.0.4"
__version__ = "1.0.5"

from ._initialize import *
9 changes: 7 additions & 2 deletions jarowinkler/common.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ cdef extern from "common.hpp":
RF_String convert_string(object py_str)
void validate_string(object py_str, const char* err) except +

cdef inline uint64_t rf_hash(val) except *:
    # Map a Python object to the 64-bit value that represents it in a
    # hashed sequence.
    #
    # CPython's hash() yields the same result for -1 and -2, so a value
    # comparing equal to -1 is given the dedicated value <uint64_t>-1
    # instead of its regular hash. Everything else uses hash(val) cast to
    # uint64_t. Any exception raised by the comparison or by hash() is
    # propagated to the caller (`except *`).
    cdef uint64_t hashed
    if val == -1:
        hashed = <uint64_t>-1
    else:
        hashed = <uint64_t>hash(val)
    return hashed

cdef inline RF_String hash_array(arr) except *:
# TODO: on CPython this does not require any copies
cdef RF_String s_proc
Expand Down Expand Up @@ -64,7 +69,7 @@ cdef inline RF_String hash_array(arr) except *:
else: # float/double are hashed
s_proc.kind = RF_StringType.RF_UINT64
for i in range(s_proc.length):
(<uint64_t*>s_proc.data)[i] = <uint64_t>hash(arr[i])
(<uint64_t*>s_proc.data)[i] = rf_hash(arr[i])
except Exception as e:
free(s_proc.data)
s_proc.data = NULL
Expand All @@ -91,7 +96,7 @@ cdef inline RF_String hash_sequence(seq) except *:
if isinstance(elem, str) and len(elem) == 1:
(<uint64_t*>s_proc.data)[i] = <uint64_t><Py_UCS4>elem
else:
(<uint64_t*>s_proc.data)[i] = <uint64_t>hash(elem)
(<uint64_t*>s_proc.data)[i] = rf_hash(elem)
except Exception as e:
free(s_proc.data)
s_proc.data = NULL
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="jarowinkler",
version="1.0.4",
version="1.0.5",
url="https://github.com/maxbachmann/JaroWinkler",
author="Max Bachmann",
author_email="[email protected]",
Expand Down
3 changes: 3 additions & 0 deletions tests/test_JaroWinkler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def _jaro_winkler_similarity(self, s1, s2, result):
self.assertAlmostEqual(jarowinkler_similarity(s1, s2), result, places=4)
self.assertAlmostEqual(jarowinkler_similarity(s2, s1), result, places=4)

def test_hash_special_case(self):
    # Regression test: CPython's hash() returns the same value for -1 and
    # -2, so sequences that differ only in those two elements must still be
    # scored as different (expected similarity 0.6666) rather than being
    # treated as identical.
    self._jaro_winkler_similarity([0, -1], [0, -2], 0.6666)

def test_edge_case_lengths(self):
self._jaro_winkler_similarity('', '', 0)
self._jaro_winkler_similarity('0', '0', 1)
Expand Down

0 comments on commit c56f411

Please sign in to comment.