Skip to content

Commit 3357768

Browse files
l3str4nge and poyea authored
Jaro winkler (TheAlgorithms#2041)
* Added jaro_winkler first version * Added doctests * Fix flake warnings * Refactor * Fixes bug in jaro winkler implementation * Commit suggestions * Missing comming suggestions * Remove unused math module * Import doctest Co-authored-by: John Law <johnlaw.po@gmail.com>
1 parent fa358d6 commit 3357768

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed

strings/jaro_winkler.py

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"""
2+
3+
4+
def jaro_winkler(str1: str, str2: str) -> float:
    """
    Return the Jaro–Winkler similarity of two strings.

    The result lies between 0.0 (nothing in common) and 1.0 (identical
    strings). It is the Jaro similarity boosted by the length of the common
    prefix (at most 4 characters, scaling factor 0.1).

    Characters are matched within a window of ``min(len(str1), len(str2)) // 2``
    positions on either side of the current index.

    :param str1: first string to compare
    :param str2: second string to compare
    :return: similarity score as a float in the range [0.0, 1.0]

    >>> jaro_winkler("martha", "marhta")
    0.9611111111111111
    >>> jaro_winkler("CRATE", "TRACE")
    0.7333333333333334
    >>> jaro_winkler("test", "dbdbdbdb")
    0.0
    >>> jaro_winkler("test", "test")
    1.0
    >>> jaro_winkler("hello world", "HeLLo W0rlD")
    0.6363636363636364
    >>> jaro_winkler("test", "")
    0.0
    >>> jaro_winkler("hello", "world")
    0.4666666666666666
    >>> jaro_winkler("hell**o", "*world")
    0.4365079365079365

    A space never matches an already-consumed character:

    >>> round(jaro_winkler("a ", "ab"), 4)
    0.7

    A character of one string is never matched twice:

    >>> round(jaro_winkler("zzzzaa", "ayyyay"), 4)
    0.4444
    """

    def get_matched_characters(_str1: str, _str2: str) -> str:
        """Return the characters of ``_str1`` that have a partner in ``_str2``."""
        matched = []
        # Mutable copy of _str2; a consumed slot is set to None so that it can
        # never compare equal to any real character (including a space).
        remaining = list(_str2)
        limit = min(len(_str1), len(_str2)) // 2
        for i, char in enumerate(_str1):
            left = max(0, i - limit)
            right = min(i + limit + 1, len(_str2))
            # Search only inside the window, and consume the slot actually
            # found there rather than the first occurrence in the whole string.
            for j in range(left, right):
                if remaining[j] == char:
                    matched.append(char)
                    remaining[j] = None
                    break
        return "".join(matched)

    # matching characters, seen from each side
    matching_1 = get_matched_characters(str1, str2)
    matching_2 = get_matched_characters(str2, str1)
    match_count = len(matching_1)

    # transpositions: matched characters that line up in a different order
    transpositions = (
        sum(c1 != c2 for c1, c2 in zip(matching_1, matching_2)) // 2
    )

    if not match_count:
        # Also covers empty inputs, so the divisions below are never reached
        # with a zero length.
        jaro = 0.0
    else:
        jaro = 1 / 3 * (
            match_count / len(str1)
            + match_count / len(str2)
            + (match_count - transpositions) / match_count
        )

    # common prefix up to 4 characters
    prefix_len = 0
    for c1, c2 in zip(str1[:4], str2[:4]):
        if c1 != c2:
            break
        prefix_len += 1

    return jaro + 0.1 * prefix_len * (1 - jaro)
66+
67+
68+
if __name__ == "__main__":
    # Run the embedded doctests, then print one sample score.
    from doctest import testmod

    testmod()
    sample_score = jaro_winkler("hello", "world")
    print(sample_score)

0 commit comments

Comments
 (0)