Skip to content

Commit 9226856

Browse files
AashayShingre and cclauss
authored and committed
Aho-Corasick String Matching Algorithm (TheAlgorithms#346)
* add aho-corasick algorithm * Add a doctest and format with black
1 parent 8b572e6 commit 9226856

File tree

1 file changed

+92
-0
lines changed

1 file changed

+92
-0
lines changed

strings/aho-corasick.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
from collections import deque
2+
3+
4+
class Automaton:
    """Aho-Corasick automaton for locating many keywords in one pass over a text.

    The keyword trie is stored as a flat list, ``adlist``, of state dicts.
    State 0 is the root. Every state has four entries:

    * ``"value"``: the character on the edge leading into the state
      (``""`` for the root);
    * ``"next_states"``: indices into ``adlist`` of the child states;
    * ``"fail_state"``: index of the state to fall back to on a mismatch;
    * ``"output"``: keywords reported whenever the state is reached.

    >>> Automaton(["he", "she"]).search_in("she")
    {'she': [0], 'he': [1]}
    """

    def __init__(self, keywords):
        """Build the trie from ``keywords`` and compute the failure links.

        ``keywords`` is any iterable of strings; each one is inserted with
        ``add_keyword`` before ``set_fail_transitions`` is run once.
        """
        self.adlist = [
            {"value": "", "next_states": [], "fail_state": 0, "output": []}
        ]

        for keyword in keywords:
            self.add_keyword(keyword)
        self.set_fail_transitions()

    def find_next_state(self, current_state, char):
        """Return the child of ``current_state`` reached via ``char``.

        Returns the child's index in ``adlist``, or ``None`` when
        ``current_state`` has no outgoing edge labelled ``char``.
        """
        for state in self.adlist[current_state]["next_states"]:
            if self.adlist[state]["value"] == char:
                return state
        return None

    def add_keyword(self, keyword):
        """Insert ``keyword`` into the trie, creating states as needed.

        Empty keywords are ignored: they would sit in the root's output and
        then be reported only from some states, at meaningless positions.
        A keyword that is already registered is not registered again, so each
        occurrence in a searched text is reported once.
        """
        if not keyword:
            return

        current_state = 0
        for character in keyword:
            next_state = self.find_next_state(current_state, character)
            if next_state is None:
                self.adlist.append(
                    {
                        "value": character,
                        "next_states": [],
                        "fail_state": 0,
                        "output": [],
                    }
                )
                next_state = len(self.adlist) - 1
                self.adlist[current_state]["next_states"].append(next_state)
            current_state = next_state

        output = self.adlist[current_state]["output"]
        if keyword not in output:
            output.append(keyword)

    def set_fail_transitions(self):
        """Compute ``fail_state`` for every state and merge inherited outputs.

        States are visited breadth-first so that a state's failure target
        (always shallower in the trie) is finalised before the state itself.
        Keywords of the failure target are appended to the state's own output;
        entries already present are skipped, so calling this method more than
        once does not duplicate them.
        """
        queue = deque()
        # Children of the root always fall back to the root.
        for node in self.adlist[0]["next_states"]:
            queue.append(node)
            self.adlist[node]["fail_state"] = 0

        while queue:
            parent = queue.popleft()
            for child in self.adlist[parent]["next_states"]:
                queue.append(child)
                value = self.adlist[child]["value"]

                # Follow the parent's failure chain until a state with an
                # edge labelled ``value`` is found or the root is reached.
                state = self.adlist[parent]["fail_state"]
                target = self.find_next_state(state, value)
                while target is None and state != 0:
                    state = self.adlist[state]["fail_state"]
                    target = self.find_next_state(state, value)

                fail_state = 0 if target is None else target
                self.adlist[child]["fail_state"] = fail_state

                own = self.adlist[child]["output"]
                inherited = self.adlist[fail_state]["output"]
                self.adlist[child]["output"] = own + [
                    key for key in inherited if key not in own
                ]

    def search_in(self, string):
        """Find every occurrence of every keyword in ``string``.

        Returns a dict mapping each keyword that occurs to the list of start
        indices of its occurrences, in increasing order. Keywords that never
        occur are absent from the dict.

        >>> A = Automaton(["what", "hat", "ver", "er"])
        >>> A.search_in("whatever, err ... , wherever")
        {'what': [0], 'hat': [1], 'ver': [5, 25], 'er': [6, 10, 22, 26]}
        """
        result = {}  # keyword -> list of start indices of its occurrences
        current_state = 0
        for i, char in enumerate(string):
            # On a mismatch, fall back along failure links until some state
            # accepts ``char`` or the root is reached.
            next_state = self.find_next_state(current_state, char)
            while next_state is None and current_state != 0:
                current_state = self.adlist[current_state]["fail_state"]
                next_state = self.find_next_state(current_state, char)

            if next_state is None:
                # Not even the root accepts ``char``; restart from the root.
                current_state = 0
                continue

            current_state = next_state
            for key in self.adlist[current_state]["output"]:
                # ``i`` is the index of the keyword's last character.
                result.setdefault(key, []).append(i - len(key) + 1)
        return result
87+
88+
89+
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)