Skip to content

Commit 9a2d956

Browse files
committed
feat: add search engine
1 parent a28a8e1 commit 9a2d956

File tree

5 files changed

+199
-0
lines changed

5 files changed

+199
-0
lines changed

search.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import re
2+
import pylru
3+
4+
5+
class SearchEngineBase(object):
    """Abstract base class shared by the search engines in this module.

    Subclasses override ``process_corpus`` to index one document and
    ``search`` to look documents up. ``add_corpus`` is the common entry
    point that reads a file and forwards its content to ``process_corpus``.
    """

    def __init__(self):
        pass

    def add_corpus(self, file_path):
        """Read ``file_path`` as text and index it.

        The path itself is passed to ``process_corpus`` as the document id.
        """
        with open(file_path, 'r') as fin:
            text = fin.read()
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Index ``text`` under the document id ``id``.

        Raises:
            NotImplementedError: always; subclasses must override this hook.
        """
        # NotImplementedError is the conventional signal for an abstract
        # hook and is still an Exception subclass, so existing handlers
        # that catch Exception keep working.
        raise NotImplementedError('process_corpus not implemented.')

    def search(self, query):
        """Return the ids of the documents matching ``query``.

        Raises:
            NotImplementedError: always; subclasses must override this hook.
        """
        raise NotImplementedError('search not implemented.')
19+
20+
21+
class SimpleEngine(SearchEngineBase):
    """Brute-force engine that stores raw text and matches by substring."""

    def __init__(self):
        super(SimpleEngine, self).__init__()
        # Maps document id -> full, unmodified document text.
        self.__id_to_texts = {}

    def process_corpus(self, id, text):
        """Remember ``text`` verbatim under ``id``."""
        self.__id_to_texts[id] = text

    def search(self, query):
        """Return the ids of all documents whose text contains ``query``.

        Matching is a plain, case-sensitive substring test against the
        stored text.
        """
        return [
            doc_id
            for doc_id, content in self.__id_to_texts.items()
            if query in content
        ]
35+
36+
37+
class BOWEngine(SearchEngineBase):
    """Bag-of-words engine: each document is reduced to its set of words."""

    def __init__(self):
        super(BOWEngine, self).__init__()
        # Maps document id -> set of lower-cased words found in it.
        self.__id_to_word = {}

    def process_corpus(self, id, text):
        """Store the word set of ``text`` under ``id``."""
        self.__id_to_word[id] = self.parse_text_to_word(text)

    def search(self, query):
        """Return the ids of documents containing every word of ``query``."""
        wanted = self.parse_text_to_word(query)
        return [
            doc_id
            for doc_id, vocabulary in self.__id_to_word.items()
            if self.query_match(wanted, vocabulary)
        ]

    @staticmethod
    def parse_text_to_word(text):
        """Split ``text`` into a set of lower-cased words.

        Every character that is neither a word character nor a space is
        turned into a space first, so punctuation and newlines act as
        separators.
        """
        normalized = re.sub(r'[^\w ]', ' ', text).lower()
        # Splitting on a single space yields empty strings for runs of
        # spaces; those are dropped here.
        return {token for token in normalized.split(' ') if token}

    @staticmethod
    def query_match(query_words, words):
        """Return True when every item of ``query_words`` is in ``words``."""
        return all(query_word in words for query_word in query_words)
72+
73+
74+
# Reduce the amount of data that has to be searched per query
75+
class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words engine backed by an inverted index.

    ``inverted_index`` maps each word to the list of ids of the documents
    that contain it, in the order the documents were indexed.
    """

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        self.inverted_index = {}

    def process_corpus(self, id, text):
        """Append ``id`` to the postings list of every word in ``text``."""
        for word in self.parse_text_to_word(text):
            self.inverted_index.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of documents containing every word of ``query``.

        Results follow indexing order. A query with no words (empty or
        punctuation only) returns an empty list.
        """
        query_words = self.parse_text_to_word(query)
        # Without this guard there is nothing to intersect and the lookup
        # below would have no postings list to start from.
        if not query_words:
            return []

        postings = []
        for query_word in query_words:
            # A query word that is not in the index rules out every document.
            if query_word not in self.inverted_index:
                return []
            postings.append(self.inverted_index[query_word])

        # Intersect by membership instead of a sorted-list pointer merge:
        # postings are kept in indexing order, which is not necessarily
        # sorted by id, so comparing ids with min() can skip real matches.
        # Document ids must be hashable for the set lookups.
        postings.sort(key=len)
        candidates = postings[0]
        remaining = [set(doc_ids) for doc_ids in postings[1:]]
        return [
            doc_id
            for doc_id in candidates
            if all(doc_id in doc_ids for doc_ids in remaining)
        ]

    @staticmethod
    def parse_text_to_word(text):
        """Split ``text`` into a set of lower-cased words.

        Every character that is neither a word character nor a space is
        replaced by a space before splitting.
        """
        # Use regex to remove punctuation and newlines
        text = re.sub(r'[^\w ]', ' ', text)
        text = text.lower()
        # Splitting on a single space leaves empty strings for runs of
        # spaces; drop them.
        return {word for word in text.split(' ') if word}
134+
135+
136+
class LRUCache(object):
    """Small facade over a ``pylru.lrucache`` instance stored in ``self.cache``."""

    def __init__(self, size=2):
        # ``size`` is handed straight to pylru.lrucache.
        self.cache = pylru.lrucache(size)

    def has(self, key):
        """Return whether ``key`` is currently present in the cache."""
        present = key in self.cache
        return present

    def get(self, key):
        """Return the value stored under ``key`` (subscript lookup on the cache)."""
        value = self.cache[key]
        return value

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        self.cache[key] = value
148+
149+
150+
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that memoizes query results in an LRU cache."""

    def __init__(self):
        super(BOWInvertedIndexEngineWithCache, self).__init__()
        # The engine's __init__ chain ends in SearchEngineBase, which does
        # not call super(), so the cache must be initialised explicitly.
        LRUCache.__init__(self)

    def process_corpus(self, id, text):
        """Index the document, then drop all cached query results.

        Results cached before this call were computed without the new
        document and would otherwise be returned stale.
        """
        super(BOWInvertedIndexEngineWithCache, self).process_corpus(id, text)
        self.cache.clear()

    def search(self, query):
        """Return matching document ids, served from the cache when possible.

        The cache is keyed on the raw ``query`` string. Prints
        ``cache hit!`` when a cached result is used.
        """
        if self.has(query):
            print('cache hit!')
            # Return a copy so callers cannot mutate the cached list.
            return list(self.get(query))

        result = super(BOWInvertedIndexEngineWithCache, self).search(query)
        # Store a private copy for the same reason.
        self.set(query, list(result))

        return result
164+
165+
166+
def main(search_engine):
    """Index the four sample texts, then answer queries read from stdin.

    Each query is passed to ``search_engine.search`` and the number of
    results and the results themselves are printed. Entering ``q`` ends
    the loop.
    """
    corpus_paths = ["./text/1.txt", "./text/2.txt", "./text/3.txt", "./text/4.txt"]
    for corpus_path in corpus_paths:
        search_engine.add_corpus(corpus_path)

    def prompt():
        return input("Please input query:")

    # iter(callable, sentinel) keeps calling prompt() until it returns "q".
    for query in iter(prompt, "q"):
        matches = search_engine.search(query)
        print("found {} result(s):".format(len(matches)))

        for match in matches:
            print(match)
179+
180+
# if __name__ == "__main__":
181+
# search_engine = SimpleEngine()
182+
# main(search_engine)
183+
# search_engine = BOWEngine()
184+
# main(search_engine)
185+
# search_engine = BOWInvertedIndexEngine()
186+
# main(search_engine)
187+
# search_engine = BOWInvertedIndexEngineWithCache()
188+
# main(search_engine)

text/1.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 1.txt
2+
I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.
3+

text/2.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 2.txt
2+
I have a dream that one day down in Alabama, with its vicious racists, . . . one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers. I have a dream today.
3+

text/3.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 3.txt
2+
I have a dream that one day every valley shall be exalted, every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight, and the glory of the Lord shall be revealed, and all flesh shall see it together.
3+

text/4.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# 4.txt
2+
This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together.

0 commit comments

Comments
 (0)