-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollection.py
59 lines (51 loc) · 1.73 KB
/
collection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from useful_functions import *
from boolean_search import boolean_search
from math import log, sqrt
class Collection:
    """Searchable document collection backed by an inverted index.

    Attributes:
        docs: sized iterable of hashable document identifiers.
        vocabulary: the collection vocabulary; stored for callers, not read
            by any scoring method of this class.
        index: inverted index mapping ``word -> {doc: term frequency}``.
    """

    def __init__(self, docs, vocabulary, index):
        self.docs = docs
        self.vocabulary = vocabulary
        self.index = index

    def boolean_search(self, query):
        """Run the module-level ``boolean_search`` on this collection."""
        return boolean_search(self.docs, self.index, query)

    def vector_search(self, query):
        """Rank documents against ``query`` by tf-idf cosine similarity.

        Based on first course in RIW, slide 171.

        The query is tokenized and normalized with ``custom_tokenize`` and
        ``lower_and_remove_common``; the resulting terms are then scored.

        Returns:
            A list of ``(doc, score)`` tuples, best match first, holding only
            the documents whose score is strictly positive.
        """
        terms = lower_and_remove_common(custom_tokenize(query))
        return self._rank(terms)

    def _rank(self, terms):
        """Score already-normalized query ``terms`` and return ranked results."""
        # One accumulator per quantity: the dot products must never share a
        # dict with the norms, otherwise each update corrupts the other.
        scores = {doc: 0 for doc in self.docs}
        query_norm_sq = 0
        # Each distinct term counts once (query weight = idf); repeating a
        # word in the query must not push the cosine above 1.
        for word in dict.fromkeys(terms):
            w_q = self.get_idf(word)
            query_norm_sq += w_q ** 2
            for doc in self.index.get(word, ()):
                scores[doc] += self.get_w(doc, word) * w_q

        matched = [doc for doc, score in scores.items() if score > 0]
        if not matched:
            return []

        doc_norms_sq = self._squared_doc_norms(matched)
        query_norm = sqrt(query_norm_sq)
        results = [
            (doc, scores[doc] / (query_norm * sqrt(doc_norms_sq[doc])))
            for doc in matched
        ]
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def _squared_doc_norms(self, candidates):
        """Return ``{doc: squared Euclidean norm of its tf-idf vector}``.

        The norm covers every indexed term of the document, not just the
        query terms, so the whole index is scanned once per call.
        """
        norms_sq = dict.fromkeys(candidates, 0)
        for word, postings in self.index.items():
            for doc in postings:
                if doc in norms_sq:
                    norms_sq[doc] += self.get_w(doc, word) ** 2
        return norms_sq

    def get_log_tf(self, doc, word):
        """Return ``1 + log10(tf)`` for ``word`` in ``doc``, or 0 if absent."""
        tf = self.index.get(word, {}).get(doc, 0)
        if tf <= 0:
            # Missing posting (or a non-positive count, which has no log).
            return 0
        return 1 + log(tf, 10)

    def get_idf(self, word):
        """Return ``log10(N / df)`` for ``word``, or 0 if it is not indexed.

        ``N`` is the number of documents in the collection and ``df`` the
        number of documents whose posting list contains ``word``.
        """
        postings = self.index.get(word)
        if not postings:
            # Unknown word or empty posting list: no discriminating power.
            return 0
        # NOTE(review): N is the document count (len(self.docs)), not the
        # vocabulary size; assumes self.docs is a sized container of doc ids.
        return log(len(self.docs) / len(postings), 10)

    def get_w(self, doc, word):
        """Return the tf-idf weight of ``word`` in ``doc``."""
        return self.get_log_tf(doc, word) * self.get_idf(word)