-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexation_of_cs276.py
57 lines (46 loc) · 1.88 KB
/
indexation_of_cs276.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
This file gathers functions used to create an index on CS276 collection.
"""
import os
from useful_functions import *
# This function creates 2 dictionaries to bind documents and documentIDs.
# They are returned in this order: first { docID : doc }, then { doc : docID }.
# The { doc : docID } one is mandatory for the index creation, the { docID : doc } one is useful to answer queries.
def make_doc_id_to_doc(directory):
    """Assign a numeric docID to every file of the collection.

    The files are looked up in the ten sub-directories named ``0`` to ``9``
    located under *directory*. IDs start at 0 and grow by one per file.

    Args:
        directory: Path prefix of the collection. It is concatenated as-is
            with the sub-directory name, so it has to end with a path
            separator (e.g. ``"CS276/pa1-data/"``).

    Returns:
        A tuple ``(doc_id_dict, doc_dict)`` where ``doc_id_dict`` maps
        docID -> document path and ``doc_dict`` maps document path -> docID.

    Raises:
        OSError: If one of the ten sub-directories cannot be listed.
    """
    doc_id_dict = {}
    doc_dict = {}
    for block in range(10):
        block_dir = directory + str(block)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # docID assignment reproducible from one run or machine to the next.
        for filename in sorted(os.listdir(block_dir)):
            path = block_dir + '/' + filename
            # The next free ID is the number of documents registered so far.
            doc_id = len(doc_id_dict)
            doc_id_dict[doc_id] = path
            doc_dict[path] = doc_id
    return doc_id_dict, doc_dict
# This function creates the index for the CS276 collection.
# Its structure is like this : { term : { docID : number of occurrences of the term in the doc referenced by docID } }.
def make_dictionary(directory):
    """Build the term index of the collection located under *directory*.

    Args:
        directory: Path prefix of the collection, forwarded unchanged to
            ``make_doc_id_to_doc``.

    Returns:
        A tuple ``(doc_id_dict, dictionary)``. ``doc_id_dict`` is the first
        mapping returned by ``make_doc_id_to_doc``; ``dictionary`` has the
        shape ``{term: {docID: count}}`` where ``count`` is the number of
        times the term was produced for that document.
    """
    dictionary = {}
    doc_id_dict, doc_dict = make_doc_id_to_doc(directory)
    # doc_dict already maps every document path to its docID, so walk it
    # directly instead of listing the sub-directories a second time.
    for path, doc_id in doc_dict.items():
        # The context manager closes the file handle as soon as it is read.
        with open(path) as document:
            # NOTE(review): only the first line of each file is indexed;
            # this presumes one document per line -- confirm.
            words = document.readline()
        tokens = custom_tokenize(words)
        for word in lower_and_remove_common(tokens):
            postings = dictionary.setdefault(word, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1
    return doc_id_dict, dictionary
"""
Main method for execution
"""
def main():
    """Index the "CS276/pa1-data/" collection and print five index entries."""
    collection_root = "CS276/pa1-data/"
    _, index = make_dictionary(collection_root)
    entries = list(index.items())
    print(entries[:5])


# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()