-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathdata_preprocess.py
124 lines (102 loc) · 3.41 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import json
import os
import pickle
import sys
from collections import defaultdict
from tqdm import tqdm
# Cities for which a separate test split is dumped by the script entry point.
cities = [
    'Boston',
    'Chicago',
    'Los Angeles',
    'New York',
    'San Francisco',
]

# Every input and output file lives under this directory.
data_dir = 'data'

# Raw JSON-lines inputs.
train_raw_file = os.path.join(data_dir, 'train.json')
valid_raw_file = os.path.join(data_dir, 'valid.json')

# Pickled outputs produced by this script.
train_file = os.path.join(data_dir, 'train.pickle')
valid_file = os.path.join(data_dir, 'valid.pickle')
vocab_file = os.path.join(data_dir, 'vocab.pickle')
word_freq_file = os.path.join(data_dir, 'word-freq.pickle')

# Upper bound on vocabulary ids; build_vocabulary keeps the
# VOCAB_SIZE - 3 most frequent words and numbers them from 3.
VOCAB_SIZE = 40000
# Id substituted for words that are missing from the vocabulary.
UNK = 2
# Marker that separates sentences inside a review's 'Text' field.
SENT_DELIMITER = '|||'
def read_reviews(file_path):
    """Load every review from a JSON-lines file into memory.

    Each line of ``file_path`` is parsed as one JSON object. Only the
    fields used downstream are kept.

    Args:
        file_path: path of the JSON-lines file to read.

    Returns:
        A list of dicts with the keys ``'_id'``, ``'Text'``, ``'Photos'``
        and ``'Rating'``, where ``'Photos'`` is the list of ``'_id'``
        values of the review's photo entries.
    """
    loaded = []
    with open(file_path, 'r') as f:
        # tqdm only wraps the file iterator to display progress.
        for raw_line in tqdm(f):
            record = json.loads(raw_line)
            # Keep just the photo identifiers, not the full photo objects.
            photo_ids = [photo['_id'] for photo in record['Photos']]
            loaded.append({
                '_id': record['_id'],
                'Text': record['Text'],
                'Photos': photo_ids,
                'Rating': record['Rating'],
            })
    return loaded
def word_tokenize(text):
    """Yield the whitespace-separated words of ``text`` in order.

    The text is first cut into sentences at ``SENT_DELIMITER``; each
    sentence is then split on whitespace. Sentence boundaries are not
    reported to the caller.
    """
    sentences = text.split(SENT_DELIMITER)
    for sentence in sentences:
        tokens = sentence.split()
        for token in tokens:
            yield token
def build_word_freq():
    """Return the word-frequency table, computing it when no cache exists.

    If ``word_freq_file`` can be opened, the pickled table stored there is
    returned as-is. Otherwise words are counted over the raw training
    file and then the raw validation file, the result is pickled to
    ``word_freq_file``, and the freshly built table is returned.

    Returns:
        A mapping from word to its number of occurrences.
    """
    try:
        with open(word_freq_file, 'rb') as cache_f:
            cached = pickle.load(cache_f)
            print('word frequency loaded')
            return cached
    except IOError:
        # No readable cache: fall through and build the table.
        pass

    print('building word frequency')
    counts = defaultdict(int)
    # Training split first, then validation, so insertion order matches
    # the order in which words are first encountered across both files.
    for raw_file in (train_raw_file, valid_raw_file):
        for review in read_reviews(raw_file):
            for token in word_tokenize(review['Text']):
                counts[token] += 1

    with open(word_freq_file, 'wb') as out_f:
        pickle.dump(counts, out_f)
    return counts
def build_vocabulary():
    """Build the word -> id vocabulary and pickle it to ``vocab_file``.

    The ``VOCAB_SIZE - 3`` most frequent words (by the table from
    ``build_word_freq``) receive consecutive ids starting at 3, most
    frequent first. Nothing is returned; the mapping is only written to
    disk.
    """
    print('building vocabulary')
    counts = build_word_freq()

    # Sort by descending frequency, then keep the head of the ranking.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    top_words = ranked[:VOCAB_SIZE - 3]

    most = top_words[0]
    least = top_words[-1]
    print('most common word is %s which appears %d times' % (most[0], most[1]))
    print('less common word is %s which appears %d times' % (least[0], least[1]))

    # Ids begin at 3: 0-index is for padding, 2-index is for UNKNOWN word.
    vocab = {word: idx for idx, (word, _freq) in enumerate(top_words, 3)}

    with open(vocab_file, 'wb') as out_f:
        pickle.dump(vocab, out_f)
def load_vocabulary():
    """Load the pickled vocabulary stored at ``vocab_file``.

    Returns:
        The object unpickled from ``vocab_file`` (the word -> id mapping
        written by ``build_vocabulary``).

    Raises:
        SystemExit: with a non-zero status when ``vocab_file`` cannot be
            opened or read, after printing a short message.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only load
        # vocabulary files that were produced by this script.
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    except IOError:
        print('Can not load vocabulary')
        # A missing vocabulary is a failure, so signal it through the exit
        # status (the previous status 0 told the shell the run succeeded).
        sys.exit(1)
    print('Vocabulary loaded')
    return vocab
def dump_file(input_file, output_file):
    """Encode the reviews of ``input_file`` as word ids and pickle them.

    Nothing is done when ``output_file`` already exists. Otherwise each
    review is converted into a ``(text, photos, rating)`` tuple, where
    ``text`` is a list of sentences and every sentence is a list of
    vocabulary ids (``UNK`` for words missing from the vocabulary). The
    tuples are written with one ``pickle.dump`` call per review, so a
    reader has to call ``pickle.load`` repeatedly to get them back.

    Args:
        input_file: path of the raw JSON-lines review file.
        output_file: path of the pickle file to create.
    """
    if os.path.exists(output_file):
        print('%s is dumped already' % output_file)
        return

    vocab = load_vocabulary()
    print('start dumping %s into %s' % (input_file, output_file))

    # The context manager closes the handle even when an unexpected error
    # propagates; previously only the Ctrl-C path reached f.close().
    with open(output_file, 'wb') as f:
        try:
            for review in read_reviews(input_file):
                rating = review['Rating']
                photos = review['Photos']
                text = [
                    [vocab.get(word, UNK) for word in sent.split()]
                    for sent in review['Text'].split(SENT_DELIMITER)
                ]
                pickle.dump((text, photos, rating), f)
        except KeyboardInterrupt:
            # Deliberate best effort: Ctrl-C stops the dump early and
            # keeps whatever records were already written.
            pass
if __name__ == '__main__':
    # The vocabulary must exist before any split can be encoded.
    build_vocabulary()

    # Train and validation splits.
    for raw_path, dumped_path in ((train_raw_file, train_file),
                                  (valid_raw_file, valid_file)):
        dump_file(raw_path, dumped_path)

    # One test split per city, stored under <data_dir>/test/.
    for city in cities:
        city_raw = os.path.join(data_dir, 'test/{}_test.json'.format(city))
        city_dumped = os.path.join(data_dir, 'test/{}_test.pickle'.format(city))
        dump_file(city_raw, city_dumped)