potter2vec.py
import time
import tensorflow as tf
import numpy as np
import helper
import random
import os
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
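# ----- data -----
# The repo's helper module (not shown in this file) is assumed to load the
# book texts, tokenize/normalize them, build the word<->id lookups, and
# subsample very frequent words.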
corpus = helper.load_books()
corpus = helper.preprocess(corpus)
vocab_to_int, int_to_vocab = helper.create_dict(corpus)
encoded_corpus = [vocab_to_int[word] for word in corpus]
sampled_encoded_corpus = helper.sub_sampling(encoded_corpus)
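# For reference only: a minimal sketch of Mikolov-style subsampling in the
# spirit of helper.sub_sampling (the actual helper is not shown here, so the
# threshold and exact formula below are assumptions). Each word w is dropped
# with probability 1 - sqrt(t / f(w)), so very frequent words are thinned out.
def _sketch_sub_sampling(int_words, threshold=1e-5):
    from collections import Counter
    counts = Counter(int_words)
    total = len(int_words)
    p_drop = {w: 1 - np.sqrt(threshold * total / c) for w, c in counts.items()}
    return [w for w in int_words if random.random() > p_drop[w]]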
vocab_size = len(vocab_to_int)
embedding_size = 300
n_sample = 100
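# ----- model -----
# Skip-gram: each centre word is looked up in the embedding matrix and trained
# to predict the words that appear around it in the corpus.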
inputs = tf.placeholder(tf.int32, [None], name='inputs')
targets = tf.placeholder(tf.int32, [None, None], name='targets')
embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1), name='embedding_matrix')  # uniform init in [-1, 1)
embed = tf.nn.embedding_lookup(embedding, inputs, name='embedding_lookup')
tf.summary.histogram('embedding_matrix', embedding)
output_w = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=0.1), name='output_w')
output_b = tf.Variable(tf.zeros(vocab_size), name='output_b')
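# Sampled softmax: score the true context word against only n_sample randomly
# drawn negative classes per step, instead of a full softmax over the whole
# vocabulary, which keeps the output layer cheap to train.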
loss = tf.nn.sampled_softmax_loss(output_w, output_b, targets, embed, n_sample, vocab_size, name='sampled_loss')
cost = tf.reduce_mean(loss, name='cost')
optimizer = tf.train.AdamOptimizer().minimize(cost)
tf.summary.scalar('cost', cost)
merged_summaries = tf.summary.merge_all()
writer = tf.summary.FileWriter('summaries/run_1')
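# The cost scalar and embedding histogram can be monitored during training
# with `tensorboard --logdir summaries`.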
# ----- testing -----
valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window), valid_size // 2))
valid_examples = np.append(valid_examples,
random.sample(range(1000, 1000 + valid_window), valid_size // 2))
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# We use the cosine distance:
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
normalized_embedding = embedding / norm
valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))
saver = tf.train.Saver()
epochs = 25
batch_size = 500
window_size = 10
viz_words = 500
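# ----- training / visualization -----
# For reference only: a minimal sketch of a skip-gram batch generator in the
# style helper.get_batches is assumed to follow (the repo's actual helper is
# not shown here, so the window-sampling details below are assumptions).
def _sketch_get_batches(words, batch_size, window_size=5):
    n_batches = len(words) // batch_size
    words = words[:n_batches * batch_size]  # drop the ragged tail
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx + batch_size]
        for ii in range(len(batch)):
            # pick a random context radius in [1, window_size], Mikolov-style
            r = random.randint(1, window_size)
            start = max(ii - r, 0)
            context = batch[start:ii] + batch[ii + 1:ii + r + 1]
            x.extend([batch[ii]] * len(context))  # centre word, repeated
            y.extend(context)                     # its context words
        yield x, y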
# Restore a previously trained model and visualize the embeddings with t-SNE.
# TF checkpoints are written as several files (.index/.data), so check via
# latest_checkpoint rather than for a literal 'potter2vec.ckpt' path.
if tf.train.latest_checkpoint('checkpoints'):
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
        embed_mat = sess.run(embedding)
        tsne = TSNE()
        embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])
        fig, ax = plt.subplots(figsize=(14, 14))
        for idx in range(viz_words):
            plt.scatter(*embed_tsne[idx, :], color='steelblue')
            plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
        plt.show()
else:
    with tf.Session() as sess:
        iteration = 1
        loss = 0
        sess.run(tf.global_variables_initializer())
        writer.add_graph(sess.graph)
        for e in range(1, epochs + 1):
            batches = helper.get_batches(sampled_encoded_corpus, batch_size, window_size)
            start = time.time()
            for x, y in batches:
                feed = {inputs: x,
                        targets: np.array(y)[:, None]}
                train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
                loss += train_loss

                if iteration % 100 == 0:
                    end = time.time()
                    # write the merged summaries so the cost curve shows up in TensorBoard
                    summary = sess.run(merged_summaries, feed_dict=feed)
                    writer.add_summary(summary, iteration)
                    print("Epoch {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(loss / 100),
                          "{:.4f} sec/batch".format((end - start) / 100))
                    loss = 0
                    start = time.time()

                if iteration % 500 == 0:
                    # note that this is expensive (~20% slowdown if computed every 500 steps)
                    sim = similarity.eval()
                    for i in range(valid_size):
                        valid_word = int_to_vocab[valid_examples[i]]
                        top_k = 8  # number of nearest neighbors
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        log = 'Nearest to %s:' % valid_word
                        for k in range(top_k):
                            close_word = int_to_vocab[nearest[k]]
                            log = '%s %s,' % (log, close_word)
                        print(log)

                iteration += 1

        # make sure the checkpoint directory exists before saving
        if not os.path.exists('checkpoints'):
            os.makedirs('checkpoints')
        save_path = saver.save(sess, "checkpoints/potter2vec.ckpt")
        embed_mat = sess.run(normalized_embedding)
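# After training, re-running the script (now that a checkpoint exists) takes
# the visualization branch above and produces the t-SNE plot of the embeddings.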