-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlda.jl
52 lines (42 loc) · 991 Bytes
/
lda.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
using TopicModels
using Iterators
using StatsBase
# function readDocs(docStream, wordStream)
# wordLines = readlines(wordStream)
# docCounts = counts([parse(x) for x in readlines(docStream)])
# dws =
# [map(dw -> dw[2], Iterators.groupby(dw -> dw[1], zip(docLines, wordLines)))
# end
# Load the corpus and vocabulary from local files via TopicModels' readers.
# NOTE(review): assumes files "ldac" (LDA-C format docs) and "vocab" exist in
# the working directory — the open() handles are never explicitly closed.
docs = readDocuments(open("ldac"))
# Array{String}(...) is pre-1.0 Julia conversion syntax — presumably coerces
# the lexicon to a Vector{String}; on Julia >= 1.0 this constructor form no
# longer accepts a vector (would need convert or broadcasting) — TODO confirm
# the intended Julia version.
lex = Array{String}(readLexicon(open("vocab")))
# The seeded counts in init() below ([3 0; 0 3] etc.) hard-code two topics.
numTopics = 2
# Build a fresh model over the corpus, seeded to a fixed, hand-chosen state
# so that every trial starts from identical sufficient statistics.
function init()
    # Symmetric prior weights, one per topic.
    alpha = fill(1.0, numTopics)
    # deepcopy so the shared global `docs` is never mutated by training.
    corpus = Corpus(deepcopy(docs))
    m = Model(alpha, 1.0, length(lex), corpus)
    # Seed the sampler state: token assignments mirror the corpus layout,
    # and the count matrices are pinned to a known configuration.
    m.assignments = deepcopy(docs)
    m.topics = [3 0; 0 3]
    m.documentSums = [2 1; 1 2]
    m.topicSums = [3.0, 3.0]
    return m
end
# Run many independent trials: reset to the seeded state, train for a single
# iteration, and record which topic token (doc 1, word 1) is assigned to.
n = 100000
v = zeros(Int64, n)
m = init()
for trial = 1:n
    m = init()                      # restart from the identical seeded state
    trainModel(m, 1)                # single training iteration
    v[trial] = m.assignments[1][1]  # assignment of the first token of doc 1
end
# BUG FIX: the original printed `topWords` here before it was ever assigned
# (it is only computed further down the script), so this block always threw
# UndefVarError. Compute the top words from the current model first.
topWords = topTopicWords(m, lex, 7)
for z in 1:numTopics
    # Print each of topic z's top words on its own line.
    for w in topWords[z]
        println(w)
    end
    println("--------------")  # separator between topics
end
# BUG FIX: the original called trainModel/topTopicWords on `model`, a name
# that is never defined anywhere in this script — the model built and trained
# above is bound to `m`. Using `model` threw UndefVarError.
trainModel(m, 2)                     # two further training iterations
topWords = topTopicWords(m, lex, 7)  # top 7 words for each topic
for z in 1:numTopics
    for w in topWords[z]
        println(w)
    end
    println("--------------")  # separator between topics
end