# utils.R
# Copyright (C) 2016 Aghiles Salah. All rights reserved.
# License: Apache 2.0 License.
# Normalized Mutual Information
# Scores the agreement between two labelings of the same observations as
#   I(K; L) / sqrt(H(K) * H(L))
# computed from their contingency table with natural logarithms.
#
# Args:
#   row_cluster: vector or factor of cluster assignments.
#   true_labels: vector or factor of reference labels, same length as
#                row_cluster.
#   n:           number of observations used to turn counts into
#                proportions; defaults to length(row_cluster).
#
# Returns:
#   An unnamed numeric scalar. When n is the number of observations and one
#   of the labelings has a single group, its entropy is zero and the result
#   is NaN (0 / 0).
NMI <- function(row_cluster, true_labels, n = length(row_cluster)) {
  # Plain integer contingency matrix: rows = clusters, columns = labels.
  counts <- unclass(table(row_cluster, true_labels))
  row_tot <- rowSums(counts)
  col_tot <- colSums(counts)

  # Only non-empty cells contribute. Masking them also skips the -Inf entries
  # that unused factor levels (all-zero rows/columns) produce through log(0).
  filled <- counts > 0
  cell <- counts[filled]
  log_expected <- outer(log(row_tot), log(col_tot), "+")
  mutual_info <- sum((cell / n) * (log(n) + log(cell) - log_expected[filled]))

  # sum(p * log(p)) over the non-empty groups, i.e. the negated entropy.
  # Empty groups are dropped so that 0 * log(0) never poisons the sum.
  neg_entropy <- function(totals) {
    p <- totals[totals > 0] / n
    sum(p * log(p))
  }

  # The two negated entropies have the same sign, so their product is >= 0.
  mutual_info / sqrt(neg_entropy(row_tot) * neg_entropy(col_tot))
}
# Partition generator
# Draws random cluster assignments: each of n items receives a label sampled
# uniformly, with replacement, from 1..k.
#
# Args:
#   n:      number of items in each partition.
#   k:      number of clusters (labels are drawn from 1..k).
#   nb_par: number of partitions to generate (default 1).
#
# Returns:
#   An integer vector of length n when nb_par is not greater than 1;
#   otherwise an nb_par x n integer matrix with one partition per row and
#   no dimnames.
par_gen <- function(n, k, nb_par = 1) {
  if (nb_par > 1) {
    # One draw for all partitions, laid out row by row, instead of growing
    # the matrix with rbind() inside a loop.
    labels <- sample.int(k, n * nb_par, replace = TRUE)
    return(matrix(labels, nrow = nb_par, ncol = n, byrow = TRUE))
  }
  sample.int(k, n, replace = TRUE)
}
# TF-IDF data representation
# Re-weights a document-term count matrix (rows = documents, columns =
# terms) so that entry (i, j) becomes
#   tf[i, j] * (1 + log(1 + n_docs) - log(1 + df[j]))
# where df[j] is the number of rows with a positive entry in column j.
#
# Args:
#   sparse_mat: numeric matrix of term counts; entries are assumed to be
#               non-negative. Typically a sparse matrix from the Matrix
#               package.
#   l2_norm:    if TRUE, scale every row to unit Euclidean length.
#
# Returns:
#   The weighted matrix coerced to class "dgCMatrix".
tf_idf <- function(sparse_mat, l2_norm = FALSE) {
  n_docs <- nrow(sparse_mat)
  # Document frequency per term, counted directly rather than through a
  # binarised copy of the whole matrix.
  doc_freq <- colSums(sparse_mat > 0)
  idf <- 1 + log(1 + n_docs) - log(1 + doc_freq)

  # t(t(m) * v) scales column j of m by v[j].
  tfidf_mat <- t(t(sparse_mat) * idf)

  if (l2_norm) {
    row_norms <- sqrt(rowSums(tfidf_mat * tfidf_mat))
    # All-zero rows (empty documents) stay zero instead of becoming 0 / 0.
    row_norms[which(row_norms == 0)] <- 1
    tfidf_mat <- tfidf_mat / row_norms
  }

  as(tfidf_mat, "dgCMatrix")
}