## policy_field_visualisation.R (forked from EPINetz/EPINetz-Policy-Parser)
## Visualize Policy Field Contents ##
#####################################
{ # load all required packages in one block
  library(tidyverse)
  library(quanteda)
  library(quanteda.textstats)
  library(data.table)
  library(furrr)
  library(vroom)
  library(igraph)
  library(RandomWalkRestartMH)
  library(scales)
  library(ggraph)
}
source("utils_text_processing.R")
source("get_rwr_terms.R")
n_terms <- 20 # number of top terms to extract per timeframe and policy field
classification_measure <- "ScoreNormMean" # classification measure found in the classification results
drop_negative_pmi <- TRUE # should edges with negative PMI weights be dropped to help with graph layouting?
rescale_pmi <- TRUE # should the PMI weights in each network be rescaled to between 0.001 and 1?
# note that the PMI is only calculated between the top terms and therefore does not represent the true PMI
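# The classification_* settings used below (classification_replies, classification_mentions,
# classification_urls, classification_timeframe, classification_before_after) are expected
# to be defined by the classification setup. Example values (assumptions, not the
# project's actual configuration; adjust to your setup):
# classification_replies <- TRUE # should replies get classified?
# classification_mentions <- FALSE # should @-mentions be kept?
# classification_urls <- FALSE # should URLs be kept?
# classification_timeframe <- "week" # length of each classification timeframe
# classification_before_after <- 0 # additional timeframes before/after to include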
classification_result <- readRDS("init_classification/init_classified_tweets.RDS")
classification_NE <- get_latest_tokens_file(path = "Tokenizer",
                                            pattern = "tokens_full.csv.tar.gz") %>%
  vroom(col_types = list(doc_id = "c")) %>%
  filter_tokens(tokens_col = "lemma",
                tags = c("NN", "NE"), # common nouns (NN) and named entities (NE) only
                # minimum string length, stopword dictionaries, additional stopwords and lower casing set to defaults
                replies = classification_replies, # should replies get classified?
                keep_mentions = classification_mentions, # should @-mentions be kept?
                keep_urls = classification_urls) %>% # should URLs be kept?
  split_timeframes(datetime_var = "_source.created_at",
                   timeframe = classification_timeframe,
                   before_after = classification_before_after)
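# split_timeframes() yields a named list of token tables, one per timeframe;
# the list names are matched against the names of classification_result below.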
docs <- classification_result %>% # pull relevant doc_ids
  imap(\(classification_data, timeframe)
  {
    classification_data %>%
      .[["classified_documents"]] %>%
      distinct(doc_id) %>%
      left_join(classification_NE %>%
                  .[[as.character(timeframe)]] %>%
                  select(doc_id, lemma),
                join_by(doc_id))
  })
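# docs: for each timeframe, the classified doc_ids joined with the lemmas they contain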
top_terms <- classification_result %>%
  imap(\(result, timeframe)
  {
    result %>%
      .[["walk_terms"]] %>%
      filter(lemma %in% (docs %>% .[[as.character(timeframe)]] %>%
                           distinct(lemma) %>% pull(lemma))) %>%
      slice_max(order_by = !!as.name(classification_measure),
                n = n_terms,
                by = policy_field)
  })
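# top_terms: for each timeframe, the n_terms highest-scoring walk terms per policy
# field that actually occur in the classified documents of that timeframe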
# make policy field networks of the top terms, including their policy scores and PMI for documents in the target week
policy_networks <- top_terms %>%
  imap(\(terms, timeframe)
  {
    dat <- terms %>% left_join(docs %>% .[[as.character(timeframe)]],
                               by = join_by(lemma),
                               relationship = "many-to-many") # terms are duplicated over policy fields and over documents
    networks <- dat %>% as.data.table() %>% split(by = "policy_field") %>%
      map(\(group)
      {
        network <- group %>% # note that the PMI is only calculated between the top terms!
          calculate_network(vertex_a = "doc_id",
                            vertex_b = "lemma",
                            pmi_weight = TRUE,
                            as_data_frame = FALSE)
        if (drop_negative_pmi) { # drop edges with negative PMI weights (this keeps the nodes!)
          network <- delete_edges(network, E(network)[weight < 0])
        }
        if (rescale_pmi) {
          E(network)$weight <- rescale(E(network)$weight,
                                       to = c(0.001, 1)) # weights must be > 0 for layouting
        }
        # add policy field scores from the random walks to the graph
        scores <- tibble(lemma = V(network)$name) %>%
          left_join(group %>%
                      select(lemma,
                             !!as.name(classification_measure)) %>%
                      distinct(),
                    by = join_by(lemma))
        V(network)$score <- scores %>%
          pull(!!as.name(classification_measure))
        return(network)
      })
    return(networks)
  })
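# policy_networks: for each timeframe, a named list of igraph objects (one per policy
# field) with PMI edge weights and a "score" vertex attribute for the policy field score;
# quick sanity check, e.g.: summary(policy_networks[[1]][[1]])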
# all policy fields together in one graph (to see overlaps etc.)
field_networks <- top_terms %>%
  imap(\(terms, timeframe)
  {
    network <- terms %>%
      select(!seed_term) %>%
      rename(weight = !!as.name(classification_measure)) %>%
      graph_from_data_frame(directed = TRUE)
    # add policy field indicator to the policy field vertices
    network <- set_vertex_attr(graph = network,
                               name = "policy_field",
                               index = V(network)[name %in% terms$policy_field],
                               value = V(network)[name %in% terms$policy_field]$name)
    return(network)
  })
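# field_networks: one directed graph per timeframe linking policy fields to their top
# terms, with the classification score as edge weight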
# Visualize
timeframes <- names(classification_result)
timeframe <- timeframes[34] # specify a timeframe here
for (i in seq_along(policy_networks[[timeframe]])) {
  graph <- policy_networks[[timeframe]][[i]] %>%
    ggraph(layout = "fr") +
    geom_edge_fan(aes(edge_alpha = weight), color = "darkgrey") +
    geom_node_point(aes(size = score)) +
    geom_node_text(aes(label = name, size = score), repel = TRUE) +
    scale_size(range = c(3, 7)) +
    labs(title = timeframe,
         subtitle = paste("Policy Field:",
                          names(policy_networks[[timeframe]][i])),
         edge_alpha = "Normalized PMI",
         size = "Policy Score") +
    theme_graph()
  print(graph)
  readline(prompt = "Press [enter] to display the next policy field graph.")
}
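# Alternative to paging through plots interactively: save each plot to disk inside the
# loop, e.g. with ggsave(). A minimal sketch (the "plots/" output path is an assumption):
# dir.create("plots", showWarnings = FALSE)
# ggsave(file.path("plots", paste0(timeframe, "_",
#                                  names(policy_networks[[timeframe]][i]), ".png")),
#        plot = graph, width = 10, height = 8)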
field_networks[[timeframe]] %>%
  ggraph(layout = "fr") +
  geom_edge_fan(aes(edge_alpha = weight)) +
  geom_node_point(aes(color = policy_field)) +
  geom_node_text(aes(label = name, color = policy_field), repel = TRUE) +
  labs(title = timeframe,
       edge_alpha = "Policy Score") +
  theme_graph()