-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtask 3.R
107 lines (93 loc) · 4.29 KB
/
task 3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Alexander Vlasblom
# Prepare model data (functions)
# 2015-07-16
# split list of n-grams in predictor-tresponse tuples. List item n is a data frame of n-grams.
# The response is the final word in the n-gram, the predictor is formed
# Returns the list of data frames with columns xgram (for n > 1) and predict, the predictor word(s) and response word.
splitNgrams <- function(ng) {
dmessage("splitNgrams")
for (n in 1:length(ng)) {
if (class(ng[[n]]) == "data.frame") {
if (n == 1) {
colnames(ng[[n]])[which(colnames(ng[[n]]) == "ngram")] <- "predict"
}
else {
xgram.pattern <- paste0("^(\\w+( +\\w+){", n-2, "}) +\\w+ *$")
ng[[n]][,"xgram"] <- sub(xgram.pattern, "\\1", ng[[n]][,"ngram"])
predict.pattern <- "^.* +(\\w+) *$"
ng[[n]][,"predict"] <- sub(predict.pattern, "\\1", ng[[n]][,"ngram"])
}
}
}
dprint(str(ng))
ng
}
# Replace control and punctuation characters in an input text by space. Single quotes are removed.
# Returns cleaned input text.
cleanInput <- function(inputText) {
dmessage("cleanInput")
# remove single quotes, don't ==> dont
inputText <- gsub("'", "", tolower(inputText), fixed=TRUE)
# remove non-word charactters
charsToClean <- c("[:cntrl:]", "[:punct:]")
for (i in 1:length(charsToClean))
inputText <- gsub(paste0("[", charsToClean[i], "]"), " ", inputText, fixed=FALSE)
inputText
}
# Break an input text in x-grams, i.e. the last n-1 words of the corresponding n-grams.
# Returns list of x-grams.
makeXgrams <- function(inputText, ngramRange=1:4) {
dmessage("makeXgrams")
inputText <- cleanInput(inputText)
xgramRange <- (ngramRange - 1)
xgramRange <- xgramRange[xgramRange > 0]
xgrams <- lapply(xgramRange, function(n) {
if (n > 0) {
# xgram regex matches 1 .. n-1 words
xgram <- regmatches(inputText, regexec(paste0("^.*\\b(", paste(rep("\\w+ +", n-1), collapse=""), "\\w+) *$"), inputText))
dmessage("xgram: ", xgram)
# drop first element, which contains the entire inputText, keeping only the n words part
xgram[[1]][-1]
}
else character(0)
})
xgrams
}
# Predict the next words for all x-grams in an input text, based on a given set of n-grams, n in range 1:4
# Returns best candidates for the next word, as a data frame with columns predict (the predicted word) and total.freq (the observed
# total nr of occurrences of the n-grams predicting the best candidates)
predictN <- function(inputText, ngrams, ngramRange=1:4, order="freq", reorderThreshold=0) {
# inputText = predictor text
# ngrams = ordered list of ngrams, n-th item in list is data frame of n-grams (columns ngram and total.freq)
dmessage("predictN")
# Predict using longest ending ngram, if no match fall back to longest ngram minus 1, etc.
# If 2-gram fails, predict single word with highest probability in directionary.
xgrams <- makeXgrams(inputText, ngramRange)
dprint(xgrams)
# apply xgrams in reverse order (highest n first) to find potential candidates
candidates <- ldply(length(xgrams):1, function(x) {
# lookup x in n+1-grams
ng <- ngrams[[x+1]]
if (order=="ml") ng <- ng[order(-ng$ml),]
cands <- head(ng[ng$xgram == xgrams[[x]], c("predict", "freq", "ml")])
cands$n <- rep(x+1, nrow(cands))
cands
})
# if no candidates found using higher-level ngrams, use 1-gram
if (nrow(candidates) == 0) {
candidates <- head(ngrams[[1]])
candidates$n <- 1
}
# remove duplicates
# because of the data frame's descending order of frequency,
# duplicated() will remove the lower frequency duplicated items
candidates[,"dup"] <- ifelse(duplicated(candidates$predict), TRUE, FALSE)
# reorder by global frequency
# if digits(max(total.freq)) for highest n-gram is within threshold digits(max(total.freq)) for lower order n-grams
freq.max <- sapply(2:3, function(n) max(candidates[candidates$n == n, "freq"]))
if (min(nchar(freq.max)) + reorderThreshold >= max(nchar(freq.max)))
candidates <- candidates[order(-candidates$freq),]
dprint(str(candidates))
dprint(candidates)
candidates
}