remove time requirement and formatting
Ben-Sacks committed Oct 31, 2024
1 parent 6789257 commit 3bdb838
Showing 2 changed files with 68 additions and 69 deletions.
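For context, the two files touched here implement back-to-back steps: clean_dyads() documents its input as the dataframe produced by read_dyads(). A minimal usage sketch, assuming the package is attached; the folder name is hypothetical:

# "my_transcripts" is a hypothetical folder of .txt/.csv transcripts
transcripts <- read_dyads(folder_name = "my_transcripts")
clean_tokens <- clean_dyads(transcripts, lemmatize = TRUE)
head(clean_tokens)  # one lemmatized word per row, stopwords removed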
74 changes: 41 additions & 33 deletions R/clean_dyads.R
@@ -5,49 +5,42 @@
#' @param dataframe produced from the read_dyads() function
#' @return dataframe with stopwords omitted, lemmatized words one per row
#' @importFrom dplyr select
#' @importFrom dplyr filter
#' @importFrom dplyr group_by
#' @importFrom dplyr consecutive_id
#' @importFrom dplyr ungroup
#' @importFrom dplyr rowwise
#' @importFrom dplyr mutate
#' @importFrom magrittr %>%
#' @importFrom textclean replace_contraction
#' @importFrom tm removeWords
#' @importFrom stringr str_squish
#' @importFrom tm stripWhitespace
#' @importFrom textstem lemmatize_strings
#' @importFrom stringr str_split_1
#' @importFrom stringr str_split
#' @importFrom stringi stri_remove_empty
#' @importFrom stringi stri_count_words
#' @importFrom tidyr separate_rows
#' @export clean_dyads

clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
# ADD LAPPLY TO RECOGNIZE DYADS WITH MORE THAN TWO INTERLOCUTORS AND THROW A WARNING WITH PROBLEM DYADS

default <- TRUE
# if not default and not proper file path (aside from default), throw an error
if (class(stop_words_df) != "character" | (!grepl(".csv", stop_words_df) & stop_words_df != "default")) {
stop("The argument 'stop_words_df' takes a string type, and should be a filepath leading to a csv file.")
}
# otherwise, if not default and also a csv path
else if (stop_words_df != "default") {
# read the filepath to a csv if given, can also supply a data frame
stop_words_df <- read.csv(stop_words_df)
default <- FALSE
# throw an error if there is no column called 'word'
if (all(grepl("word", colnames(stop_words_df), ignore.case = T) == FALSE)) {
stop("The given stopwords data frame must have a column called 'word' or 'Word")
}
colnames(stop_words_df) <- tolower(colnames(stop_words_df))
}

#set Event_ID and speaker names as factors
read_ts_df$Participant_ID <- as.factor(read_ts_df$Participant_ID) #convert variables to factor
read_ts_df$Event_ID <- as.factor(read_ts_df$Event_ID)

if (any(grepl("Time", colnames(read_ts_df), ignore.case = TRUE)) == TRUE){
#convert Time from hh:mm:ss or mm:ss to milliseconds
read_ts_df$Time <- sapply(read_ts_df$Time, function(x){
if (any(grepl(":", x)) == TRUE) { #checks for colons, indicative of mm:ss
x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement
if (length(x) == 2) { #shows just mm, ss
sum((x[1]*60000), (x[2]*1000))
}
else if (length(x) == 3) { #shows hh, mm, ss
sum((x[1]*3600000), (x[2]*60000), (x[3]*1000))
}}
else {
x
}
})
}


if(lemmatize==TRUE) {
clean <- function(x) {
x <- tolower(x) #to lower
@@ -59,7 +52,14 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- gsub("n't", " not", x) #replace contraction with full word not
x <- textclean::replace_contraction(x) #replace contractions
x <- gsub("-", " ", x) #replace all hyphens with spaces
x <- tm::removeWords(x, omissions_dyads23$word)

if (default == TRUE) {
x <- tm::removeWords(x, omissions_dyads23$word)
}
else {
x <- tm::removeWords(x, stop_words_df$word)
}

x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
x <- gsub("\\b[a]\\b{1}", " ", x)
@@ -68,7 +68,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- textstem::lemmatize_strings(x) #lemmatize
}
}

if(lemmatize==FALSE) {
clean <- function(x) {
x <- tolower(x) #to lower
@@ -80,28 +80,36 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- gsub("n't", " not", x) #replace contraction with full word not
x <- textclean::replace_contraction(x) #replace contractions
x <- gsub("-", " ", x) #replace all hyphens with spaces
x <- tm::removeWords(x, omissions_dyads23$word)

if (default == TRUE) {
x <- tm::removeWords(x, omissions_dyads23$word)
}
else {
x <- tm::removeWords(x, stop_words_df$word)
}

x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
x <- gsub("\\b[a]\\b{1}", " ", x)
x <- tm::stripWhitespace(x)
x <- stringr::str_squish(x)
}
}

read_ts_df$RawText <- stringr::str_squish(read_ts_df$RawText) #remove unneeded white space from text

dfclean <- read_ts_df %>%
dplyr::mutate(CleanText = clean(RawText)) %>% #run clean function on text, making a new column
dplyr::group_by(Event_ID) %>% # group by event to compute turn
dplyr::mutate(TurnCount = dplyr::consecutive_id(Participant_ID), .after = Participant_ID) %>% # create a turn count
dplyr::group_by(TurnCount, .add = TRUE) %>% #group and take word count by turn for both raw and clean utterances
dplyr::mutate(NWords_ByPersonTurn_RAW = stringi::stri_count_words(paste(RawText, collapse = " ")),
NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>%
dplyr::select(-RawText) %>% # remove the raw text column
dplyr::ungroup()

dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text

return(dfclean_sep)
}
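A sketch of how the stop-word branching added above would be exercised, assuming clean_dyads() exposes the stop_words_df argument its body references; the csv path and its contents are hypothetical, but per the code it must contain a 'word' column:

# transcripts: the data frame returned by read_dyads(), as in the sketch above
# default: the packaged omissions_dyads23 stop-word list is removed from the text
clean_default <- clean_dyads(transcripts, lemmatize = TRUE)

# custom: a hypothetical csv with a 'word' column supplies the stop words instead
clean_custom <- clean_dyads(transcripts, lemmatize = TRUE,
                            stop_words_df = "my_stopwords.csv")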
63 changes: 27 additions & 36 deletions R/read_dyads.R
@@ -6,13 +6,12 @@
#' @param folder_name folder of conversation transcripts in csv or txt format
#' @return a concatenated dataframe with each language transcript saved as a separate 'event_id'
#' @importFrom magrittr %>%
#' @importFrom dplyr select
#' @importFrom dplyr bind_rows
#' @export read_dyads

read_dyads <- function(folder_name = "my_transcripts") {
#defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file.

read_otter_transcript <- function(file_path) {
lines <- readLines(file_path) #read otter ai file
#removes otter ai watermark if it is present
@@ -22,15 +21,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
speaker <- character()
time <- character()
text <- character()

#process lines of dialogue
current_line <- 1
while (current_line <= num_lines) {

first_line_vec <- strsplit(lines[current_line], " ")
speaker_time <- first_line_vec[[1]]
speaker <- c(speaker, speaker_time[1]) #select speaker
# check if time is included in mm:ss format and if so add it to the time column
if (any(grepl(":", speaker_time)) == TRUE) {
timeadd <- grep(":", speaker_time)
}
@@ -55,15 +54,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
return(transcript_df)
}
#END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION

#DEFINE READ ME TXT FILE FUNCTION
read_dyads_txt <- function(folder_name){
if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending
file_names_txt <- list.files(path = folder_name, pattern = ".txt$", full.names = FALSE, recursive = TRUE)
file_names_txt <- gsub('.*/ ?(\\w+)', '\\1', file_names_txt)
file_names_txt <- gsub(".txt$", "", file_names_txt)

txtdata <- lapply(file_list_txt, function(x) {
#runs txt files names through otter reading function
xorf <- read_otter_transcript(x)
@@ -76,30 +75,33 @@ read_dyads <- function(folder_name = "my_transcripts") {
stop(paste("Unable to read transcript ", as.character(match(x, file_list_txt)), " as an otter transcript. Please refer to the ConversationAlign GitHub page for examples of properly formatted transcripts.", sep = ""), call. = FALSE)
}
})

#adds a doc id column to each transcript based on its name attribute
txtdata <- lapply(file_names_txt, function(x){
txtdata[[match(x, file_names_txt)]] <- cbind(Event_ID = rep(x, nrow(txtdata[[match(x, file_names_txt)]])), txtdata[[match(x, file_names_txt)]])})
return(txtdata)
}}
#END DEFINE READ ME TXT FILE FUNCTION

#DEFINE READ ME CSV FILE FUNCTION
read_dyads_csv <- function(folder_name) {
if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE)
file_names_csv <- list.files(path = folder_name, pattern = ".csv$", full.names = FALSE, recursive = TRUE)
if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".",
full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_csv <- list.files(path = folder_name, pattern = "*.csv$",
full.names = TRUE, recursive = TRUE)
file_names_csv <- list.files(path = folder_name, pattern = ".csv$",
full.names = FALSE, recursive = TRUE)
file_names_csv<- gsub('.*/ ?(\\w+)', '\\1', file_names_csv)
file_names_csv <- gsub(".csv$", "", file_names_csv)

#creates a list of read in csv data frames
csvdata <- lapply(file_list_csv, function(x){
x_read_csv <- read.csv(x, header = TRUE)
#identify and remove all columns that are entirely NA or empty strings
sum_nas_es <- apply(x_read_csv, 2, function(y){sum(is.na(y) | y == "")})
x_read_csv <- x_read_csv[sum_nas_es < nrow(x_read_csv)]
x_read_csv <- data.frame(x_read_csv)

#check that the column names are correct
if ((any(grepl("^speaker$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^speaker_names_raw$", colnames(x_read_csv), ignore.case = T)) == TRUE |
@@ -113,16 +115,7 @@ read_dyads <- function(folder_name = "my_transcripts") {
(any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) {

#tests for column named Time - if not present it adds a Time column filled with NAs
if (any(grepl("^Time*", colnames(x_read_csv), ignore.case = TRUE)) |
any(grepl("^start*", colnames(x_read_csv), ignore.case = TRUE))) {
colnames(x_read_csv)[which(grepl("^Time*", colnames(x_read_csv), ignore.case = T) |
grepl("^start*", colnames(x_read_csv), ignore.case = T))] <- "Time"
}
else {
x_read_csv$Time <- rep(NA, nrow(x_read_csv)) #if no Time col fill with NA
}

#correct the speaker and text names to our conventions
colnames(x_read_csv)[which(grepl("speaker", colnames(x_read_csv), ignore.case = T) |
grepl("PID", colnames(x_read_csv), ignore.case = T) |
@@ -132,25 +125,23 @@ read_dyads <- function(folder_name = "my_transcripts") {
grepl("partner", colnames(x_read_csv), ignore.case = T) |
grepl("source", colnames(x_read_csv), ignore.case = T) |
grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID"

colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) |
grepl("utterance", colnames(x_read_csv), ignore.case = T))] <- "RawText"

x_read_csv <- data.frame(x_read_csv)
x_final <- x_read_csv
}

col_check <- x_read_csv[, colnames(x_read_csv) %in%
c("Participant_ID", "RawText", "Time")]
if (ncol(col_check) != 3) { #if there are less than three columns
col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText"))

if (col_check != 2) { #if both required columns are not present
stop(paste("Function is unable to process csv transcript ", #error stating missing column
as.character(match(x, file_list_csv)), #also states the transcript
" correctly. Make sure that each transcript includes a column marking who is producing text in each row.
When you are preparing your language transcripts, please make sure this column is named one of the following: 'interlocutor', 'person', 'partner', 'source', 'speaker', 'participant', 'PID', or 'speaker_names_raw'.
Please make sure your column containing raw language transcriptions is named 'utterance', 'turn', or 'text'.
If you wish to include a Time column please title it 'Time' or 'start'.",
sep = ""), call. = FALSE)

Please make sure your column containing raw language transcriptions is named 'utterance', 'turn', or 'text'.", sep = ""), call. = FALSE)

}
x_final
})
@@ -164,12 +155,12 @@ read_dyads <- function(folder_name = "my_transcripts") {
txtlist <- read_dyads_txt(folder_name)
csvlist <- read_dyads_csv(folder_name)
all_list <- append(txtlist, csvlist) #append the two lists into one list

#throws an error if no files are found
if (length(all_list) == 0) {
stop("No files found. Please make sure you are providing the local or absolute file path to the desired folder as a character vector. At least one .csv or .txt file must be present.")
}

alldf <- dplyr::bind_rows(all_list) #binds the rows of each list into one data frame
alldf$Event_ID <- as.factor(alldf$Event_ID)
alldf$Participant_ID <- as.factor(alldf$Participant_ID)
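As a worked example of what dropping the time requirement means for inputs, a sketch of a minimal csv transcript that read_dyads() should now accept with no Time/start column at all; the file name and contents are hypothetical, and the speaker and text column names follow the conventions listed in the error message above:

# hypothetical two-speaker transcript; no Time column is supplied
dir.create("my_transcripts", showWarnings = FALSE)
example <- data.frame(
  speaker = c("A", "B", "A"),
  text = c("Hello there, how are you?",
           "Doing well, thanks for asking.",
           "Glad to hear it.")
)
write.csv(example, "my_transcripts/example_dyad.csv", row.names = FALSE)
transcripts <- read_dyads("my_transcripts")  # speaker/text get mapped to Participant_ID/RawText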
