remove time requirement and formatting
Ben-Sacks committed Oct 31, 2024
1 parent 6789257 commit 3bdb838
Showing 2 changed files with 68 additions and 69 deletions.
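For context, the two files touched here implement back-to-back steps: clean_dyads() documents its input as the dataframe produced by read_dyads(). A minimal usage sketch, assuming the package is attached; the folder name is hypothetical:

# "my_transcripts" is a hypothetical folder of .txt/.csv transcripts
transcripts <- read_dyads(folder_name = "my_transcripts")
clean_tokens <- clean_dyads(transcripts, lemmatize = TRUE)
head(clean_tokens)  # one lemmatized word per row, stopwords removed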
74 changes: 41 additions & 33 deletions R/clean_dyads.R
@@ -5,49 +5,42 @@
#' @param dataframe produced from the read_dyads() function
#' @return dataframe with stopwords omitted, lemmatized words one per row
#' @importFrom dplyr select
#' @importFrom dplyr filter
#' @importFrom dplyr group_by
#' @importFrom dplyr consecutive_id
#' @importFrom dplyr ungroup
#' @importFrom dplyr rowwise
#' @importFrom dplyr mutate
#' @importFrom magrittr %>%
#' @importFrom textclean replace_contraction
#' @importFrom tm removeWords
#' @importFrom stringr str_squish
#' @importFrom tm stripWhitespace
#' @importFrom textstem lemmatize_strings
#' @importFrom stringr str_split_1
#' @importFrom stringr str_split
#' @importFrom stringi stri_remove_empty
#' @importFrom stringi stri_count_words
#' @importFrom tidyr separate_rows
#' @export clean_dyads

clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
# ADD LAPPLY TO RECOGNIZE DYADS WITH MORE THAN TWO INTERLOCUTORS AND THROW A WARNING WITH PROBLEM DYADS

default <- TRUE
# if not default and not proper file path (aside from default), throw an error
if (class(stop_words_df) != "character" | (!grepl(".csv", stop_words_df) & stop_words_df != "default")) {
stop("The argument 'stop_words_df' takes a string type, and should be a filepath leading to a csv file.")
}
# otherwise, if not default and also a csv path
else if (stop_words_df != "default") {
# read the filepath to a csv if given, can also supply a data frame
stop_words_df <- read.csv(stop_words_df)
default <- FALSE
# throw an error if there is no column called 'word'
if (all(grepl("word", colnames(stop_words_df), ignore.case = T) == FALSE)) {
stop("The given stopwords data frame must have a column called 'word' or 'Word")
}
colnames(stop_words_df) <- tolower(colnames(stop_words_df))
}

#set Event_ID and speaker names as factors
read_ts_df$Participant_ID <- as.factor(read_ts_df$Participant_ID) #convert variables to factor
read_ts_df$Event_ID <- as.factor(read_ts_df$Event_ID)

if (any(grepl("Time", colnames(read_ts_df), ignore.case = TRUE)) == TRUE){
#convert Time from hh:mm:ss or mm:ss to milliseconds
read_ts_df$Time <- sapply(read_ts_df$Time, function(x){
if (any(grepl(":", x)) == TRUE) { #checks for colons, indicative of mm:ss
x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement
if (length(x) == 2) { #shows just mm, ss
sum((x[1]*60000), (x[2]*1000))
}
else if (length(x) == 3) { #shows hh, mm, ss
sum((x[1]*3600000), (x[2]*60000), (x[3]*1000))
}}
else {
x
}
})
}


if(lemmatize==TRUE) {
clean <- function(x) {
x <- tolower(x) #to lower
@@ -59,7 +52,14 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- gsub("n't", " not", x) #replace contraction with full word not
x <- textclean::replace_contraction(x) #replace contractions
x <- gsub("-", " ", x) #replace all hyphens with spaces
x <- tm::removeWords(x, omissions_dyads23$word)

if (default == TRUE) {
x <- tm::removeWords(x, omissions_dyads23$word)
}
else {
x <- tm::removeWords(x, stop_words_df$word)
}

x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
x <- gsub("\\b[a]\\b{1}", " ", x)
@@ -68,7 +68,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- textstem::lemmatize_strings(x) #lemmatize
}
}

if(lemmatize==FALSE) {
clean <- function(x) {
x <- tolower(x) #to lower
@@ -80,28 +80,36 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- gsub("n't", " not", x) #replace contraction with full word not
x <- textclean::replace_contraction(x) #replace contractions
x <- gsub("-", " ", x) #replace all hyphens with spaces
x <- tm::removeWords(x, omissions_dyads23$word)

if (default == TRUE) {
x <- tm::removeWords(x, omissions_dyads23$word)
}
else {
x <- tm::removeWords(x, stop_words_df$word)
}

x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
x <- gsub("\\b[a]\\b{1}", " ", x)
x <- tm::stripWhitespace(x)
x <- stringr::str_squish(x)
}
}

read_ts_df$RawText <- stringr::str_squish(read_ts_df$RawText) #remove unneeded white space from text

dfclean <- read_ts_df %>%
dplyr::mutate(CleanText = clean(RawText)) %>% #run clean function on text, making a new column
dplyr::group_by(Event_ID) %>% # group by event to compute turn
dplyr::mutate(TurnCount = dplyr::consecutive_id(Participant_ID), .after = Participant_ID) %>% # create a turn count
dplyr::group_by(TurnCount, .add = TRUE) %>% #group and take word count by turn for both raw and clean utterances
dplyr::mutate(NWords_ByPersonTurn_RAW = stringi::stri_count_words(paste(RawText, collapse = " ")),
NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>%
dplyr::select(-RawText) %>% # remove the raw text column
dplyr::ungroup()

dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text

return(dfclean_sep)
}
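A sketch of how the stop-word branching added above would be exercised, assuming clean_dyads() exposes the stop_words_df argument its body references; the csv path and its contents are hypothetical, but per the code it must contain a 'word' column:

# transcripts: the data frame returned by read_dyads(), as in the sketch above
# default: the packaged omissions_dyads23 stop-word list is removed from the text
clean_default <- clean_dyads(transcripts, lemmatize = TRUE)

# custom: a hypothetical csv with a 'word' column supplies the stop words instead
clean_custom <- clean_dyads(transcripts, lemmatize = TRUE,
                            stop_words_df = "my_stopwords.csv")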
63 changes: 27 additions & 36 deletions R/read_dyads.R
@@ -6,13 +6,12 @@
#' @param folder_name folder of conversation transcripts in csv or txt format
#' @return a concatenated dataframe with each language transcript saved as a separate 'event_id'
#' @importFrom magrittr %>%
#' @importFrom dplyr select
#' @importFrom dplyr bind_rows
#' @export read_dyads

read_dyads <- function(folder_name = "my_transcripts") {
#defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file.

read_otter_transcript <- function(file_path) {
lines <- readLines(file_path) #read otter ai file
#removes otter ai watermark if it is present
@@ -22,15 +21,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
speaker <- character()
time <- character()
text <- character()

#process lines of dialogue
current_line <- 1
while (current_line <= num_lines) {

first_line_vec <- strsplit(lines[current_line], " ")
speaker_time <- first_line_vec[[1]]
speaker <- c(speaker, speaker_time[1]) #select speaker
# check if time is included in mm:ss format and if so add it to the time column
if (any(grepl(":", speaker_time)) == TRUE) {
timeadd <- grep(":", speaker_time)
}
@@ -55,15 +54,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
return(transcript_df)
}
#END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION

#DEFINE READ ME TXT FILE FUNCTION
read_dyads_txt <- function(folder_name){
if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending
file_names_txt <- list.files(path = folder_name, pattern = ".txt$", full.names = FALSE, recursive = TRUE)
file_names_txt <- gsub('.*/ ?(\\w+)', '\\1', file_names_txt)
file_names_txt <- gsub(".txt$", "", file_names_txt)

txtdata <- lapply(file_list_txt, function(x) {
#runs txt files names through otter reading function
xorf <- read_otter_transcript(x)
@@ -76,30 +75,33 @@ read_dyads <- function(folder_name = "my_transcripts") {
stop(paste("Unable to read transcript ", as.character(match(x, file_list_txt)), " as an otter transcript. Please refer to the ConversationAlign GitHub page for examples of properly formatted transcripts.", sep = ""), call. = FALSE)
}
})

#adds a doc id column to each transcript based on its name attribute
txtdata <- lapply(file_names_txt, function(x){
txtdata[[match(x, file_names_txt)]] <- cbind(Event_ID = rep(x, nrow(txtdata[[match(x, file_names_txt)]])), txtdata[[match(x, file_names_txt)]])})
return(txtdata)
}}
#END DEFINE READ ME TXT FILE FUNCTION

#DEFINE READ ME CSV FILE FUNCTION
read_dyads_csv <- function(folder_name) {
if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE)
file_names_csv <- list.files(path = folder_name, pattern = ".csv$", full.names = FALSE, recursive = TRUE)
if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".",
full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_csv <- list.files(path = folder_name, pattern = "*.csv$",
full.names = TRUE, recursive = TRUE)
file_names_csv <- list.files(path = folder_name, pattern = ".csv$",
full.names = FALSE, recursive = TRUE)
file_names_csv<- gsub('.*/ ?(\\w+)', '\\1', file_names_csv)
file_names_csv <- gsub(".csv$", "", file_names_csv)

#creates a list of read in csv data frames
csvdata <- lapply(file_list_csv, function(x){
x_read_csv <- read.csv(x, header = TRUE)
#identify and remove all columns that are entirely NA or empty strings
sum_nas_es <- apply(x_read_csv, 2, function(y){sum(is.na(y) | y == "")})
x_read_csv <- x_read_csv[sum_nas_es < nrow(x_read_csv)]
x_read_csv <- data.frame(x_read_csv)

#check that the column names are correct
if ((any(grepl("^speaker$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^speaker_names_raw$", colnames(x_read_csv), ignore.case = T)) == TRUE |
@@ -113,16 +115,7 @@ read_dyads <- function(folder_name = "my_transcripts") {
(any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) {

#tests for column named Time - if not present it adds a Time column filled with NAs
if (any(grepl("^Time*", colnames(x_read_csv), ignore.case = TRUE)) |
any(grepl("^start*", colnames(x_read_csv), ignore.case = TRUE))) {
colnames(x_read_csv)[which(grepl("^Time*", colnames(x_read_csv), ignore.case = T) |
grepl("^start*", colnames(x_read_csv), ignore.case = T))] <- "Time"
}
else {
x_read_csv$Time <- rep(NA, nrow(x_read_csv)) #if no Time col fill with NA
}

#correct the speaker and text names to our conventions
colnames(x_read_csv)[which(grepl("speaker", colnames(x_read_csv), ignore.case = T) |
grepl("PID", colnames(x_read_csv), ignore.case = T) |
@@ -132,25 +125,23 @@ read_dyads <- function(folder_name = "my_transcripts") {
grepl("partner", colnames(x_read_csv), ignore.case = T) |
grepl("source", colnames(x_read_csv), ignore.case = T) |
grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID"

colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) |
grepl("utterance", colnames(x_read_csv), ignore.case = T))] <- "RawText"

x_read_csv <- data.frame(x_read_csv)
x_final <- x_read_csv
}

col_check <- x_read_csv[, colnames(x_read_csv) %in%
c("Participant_ID", "RawText", "Time")]
if (ncol(col_check) != 3) { #if there are less than three columns
col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText"))

if (col_check != 2) { #if both required columns are not present
stop(paste("Function is unable to process csv transcript ", #error stating missing column
as.character(match(x, file_list_csv)), #also states the transcript
" correctly. Make sure that each transcript includes a column marking who is producing text in each row.
When you are preparing your language transcripts, please make sure this column is named one of the following: 'interlocutor', 'person', 'partner', 'source', 'speaker', 'participant', 'PID', or 'speaker_names_raw'.
Please make sure your column containing raw language transcriptions is named 'utterance', 'turn', or 'text'.
If you wish to include a Time column please title it 'Time' or 'start'.",
sep = ""), call. = FALSE)

Please make sure your column containing raw language transcriptions is named 'utterance', 'turn', or 'text'.", sep = ""), call. = FALSE)

}
x_final
})
@@ -164,12 +155,12 @@ read_dyads <- function(folder_name = "my_transcripts") {
txtlist <- read_dyads_txt(folder_name)
csvlist <- read_dyads_csv(folder_name)
all_list <- append(txtlist, csvlist) #append the two lists into one list

#throws an error if no files are found
if (length(all_list) == 0) {
stop("No files found. Please make sure you are providing the local or absolute file path to the desired folder as a character vector. At least one .csv or .txt file must be present.")
}

alldf <- dplyr::bind_rows(all_list) #binds the rows of each list into one data frame
alldf$Event_ID <- as.factor(alldf$Event_ID)
alldf$Participant_ID <- as.factor(alldf$Participant_ID)
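As a worked example of what dropping the time requirement means for inputs, a sketch of a minimal csv transcript that read_dyads() should now accept with no Time/start column at all; the file name and contents are hypothetical, and the speaker and text column names follow the conventions listed in the error message above:

# hypothetical two-speaker transcript; no Time column is supplied
dir.create("my_transcripts", showWarnings = FALSE)
example <- data.frame(
  speaker = c("A", "B", "A"),
  text = c("Hello there, how are you?",
           "Doing well, thanks for asking.",
           "Glad to hear it.")
)
write.csv(example, "my_transcripts/example_dyad.csv", row.names = FALSE)
transcripts <- read_dyads("my_transcripts")  # speaker/text get mapped to Participant_ID/RawText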
