Revert "added new split summarize, removed time addition in read"

This reverts commit 89185c8.
Reilly-ConceptsCognitionLab · Oct 30, 2024 · 6789257 · 6789257
1 parent 89185c8
commit 6789257
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 667 deletions.
diff --git a/R/align_dyads.R b/R/align_dyads.R
@@ -26,7 +26,6 @@
 #' @export align_dyads
 
 align_dyads <- function(clean_ts_df) {
-  load("data/lookup_db.rda")
   #allow the user to select what variables they want to align, or provide their own database(s) and subset them
   myvars <- select.list(c("aff_anger", "aff_anxiety", "aff_boredom",  "aff_closeness",
                           "aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy",
@@ -43,20 +42,20 @@ align_dyads <- function(clean_ts_df) {
     dplyr::select(matches("^word$"), tidyselect::contains(myvars))
   #create variable containing the column names of each variable to be aligned
   var_aligners <- colnames(var_selected)[-grep("^word$", colnames(lookup_db), ignore.case = TRUE)]
-  
+
   #join measures of each variable to each word in each transcript
   df_aligned <- dplyr::left_join(clean_ts_df, var_selected, by = c("CleanText" = "word"), multiple = "first")
-  
+
   # index rows where a word could not be aligned on all dimensions and remove that word from the clean text
   df_aligned$CleanText[which(rowSums(is.na(df_aligned[,which(colnames(df_aligned) %in% var_aligners)])) == length(var_aligners))] <- ""
   # important to note here that the row is preserved, the text is subbed for an empty string
-  
+
   # group on event and turn, then take the number of words in that turn which could be aligned on
   df_aligned_wordcount <- df_aligned %>%
     dplyr::group_by(Event_ID, TurnCount) %>%
     dplyr::mutate(NWords_ByPersonTurn_CLEAN_ALIGNED = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>%
     dplyr::ungroup()
-  
+
   #add an exchange count variable, which is one turn from each interlocutor
   df_aligned_wordcount$ExchangeCount <- ceiling(df_aligned_wordcount$TurnCount / 2)
   #rearrange the columns to be more readable

diff --git a/R/clean_dyads.R b/R/clean_dyads.R
@@ -25,29 +25,29 @@
 #' @export clean_dyads
 
 clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
-  load("data/omissions_dyads23.rda")
-  default <- TRUE
-
-  # if not default and not proper file path (aside from default), throw an error
-  if (class(stop_words_df) != "character" | (!grepl(".csv", stop_words_df) & stop_words_df != "default")) {
-    stop("The argument 'stop_words_df' takes a string type, and should be a filepath leading to a csv file.")
-  }
-  # otherwise, if not default and also a csv path
-  else if (stop_words_df != "default") {
-    # read the filepath to a csv if given, can also supply a data frame
-    stop_words_df <- read.csv(stop_words_df)
-    default <- FALSE
-    # throw an error if there is no column called 'word'
-    if (all(grepl("word", colnames(stop_words_df), ignore.case = T) == FALSE)) {
-      stop("The given stopwords data frame must have a column called 'word' or 'Word")
-    }
-    colnames(stop_words_df) <- tolower(colnames(stop_words_df))
-  }
-
+  # ADD LAPPLY TO RECOGNIZE DYADS WITH MORE THAT TWO INTERLOCUTORS AND THROWS A WARNING WITH PROBLEM DYADS
+
   #set Event_ID  and speaker names as factors
   read_ts_df$Participant_ID <- as.factor(read_ts_df$Participant_ID) #convert variables to factor
   read_ts_df$Event_ID <- as.factor(read_ts_df$Event_ID)
-
+
+  if (any(grepl("Time", colnames(read_ts_df), ignore.case = TRUE)) == TRUE){
+    #convert Time from hh:mm:ss or mm:ss to milliseconds
+    read_ts_df$Time <- sapply(read_ts_df$Time, function(x){
+      if (any(grepl(":", x)) == TRUE) {  #checks for colons, indicative of mm:ss
+        x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement
+        if (length(x) == 2) { #shows just mm, ss
+          sum((x[1]*60000), (x[2]*1000))
+        }
+        else if ( length(xvec) == 3) { #shows hh, mm, ss
+          sum((x[1]*3600000), (x[2]*60000), (x[3]*1000))
+        }}
+      else {
+        x
+      }
+    })
+  }
+
   if(lemmatize==TRUE) {
     clean <- function(x) {
       x <- tolower(x) #to lower
@@ -59,14 +59,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
       x <- gsub("n't", " not", x) #replace contraction with full word not
       x <- textclean::replace_contraction(x) #replace contractions
       x <- gsub("-", " ", x) #replace all hyphens with spaces
-
-      if (default == TRUE) {
-        x <- tm::removeWords(x, omissions_dyads23$word)
-      }
-      else {
-        x <- tm::removeWords(x, stop_words_df$word)
-      }
-
+      x <- tm::removeWords(x, omissions_dyads23$word)
       x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
       x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
       x <- gsub("\\b[a]\\b{1}", " ", x)
@@ -75,7 +68,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
       x <- textstem::lemmatize_strings(x) #lemmatize
     }
   }
-  
+
   if(lemmatize==FALSE) {
     clean <- function(x) {
       x <- tolower(x) #to lower
@@ -87,36 +80,28 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
       x <- gsub("n't", " not", x) #replace contraction with full word not
       x <- textclean::replace_contraction(x) #replace contractions
       x <- gsub("-", " ", x) #replace all hyphens with spaces
-
-      if (default == TRUE) {
-        x <- tm::removeWords(x, omissions_dyads23$word)
-      }
-      else {
-        x <- tm::removeWords(x, stop_words_df$word)
-      }
-
+      x <- tm::removeWords(x, omissions_dyads23$word)
       x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
       x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
       x <- gsub("\\b[a]\\b{1}", " ", x)
       x <- tm::stripWhitespace(x)
       x <- stringr::str_squish(x)
     }
   }
-  
+
   read_ts_df$RawText <- stringr::str_squish(read_ts_df$RawText) #remove unneeded white space from text
-  
+
   dfclean <- read_ts_df %>%
     dplyr::mutate(CleanText = clean(RawText)) %>%  #run clean function on text, making a new column
     dplyr::group_by(Event_ID) %>% # group by event to compute turn
     dplyr::mutate(TurnCount = dplyr::consecutive_id(Participant_ID), .after = Participant_ID) %>% # create a turn count
     dplyr::group_by(TurnCount, .add = TRUE) %>% #group and take word count by turn for both raw and clean utterances
     dplyr::mutate(NWords_ByPersonTurn_RAW = stringi::stri_count_words(paste(RawText, collapse = " ")),
-                  NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText,
-                                                                              collapse = " "))) %>%
+                  NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>%
     dplyr::select(-RawText) %>% # remove the raw text column
     dplyr::ungroup()
-  
+
   dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text
-  
+
   return(dfclean_sep)
 }
diff --git a/R/read_dyads.R b/R/read_dyads.R
@@ -12,7 +12,7 @@
 
 read_dyads <- function(folder_name = "my_transcripts") {
   #defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file.
-  
+
   read_otter_transcript <- function(file_path) {
     lines <- readLines(file_path) #read otter ai file
     #removes otter ai watermark if it is present
@@ -22,15 +22,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
     speaker <- character()
     time <- character()
     text <- character()
-    
+
     #process lines of dialogue
     current_line <- 1
     while (current_line <= num_lines) {
 
       first_line_vec <- strsplit(lines[current_line], " ")
       speaker_time <- first_line_vec[[1]]
       speaker <- c(speaker, speaker_time[1]) #select speaker
-      # check if time is included in mm:ss format and if so add it to the time column
+    # check if time is included in mm:ss format and if so add it to the time column
       if (any(grepl(":", speaker_time)) == TRUE) {
         timeadd <- grep(":", speaker_time)
       } 
@@ -55,15 +55,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
     return(transcript_df)
   }
   #END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION
-  
-  #DEFINE READ ME TXT FILE FUNCTION
+
+  #DEFINE READ ME TXT FILE FUNCITON
   read_dyads_txt <- function(folder_name){
     if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
       file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending
       file_names_txt <- list.files(path = folder_name, pattern = ".txt$", full.names = FALSE, recursive = TRUE)
       file_names_txt <- gsub('.*/ ?(\\w+)', '\\1', file_names_txt)
       file_names_txt <- gsub(".txt$", "", file_names_txt)
-      
+
       txtdata <- lapply(file_list_txt, function(x) {
         #runs txt files names through otter reading function
         xorf <- read_otter_transcript(x)
@@ -76,33 +76,30 @@ read_dyads <- function(folder_name = "my_transcripts") {
           stop(paste("Unable to read transcript ", as.character(match(x, file_list_txt)), " as an otter transcript. Please refer to the ConversationAlign GitHub page for examples of properly formatted transcripts.", sep = ""), call. = FALSE)
         }
       })
-      
+
       #adds a doc id column to each transcript based on its name attribute
       txtdata <- lapply(file_names_txt, function(x){
         txtdata[[match(x, file_names_txt)]] <- cbind(Event_ID = rep(x, nrow(txtdata[[match(x, file_names_txt)]])), txtdata[[match(x, file_names_txt)]])})
       return(txtdata)
     }}
   #END DEFINE READ ME TXT FILE FUNCITON
-  
+
   #DEFINE READ ME CSV FILE FUNCTION
   read_dyads_csv <- function(folder_name) {
-    if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".",
-                                       full.names = TRUE, recursive = TRUE))) == TRUE) {
-      file_list_csv <- list.files(path = folder_name, pattern = "*.csv$",
-                                  full.names = TRUE, recursive = TRUE)
-      file_names_csv <- list.files(path = folder_name, pattern = ".csv$",
-                                   full.names = FALSE, recursive = TRUE)
+    if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
+      file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE)
+      file_names_csv <- list.files(path = folder_name, pattern = ".csv$", full.names = FALSE, recursive = TRUE)
       file_names_csv<- gsub('.*/ ?(\\w+)', '\\1', file_names_csv)
       file_names_csv <- gsub(".csv$", "", file_names_csv)
-      
+
       #creates a list of read in csv data frames
       csvdata <- lapply(file_list_csv, function(x){
         x_read_csv <- read.csv(x, header = TRUE)
         #identify and remove all columns that are entirely NA or empty strings
         sum_nas_es <- apply(x_read_csv, 2, function(y){sum(is.na(y) | y == "")})
         x_read_csv <- x_read_csv[sum_nas_es < nrow(x_read_csv)]
         x_read_csv <- data.frame(x_read_csv)
-        
+
         #check that the column names are correct
         if ((any(grepl("^speaker$", colnames(x_read_csv), ignore.case = T)) == TRUE |
              any(grepl("^speaker_names_raw$", colnames(x_read_csv), ignore.case = T)) == TRUE |
@@ -116,7 +113,16 @@ read_dyads <- function(folder_name = "my_transcripts") {
             (any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE |
              any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE |
              any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) {
-
+
+          #tests for column named Time - if not present it adds a Time column filled with NAs
+          if (any(grepl("^Time*", colnames(x_read_csv), ignore.case = TRUE)) |
+              any(grepl("^start*", colnames(x_read_csv), ignore.case = TRUE))) {
+            colnames(x_read_csv)[which(grepl("^Time*", colnames(x_read_csv), ignore.case = T) |
+                                         grepl("^start*", colnames(x_read_csv), ignore.case = T))] <- "Time"
+          }
+          else {
+            x_read_csv$Time <- rep(NA, nrow(x_read_csv)) #if no Time col fill with NA
+          }
           #correct the speaker and text names to our conventions
           colnames(x_read_csv)[which(grepl("speaker", colnames(x_read_csv), ignore.case = T) |
                                        grepl("PID", colnames(x_read_csv), ignore.case = T) |
@@ -126,25 +132,25 @@ read_dyads <- function(folder_name = "my_transcripts") {
                                        grepl("partner", colnames(x_read_csv), ignore.case = T) |
                                        grepl("source", colnames(x_read_csv), ignore.case = T) |
                                        grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID"
-          
+
           colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) |
                                        grepl("utterance", colnames(x_read_csv), ignore.case = T))] <- "RawText"
-          
+
           x_read_csv <- data.frame(x_read_csv)
           x_final <- x_read_csv
         }
-        
-        col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText"))
-
-        if (col_check != 2) { #if there are less than three columns
+
+        col_check <- x_read_csv[, colnames(x_read_csv) %in%
+                                  c("Participant_ID", "RawText", "Time")]
+        if (ncol(col_check) != 3) { #if there are less than three columns
           stop(paste("Function is unable to process csv transcript ", #error stating missing column
                      as.character(match(x, file_list_csv)), #also states the transcript
                      " correctly. Make sure that each transcript includes a column marking who is producing text in each row.
                      When you are preparing your language transcripts, please make sure this column is named one of the following: 'interlocutor', 'person', 'partner', 'source', 'speaker', 'participant', 'PID', or 'speaker_names_raw'.
                      Please make sure your a column containing raw language transcriptions is named 'utterance', 'turn', or 'text'.
                      If you wish to include a Time column please title it 'Time' or 'start'.",
                      sep = ""), call. = FALSE)
-          
+
         }
         x_final
       })
@@ -158,12 +164,12 @@ read_dyads <- function(folder_name = "my_transcripts") {
   txtlist <- read_dyads_txt(folder_name)
   csvlist <- read_dyads_csv(folder_name)
   all_list <- append(txtlist, csvlist) #append the two lists into one list
-  
+
   #throws an error if no files are found
   if (length(all_list) == 0) {
     stop("No files found. Please make sure you are providing the local or absolute file path to the desired folder as a character vector. At least one .csv or .txt file must be present.")
   }
-  
+
   alldf <- dplyr::bind_rows(all_list) #binds the rows  of each list into one data frame
   alldf$Event_ID <- as.factor(alldf$Event_ID)
   alldf$Participant_ID <- as.factor(alldf$Participant_ID)