From 678925737318bc932a29c2df0daa961a138355ca Mon Sep 17 00:00:00 2001 From: Ben Sacks Date: Wed, 30 Oct 2024 16:36:41 -0400 Subject: [PATCH] Revert "added new split summarize, removed time addition in read" This reverts commit 89185c8f674229112f7c8b2e36f40b87213806f0. --- R/align_dyads.R | 9 +- R/clean_dyads.R | 71 +++----- R/read_dyads.R | 58 ++++--- R/summarize_dyads_auc.R | 347 -------------------------------------- R/summarize_dyads_covar.R | 246 --------------------------- 5 files changed, 64 insertions(+), 667 deletions(-) delete mode 100644 R/summarize_dyads_auc.R delete mode 100644 R/summarize_dyads_covar.R diff --git a/R/align_dyads.R b/R/align_dyads.R index e3396f5..5e6433b 100644 --- a/R/align_dyads.R +++ b/R/align_dyads.R @@ -26,7 +26,6 @@ #' @export align_dyads align_dyads <- function(clean_ts_df) { - load("data/lookup_db.rda") #allow the user to select what variables they want to align, or provide their own database(s) and subset them myvars <- select.list(c("aff_anger", "aff_anxiety", "aff_boredom", "aff_closeness", "aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy", @@ -43,20 +42,20 @@ align_dyads <- function(clean_ts_df) { dplyr::select(matches("^word$"), tidyselect::contains(myvars)) #create variable containing the column names of each variable to be aligned var_aligners <- colnames(var_selected)[-grep("^word$", colnames(lookup_db), ignore.case = TRUE)] - + #join measures of each variable to each word in each transcript df_aligned <- dplyr::left_join(clean_ts_df, var_selected, by = c("CleanText" = "word"), multiple = "first") - + # index rows where a word could not be aligned on all dimensions and remove that word from the clean text df_aligned$CleanText[which(rowSums(is.na(df_aligned[,which(colnames(df_aligned) %in% var_aligners)])) == length(var_aligners))] <- "" # important to note here that the row is preserved, the text is subbed for an empty string - + # group on event and turn, then take the number of words in that turn which could be aligned on df_aligned_wordcount <- df_aligned %>% dplyr::group_by(Event_ID, TurnCount) %>% dplyr::mutate(NWords_ByPersonTurn_CLEAN_ALIGNED = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>% dplyr::ungroup() - + #add an exchange count variable, which is one turn from each interlocutor df_aligned_wordcount$ExchangeCount <- ceiling(df_aligned_wordcount$TurnCount / 2) #rearrange the columns to be more readable diff --git a/R/clean_dyads.R b/R/clean_dyads.R index e109348..0adc730 100644 --- a/R/clean_dyads.R +++ b/R/clean_dyads.R @@ -25,29 +25,29 @@ #' @export clean_dyads clean_dyads <- function(read_ts_df, lemmatize=TRUE) { - load("data/omissions_dyads23.rda") - default <- TRUE - - # if not default and not proper file path (aside from default), throw an error - if (class(stop_words_df) != "character" | (!grepl(".csv", stop_words_df) & stop_words_df != "default")) { - stop("The argument 'stop_words_df' takes a string type, and should be a filepath leading to a csv file.") - } - # otherwise, if not default and also a csv path - else if (stop_words_df != "default") { - # read the filepath to a csv if given, can also supply a data frame - stop_words_df <- read.csv(stop_words_df) - default <- FALSE - # throw an error if there is no column called 'word' - if (all(grepl("word", colnames(stop_words_df), ignore.case = T) == FALSE)) { - stop("The given stopwords data frame must have a column called 'word' or 'Word") - } - colnames(stop_words_df) <- tolower(colnames(stop_words_df)) - } - + # ADD LAPPLY TO RECOGNIZE DYADS WITH MORE THAT TWO INTERLOCUTORS AND THROWS A WARNING WITH PROBLEM DYADS + #set Event_ID and speaker names as factors read_ts_df$Participant_ID <- as.factor(read_ts_df$Participant_ID) #convert variables to factor read_ts_df$Event_ID <- as.factor(read_ts_df$Event_ID) - + + if (any(grepl("Time", colnames(read_ts_df), ignore.case = TRUE)) == TRUE){ + #convert Time from hh:mm:ss or mm:ss to milliseconds + read_ts_df$Time <- sapply(read_ts_df$Time, function(x){ + if (any(grepl(":", x)) == TRUE) { #checks for colons, indicative of mm:ss + x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement + if (length(x) == 2) { #shows just mm, ss + sum((x[1]*60000), (x[2]*1000)) + } + else if ( length(xvec) == 3) { #shows hh, mm, ss + sum((x[1]*3600000), (x[2]*60000), (x[3]*1000)) + }} + else { + x + } + }) + } + if(lemmatize==TRUE) { clean <- function(x) { x <- tolower(x) #to lower @@ -59,14 +59,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) { x <- gsub("n't", " not", x) #replace contraction with full word not x <- textclean::replace_contraction(x) #replace contractions x <- gsub("-", " ", x) #replace all hyphens with spaces - - if (default == TRUE) { - x <- tm::removeWords(x, omissions_dyads23$word) - } - else { - x <- tm::removeWords(x, stop_words_df$word) - } - + x <- tm::removeWords(x, omissions_dyads23$word) x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters x <- gsub("\\b[a]\\b{1}", " ", x) @@ -75,7 +68,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) { x <- textstem::lemmatize_strings(x) #lemmatize } } - + if(lemmatize==FALSE) { clean <- function(x) { x <- tolower(x) #to lower @@ -87,14 +80,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) { x <- gsub("n't", " not", x) #replace contraction with full word not x <- textclean::replace_contraction(x) #replace contractions x <- gsub("-", " ", x) #replace all hyphens with spaces - - if (default == TRUE) { - x <- tm::removeWords(x, omissions_dyads23$word) - } - else { - x <- tm::removeWords(x, stop_words_df$word) - } - + x <- tm::removeWords(x, omissions_dyads23$word) x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters x <- gsub("\\b[a]\\b{1}", " ", x) @@ -102,21 +88,20 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) { x <- stringr::str_squish(x) } } - + read_ts_df$RawText <- stringr::str_squish(read_ts_df$RawText) #remove unneeded white space from text - + dfclean <- read_ts_df %>% dplyr::mutate(CleanText = clean(RawText)) %>% #run clean function on text, making a new column dplyr::group_by(Event_ID) %>% # group by event to compute turn dplyr::mutate(TurnCount = dplyr::consecutive_id(Participant_ID), .after = Participant_ID) %>% # create a turn count dplyr::group_by(TurnCount, .add = TRUE) %>% #group and take word count by turn for both raw and clean utterances dplyr::mutate(NWords_ByPersonTurn_RAW = stringi::stri_count_words(paste(RawText, collapse = " ")), - NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText, - collapse = " "))) %>% + NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>% dplyr::select(-RawText) %>% # remove the raw text column dplyr::ungroup() - + dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text - + return(dfclean_sep) } diff --git a/R/read_dyads.R b/R/read_dyads.R index 3a36d84..d8ce504 100644 --- a/R/read_dyads.R +++ b/R/read_dyads.R @@ -12,7 +12,7 @@ read_dyads <- function(folder_name = "my_transcripts") { #defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file. - + read_otter_transcript <- function(file_path) { lines <- readLines(file_path) #read otter ai file #removes otter ai watermark if it is present @@ -22,7 +22,7 @@ read_dyads <- function(folder_name = "my_transcripts") { speaker <- character() time <- character() text <- character() - + #process lines of dialogue current_line <- 1 while (current_line <= num_lines) { @@ -30,7 +30,7 @@ read_dyads <- function(folder_name = "my_transcripts") { first_line_vec <- strsplit(lines[current_line], " ") speaker_time <- first_line_vec[[1]] speaker <- c(speaker, speaker_time[1]) #select speaker - # check if time is included in mm:ss format and if so add it to the time column + # check if time is included in mm:ss format and if so add it to the time column if (any(grepl(":", speaker_time)) == TRUE) { timeadd <- grep(":", speaker_time) } @@ -55,15 +55,15 @@ read_dyads <- function(folder_name = "my_transcripts") { return(transcript_df) } #END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION - - #DEFINE READ ME TXT FILE FUNCTION + + #DEFINE READ ME TXT FILE FUNCITON read_dyads_txt <- function(folder_name){ if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) { file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending file_names_txt <- list.files(path = folder_name, pattern = ".txt$", full.names = FALSE, recursive = TRUE) file_names_txt <- gsub('.*/ ?(\\w+)', '\\1', file_names_txt) file_names_txt <- gsub(".txt$", "", file_names_txt) - + txtdata <- lapply(file_list_txt, function(x) { #runs txt files names through otter reading function xorf <- read_otter_transcript(x) @@ -76,25 +76,22 @@ read_dyads <- function(folder_name = "my_transcripts") { stop(paste("Unable to read transcript ", as.character(match(x, file_list_txt)), " as an otter transcript. Please refer to the ConversationAlign GitHub page for examples of properly formatted transcripts.", sep = ""), call. = FALSE) } }) - + #adds a doc id column to each transcript based on its name attribute txtdata <- lapply(file_names_txt, function(x){ txtdata[[match(x, file_names_txt)]] <- cbind(Event_ID = rep(x, nrow(txtdata[[match(x, file_names_txt)]])), txtdata[[match(x, file_names_txt)]])}) return(txtdata) }} #END DEFINE READ ME TXT FILE FUNCITON - + #DEFINE READ ME CSV FILE FUNCTION read_dyads_csv <- function(folder_name) { - if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", - full.names = TRUE, recursive = TRUE))) == TRUE) { - file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", - full.names = TRUE, recursive = TRUE) - file_names_csv <- list.files(path = folder_name, pattern = ".csv$", - full.names = FALSE, recursive = TRUE) + if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) { + file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE) + file_names_csv <- list.files(path = folder_name, pattern = ".csv$", full.names = FALSE, recursive = TRUE) file_names_csv<- gsub('.*/ ?(\\w+)', '\\1', file_names_csv) file_names_csv <- gsub(".csv$", "", file_names_csv) - + #creates a list of read in csv data frames csvdata <- lapply(file_list_csv, function(x){ x_read_csv <- read.csv(x, header = TRUE) @@ -102,7 +99,7 @@ read_dyads <- function(folder_name = "my_transcripts") { sum_nas_es <- apply(x_read_csv, 2, function(y){sum(is.na(y) | y == "")}) x_read_csv <- x_read_csv[sum_nas_es < nrow(x_read_csv)] x_read_csv <- data.frame(x_read_csv) - + #check that the column names are correct if ((any(grepl("^speaker$", colnames(x_read_csv), ignore.case = T)) == TRUE | any(grepl("^speaker_names_raw$", colnames(x_read_csv), ignore.case = T)) == TRUE | @@ -116,7 +113,16 @@ read_dyads <- function(folder_name = "my_transcripts") { (any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE | any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE | any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) { - + + #tests for column named Time - if not present it adds a Time column filled with NAs + if (any(grepl("^Time*", colnames(x_read_csv), ignore.case = TRUE)) | + any(grepl("^start*", colnames(x_read_csv), ignore.case = TRUE))) { + colnames(x_read_csv)[which(grepl("^Time*", colnames(x_read_csv), ignore.case = T) | + grepl("^start*", colnames(x_read_csv), ignore.case = T))] <- "Time" + } + else { + x_read_csv$Time <- rep(NA, nrow(x_read_csv)) #if no Time col fill with NA + } #correct the speaker and text names to our conventions colnames(x_read_csv)[which(grepl("speaker", colnames(x_read_csv), ignore.case = T) | grepl("PID", colnames(x_read_csv), ignore.case = T) | @@ -126,17 +132,17 @@ read_dyads <- function(folder_name = "my_transcripts") { grepl("partner", colnames(x_read_csv), ignore.case = T) | grepl("source", colnames(x_read_csv), ignore.case = T) | grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID" - + colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) | grepl("utterance", colnames(x_read_csv), ignore.case = T))] <- "RawText" - + x_read_csv <- data.frame(x_read_csv) x_final <- x_read_csv } - - col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText")) - - if (col_check != 2) { #if there are less than three columns + + col_check <- x_read_csv[, colnames(x_read_csv) %in% + c("Participant_ID", "RawText", "Time")] + if (ncol(col_check) != 3) { #if there are less than three columns stop(paste("Function is unable to process csv transcript ", #error stating missing column as.character(match(x, file_list_csv)), #also states the transcript " correctly. Make sure that each transcript includes a column marking who is producing text in each row. @@ -144,7 +150,7 @@ read_dyads <- function(folder_name = "my_transcripts") { Please make sure your a column containing raw language transcriptions is named 'utterance', 'turn', or 'text'. If you wish to include a Time column please title it 'Time' or 'start'.", sep = ""), call. = FALSE) - + } x_final }) @@ -158,12 +164,12 @@ read_dyads <- function(folder_name = "my_transcripts") { txtlist <- read_dyads_txt(folder_name) csvlist <- read_dyads_csv(folder_name) all_list <- append(txtlist, csvlist) #append the two lists into one list - + #throws an error if no files are found if (length(all_list) == 0) { stop("No files found. Please make sure you are providing the local or absolute file path to the desired folder as a character vector. At least one .csv or .txt file must be present.") } - + alldf <- dplyr::bind_rows(all_list) #binds the rows of each list into one data frame alldf$Event_ID <- as.factor(alldf$Event_ID) alldf$Participant_ID <- as.factor(alldf$Participant_ID) diff --git a/R/summarize_dyads_auc.R b/R/summarize_dyads_auc.R deleted file mode 100644 index e6904b0..0000000 --- a/R/summarize_dyads_auc.R +++ /dev/null @@ -1,347 +0,0 @@ -#' summarize_dyads_auc -#' -#' Calculates the area under the curve of the absolute difference time series between interlocutor time series. The length of the difference time series can be standardized the shortest number of exchanges present in the group using an internally defined resampling function, called with resample = TRUE. Area under the curve become less reliable for dyads under 30 exchanges. -#' -#' @name summarize_dyads_auc -#' @param dataframe produced in the align_dyads function -#' @return A data frame associating each transcript with the dAUC calculated for each dimension -#' @importFrom DescTools AUC -#' @importFrom dplyr summarise -#' @importFrom dplyr mutate -#' @importFrom dplyr select -#' @importFrom dplyr group_by -#' @importFrom dplyr ungroup -#' @importFrom dplyr filter -#' @importFrom dplyr left_join -#' @importFrom dplyr bind_rows -#' @importFrom dplyr bind_cols -#' @importFrom dplyr rename_at -#' @importFrom dplyr vars -#' @importFrom dplyr across -#' @importFrom stringr str_replace -#' @importFrom stringr str_c -#' @importFrom magrittr %>% -#' @importFrom tidyr pivot_wider -#' @importFrom tidyr pivot_longer -#' @importFrom tidyr drop_na -#' @importFrom tidyr fill -#' @importFrom tidyselect starts_with -#' @importFrom tidyselect ends_with -#' @importFrom tidyselect contains -#' @importFrom tidyselect everything -#' @importFrom zoo na.approx -#' @export summarize_dyads_auc - - -summarize_dyads_auc <- function(aligned_ts_df, resample = TRUE) { - #remove empty levels of all factors in the data frame - specifically for any removed transcript event ids - aligned_ts_df <- droplevels(aligned_ts_df) - - #initial check for transcripts that do not have two interlocutors - check_pnum <- aligned_ts_df %>% - dplyr::group_by(Event_ID) %>% - dplyr::summarize(p_count = length(unique(Participant_ID)), - .groups = "drop") #generate dataframe with number of interlocutors in each transcript - - err_ts_vec <- check_pnum$Event_ID[which(check_pnum$p_count != 2)] - #check if any transcripts were identified and throw an error with the specific transcripts - if (length(err_ts_vec) != 0) { - stop("summarize_dyads requires that all transcripts have exactly two interlocutors.\ntranscripts with less than or greater than 2 interlocutors:\n", - paste(err_ts_vec, collapse = "\n")) - } - - #mutates an interlocutor pair variable, and preserves all columns - aligned_ts_df <- aligned_ts_df %>% - dplyr::group_by(Event_ID) %>% #add a column of concatenated interlocutor names by transcript - dplyr::mutate(Participant_Pair = paste(sort(unique(Participant_ID)), collapse = "---")) %>% - dplyr::ungroup() - - - #DEFINE TIME SERIES RESCALER - resample_time_series <- function(df_list, threshold) { - - align_dimensions <- c("aff_anger", "aff_anxiety", "aff_boredom", "aff_closeness", - "aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy", - "aff_encouragement", "aff_excitement", "aff_guilt", "aff_happiness", - "aff_hope", "aff_hostility", "aff_politeness", "aff_sadness", "aff_stress", - "aff_surprise", "aff_trust", "aff_valence", "lex_age_acquisition", - "lex_letter_count_raw", "lex_morphemecount_raw", "lex_prevalence", - "lex_senses_polysemy", "lex_wordfreqlg10_raw", "sem_arousal", - "sem_concreteness", "sem_diversity", "sem_neighbors") - # if the data is not in a list, then it splits by Event_ID - if (typeof(df_list) != "list"){ - df_list <- df_list %>% droplevels() - df_list = split(df_list, f = df_list$Event_ID) - } - - if (length(names(df_list)) == 0) { - names(df_list) <- seq(1:length(df_list)) #assigns numeric name to list if no names are present - } - names(df_list) <- as.character(names(df_list)) - dflistscaled <- lapply(names(df_list), function(x){ - - xname <- x - x <- df_list[[match(xname, names(df_list))]] - x <- data.frame(x) - ts_select <- x %>% - dplyr::select(tidyselect::contains(align_dimensions)) %>% #select emotion columns - dplyr::mutate(ExchangeCount = seq(from = 0, to = (nrow(x)-1))) #add exchange count - # here ExC gets reduced by 1 so that zero is the floor - this makes the calculations easier and the threshold is also reduced by one so that no new time points are added - - #if the ts has 3 or fewer exchange counts it shouldn't be resampled, so will report NA - if (max(ts_select$ExchangeCount) <= 2){ - final_ts <- ts_select %>% - dplyr::mutate(Event_ID = unique(x$Event_ID), - Participant_Pair = unique(x$Participant_Pair)) - - final_ts[,colnames(final_ts) %in% align_dimensions] <- NA - final_ts - } - else{ - #computes how many points must be calculated from each currently present point - scale_ratio <- threshold / max(ts_select$ExchangeCount) - - #if the transcript is the desired length already - if (scale_ratio == 1) { - rescale_final <- ts_select %>% #assign to var so that it will be smoothed - dplyr::select(!ExchangeCount) #remove exchange count - } - # only down samples each time series - else if (scale_ratio < 1) { - grouper_var <- 1 / scale_ratio #take inverse of ratio to find size of groups - grouper_var_floor <- floor(grouper_var) #round group size to nearest integer - #creates a sequence to group Time counts, and adds up to the max Time if the count is not even. - grouper_counter <- rep(seq(1:(nrow(ts_select)/grouper_var_floor)), each=grouper_var_floor) - if (length(grouper_counter) < nrow(ts_select)) { - grouper_counter[(length(grouper_counter)+1):nrow(ts_select)] <- max(grouper_counter) + 1 - } - - agg_int <- ts_select %>% - dplyr::mutate(Grouper = grouper_counter) %>% - dplyr::group_by(Grouper) %>% #create and group by the integer needed to downscale - dplyr::summarise(dplyr::across(tidyselect::contains(colnames(x)), ~ mean(.x, na.rm = TRUE)), #aggregate by group - .groups = "drop") %>% - dplyr::select(!Grouper) - - if (nrow(agg_int) == threshold + 1) { - rescale_final <- agg_int - rescale_final - } - else { - #compute the number of points to remove after aggregating by lowest integer ratio - point_diff <- (nrow(agg_int) - 1) - threshold - #creates a new factor variable to distribute where aggregation occurs - splitter <- (nrow(agg_int) - 1) / point_diff - #evenly split indexes for aggregation across the Time series - index_subs <- floor(seq(from = splitter, to = nrow(agg_int), by = splitter)) - #generate a grouping sequence and substitute at indexes to create groups of two that will be aggregated - second_grouper <- seq(1:nrow(agg_int)) - second_grouper[index_subs - 1] <- index_subs - - agg_final <- agg_int %>% - dplyr::mutate(Grouper = second_grouper) %>% - dplyr::group_by(Grouper) %>% #create and group by the integer needed to downscale - dplyr::summarise(dplyr::across(tidyselect::contains(colnames(x)), ~ mean(.x, na.rm = TRUE)), #aggregate by group - .groups = "drop") %>% - dplyr::select(!Grouper) - rescale_final <- agg_final - } - } #end of down scaling section - - final_ts <- rescale_final %>% - dplyr::mutate(ExchangeCount = seq(from = 1, to = threshold + 1), #create exchange count column - Event_ID = unique(x$Event_ID), - Participant_Pair = unique(x$Participant_Pair)) - #bind all new columns together into one data frame with an exchange count - final_ts <- data.frame(final_ts) - } - }) - #returns a list of scaled and smoothed data frames - return(dflistscaled) - } - #END DEFINE TIME SERIES RESCALER - - #DEFINE FIND DYAD AREA UNDER THE CURVE FUNCTION - find_auc_dyads <- function(aligned_df, resample = TRUE) { - align_dimensions <- c("aff_anger", "aff_anxiety", "aff_boredom", "aff_closeness", - "aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy", - "aff_encouragement", "aff_excitement", "aff_guilt", "aff_happiness", - "aff_hope", "aff_hostility", "aff_politeness", "aff_sadness", "aff_stress", - "aff_surprise", "aff_trust", "aff_valence", "lex_age_acquisition", - "lex_letter_count_raw", "lex_morphemecount_raw", "lex_prevalence", - "lex_senses_polysemy", "lex_wordfreqlg10_raw", "sem_arousal", - "sem_concreteness", "sem_diversity", "sem_neighbors") - - align_var <- colnames(aligned_df)[colnames(aligned_df) %in% align_dimensions] - # split the data frame into a list by event id - df_list <- split(aligned_df, f = aligned_df$Event_ID) - # iterate over each event, replacing participant names with a transient filler variable - df_list_speakvar <- lapply(df_list, function(df){ - svec <- unique(as.character(df$Participant_ID)) - df <- df %>% - dplyr::mutate(participant_var = ifelse(as.character(Participant_ID) == svec[1], "S1", "S2"), - Participant_Pair = paste(sort(svec), collapse = "---")) - }) - - df_speakvar <- dplyr::bind_rows(df_list_speakvar) - - # group by turn then take the average score for each turn count,then pivot on pids - df_wide <- df_speakvar %>% - dplyr::group_by(Event_ID, ExchangeCount, Participant_ID) %>% - dplyr::summarise(dplyr::across(tidyselect::contains(align_var), ~ mean(.x, na.rm = TRUE)), - participant_var = first(participant_var), - Participant_Pair = first(Participant_Pair), - .groups = "drop") %>% - tidyr::pivot_wider(names_from = c("participant_var"), values_from = any_of(align_dimensions)) - - # if there is only one aligned variable manually add that variable name to participant columns - if (length(align_var) == 1){ - colnames(df_wide)[which(colnames(df_wide) %in% c("S1", "S2"))] = paste(align_var[1], colnames(df_wide)[which(colnames(df_wide) %in% c("S1", "S2"))], sep = "_") - } - - # should remove uneven final exchanges here - need to grab the last exchange count of each row and if it is uneven, remove it. - for (eid in unique(as.character(df_wide$Event_ID))){ - final_exc_rows <- df_wide[which(df_wide$Event_ID == eid & df_wide$ExchangeCount == max(df_wide$ExchangeCount[which(df_wide$Event_ID == eid)])),] - if (nrow(final_exc_rows) == 1){ - df_wide <- df_wide[-which(df_wide$Event_ID == eid & df_wide$ExchangeCount == max(df_wide$ExchangeCount[which(df_wide$Event_ID == eid)])),] - } - } - - # before wrangling data, find the threshold if resampling - if (is.logical(resample) == TRUE) { - #mutate the max ExchangeCount for each dyad - min_exc_max <- df_wide %>% - dplyr::group_by(Event_ID) %>% - dplyr::summarize(exc_max = max(ExchangeCount), - .groups = "drop") - #take the minimum of the max exchange count by dyad variable after converting to a vector - min_exc <- min(min_exc_max$exc_max) - # reduce the threshold by one because the resample function will add an exchange - threshold <- min_exc - 1 - - if (resample == TRUE){ - if (threshold < 30) { - warning(writeLines("the threshold for resampling is below 30 exchanges.\narea under the curve becomes a less valid measures of alignment below 30 exchanges")) - } - } - else { - small_dyads <- unique(min_exc_max$Event_ID[which(min_exc_max$exc_max < 30)]) - warning(writeLines(paste("The following dyads are shorter than 30 exchange counts.\nArea under the curve becomes less valid below 30 exchanges", paste(small_dyads, collapse = "\n"), sep = "\n"))) - threshold <- 3 - } - } - else { - stop("Argument resample must be logical") - } - # here is the big iteration: - split_pid_df_list <- sapply(c("S1", "S2"), function(temp_pid){ - # grab the columns that are needed for grouping or contain just on participant's scores - p_df <- df_wide %>% - dplyr::select(c("Event_ID", "ExchangeCount", "Participant_Pair") | c(ends_with(paste0("_", temp_pid)))) - - # take the first column that matches an aligned variable - first_var_col <- p_df[,which(colnames(p_df) %in% paste(align_var, temp_pid, sep = "_"))[1]] - # index the rows that are either not NA or are NaN values (numbers and NaN) - p_df <- p_df[which(is.na(first_var_col) == FALSE | is.nan(unlist(first_var_col)) == TRUE),] - p_df_list <- split(p_df, f = p_df$Event_ID) - - if (resample == TRUE) { - p_df_list <- resample_time_series(p_df_list, threshold = threshold) - } - - p_df_altered <- dplyr::bind_rows(p_df_list) - # interpolate here - fill forward for end, fill back for front, linear interpolation for the middle - # grouped by event id, then each function is run across all the Time series individually - - p_df_interp <- p_df_altered %>% - dplyr::group_by(Event_ID) %>% # interpolates all missing values between two points - dplyr::mutate(dplyr::across(tidyselect::contains(align_var), - ~ zoo::na.approx(.x, na.rm = FALSE))) %>% - tidyr::fill(tidyselect::contains(align_var), .direction = "downup") %>% - dplyr::ungroup() - - # wrap the data frame in a list to preserve structure (lapply will pull it into a list of lists) - df_interp_names_list <- list(p_df_interp) - }) - # join the two participant data frame together by dyad and exchangecount - widedf <- dplyr::left_join(split_pid_df_list[[1]], split_pid_df_list[[2]], by = c("Event_ID", "Participant_Pair", "ExchangeCount")) - - #iterate over each aligned dimension, selecting only the scores for that dimension and pulling a difference value and subbing it in for the actual values - for (dimension in align_var){ - both_participant_cols <- widedf %>% select(starts_with(dimension)) - absdiffcol <- data.frame(dimension = abs(both_participant_cols[,1] - both_participant_cols[,2])) - widedf[which(colnames(widedf) %in% paste(dimension, c("S1", "S2"), sep = "_"))] <- absdiffcol - } - - long_diff_df <- widedf %>% - tidyr::pivot_longer(cols = c(ends_with("_S1") | ends_with("_S2")), - names_to = c("dimension", "Participant_ID"), - names_pattern = "(.*)_([^_]+)$", - values_to = "score") %>% # pivot longer by dimension and pid - # pivot each dimension to a column - tidyr::pivot_wider(names_from = dimension, values_from = score) %>% - dplyr::filter(Participant_ID == "S1") %>% # remove every second row per turn - dplyr::select(-Participant_ID) # remove pid column - # split the difference data frame into a list based on event id - long_diff_df_list <- split(long_diff_df, f = long_diff_df$Event_ID) - - # grab the aligned dimensions as a vector to iterate over - xdimensions <- colnames(long_diff_df)[which(colnames(long_diff_df) %in% align_dimensions)] - domain_auc_list <- lapply(xdimensions, function(dimension){ #iterate over emotion - # now iterate over each dyad in the corpus - - # function to calculate AUC for single time series, return NA if there is an error - calculate_auc <- function(domain_ts, doc_name, dimension) { - tryCatch({ - # if time series has fewer points than the threshold, fill with NA - if (max(domain_ts$ExchangeCount) < threshold) { - #create a single row, single column dataframe with one empty value to fill in the AUC - doc_domain_auc_df <- data.frame(domain_auc = NA) - } - else { - domain_ts <- data.frame(domain_ts) #make single emotion Time series a data frame - domain_auc <- DescTools::AUC(x = domain_ts[,1], y = domain_ts[,2], method = "trapezoid") - doc_domain_auc_df <- data.frame(domain_auc) #make data frame of AUC, replicated once - } - doc_domain_auc_df - }, - error = function(e) { - # print file name and dimension that are behaving unexpectedly - cat(paste("Results for dAUC will be filled with NA.\n\tTranscript: ", - doc_name, "\n\tDimension: ", dimension, "\n", sep = "")) - # fill the result cell with NA - doc_domain_auc_df <- data.frame(domain_auc = NA) - doc_domain_auc_df - }) - } - - # iterate over each document - single_doc_auc <- lapply(long_diff_df_list, function(aligned_ts_df){ - - - domain_ts <- aligned_ts_df %>% - dplyr::select(ExchangeCount, - tidyselect::contains(dimension)) # take dimension and time - - # put the function in here - single_doc_domain_auc <- calculate_auc(domain_ts, - doc_name = as.character(aligned_ts_df$Event_ID)[1], - dimension = dimension) - }) - #bind all docs AUCs for emotion into one column and add column prefix - all_doc_domain_auc_df <- dplyr::bind_rows(single_doc_auc) - colnames(all_doc_domain_auc_df) <- paste("AUC", dimension, sep = "_") - all_doc_domain_auc_df - }) - - all_domain_df <- dplyr::bind_cols(domain_auc_list, data.frame(Event_ID = names(long_diff_df_list))) #bind all columns of AUCs into one data frame - - return(all_domain_df) - } - #END DEFINE FIND DYAD AREA UNDER THE CURVE FUNCTION - - auc_df <- find_auc_dyads(aligned_df = aligned_ts_df, resample = resample) - auc_df <- auc_df %>% - dplyr::select(c("Event_ID", tidyselect::contains("AUC"))) - return(auc_df) -} \ No newline at end of file diff --git a/R/summarize_dyads_covar.R b/R/summarize_dyads_covar.R deleted file mode 100644 index b1c1f9b..0000000 --- a/R/summarize_dyads_covar.R +++ /dev/null @@ -1,246 +0,0 @@ -#' summarize_dyads_covar -#' -#' Calculates Spearman rank correlation and lagged Pearson correlation between interlocutor time series for each dimension and transcript. -#' -#' @name summarize_dyads_covar -#' @param aligned_ts_df Dataframe produced in the align_dyads function -#' @param lags A vector of signed integers specifying lags that should be included for Pearson correlation. Negative integers are used for leads. -#' @return A data frame containing the Spearman and Pearson correlation for each transcript. Results are organized as column and data is pivoted longer by dimension for readability. -#' @importFrom dplyr summarise -#' @importFrom dplyr mutate -#' @importFrom dplyr select -#' @importFrom dplyr group_by -#' @importFrom dplyr ungroup -#' @importFrom dplyr left_join -#' @importFrom dplyr bind_rows -#' @importFrom dplyr bind_cols -#' @importFrom dplyr rename_at -#' @importFrom dplyr vars -#' @importFrom dplyr across -#' @importFrom stringr str_replace -#' @importFrom magrittr %>% -#' @importFrom tidyr pivot_wider -#' @importFrom tidyr pivot_longer -#' @importFrom tidyr drop_na -#' @importFrom tidyr fill -#' @importFrom tidyselect ends_with -#' @importFrom tidyselect contains -#' @importFrom tidyselect everything -#' @importFrom stats cor.test -#' @importFrom zoo na.approx -#' @importFrom YRMisc cor.lag -#' @export summarize_dyads_covar - - -summarize_dyads_covar <- function(aligned_ts_df, lags = c(-3, -2, -1, 0, 1, 2, 3)) { - #remove empty levels of all factors in the data frame - specifically for any removed transcript event ids - aligned_ts_df <- droplevels(aligned_ts_df) - - align_vars <- c("aff_anger", "aff_anxiety", "aff_boredom", "aff_closeness", - "aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy", - "aff_encouragement", "aff_excitement", "aff_guilt", "aff_happiness", - "aff_hope", "aff_hostility", "aff_politeness", "aff_sadness", "aff_stress", - "aff_surprise", "aff_trust", "aff_valence", "lex_age_acquisition", - "lex_letter_count_raw", "lex_morphemecount_raw", "lex_prevalence", - "lex_senses_polysemy", "lex_wordfreqlg10_raw", "sem_arousal", - "sem_concreteness", "sem_diversity", "sem_neighbors") - - # pull out metadata into a seperate df - metaDf <- aligned_ts_df %>% - dplyr::select(-c(Participant_ID, contains(align_vars), - contains("Nwords"), CleanText, TurnCount, ExchangeCount)) %>% - dplyr::group_by(Event_ID) %>% - dplyr::summarize(across(tidyselect::everything(), first)) - - #initial check for transcripts that do not have two interlocutors - check_pnum <- aligned_ts_df %>% - dplyr::group_by(Event_ID) %>% - dplyr::summarize(p_count = length(unique(Participant_ID)), - .groups = "drop") #generate dataframe with number of interlocutors in each transcript - - err_ts_vec <- check_pnum$Event_ID[which(check_pnum$p_count != 2)] - #check if any transcripts were identified and throw an error with the specific transcripts - if (length(err_ts_vec) != 0) { - stop("summarize_dyads requires that all transcripts have exactly two interlocutors.\ntranscripts with less than or greater than 2 interlocutors:\n", - paste(err_ts_vec, collapse = "\n")) - } - - #mutates an interlocutor pair variable, and preserves all columns - aligned_ts_df <- aligned_ts_df %>% - dplyr::group_by(Event_ID) %>% #add a column of concatenated interlocutor names by transcript - dplyr::mutate(Participant_Pair = paste(sort(unique(Participant_ID)), collapse = "---")) %>% - dplyr::ungroup() - - #DEFINE SPEARMAN'S CORRELATION FUNCTION - spearmans_corr_dyads <- function(aligned_ts_df) { - #first, flexibly find the dimensions the user has aligned on - align_dimensions <- c("aff_anger", "aff_anxiety", "aff_boredom", "aff_closeness", - "aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy", - "aff_encouragement", "aff_excitement", "aff_guilt", - "aff_happiness", "aff_hope", "aff_hostility", "aff_politeness", - "aff_sadness", "aff_stress", "aff_surprise", "aff_trust", - "aff_valence", "lex_age_acquisition", - "lex_letter_count_raw", "lex_morphemecount_raw", "lex_prevalence", - "lex_senses_polysemy", "lex_wordfreqlg10_raw", "sem_arousal", - "sem_concreteness", "sem_diversity", "sem_neighbors") - - align_var <- colnames(aligned_ts_df[,which(colnames(aligned_ts_df) %in% align_dimensions)]) - - df_list <- split(aligned_ts_df, f = aligned_ts_df$Event_ID) - #iterate over each newly split data frame. - output_df_list <- lapply(df_list, function(df){ - - #establish raw participant name and S1/S2 'key' - denoted by order of utterance - participantvec <- unique(df$Participant_ID) - - #substitute participant names with transient variable to be replaced with real names later - names(participantvec) <- c("S1", "S2") - df$Participant_ID <- gsub(participantvec[1], names(participantvec)[1], df$Participant_ID) - df$Participant_ID <- gsub(participantvec[2], names(participantvec)[2], df$Participant_ID) - - #create a wide data frame with each row containing both participants' turn aggregated scores - df_wide <- df %>% - dplyr::group_by(Event_ID, ExchangeCount, Participant_ID, .add = FALSE) %>% - dplyr::summarise(dplyr::across(tidyselect::contains(align_var), ~ mean(.x, na.rm = TRUE)), - .groups = "drop")%>% - tidyr::pivot_wider(names_from = tidyselect::contains("Participant_ID"), - values_from = align_var) #%>% - - # if there is only one aligned variable manually add that variable name to participant columns - if (length(align_var) == 1){ - colnames(df_wide)[which(colnames(df_wide) %in% c("S1", "S2"))] = paste(align_var[1], colnames(df_wide)[which(colnames(df_wide) %in% c("S1", "S2"))], sep = "_") - } - - df_wide <- df_wide %>% - dplyr::select(Event_ID, ExchangeCount, tidyselect::contains(align_var)) - - rows_with_na_ind <- apply(df_wide[,which(colnames(df_wide) %in% paste(align_var, "S1", sep = "_") | colnames(df_wide) %in% paste(align_var, "S2", sep = "_"))], 1, function(x){ - any(is.na(x) == TRUE & is.nan(x) == FALSE) - }) - # only slice out rows if there is at least one, otherwise it will not index any rows - if (any(rows_with_na_ind)) { - df_wide <- df_wide[!rows_with_na_ind, ] - } - - # interpolate over turns that are fully missing (if all words in the utterance have been removed) - interp_df <- df_wide %>% - dplyr::mutate(dplyr::across(tidyselect::contains(align_var), - ~ zoo::na.approx(.x, na.rm = FALSE))) %>% - tidyr::fill(names(df_wide[, which(colnames(df_wide) %in% paste(align_var, "S1", sep = "_")| - colnames(df_wide) %in% paste(align_var, "S2", sep = "_"))]), .direction = "updown") # fills up or down for initial and ending values - - # select the columns for one participant and remove suffixes - x_vars <- interp_df %>% - dplyr::select(c('Event_ID') | c(ends_with("_S1"))) %>% - dplyr::rename_at(dplyr::vars(matches("_S1")), ~stringr::str_replace(., "_S1", "")) - y_vars <- interp_df %>% - select(c('Event_ID') | c(ends_with("_S2")))%>% - dplyr::rename_at(dplyr::vars(matches("_S2")), ~stringr::str_replace(., "_S2", "")) - - # function to calculate and format spearman correlation - calculate_spearman_corr <- function(x_vars, y_vars, dim, align_var) { - tryCatch({ - # pull the values for each participant out of the data frame as a vector - dim_x_vars <- x_vars[[dim]] - dim_y_vars <- y_vars[[dim]] - - #run spearman corr and format rho and p value into a data frame - with complete cases - sc_results <- cor.test(dim_x_vars, dim_y_vars, - method = "spearman", exact = F, - na.action = "na.omit") - - sc_results_df <- data.frame(SpearR = sc_results$estimate) - sc_results_df - }, - error = function(e){ - # print file name and dimension that are behaving unexpectedly (TODO) - cat(paste("Results for spearman correlation will be filled with NA.\n\tTranscript: ", - df$Event_ID, "\n\tDimension: ", dim, "\n", sep = "")) - # fill results with NA - sc_results_df <- data.frame(SpearR = NA) - sc_results_df - }) - } - - # function to calculate and format lagged Pearson correlation - calculate_lag_corr <- function(x_vars, y_vars, dim, align_var) { - # pull the values for each participant out of the data frame as a vectors - dim_x_vars <- x_vars[[dim]] - dim_y_vars <- y_vars[[dim]] - - # get the amount of lag and lead - lagTimes <- lags[lags > 0] - leadTimes <- lags[lags < 0] - # set both at least equal to one so that the function works - if (length(lagTimes) == 0) { - lagTimes <- c(1) - } - if (length(leadTimes) == 0) { - leadTimes <- c(1) - } - # calculate lagged correlation - suppressWarnings(( - lo_full <- YRmisc::cor.lag(dim_x_vars, dim_y_vars, - max(lagTimes), max(abs(leadTimes))) - - )) - - # get the index of column name with zero - which0 <- which(colnames(lo_full) == "0") - # grab just the lag times, add a negative if a lead - colnames(lo_full)[0:which0] <- gsub("lag", "", colnames(lo_full)[0:which0]) - # same for lead - colnames(lo_full)[which0:length(colnames(lo_full))] <- gsub("lead", "-", colnames(lo_full)[which0:length(colnames(lo_full))]) - colnames(lo_full) <- paste0("PRho_lag_", colnames(lo_full)) - - # make a vector of names from the given lags - selectNames <- paste0("PRho_lag_", lags) - if (0 %in% lags) {selectNames <- append(selectNames, "0")} - # select columns given by user - lo_select <- lo_full %>% - dplyr::select(tidyselect::contains(selectNames)) - lo_select - } - - #iterate over variables to calculate spearman's correlation - dyad_dim_sc_list <- lapply(align_var, function(dim){ - # run the trycatch function - either gives Rhos or NAs if there would be an issue - sc_results_df <- calculate_spearman_corr(x_vars, y_vars, dim, align_var) - # run the lagged covariance function - lag_results_df <- calculate_lag_corr(x_vars, y_vars, dim, align_var) - # put both results in a single data frame - covar_results_df <- dplyr::bind_cols(sc_results_df, lag_results_df) - - #add participant column only if it is the first iteration - colnames(covar_results_df) <- paste(colnames(covar_results_df), dim, sep = "-") - if (match(dim, align_var) == 1) { - covar_results_df$Participant_ID <- paste(participantvec, collapse = "---") - } - covar_results_df - }) - - #bind and add event id to spearman correlation df - dyad_covar <- dplyr::bind_cols(dyad_dim_sc_list, Event_ID = unique(df$Event_ID)) - dyad_covar$Who_lagged <- participantvec[2] - # report the dyad data frame - dyad_covar - }) - all_dyad_df <- dplyr::bind_rows(output_df_list) - return(all_dyad_df) - } - #END DEFINE SPEARMAN'S CORRELATION FUNCTIONs - covar_df <- spearmans_corr_dyads(aligned_ts_df = aligned_ts_df) # bind the results into dataframe - # order columns - - covar_df <- covar_df %>% - dplyr::select(!c(Participant_ID)) %>% - # pivot just by the DIMENSION - tidyr::pivot_longer(cols = tidyselect::contains(c("SpearR", "PRho")), - names_pattern = "(.*)-(.*)", # match up to but not include the dimensions - names_to = c(".value", "Dimension")) %>% - dplyr::left_join(metaDf, by = "Event_ID") # bind metadata to output by event - - #remove row names - row.names(covar_df) <- NULL - return(covar_df) -} \ No newline at end of file