diff --git a/NAMESPACE b/NAMESPACE index 77ac775..a5bcd92 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,6 @@ # Generated by roxygen2: do not edit by hand export(align_dyads) -export(align_metadata) export(clean_dyads) export(read_dyads) export(summarize_dyads) diff --git a/R/align_dyads.R b/R/align_dyads.R index 0f4a5e9..e9e3612 100644 --- a/R/align_dyads.R +++ b/R/align_dyads.R @@ -1,9 +1,9 @@ #' align_dyads #' -#' Yokes user-specified semantic, affective, and phonological values to each word in a cleaned language transcript. Prepares a dataframe aligned by exchange and turn across Participant_IDs. +#' Yokes user-specified semantic, affective, and phonological values to each word in a cleaned language transcript. Values are aligned by each individual word, and words that are not present in the database are dropped. The number of words dropped is reported by interlocutor in each dyad. Reports an exchange count, which counts by each pair of turns. #' #' @name align_dyads -#' @param clean_ts_df a dataframe cleaned and formatted during the read_dyads() function +#' @param clean_ts_df a dataframe cleaned and formatted during the clean_dyads() function #' @return a dataframe one-word-per-row format with variables of interest appended #' @importFrom magrittr %>% #' @importFrom tidyselect any_of diff --git a/R/align_metadata.R b/R/align_metadata.R deleted file mode 100644 index 2114cdf..0000000 --- a/R/align_metadata.R +++ /dev/null @@ -1,48 +0,0 @@ -#' align_metadata -#' -#' Prompts the user for a file path to a csv file. 
Joins metadata in the file to the output of align_dyads by 'event_id' and 'Participant_ID' -#' -#' @name align_metadata -#' @param aligned_ts_df a data frame with yoked psycholingustic variables from the align_dyads function -#' @return a data frame with user provided and selected metadata joined by 'event_id' and 'Participant_ID' -#' @importFrom magrittr %>% -#' @importFrom dplyr group_by -#' @importFrom dplyr select -#' @importFrom dplyr left_join -#' @importFrom utils read.csv -#' @importFrom utils select.list -#' @export align_metadata - -align_metadata <- function(aligned_ts_df) { - #allow user to input the file path to demographic data, randomly assign groups, or not align groups - ask_meta_filepath <- readline(writeLines("If you would like to align metadata by interlocutor and event ID, input the absolute or relative file path to the metadata csv file.\nThe file path should not be in quotes (e.g. my_data/metadata.csv)\nThe csv file must contain column names 'Participant_ID' and 'event_id' or it will not align.")) - #reads in a csv file of demographic information associated with participant IDs. 
- metadata <- data.frame(read.csv(ask_meta_filepath)) - - #check for event and participant column names and replace with correct cases if incorrect - colnames(metadata)[grep("^event_id$", colnames(metadata), ignore.case = T)] <- "event_id" - colnames(metadata)[grep("^Participant_ID$", colnames(metadata), ignore.case = T)] <- "Participant_ID" - - #check that there are correctly named participant and event id column headers, and throw an error if not - if (any(any(grepl("^event_id$", colnames(metadata), ignore.case = F)), - any(grepl("^Participant_ID$", colnames(metadata), ignore.case = F))) == FALSE) { - stop("cannot find column header 'Participant_ID' and or 'event_id' in metadata file; the file must contain both") - } - - #allows the user to specify which columns they want to subset - preselecting event and participant id - subset_metadata <- select.list(c(colnames(metadata), "Select all columns"), - preselect = c("event_id", "Participant_ID"), multiple = TRUE, - title = "Select the columns you would like to subset. 'Participant_ID' and 'event_id' columns are preselected.", - graphics = FALSE) - #if the select all option is chosen, selects every column - if (any(grepl("Select all columns", subset_metadata)) == TRUE) { - subset_metadata <- colnames(metadata) - } - metadata_selected <- metadata[,colnames(metadata) %in% subset_metadata] #select specified columns - - #join metadata to aligned data frame by event id and PID - metadata_aligned_df <- dplyr::left_join(aligned_ts_df, metadata_selected, - by=c("event_id", "Participant_ID")) - - return(metadata_aligned_df) -} diff --git a/R/clean_dyads.R b/R/clean_dyads.R index 43361d8..92f5714 100644 --- a/R/clean_dyads.R +++ b/R/clean_dyads.R @@ -1,6 +1,6 @@ #' clean_dyads #' -#' Cleans and Formats raw language transcripts, removing stopwords and formatting dataframe for alignment steps +#' Cleans and formats language transcripts from the read stage. Removes non-alphabetic characters and stopwords. 
Language transcripts can be lemmatized by calling lemmatize = TRUE. Vectorizes each utterance and reports the total word count and mean word length by interlocutor in each dyad. Also reports the number of words in each turn. #' @name clean_dyads #' @param dataframe produced from the read_dyads() function #' @return dataframe with stopwords omitted, lemmatized words one per row @@ -24,14 +24,6 @@ #' @export clean_dyads clean_dyads <- function(read_ts_df, lemmatize=TRUE) { - #specify a group of speaker names that should be automatically removed from the transcript - s_remove <- c("Unknown", "unknown", "Speaker", "speaker", "Other", "other", "E", "e", "Experimenter", "experimenter", "Assistant", "assistant") - - #removes rows from the transcript that have the speaker as specified in the remove - if (any(read_ts_df$Participant_ID %in% s_remove) == TRUE){ #conditional in case no matches - read_ts_df <- read_ts_df[-which(read_ts_df$Participant_ID %in% s_remove),] - } - #set event_id and speaker names as factors read_ts_df$Participant_ID <- as.factor(read_ts_df$Participant_ID) #convert variables to factor read_ts_df$event_id <- as.factor(read_ts_df$event_id) diff --git a/R/read_dyads.R b/R/read_dyads.R index 4c93698..00cd9a8 100644 --- a/R/read_dyads.R +++ b/R/read_dyads.R @@ -1,10 +1,10 @@ #' read_dyads #' -#' Reads pre-formatted conversation transcripts from txt or csv on user's machine; user supplies directory path (e.g., "my_transcripts") to local folder as argument to function call +#' Reads pre-formatted dyadic (2 interlocutor) conversation transcripts from your machine. Transcripts must be either csv or txt format. If you are supplying a txt file, your transcript must be formatted as an otter.ai txt file export. Your options for using csv files are more flexible. ConversationAlign minimally requires a csv file with two columns, denoting interlocutor and text. Each separate conversation transcript should be saved as a separate file. 
ConversationAlign will use the file names as a document ID. Within the read dyads function, set the folder_name argument as the directory path to the local folder containing your transcripts on your machine (e.g., "my_transcripts"). Please see our github page for examples of properly formatted transcripts: https://github.com/Reilly-ConceptsCognitionLab/ConversationAlign #' #' @name read_dyads #' @param folder_name folder of conversation transcripts in csv or txt format -#' @return a concatenated dataframe with each language transcript saved as a separate 'event_id'; these are split into separate lists for discrete operations in later steps +#' @return a concatenated dataframe with each language transcript saved as a separate 'event_id' #' @importFrom magrittr %>% #' @importFrom dplyr select #' @importFrom dplyr bind_rows diff --git a/R/summarize_dyads.R b/R/summarize_dyads.R index effebb3..424653a 100644 --- a/R/summarize_dyads.R +++ b/R/summarize_dyads.R @@ -1,6 +1,6 @@ #' summarize_dyads #' -#' appends AUC and Spearman Rank Correlation indices to each dyad (event_id) using a resampling algoirthm that defaults to the minimum number of exchanges across all documents entered +#' Calculates and appends 3 measures for quantifying alignment. Appends the mean score for each dimension by turn. Calculates Spearman's rank correlation between interlocutor time series and appends by transcript. Calculates the area under the curve of the absolute difference time series between interlocutor time series. The length of the difference time series can be standardized to the shortest number of exchanges present in the group using an internally defined resampling function, called with resample = TRUE. Spearman's rank correlation and area under the curve become less reliable for dyads under 30 exchanges. 
#' #' @name summarize_dyads #' @param dataframe produced in the align_dyads function diff --git a/man/align_dyads.Rd b/man/align_dyads.Rd index dd5d9b4..f62ceb0 100644 --- a/man/align_dyads.Rd +++ b/man/align_dyads.Rd @@ -7,11 +7,11 @@ align_dyads(clean_ts_df) } \arguments{ -\item{clean_ts_df}{a dataframe cleaned and formatted during the read_dyads() function} +\item{clean_ts_df}{a dataframe cleaned and formatted during the clean_dyads() function} } \value{ a dataframe one-word-per-row format with variables of interest appended } \description{ -Yokes user-specified semantic, affective, and phonological values to each word in a cleaned language transcript. Prepares a dataframe aligned by exchange and turn across Participant_IDs. +Yokes user-specified semantic, affective, and phonological values to each word in a cleaned language transcript. Values are aligned by each individual word, and words that are not present in the database are dropped. The number of words dropped is reported by interlocutor in each dyad. Reports an exchange count, which counts by each pair of turns. } diff --git a/man/clean_dyads.Rd b/man/clean_dyads.Rd index 5c2dd1f..db00bda 100644 --- a/man/clean_dyads.Rd +++ b/man/clean_dyads.Rd @@ -13,5 +13,5 @@ clean_dyads(read_ts_df, lemmatize = TRUE) dataframe with stopwords omitted, lemmatized words one per row } \description{ -Cleans and Formats raw language transcripts, removing stopwords and formatting dataframe for alignment steps +Cleans and formats language transcripts from the read stage. Removes non-alphabetic characters and stopwords. Language transcripts can be lemmatized by calling lemmatize = TRUE. Vectorizes each utterance and reports the total word count and mean word length by interlocutor in each dyad. Also reports the number of words in each turn. 
} diff --git a/man/read_dyads.Rd b/man/read_dyads.Rd index 611fd58..ef9ae05 100644 --- a/man/read_dyads.Rd +++ b/man/read_dyads.Rd @@ -10,8 +10,8 @@ read_dyads(folder_name = "my_transcripts") \item{folder_name}{folder of conversation transcripts in csv or txt format} } \value{ -a concatenated dataframe with each language transcript saved as a separate 'event_id'; these are split into separate lists for discrete operations in later steps +a concatenated dataframe with each language transcript saved as a separate 'event_id' } \description{ -Reads pre-formatted conversation transcripts from txt or csv on user's machine; user supplies directory path (e.g., "my_transcripts") to local folder as argument to function call +Reads pre-formatted dyadic (2 interlocutor) conversation transcripts from your machine. Transcripts must be either csv or txt format. If you are supplying a txt file, your transcript must be formatted as an otter.ai txt file export. Your options for using csv files are more flexible. ConversationAlign minimally requires a csv file with two columns, denoting interlocutor and text. Each separate conversation transcript should be saved as a separate file. ConversationAlign will use the file names as a document ID. Within the read dyads function, set the folder_name argument as the directory path to the local folder containing your transcripts on your machine (e.g., "my_transcripts"). 
Please see our github page for examples of properly formatted transcripts: https://github.com/Reilly-ConceptsCognitionLab/ConversationAlign } diff --git a/man/summarize_dyads.Rd b/man/summarize_dyads.Rd index 29a7404..44164d2 100644 --- a/man/summarize_dyads.Rd +++ b/man/summarize_dyads.Rd @@ -4,11 +4,11 @@ \alias{summarize_dyads} \title{summarize_dyads} \usage{ -summarize_dyads(aligned_ts_df, resample = TRUE, threshold = "min") +summarize_dyads(aligned_ts_df, resample = TRUE) } \arguments{ \item{dataframe}{produced in the align_dyads function} } \description{ -appends AUC and Spearman Rank Correlation indices to each dyad (event_id) using a resampling algoirthm that defaults to the minimum number of exchanges across all documents entered +Calculates and appends 3 measures for quantifying alignment. Appends the mean score for each dimension by turn. Calculates Spearman's rank correlation between interlocutor time series and appends by transcript. Calculates the area under the curve of the absolute difference time series between interlocutor time series. The length of the difference time series can be standardized to the shortest number of exchanges present in the group using an internally defined resampling function, called with resample = TRUE. Spearman's rank correlation and area under the curve become less reliable for dyads under 30 exchanges. }