diff --git a/README.Rmd b/README.Rmd index 488003e..c843ddd 100755 --- a/README.Rmd +++ b/README.Rmd @@ -62,165 +62,16 @@ MyCleanLangSamples <- clean_dyads() ``` ## Align your transcripts -Prompts user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numermous sources (e.g., afffectvec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word then structures a dataframe by speaker and exchange across each dyad. +Prompts the user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numerous sources (e.g., AffectVec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word, then structures a dataframe by speaker and exchange across each dyad.
+ +```{r, eval=F} +myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness", "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence"), + preselect = NULL, multiple = TRUE, + title = "Select the variables you would like to align your conversation transcripts on", + graphics = FALSE) +``` ```{r, eval=F} -#takes cleaned dataframe from clean_dyads() step -align_dyads <- function(clean_ts_df) { +#assign the output to any name you like +MyAlignedLangSamples <- align_dyads(MyCleanLangSamples) - load("data/lookup_db.rda") #load lookup database - #allow the user to select what variables they want to align, or provide their own database(s) and subset them - myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness", "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence", "add my own database as well"), - preselect = NULL, multiple = TRUE, - title = "Select the variables you would like to align your conversation transcripts on", - graphics = FALSE) - - if (length(myvars) == 0) { #if no variables are selected, defaults are automatically added - myvars <- c("happiness", "hostility", "empathy", "excitement") - } - - var_selected <- lookup_db %>% #select desired columns from lookup_db - select(matches("^word$"), contains(myvars)) - - if (any(grepl("add my own database as well", myvars)) == TRUE) { - #take use input for the full file path to the data base they want to use - database_path <- readline("Input the file path to the database you would like to add.") - user_added_db <- read.csv(database_path) #IS IT OK TO ASSUME THAT DATABASE WILL BE .CSV??? - user_added_db <- data.frame(user_added_db) - #display the column names of user added database and allow them to choose the columns they want - subset_user_db <- select.list(c(colnames(user_added_db), "Select all columns"), - preselect = NULL, multiple = TRUE, - title = "Select the columns you would like to subset. The word column must be included.", - graphics = FALSE) - #allows user to select one option to select every column in their added database - if (any(grepl("Select all columns", subset_user_db)) == TRUE) { - subset_user_db <- colnames(user_added_db) - } - user_added_db <- user_added_db %>% select(contains(subset_user_db)) #select the columns specified from the database - #alter the word column on the added database to match the column name of the built in databse - colnames(user_added_db)[grep("^word$", colnames(user_added_db), ignore.case = TRUE)] <- "word" - #if user added their own database and subsetted from built in - binds both together. 
- if (length(myvars[-grep("add my own database as well", myvars)]) > 0) { - var_selected <- full_join(x = var_selected, y = user_added_db, by="word") - } - } - #create variable containing the column names of each variable to be aligned - var_aligners <- colnames(var_selected)[-grep("^word$", colnames(lookup_db), ignore.case = TRUE)] - - var_selected <- var_selected %>% distinct(word, .keep_all = TRUE) - - ts_list <- split(clean_ts_df, f = clean_ts_df$Doc_id) #split the transcript data frame into a list by Doc_id - ts_aligned_list <- lapply(ts_list, function(ts_select){ - #join measures of each variable to each word in each transcript - df_aligned <- left_join(ts_select, var_selected, by = c("CleanText" = "word"), multiple = "first") - df_aligned <- df_aligned[complete.cases(df_aligned), ] # remove any words that couldn't be aligned - df_aligned <- data.frame(df_aligned) - - df_aligned_agg <- df_aligned %>% - mutate(TurnCount = consecutive_id(Speaker_names_raw), .before = 1) %>% # add a turn column - select(Doc_id, Speaker_names_raw, TurnCount, Time, contains(var_aligners), starts_with("Analytics")) %>% - # select variables, speaker and dyad information, and word analytics - group_by(Doc_id, TurnCount, Speaker_names_raw) %>% #group by doc id, turn, and speaker - summarise(Time = min(Time), #make time the minimum for each turn - across(contains(var_aligners), mean), #average each variable by turn - across(starts_with("Analytics_wordcount"), sum), #sum word counts - across(starts_with("Analytics_words_removed"), sum), #sum removed word counts - across(starts_with("Analytics_mean_word_length"), mean), - .groups = "drop") %>% - ungroup() #reformat data frame back to chronological order - # identifies if there are an odd number of rows (one speaker spoke but other did not respond) - if ((nrow(df_aligned_agg)%%2) == 1 ) { - temprow <- data.frame(matrix(NA, nrow = 1, ncol = ncol(df_aligned_agg))) #creates a new adder row - colnames(temprow) <- c(colnames(df_aligned_agg)) - df_aligned_agg <- rbind(df_aligned_agg, temprow) #adds row full of NA to end of the data frame - } - ExchangeCount <- rep(seq(1:(length(df_aligned_agg$TurnCount)/2)), each=2) #creates Exchange Count - df_aligned_EC <- cbind(ExchangeCount, df_aligned_agg) #binds ExC to the data frame - df_aligned_EC <- na.omit(df_aligned_EC) #removes added NA row - df_aligned_EC <- df_aligned_EC %>% - select(!TurnCount) #removes turn count column - - df_aligned_EC #output the transcript exchange count organized aligned data frame to a list - }) - ts_aligned_df_total <- bind_rows(ts_aligned_list) - - #DEFINE THE DEMOGRAPHIC_ALIGN FUNCTION - demographic_align <- function(aligned_ts_df) { - #allow user to input the file path to demographic data, randomly assign groups, or not align groups - ask_demo_filepath <- readline("If you would like to align demographics to speakers, input the file path to the demographic csv file.") - #if user inputs 'random', randomly assigns groups across transcripts - if (str_to_lower(ask_demo_filepath) == "random") { - randomly <- lapply(split(aligned_ts_df, aligned_ts_df$Doc_id), function(x){ #iterates over each doc - x <- data.frame(x) - #creates a vector of each speaker with random indexes and assigns a alphanumeric sequence name - speakervec <- sample(unique(x[,grep("Speaker_names_raw", colnames(x), ignore.case = T)])) - names(speakervec) <- paste("S", 1:length(speakervec), sep = "") - #creates a data frame with just speaker names and assigned code - coloutput <- data.frame(Speaker_names_raw = speakervec, - 
Speaker_Code_Random = sapply(speakervec, function(y) { - names(speakervec)[match(y, speakervec)]})) - x <- x %>% left_join(coloutput, by=c("Speaker_names_raw")) #binds code to the aligned data frame - }) - randomly <- bind_rows(randomly) #binds all the doc data frame into one - return(randomly) - } - #if input is empty, returns the aligned data frame with no demographics - else if (ask_demo_filepath == "") { - return(aligned_ts_df) - } - #if file path is entered: - else { - #reads in a csv file of demographic information associated with participant IDs. - demoinfo <- data.frame(read.csv(ask_demo_filepath)) - #allows the user to specify which columns they want to subset - subset_demo_data <- select.list(c(colnames(demoinfo), "Select all columns"), - preselect = NULL, multiple = TRUE, - title = "Select the columns you would like to subset. The participant ID column must be included.", - graphics = FALSE) - #if the select all option is chosen, selects every column - if (any(grepl("Select all columns", subset_demo_data)) == TRUE) { - subset_demo_data <- colnames(demoinfo) - } - demos_selected <- demoinfo %>% - select(contains(subset_demo_data)) #selects only specified columns from the demographics - - demos <- demos_selected %>% - select(!contains("PID")) %>% - select(!contains("Participant")) #selects only columns that aren't participant ID - - partid <- demos_selected %>% - select(contains(setdiff(colnames(demos_selected), colnames(demos)))) - #creates a new data frame that just includes specified demo domains and combines them into to one string, which will be a total combination of demographics - domaincode <- data.frame(sapply(colnames(demos), function(x) { - domainlvl <- sort(unique(demos[, match(x, colnames(demos))])) #creates a vector of unique domain info - names(domainlvl) <- paste("S", 1:length(domainlvl), sep = "") #alphabetically assigns a code to each - coloutput <- sapply(demos[match(x, colnames(demos))], function(y) { - names(domainlvl)[match(y, domainlvl)] - }) - coloutput - })) - colnames(domaincode) <- paste("Speaker_group_var_code", tolower(colnames(demos)), sep = "_") - colnames(demos) <- paste("Speaker_group_var", tolower(colnames(demos)), sep = "_") - demos <- cbind(demos, domaincode) #bind the assigned codes to the original groups - demos[] <- lapply(demos[], factor) #make each grouping variable a factor - demos <- cbind(PID = partid, demos) #bind participant ID to the demographic groups - - demo_aligned_df <- aligned_ts_df %>% - left_join(demos, by=c("Speaker_names_raw" = "PID")) #align demographic groups by participant ID - - return(demo_aligned_df) - }} - #END DEFINING DEMOGRAPHIC_ALIGN FUNCTION - demographic_align(aligned_ts_df = ts_aligned_df_total) #run demoraphic aligner on aligned data frame -} - -``` - -## Inspect your transcripts -```{r, eval=F} -#TBD ``` -## Analyze your transcripts +## Summarize transcripts ```{r, eval=F} #TBD ``` diff --git a/README.md b/README.md index 973ac5e..fa7889e 100644 --- a/README.md +++ b/README.md @@ -68,203 +68,37 @@ Move your raw transcripts into a folder. The default folder name ConversationAlign will search for on your machine is ‘my_transcripts’. However, if you want to specify your own folder name that’s fine too. You will call that path as an argument to the first function called -read_dyads().
- -We recommend being very careful and deliberate about where you store -your files +read_dyads().
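+
+Before you call read_dyads(), you can sanity-check the staging folder
+(a sketch; the file names here are hypothetical):
+
+``` r
+#any mix of .txt (Otter) and .csv transcripts can sit in the staging folder
+list.files("my_transcripts")
+#e.g., "dyad_01.txt" "dyad_02.csv" (hypothetical file names)
+```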

## Read your transcripts into R ### read_dyads() This function will read all your files and concatenate them into a -single dataframe, appending document IDs +single dataframe, appending document IDs. You can call this dataframe +whatever you like. read_dyads will default to reading all csv and txt +files in a folder called my_transcripts. Just remember that when you are +finished processing a set of transcripts, make sure to move them out of +that folder. You can think of ‘my_transcripts’ as a staging area for +loading data into ConversationAlign. ``` r -myrawtranscripts <- read_dyads() -myrawtranscripts <- read_dyads(/my_customfolder) #if specifying a custom folder path -read_dyads <- function(folder_name = "my_transcripts") { - #defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file. - read_otter_transcript <- function(file_path) { - lines <- readLines(file_path) #read otter ai file - #Ben added - removes otter ai watermark if it is present - if (any(grepl("otter.ai", lines)) == TRUE) { - lines <- as.character(lines[-(grep("otter.ai", lines))])} - num_lines <- length(lines) #create a var for number of lines - speaker <- character() #create speaker col - time <- character() #create time col - text <- character() #create text col - - #process lines of dialogue - current_line <- 1 - while (current_line <= num_lines) { - speaker_time <- strsplit(lines[current_line], " ")[[1]] - speaker <- c(speaker, speaker_time[1]) #select speaker - #Ben added - allows for last names and also timeless transcripts - timeadd <- tryCatch({speaker_time[max(grep(":", speaker_time))]}, #attempts to identify a colon - warning = function(w){return(NA)}) #if no colon, continues without gathering time - time <- c(time, timeadd) - - #select lines of speech - speech_lines <- character() - line_counter <- current_line + 1 - while (line_counter <= num_lines && lines[line_counter] != "") { #if not max line not on empty line - speech_lines <- c(speech_lines, lines[line_counter]) #add text on line to speech line text vector - line_counter <- line_counter + 1 - } - text <- c(text, paste(speech_lines, collapse = " ")) #append speech on line to vector as one string - current_line <- line_counter + 1 #move to next speaker - } - #create df - transcript_df <- data.frame(Speaker_names_raw = speaker,Time = time, RawText = text, - stringsAsFactors = FALSE) - return(transcript_df) - } - #END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION - - read_me_txt <- function(folder_name){ - if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) { - file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending - txtdata <- lapply(file_list_txt, function(x) { - #runs txt files names through otter reading function - xorf <- read_otter_transcript(x) - #selects those that were properly transcribed by otter, still need to add other parameters - if (ncol(xorf) == 3) { - x <- xorf - } - else { - # --- WIP --- Other function for reading in non-otter txt transcripts - } - }) - - data.table::setattr(txtdata, "names", file_list_txt) #add names attribute to each list element - #adds a doc id column to each transcript based on its name attribute - # txtdata <- lapply(names(txtdata), function(x){ - # txtdata[[match(x, names(txtdata))]] <- cbind(Doc_id = rep(x, nrow(txtdata[[match(x, names(txtdata))]])), txtdata[[match(x, 
names(txtdata))]])}) -#returns the list of each data frame with doc IDs. - return(txtdata) - }} #end of the read_me txt function - - read_me_csv <- function(folder_name) { - if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) { - file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE) #list files with .csv ending - #creates a list of read in csv dataframes - csvdata <- lapply(full_file_list, function(x){ - x <- read.csv(x) - if (ncol(x) == 3) { - if (sort(tolower(colnames(x))) != sort("speaker", "text", "time")) { - # --- WIP --- test for columns somehow... - } - } - else { - # --- WIP --- tests for the three columns we need? Maybe it throws an error? - } - }) - - data.table::setattr(csvdata, "names", file_list_csv) #add names attribute to textdata - #adds a doc id column to each transcript based on its name attribute - # csvdata <- lapply(names(csvdata), function(x){ - # csvdata[[match(x, names(csvdata))]] <- cbind(Doc_id = rep(x, nrow(csvdata[[match(x, names(csvdata))]])), csvdata[[match(x, names(csvdata))]])}) - return(csvdata) - }} - #END OF THE READ_ME__CSV FUNCTION - #calls two functions to read in txt and csv file transcripts, returning a list. - txtlist <- read_me_txt(folder_name) - csvlist <- read_me_csv(folder_name) - all_list <- append(txtlist, csvlist) #append the two lists into one list - - all_list_num <- lapply(seq(length(all_list)), function(doc_num){ #iterate over each transcript - all_list[[doc_num]] <- cbind(Doc_id = rep(doc_num, nrow(all_list[[doc_num]])), all_list[[doc_num]])}) #bind the index of the transcript as a column to the data frame - alldf <- bind_rows(all_list_num) #binds the rows of each list into one data frame - return(alldf) - #outputs a data frame containing every dyad with columns: Doc_id, Speaker_names_raw, Time, and RawText -} +MyRawLangSamples <- read_dyads() +#if you want to specify a different folder, supply your own path +MyRawLangSamples <- read_dyads("/my_custompath") ``` -## Clean your transcripts +## Clean your transcripts: clean_dyads -This step uses regular expressions to clean and format your data, -eliminating stopwords, changing the case to lower, omitting whitespaces -and non-alphabetic characters, etc. +clean_dyads uses regular expressions to clean and format your data. The +function also omits stopwords using a custom stopword list, and it +lemmatizes (converts all words to their dictionary entries) unless you +tell it not to (lemmatize=T is the default). Run ‘clean_dyads’ on the +object you just assembled by running the ‘read_dyads’ function in the +last step. 
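+
+If you want to keep inflected word forms, switch off the lemmatize flag
+described above (a sketch; the argument name is taken from the
+description, lemmatize = TRUE being the default):
+
+``` r
+#keep inflected forms rather than dictionary entries
+MyCleanLangSamples <- clean_dyads(MyRawLangSamples, lemmatize = FALSE)
+```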
``` r -#takes object from the read_dyads step -mycleantranscripts <- clean_dyads(myrawtranscripts) -clean_dyads <- function(read_ts_df) { - read_data_frame <- read_ts_df %>% - filter(Speaker_names_raw != "Unknown") %>% #filter out common unwanted speaker names - filter(Speaker_names_raw != "unknown") %>% - filter(Speaker_names_raw != "Speaker") %>% - filter(Speaker_names_raw != "speaker") %>% - filter(Speaker_names_raw != "Other") %>% - filter(Speaker_names_raw != "other") %>% - filter(Speaker_names_raw != "E") %>% - filter(Speaker_names_raw != "e") %>% - filter(Speaker_names_raw != "Experimenter") %>% - filter(Speaker_names_raw != "experimenter") %>% - filter(Speaker_names_raw != "Assistant") %>% - filter(Speaker_names_raw != "assistant") - read_data_frame$Speaker_names_raw <- as.factor(read_data_frame$Speaker_names_raw) #convert variables to factor - read_data_frame$Doc_id <- as.factor(read_data_frame$Doc_id) - - #convert time from hh:mm:ss or mm:ss to milliseconds - read_data_frame$Time <- sapply(read_data_frame$Time, function(x){ - if (any(grepl(":", x)) == TRUE) { #checks for colons, indicative of mm:ss - x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement - if (length(x) == 2) { #shows just mm, ss - sum((x[1]*60000), (x[2]*1000)) } - else if ( length(xvec) == 3) { #shows hh, mm, ss - sum((x[1]*3600000), (x[2]*60000), (x[3]*1000))}} - else if (is.na(x) == TRUE) { #keeps NA time values as NA - may be a better way to do this? - NA}}) - - load("data/omissions_dyads23.rda") #load in omissions database - - clean <- function(x) { - x <- tolower(x) #to lower - x <- gsub("\"", " ", x) - x <- gsub("\n", " ", x) - x <- gsub("`", "'", x) # replaces tick marks with apostrophe for contractions - x <- gsub("can't", "can not", x) - x <- gsub("won't", "will not", x) - x <- gsub("n't", " not", x) #replace contraction with full word not - x <- textclean::replace_contraction(x) #replace contractions - x <- gsub("-", " ", x) #replace all hyphens with spaces - x <- tm::removeWords(x, omissions_dyads23$.) 
- x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers - x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters - x <- gsub("\\b[a]\\b{1}", " ", x) - x <- tm::stripWhitespace(x) - x <- stringr::str_squish(x) - x <- textstem::lemmatize_words(x) - } - - read_data_frame$RawText <- str_squish(read_data_frame$RawText) #remove unneeded white space from text - - df_with_word_count <- read_data_frame %>% - rowwise() %>% #group by individual row - mutate(Analytics_wordcount_raw = length(stri_remove_empty(str_split_1(paste(RawText, collapse = " "), " "))), #create new column of word count by row - Analytics_mean_word_length_raw = mean(nchar(stri_remove_empty(str_split_1(paste(RawText, collapse = " "), pattern = " "))))) %>% #create new column of average word length by row - ungroup() - - dfclean <- df_with_word_count %>% - mutate(CleanText = clean(RawText)) %>% #run clean function on raw text, making a new column - rowwise() %>% #group by individual row - mutate(Analytics_wordcount_clean = length(stri_remove_empty(str_split_1(paste(CleanText, collapse = " "), " "))), # create word count column for cleaned text - Analytics_mean_word_length_clean = mean(nchar(stri_remove_empty(str_split_1(paste(CleanText, collapse = " "), pattern = " "))))) %>% #create mean word length column for clean text - ungroup() %>% - select(!RawText)# remove old raw text and grouping column - - dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text - - dfclean_filtered <- dfclean_sep %>% - filter(CleanText != "")#remove rows where text is an empty string - - #calculate words removed from the difference between the raw word count and clean word count - dfclean_filtered$Analytics_words_removed <- dfclean_filtered$Analytics_wordcount_raw - dfclean_filtered$Analytics_wordcount_clean - - return(dfclean_filtered) -} +MyCleanLangSamples <- clean_dyads(MyRawLangSamples) ``` ## Align your transcripts @@ -273,165 +107,22 @@ Prompts the user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numerous sources (e.g., AffectVec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word, then structures a dataframe by speaker and exchange -across each dyad. +across each dyad.
-``` r -#takes cleaned dataframe from clean_dyads() step -align_dyads <- function(clean_ts_df) { - load("data/lookup_db.rda") #load lookup database - #allow the user to select what variables they want to align, or provide their own database(s) and subset them - myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness", "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence", "add my own database as well"), - preselect = NULL, multiple = TRUE, - title = "Select the variables you would like to align your conversation transcripts on", - graphics = FALSE) - - if (length(myvars) == 0) { #if no variables are selected, defaults are automatically added - myvars <- c("happiness", "hostility", "empathy", "excitement") - } - - var_selected <- lookup_db %>% #select desired columns from lookup_db - select(matches("^word$"), contains(myvars)) - - if (any(grepl("add my own database as well", myvars)) == TRUE) { - #take use input for the full file path to the data base they want to use - database_path <- readline("Input the file path to the database you would like to add.") - user_added_db <- read.csv(database_path) #IS IT OK TO ASSUME THAT DATABASE WILL BE .CSV??? - user_added_db <- data.frame(user_added_db) - #display the column names of user added database and allow them to choose the columns they want - subset_user_db <- select.list(c(colnames(user_added_db), "Select all columns"), - preselect = NULL, multiple = TRUE, - title = "Select the columns you would like to subset. The word column must be included.", - graphics = FALSE) - #allows user to select one option to select every column in their added database - if (any(grepl("Select all columns", subset_user_db)) == TRUE) { - subset_user_db <- colnames(user_added_db) - } - user_added_db <- user_added_db %>% select(contains(subset_user_db)) #select the columns specified from the database - #alter the word column on the added database to match the column name of the built in databse - colnames(user_added_db)[grep("^word$", colnames(user_added_db), ignore.case = TRUE)] <- "word" - #if user added their own database and subsetted from built in - binds both together. 
- if (length(myvars[-grep("add my own database as well", myvars)]) > 0) { - var_selected <- full_join(x = var_selected, y = user_added_db, by="word") - } - } - #create variable containing the column names of each variable to be aligned - var_aligners <- colnames(var_selected)[-grep("^word$", colnames(lookup_db), ignore.case = TRUE)] - - var_selected <- var_selected %>% distinct(word, .keep_all = TRUE) - - ts_list <- split(clean_ts_df, f = clean_ts_df$Doc_id) #split the transcript data frame into a list by Doc_id - ts_aligned_list <- lapply(ts_list, function(ts_select){ - #join measures of each variable to each word in each transcript - df_aligned <- left_join(ts_select, var_selected, by = c("CleanText" = "word"), multiple = "first") - df_aligned <- df_aligned[complete.cases(df_aligned), ] # remove any words that couldn't be aligned - df_aligned <- data.frame(df_aligned) - - df_aligned_agg <- df_aligned %>% - mutate(TurnCount = consecutive_id(Speaker_names_raw), .before = 1) %>% # add a turn column - select(Doc_id, Speaker_names_raw, TurnCount, Time, contains(var_aligners), starts_with("Analytics")) %>% - # select variables, speaker and dyad information, and word analytics - group_by(Doc_id, TurnCount, Speaker_names_raw) %>% #group by doc id, turn, and speaker - summarise(Time = min(Time), #make time the minimum for each turn - across(contains(var_aligners), mean), #average each variable by turn - across(starts_with("Analytics_wordcount"), sum), #sum word counts - across(starts_with("Analytics_words_removed"), sum), #sum removed word counts - across(starts_with("Analytics_mean_word_length"), mean), - .groups = "drop") %>% - ungroup() #reformat data frame back to chronological order - # identifies if there are an odd number of rows (one speaker spoke but other did not respond) - if ((nrow(df_aligned_agg)%%2) == 1 ) { - temprow <- data.frame(matrix(NA, nrow = 1, ncol = ncol(df_aligned_agg))) #creates a new adder row - colnames(temprow) <- c(colnames(df_aligned_agg)) - df_aligned_agg <- rbind(df_aligned_agg, temprow) #adds row full of NA to end of the data frame - } - ExchangeCount <- rep(seq(1:(length(df_aligned_agg$TurnCount)/2)), each=2) #creates Exchange Count - df_aligned_EC <- cbind(ExchangeCount, df_aligned_agg) #binds ExC to the data frame - df_aligned_EC <- na.omit(df_aligned_EC) #removes added NA row - df_aligned_EC <- df_aligned_EC %>% - select(!TurnCount) #removes turn count column - - df_aligned_EC #output the transcript exchange count organized aligned data frame to a list - }) - ts_aligned_df_total <- bind_rows(ts_aligned_list) - - #DEFINE THE DEMOGRAPHIC_ALIGN FUNCTION - demographic_align <- function(aligned_ts_df) { - #allow user to input the file path to demographic data, randomly assign groups, or not align groups - ask_demo_filepath <- readline("If you would like to align demographics to speakers, input the file path to the demographic csv file.") - #if user inputs 'random', randomly assigns groups across transcripts - if (str_to_lower(ask_demo_filepath) == "random") { - randomly <- lapply(split(aligned_ts_df, aligned_ts_df$Doc_id), function(x){ #iterates over each doc - x <- data.frame(x) - #creates a vector of each speaker with random indexes and assigns a alphanumeric sequence name - speakervec <- sample(unique(x[,grep("Speaker_names_raw", colnames(x), ignore.case = T)])) - names(speakervec) <- paste("S", 1:length(speakervec), sep = "") - #creates a data frame with just speaker names and assigned code - coloutput <- data.frame(Speaker_names_raw = speakervec, - 
Speaker_Code_Random = sapply(speakervec, function(y) { - names(speakervec)[match(y, speakervec)]})) - x <- x %>% left_join(coloutput, by=c("Speaker_names_raw")) #binds code to the aligned data frame - }) - randomly <- bind_rows(randomly) #binds all the doc data frame into one - return(randomly) - } - #if input is empty, returns the aligned data frame with no demographics - else if (ask_demo_filepath == "") { - return(aligned_ts_df) - } - #if file path is entered: - else { - #reads in a csv file of demographic information associated with participant IDs. - demoinfo <- data.frame(read.csv(ask_demo_filepath)) - #allows the user to specify which columns they want to subset - subset_demo_data <- select.list(c(colnames(demoinfo), "Select all columns"), - preselect = NULL, multiple = TRUE, - title = "Select the columns you would like to subset. The participant ID column must be included.", - graphics = FALSE) - #if the select all option is chosen, selects every column - if (any(grepl("Select all columns", subset_demo_data)) == TRUE) { - subset_demo_data <- colnames(demoinfo) - } - demos_selected <- demoinfo %>% - select(contains(subset_demo_data)) #selects only specified columns from the demographics - - demos <- demos_selected %>% - select(!contains("PID")) %>% - select(!contains("Participant")) #selects only columns that aren't participant ID - - partid <- demos_selected %>% - select(contains(setdiff(colnames(demos_selected), colnames(demos)))) - #creates a new data frame that just includes specified demo domains and combines them into to one string, which will be a total combination of demographics - domaincode <- data.frame(sapply(colnames(demos), function(x) { - domainlvl <- sort(unique(demos[, match(x, colnames(demos))])) #creates a vector of unique domain info - names(domainlvl) <- paste("S", 1:length(domainlvl), sep = "") #alphabetically assigns a code to each - coloutput <- sapply(demos[match(x, colnames(demos))], function(y) { - names(domainlvl)[match(y, domainlvl)] - }) - coloutput - })) - colnames(domaincode) <- paste("Speaker_group_var_code", tolower(colnames(demos)), sep = "_") - colnames(demos) <- paste("Speaker_group_var", tolower(colnames(demos)), sep = "_") - demos <- cbind(demos, domaincode) #bind the assigned codes to the original groups - demos[] <- lapply(demos[], factor) #make each grouping variable a factor - demos <- cbind(PID = partid, demos) #bind participant ID to the demographic groups - - demo_aligned_df <- aligned_ts_df %>% - left_join(demos, by=c("Speaker_names_raw" = "PID")) #align demographic groups by participant ID - - return(demo_aligned_df) - }} - #END DEFINING DEMOGRAPHIC_ALIGN FUNCTION - demographic_align(aligned_ts_df = ts_aligned_df_total) #run demoraphic aligner on aligned data frame -} -``` - -## Inspect your transcripts +``` r +myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness", "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence"), preselect = NULL, multiple = TRUE, title = "Select the variables you would like to align your conversation transcripts on", graphics = FALSE) +``` ``` r -#TBD +#assign the output to any name you like +MyAlignedLangSamples <- align_dyads(MyCleanLangSamples) ``` -## Analyze your transcripts +## Summarize transcripts ``` r #TBD ``` diff --git a/docs/index.html b/docs/index.html index 2effc4e..a3fd19c 100644 --- a/docs/index.html +++ 
b/docs/index.html @@ -90,8 +90,7 @@

Prep your language transcripts

@@ -99,356 +98,34 @@

Read your transcripts into R

read_dyads()

-

This function will read all your files and concatenate them into a single dataframe, appending document IDs

-
myrawtranscripts <- read_dyads()
-myrawtranscripts <- read_dyads(/my_customfolder)  #if specifying a custom folder path
-read_dyads <- function(folder_name = "my_transcripts") {
-  #defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file.
-  read_otter_transcript <- function(file_path) {
-    lines <- readLines(file_path) #read otter ai file
-    #Ben added - removes otter ai watermark if it is present
-    if (any(grepl("otter.ai", lines)) == TRUE) {
-      lines <- as.character(lines[-(grep("otter.ai", lines))])}
-    num_lines <- length(lines) #create a var for number of lines
-    speaker <- character() #create speaker col
-    time <- character() #create time col
-    text <- character() #create text col
-
-    #process lines of dialogue
-    current_line <- 1
-    while (current_line <= num_lines) {
-      speaker_time <- strsplit(lines[current_line], " ")[[1]]
-      speaker <- c(speaker, speaker_time[1]) #select speaker
-      #Ben added - allows for last names and also timeless transcripts
-      timeadd <- tryCatch({speaker_time[max(grep(":", speaker_time))]}, #attempts to identify a colon
-                          warning = function(w){return(NA)}) #if no colon, continues without gathering time
-      time <- c(time, timeadd)
-
-      #select lines of speech
-      speech_lines <- character()
-      line_counter <- current_line + 1
-      while (line_counter <= num_lines && lines[line_counter] != "") { #if not max line not on empty line
-        speech_lines <- c(speech_lines, lines[line_counter]) #add text on line to speech line text vector
-        line_counter <- line_counter + 1
-      }
-      text <- c(text, paste(speech_lines, collapse = " ")) #append speech on line to vector as one string
-      current_line <- line_counter + 1 #move to next speaker
-    }
-    #create df
-    transcript_df <- data.frame(Speaker_names_raw = speaker,Time = time, RawText = text,
-    stringsAsFactors = FALSE)
-    return(transcript_df)
-  }
-  #END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION
-
-  read_me_txt <- function(folder_name){
-    if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
-      file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending
-      txtdata <- lapply(file_list_txt, function(x) {
-        #runs txt files names through otter reading function
-        xorf <- read_otter_transcript(x)
-        #selects those that were properly transcribed by otter, still need to add other parameters
-        if (ncol(xorf) == 3) {
-          x <- xorf
-        }
-        else {
-          # --- WIP --- Other function for reading in non-otter txt transcripts
-        }
-      })
-
-      data.table::setattr(txtdata, "names", file_list_txt) #add names attribute to each list element
-      #adds a doc id column to each transcript based on its name attribute
-      #    txtdata <- lapply(names(txtdata), function(x){
-      #      txtdata[[match(x, names(txtdata))]] <- cbind(Doc_id = rep(x, nrow(txtdata[[match(x, names(txtdata))]])), txtdata[[match(x, names(txtdata))]])})
-      #returns the list of each data frame with doc IDs.
-      return(txtdata)
-    }} #end of the read_me txt function
-
-  read_me_csv <- function(folder_name) {
-    if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
-      file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE) #list files with .csv ending
-      #creates a list of read in csv dataframes
-      csvdata <- lapply(full_file_list, function(x){
-        x <- read.csv(x)
-        if (ncol(x) == 3) {
-          if (sort(tolower(colnames(x))) != sort("speaker", "text", "time")) {
-            # --- WIP --- test for columns somehow...
-          }
-        }
-        else {
-          # --- WIP --- tests for the three columns we need? Maybe it throws an error?
-        }
-      })
-
-      data.table::setattr(csvdata, "names", file_list_csv) #add names attribute to textdata
-      #adds a doc id column to each transcript based on its name attribute
-      #      csvdata <- lapply(names(csvdata), function(x){
-      #        csvdata[[match(x, names(csvdata))]] <- cbind(Doc_id = rep(x, nrow(csvdata[[match(x, names(csvdata))]])), csvdata[[match(x, names(csvdata))]])})
-      return(csvdata)
-    }}
-  #END OF THE READ_ME__CSV FUNCTION
-  #calls two functions to read in txt and csv file transcripts, returning a list.
-  txtlist <- read_me_txt(folder_name)
-  csvlist <- read_me_csv(folder_name)
-  all_list <- append(txtlist, csvlist) #append the two lists into one list
-
-  all_list_num <- lapply(seq(length(all_list)), function(doc_num){ #iterate over each transcript
-    all_list[[doc_num]] <- cbind(Doc_id = rep(doc_num, nrow(all_list[[doc_num]])), all_list[[doc_num]])}) #bind the index of the transcript as a column to the data frame
-  alldf <- bind_rows(all_list_num) #binds the rows  of each list into one data frame
-  return(alldf)
-  #outputs a data frame containing every dyad with columns: Doc_id, Speaker_names_raw, Time, and RawText
-}
+

This function will read all your files and concatenate them into a single dataframe, appending document IDs. You can call this dataframe whatever you like. read_dyads will default to reading all csv and txt files in a folder called my_transcripts. Just remember that when you are finished processing a set of transcripts, make sure to move them out of that folder. You can think of ‘my_transcripts’ as a staging area for loading data into ConversationAlign.

+
+MyRawLangSamples <- read_dyads()
+#if you want to specify a different folder, supply your own path
+MyRawLangSamples <- read_dyads("/my_custompath")
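+#a quick sanity check: per the reader above, the result is one dataframe
+#with columns Doc_id, Speaker_names_raw, Time, and RawText
+str(MyRawLangSamples)
+table(MyRawLangSamples$Doc_id) #rows contributed by each transcript file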
-

Clean your transcripts +

Clean your transcripts: clean_dyads

-

This step uses regular expressions to clean and format your data, eliminating stopwords, changing the case to lower, omitting whitespaces and non-alphabetic characters, etc.

+

clean_dyads uses regular expressions to clean and format your data. The function also omits stopwords using a custom stopword list, and it lemmatizes (converts all words to their dictionary entries) unless you tell it not to (lemmatize=T is the default). Run ‘clean_dyads’ on the object you just assembled by running the ‘read_dyads’ function in the last step.
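+
+For intuition, a standalone sketch (not the package function) of the kinds of regex transformations the cleaner applies:
+
+x <- tolower("I can't go -- it's 6th Street!")
+x <- gsub("can't", "can not", x) #expand contractions
+x <- gsub("\\d+(st|nd|rd|th)", " ", x) #drop ordinal numbers
+x <- gsub("[^a-zA-Z]", " ", x) #drop non-alphabetic characters
+stringr::str_squish(x) #"i can not go it s street"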

-#takes object from the read_dyads step
-mycleantranscripts <- clean_dyads(myrawtranscripts)
-clean_dyads <- function(read_ts_df) {
-  read_data_frame <- read_ts_df %>%
-    filter(Speaker_names_raw != "Unknown") %>% #filter out common unwanted speaker names
-    filter(Speaker_names_raw != "unknown") %>%
-    filter(Speaker_names_raw != "Speaker") %>%
-    filter(Speaker_names_raw != "speaker") %>%
-    filter(Speaker_names_raw != "Other") %>%
-    filter(Speaker_names_raw != "other") %>%
-    filter(Speaker_names_raw != "E") %>%
-    filter(Speaker_names_raw != "e") %>%
-    filter(Speaker_names_raw != "Experimenter") %>%
-    filter(Speaker_names_raw != "experimenter") %>%
-    filter(Speaker_names_raw != "Assistant") %>%
-    filter(Speaker_names_raw != "assistant")
-  read_data_frame$Speaker_names_raw <- as.factor(read_data_frame$Speaker_names_raw) #convert variables to factor
-  read_data_frame$Doc_id <- as.factor(read_data_frame$Doc_id)
-
-  #convert time from hh:mm:ss or mm:ss to milliseconds
-  read_data_frame$Time <- sapply(read_data_frame$Time, function(x){
-    if (any(grepl(":", x)) == TRUE) {  #checks for colons, indicative of mm:ss
-      x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement
-      if (length(x) == 2) { #shows just mm, ss
-        sum((x[1]*60000), (x[2]*1000))      }
-      else if ( length(xvec) == 3) { #shows hh, mm, ss
-        sum((x[1]*3600000), (x[2]*60000), (x[3]*1000))}}
-    else if (is.na(x) == TRUE) { #keeps NA time values as NA - may be a better way to do this?
-      NA}})
-
-  load("data/omissions_dyads23.rda") #load in omissions database
-
-  clean <- function(x) {
-    x <- tolower(x) #to lower
-    x <- gsub("\"", " ", x)
-    x <- gsub("\n", " ", x)
-    x <- gsub("`", "'", x)  # replaces tick marks with apostrophe for contractions
-    x <- gsub("can't", "can not", x)
-    x <- gsub("won't", "will not", x)
-    x <- gsub("n't", " not", x) #replace contraction with full word not
-    x <- textclean::replace_contraction(x) #replace contractions
-    x <- gsub("-", " ", x) #replace all hyphens with spaces
-    x <- tm::removeWords(x, omissions_dyads23$.)
-    x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
-    x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
-    x <- gsub("\\b[a]\\b{1}", " ", x)
-    x <- tm::stripWhitespace(x)
-    x <- stringr::str_squish(x)
-    x <- textstem::lemmatize_words(x)
-  }
-
-  read_data_frame$RawText <- str_squish(read_data_frame$RawText) #remove unneeded white space from text
-
-  df_with_word_count <- read_data_frame %>%
-    rowwise() %>% #group by individual row
-    mutate(Analytics_wordcount_raw = length(stri_remove_empty(str_split_1(paste(RawText, collapse = " "), " "))), #create new column of word count by row
-           Analytics_mean_word_length_raw = mean(nchar(stri_remove_empty(str_split_1(paste(RawText, collapse = " "), pattern = " "))))) %>% #create new column of average word length by row
-    ungroup()
-
-  dfclean <- df_with_word_count %>%
-    mutate(CleanText = clean(RawText)) %>%  #run clean function on raw text, making a new column
-    rowwise() %>% #group by individual row
-    mutate(Analytics_wordcount_clean = length(stri_remove_empty(str_split_1(paste(CleanText, collapse = " "), " "))), # create word count column for cleaned text
-           Analytics_mean_word_length_clean = mean(nchar(stri_remove_empty(str_split_1(paste(CleanText, collapse = " "), pattern = " "))))) %>% #create mean word length column for clean text
-    ungroup() %>%
-    select(!RawText)# remove old raw text and grouping column
-
-  dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text
-
-  dfclean_filtered <- dfclean_sep %>%
-    filter(CleanText != "")#remove rows where text is an empty string
-
-  #calculate words removed from the difference between the raw word count and clean word count
-  dfclean_filtered$Analytics_words_removed <- dfclean_filtered$Analytics_wordcount_raw - dfclean_filtered$Analytics_wordcount_clean
-
-  return(dfclean_filtered)
-}
+MyCleanLangSamples <- clean_dyads(MyRawLangSamples)
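+#hedged sketch: lemmatize = TRUE is the default per the description above;
+#pass lemmatize = FALSE to keep inflected word forms
+MyCleanLangSamples_unlemmatized <- clean_dyads(MyRawLangSamples, lemmatize = FALSE)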

Align your transcripts

-

Prompts the user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numerous sources (e.g., AffectVec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word, then structures a dataframe by speaker and exchange across each dyad.

-
-#takes cleaned dataframe from clean_dyads() step
-align_dyads <- function(clean_ts_df) {
-  load("data/lookup_db.rda") #load lookup database
-  #allow the user to select what variables they want to align, or provide their own database(s) and subset them
-  myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness",  "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence", "add my own database as well"),
-                        preselect = NULL, multiple = TRUE,
-                        title = "Select the variables you would like to align your conversation transcripts on",
-                        graphics = FALSE)
-
-  if (length(myvars) == 0) { #if no variables are selected, defaults are automatically added
-    myvars <- c("happiness", "hostility", "empathy", "excitement")
-  }
-
-  var_selected <- lookup_db %>% #select desired columns from lookup_db
-    select(matches("^word$"), contains(myvars))
-
-  if (any(grepl("add my own database as well", myvars)) == TRUE) {
-    #take use input for the full file path to the data base they want to use
-    database_path <- readline("Input the file path to the database you would like to add.")
-    user_added_db <- read.csv(database_path) #IS IT OK TO ASSUME THAT DATABASE WILL BE .CSV???
-    user_added_db <- data.frame(user_added_db)
-    #display the column names of user added database and allow them to choose the columns they want
-    subset_user_db <- select.list(c(colnames(user_added_db), "Select all columns"),
-                                  preselect = NULL, multiple = TRUE,
-                                  title = "Select the columns you would like to subset. The word column must be included.",
-                                  graphics = FALSE)
-    #allows user to select one option to select every column in their added database
-    if (any(grepl("Select all columns", subset_user_db)) == TRUE) {
-      subset_user_db <- colnames(user_added_db)
-    }
-    user_added_db <- user_added_db %>% select(contains(subset_user_db)) #select the columns specified from the database
-    #alter the word column on the added database to match the column name of the built in databse
-    colnames(user_added_db)[grep("^word$", colnames(user_added_db), ignore.case = TRUE)] <- "word"
-    #if user added their own database and subsetted from built in - binds both together.
-    if (length(myvars[-grep("add my own database as well", myvars)]) > 0) {
-      var_selected <- full_join(x = var_selected, y = user_added_db, by="word")
-    }
-  }
-  #create variable containing the column names of each variable to be aligned
-  var_aligners <- colnames(var_selected)[-grep("^word$", colnames(lookup_db), ignore.case = TRUE)]
-
-  var_selected <- var_selected %>% distinct(word, .keep_all = TRUE)
-
-  ts_list <- split(clean_ts_df, f = clean_ts_df$Doc_id) #split the transcript data frame into a list by Doc_id
-  ts_aligned_list <- lapply(ts_list, function(ts_select){
-    #join measures of each variable to each word in each transcript
-    df_aligned <- left_join(ts_select, var_selected, by = c("CleanText" = "word"), multiple = "first")
-    df_aligned <- df_aligned[complete.cases(df_aligned), ] # remove any words that couldn't be aligned
-    df_aligned <- data.frame(df_aligned)
-
-    df_aligned_agg <- df_aligned %>%
-      mutate(TurnCount = consecutive_id(Speaker_names_raw), .before = 1) %>% # add a turn column
-      select(Doc_id, Speaker_names_raw, TurnCount, Time, contains(var_aligners), starts_with("Analytics")) %>%
-      # select variables, speaker and dyad information, and word analytics
-      group_by(Doc_id, TurnCount, Speaker_names_raw) %>% #group by doc id, turn, and speaker
-      summarise(Time = min(Time), #make time the minimum for each turn
-                across(contains(var_aligners), mean), #average each variable by turn
-                across(starts_with("Analytics_wordcount"), sum), #sum word counts
-                across(starts_with("Analytics_words_removed"), sum), #sum removed word counts
-                across(starts_with("Analytics_mean_word_length"), mean),
-                .groups = "drop") %>%
-      ungroup() #reformat data frame back to chronological order
-    # identifies if there are an odd number of rows (one speaker spoke but other did not respond)
-    if ((nrow(df_aligned_agg)%%2) == 1 ) {
-      temprow <- data.frame(matrix(NA, nrow = 1, ncol = ncol(df_aligned_agg))) #creates a new adder row
-      colnames(temprow) <- c(colnames(df_aligned_agg))
-      df_aligned_agg <- rbind(df_aligned_agg, temprow) #adds row full of NA to end of the data frame
-    }
-    ExchangeCount <- rep(seq(1:(length(df_aligned_agg$TurnCount)/2)), each=2) #creates Exchange Count
-    df_aligned_EC <- cbind(ExchangeCount, df_aligned_agg) #binds ExC to the data frame
-    df_aligned_EC <- na.omit(df_aligned_EC) #removes added NA row
-    df_aligned_EC <- df_aligned_EC %>%
-      select(!TurnCount) #removes turn count column
-
-    df_aligned_EC #output the transcript exchange count organized aligned data frame to a list
-  })
-  ts_aligned_df_total <- bind_rows(ts_aligned_list)
-
-  #DEFINE THE DEMOGRAPHIC_ALIGN FUNCTION
-  demographic_align <- function(aligned_ts_df) {
-    #allow user to input the file path to demographic data, randomly assign groups, or not align groups
-    ask_demo_filepath <- readline("If you would like to align demographics to speakers, input the file path to the demographic csv file.")
-    #if user inputs 'random', randomly assigns groups across transcripts
-    if (str_to_lower(ask_demo_filepath) == "random") {
-      randomly <- lapply(split(aligned_ts_df, aligned_ts_df$Doc_id), function(x){ #iterates over each doc
-        x <- data.frame(x)
-        #creates a vector of each speaker with random indexes and assigns a alphanumeric sequence name
-        speakervec <- sample(unique(x[,grep("Speaker_names_raw", colnames(x), ignore.case = T)]))
-        names(speakervec) <- paste("S", 1:length(speakervec), sep = "")
-        #creates a data frame with just speaker names and assigned code
-        coloutput <- data.frame(Speaker_names_raw = speakervec,
-                                Speaker_Code_Random = sapply(speakervec, function(y) {
-                                  names(speakervec)[match(y, speakervec)]}))
-        x <- x %>% left_join(coloutput, by=c("Speaker_names_raw")) #binds code to the aligned data frame
-      })
-      randomly <- bind_rows(randomly) #binds all the doc data frame into one
-      return(randomly)
-    }
-    #if input is empty, returns the aligned data frame with no demographics
-    else if (ask_demo_filepath == "") {
-      return(aligned_ts_df)
-    }
-    #if file path is entered:
-    else {
-      #reads in a csv file of demographic information associated with participant IDs.
-      demoinfo <- data.frame(read.csv(ask_demo_filepath))
-      #allows the user to specify which columns they want to subset
-      subset_demo_data <- select.list(c(colnames(demoinfo), "Select all columns"),
-                                      preselect = NULL, multiple = TRUE,
-                                      title = "Select the columns you would like to subset. The participant ID column must be included.",
-                                      graphics = FALSE)
-      #if the select all option is chosen, selects every column
-      if (any(grepl("Select all columns", subset_demo_data)) == TRUE) {
-        subset_demo_data <- colnames(demoinfo)
-      }
-      demos_selected <- demoinfo %>%
-        select(contains(subset_demo_data)) #selects only specified columns from the demographics
-
-      demos <- demos_selected %>%
-        select(!contains("PID")) %>%
-        select(!contains("Participant")) #selects only columns that aren't participant ID
-
-      partid <- demos_selected %>%
-        select(contains(setdiff(colnames(demos_selected), colnames(demos))))
-      #creates a new data frame that just includes specified demo domains and combines them into to one string, which will be a total combination of demographics
-      domaincode <- data.frame(sapply(colnames(demos), function(x) {
-        domainlvl <- sort(unique(demos[, match(x, colnames(demos))]))  #creates a vector of unique domain info
-        names(domainlvl) <- paste("S", 1:length(domainlvl), sep = "")  #alphabetically assigns a code to each
-        coloutput <- sapply(demos[match(x, colnames(demos))], function(y) {
-          names(domainlvl)[match(y, domainlvl)]
-        })
-        coloutput
-      }))
-      colnames(domaincode) <- paste("Speaker_group_var_code", tolower(colnames(demos)), sep = "_")
-      colnames(demos) <- paste("Speaker_group_var", tolower(colnames(demos)), sep = "_")
-      demos <- cbind(demos, domaincode) #bind the assigned codes to the original groups
-      demos[] <- lapply(demos[], factor) #make each grouping variable a factor
-      demos <- cbind(PID = partid, demos) #bind participant ID to the demographic groups
-
-      demo_aligned_df <- aligned_ts_df %>%
-        left_join(demos, by=c("Speaker_names_raw" = "PID")) #align demographic groups by participant ID
-
-      return(demo_aligned_df)
-    }}
-  #END DEFINING DEMOGRAPHIC_ALIGN FUNCTION
-  demographic_align(aligned_ts_df = ts_aligned_df_total) #run demoraphic aligner on aligned data frame
-}
+

Prompts the user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numerous sources (e.g., AffectVec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word, then structures a dataframe by speaker and exchange across each dyad.

+

myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness", "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence"), preselect = NULL, multiple = TRUE, title = "Select the variables you would like to align your conversation transcripts on", graphics = FALSE)

+
#assign the output to any name you like
MyAlignedLangSamples <- align_dyads(MyCleanLangSamples)
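+#notes grounded in the removed function above: selecting nothing in the
+#picker falls back to happiness, hostility, empathy, and excitement;
+#align_dyads also prompts for an optional demographics csv ("random"
+#assigns random speaker codes, pressing return skips demographics)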
-

Inspect your transcripts +

Summarize transcripts

 #TBD
-

Analyze your transcripts -

-
-#TBD
-
-

Get in touch!

Contact for feedback and assistance.

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index c99a003..2747333 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -3,7 +3,7 @@ pkgdown: 2.0.7 pkgdown_sha: ~ articles: my-vignette: my-vignette.html -last_built: 2023-10-16T00:11Z +last_built: 2023-10-16T00:29Z urls: reference: https://reilly-conceptscognitionlab.github.io/ConversationAlign/reference article: https://reilly-conceptscognitionlab.github.io/ConversationAlign/articles