Skip to content


help files updated
Browse files Browse the repository at this point in the history
  • Loading branch information
reilly-lab committed Oct 16, 2023
1 parent 8ee13da commit 401da62
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 829 deletions.
159 changes: 5 additions & 154 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -62,165 +62,16 @@ MyCleanLangSamples <- clean_dyads()

## Align your transcripts
Prompts the user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numerous sources (e.g., affectvec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word, then structures a dataframe by speaker and exchange across each dyad.
Prompts the user to specify one or more variables to align on from a lookup database (lookup_db) reflecting published word norms from numerous sources (e.g., affectvec, Kuperman norms, Brysbaert norms, etc.). Yokes data to each word, then structures a dataframe by speaker and exchange across each dyad. <br/>

myvars <- select.list(c("admiration", "anger", "animosity", "anticipation", "anxiety", "aoa", "awe", "boredom", "calmness", "closeness", "comfort", "compatibility", "concreteness", "confusion", "contempt", "disgust", "distance", "dominance", "doubt", "empathy", "encouragement", "excitement", "fear", "friendliness", "gratitude", "happiness", "hostility", "interest", "joy", "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction", "stress", "surprise", "tension", "trust", "valence"

```{r, eval=F}
#takes cleaned dataframe from clean_dyads() step
# Align cleaned transcripts to word norms and summarise them turn by turn.
#
# Interactive: prompts (select.list / readline) for the lookup_db variables to
# align on, optionally merges a user-supplied CSV of word norms, and finally
# asks how speaker demographics should be attached.
#
# Args:
#   clean_ts_df: cleaned data frame from the clean_dyads() step. This code
#     reads its Doc_id, Speaker_names_raw, CleanText and Time columns plus
#     every column whose name starts with "Analytics".
#
# Returns:
#   One data frame covering all transcripts, one row per speaker turn, with
#   an ExchangeCount column and the per-turn mean of each aligned variable.
#   Depending on the last prompt it also carries a random speaker code
#   ("random"), nothing extra (empty input), or demographic grouping columns
#   joined by participant ID (a CSV path).
#
# Requires dplyr (%>%, select, left_join, ...) to be attached by the caller.
align_dyads <- function(clean_ts_df) {
  add_db_option <- "add my own database as well"
  select_all_option <- "Select all columns"

  # Read a user-supplied CSV, failing early with a readable message.
  # NOTE(review): assumes the file is a .csv, as the original code did.
  read_csv_checked <- function(path) {
    path <- trimws(path)
    if (!file.exists(path)) {
      stop("File not found: ", path, call. = FALSE)
    }
    data.frame(read.csv(path))
  }

  # Let the user pick columns of `df`; the "Select all columns" entry expands
  # to every column. Uses exact names rather than substring matching so that
  # picking "age" cannot also pull in "language".
  choose_columns <- function(df, title) {
    picked <- select.list(
      c(colnames(df), select_all_option),
      preselect = NULL, multiple = TRUE, title = title, graphics = FALSE
    )
    if (select_all_option %in% picked) {
      picked <- colnames(df)
    }
    df[, picked, drop = FALSE]
  }

  # Map each value to "S1", "S2", ... by its rank among the sorted unique
  # values; missing values stay NA.
  code_levels <- function(values) {
    lvls <- sort(unique(values))
    paste0("S", seq_along(lvls))[match(values, lvls)]
  }

  load("data/lookup_db.rda") # load lookup database (provides lookup_db)

  # allow the user to select what variables they want to align, or provide
  # their own database(s) and subset them
  myvars <- select.list(
    c(
      "admiration", "anger", "animosity", "anticipation", "anxiety", "aoa",
      "awe", "boredom", "calmness", "closeness", "comfort", "compatibility",
      "concreteness", "confusion", "contempt", "disgust", "distance",
      "dominance", "doubt", "empathy", "encouragement", "excitement", "fear",
      "friendliness", "gratitude", "happiness", "hostility", "interest", "joy",
      "lg10wf", "love", "n_letters", "relieved", "sadness", "satisfaction",
      "stress", "surprise", "tension", "trust", "valence", add_db_option
    ),
    preselect = NULL, multiple = TRUE,
    title = "Select the variables you would like to align your conversation transcripts on",
    graphics = FALSE
  )

  # if no variables are selected, defaults are automatically added
  if (length(myvars) == 0) {
    myvars <- c("happiness", "hostility", "empathy", "excitement")
  }
  wants_own_db <- add_db_option %in% myvars
  builtin_vars <- setdiff(myvars, add_db_option)

  # select desired columns from lookup_db (word column first)
  var_selected <- lookup_db %>%
    select(matches("^word$"), contains(builtin_vars))

  if (wants_own_db) {
    database_path <- readline("Input the file path to the database you would like to add.")
    user_added_db <- choose_columns(
      read_csv_checked(database_path),
      "Select the columns you would like to subset. The word column must be included."
    )

    # rename the user's word column (any capitalisation) to match lookup_db
    is_word <- grepl("^word$", colnames(user_added_db), ignore.case = TRUE)
    if (sum(is_word) != 1) {
      stop("The added database must include exactly one 'word' column.",
        call. = FALSE
      )
    }
    colnames(user_added_db)[is_word] <- "word"

    if (length(builtin_vars) > 0) {
      # built-in variables were chosen as well: bind both together
      var_selected <- full_join(x = var_selected, y = user_added_db, by = "word")
    } else {
      # only the user's database was requested; without this branch its
      # columns would be silently discarded
      var_selected <- user_added_db
    }
  }

  var_selected <- var_selected %>% distinct(word, .keep_all = TRUE)
  # names of every variable to be aligned, taken from the table actually used
  var_aligners <- setdiff(colnames(var_selected), "word")

  # split the transcript data frame into a list by Doc_id
  ts_list <- split(clean_ts_df, f = clean_ts_df$Doc_id)

  ts_aligned_list <- lapply(ts_list, function(ts_select) {
    # join measures of each variable to each word in each transcript
    df_aligned <- left_join(ts_select, var_selected,
      by = c("CleanText" = "word"), multiple = "first"
    )
    # remove any words that couldn't be aligned
    df_aligned <- data.frame(df_aligned[complete.cases(df_aligned), ])

    df_aligned_agg <- df_aligned %>%
      # a new turn starts whenever the speaker changes
      mutate(TurnCount = consecutive_id(Speaker_names_raw), .before = 1) %>%
      select(
        Doc_id, Speaker_names_raw, TurnCount, Time,
        contains(var_aligners), starts_with("Analytics")
      ) %>%
      group_by(Doc_id, TurnCount, Speaker_names_raw) %>%
      summarise(
        Time = min(Time), # time of a turn is its earliest time
        across(contains(var_aligners), mean), # average each variable by turn
        across(starts_with("Analytics_wordcount"), sum),
        across(starts_with("Analytics_words_removed"), sum),
        across(starts_with("Analytics_mean_word_length"), mean),
        .groups = "drop"
      )

    # Consecutive pairs of turns form one exchange (rows 1-2 -> 1, 3-4 -> 2,
    # ...). A trailing unanswered turn gets its own exchange number, so no
    # NA padding row is needed and empty transcripts are handled.
    ExchangeCount <- (seq_len(nrow(df_aligned_agg)) + 1L) %/% 2L
    df_aligned_EC <- cbind(ExchangeCount, df_aligned_agg)
    df_aligned_EC %>% select(!TurnCount)
  })
  ts_aligned_df_total <- bind_rows(ts_aligned_list)

  # Attach speaker grouping information to the aligned data frame. The user
  # types "random" (random codes per transcript), nothing (no demographics),
  # or the path to a demographic CSV keyed by participant ID.
  demographic_align <- function(aligned_ts_df) {
    ask_demo_filepath <- readline("If you would like to align demographics to speakers, input the file path to the demographic csv file.")
    ask_demo_filepath <- trimws(ask_demo_filepath)

    # empty input: return the aligned data frame with no demographics
    if (ask_demo_filepath == "") {
      return(aligned_ts_df)
    }

    # 'random': randomly assign S1, S2, ... to the speakers of each transcript
    if (tolower(ask_demo_filepath) == "random") {
      randomly <- lapply(
        split(aligned_ts_df, aligned_ts_df$Doc_id),
        function(x) {
          x <- data.frame(x)
          speakers <- unique(x[["Speaker_names_raw"]])
          # index-based shuffle: sample(x) on a single numeric id would
          # sample from 1:x instead of returning the id
          speakers <- speakers[sample.int(length(speakers))]
          coloutput <- data.frame(
            Speaker_names_raw = speakers,
            Speaker_Code_Random = paste0("S", seq_along(speakers))
          )
          left_join(x, coloutput, by = "Speaker_names_raw")
        }
      )
      return(bind_rows(randomly))
    }

    # otherwise treat the input as a path to a demographic CSV
    demos_selected <- choose_columns(
      read_csv_checked(ask_demo_filepath),
      "Select the columns you would like to subset. The participant ID column must be included."
    )

    # the participant ID column is the one whose name contains "PID" or
    # "Participant" (case-insensitive); everything else is a grouping variable
    is_id <- grepl("PID|Participant", colnames(demos_selected),
      ignore.case = TRUE
    )
    if (sum(is_id) != 1) {
      stop("Select exactly one participant ID column ",
        "(its name must contain 'PID' or 'Participant').",
        call. = FALSE
      )
    }
    if (all(is_id)) {
      stop("Select at least one demographic column besides the participant ID.",
        call. = FALSE
      )
    }
    partid <- demos_selected[[which(is_id)]]
    demos <- demos_selected[, !is_id, drop = FALSE]

    # one code column per grouping variable, coded alphabetically S1, S2, ...
    domaincode <- data.frame(lapply(demos, code_levels))
    colnames(domaincode) <- paste("Speaker_group_var_code",
      tolower(colnames(demos)),
      sep = "_"
    )
    colnames(demos) <- paste("Speaker_group_var", tolower(colnames(demos)),
      sep = "_"
    )
    demos <- cbind(demos, domaincode) # bind the codes to the original groups
    demos[] <- lapply(demos, factor) # make each grouping variable a factor
    demos <- cbind(PID = partid, demos) # key column is always named PID

    # align demographic groups by participant ID
    # NOTE(review): assumes PID and Speaker_names_raw have compatible types
    left_join(aligned_ts_df, demos, by = c("Speaker_names_raw" = "PID"))
  }

  # run demographic aligner on aligned data frame
  demographic_align(aligned_ts_df = ts_aligned_df_total)
}

## Inspect your transcripts
```{r, eval=F}

## Analyze your transcripts
## Summarize transcripts
```{r, eval=F}
Expand Down

0 comments on commit 401da62

Please sign in to comment.