Skip to content

Commit

Permalink
Revert "added new split summarize, removed time addition in read"
Browse files Browse the repository at this point in the history
This reverts commit 89185c8.
  • Loading branch information
Ben-Sacks committed Oct 30, 2024
1 parent 89185c8 commit 6789257
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 667 deletions.
9 changes: 4 additions & 5 deletions R/align_dyads.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#' @export align_dyads

align_dyads <- function(clean_ts_df) {
load("data/lookup_db.rda")
#allow the user to select what variables they want to align, or provide their own database(s) and subset them
myvars <- select.list(c("aff_anger", "aff_anxiety", "aff_boredom", "aff_closeness",
"aff_confusion", "aff_dominance", "aff_doubt", "aff_empathy",
Expand All @@ -43,20 +42,20 @@ align_dyads <- function(clean_ts_df) {
dplyr::select(matches("^word$"), tidyselect::contains(myvars))
#create variable containing the column names of each variable to be aligned
var_aligners <- colnames(var_selected)[-grep("^word$", colnames(lookup_db), ignore.case = TRUE)]

#join measures of each variable to each word in each transcript
df_aligned <- dplyr::left_join(clean_ts_df, var_selected, by = c("CleanText" = "word"), multiple = "first")

# index rows where a word could not be aligned on all dimensions and remove that word from the clean text
df_aligned$CleanText[which(rowSums(is.na(df_aligned[,which(colnames(df_aligned) %in% var_aligners)])) == length(var_aligners))] <- ""
# important to note here that the row is preserved, the text is subbed for an empty string

# group on event and turn, then take the number of words in that turn which could be aligned on
df_aligned_wordcount <- df_aligned %>%
dplyr::group_by(Event_ID, TurnCount) %>%
dplyr::mutate(NWords_ByPersonTurn_CLEAN_ALIGNED = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>%
dplyr::ungroup()

#add an exchange count variable, which is one turn from each interlocutor
df_aligned_wordcount$ExchangeCount <- ceiling(df_aligned_wordcount$TurnCount / 2)
#rearrange the columns to be more readable
Expand Down
71 changes: 28 additions & 43 deletions R/clean_dyads.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,29 @@
#' @export clean_dyads

clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
load("data/omissions_dyads23.rda")
default <- TRUE

# if not default and not proper file path (aside from default), throw an error
if (class(stop_words_df) != "character" | (!grepl(".csv", stop_words_df) & stop_words_df != "default")) {
stop("The argument 'stop_words_df' takes a string type, and should be a filepath leading to a csv file.")
}
# otherwise, if not default and also a csv path
else if (stop_words_df != "default") {
# read the filepath to a csv if given, can also supply a data frame
stop_words_df <- read.csv(stop_words_df)
default <- FALSE
# throw an error if there is no column called 'word'
if (all(grepl("word", colnames(stop_words_df), ignore.case = T) == FALSE)) {
stop("The given stopwords data frame must have a column called 'word' or 'Word")
}
colnames(stop_words_df) <- tolower(colnames(stop_words_df))
}

# ADD LAPPLY TO RECOGNIZE DYADS WITH MORE THAT TWO INTERLOCUTORS AND THROWS A WARNING WITH PROBLEM DYADS

#set Event_ID and speaker names as factors
read_ts_df$Participant_ID <- as.factor(read_ts_df$Participant_ID) #convert variables to factor
read_ts_df$Event_ID <- as.factor(read_ts_df$Event_ID)


if (any(grepl("Time", colnames(read_ts_df), ignore.case = TRUE)) == TRUE){
#convert Time from hh:mm:ss or mm:ss to milliseconds
read_ts_df$Time <- sapply(read_ts_df$Time, function(x){
if (any(grepl(":", x)) == TRUE) { #checks for colons, indicative of mm:ss
x <- as.numeric(unlist(str_split(x, ":"))) #breaks string into vector by colon placement
if (length(x) == 2) { #shows just mm, ss
sum((x[1]*60000), (x[2]*1000))
}
else if ( length(xvec) == 3) { #shows hh, mm, ss
sum((x[1]*3600000), (x[2]*60000), (x[3]*1000))
}}
else {
x
}
})
}

if(lemmatize==TRUE) {
clean <- function(x) {
x <- tolower(x) #to lower
Expand All @@ -59,14 +59,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- gsub("n't", " not", x) #replace contraction with full word not
x <- textclean::replace_contraction(x) #replace contractions
x <- gsub("-", " ", x) #replace all hyphens with spaces

if (default == TRUE) {
x <- tm::removeWords(x, omissions_dyads23$word)
}
else {
x <- tm::removeWords(x, stop_words_df$word)
}

x <- tm::removeWords(x, omissions_dyads23$word)
x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
x <- gsub("\\b[a]\\b{1}", " ", x)
Expand All @@ -75,7 +68,7 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- textstem::lemmatize_strings(x) #lemmatize
}
}

if(lemmatize==FALSE) {
clean <- function(x) {
x <- tolower(x) #to lower
Expand All @@ -87,36 +80,28 @@ clean_dyads <- function(read_ts_df, lemmatize=TRUE) {
x <- gsub("n't", " not", x) #replace contraction with full word not
x <- textclean::replace_contraction(x) #replace contractions
x <- gsub("-", " ", x) #replace all hyphens with spaces

if (default == TRUE) {
x <- tm::removeWords(x, omissions_dyads23$word)
}
else {
x <- tm::removeWords(x, stop_words_df$word)
}

x <- tm::removeWords(x, omissions_dyads23$word)
x <- gsub("\\d+(st|nd|rd|th)", " ", x) #omits 6th, 23rd, ordinal numbers
x <- gsub("[^a-zA-Z]", " ", x) #omit non-alphabetic characters
x <- gsub("\\b[a]\\b{1}", " ", x)
x <- tm::stripWhitespace(x)
x <- stringr::str_squish(x)
}
}

read_ts_df$RawText <- stringr::str_squish(read_ts_df$RawText) #remove unneeded white space from text

dfclean <- read_ts_df %>%
dplyr::mutate(CleanText = clean(RawText)) %>% #run clean function on text, making a new column
dplyr::group_by(Event_ID) %>% # group by event to compute turn
dplyr::mutate(TurnCount = dplyr::consecutive_id(Participant_ID), .after = Participant_ID) %>% # create a turn count
dplyr::group_by(TurnCount, .add = TRUE) %>% #group and take word count by turn for both raw and clean utterances
dplyr::mutate(NWords_ByPersonTurn_RAW = stringi::stri_count_words(paste(RawText, collapse = " ")),
NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText,
collapse = " "))) %>%
NWords_ByPersonTurn_CLEAN = stringi::stri_count_words(paste(CleanText, collapse = " "))) %>%
dplyr::select(-RawText) %>% # remove the raw text column
dplyr::ungroup()

dfclean_sep <- tidyr::separate_rows(dfclean, CleanText) # create row for each word in clean text

return(dfclean_sep)
}
58 changes: 32 additions & 26 deletions R/read_dyads.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

read_dyads <- function(folder_name = "my_transcripts") {
#defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file.

read_otter_transcript <- function(file_path) {
lines <- readLines(file_path) #read otter ai file
#removes otter ai watermark if it is present
Expand All @@ -22,15 +22,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
speaker <- character()
time <- character()
text <- character()

#process lines of dialogue
current_line <- 1
while (current_line <= num_lines) {

first_line_vec <- strsplit(lines[current_line], " ")
speaker_time <- first_line_vec[[1]]
speaker <- c(speaker, speaker_time[1]) #select speaker
# check if time is included in mm:ss format and if so add it to the time column
# check if time is included in mm:ss format and if so add it to the time column
if (any(grepl(":", speaker_time)) == TRUE) {
timeadd <- grep(":", speaker_time)
}
Expand All @@ -55,15 +55,15 @@ read_dyads <- function(folder_name = "my_transcripts") {
return(transcript_df)
}
#END DEFINE OTTER READ TRANSCRIPT .TXT FILE FUNCTION
#DEFINE READ ME TXT FILE FUNCTION

#DEFINE READ ME TXT FILE FUNCITON
read_dyads_txt <- function(folder_name){
if (any(grepl("*.txt$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_txt <- list.files(path = folder_name, pattern = "*.txt$", full.names = TRUE, recursive = TRUE) #list files with .txt ending
file_names_txt <- list.files(path = folder_name, pattern = ".txt$", full.names = FALSE, recursive = TRUE)
file_names_txt <- gsub('.*/ ?(\\w+)', '\\1', file_names_txt)
file_names_txt <- gsub(".txt$", "", file_names_txt)

txtdata <- lapply(file_list_txt, function(x) {
#runs txt files names through otter reading function
xorf <- read_otter_transcript(x)
Expand All @@ -76,33 +76,30 @@ read_dyads <- function(folder_name = "my_transcripts") {
stop(paste("Unable to read transcript ", as.character(match(x, file_list_txt)), " as an otter transcript. Please refer to the ConversationAlign GitHub page for examples of properly formatted transcripts.", sep = ""), call. = FALSE)
}
})

#adds a doc id column to each transcript based on its name attribute
txtdata <- lapply(file_names_txt, function(x){
txtdata[[match(x, file_names_txt)]] <- cbind(Event_ID = rep(x, nrow(txtdata[[match(x, file_names_txt)]])), txtdata[[match(x, file_names_txt)]])})
return(txtdata)
}}
#END DEFINE READ ME TXT FILE FUNCITON

#DEFINE READ ME CSV FILE FUNCTION
read_dyads_csv <- function(folder_name) {
if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".",
full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_csv <- list.files(path = folder_name, pattern = "*.csv$",
full.names = TRUE, recursive = TRUE)
file_names_csv <- list.files(path = folder_name, pattern = ".csv$",
full.names = FALSE, recursive = TRUE)
if (any(grepl("*.csv$", list.files(path = folder_name, pattern = ".", full.names = TRUE, recursive = TRUE))) == TRUE) {
file_list_csv <- list.files(path = folder_name, pattern = "*.csv$", full.names = TRUE, recursive = TRUE)
file_names_csv <- list.files(path = folder_name, pattern = ".csv$", full.names = FALSE, recursive = TRUE)
file_names_csv<- gsub('.*/ ?(\\w+)', '\\1', file_names_csv)
file_names_csv <- gsub(".csv$", "", file_names_csv)

#creates a list of read in csv data frames
csvdata <- lapply(file_list_csv, function(x){
x_read_csv <- read.csv(x, header = TRUE)
#identify and remove all columns that are entirely NA or empty strings
sum_nas_es <- apply(x_read_csv, 2, function(y){sum(is.na(y) | y == "")})
x_read_csv <- x_read_csv[sum_nas_es < nrow(x_read_csv)]
x_read_csv <- data.frame(x_read_csv)

#check that the column names are correct
if ((any(grepl("^speaker$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^speaker_names_raw$", colnames(x_read_csv), ignore.case = T)) == TRUE |
Expand All @@ -116,7 +113,16 @@ read_dyads <- function(folder_name = "my_transcripts") {
(any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE |
any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) {


#tests for column named Time - if not present it adds a Time column filled with NAs
if (any(grepl("^Time*", colnames(x_read_csv), ignore.case = TRUE)) |
any(grepl("^start*", colnames(x_read_csv), ignore.case = TRUE))) {
colnames(x_read_csv)[which(grepl("^Time*", colnames(x_read_csv), ignore.case = T) |
grepl("^start*", colnames(x_read_csv), ignore.case = T))] <- "Time"
}
else {
x_read_csv$Time <- rep(NA, nrow(x_read_csv)) #if no Time col fill with NA
}
#correct the speaker and text names to our conventions
colnames(x_read_csv)[which(grepl("speaker", colnames(x_read_csv), ignore.case = T) |
grepl("PID", colnames(x_read_csv), ignore.case = T) |
Expand All @@ -126,25 +132,25 @@ read_dyads <- function(folder_name = "my_transcripts") {
grepl("partner", colnames(x_read_csv), ignore.case = T) |
grepl("source", colnames(x_read_csv), ignore.case = T) |
grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID"

colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) |
grepl("utterance", colnames(x_read_csv), ignore.case = T))] <- "RawText"

x_read_csv <- data.frame(x_read_csv)
x_final <- x_read_csv
}
col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText"))

if (col_check != 2) { #if there are less than three columns

col_check <- x_read_csv[, colnames(x_read_csv) %in%
c("Participant_ID", "RawText", "Time")]
if (ncol(col_check) != 3) { #if there are less than three columns
stop(paste("Function is unable to process csv transcript ", #error stating missing column
as.character(match(x, file_list_csv)), #also states the transcript
" correctly. Make sure that each transcript includes a column marking who is producing text in each row.
When you are preparing your language transcripts, please make sure this column is named one of the following: 'interlocutor', 'person', 'partner', 'source', 'speaker', 'participant', 'PID', or 'speaker_names_raw'.
Please make sure your a column containing raw language transcriptions is named 'utterance', 'turn', or 'text'.
If you wish to include a Time column please title it 'Time' or 'start'.",
sep = ""), call. = FALSE)

}
x_final
})
Expand All @@ -158,12 +164,12 @@ read_dyads <- function(folder_name = "my_transcripts") {
txtlist <- read_dyads_txt(folder_name)
csvlist <- read_dyads_csv(folder_name)
all_list <- append(txtlist, csvlist) #append the two lists into one list

#throws an error if no files are found
if (length(all_list) == 0) {
stop("No files found. Please make sure you are providing the local or absolute file path to the desired folder as a character vector. At least one .csv or .txt file must be present.")
}

alldf <- dplyr::bind_rows(all_list) #binds the rows of each list into one data frame
alldf$Event_ID <- as.factor(alldf$Event_ID)
alldf$Participant_ID <- as.factor(alldf$Participant_ID)
Expand Down
Loading

0 comments on commit 6789257

Please sign in to comment.