Fix. Refactor docx_comments to (properly) account for comments spanni…

…ng multiple paragraphs, multiple comments in the same paragraph and replies. Closes #541. Update docs to include a detailed description of the columns in the returned dataframe.
davidgohel · Jan 4, 2024 · 9a44f52 · 9a44f52
1 parent 611328d
commit 9a44f52
Show file tree

Hide file tree

Showing 5 changed files with 335 additions and 38 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,17 @@
+# officer 0.6.3.9000
+
+## Changes
+
+- the dataframe returned by `docx_comments()` gains a list column `para_id` containing 
+the id(s) of the parent paragraph(s). A possible breaking change is that the `commented_text` 
+is now returned as a list column to account for comments spanning multiple runs.
+The docs gain a description of the columns of the returned dataframe.
+
+## Issues
+
+- Fix. Refactor `docx_comments()` to (properly) account for comments spanning 
+multiple paragraphs, multiple comments in the same paragraph and replies. Closes #541.
+
 # officer 0.6.3
 
 ## Features

diff --git a/R/docx_comments.R b/R/docx_comments.R
@@ -1,6 +1,22 @@
 #' @title Get comments in a Word document as a data.frame
 #' @description return a data.frame representing the comments in a Word document.
 #' @param x an rdocx object
+#' @details
+#' Each row of the returned data frame contains data for one comment. The
+#'    columns contain the following information:
+#' * "comment_id" - unique comment id
+#' * "author" - name of the comment author
+#' * "initials" - initials of the comment author
+#' * "date" - timestamp of the comment
+#' * "text" - a list column of characters containing the comment text. Elements can
+#'    be vectors of length > 1 if a comment contains multiple paragraphs,
+#'    blocks or runs or of length 0 if the comment is empty.
+#' * "para_id" - a list column of characters containing the parent paragraph IDs.
+#'    Elememts can be vectors of length > 1 if a comment spans multiple paragraphs
+#'    or of length 0 if the comment has no parent paragraph.
+#' * "commented_text" - a list column of characters containing the
+#'    commented text. Elememts can be vectors of length > 1 if a comment
+#'    spans multiple paragraphs or runs or of length 0 if the commented text is empty.
 #' @examples
 #' bl <- block_list(
 #'   fpar("Comment multiple words."),
@@ -10,7 +26,7 @@
 #' a_par <- fpar(
 #'   "This paragraph contains",
 #'   run_comment(
-#'   cmt = bl,
+#'     cmt = bl,
 #'     run = ftext("a comment."),
 #'     author = "Author Me",
 #'     date = "2023-06-01"
@@ -27,19 +43,33 @@
 docx_comments <- function(x) {
   stopifnot(inherits(x, "rdocx"))
 
-  comment_nodes <- xml_find_all(
-    x$doc_obj$get(), "//*[self::w:p/w:commentRangeStart]"
+  comment_ids <- xml_attr(
+    xml_find_all(
+      x$doc_obj$get(), "/w:document/w:body//*[self::w:commentRangeStart]"
+    ), "id"
   )
 
-  if (length(comment_nodes) > 0) {
-    data <- lapply(comment_nodes, comment_as_tibble)
-    data <- rbind_match_columns(data)
-  } else {
-    data <- data.frame(
-      comment_id = integer(0),
-      commented_text = character(0)
+  comment_text_runs <- lapply(comment_ids, \(id) {
+    xml_find_all(
+      x$doc_obj$get(),
+      paste0(
+        "/w:document/w:body//*[self::w:r[w:t and",
+        "preceding::w:commentRangeStart[@w:id=\'", id, "\']",
+        " and ",
+        "following::w:commentRangeEnd[@w:id=\'", id, "\']]]"
+      )
     )
-  }
+  })
+
+  data <- data.frame(
+    comment_id = comment_ids
+  )
+  # Add parent paragraph id
+  data$para_id <- lapply(
+    comment_text_runs,
+    function(x) xml_attr(xml_parent(x), "paraId")
+  )
+  data$commented_text <- lapply(comment_text_runs, xml_text)
 
   comments <- xml_find_all(x$comments$get(), "//w:comments/w:comment")
 
@@ -57,30 +87,5 @@ docx_comments <- function(x) {
   )
 
   data <- merge(out, data, by = "comment_id", all.x = TRUE)
-
-  data
-}
-
-comment_as_tibble <- function(node) {
-  node_name <- xml_name(node)
-  name_children <- xml_name(xml_children(node))
-
-  comment_range <- grep("commentRange", name_children)
-
-  comment_data <- data.frame(
-    comment_id = xml_attr(xml_child(node, comment_range[[1]]), "id"),
-    stringsAsFactors = FALSE
-  )
-  comment_range <- seq(comment_range[[1]] + 1, comment_range[[2]] - 1)
-  comment_data$commented_text <-
-    paste0(
-      vapply(
-        comment_range,
-        function(x) xml_text(xml_child(node, x)),
-        character(1)
-      ),
-      collapse = ""
-    )
-
-  comment_data
+  data[order(as.integer(data$comment_id)), ]
 }
diff --git a/man/docx_comments.Rd b/man/docx_comments.Rd
diff --git a/tests/testthat/docs_dir/test-docx_comments.docx b/tests/testthat/docs_dir/test-docx_comments.docx