Merge branch 'get-most-comments' into master, add new function as intermediate fix for `get_all_comments()` that doesn't seem to be working properly (see gojiplus#52)
balthasars · Apr 20, 2021 · 77092a8 · 77092a8
2 parents 20f9eb5 + 8acb717
commit 77092a8
Show file tree

Hide file tree

Showing 7 changed files with 244 additions and 12 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,7 +26,9 @@ Imports:
     magrittr,
     tidyr,
     tidyselect,
-    tibble
+    tibble,
+    stringr,
+    rlang
 VignetteBuilder: knitr
 Suggests:
     knitr (>= 1.11),

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,7 @@ export(get_captions)
 export(get_channel_stats)
 export(get_comment_threads)
 export(get_comments)
+export(get_most_comments)
 export(get_playlist_items)
 export(get_playlists)
 export(get_related_videos)
@@ -44,7 +45,10 @@ importFrom(dplyr,bind_rows)
 importFrom(dplyr,filter)
 importFrom(dplyr,mutate)
 importFrom(dplyr,pull)
+importFrom(dplyr,rename)
+importFrom(dplyr,rename_at)
 importFrom(dplyr,select)
+importFrom(dplyr,vars)
 importFrom(httr,DELETE)
 importFrom(httr,GET)
 importFrom(httr,POST)
@@ -61,11 +65,15 @@ importFrom(magrittr,"%>%")
 importFrom(plyr,ldply)
 importFrom(purrr,map_dbl)
 importFrom(purrr,map_df)
+importFrom(rlang,.data)
+importFrom(stringr,str_remove)
 importFrom(tibble,enframe)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,unnest)
 importFrom(tidyr,unnest_longer)
+importFrom(tidyr,unnest_wider)
 importFrom(tidyselect,all_of)
 importFrom(tidyselect,everything)
+importFrom(tidyselect,starts_with)
 importFrom(utils,browseURL)
 importFrom(utils,read.table)
diff --git a/R/get_most_comments.R b/R/get_most_comments.R
@@ -0,0 +1,189 @@
+# helpers
+
+# Flatten the top-level comment of every comment thread into plain columns.
+#
+# `res` is a data frame with one row per comment thread. The nested columns
+# `topLevelComment`, its `snippet`, the snippet's `authorChannelId` and
+# `pageInfo` are widened one after the other via `conditional_unnest_wider()`.
+# The `id` column is dropped, the `topLevelComment_` prefix is stripped from
+# the column names so they line up with the reply columns, and every row is
+# marked with `is_reply = FALSE`.
+parse_comment_thread <- function(res) {
+  prefix <- "topLevelComment_"
+
+  # widen from the outermost list-column inwards; each step creates the
+  # column that the next step widens
+  widened <- conditional_unnest_wider(res, var = "topLevelComment")
+  widened <- conditional_unnest_wider(widened, var = "topLevelComment_snippet")
+  widened <- conditional_unnest_wider(
+    widened,
+    var = "topLevelComment_snippet_authorChannelId"
+  )
+  widened <- conditional_unnest_wider(widened, var = "pageInfo")
+
+  without_id <- dplyr::select(widened, -c(id))
+
+  # strip the prefix so the columns are compatible with the reply columns
+  renamed <- dplyr::rename_at(
+    without_id,
+    dplyr::vars(tidyselect::starts_with("topLevelComment_")),
+    function(col_name) stringr::str_remove(col_name, prefix)
+  )
+
+  dplyr::mutate(renamed, is_reply = FALSE)
+}
+
+# Expand the replies attached to each comment thread into one row per reply.
+#
+# `comment_thread` is the data frame built by `parse_comment_thread()`. It
+# must contain `totalReplyCount`; the list-column `replies` is expanded when
+# present.
+#
+# Returns a data frame with one row per reply, the `comments_` prefix stripped
+# from the column names and `is_reply = TRUE`. When there is nothing to
+# expand, a zero-row slice of `comment_thread` is returned, so the result can
+# always be row-bound onto `comment_thread` without adding rows or columns.
+parse_replies <- function(comment_thread) {
+  # zero-row result sharing the column types of `comment_thread`
+  no_replies <- comment_thread[0, , drop = FALSE]
+
+  # NOTE(review): presumably the `replies` column is absent when no thread on
+  # the page has replies -- confirm against the commentThreads response.
+  if (!("replies" %in% colnames(comment_thread))) {
+    return(no_replies)
+  }
+
+  replies <- comment_thread %>%
+    dplyr::select(replies, totalReplyCount) %>%
+    tidyr::unnest_wider(replies) %>%
+    dplyr::filter(totalReplyCount > 0)
+
+  # stop before unnesting: with no rows left there may be no `comments`
+  # column to unnest
+  if (nrow(replies) == 0) {
+    return(no_replies)
+  }
+
+  replies %>%
+    tidyr::unnest(cols = c(comments)) %>%
+    conditional_unnest_wider("comments") %>%
+    # rename to make compatible with the top-level comment columns
+    dplyr::rename_at(
+      dplyr::vars(tidyselect::starts_with("comments_")),
+      ~ stringr::str_remove(.x, "comments_")
+    ) %>%
+    conditional_unnest_wider("snippet") %>%
+    conditional_unnest_wider("snippet_authorChannelId") %>%
+    dplyr::mutate(is_reply = TRUE)
+}
+
+# Request one page of comment threads and return it as a flat data frame.
+#
+# filter       Named vector of length 1. The name must be one of `video_id`,
+#              `channel_id`, `thread_id` or `threads_related_to_channel`; it
+#              is translated to the matching API query parameter.
+# page_token   Sent as `pageToken`; `NULL` requests the first page.
+# part         Sent as `part`.
+# text_format  Sent as `textFormat`; either "html" or "plainText".
+# max_results  Sent as `maxResults`, capped at 100. Values below 20 are
+#              rejected.
+#
+# Returns the top-level comments of the page followed by their replies, with
+# the `replies` list-column removed.
+get_parse_bind_comments <- function(filter = NULL, page_token = NULL,
+                                    part = "snippet,replies,id",
+                                    text_format = "html",
+                                    max_results = 100) {
+  if (max_results < 20) {
+    stop("max_results only takes a value over 20.
+            Above 100, it outputs all the results.")
+  }
+
+  if (length(text_format) != 1 ||
+      !(text_format %in% c("html", "plainText"))) {
+    stop("Provide a legitimate value of textFormat.")
+  }
+
+  # check the length first so that the name check below always sees exactly
+  # one name and `if` gets a scalar condition
+  if (length(filter) != 1) stop("filter must be a vector of length 1.")
+
+  allowed_filters <- c(
+    "video_id", "channel_id", "thread_id", "threads_related_to_channel"
+  )
+  if (is.null(names(filter)) || !(names(filter) %in% allowed_filters)) {
+    stop("filter can only take one of values: channel_id, video_id, thread_id,
+        threads_related_to_channel.")
+  }
+
+  # translate the user-facing filter name into the API parameter name
+  translate_filter <- c(
+    video_id = "videoId", thread_id = "id",
+    threads_related_to_channel = "allThreadsRelatedToChannelId",
+    channel_id = "channelId"
+  )
+  names(filter) <- unname(translate_filter[names(filter)])
+
+  querylist <- list(
+    part = part,
+    maxResults = min(max_results, 100),
+    textFormat = text_format,
+    pageToken = page_token
+  )
+  querylist <- c(querylist, filter)
+
+  res <- tuber_GET("commentThreads", querylist)
+
+  # parse the response into top-level comments and their replies
+  snippet <- parse_snippet(res)
+  comment_thread <- parse_comment_thread(snippet)
+  replies <- parse_replies(comment_thread)
+
+  # columns that exist only for top-level comments and would therefore be NA
+  # in the reply rows after binding
+  na_cols_1 <- setdiff(
+    colnames(comment_thread),
+    colnames(replies)
+  )
+
+  # NOTE(review): the downward fill also replaces genuine NAs in top-level
+  # rows with the value of the row above -- confirm this is intended.
+  dplyr::bind_rows(comment_thread, replies) %>%
+    # select by name so the local data frame `replies` can never be picked up
+    dplyr::select(-tidyselect::any_of("replies")) %>%
+    tidyr::fill(tidyselect::any_of(na_cols_1), .direction = "down")
+}
+
+
+#' Get Most Comments
+#'
+#' Retrieves all top level comments of a video and the replies to them.
+#' Replies to replies are not included.
+#'
+#' Comment threads are requested in pages of up to 100. Pages are fetched for
+#' as long as the most recently fetched page carries a \code{nextPageToken}.
+#'
+#' @param video_id ID of video, required.
+#' @return
+#' Data frame with all comments and replies. Rows of later pages are placed
+#' before rows of earlier pages.
+#'
+#' @export get_most_comments
+#'
+#' @examples
+#' \dontrun{
+#'
+#' # Set API token via yt_oauth() first
+#'
+#' get_most_comments(video_id = "Hop_MfkXl7c")
+#' }
+#'
+#'
+#'
+get_most_comments <- function(video_id) {
+  filter_arg <- c(video_id = video_id)
+
+  # fetch a single page; `token = NULL` requests the first page
+  fetch_page <- function(token) {
+    get_parse_bind_comments(
+      filter = filter_arg,
+      page_token = token,
+      part = "snippet,replies,id",
+      text_format = "html",
+      max_results = 100
+    )
+  }
+
+  all_data <- fetch_page(NULL)
+  # `$` warns when the column does not exist, which is how the last page
+  # shows up here; the warning is expected and therefore silenced
+  suppressWarnings(next_page_token <- unique(all_data$nextPageToken))
+
+  # keep requesting pages until a page comes back without a next page token
+  while (!is.null(next_page_token)) {
+    next_data <- fetch_page(next_page_token)
+
+    # the token must come from the page that was just fetched; reading it
+    # from the accumulated data would request the same page again
+    suppressWarnings(next_page_token <- unique(next_data$nextPageToken))
+
+    all_data <- dplyr::bind_rows(next_data, all_data)
+  }
+
+  all_data
+}
+
+# tuber::yt_oauth(
+#   app_id = Sys.getenv("YOUTUBE_API_APP_ID"),
+#   app_secret = Sys.getenv("YOUTUBE_API_CLIENT_SECRET")
+# )
+# get_most_comments(video_id = "Hop_MfkXl7c")
diff --git a/R/get_video_details.R b/R/get_video_details.R
@@ -7,9 +7,8 @@ conditional_unnest_wider <- function(data_input, var) {
   }
 }
 
-
-json_to_df <- function(res) {
-  intermediate <- res %>%
+parse_snippet <- function(res){
+  res %>%
     tibble::enframe() %>%
     tidyr::pivot_wider() %>%
     tidyr::unnest(cols = c(kind, etag)) %>%
@@ -20,8 +19,10 @@ json_to_df <- function(res) {
     # reflect level of nesting in column name for those that may not be unique
     dplyr::rename(items_kind = kind, items_etag = etag) %>%
     tidyr::unnest_wider(snippet)
+}
 
-  intermediate_2 <- intermediate %>%
+parse_video_details <- function(res) {
+  res %>%
     # fields that may not be available:
     # live streaming details
     conditional_unnest_wider(var = "liveStreamingDetails") %>%
@@ -46,9 +47,6 @@ json_to_df <- function(res) {
     conditional_unnest_wider(var = "thumbnails_medium") %>%
     conditional_unnest_wider(var = "thumbnails_high") %>%
     conditional_unnest_wider(var = "thumbnails_maxres")
-
-
-  intermediate_2
 }
 
 #' Get Details of a Video or Videos
@@ -124,7 +122,8 @@ get_video_details <- function(video_id = NULL, part = "snippet", as.data.frame =
   }
 
   if (as.data.frame) {
-    raw_res <- json_to_df(raw_res)
+    snippet_df <- parse_snippet(raw_res)
+    raw_res <- parse_video_details(snippet_df)
   }
 
   raw_res

diff --git a/R/globals.R b/R/globals.R
@@ -0,0 +1,3 @@
+# Register the column names that are referenced unquoted inside tidyverse
+# calls, so they are not reported as having no visible binding.
+# https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887
+utils::globalVariables(
+  c(
+    "totalReplyCount", "id", "comments", "kind", "etag", "items", "snippet",
+    "video_id_arg"
+  )
+)
diff --git a/R/tuber.R b/R/tuber.R
@@ -8,10 +8,12 @@
 #' @importFrom httr upload_file content oauth_endpoints oauth_app oauth2.0_token
 #' @importFrom utils read.table
 #' @importFrom plyr ldply
-#' @importFrom dplyr bind_rows select pull filter mutate
+#' @importFrom dplyr bind_rows select pull filter mutate vars rename_at rename
+#' @importFrom rlang .data
+#' @importFrom stringr str_remove
 #' @importFrom tibble enframe
-#' @importFrom tidyselect everything all_of
-#' @importFrom tidyr pivot_wider unnest unnest_longer
+#' @importFrom tidyselect everything all_of starts_with
+#' @importFrom tidyr pivot_wider unnest unnest_longer unnest_wider
 #' @importFrom purrr map_df map_dbl
 #' @docType package
 NULL

diff --git a/man/get_most_comments.Rd b/man/get_most_comments.Rd