From 1d672b1070984e289c75932ab55474bfd4a85065 Mon Sep 17 00:00:00 2001 From: Balthasar Date: Mon, 19 Apr 2021 14:37:24 +0200 Subject: [PATCH 1/6] make code more modular for `get_video_details()` by dividing helpers up --- R/get_video_details.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/R/get_video_details.R b/R/get_video_details.R index c9811ca..5f508c0 100644 --- a/R/get_video_details.R +++ b/R/get_video_details.R @@ -7,9 +7,8 @@ conditional_unnest_wider <- function(data_input, var) { } } - -json_to_df <- function(res) { - intermediate <- res %>% +parse_snippet <- function(res){ + res %>% tibble::enframe() %>% tidyr::pivot_wider() %>% tidyr::unnest(cols = c(kind, etag)) %>% @@ -20,8 +19,10 @@ json_to_df <- function(res) { # reflect level of nesting in column name for those that may not be unique dplyr::rename(items_kind = kind, items_etag = etag) %>% tidyr::unnest_wider(snippet) +} - intermediate_2 <- intermediate %>% +parse_video_details <- function(res) { + res %>% # fields that may not be available: # live streaming details conditional_unnest_wider(var = "liveStreamingDetails") %>% @@ -46,9 +47,6 @@ json_to_df <- function(res) { conditional_unnest_wider(var = "thumbnails_medium") %>% conditional_unnest_wider(var = "thumbnails_high") %>% conditional_unnest_wider(var = "thumbnails_maxres") - - - intermediate_2 } #' Get Details of a Video or Videos @@ -124,7 +122,8 @@ get_video_details <- function(video_id = NULL, part = "snippet", as.data.frame = } if (as.data.frame) { - raw_res <- json_to_df(raw_res) + snippet_df <- parse_snippet(raw_res) + raw_res <- parse_video_details(snippet_df) } raw_res From a0cfbb77b84e423ef757c73d06ff55d3d97e9050 Mon Sep 17 00:00:00 2001 From: Balthasar Date: Mon, 19 Apr 2021 20:42:55 +0200 Subject: [PATCH 2/6] get_most_comments --- DESCRIPTION | 4 +- NAMESPACE | 8 ++ R/get_most_comments.R | 179 +++++++++++++++++++++++++++++++++++++++ R/globals.R | 3 + R/tuber.R | 8 +- man/get_most_comments.Rd | 69 +++++++++++++++ 6 files changed, 267 insertions(+), 4 deletions(-) create mode 100644 R/get_most_comments.R create mode 100644 R/globals.R create mode 100644 man/get_most_comments.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 036aaa1..47dcaca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,9 @@ Imports: magrittr, tidyr, tidyselect, - tibble + tibble, + stringr, + rlang VignetteBuilder: knitr Suggests: knitr (>= 1.11), diff --git a/NAMESPACE b/NAMESPACE index ce09053..ef74caf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(get_captions) export(get_channel_stats) export(get_comment_threads) export(get_comments) +export(get_most_comments) export(get_playlist_items) export(get_playlists) export(get_related_videos) @@ -44,7 +45,10 @@ importFrom(dplyr,bind_rows) importFrom(dplyr,filter) importFrom(dplyr,mutate) importFrom(dplyr,pull) +importFrom(dplyr,rename) +importFrom(dplyr,rename_at) importFrom(dplyr,select) +importFrom(dplyr,vars) importFrom(httr,DELETE) importFrom(httr,GET) importFrom(httr,POST) @@ -61,11 +65,15 @@ importFrom(magrittr,"%>%") importFrom(plyr,ldply) importFrom(purrr,map_dbl) importFrom(purrr,map_df) +importFrom(rlang,.data) +importFrom(stringr,str_remove) importFrom(tibble,enframe) importFrom(tidyr,pivot_wider) importFrom(tidyr,unnest) importFrom(tidyr,unnest_longer) +importFrom(tidyr,unnest_wider) importFrom(tidyselect,all_of) importFrom(tidyselect,everything) +importFrom(tidyselect,starts_with) importFrom(utils,browseURL) importFrom(utils,read.table) diff --git a/R/get_most_comments.R b/R/get_most_comments.R new file mode 100644 index 0000000..7972228 --- /dev/null +++ b/R/get_most_comments.R @@ -0,0 +1,179 @@ +# helpers + +parse_comment_thread <- function(res) { + res %>% + # fields that may not be available: + # live streaming details + conditional_unnest_wider(var = "topLevelComment") %>% + conditional_unnest_wider(var = "topLevelComment_snippet") %>% + conditional_unnest_wider(var = "topLevelComment_snippet_authorChannelId") %>% + dplyr::select(-c(id)) %>% + # rename to make compatible with other comments later + dplyr::rename_at( + dplyr::vars(tidyselect::starts_with("topLevelComment_")), + ~stringr::str_remove(.x, "topLevelComment_") + ) %>% + dplyr::mutate(is_reply = FALSE) +} + +parse_replies <- function(comment_thread) { + replies <- comment_thread %>% + dplyr::select(replies, totalReplyCount) %>% + tidyr::unnest_wider(replies) %>% + dplyr::filter(totalReplyCount > 0) + + if (nrow(replies) >= 0) { + replies <- replies %>% + tidyr::unnest(comments) %>% + conditional_unnest_wider("comments") %>% + # rename to make compatible with other comments + dplyr::rename_at( + dplyr::vars(tidyselect::starts_with("comments_")), + ~ stringr::str_remove(.x, "comments_") + ) %>% + conditional_unnest_wider("snippet") %>% + conditional_unnest_wider("snippet_authorChannelId") %>% + dplyr::mutate(is_reply = TRUE) + } +} + +#' Get Most Comments +#' +#' @param filter string; Required. +#' named vector of length 1 +#' potential names of the entry in the vector: +#' \code{video_id}: video ID. +#' \code{channel_id}: channel ID. +#' \code{thread_id}: comma-separated list of comment thread IDs +#' \code{threads_related_to_channel}: channel ID. +#' +#' @param part Comment resource requested. Required. Comma separated list +#' of one or more of the +#' following: \code{id, snippet}. e.g., \code{"id, snippet"}, +#' \code{"id"}, etc. Default: \code{snippet}. +#' @param max_results Maximum number of items that should be returned. +#' Integer. Optional. Default is 100. +#' If the value is greater than 100 then the function fetches all the +#' results. The outcome is a simplified \code{data.frame}. +#' @param page_token Specific page in the result set that should be +#' returned. Optional. +#' @param text_format Data Type: Character. Default is \code{"html"}. +#' Only takes \code{"html"} or \code{"plainText"}. Optional. +#' @param \dots Additional arguments passed to \code{\link{tuber_GET}}. +#' +#' @return +#' Nested named list. The entry \code{items} is a list of comments +#' along with meta information. +#' Within each of the \code{items} is an item \code{snippet} which +#' has an item \code{topLevelComment$snippet$textDisplay} +#' that contains the actual comment. +#' +#' If simplify is \code{TRUE}, a \code{data.frame} with the following columns: +#' \code{authorDisplayName, authorProfileImageUrl, authorChannelUrl, +#' authorChannelId.value, videoId, textDisplay, +#' canRate, viewerRating, likeCount, publishedAt, updatedAt} +#' +#' @export get_most_comments +#' +#' @references \url{https://developers.google.com/youtube/v3/docs/commentThreads/list} +#' +#' @examples +#' \dontrun{ +#' +#' # Set API token via yt_oauth() first +#' +#' get_most_comments(filter = c(video_id = "N708P-A45D0")) +#' get_most_comments(filter = c(video_id = "N708P-A45D0"), max_results = 101) +#' } +get_most_comments <- function(filter = NULL, part = "snippet,replies", + text_format = "html", max_results = 101, page_token = NULL, ...) { + if (max_results < 20) { + stop("max_results only takes a value over 20. + Above 100, it outputs all the results.") + } + + if (text_format != "html" & text_format != "plainText") { + stop("Provide a legitimate value of textFormat.") + } + + if (!(names(filter) %in% + c("video_id", "channel_id", "thread_id", "threads_related_to_channel"))) { + stop("filter can only take one of values: channel_id, video_id, parent_id, + threads_related_to_channel.") + } + + if (length(filter) != 1) stop("filter must be a vector of length 1.") + + orig_filter <- filter + translate_filter <- c( + video_id = "videoId", thread_id = "id", + threads_related_to_channel = "allThreadsRelatedToChannelId", + channel_id = "channelId", page_token = "pageToken" + ) + + yt_filter_name <- as.vector(translate_filter[match( + names(filter), + names(translate_filter) + )]) + names(filter) <- yt_filter_name + + querylist <- list( + part = part, maxResults = + ifelse(max_results > 100, 100, max_results), + textFormat = text_format + ) + querylist <- c(querylist, filter) + + ## get first page of results of a comment thread and + ## initialize objects with content of first page before + ## proceeding to next pages of API response + res <- tuber_GET("commentThreads", querylist, ...) + # parse results + snippet <- parse_snippet(res) + comment_thread <- parse_comment_thread(snippet) + replies <- parse_replies(comment_thread) + # get columns names of columns that will be NA upon binding the two dataframes + na_cols_1 <- setdiff( + colnames(comment_thread), + colnames(replies) + ) + # setdiff( + # colnames(replies), + # colnames(comment_thread) + # ) + + comments <- dplyr::bind_rows( + comment_thread, replies + ) %>% + dplyr::select(-c(replies)) %>% + dplyr::filter(totalReplyCount > 1) %>% + # make columns complete if missing to avoid NAs + tidyr::fill(tidyselect::any_of(na_cols_1), .direction = "down") + + # get all following pages of comment thread + agg_res <- comments + + # # shouldn't this be `unique()`? + next_page_token <- res$nextPageToken + print("erstes Mal") + print(next_page_token) + + while (!is.null(next_page_token)) { + print("zweites Mal") + print(next_page_token) + next_results <- get_most_comments(orig_filter, + part = part, + text_format = text_format, + simplify = FALSE, + max_results = 101, + page_token = next_page_token + ) + agg_res <- rbind(next_results, agg_res) + # get token with link to next result page + next_page_token <- next_results$nextPageToken + print(next_page_token) + } + print("finished") + return(agg_res) +} + diff --git a/R/globals.R b/R/globals.R new file mode 100644 index 0000000..100c9e0 --- /dev/null +++ b/R/globals.R @@ -0,0 +1,3 @@ +# fix warning about no visible bindings due to tidyverse functions +# https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887 +utils::globalVariables(c("totalReplyCount", "id", "comments", "kind", "etag", "items", "snippet")) diff --git a/R/tuber.R b/R/tuber.R index cba0a64..00ebb27 100644 --- a/R/tuber.R +++ b/R/tuber.R @@ -8,10 +8,12 @@ #' @importFrom httr upload_file content oauth_endpoints oauth_app oauth2.0_token #' @importFrom utils read.table #' @importFrom plyr ldply -#' @importFrom dplyr bind_rows select pull filter mutate +#' @importFrom dplyr bind_rows select pull filter mutate vars rename_at rename +#' @importFrom rlang .data +#' @importFrom stringr str_remove #' @importFrom tibble enframe -#' @importFrom tidyselect everything all_of -#' @importFrom tidyr pivot_wider unnest unnest_longer +#' @importFrom tidyselect everything all_of starts_with +#' @importFrom tidyr pivot_wider unnest unnest_longer unnest_wider #' @importFrom purrr map_df map_dbl #' @docType package NULL diff --git a/man/get_most_comments.Rd b/man/get_most_comments.Rd new file mode 100644 index 0000000..48cd150 --- /dev/null +++ b/man/get_most_comments.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_most_comments.R +\name{get_most_comments} +\alias{get_most_comments} +\title{Get Most Comments} +\usage{ +get_most_comments( + filter = NULL, + part = "snippet,replies", + text_format = "html", + max_results = 101, + page_token = NULL, + ... +) +} +\arguments{ +\item{filter}{string; Required. +named vector of length 1 +potential names of the entry in the vector: +\code{video_id}: video ID. +\code{channel_id}: channel ID. +\code{thread_id}: comma-separated list of comment thread IDs +\code{threads_related_to_channel}: channel ID.} + +\item{part}{Comment resource requested. Required. Comma separated list +of one or more of the +following: \code{id, snippet}. e.g., \code{"id, snippet"}, +\code{"id"}, etc. Default: \code{snippet}.} + +\item{text_format}{Data Type: Character. Default is \code{"html"}. +Only takes \code{"html"} or \code{"plainText"}. Optional.} + +\item{max_results}{Maximum number of items that should be returned. + Integer. Optional. Default is 100. +If the value is greater than 100 then the function fetches all the +results. The outcome is a simplified \code{data.frame}.} + +\item{page_token}{Specific page in the result set that should be +returned. Optional.} + +\item{\dots}{Additional arguments passed to \code{\link{tuber_GET}}.} +} +\value{ +Nested named list. The entry \code{items} is a list of comments +along with meta information. +Within each of the \code{items} is an item \code{snippet} which +has an item \code{topLevelComment$snippet$textDisplay} +that contains the actual comment. + +If simplify is \code{TRUE}, a \code{data.frame} with the following columns: +\code{authorDisplayName, authorProfileImageUrl, authorChannelUrl, +authorChannelId.value, videoId, textDisplay, +canRate, viewerRating, likeCount, publishedAt, updatedAt} +} +\description{ +Get Most Comments +} +\examples{ +\dontrun{ + +# Set API token via yt_oauth() first + +get_most_comments(filter = c(video_id = "N708P-A45D0")) +get_most_comments(filter = c(video_id = "N708P-A45D0"), max_results = 101) +} +} +\references{ +\url{https://developers.google.com/youtube/v3/docs/commentThreads/list} +} From b4c81de4d7576f951573568385c036c4f62c5cc5 Mon Sep 17 00:00:00 2001 From: Balthasar Date: Tue, 20 Apr 2021 00:39:59 +0200 Subject: [PATCH 3/6] working prototype do not touch :) --- R/get_most_comments.R | 93 +++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/R/get_most_comments.R b/R/get_most_comments.R index 7972228..295f4e6 100644 --- a/R/get_most_comments.R +++ b/R/get_most_comments.R @@ -85,11 +85,14 @@ parse_replies <- function(comment_thread) { #' get_most_comments(filter = c(video_id = "N708P-A45D0")) #' get_most_comments(filter = c(video_id = "N708P-A45D0"), max_results = 101) #' } -get_most_comments <- function(filter = NULL, part = "snippet,replies", - text_format = "html", max_results = 101, page_token = NULL, ...) { +#' +#' + +get_most_comments <- function(filter = NULL, part = "snippet,replies,id", + text_format = "html", max_results = 100, page_token = NULL, ...) { if (max_results < 20) { stop("max_results only takes a value over 20. - Above 100, it outputs all the results.") + Above 100, it outputs all the results.") } if (text_format != "html" & text_format != "plainText") { @@ -99,7 +102,7 @@ get_most_comments <- function(filter = NULL, part = "snippet,replies", if (!(names(filter) %in% c("video_id", "channel_id", "thread_id", "threads_related_to_channel"))) { stop("filter can only take one of values: channel_id, video_id, parent_id, - threads_related_to_channel.") + threads_related_to_channel.") } if (length(filter) != 1) stop("filter must be a vector of length 1.") @@ -120,14 +123,17 @@ get_most_comments <- function(filter = NULL, part = "snippet,replies", querylist <- list( part = part, maxResults = ifelse(max_results > 100, 100, max_results), - textFormat = text_format + textFormat = text_format, + pageToken = page_token ) + querylist <- c(querylist, filter) + print(querylist) ## get first page of results of a comment thread and ## initialize objects with content of first page before ## proceeding to next pages of API response - res <- tuber_GET("commentThreads", querylist, ...) + res <- tuber:::tuber_GET("commentThreads", querylist, ...) # parse results snippet <- parse_snippet(res) comment_thread <- parse_comment_thread(snippet) @@ -146,34 +152,69 @@ get_most_comments <- function(filter = NULL, part = "snippet,replies", comment_thread, replies ) %>% dplyr::select(-c(replies)) %>% - dplyr::filter(totalReplyCount > 1) %>% + # dplyr::filter(totalReplyCount > 1) %>% # make columns complete if missing to avoid NAs tidyr::fill(tidyselect::any_of(na_cols_1), .direction = "down") # get all following pages of comment thread - agg_res <- comments + # agg_res <- comments - # # shouldn't this be `unique()`? - next_page_token <- res$nextPageToken + # # shouldn't this be `unique()`? + # next_page_token <- unique(res$nextPageToken) print("erstes Mal") print(next_page_token) - while (!is.null(next_page_token)) { - print("zweites Mal") - print(next_page_token) - next_results <- get_most_comments(orig_filter, - part = part, - text_format = text_format, - simplify = FALSE, - max_results = 101, - page_token = next_page_token - ) - agg_res <- rbind(next_results, agg_res) - # get token with link to next result page - next_page_token <- next_results$nextPageToken - print(next_page_token) - } print("finished") - return(agg_res) + + comments +} + +library(magrittr) +tuber::yt_oauth( + app_id = Sys.getenv("YOUTUBE_API_APP_ID"), + app_secret = Sys.getenv("YOUTUBE_API_CLIENT_SECRET") +) + +all_data <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = NULL) +counter_while <- 0 +# next_page_token <- NULL +next_page_token <- unique(all_data$nextPageToken) + +while (counter_while == 0 | !is.null(next_page_token)) { + next_data <- get_most_comments( + filter = c(video_id = "Hop_MfkXl7c"), + page_token = next_page_token + ) + next_page_token <- unique(next_data$nextPageToken) + print(next_data) + counter_while <- counter_while + 1 + message(paste(counter_while, ":", "counter_while")) + message(paste(next_page_token, ":", "next_page_token")) + all_data <- dplyr::bind_rows(next_data, all_data) +} +return(all_data) + +retrieve_data_from_paginated_api <- function(video_id){ + } +# example <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = NULL) +# example %>% +# tidyr::unnest(nextPageToken) +# +# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = "QURTSl9pMjVDT2V3WGF5Nm5ha3ZYam1HbWVPMFVybTJuWk96R2UyOTZwNnVwSXFJdTJfUFVpSVI3VUxqbU1TSGpKWVpCcFpITEl4cm83dw==") +# tuber::get_comment_threads(filter = c(video_id = "Hop_MfkXl7c")) %>% +# tibble::as_tibble() +# +# res_2 <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c")) +# +# res_2$nextPageToken +# +# t1 <- "QURTSl9pMnM3TFpxY2FQUTVDNGVfTTBDRUF0Nm52R0RXNGRuM1R3a21fMDZhemR0aUtDeHRTWnV3UXpmREs0cnI0TmYzOXh2VTlzRXdOYw==" +# t2 <- "QURTSl9pMHdOWXBXWlRBakdlQnk1VXBUNEljSXM0QTU0WkNuOEp2VFBiX0RMQlhMeDVGdEo1UTlpWm5BaEFTRGZZZWU4UmV0LVFMTmFydw==" +# t3 <- "QURTSl9pMlY5M1Q1VkxsZW9RbXR0VG1acXkzRjRMaDYxcEc4c05UenVUc0VIR0tXbmZmVHU2V3RVdC1WUmlHaW5wa19WUlJfS19vSGEzNA==" +# +# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = t1) +# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = t2) +# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = t3) +# From 31a1f417f9e1f7e82a8fc55ce59999fc193ba5da Mon Sep 17 00:00:00 2001 From: Balthasar Date: Tue, 20 Apr 2021 01:27:03 +0200 Subject: [PATCH 4/6] function from prototype --- R/get_most_comments.R | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/R/get_most_comments.R b/R/get_most_comments.R index 295f4e6..23b52d2 100644 --- a/R/get_most_comments.R +++ b/R/get_most_comments.R @@ -194,10 +194,39 @@ while (counter_while == 0 | !is.null(next_page_token)) { } return(all_data) -retrieve_data_from_paginated_api <- function(video_id){ - +retrieve_data_from_paginated_api <- function(video_id_input) { + + # initialize objects for loop + all_data <- get_most_comments(filter = c(video_id = video_id_input), page_token = NULL) + counter_while <- 0 + next_page_token <- unique(all_data$nextPageToken) + + # loop over results until last nextPageToken + while (counter_while == 0 | !is.null(next_page_token)) { + next_data <- get_most_comments( + filter = c(video_id = video_id_input), + page_token = next_page_token + ) + + # overwrite `next_page_token` that was initialized outside loop + # with new content that was just retrieved in the data + next_page_token <- unique(next_data$nextPageToken) + counter_while <- counter_while + 1 + + # overwrite `all_data` that was initialized outside loop + # using `all_data` from outside of loop in first iteration + # and then using itself from previous iteration plus + # new `next_data`. + all_data <- dplyr::bind_rows(next_data, all_data) + } + return(all_data) } +# new_example_with_function <- retrieve_data_from_paginated_api(video_id_input = "Hop_MfkXl7c") +# new_example_with_function %>% +# dplyr::filter(totalReplyCount > 1 & is_reply) +# length(unique(new_example_with_function$snippet_textOriginal)) + # example <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = NULL) # example %>% # tidyr::unnest(nextPageToken) From 1d820fd3bab5d321fcb2154af5edc8d9dcee5369 Mon Sep 17 00:00:00 2001 From: Balthasar Date: Tue, 20 Apr 2021 15:10:50 +0200 Subject: [PATCH 5/6] forgot to include pageInfo --- R/get_most_comments.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/get_most_comments.R b/R/get_most_comments.R index 23b52d2..fbd260e 100644 --- a/R/get_most_comments.R +++ b/R/get_most_comments.R @@ -7,6 +7,7 @@ parse_comment_thread <- function(res) { conditional_unnest_wider(var = "topLevelComment") %>% conditional_unnest_wider(var = "topLevelComment_snippet") %>% conditional_unnest_wider(var = "topLevelComment_snippet_authorChannelId") %>% + conditional_unnest_wider(var = "pageInfo") %>% dplyr::select(-c(id)) %>% # rename to make compatible with other comments later dplyr::rename_at( From 8acb717d39b39cbb7fd09482cc4e27f34a3b50f2 Mon Sep 17 00:00:00 2001 From: Balthasar Date: Tue, 20 Apr 2021 15:11:49 +0200 Subject: [PATCH 6/6] divide up into helpers and main functions, documentation, etc. --- R/get_most_comments.R | 171 +++++++++++++-------------------------- R/globals.R | 2 +- man/get_most_comments.Rd | 58 +++---------- 3 files changed, 65 insertions(+), 166 deletions(-) diff --git a/R/get_most_comments.R b/R/get_most_comments.R index fbd260e..c308882 100644 --- a/R/get_most_comments.R +++ b/R/get_most_comments.R @@ -38,59 +38,8 @@ parse_replies <- function(comment_thread) { } } -#' Get Most Comments -#' -#' @param filter string; Required. -#' named vector of length 1 -#' potential names of the entry in the vector: -#' \code{video_id}: video ID. -#' \code{channel_id}: channel ID. -#' \code{thread_id}: comma-separated list of comment thread IDs -#' \code{threads_related_to_channel}: channel ID. -#' -#' @param part Comment resource requested. Required. Comma separated list -#' of one or more of the -#' following: \code{id, snippet}. e.g., \code{"id, snippet"}, -#' \code{"id"}, etc. Default: \code{snippet}. -#' @param max_results Maximum number of items that should be returned. -#' Integer. Optional. Default is 100. -#' If the value is greater than 100 then the function fetches all the -#' results. The outcome is a simplified \code{data.frame}. -#' @param page_token Specific page in the result set that should be -#' returned. Optional. -#' @param text_format Data Type: Character. Default is \code{"html"}. -#' Only takes \code{"html"} or \code{"plainText"}. Optional. -#' @param \dots Additional arguments passed to \code{\link{tuber_GET}}. -#' -#' @return -#' Nested named list. The entry \code{items} is a list of comments -#' along with meta information. -#' Within each of the \code{items} is an item \code{snippet} which -#' has an item \code{topLevelComment$snippet$textDisplay} -#' that contains the actual comment. -#' -#' If simplify is \code{TRUE}, a \code{data.frame} with the following columns: -#' \code{authorDisplayName, authorProfileImageUrl, authorChannelUrl, -#' authorChannelId.value, videoId, textDisplay, -#' canRate, viewerRating, likeCount, publishedAt, updatedAt} -#' -#' @export get_most_comments -#' -#' @references \url{https://developers.google.com/youtube/v3/docs/commentThreads/list} -#' -#' @examples -#' \dontrun{ -#' -#' # Set API token via yt_oauth() first -#' -#' get_most_comments(filter = c(video_id = "N708P-A45D0")) -#' get_most_comments(filter = c(video_id = "N708P-A45D0"), max_results = 101) -#' } -#' -#' - -get_most_comments <- function(filter = NULL, part = "snippet,replies,id", - text_format = "html", max_results = 100, page_token = NULL, ...) { +get_parse_bind_comments <- function(filter = NULL, page_token = NULL, + part = part, text_format = text_format, max_results = 100) { if (max_results < 20) { stop("max_results only takes a value over 20. Above 100, it outputs all the results.") @@ -101,7 +50,7 @@ get_most_comments <- function(filter = NULL, part = "snippet,replies,id", } if (!(names(filter) %in% - c("video_id", "channel_id", "thread_id", "threads_related_to_channel"))) { + c("video_id", "channel_id", "thread_id", "threads_related_to_channel"))) { stop("filter can only take one of values: channel_id, video_id, parent_id, threads_related_to_channel.") } @@ -129,12 +78,12 @@ get_most_comments <- function(filter = NULL, part = "snippet,replies,id", ) querylist <- c(querylist, filter) - print(querylist) + # print(querylist) ## get first page of results of a comment thread and ## initialize objects with content of first page before ## proceeding to next pages of API response - res <- tuber:::tuber_GET("commentThreads", querylist, ...) + res <- tuber_GET("commentThreads", querylist) # parse results snippet <- parse_snippet(res) comment_thread <- parse_comment_thread(snippet) @@ -162,57 +111,67 @@ get_most_comments <- function(filter = NULL, part = "snippet,replies,id", # # shouldn't this be `unique()`? # next_page_token <- unique(res$nextPageToken) - print("erstes Mal") - print(next_page_token) + # print("erstes Mal") + # print(next_page_token) - print("finished") + # print("finished") comments } -library(magrittr) -tuber::yt_oauth( - app_id = Sys.getenv("YOUTUBE_API_APP_ID"), - app_secret = Sys.getenv("YOUTUBE_API_CLIENT_SECRET") -) - -all_data <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = NULL) -counter_while <- 0 -# next_page_token <- NULL -next_page_token <- unique(all_data$nextPageToken) - -while (counter_while == 0 | !is.null(next_page_token)) { - next_data <- get_most_comments( - filter = c(video_id = "Hop_MfkXl7c"), - page_token = next_page_token - ) - next_page_token <- unique(next_data$nextPageToken) - print(next_data) - counter_while <- counter_while + 1 - message(paste(counter_while, ":", "counter_while")) - message(paste(next_page_token, ":", "next_page_token")) - all_data <- dplyr::bind_rows(next_data, all_data) -} -return(all_data) -retrieve_data_from_paginated_api <- function(video_id_input) { +#' Get Most Comments +#' +#' Retrieves all top level comments and replies to them. +#' Replies to replies are not included. +#' @param video_id ID of video, required. +#' @return +#' Data frame with all comments and replies. +#' +#' @export get_most_comments +#' +#' @examples +#' \dontrun{ +#' +#' # Set API token via yt_oauth() first +#' +#' get_most_comments(video_id = "Hop_MfkXl7c") +#' } +#' +#' +#' +get_most_comments <- function(video_id) { + video_id_arg <- video_id + part_arg <- "snippet,replies,id" + filter_arg <- c(video_id = video_id_arg) + text_format_arg <- "html" # initialize objects for loop - all_data <- get_most_comments(filter = c(video_id = video_id_input), page_token = NULL) + all_data <- get_parse_bind_comments( + filter = filter_arg, + page_token = NULL, + part = part_arg, + text_format = text_format_arg, + max_results = 100 + ) counter_while <- 0 - next_page_token <- unique(all_data$nextPageToken) + suppressWarnings(next_page_token <- unique(all_data$nextPageToken)) # loop over results until last nextPageToken while (counter_while == 0 | !is.null(next_page_token)) { - next_data <- get_most_comments( - filter = c(video_id = video_id_input), - page_token = next_page_token + next_data <- get_parse_bind_comments( + filter = filter_arg, + page_token = next_page_token, + part = part_arg, + text_format = text_format_arg, + max_results = 100 ) + counter_while <- counter_while + 1 + # cli::cli_alert_success("Page {counter_while} packages.") # overwrite `next_page_token` that was initialized outside loop # with new content that was just retrieved in the data - next_page_token <- unique(next_data$nextPageToken) - counter_while <- counter_while + 1 + suppressWarnings(next_page_token <- unique(all_data$nextPageToken)) # overwrite `all_data` that was initialized outside loop # using `all_data` from outside of loop in first iteration @@ -223,28 +182,8 @@ retrieve_data_from_paginated_api <- function(video_id_input) { return(all_data) } -# new_example_with_function <- retrieve_data_from_paginated_api(video_id_input = "Hop_MfkXl7c") -# new_example_with_function %>% -# dplyr::filter(totalReplyCount > 1 & is_reply) -# length(unique(new_example_with_function$snippet_textOriginal)) - -# example <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = NULL) -# example %>% -# tidyr::unnest(nextPageToken) -# -# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = "QURTSl9pMjVDT2V3WGF5Nm5ha3ZYam1HbWVPMFVybTJuWk96R2UyOTZwNnVwSXFJdTJfUFVpSVI3VUxqbU1TSGpKWVpCcFpITEl4cm83dw==") -# tuber::get_comment_threads(filter = c(video_id = "Hop_MfkXl7c")) %>% -# tibble::as_tibble() -# -# res_2 <- get_most_comments(filter = c(video_id = "Hop_MfkXl7c")) -# -# res_2$nextPageToken -# -# t1 <- "QURTSl9pMnM3TFpxY2FQUTVDNGVfTTBDRUF0Nm52R0RXNGRuM1R3a21fMDZhemR0aUtDeHRTWnV3UXpmREs0cnI0TmYzOXh2VTlzRXdOYw==" -# t2 <- "QURTSl9pMHdOWXBXWlRBakdlQnk1VXBUNEljSXM0QTU0WkNuOEp2VFBiX0RMQlhMeDVGdEo1UTlpWm5BaEFTRGZZZWU4UmV0LVFMTmFydw==" -# t3 <- "QURTSl9pMlY5M1Q1VkxsZW9RbXR0VG1acXkzRjRMaDYxcEc4c05UenVUc0VIR0tXbmZmVHU2V3RVdC1WUmlHaW5wa19WUlJfS19vSGEzNA==" -# -# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = t1) -# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = t2) -# get_most_comments(filter = c(video_id = "Hop_MfkXl7c"), page_token = t3) -# +# tuber::yt_oauth( +# app_id = Sys.getenv("YOUTUBE_API_APP_ID"), +# app_secret = Sys.getenv("YOUTUBE_API_CLIENT_SECRET") +# ) +# get_most_comments(video_id = "Hop_MfkXl7c") diff --git a/R/globals.R b/R/globals.R index 100c9e0..1af4aad 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,3 +1,3 @@ # fix warning about no visible bindings due to tidyverse functions # https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887 -utils::globalVariables(c("totalReplyCount", "id", "comments", "kind", "etag", "items", "snippet")) +utils::globalVariables(c("totalReplyCount", "id", "comments", "kind", "etag", "items", "snippet", "video_id_arg")) diff --git a/man/get_most_comments.Rd b/man/get_most_comments.Rd index 48cd150..10e964c 100644 --- a/man/get_most_comments.Rd +++ b/man/get_most_comments.Rd @@ -4,66 +4,26 @@ \alias{get_most_comments} \title{Get Most Comments} \usage{ -get_most_comments( - filter = NULL, - part = "snippet,replies", - text_format = "html", - max_results = 101, - page_token = NULL, - ... -) +get_most_comments(video_id) } \arguments{ -\item{filter}{string; Required. -named vector of length 1 -potential names of the entry in the vector: -\code{video_id}: video ID. -\code{channel_id}: channel ID. -\code{thread_id}: comma-separated list of comment thread IDs -\code{threads_related_to_channel}: channel ID.} - -\item{part}{Comment resource requested. Required. Comma separated list -of one or more of the -following: \code{id, snippet}. e.g., \code{"id, snippet"}, -\code{"id"}, etc. Default: \code{snippet}.} - -\item{text_format}{Data Type: Character. Default is \code{"html"}. -Only takes \code{"html"} or \code{"plainText"}. Optional.} - -\item{max_results}{Maximum number of items that should be returned. - Integer. Optional. Default is 100. -If the value is greater than 100 then the function fetches all the -results. The outcome is a simplified \code{data.frame}.} - -\item{page_token}{Specific page in the result set that should be -returned. Optional.} - -\item{\dots}{Additional arguments passed to \code{\link{tuber_GET}}.} +\item{video_id}{ID of video, required.} } \value{ -Nested named list. The entry \code{items} is a list of comments -along with meta information. -Within each of the \code{items} is an item \code{snippet} which -has an item \code{topLevelComment$snippet$textDisplay} -that contains the actual comment. - -If simplify is \code{TRUE}, a \code{data.frame} with the following columns: -\code{authorDisplayName, authorProfileImageUrl, authorChannelUrl, -authorChannelId.value, videoId, textDisplay, -canRate, viewerRating, likeCount, publishedAt, updatedAt} +Data frame with all comments and replies. } \description{ -Get Most Comments +Retrieves all top level comments and replies to them. +Replies to replies are not included. } \examples{ \dontrun{ # Set API token via yt_oauth() first -get_most_comments(filter = c(video_id = "N708P-A45D0")) -get_most_comments(filter = c(video_id = "N708P-A45D0"), max_results = 101) -} +get_most_comments(video_id = "Hop_MfkXl7c") } -\references{ -\url{https://developers.google.com/youtube/v3/docs/commentThreads/list} + + + }