From 5540799a23b5e08582fd09f6933f3fe25acf882e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sat, 11 Apr 2026 17:10:51 +0100 Subject: [PATCH 1/9] Allow dictionaries from a wider range of types for indices --- r/src/array_to_vector.cpp | 23 ++++++++++++++++++++++- r/tests/testthat/test-Table.R | 16 ++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 432b49503e1a..7af710bc7f32 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter { case Type::UINT16: case Type::INT16: case Type::INT32: - // TODO: also add int64, uint32, uint64 downcasts, if possible + case Type::UINT32: + case Type::INT64: + case Type::UINT64: break; default: cpp11::stop("Cannot convert Dictionary Array of type `%s` to R", @@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter { dictionary_ = CreateEmptyArray(dict_type.value_type()); } } + + // R factors store their codes in 32-bit integers, so dictionary arrays with + // more levels than that cannot be represented safely. 
+ if (dictionary_->length() > std::numeric_limits::max()) { + const auto& dict_type = checked_cast(*chunked_array->type()); + cpp11::stop( + "Cannot convert Dictionary Array of type `%s` to R: dictionary has " + "more levels than an R factor can represent", + dict_type.ToString().c_str()); + } } SEXP Allocate(R_xlen_t n) const { @@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter { case Type::INT32: return Ingest_some_nulls_Impl(data, array, start, n, chunk_index); + case Type::UINT32: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::INT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::UINT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); default: break; } diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 1ca8832beb84..e404da1d029e 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -371,6 +371,22 @@ test_that("Can create table with specific dictionary types", { } }) +test_that("Table converts dictionary arrays with wider index types back to R", { + fact <- example_data[, "fct"] + + tab_uint32 <- Table$create(fact, schema = schema(fct = dictionary(uint32(), utf8()))) + expect_equal(tab_uint32$schema, schema(fct = dictionary(uint32(), utf8()))) + expect_equal_data_frame(tab_uint32, fact) + + tab_int64 <- Table$create(fact, schema = schema(fct = dictionary(int64(), utf8()))) + expect_equal(tab_int64$schema, schema(fct = dictionary(int64(), utf8()))) + expect_equal_data_frame(tab_int64, fact) + + tab_uint64 <- Table$create(fact, schema = schema(fct = dictionary(uint64(), utf8()))) + expect_equal(tab_uint64$schema, schema(fct = dictionary(uint64(), utf8()))) + expect_equal_data_frame(tab_uint64, fact) +}) + test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) b2 <- record_batch(f = factor(c("c"), levels = c("c", 
"d"))) From 747210dceff6772bbde5103c0fe9f2bd5ec26fe3 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sat, 11 Apr 2026 19:12:30 +0100 Subject: [PATCH 2/9] Implement string view --- cpp/src/arrow/util/converter.h | 1 + r/NAMESPACE | 1 + r/R/arrowExports.R | 21 +++++++----- r/R/dplyr-funcs-doc.R | 2 +- r/R/type.R | 13 ++++++++ r/man/data-type.Rd | 3 ++ r/man/read_json_arrow.Rd | 2 +- r/man/schema.Rd | 2 +- r/src/array_to_vector.cpp | 44 +++++++++++++++---------- r/src/arrowExports.cpp | 45 ++++++++++++++----------- r/src/datatype.cpp | 5 +++ r/src/r_to_arrow.cpp | 55 +++++++++++++++++++++++++++++-- r/tests/testthat/test-Table.R | 7 ++++ r/tests/testthat/test-data-type.R | 12 +++++++ 14 files changed, 161 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c23d6ccd9886..3d1d07d53c2b 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -239,6 +239,7 @@ struct MakeConverterImpl { DICTIONARY_CASE(DoubleType); DICTIONARY_CASE(BinaryType); DICTIONARY_CASE(StringType); + DICTIONARY_CASE(StringViewType); DICTIONARY_CASE(FixedSizeBinaryType); #undef DICTIONARY_CASE default: diff --git a/r/NAMESPACE b/r/NAMESPACE index f42944fb58b5..320c9b378e3f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -397,6 +397,7 @@ export(set_io_thread_count) export(show_exec_plan) export(starts_with) export(string) +export(string_view) export(struct) export(time32) export(time64) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 52274d29f0d9..735a86964373 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -900,6 +900,10 @@ Utf8__initialize <- function() { .Call(`_arrow_Utf8__initialize`) } +StringView__initialize <- function() { + .Call(`_arrow_StringView__initialize`) +} + LargeUtf8__initialize <- function() { .Call(`_arrow_LargeUtf8__initialize`) } @@ -1248,14 +1252,6 @@ Field__Equals <- function(field, other, check_metadata) { .Call(`_arrow_Field__Equals`, field, other, check_metadata) } 
-Field__nullable <- function(field) { - .Call(`_arrow_Field__nullable`, field) -} - -Field__type <- function(field) { - .Call(`_arrow_Field__type`, field) -} - Field__HasMetadata <- function(field) { .Call(`_arrow_Field__HasMetadata`, field) } @@ -1272,6 +1268,14 @@ Field__RemoveMetadata <- function(field) { .Call(`_arrow_Field__RemoveMetadata`, field) } +Field__nullable <- function(field) { + .Call(`_arrow_Field__nullable`, field) +} + +Field__type <- function(field) { + .Call(`_arrow_Field__type`, field) +} + fs___FileInfo__type <- function(x) { .Call(`_arrow_fs___FileInfo__type`, x) } @@ -2199,4 +2203,3 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } - diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index f7ca29833c81..176181a09bbb 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -84,7 +84,7 @@ #' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both #' `str_sub()` and `stringr::str_sub()` work. #' -#' In addition to these functions, you can call any of Arrow's 281 compute +#' In addition to these functions, you can call any of Arrow's 253 compute #' functions directly. Arrow has many functions that don't map to an existing R #' function. 
In other cases where there is an R function mapping, you can still #' call the Arrow function directly if you don't want the adaptations that the R diff --git a/r/R/type.R b/r/R/type.R index 27cb0afe3db6..b370db82d0cc 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -203,6 +203,13 @@ Utf8 <- R6Class( code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow") ) ) +StringView <- R6Class( + "StringView", + inherit = DataType, + public = list( + code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow") + ) +) LargeUtf8 <- R6Class( "LargeUtf8", inherit = DataType, @@ -505,6 +512,10 @@ bool <- boolean #' @export utf8 <- function() Utf8__initialize() +#' @rdname data-type +#' @export +string_view <- function() StringView__initialize() + #' @rdname data-type #' @export large_utf8 <- function() LargeUtf8__initialize() @@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) { boolean = "bool", bool = "bool", utf8 = "string", + utf8_view = "string_view", + string_view = "string_view", large_utf8 = "large_string", large_string = "large_string", binary = "binary", diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd index aa11c222bc55..ce2a6e4e7583 100644 --- a/r/man/data-type.Rd +++ b/r/man/data-type.Rd @@ -18,6 +18,7 @@ \alias{boolean} \alias{bool} \alias{utf8} +\alias{string_view} \alias{large_utf8} \alias{binary} \alias{large_binary} @@ -76,6 +77,8 @@ bool() utf8() +string_view() + large_utf8() binary() diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index b809a63bcc6f..abf6b8fc44a8 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -54,7 +54,7 @@ If \code{schema} is not provided, Arrow data types are inferred from the data: \item JSON numbers convert to \code{\link[=int64]{int64()}}, falling back to \code{\link[=float64]{float64()}} if a non-integer is encountered. 
\item JSON strings of the kind "YYYY-MM-DD" and "YYYY-MM-DD hh:mm:ss" convert to \code{\link[=timestamp]{timestamp(unit = "s")}}, falling back to \code{\link[=utf8]{utf8()}} if a conversion error occurs. -\item JSON arrays convert to a \code{\link[=list_of]{list_of()}} type, and inference proceeds recursively on the JSON arrays' values. +\item JSON arrays convert to a \code{\link[vctrs:list_of]{vctrs::list_of()}} type, and inference proceeds recursively on the JSON arrays' values. \item Nested JSON objects convert to a \code{\link[=struct]{struct()}} type, and inference proceeds recursively on the JSON objects' values. } diff --git a/r/man/schema.Rd b/r/man/schema.Rd index 65ab2eea0d27..ff77a05d84aa 100644 --- a/r/man/schema.Rd +++ b/r/man/schema.Rd @@ -7,7 +7,7 @@ schema(...) } \arguments{ -\item{...}{\link[=field]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract +\item{...}{\link[vctrs:fields]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract a schema} } \description{ diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 7af710bc7f32..bad234eb1120 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -290,26 +290,29 @@ struct Converter_String : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { - auto p_offset = array->data()->GetValues(1); - if (!p_offset) { - return Status::Invalid("Invalid offset buffer"); - } - auto p_strings = array->data()->GetValues(2, *p_offset); - if (!p_strings) { - // There is an offset buffer, but the data buffer is null - // There is at least one value in the array and not all the values are null - // That means all values are either empty strings or nulls so there is nothing to do - - if (array->null_count()) { - arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), - array->offset(), 
n); - for (int i = 0; i < n; i++, null_reader.Next()) { - if (null_reader.IsNotSet()) { - SET_STRING_ELT(data, start + i, NA_STRING); + if constexpr (!std::is_same_v) { + auto p_offset = array->data()->GetValues(1); + if (!p_offset) { + return Status::Invalid("Invalid offset buffer"); + } + auto p_strings = array->data()->GetValues(2, *p_offset); + if (!p_strings) { + // There is an offset buffer, but the data buffer is null + // There is at least one value in the array and not all the values are null + // That means all values are either empty strings or nulls so there is nothing to + // do + + if (array->null_count()) { + arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), + array->offset(), n); + for (int i = 0; i < n; i++, null_reader.Next()) { + if (null_reader.IsNotSet()) { + SET_STRING_ELT(data, start + i, NA_STRING); + } } } + return Status::OK(); } - return Status::OK(); } StringArrayType* string_array = static_cast(array.get()); @@ -725,7 +728,8 @@ class Converter_Dictionary : public Converter { // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;) // Alternative: preserve the logical type of the dictionary values // (e.g. 
if dict is timestamp, return a POSIXt R vector, not factor) - if (dictionary_->type_id() != Type::STRING) { + if (dictionary_->type_id() != Type::STRING && + dictionary_->type_id() != Type::STRING_VIEW) { cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); } @@ -1262,6 +1266,10 @@ std::shared_ptr Converter::Make( return std::make_shared>( chunked_array); + case Type::STRING_VIEW: + return std::make_shared>( + chunked_array); + case Type::DICTIONARY: return std::make_shared(chunked_array); diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5482c8679f68..4f8b54ceb339 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2511,6 +2511,13 @@ BEGIN_CPP11 END_CPP11 } // datatype.cpp +std::shared_ptr StringView__initialize(); +extern "C" SEXP _arrow_StringView__initialize(){ +BEGIN_CPP11 + return cpp11::as_sexp(StringView__initialize()); +END_CPP11 +} +// datatype.cpp std::shared_ptr LargeUtf8__initialize(); extern "C" SEXP _arrow_LargeUtf8__initialize(){ BEGIN_CPP11 @@ -3238,22 +3245,6 @@ BEGIN_CPP11 END_CPP11 } // field.cpp -bool Field__nullable(const std::shared_ptr& field); -extern "C" SEXP _arrow_Field__nullable(SEXP field_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type field(field_sexp); - return cpp11::as_sexp(Field__nullable(field)); -END_CPP11 -} -// field.cpp -std::shared_ptr Field__type(const std::shared_ptr& field); -extern "C" SEXP _arrow_Field__type(SEXP field_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type field(field_sexp); - return cpp11::as_sexp(Field__type(field)); -END_CPP11 -} -// field.cpp bool Field__HasMetadata(const std::shared_ptr& field); extern "C" SEXP _arrow_Field__HasMetadata(SEXP field_sexp){ BEGIN_CPP11 @@ -3286,6 +3277,22 @@ BEGIN_CPP11 return cpp11::as_sexp(Field__RemoveMetadata(field)); END_CPP11 } +// field.cpp +bool Field__nullable(const std::shared_ptr& field); +extern "C" SEXP _arrow_Field__nullable(SEXP field_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type field(field_sexp); + 
return cpp11::as_sexp(Field__nullable(field)); +END_CPP11 +} +// field.cpp +std::shared_ptr Field__type(const std::shared_ptr& field); +extern "C" SEXP _arrow_Field__type(SEXP field_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type field(field_sexp); + return cpp11::as_sexp(Field__type(field)); +END_CPP11 +} // filesystem.cpp fs::FileType fs___FileInfo__type(const std::shared_ptr& x); extern "C" SEXP _arrow_fs___FileInfo__type(SEXP x_sexp){ @@ -5967,6 +5974,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, + { "_arrow_StringView__initialize", (DL_FUNC) &_arrow_StringView__initialize, 0}, { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, @@ -6054,12 +6062,12 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 3}, - { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, - { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, { "_arrow_Field__HasMetadata", (DL_FUNC) &_arrow_Field__HasMetadata, 1}, { "_arrow_Field__metadata", (DL_FUNC) &_arrow_Field__metadata, 1}, { "_arrow_Field__WithMetadata", (DL_FUNC) &_arrow_Field__WithMetadata, 2}, { "_arrow_Field__RemoveMetadata", (DL_FUNC) &_arrow_Field__RemoveMetadata, 1}, + { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, + { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, { "_arrow_fs___FileInfo__set_type", (DL_FUNC) 
&_arrow_fs___FileInfo__set_type, 2}, { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, @@ -6303,4 +6311,3 @@ extern "C" void R_init_arrow(DllInfo* dll){ _arrow_compute__Initialize(); } - diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 3360159c58e6..0346332c0dd6 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -57,6 +57,8 @@ const char* r6_class_name::get( case Type::STRING: return "Utf8"; + case Type::STRING_VIEW: + return "StringView"; case Type::LARGE_STRING: return "LargeUtf8"; @@ -165,6 +167,9 @@ std::shared_ptr Boolean__initialize() { return arrow::boolean() // [[arrow::export]] std::shared_ptr Utf8__initialize() { return arrow::utf8(); } +// [[arrow::export]] +std::shared_ptr StringView__initialize() { return arrow::utf8_view(); } + // [[arrow::export]] std::shared_ptr LargeUtf8__initialize() { return arrow::large_utf8(); } diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 45d68043af5a..20f45e00361b 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -910,6 +910,49 @@ class RPrimitiveConverter> } }; +template +class RPrimitiveConverter> + : public PrimitiveConverter { + public: + Status Extend(SEXP x, int64_t size, int64_t offset = 0) override { + RVectorType rtype = GetVectorType(x); + if (rtype != STRING) { + return Status::Invalid("Expecting a character vector"); + } + return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset); + } + + void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override { + auto task = [this, values, size]() { return this->Extend(values, size); }; + tasks.Append(false, std::move(task)); + } + + private: + Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) { + RETURN_NOT_OK(this->primitive_builder_->Reserve(s.size())); + const SEXP* p_strings = reinterpret_cast(DATAPTR_RO(s)); + + int64_t total_length = 0; + for (R_xlen_t i = offset; i < size; i++, ++p_strings) { + SEXP si = *p_strings; + total_length += 
si == NA_STRING ? 0 : LENGTH(si); + } + RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length)); + + p_strings = reinterpret_cast(DATAPTR_RO(s)); + for (R_xlen_t i = offset; i < size; i++, ++p_strings) { + SEXP si = *p_strings; + if (si == NA_STRING) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + this->primitive_builder_->UnsafeAppend(CHAR(si), LENGTH(si)); + } + } + + return Status::OK(); + } +}; + template class RPrimitiveConverter::value>> : public PrimitiveConverter { @@ -1029,8 +1072,8 @@ class RDictionaryConverter> // first we need to handle the levels SEXP levels = Rf_getAttrib(x, R_LevelsSymbol); - auto memo_chunked_chunked_array = - arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false); + auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray( + levels, this->dict_type_->value_type(), false); for (const auto& chunk : memo_chunked_chunked_array->chunks()) { RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk)); } @@ -1062,7 +1105,13 @@ struct RConverterTrait< }; template -struct RConverterTrait> { +struct RConverterTrait> { + using type = RPrimitiveConverter; +}; + +template +struct RConverterTrait::value && + !is_string_view_type::value>> { // not implemented }; diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index e404da1d029e..c43d20f8fc63 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -387,6 +387,13 @@ test_that("Table converts dictionary arrays with wider index types back to R", { expect_equal_data_frame(tab_uint64, fact) }) +test_that("Table converts dictionary arrays with string_view values", { + expected <- data.frame(foo = factor(c("x", "y", "x"))) + tab <- Table$create(expected, schema = schema(foo = dictionary(uint32(), string_view()))) + + expect_equal_data_frame(tab, expected) +}) + test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) b2 <- 
record_batch(f = factor(c("c"), levels = c("c", "d"))) diff --git a/r/tests/testthat/test-data-type.R b/r/tests/testthat/test-data-type.R index fa2e5bcd6e8d..44c8c67f4a96 100644 --- a/r/tests/testthat/test-data-type.R +++ b/r/tests/testthat/test-data-type.R @@ -163,6 +163,17 @@ test_that("utf8 type works as expected", { expect_equal(x$fields(), list()) }) +test_that("string_view type works as expected", { + x <- string_view() + expect_equal(x$id, Type$STRING_VIEW) + expect_equal(x$name, "utf8_view") + expect_equal(x$ToString(), "string_view") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) +}) + test_that("date types work as expected", { x <- date32() expect_equal(x$id, Type$DATE32) @@ -556,6 +567,7 @@ test_that("DataType$code()", { expect_code_roundtrip(boolean()) expect_code_roundtrip(utf8()) + expect_code_roundtrip(string_view()) expect_code_roundtrip(large_utf8()) expect_code_roundtrip(binary()) From 2ce9621a5fe79e2df5e9841c5a6b6dfde717d98a Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 11:41:18 -0400 Subject: [PATCH 3/9] revert unwanted docs change --- r/R/dplyr-funcs-doc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 176181a09bbb..f7ca29833c81 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -84,7 +84,7 @@ #' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both #' `str_sub()` and `stringr::str_sub()` work. #' -#' In addition to these functions, you can call any of Arrow's 253 compute +#' In addition to these functions, you can call any of Arrow's 281 compute #' functions directly. Arrow has many functions that don't map to an existing R #' function. 
In other cases where there is an R function mapping, you can still #' call the Arrow function directly if you don't want the adaptations that the R From cde21b51e38a983e5c699ab8e5ff8add0483b305 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 11:41:36 -0400 Subject: [PATCH 4/9] add context --- r/src/array_to_vector.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index bad234eb1120..4b157fc61ac9 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -290,6 +290,8 @@ struct Converter_String : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { + // StringViewArray uses a different memory layout (views + data buffers) rather + // than offsets, so skip the offset-based fast path and fall through to GetString(). if constexpr (!std::is_same_v) { auto p_offset = array->data()->GetValues(1); if (!p_offset) { From bb711a774ee3886a22990f4e01beff0fdecacba8 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 11:42:36 -0400 Subject: [PATCH 5/9] expand tests --- r/tests/testthat/test-Array.R | 8 ++++++++ r/tests/testthat/test-Table.R | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 8520160d1255..b0bfc9e53564 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -203,6 +203,14 @@ test_that("Array supports character vectors (ARROW-3339)", { # with NA expect_array_roundtrip(c("itsy", NA, "spider"), utf8()) expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8()) + + # string_view + expect_array_roundtrip(c("itsy", "bitsy", "spider"), string_view(), as = string_view()) + expect_array_roundtrip(c("itsy", NA, "spider"), string_view(), as = string_view()) + + # string_view with empty strings + expect_array_roundtrip(c("", "bitsy", ""), string_view(), as = 
string_view()) + expect_array_roundtrip(c("", NA, ""), string_view(), as = string_view()) }) test_that("Character vectors > 2GB become large_utf8", { diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index c43d20f8fc63..01f204539b29 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -390,8 +390,23 @@ test_that("Table converts dictionary arrays with wider index types back to R", { test_that("Table converts dictionary arrays with string_view values", { expected <- data.frame(foo = factor(c("x", "y", "x"))) tab <- Table$create(expected, schema = schema(foo = dictionary(uint32(), string_view()))) + expect_equal_data_frame(tab, expected) + + # with NAs + expected_na <- data.frame(foo = factor(c("x", NA, "x"))) + tab_na <- Table$create(expected_na, schema = schema(foo = dictionary(uint32(), string_view()))) + expect_equal_data_frame(tab_na, expected_na) +}) +test_that("Table round-trips string_view columns", { + expected <- data.frame(x = c("hello", "world", "")) + tab <- Table$create(expected, schema = schema(x = string_view())) expect_equal_data_frame(tab, expected) + + # with NAs + expected_na <- data.frame(x = c("hello", NA, "")) + tab_na <- Table$create(expected_na, schema = schema(x = string_view())) + expect_equal_data_frame(tab_na, expected_na) }) test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { From 0d77a6f5b568245ae57e32c83bcbef46fc4260e4 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 13:05:36 -0400 Subject: [PATCH 6/9] Fix offset bug --- r/man/CsvReadOptions.Rd | 4 +- r/man/JsonFileFormat.Rd | 4 - r/man/acero.Rd | 346 ++++++++++++++++----------------- r/man/arrow-package.Rd | 1 + r/man/csv_convert_options.Rd | 4 +- r/man/csv_read_options.Rd | 8 +- r/man/enums.Rd | 33 ---- r/man/read_json_arrow.Rd | 2 +- r/man/reexports.Rd | 4 +- r/man/schema.Rd | 2 +- r/man/vctrs_extension_array.Rd | 4 +- r/src/r_to_arrow.cpp | 6 +- 12 files changed, 192 insertions(+), 226 
deletions(-) diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd index d4544cf829f2..320685b05c0d 100644 --- a/r/man/CsvReadOptions.Rd +++ b/r/man/CsvReadOptions.Rd @@ -69,9 +69,9 @@ generate a row of missing values (if \code{FALSE})? \item \code{check_utf8} Logical: check UTF8 validity of string columns? (default \code{TRUE}) \item \code{null_values} character vector of recognized spellings for null values. Analogous to the \code{na.strings} argument to -\code{\link[utils:read.table]{read.csv()}} or \code{na} in \code{\link[readr:read_delim]{readr::read_csv()}}. +\code{\link[utils:read.csv]{read.csv()}} or \code{na} in \code{\link[readr:read_csv]{readr::read_csv()}}. \item \code{strings_can_be_null} Logical: can string / binary columns have -null values? Similar to the \code{quoted_na} argument to \code{\link[readr:read_delim]{readr::read_csv()}}. +null values? Similar to the \code{quoted_na} argument to \code{\link[readr:read_csv]{readr::read_csv()}}. (default \code{FALSE}) \item \code{true_values} character vector of recognized spellings for \code{TRUE} values \item \code{false_values} character vector of recognized spellings for \code{FALSE} values diff --git a/r/man/JsonFileFormat.Rd b/r/man/JsonFileFormat.Rd index a0edb50bb5ec..79a2f22efd7d 100644 --- a/r/man/JsonFileFormat.Rd +++ b/r/man/JsonFileFormat.Rd @@ -30,10 +30,6 @@ characters? (default \code{FALSE}) } } -\examples{ -\dontshow{if (arrow_with_dataset()) withAutoprint(\{ # examplesIf} -\dontshow{\}) # examplesIf} -} \seealso{ \link{FileFormat} } diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 2e8b1fba1e89..6d4c27b18d5c 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -23,44 +23,44 @@ the query on the data. To run the query, call either \code{compute()}, which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting Table into an R \code{tibble}. 
\itemize{ -\item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:anti_join]{anti_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:arrange]{arrange()}} -\item \code{\link[dplyr:compute]{collapse()}} -\item \code{\link[dplyr:compute]{collect()}} +\item \code{\link[dplyr:collapse]{collapse()}} +\item \code{\link[dplyr:collect]{collect()}} \item \code{\link[dplyr:compute]{compute()}} \item \code{\link[dplyr:count]{count()}} \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} returns a non-missing value if present, only returning missing values if all are missing. \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} -\item \code{\link[dplyr:filter]{filter_out()}} -\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:filter_out]{filter_out()}} +\item \code{\link[dplyr:full_join]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} \item \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} -\item \code{\link[dplyr:group_data]{group_vars()}} -\item \code{\link[dplyr:group_data]{groups()}} -\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} argument is ignored -\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:group_vars]{group_vars()}} +\item \code{\link[dplyr:groups]{groups()}} +\item \code{\link[dplyr:inner_join]{inner_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:left_join]{left_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:mutate]{mutate()}} \item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future 
release. Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} -\item \code{\link[dplyr:rename]{rename_with()}} -\item \code{\link[dplyr:mutate-joins]{right_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:rename_with]{rename_with()}} +\item \code{\link[dplyr:right_join]{right_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:select]{select()}} -\item \code{\link[dplyr:filter-joins]{semi_join()}}: the \code{copy} argument is ignored -\item \code{\link[dplyr:explain]{show_query()}} -\item \code{\link[dplyr:slice]{slice_head()}}: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating -\item \code{\link[dplyr:slice]{slice_max()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating -\item \code{\link[dplyr:slice]{slice_min()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating -\item \code{\link[dplyr:slice]{slice_sample()}}: slicing within groups not supported; \code{replace = TRUE} and the \code{weight_by} argument not supported; \code{n} only supported on queries where \code{nrow()} is knowable without evaluating -\item \code{\link[dplyr:slice]{slice_tail()}}: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating +\item \code{\link[dplyr:semi_join]{semi_join()}}: the \code{copy} argument is ignored +\item 
\code{\link[dplyr:show_query]{show_query()}} +\item \code{\link[dplyr:slice_head]{slice_head()}}: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating +\item \code{\link[dplyr:slice_max]{slice_max()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating +\item \code{\link[dplyr:slice_min]{slice_min()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating +\item \code{\link[dplyr:slice_sample]{slice_sample()}}: slicing within groups not supported; \code{replace = TRUE} and the \code{weight_by} argument not supported; \code{n} only supported on queries where \code{nrow()} is knowable without evaluating +\item \code{\link[dplyr:slice_tail]{slice_tail()}}: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:summarise]{summarise()}}: window functions not currently supported; arguments \code{.drop = FALSE} and \code{.groups = "rowwise"} not supported -\item \code{\link[dplyr:count]{tally()}} +\item \code{\link[dplyr:tally]{tally()}} \item \code{\link[dplyr:transmute]{transmute()}} -\item \code{\link[dplyr:group_by]{ungroup()}} -\item \code{\link[dplyr:setops]{union()}} -\item \code{\link[dplyr:setops]{union_all()}} +\item \code{\link[dplyr:ungroup]{ungroup()}} +\item \code{\link[dplyr:union]{union()}} +\item \code{\link[dplyr:union_all]{union_all()}} } } @@ -72,7 +72,7 @@ can assume that the function works in Acero just as it does in R. 
Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both \code{str_sub()} and \code{stringr::str_sub()} work. -In addition to these functions, you can call any of Arrow's 281 compute +In addition to these functions, you can call any of Arrow's 253 compute functions directly. Arrow has many functions that don't map to an existing R function. In other cases where there is an R function mapping, you can still call the Arrow function directly if you don't want the adaptations that the R @@ -89,109 +89,109 @@ as \code{arrow_ascii_is_decimal}. \subsection{base}{ \itemize{ -\item \code{\link[=!]{!}} -\item \code{\link[=!=]{!=}} -\item \code{\link[=\%\%]{\%\%}} -\item \code{\link[=\%/\%]{\%/\%}} -\item \code{\link[=\%in\%]{\%in\%}} -\item \code{\link[=&]{&}} -\item \code{\link[=*]{*}} -\item \code{\link[=+]{+}} -\item \code{\link[=-]{-}} -\item \code{\link[=/]{/}} -\item \code{\link[=<]{<}} -\item \code{\link[=<=]{<=}} -\item \code{\link[===]{==}} -\item \code{\link[=>]{>}} -\item \code{\link[=>=]{>=}} -\item \code{\link[base:ISOdatetime]{ISOdate()}} +\item \code{\link{!}} +\item \code{\link{!=}} +\item \code{\link{\%\%}} +\item \code{\link{\%/\%}} +\item \code{\link{\%in\%}} +\item \code{\link{&}} +\item \code{\link{*}} +\item \code{\link{+}} +\item \code{\link{-}} +\item \code{\link{/}} +\item \code{\link{<}} +\item \code{\link{<=}} +\item \code{\link{==}} +\item \code{\link{>}} +\item \code{\link{>=}} +\item \code{\link[base:ISOdate]{ISOdate()}} \item \code{\link[base:ISOdatetime]{ISOdatetime()}} -\item \code{\link[=^]{^}} -\item \code{\link[base:MathFun]{abs()}} -\item \code{\link[base:Trig]{acos()}} -\item \code{\link[base:Hyperbolic]{acosh()}} +\item \code{\link{^}} +\item \code{\link[base:abs]{abs()}} +\item \code{\link[base:acos]{acos()}} +\item \code{\link[base:acosh]{acosh()}} \item \code{\link[base:all]{all()}} \item \code{\link[base:any]{any()}} \item \code{\link[base:as.Date]{as.Date()}}: Multiple \code{tryFormats} not supported in 
Arrow. Consider using the lubridate specialised parsing functions \code{ymd()}, \code{ymd()}, etc. -\item \code{\link[base:character]{as.character()}} -\item \code{\link[base:difftime]{as.difftime()}}: only supports \code{units = "secs"} (the default) -\item \code{\link[base:double]{as.double()}} -\item \code{\link[base:integer]{as.integer()}} -\item \code{\link[base:logical]{as.logical()}} -\item \code{\link[base:numeric]{as.numeric()}} -\item \code{\link[base:Trig]{asin()}} -\item \code{\link[base:Hyperbolic]{asinh()}} -\item \code{\link[base:Trig]{atan()}} -\item \code{\link[base:Hyperbolic]{atanh()}} -\item \code{\link[base:Round]{ceiling()}} -\item \code{\link[base:Trig]{cos()}} -\item \code{\link[base:Hyperbolic]{cosh()}} +\item \code{\link[base:as.character]{as.character()}} +\item \code{\link[base:as.difftime]{as.difftime()}}: only supports \code{units = "secs"} (the default) +\item \code{\link[base:as.double]{as.double()}} +\item \code{\link[base:as.integer]{as.integer()}} +\item \code{\link[base:as.logical]{as.logical()}} +\item \code{\link[base:as.numeric]{as.numeric()}} +\item \code{\link[base:asin]{asin()}} +\item \code{\link[base:asinh]{asinh()}} +\item \code{\link[base:atan]{atan()}} +\item \code{\link[base:atanh]{atanh()}} +\item \code{\link[base:ceiling]{ceiling()}} +\item \code{\link[base:cos]{cos()}} +\item \code{\link[base:cosh]{cosh()}} \item \code{\link[base:data.frame]{data.frame()}}: \code{row.names} and \code{check.rows} arguments not supported; \code{stringsAsFactors} must be \code{FALSE} \item \code{\link[base:difftime]{difftime()}}: only supports \code{units = "secs"} (the default); \code{tz} argument not supported -\item \code{\link[base:startsWith]{endsWith()}} -\item \code{\link[base:Log]{exp()}} -\item \code{\link[base:Log]{expm1()}} -\item \code{\link[base:Round]{floor()}} +\item \code{\link[base:endsWith]{endsWith()}} +\item \code{\link[base:exp]{exp()}} +\item \code{\link[base:expm1]{expm1()}} +\item 
\code{\link[base:floor]{floor()}} \item \code{\link[base:format]{format()}} -\item \code{\link[base:grep]{grepl()}} -\item \code{\link[base:grep]{gsub()}} +\item \code{\link[base:grepl]{grepl()}} +\item \code{\link[base:gsub]{gsub()}} \item \code{\link[base:ifelse]{ifelse()}} -\item \code{\link[base:character]{is.character()}} -\item \code{\link[base:double]{is.double()}} -\item \code{\link[base:factor]{is.factor()}} +\item \code{\link[base:is.character]{is.character()}} +\item \code{\link[base:is.double]{is.double()}} +\item \code{\link[base:is.factor]{is.factor()}} \item \code{\link[base:is.finite]{is.finite()}} -\item \code{\link[base:is.finite]{is.infinite()}} -\item \code{\link[base:integer]{is.integer()}} -\item \code{\link[base:list]{is.list()}} -\item \code{\link[base:logical]{is.logical()}} -\item \code{\link[base:NA]{is.na()}} -\item \code{\link[base:is.finite]{is.nan()}} -\item \code{\link[base:numeric]{is.numeric()}} -\item \code{\link[base:Log]{log()}} -\item \code{\link[base:Log]{log10()}} -\item \code{\link[base:Log]{log1p()}} -\item \code{\link[base:Log]{log2()}} -\item \code{\link[base:Log]{logb()}} -\item \code{\link[base:Extremes]{max()}} +\item \code{\link[base:is.infinite]{is.infinite()}} +\item \code{\link[base:is.integer]{is.integer()}} +\item \code{\link[base:is.list]{is.list()}} +\item \code{\link[base:is.logical]{is.logical()}} +\item \code{\link[base:is.na]{is.na()}} +\item \code{\link[base:is.nan]{is.nan()}} +\item \code{\link[base:is.numeric]{is.numeric()}} +\item \code{\link[base:log]{log()}} +\item \code{\link[base:log10]{log10()}} +\item \code{\link[base:log1p]{log1p()}} +\item \code{\link[base:log2]{log2()}} +\item \code{\link[base:logb]{logb()}} +\item \code{\link[base:max]{max()}} \item \code{\link[base:mean]{mean()}} -\item \code{\link[base:Extremes]{min()}} +\item \code{\link[base:min]{min()}} \item \code{\link[base:nchar]{nchar()}}: \code{allowNA = TRUE} and \code{keepNA = TRUE} not supported \item 
\code{\link[base:paste]{paste()}}: the \code{collapse} argument is not yet supported -\item \code{\link[base:paste]{paste0()}}: the \code{collapse} argument is not yet supported -\item \code{\link[base:Extremes]{pmax()}} -\item \code{\link[base:Extremes]{pmin()}} +\item \code{\link[base:paste0]{paste0()}}: the \code{collapse} argument is not yet supported +\item \code{\link[base:pmax]{pmax()}} +\item \code{\link[base:pmin]{pmin()}} \item \code{\link[base:prod]{prod()}} -\item \code{\link[base:Round]{round()}} +\item \code{\link[base:round]{round()}} \item \code{\link[base:sign]{sign()}} -\item \code{\link[base:Trig]{sin()}} -\item \code{\link[base:Hyperbolic]{sinh()}} -\item \code{\link[base:MathFun]{sqrt()}} +\item \code{\link[base:sin]{sin()}} +\item \code{\link[base:sinh]{sinh()}} +\item \code{\link[base:sqrt]{sqrt()}} \item \code{\link[base:startsWith]{startsWith()}} -\item \code{\link[base:strptime]{strftime()}} +\item \code{\link[base:strftime]{strftime()}} \item \code{\link[base:strptime]{strptime()}}: accepts a \code{unit} argument not present in the \code{base} function. Valid values are "s", "ms" (default), "us", "ns". 
\item \code{\link[base:strrep]{strrep()}} \item \code{\link[base:strsplit]{strsplit()}} -\item \code{\link[base:grep]{sub()}} +\item \code{\link[base:sub]{sub()}} \item \code{\link[base:substr]{substr()}}: \code{start} and \code{stop} must be length 1 -\item \code{\link[base:substr]{substring()}} +\item \code{\link[base:substring]{substring()}} \item \code{\link[base:sum]{sum()}} -\item \code{\link[base:Trig]{tan()}} -\item \code{\link[base:Hyperbolic]{tanh()}} -\item \code{\link[base:chartr]{tolower()}} -\item \code{\link[base:chartr]{toupper()}} -\item \code{\link[base:Round]{trunc()}} -\item \code{\link[=|]{|}} +\item \code{\link[base:tan]{tan()}} +\item \code{\link[base:tanh]{tanh()}} +\item \code{\link[base:tolower]{tolower()}} +\item \code{\link[base:toupper]{toupper()}} +\item \code{\link[base:trunc]{trunc()}} +\item \code{\link{|}} } } \subsection{bit64}{ \itemize{ -\item \code{\link[bit64:as.integer64.character]{as.integer64()}} -\item \code{\link[bit64:bit64-package]{is.integer64()}} +\item \code{\link[bit64:as.integer64]{as.integer64()}} +\item \code{\link[bit64:is.integer64]{is.integer64()}} } } @@ -199,25 +199,25 @@ Valid values are "s", "ms" (default), "us", "ns". 
\itemize{ \item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} -\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported +\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} -\item \code{\link[dplyr:across]{if_all()}} -\item \code{\link[dplyr:across]{if_any()}} +\item \code{\link[dplyr:if_all]{if_all()}} +\item \code{\link[dplyr:if_any]{if_any()}} \item \code{\link[dplyr:if_else]{if_else()}} -\item \code{\link[dplyr:context]{n()}} +\item \code{\link[dplyr:n]{n()}} \item \code{\link[dplyr:n_distinct]{n_distinct()}} -\item \code{\link[dplyr:recode-and-replace-values]{recode_values()}}: \code{ptype} argument and \code{unmatched = "error"} not supported -\item \code{\link[dplyr:recode-and-replace-values]{replace_values()}} -\item \code{\link[dplyr:case-and-replace-when]{replace_when()}} -\item \code{\link[dplyr:when-any-all]{when_all()}} -\item \code{\link[dplyr:when-any-all]{when_any()}} +\item \code{\link[dplyr:recode_values]{recode_values()}}: \code{ptype} argument and \code{unmatched = "error"} not supported +\item \code{\link[dplyr:replace_values]{replace_values()}} +\item \code{\link[dplyr:replace_when]{replace_when()}} +\item \code{\link[dplyr:when_all]{when_all()}} +\item \code{\link[dplyr:when_any]{when_any()}} } } \subsection{hms}{ \itemize{ -\item \code{\link[hms:hms]{as_hms()}}: subsecond precision not supported for character input +\item \code{\link[hms:as_hms]{as_hms()}}: subsecond precision not supported for character input \item \code{\link[hms:hms]{hms()}}: nanosecond times not supported } } @@ -226,83 +226,83 @@ Valid values are "s", "ms" (default), "us", "ns". 
\itemize{ \item \code{\link[lubridate:am]{am()}} \item \code{\link[lubridate:as_date]{as_date()}} -\item \code{\link[lubridate:as_date]{as_datetime()}} -\item \code{\link[lubridate:round_date]{ceiling_date()}} +\item \code{\link[lubridate:as_datetime]{as_datetime()}} +\item \code{\link[lubridate:ceiling_date]{ceiling_date()}} \item \code{\link[lubridate:date]{date()}} \item \code{\link[lubridate:date_decimal]{date_decimal()}} \item \code{\link[lubridate:day]{day()}} -\item \code{\link[lubridate:duration]{ddays()}} +\item \code{\link[lubridate:ddays]{ddays()}} \item \code{\link[lubridate:decimal_date]{decimal_date()}} -\item \code{\link[lubridate:duration]{dhours()}} -\item \code{\link[lubridate:duration]{dmicroseconds()}} -\item \code{\link[lubridate:duration]{dmilliseconds()}} -\item \code{\link[lubridate:duration]{dminutes()}} -\item \code{\link[lubridate:duration]{dmonths()}} -\item \code{\link[lubridate:ymd]{dmy()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{dmy_h()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{dmy_hm()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{dmy_hms()}}: \code{locale} argument not supported -\item \code{\link[lubridate:duration]{dnanoseconds()}} -\item \code{\link[lubridate:duration]{dpicoseconds()}}: not supported -\item \code{\link[lubridate:duration]{dseconds()}} +\item \code{\link[lubridate:dhours]{dhours()}} +\item \code{\link[lubridate:dmicroseconds]{dmicroseconds()}} +\item \code{\link[lubridate:dmilliseconds]{dmilliseconds()}} +\item \code{\link[lubridate:dminutes]{dminutes()}} +\item \code{\link[lubridate:dmonths]{dmonths()}} +\item \code{\link[lubridate:dmy]{dmy()}}: \code{locale} argument not supported +\item \code{\link[lubridate:dmy_h]{dmy_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:dmy_hm]{dmy_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:dmy_hms]{dmy_hms()}}: 
\code{locale} argument not supported +\item \code{\link[lubridate:dnanoseconds]{dnanoseconds()}} +\item \code{\link[lubridate:dpicoseconds]{dpicoseconds()}}: not supported +\item \code{\link[lubridate:dseconds]{dseconds()}} \item \code{\link[lubridate:dst]{dst()}} -\item \code{\link[lubridate:duration]{dweeks()}} -\item \code{\link[lubridate:duration]{dyears()}} -\item \code{\link[lubridate:ymd]{dym()}}: \code{locale} argument not supported -\item \code{\link[lubridate:week]{epiweek()}} -\item \code{\link[lubridate:year]{epiyear()}} -\item \code{\link[lubridate:parse_date_time]{fast_strptime()}}: non-default values of \code{lt} and \code{cutoff_2000} not supported -\item \code{\link[lubridate:round_date]{floor_date()}} +\item \code{\link[lubridate:dweeks]{dweeks()}} +\item \code{\link[lubridate:dyears]{dyears()}} +\item \code{\link[lubridate:dym]{dym()}}: \code{locale} argument not supported +\item \code{\link[lubridate:epiweek]{epiweek()}} +\item \code{\link[lubridate:epiyear]{epiyear()}} +\item \code{\link[lubridate:fast_strptime]{fast_strptime()}}: non-default values of \code{lt} and \code{cutoff_2000} not supported +\item \code{\link[lubridate:floor_date]{floor_date()}} \item \code{\link[lubridate:force_tz]{force_tz()}}: Timezone conversion from non-UTC timezone not supported; \code{roll_dst} values of 'error' and 'boundary' are supported for nonexistent times, \code{roll_dst} values of 'error', 'pre', and 'post' are supported for ambiguous times. 
\item \code{\link[lubridate:format_ISO8601]{format_ISO8601()}} \item \code{\link[lubridate:hour]{hour()}} -\item \code{\link[lubridate:date_utils]{is.Date()}} -\item \code{\link[lubridate:posix_utils]{is.POSIXct()}} +\item \code{\link[lubridate:is.Date]{is.Date()}} +\item \code{\link[lubridate:is.POSIXct]{is.POSIXct()}} \item \code{\link[lubridate:is.instant]{is.instant()}} -\item \code{\link[lubridate:is.instant]{is.timepoint()}} -\item \code{\link[lubridate:week]{isoweek()}} -\item \code{\link[lubridate:year]{isoyear()}} +\item \code{\link[lubridate:is.timepoint]{is.timepoint()}} +\item \code{\link[lubridate:isoweek]{isoweek()}} +\item \code{\link[lubridate:isoyear]{isoyear()}} \item \code{\link[lubridate:leap_year]{leap_year()}} -\item \code{\link[lubridate:make_datetime]{make_date()}} +\item \code{\link[lubridate:make_date]{make_date()}} \item \code{\link[lubridate:make_datetime]{make_datetime()}}: only supports UTC (default) timezone \item \code{\link[lubridate:make_difftime]{make_difftime()}}: only supports \code{units = "secs"} (the default); providing both \code{num} and \code{...} is not supported -\item \code{\link[lubridate:day]{mday()}} -\item \code{\link[lubridate:ymd]{mdy()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{mdy_h()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{mdy_hm()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{mdy_hms()}}: \code{locale} argument not supported +\item \code{\link[lubridate:mday]{mday()}} +\item \code{\link[lubridate:mdy]{mdy()}}: \code{locale} argument not supported +\item \code{\link[lubridate:mdy_h]{mdy_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:mdy_hm]{mdy_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:mdy_hms]{mdy_hms()}}: \code{locale} argument not supported \item \code{\link[lubridate:minute]{minute()}} \item \code{\link[lubridate:month]{month()}} -\item 
\code{\link[lubridate:ymd]{my()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd]{myd()}}: \code{locale} argument not supported +\item \code{\link[lubridate:my]{my()}}: \code{locale} argument not supported +\item \code{\link[lubridate:myd]{myd()}}: \code{locale} argument not supported \item \code{\link[lubridate:parse_date_time]{parse_date_time()}}: \code{quiet = FALSE} is not supported Available formats are H, I, j, M, S, U, w, W, y, Y, R, T. On Linux and OS X additionally a, A, b, B, Om, p, r are available. -\item \code{\link[lubridate:am]{pm()}} -\item \code{\link[lubridate:day]{qday()}} +\item \code{\link[lubridate:pm]{pm()}} +\item \code{\link[lubridate:qday]{qday()}} \item \code{\link[lubridate:quarter]{quarter()}} \item \code{\link[lubridate:round_date]{round_date()}} \item \code{\link[lubridate:second]{second()}} -\item \code{\link[lubridate:quarter]{semester()}} +\item \code{\link[lubridate:semester]{semester()}} \item \code{\link[lubridate:tz]{tz()}} -\item \code{\link[lubridate:day]{wday()}} +\item \code{\link[lubridate:wday]{wday()}} \item \code{\link[lubridate:week]{week()}} \item \code{\link[lubridate:with_tz]{with_tz()}} -\item \code{\link[lubridate:day]{yday()}} -\item \code{\link[lubridate:ymd]{ydm()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{ydm_h()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{ydm_hm()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{ydm_hms()}}: \code{locale} argument not supported +\item \code{\link[lubridate:yday]{yday()}} +\item \code{\link[lubridate:ydm]{ydm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ydm_h]{ydm_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ydm_hm]{ydm_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ydm_hms]{ydm_hms()}}: \code{locale} argument not supported \item \code{\link[lubridate:year]{year()}} -\item 
\code{\link[lubridate:ymd]{ym()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ym]{ym()}}: \code{locale} argument not supported \item \code{\link[lubridate:ymd]{ymd()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{ymd_h()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd_hms]{ymd_hm()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_h]{ymd_h()}}: \code{locale} argument not supported +\item \code{\link[lubridate:ymd_hm]{ymd_hm()}}: \code{locale} argument not supported \item \code{\link[lubridate:ymd_hms]{ymd_hms()}}: \code{locale} argument not supported -\item \code{\link[lubridate:ymd]{yq()}}: \code{locale} argument not supported +\item \code{\link[lubridate:yq]{yq()}}: \code{locale} argument not supported } } @@ -314,11 +314,11 @@ On Linux and OS X additionally a, A, b, B, Om, p, r are available. \subsection{rlang}{ \itemize{ -\item \code{\link[rlang:type-predicates]{is_character()}} -\item \code{\link[rlang:type-predicates]{is_double()}} -\item \code{\link[rlang:type-predicates]{is_integer()}} -\item \code{\link[rlang:type-predicates]{is_list()}} -\item \code{\link[rlang:type-predicates]{is_logical()}} +\item \code{\link[rlang:is_character]{is_character()}} +\item \code{\link[rlang:is_double]{is_double()}} +\item \code{\link[rlang:is_integer]{is_integer()}} +\item \code{\link[rlang:is_list]{is_list()}} +\item \code{\link[rlang:is_logical]{is_logical()}} } } @@ -328,7 +328,7 @@ On Linux and OS X additionally a, A, b, B, Om, p, r are available. 
\item \code{\link[stats:quantile]{quantile()}}: \code{probs} must be length 1; approximate quantile (t-digest) is computed \item \code{\link[stats:sd]{sd()}} -\item \code{\link[stats:cor]{var()}} +\item \code{\link[stats:var]{var()}} } } @@ -346,22 +346,22 @@ Pattern modifiers \code{coll()} and \code{boundary()} are not supported in any f \item \code{\link[stringr:str_count]{str_count()}}: \code{pattern} must be a length 1 character vector \item \code{\link[stringr:str_detect]{str_detect()}} \item \code{\link[stringr:str_dup]{str_dup()}} -\item \code{\link[stringr:str_starts]{str_ends()}} -\item \code{\link[stringr:str_like]{str_ilike()}} +\item \code{\link[stringr:str_ends]{str_ends()}} +\item \code{\link[stringr:str_ilike]{str_ilike()}} \item \code{\link[stringr:str_length]{str_length()}} \item \code{\link[stringr:str_like]{str_like()}} \item \code{\link[stringr:str_pad]{str_pad()}} \item \code{\link[stringr:str_remove]{str_remove()}} -\item \code{\link[stringr:str_remove]{str_remove_all()}} +\item \code{\link[stringr:str_remove_all]{str_remove_all()}} \item \code{\link[stringr:str_replace]{str_replace()}} -\item \code{\link[stringr:str_replace]{str_replace_all()}} +\item \code{\link[stringr:str_replace_all]{str_replace_all()}} \item \code{\link[stringr:str_replace_na]{str_replace_na()}} \item \code{\link[stringr:str_split]{str_split()}}: Case-insensitive string splitting and splitting into 0 parts not supported \item \code{\link[stringr:str_starts]{str_starts()}} \item \code{\link[stringr:str_sub]{str_sub()}}: \code{start} and \code{end} must be length 1 -\item \code{\link[stringr:case]{str_to_lower()}} -\item \code{\link[stringr:case]{str_to_title()}} -\item \code{\link[stringr:case]{str_to_upper()}} +\item \code{\link[stringr:str_to_lower]{str_to_lower()}} +\item \code{\link[stringr:str_to_title]{str_to_title()}} +\item \code{\link[stringr:str_to_upper]{str_to_upper()}} \item \code{\link[stringr:str_trim]{str_trim()}} } } @@ -375,12 +375,12 @@ Pattern 
modifiers \code{coll()} and \code{boundary()} are not supported in any f \subsection{tidyselect}{ \itemize{ \item \code{\link[tidyselect:all_of]{all_of()}} -\item \code{\link[tidyselect:starts_with]{contains()}} -\item \code{\link[tidyselect:starts_with]{ends_with()}} +\item \code{\link[tidyselect:contains]{contains()}} +\item \code{\link[tidyselect:ends_with]{ends_with()}} \item \code{\link[tidyselect:everything]{everything()}} -\item \code{\link[tidyselect:everything]{last_col()}} -\item \code{\link[tidyselect:starts_with]{matches()}} -\item \code{\link[tidyselect:starts_with]{num_range()}} +\item \code{\link[tidyselect:last_col]{last_col()}} +\item \code{\link[tidyselect:matches]{matches()}} +\item \code{\link[tidyselect:num_range]{num_range()}} \item \code{\link[tidyselect:one_of]{one_of()}} \item \code{\link[tidyselect:starts_with]{starts_with()}} } diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index 69cef1bacccf..c96a0ddb84e3 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -22,6 +22,7 @@ Useful links: Authors: \itemize{ + \item Jonathan Keane \email{jkeane@gmail.com} \item Neal Richardson \email{neal.p.richardson@gmail.com} \item Ian Cook \email{ianmcook@gmail.com} \item Nic Crane \email{thisisnic@gmail.com} diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd index 58e685351b91..c61da51fb7ce 100644 --- a/r/man/csv_convert_options.Rd +++ b/r/man/csv_convert_options.Rd @@ -24,14 +24,14 @@ csv_convert_options( \item{null_values}{Character vector of recognized spellings for null values. 
Analogous to the \code{na.strings} argument to -\code{\link[utils:read.table]{read.csv()}} or \code{na} in \code{\link[readr:read_delim]{readr::read_csv()}}.} +\code{\link[utils:read.csv]{read.csv()}} or \code{na} in \code{\link[readr:read_csv]{readr::read_csv()}}.} \item{true_values}{Character vector of recognized spellings for \code{TRUE} values} \item{false_values}{Character vector of recognized spellings for \code{FALSE} values} \item{strings_can_be_null}{Logical: can string / binary columns have -null values? Similar to the \code{quoted_na} argument to \code{\link[readr:read_delim]{readr::read_csv()}}} +null values? Similar to the \code{quoted_na} argument to \code{\link[readr:read_csv]{readr::read_csv()}}} \item{col_types}{A \code{Schema} or \code{NULL} to infer types} diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd index 8049403f26dc..cea245ff0ac9 100644 --- a/r/man/csv_read_options.Rd +++ b/r/man/csv_read_options.Rd @@ -35,9 +35,11 @@ be "f0", "f1", ..., "fN".} \item{skip_rows_after_names}{Number of lines to skip after the column names (default 0). This number can be larger than the number of rows in one block, and empty rows are counted. The order of application is as follows: -- \code{skip_rows} is applied (if non-zero); -- column names are read (unless \code{column_names} is set); -- \code{skip_rows_after_names} is applied (if non-zero).} +\itemize{ +\item \code{skip_rows} is applied (if non-zero); +\item column names are read (unless \code{column_names} is set); +\item \code{skip_rows_after_names} is applied (if non-zero). 
+}} } \description{ CSV Reading Options diff --git a/r/man/enums.Rd b/r/man/enums.Rd index 4088e7d843b8..6807ce662555 100644 --- a/r/man/enums.Rd +++ b/r/man/enums.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/enums.R -\docType{data} \name{enums} \alias{enums} \alias{TimeUnit} @@ -19,37 +18,6 @@ \alias{RoundMode} \alias{JoinType} \title{Arrow enums} -\format{ -An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. - -An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. - -An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 45. - -An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 13. - -An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. - -An object of class \code{MessageType} (inherits from \code{arrow-enum}) of length 5. - -An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9. - -An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. - -An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 3. - -An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5. - -An object of class \code{QuantileInterpolation} (inherits from \code{arrow-enum}) of length 5. - -An object of class \code{NullEncodingBehavior} (inherits from \code{arrow-enum}) of length 2. - -An object of class \code{NullHandlingBehavior} (inherits from \code{arrow-enum}) of length 3. - -An object of class \code{RoundMode} (inherits from \code{arrow-enum}) of length 10. - -An object of class \code{JoinType} (inherits from \code{arrow-enum}) of length 8. 
-} \usage{ TimeUnit @@ -84,5 +52,4 @@ JoinType \description{ Arrow enums } -\keyword{datasets} \keyword{internal} diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index abf6b8fc44a8..b809a63bcc6f 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -54,7 +54,7 @@ If \code{schema} is not provided, Arrow data types are inferred from the data: \item JSON numbers convert to \code{\link[=int64]{int64()}}, falling back to \code{\link[=float64]{float64()}} if a non-integer is encountered. \item JSON strings of the kind "YYYY-MM-DD" and "YYYY-MM-DD hh:mm:ss" convert to \code{\link[=timestamp]{timestamp(unit = "s")}}, falling back to \code{\link[=utf8]{utf8()}} if a conversion error occurs. -\item JSON arrays convert to a \code{\link[vctrs:list_of]{vctrs::list_of()}} type, and inference proceeds recursively on the JSON arrays' values. +\item JSON arrays convert to a \code{\link[=list_of]{list_of()}} type, and inference proceeds recursively on the JSON arrays' values. \item Nested JSON objects convert to a \code{\link[=struct]{struct()}} type, and inference proceeds recursively on the JSON objects' values. } diff --git a/r/man/reexports.Rd b/r/man/reexports.Rd index 591158c72f4c..756df2d884c3 100644 --- a/r/man/reexports.Rd +++ b/r/man/reexports.Rd @@ -22,8 +22,8 @@ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ - \item{bit64}{\code{\link[bit64:bit64-package]{print.integer64}}, \code{\link[bit64:bit64-package]{str.integer64}}} + \item{bit64}{\code{\link[bit64:print.integer64]{print.integer64()}}, \code{\link[bit64:str.integer64]{str.integer64()}}} - \item{tidyselect}{\code{\link[tidyselect]{all_of}}, \code{\link[tidyselect:starts_with]{contains}}, \code{\link[tidyselect:starts_with]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect:everything]{last_col}}, \code{\link[tidyselect:starts_with]{matches}}, \code{\link[tidyselect:starts_with]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}} + \item{tidyselect}{\code{\link[tidyselect:all_of]{all_of()}}, \code{\link[tidyselect:contains]{contains()}}, \code{\link[tidyselect:ends_with]{ends_with()}}, \code{\link[tidyselect:everything]{everything()}}, \code{\link[tidyselect:last_col]{last_col()}}, \code{\link[tidyselect:matches]{matches()}}, \code{\link[tidyselect:num_range]{num_range()}}, \code{\link[tidyselect:one_of]{one_of()}}, \code{\link[tidyselect:starts_with]{starts_with()}}} }} diff --git a/r/man/schema.Rd b/r/man/schema.Rd index ff77a05d84aa..65ab2eea0d27 100644 --- a/r/man/schema.Rd +++ b/r/man/schema.Rd @@ -7,7 +7,7 @@ schema(...) 
} \arguments{ -\item{...}{\link[vctrs:fields]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract +\item{...}{\link[=field]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract a schema} } \description{ diff --git a/r/man/vctrs_extension_array.Rd b/r/man/vctrs_extension_array.Rd index a3b9d902a1fb..6fb1b333277f 100644 --- a/r/man/vctrs_extension_array.Rd +++ b/r/man/vctrs_extension_array.Rd @@ -10,7 +10,7 @@ vctrs_extension_array(x, ptype = vctrs::vec_ptype(x), storage_type = NULL) vctrs_extension_type(x, storage_type = infer_type(vctrs::vec_data(x))) } \arguments{ -\item{x}{A vctr (i.e., \code{\link[vctrs:vec_assert]{vctrs::vec_is()}} returns \code{TRUE}).} +\item{x}{A vctr (i.e., \code{\link[vctrs:vec_is]{vctrs::vec_is()}} returns \code{TRUE}).} \item{ptype}{A \code{\link[vctrs:vec_ptype]{vctrs::vec_ptype()}}, which is usually a zero-length version of the object with the appropriate attributes set. This value @@ -33,7 +33,7 @@ Most common R vector types are converted automatically to a suitable Arrow \link[=data-type]{data type} without the need for an extension type. For vector types whose conversion is not suitably handled by default, you can create a \code{\link[=vctrs_extension_array]{vctrs_extension_array()}}, which passes \code{\link[vctrs:vec_data]{vctrs::vec_data()}} to -\code{Array$create()} and calls \code{\link[vctrs:vec_proxy]{vctrs::vec_restore()}} when the \link{Array} is +\code{Array$create()} and calls \code{\link[vctrs:vec_restore]{vctrs::vec_restore()}} when the \link{Array} is converted back into an R vector. 
} \examples{ diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 20f45e00361b..9ce20b559d0f 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -929,8 +929,8 @@ class RPrimitiveConverter> private: Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) { - RETURN_NOT_OK(this->primitive_builder_->Reserve(s.size())); - const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)); + RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset)); + const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset; int64_t total_length = 0; for (R_xlen_t i = offset; i < size; i++, ++p_strings) { @@ -939,7 +939,7 @@ class RPrimitiveConverter> } RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length)); - p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)); + p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset; for (R_xlen_t i = offset; i < size; i++, ++p_strings) { SEXP si = *p_strings; if (si == NA_STRING) { From 9f59d40a2cf61bf5078081a4316e7839e87f9c1d Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 13:07:23 -0400 Subject: [PATCH 7/9] auto-update description --- r/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index c76dfc5572fe..b21b12cba7bd 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -44,7 +44,6 @@ Imports: utils, vctrs Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source") -RoxygenNote: 7.3.3 Config/testthat/edition: 3 Config/build/bootstrap: TRUE Suggests: @@ -152,3 +151,4 @@ Collate: 'schema.R' 'udf.R' 'util.R' +Config/roxygen2/version: 8.0.0 From 8925a6b8c848676670a3b52ab28e2088cc8bfe10 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 13:21:37 -0400 Subject: [PATCH 8/9] Fix DictionaryBuilder Append signature in Python bindings --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc index e7ce54abcd8f..16b91ce7f251 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -826,7 +826,8 @@ class PyDictionaryConverter> } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); - return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + return this->value_builder_->Append(reinterpret_cast(view_.bytes), + static_cast(view_.size)); } } From de59365eb6edf83d28341621a58e224c52e2d04f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 5 May 2026 14:18:31 -0400 Subject: [PATCH 9/9] Fix DictionaryBuilder::Append for StringViewType in Python bindings --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 16b91ce7f251..8c92918cf30a 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -826,8 +826,8 @@ class PyDictionaryConverter> } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); - return this->value_builder_->Append(reinterpret_cast(view_.bytes), - static_cast(view_.size)); + return this->value_builder_->Append( + std::string_view(view_.bytes, static_cast(view_.size))); } }