diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake index 76a73911eb4a..228e0c254e9c 100644 --- a/cmake/warnings.cmake +++ b/cmake/warnings.cmake @@ -44,3 +44,4 @@ no_warning(thread-safety-negative) # experimental flag, too many false positives no_warning(unsafe-buffer-usage) # too aggressive no_warning(switch-default) # conflicts with "defaults in a switch covering all enum values" no_warning(nrvo) # not eliding copy on return - too aggressive +no_warning(missing-noreturn) # Clang: many throw-only overrides; marking [[noreturn]] on all of them is impractical diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index d0ae7d256815..ceb428c52e6b 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit d0ae7d2568151feef61d3ec7896803262f0e0f91 +Subproject commit ceb428c52e6b4362a35ec18b69206d9bb94edce3 diff --git a/contrib/libhdfs3-cmake/CMakeLists.txt b/contrib/libhdfs3-cmake/CMakeLists.txt index 0921c3c3831a..870ccfa46857 100644 --- a/contrib/libhdfs3-cmake/CMakeLists.txt +++ b/contrib/libhdfs3-cmake/CMakeLists.txt @@ -97,6 +97,7 @@ set(SRCS "${HDFS3_SOURCE_DIR}/client/RawErasureCoderFactory.cpp" "${HDFS3_SOURCE_DIR}/client/RawErasureDecoder.cpp" "${HDFS3_SOURCE_DIR}/client/RawErasureEncoder.cpp" + "${HDFS3_SOURCE_DIR}/client/PositionStripeReader.cpp" "${HDFS3_SOURCE_DIR}/client/StatefulStripeReader.cpp" "${HDFS3_SOURCE_DIR}/client/StripeReader.cpp" "${HDFS3_SOURCE_DIR}/client/StripedBlockUtil.cpp" diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp index 94c5d00f852d..b0310a3f86c8 100644 --- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp @@ -96,9 +96,12 @@ extern const int ARGUMENT_OUT_OF_BOUND; ORCInputStream::ORCInputStream(SeekableReadBuffer & in_, size_t file_size_, bool use_prefetch) - : in(in_), file_size(file_size_), supports_read_at(use_prefetch && in_.supportsReadAt()) + : 
in(in_) + , file_size(file_size_) + , use_offset_based_read(in_.supportsReadAt()) + , use_async_prefetch(use_prefetch && use_offset_based_read) { - if (supports_read_at) + if (use_async_prefetch) async_runner = threadPoolCallbackRunnerUnsafe<void>(getIOThreadPool().get(), "ORCFile"); } @@ -114,13 +117,20 @@ UInt64 ORCInputStream::getNaturalReadSize() const void ORCInputStream::read(void * buf, UInt64 length, UInt64 offset) { - if (supports_read_at) + if (use_offset_based_read) { size_t bytes_read = 0; while (bytes_read < length) { size_t bytes_to_read = length - bytes_read; size_t n = in.readBigAt(reinterpret_cast<char *>(buf) + bytes_read, bytes_to_read, offset + bytes_read, nullptr); + if (n == 0) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "ORC readBigAt returned no bytes at offset {} ({} bytes remaining of {}); input may be truncated or corrupted", + offset + bytes_read, + bytes_to_read, + length); bytes_read += n; } } @@ -134,7 +144,7 @@ void ORCInputStream::read(void * buf, UInt64 length, UInt64 offset) std::future<void> ORCInputStream::readAsync(void * buf, uint64_t length, uint64_t offset) { - if (supports_read_at) + if (use_async_prefetch) { return async_runner( [this, buf, length, offset] diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h index 860bbd28b515..bba818fac082 100644 --- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.h @@ -33,7 +33,10 @@ class ORCInputStream : public orc::InputStream protected: SeekableReadBuffer & in; size_t file_size; - bool supports_read_at; + /// Use offset-based reads (ReadBuffer::readBigAt, e.g. hdfs pread) instead of seek+read; needed for ORC tail on HDFS EC. + bool use_offset_based_read; + /// Async wrapper only when caller enabled prefetch and the buffer supports read-at. + bool use_async_prefetch; ThreadPoolCallbackRunnerUnsafe<void> async_runner; std::string name = "ORCInputStream";