diff --git a/clickhouse/columns/array.cpp b/clickhouse/columns/array.cpp index 1867d778..4e452000 100644 --- a/clickhouse/columns/array.cpp +++ b/clickhouse/columns/array.cpp @@ -52,11 +52,6 @@ ColumnRef ColumnArray::CloneEmpty() const { return std::make_shared(data_->CloneEmpty()); } -void ColumnArray::Reserve(size_t new_cap) { - data_->Reserve(new_cap); - offsets_->Reserve(new_cap); -} - void ColumnArray::Append(ColumnRef column) { if (auto col = column->As()) { for (size_t i = 0; i < col->Size(); ++i) { @@ -65,6 +60,15 @@ void ColumnArray::Append(ColumnRef column) { } } +void ColumnArray::Reserve(size_t new_cap) { + data_->Reserve(new_cap); + offsets_->Reserve(new_cap); +} + +size_t ColumnArray::Capacity() const { + return data_->Capacity(); +} + bool ColumnArray::LoadPrefix(InputStream* input, size_t rows) { if (!rows) { return true; @@ -110,6 +114,10 @@ size_t ColumnArray::Size() const { return offsets_->Size(); } +size_t ColumnArray::MemoryUsage() const { + return offsets_->MemoryUsage() + data_->MemoryUsage(); +} + void ColumnArray::Swap(Column& other) { auto & col = dynamic_cast(other); data_.swap(col.data_); diff --git a/clickhouse/columns/array.h b/clickhouse/columns/array.h index 3ad9c94d..83bacff0 100644 --- a/clickhouse/columns/array.h +++ b/clickhouse/columns/array.h @@ -47,11 +47,11 @@ class ColumnArray : public Column { } public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column prefix from input stream. bool LoadPrefix(InputStream* input, size_t rows) override; @@ -71,6 +71,8 @@ class ColumnArray : public Column { /// Returns count of rows in the column. 
size_t Size() const override; + size_t MemoryUsage() const override; + /// Makes slice of the current column. ColumnRef Slice(size_t, size_t) const override; ColumnRef CloneEmpty() const override; diff --git a/clickhouse/columns/column.h b/clickhouse/columns/column.h index 475df89a..b1fc643d 100644 --- a/clickhouse/columns/column.h +++ b/clickhouse/columns/column.h @@ -54,6 +54,7 @@ class Column : public std::enable_shared_from_this { /// Increase the capacity of the column for large block insertion. virtual void Reserve(size_t new_cap) = 0; + virtual size_t Capacity() const = 0; /// Template method to load column data from input stream. It'll call LoadPrefix and LoadBody. /// Should be called only once from the client. Derived classes should not call it. @@ -90,6 +91,9 @@ class Column : public std::enable_shared_from_this { virtual void Swap(Column&) = 0; + /// Estimated RAM usage by the column in bytes. + virtual size_t MemoryUsage() const = 0; + /// Get a view on raw item data if it is supported by column, will throw an exception if index is out of range. /// Please note that view is invalidated once column items are added or deleted, column is loaded from strean or destroyed. 
virtual ItemView GetItem(size_t) const { diff --git a/clickhouse/columns/date.cpp b/clickhouse/columns/date.cpp index 132c6fc7..5be752a5 100644 --- a/clickhouse/columns/date.cpp +++ b/clickhouse/columns/date.cpp @@ -37,16 +37,16 @@ uint16_t ColumnDate::RawAt(size_t n) const { return data_->At(n); } +std::vector& ColumnDate::GetWritableData() { + return data_->GetWritableData(); +} + void ColumnDate::Append(ColumnRef column) { if (auto col = column->As()) { data_->Append(col->data_); } } -std::vector& ColumnDate::GetWritableData() { - return data_->GetWritableData(); -} - void ColumnDate::Reserve(size_t new_cap) { data_->Reserve(new_cap); } @@ -67,6 +67,10 @@ size_t ColumnDate::Size() const { return data_->Size(); } +size_t ColumnDate::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnDate::Slice(size_t begin, size_t len) const { auto col = data_->Slice(begin, len)->As(); auto result = std::make_shared(); @@ -154,6 +158,10 @@ size_t ColumnDate32::Size() const { return data_->Size(); } +size_t ColumnDate32::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnDate32::Slice(size_t begin, size_t len) const { auto col = data_->Slice(begin, len)->As(); auto result = std::make_shared(); @@ -244,6 +252,10 @@ size_t ColumnDateTime::Size() const { return data_->Size(); } +size_t ColumnDateTime::MemoryUsage() const { + return data_->MemoryUsage(); +} + void ColumnDateTime::Clear() { data_->Clear(); } @@ -303,11 +315,14 @@ std::string ColumnDateTime64::Timezone() const { return type_->As()->Timezone(); } -void ColumnDateTime64::Reserve(size_t new_cap) -{ +void ColumnDateTime64::Reserve(size_t new_cap) { data_->Reserve(new_cap); } +size_t ColumnDateTime64::Capacity() const { + return data_->Capacity(); +} + void ColumnDateTime64::Append(ColumnRef column) { if (auto col = column->As()) { data_->Append(col->data_); @@ -330,6 +345,10 @@ size_t ColumnDateTime64::Size() const { return data_->Size(); } +size_t 
ColumnDateTime64::MemoryUsage() const { + return data_->MemoryUsage(); +} + ItemView ColumnDateTime64::GetItem(size_t index) const { return ItemView(Type::DateTime64, data_->GetItem(index)); } diff --git a/clickhouse/columns/date.h b/clickhouse/columns/date.h index bf501723..a2c5fcc2 100644 --- a/clickhouse/columns/date.h +++ b/clickhouse/columns/date.h @@ -27,18 +27,18 @@ class ColumnDate : public Column { /// Do append data as is -- number of day in Unix epoch, no conversions performed. void AppendRaw(uint16_t value); uint16_t RawAt(size_t n) const; + /// Get Raw Vector Contents + std::vector& GetWritableData(); +public: /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; - /// Get Raw Vector Contents - std::vector& GetWritableData(); - /// Increase the capacity of the column for large block insertion. void Reserve(size_t new_cap) override; /// Returns the capacity of the column - size_t Capacity() const; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -51,6 +51,7 @@ class ColumnDate : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; @@ -88,15 +89,13 @@ class ColumnDate32 : public Column { /// Get Raw Vector Contents std::vector& GetWritableData(); - /// Returns the capacity of the column - size_t Capacity() const; - public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. 
+ void Reserve(size_t new_cap) override; + /// Returns the capacity of the column + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -109,6 +108,7 @@ class ColumnDate32 : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; @@ -149,15 +149,13 @@ class ColumnDateTime : public Column { /// Get Raw Vector Contents std::vector& GetWritableData(); - /// Returns the capacity of the column - size_t Capacity() const; - public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + /// Returns the capacity of the column + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -170,6 +168,7 @@ class ColumnDateTime : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; @@ -206,11 +205,11 @@ class ColumnDateTime64 : public Column { std::string Timezone() const; public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column data from input stream. 
bool LoadBody(InputStream* input, size_t rows) override; @@ -223,6 +222,7 @@ class ColumnDateTime64 : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/decimal.cpp b/clickhouse/columns/decimal.cpp index 2d214ecf..03c2765b 100644 --- a/clickhouse/columns/decimal.cpp +++ b/clickhouse/columns/decimal.cpp @@ -191,13 +191,26 @@ Int128 ColumnDecimal::At(size_t i) const { } } +void ColumnDecimal::Append(ColumnRef column) { + if (auto col = column->As()) { + data_->Append(col->data_); + } +} + void ColumnDecimal::Reserve(size_t new_cap) { data_->Reserve(new_cap); } -void ColumnDecimal::Append(ColumnRef column) { - if (auto col = column->As()) { - data_->Append(col->data_); +size_t ColumnDecimal::Capacity() const { + switch (data_->Type()->GetCode()) { + case Type::Int32: + return data_->As()->Capacity(); + case Type::Int64: + return data_->As()->Capacity(); + case Type::Int128: + return data_->As()->Capacity(); + default: + throw ValidationError("Invalid data_ column type in ColumnDecimal"); } } @@ -217,6 +230,10 @@ size_t ColumnDecimal::Size() const { return data_->Size(); } +size_t ColumnDecimal::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnDecimal::Slice(size_t begin, size_t len) const { // coundn't use std::make_shared since this c-tor is private return ColumnRef{new ColumnDecimal(type_, data_->Slice(begin, len))}; diff --git a/clickhouse/columns/decimal.h b/clickhouse/columns/decimal.h index aa499a12..fea87f68 100644 --- a/clickhouse/columns/decimal.h +++ b/clickhouse/columns/decimal.h @@ -22,12 +22,14 @@ class ColumnDecimal : public Column { public: /// Increase the capacity of the column for large block insertion. 
- void Reserve(size_t new_cap) override; void Append(ColumnRef column) override; + void Reserve(size_t new_cap) override; + size_t Capacity() const override; bool LoadBody(InputStream* input, size_t rows) override; void SaveBody(OutputStream* output) override; void Clear() override; size_t Size() const override; + size_t MemoryUsage() const override; ColumnRef Slice(size_t begin, size_t len) const override; ColumnRef CloneEmpty() const override; void Swap(Column& other) override; diff --git a/clickhouse/columns/enum.cpp b/clickhouse/columns/enum.cpp index 43fab893..3f99762d 100644 --- a/clickhouse/columns/enum.cpp +++ b/clickhouse/columns/enum.cpp @@ -68,10 +68,6 @@ void ColumnEnum::SetNameAt(size_t n, const std::string& name) { data_.at(n) = static_cast(type_->As()->GetEnumValue(name)); } -template -void ColumnEnum::Reserve(size_t new_cap) { - data_.reserve(new_cap); -} template void ColumnEnum::Append(ColumnRef column) { @@ -80,6 +76,17 @@ void ColumnEnum::Append(ColumnRef column) { } } +template +void ColumnEnum::Reserve(size_t new_cap) { + data_.reserve(new_cap); +} + +template +size_t ColumnEnum::Capacity() const { + return data_.capacity(); +} + + template bool ColumnEnum::LoadBody(InputStream* input, size_t rows) { data_.resize(rows); @@ -96,6 +103,11 @@ size_t ColumnEnum::Size() const { return data_.size(); } +template +size_t ColumnEnum::MemoryUsage() const { + return data_.capacity() * sizeof(*data_.begin()); +} + template ColumnRef ColumnEnum::Slice(size_t begin, size_t len) const { return std::make_shared>(type_, SliceVector(data_, begin, len)); diff --git a/clickhouse/columns/enum.h b/clickhouse/columns/enum.h index 43900f6c..f251ae47 100644 --- a/clickhouse/columns/enum.h +++ b/clickhouse/columns/enum.h @@ -30,11 +30,11 @@ class ColumnEnum : public Column { void SetNameAt(size_t n, const std::string& name); public: - /// Increase the capacity of the column for large block insertion. 
- void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -47,6 +47,7 @@ class ColumnEnum : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/geo.cpp b/clickhouse/columns/geo.cpp index fa987732..bb2eb9c2 100644 --- a/clickhouse/columns/geo.cpp +++ b/clickhouse/columns/geo.cpp @@ -54,11 +54,6 @@ const typename ColumnGeo::ValueType ColumnGeoAt(n); } -template -void ColumnGeo::Reserve(size_t new_cap) { - data_->Reserve(new_cap); -} - template void ColumnGeo::Append(ColumnRef column) { if (auto col = column->template As()) { @@ -66,6 +61,15 @@ void ColumnGeo::Append(ColumnRef column) { } } +template +void ColumnGeo::Reserve(size_t new_cap) { + data_->Reserve(new_cap); +} +template +size_t ColumnGeo::Capacity() const { + return data_->Capacity(); +} + template bool ColumnGeo::LoadBody(InputStream* input, size_t rows) { return data_->LoadBody(input, rows); @@ -76,11 +80,17 @@ void ColumnGeo::SaveBody(OutputStream* output) { data_->SaveBody(output); } + template size_t ColumnGeo::Size() const { return data_->Size(); } +template +size_t ColumnGeo::MemoryUsage() const { + return data_->MemoryUsage(); +} + template ColumnRef ColumnGeo::Slice(size_t begin, size_t len) const { return std::make_shared(data_->Slice(begin, len)); diff --git a/clickhouse/columns/geo.h b/clickhouse/columns/geo.h index 1b129739..40748e7e 100644 --- a/clickhouse/columns/geo.h +++ b/clickhouse/columns/geo.h @@ -29,11 +29,11 @@ class ColumnGeo : public Column { inline 
const ValueType operator[](size_t n) const { return At(n); } public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -46,6 +46,7 @@ class ColumnGeo : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/ip4.cpp b/clickhouse/columns/ip4.cpp index 8790afb6..867d4cfb 100644 --- a/clickhouse/columns/ip4.cpp +++ b/clickhouse/columns/ip4.cpp @@ -74,16 +74,20 @@ std::string ColumnIPv4::AsString(size_t n) const { return ip_str; } -void ColumnIPv4::Reserve(size_t new_cap) { - data_->Reserve(new_cap); -} - void ColumnIPv4::Append(ColumnRef column) { if (auto col = column->As()) { data_->Append(col->data_); } } +void ColumnIPv4::Reserve(size_t new_cap) { + data_->Reserve(new_cap); +} + +size_t ColumnIPv4::Capacity() const { + return data_->Capacity(); +} + bool ColumnIPv4::LoadBody(InputStream * input, size_t rows) { return data_->LoadBody(input, rows); } @@ -96,6 +100,10 @@ size_t ColumnIPv4::Size() const { return data_->Size(); } +size_t ColumnIPv4::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnIPv4::Slice(size_t begin, size_t len) const { return std::make_shared(data_->Slice(begin, len)); } diff --git a/clickhouse/columns/ip4.h b/clickhouse/columns/ip4.h index 2253e305..b13fd486 100644 --- a/clickhouse/columns/ip4.h +++ b/clickhouse/columns/ip4.h @@ -39,11 +39,11 @@ class ColumnIPv4 : public Column { std::string AsString(size_t n) const; 
public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -56,6 +56,7 @@ class ColumnIPv4 : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/ip6.cpp b/clickhouse/columns/ip6.cpp index 0d47b5e8..0199eed9 100644 --- a/clickhouse/columns/ip6.cpp +++ b/clickhouse/columns/ip6.cpp @@ -65,16 +65,20 @@ in6_addr ColumnIPv6::operator [] (size_t n) const { return *reinterpret_cast(data_->At(n).data()); } -void ColumnIPv6::Reserve(size_t new_cap) { - data_->Reserve(new_cap); -} - void ColumnIPv6::Append(ColumnRef column) { if (auto col = column->As()) { data_->Append(col->data_); } } +void ColumnIPv6::Reserve(size_t new_cap) { + data_->Reserve(new_cap); +} + +size_t ColumnIPv6::Capacity() const { + return data_->Capacity(); +} + bool ColumnIPv6::LoadBody(InputStream* input, size_t rows) { return data_->LoadBody(input, rows); } @@ -87,6 +91,10 @@ size_t ColumnIPv6::Size() const { return data_->Size(); } +size_t ColumnIPv6::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnIPv6::Slice(size_t begin, size_t len) const { return std::make_shared(data_->Slice(begin, len)); } diff --git a/clickhouse/columns/ip6.h b/clickhouse/columns/ip6.h index 41af0d58..dcb969b4 100644 --- a/clickhouse/columns/ip6.h +++ b/clickhouse/columns/ip6.h @@ -35,11 +35,11 @@ class ColumnIPv6 : public Column { std::string AsString(size_t n) const; public: - /// Increase the 
capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -52,6 +52,7 @@ class ColumnIPv6 : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/lowcardinality.cpp b/clickhouse/columns/lowcardinality.cpp index 19369d33..3bde75e1 100644 --- a/clickhouse/columns/lowcardinality.cpp +++ b/clickhouse/columns/lowcardinality.cpp @@ -174,9 +174,45 @@ ColumnLowCardinality::ColumnLowCardinality(std::shared_ptr dicti ColumnLowCardinality::~ColumnLowCardinality() {} +namespace +{ + size_t EstimateDictionaryCapacity(size_t new_cap) + { + // Estimate capacity of the LC dictionary column. + // For small columns we assume there are higher relative number of unique items + // hence the capacity of the dictionary column must be the same as capacity of index_column. + // For large columns we assume that there are at least 80% of duplicates, + // hence the capacity of the dictionary column is 0.20 of the index_column. + // Medium-sized columns have dictionary capacity somewhere in-between. 
+ + const float max_ratio = 1.0; + const float min_ratio = 0.20; + const float min_ratio_at = 512; + const float max_ratio_at = 128; + + if (new_cap < max_ratio_at) + return new_cap; + + if (new_cap >= min_ratio_at) + return new_cap * min_ratio; + + // Ratio of the dict capacity to the index column capacity, + // linearly falls down from `max_ratio` at `max_ratio_at` down to `min_ratio` at min_ratio_at; + const float ratio = max_ratio + (max_ratio_at - static_cast(new_cap)) * (max_ratio - min_ratio) / (min_ratio_at - max_ratio_at); + return new_cap * ratio; + } +} + void ColumnLowCardinality::Reserve(size_t new_cap) { - dictionary_column_->Reserve(new_cap); index_column_->Reserve(new_cap); + + dictionary_column_->Reserve(EstimateDictionaryCapacity(new_cap)); +} + +size_t ColumnLowCardinality::Capacity() const { + return VisitIndexColumn([](auto & index_column) { + return index_column.Capacity(); + }, *index_column_); } void ColumnLowCardinality::Setup(ColumnRef dictionary_column) { @@ -379,6 +415,13 @@ size_t ColumnLowCardinality::Size() const { return index_column_->Size(); } +size_t ColumnLowCardinality::MemoryUsage() const { + return unique_items_map_.bucket_count() * unique_items_map_.max_load_factor() + * (sizeof(unique_items_map_.begin()->first) + sizeof(unique_items_map_.begin()->second)) + + index_column_->MemoryUsage() + + dictionary_column_->MemoryUsage(); +} + ColumnRef ColumnLowCardinality::Slice(size_t begin, size_t len) const { begin = std::min(begin, Size()); len = std::min(len, Size() - begin); @@ -451,15 +494,13 @@ void ColumnLowCardinality::AppendUnsafe(const ItemView & value) { } } -void ColumnLowCardinality::AppendNullItem() -{ +void ColumnLowCardinality::AppendNullItem() { const auto null_item = GetNullItemForDictionary(dictionary_column_); AppendToDictionary(*dictionary_column_, null_item); unique_items_map_.emplace(computeHashKey(null_item), 0); } -void ColumnLowCardinality::AppendDefaultItem() -{ +void 
ColumnLowCardinality::AppendDefaultItem() { const auto defaultItem = GetDefaultItemForDictionary(dictionary_column_); unique_items_map_.emplace(computeHashKey(defaultItem), dictionary_column_->Size()); AppendToDictionary(*dictionary_column_, defaultItem); diff --git a/clickhouse/columns/lowcardinality.h b/clickhouse/columns/lowcardinality.h index 17e3ce99..09c6c1c0 100644 --- a/clickhouse/columns/lowcardinality.h +++ b/clickhouse/columns/lowcardinality.h @@ -65,11 +65,13 @@ class ColumnLowCardinality : public Column { ~ColumnLowCardinality(); + /// Appends another LowCardinality column to the end of this one, updating dictionary. + void Append(ColumnRef /*column*/) override; + /// Increase the capacity of the column for large block insertion. void Reserve(size_t new_cap) override; - /// Appends another LowCardinality column to the end of this one, updating dictionary. - void Append(ColumnRef /*column*/) override; + size_t Capacity() const override; bool LoadPrefix(InputStream* input, size_t rows) override; @@ -87,6 +89,7 @@ class ColumnLowCardinality : public Column { /// Returns count of rows in the column. 
size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of current column, with compacted dictionary ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/map.cpp b/clickhouse/columns/map.cpp index 839b0668..dfefbed0 100644 --- a/clickhouse/columns/map.cpp +++ b/clickhouse/columns/map.cpp @@ -33,18 +33,22 @@ ColumnMap::ColumnMap(ColumnRef data) : Column(GetMapType(data->GetType())), data_(data->As()) { } +void ColumnMap::Append(ColumnRef column) { + if (auto col = column->As()) { + data_->Append(col->data_); + } +} + void ColumnMap::Reserve(size_t new_cap) { data_->Reserve(new_cap); } -void ColumnMap::Clear() { - data_->Clear(); +size_t ColumnMap::Capacity() const { + return data_->Capacity(); } -void ColumnMap::Append(ColumnRef column) { - if (auto col = column->As()) { - data_->Append(col->data_); - } +void ColumnMap::Clear() { + data_->Clear(); } bool ColumnMap::LoadPrefix(InputStream* input, size_t rows) { @@ -67,6 +71,10 @@ size_t ColumnMap::Size() const { return data_->Size(); } +size_t ColumnMap::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnMap::Slice(size_t begin, size_t len) const { return std::make_shared(data_->Slice(begin, len)); } diff --git a/clickhouse/columns/map.h b/clickhouse/columns/map.h index 4d644802..4117db72 100644 --- a/clickhouse/columns/map.h +++ b/clickhouse/columns/map.h @@ -25,11 +25,11 @@ class ColumnMap : public Column { */ explicit ColumnMap(ColumnRef data); - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column prefix from input stream. 
bool LoadPrefix(InputStream* input, size_t rows) override; @@ -48,6 +48,7 @@ class ColumnMap : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t, size_t) const override; diff --git a/clickhouse/columns/nothing.h b/clickhouse/columns/nothing.h index 8e1a4e30..bc7b9047 100644 --- a/clickhouse/columns/nothing.h +++ b/clickhouse/columns/nothing.h @@ -26,11 +26,11 @@ class ColumnNothing : public Column { { } - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t) override {}; - /// Appends one element to the column. void Append(std::unique_ptr) { ++size_; } + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t) override {}; + size_t Capacity() const override {return 0;} /// Returns element at given row number. std::nullptr_t At(size_t) const { return nullptr; }; @@ -75,6 +75,8 @@ class ColumnNothing : public Column { /// Returns count of rows in the column. 
size_t Size() const override { return size_; } + size_t MemoryUsage() const override { return 0; } + void Swap(Column& other) override { auto & col = dynamic_cast(other); std::swap(size_, col.size_); diff --git a/clickhouse/columns/nullable.cpp b/clickhouse/columns/nullable.cpp index 23940c12..23e66276 100644 --- a/clickhouse/columns/nullable.cpp +++ b/clickhouse/columns/nullable.cpp @@ -34,11 +34,6 @@ ColumnRef ColumnNullable::Nulls() const return nulls_; } -void ColumnNullable::Reserve(size_t new_cap) { - nested_->Reserve(new_cap); - nulls_->Reserve(new_cap); -} - void ColumnNullable::Append(ColumnRef column) { if (auto col = column->As()) { if (!col->nested_->Type()->IsEqual(nested_->Type())) { @@ -50,6 +45,15 @@ void ColumnNullable::Append(ColumnRef column) { } } +void ColumnNullable::Reserve(size_t new_cap) { + nested_->Reserve(new_cap); + nulls_->Reserve(new_cap); +} + +size_t ColumnNullable::Capacity() const { + return nested_->Capacity(); +} + void ColumnNullable::Clear() { nested_->Clear(); nulls_->Clear(); @@ -82,6 +86,10 @@ size_t ColumnNullable::Size() const { return nulls_->Size(); } +size_t ColumnNullable::MemoryUsage() const { + return nested_->MemoryUsage() + nulls_->MemoryUsage(); +} + ColumnRef ColumnNullable::Slice(size_t begin, size_t len) const { return std::make_shared(nested_->Slice(begin, len), nulls_->Slice(begin, len)); } diff --git a/clickhouse/columns/nullable.h b/clickhouse/columns/nullable.h index 1946e8b9..85624b81 100644 --- a/clickhouse/columns/nullable.h +++ b/clickhouse/columns/nullable.h @@ -27,11 +27,11 @@ class ColumnNullable : public Column { ColumnRef Nulls() const; public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. 
+ void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column prefix from input stream. bool LoadPrefix(InputStream* input, size_t rows) override; @@ -50,6 +50,7 @@ class ColumnNullable : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/clickhouse/columns/numeric.cpp b/clickhouse/columns/numeric.cpp index 1c611c79..71da9c9d 100644 --- a/clickhouse/columns/numeric.cpp +++ b/clickhouse/columns/numeric.cpp @@ -43,6 +43,13 @@ std::vector& ColumnVector::GetWritableData() { return data_; } +template +void ColumnVector::Append(ColumnRef column) { + if (auto col = column->As>()) { + data_.insert(data_.end(), col->data_.begin(), col->data_.end()); + } +} + template void ColumnVector::Reserve(size_t new_cap) { data_.reserve(new_cap); @@ -63,13 +70,6 @@ const T& ColumnVector::At(size_t n) const { return data_.at(n); } -template -void ColumnVector::Append(ColumnRef column) { - if (auto col = column->As>()) { - data_.insert(data_.end(), col->data_.begin(), col->data_.end()); - } -} - template bool ColumnVector::LoadBody(InputStream* input, size_t rows) { data_.resize(rows); @@ -87,6 +87,11 @@ size_t ColumnVector::Size() const { return data_.size(); } +template +size_t ColumnVector::MemoryUsage() const { + return data_.capacity() * sizeof(data_[0]); +} + template ColumnRef ColumnVector::Slice(size_t begin, size_t len) const { return std::make_shared>(SliceVector(data_, begin, len)); diff --git a/clickhouse/columns/numeric.h b/clickhouse/columns/numeric.h index e2a7675e..58f7daeb 100644 --- a/clickhouse/columns/numeric.h +++ b/clickhouse/columns/numeric.h @@ -19,9 +19,6 @@ class ColumnVector : public Column { explicit ColumnVector(const std::vector& data); explicit ColumnVector(std::vector && data); - /// Increase the capacity of the column for large block 
insertion. - void Reserve(size_t new_cap) override; - /// Appends one element to the end of column. void Append(const T& value); @@ -36,12 +33,13 @@ class ColumnVector : public Column { /// Get Raw Vector Contents std::vector& GetWritableData(); - /// Returns the capacity of the column - size_t Capacity() const; - public: /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + /// Returns the capacity of the column + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -57,6 +55,7 @@ class ColumnVector : public Column { /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; + size_t MemoryUsage() const override; ColumnRef CloneEmpty() const override; void Swap(Column& other) override; diff --git a/clickhouse/columns/string.cpp b/clickhouse/columns/string.cpp index dff45bac..8ea60bfc 100644 --- a/clickhouse/columns/string.cpp +++ b/clickhouse/columns/string.cpp @@ -1,4 +1,6 @@ #include "string.h" +#include +#include "clickhouse/exceptions.h" #include "utils.h" #include "../base/wire_format.h" @@ -20,6 +22,40 @@ size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len return result; } +// based on https://stackoverflow.com/a/9194117 +size_t RoundUp(size_t numToRound, size_t multiple) { + return ((numToRound + (multiple - 1)) / multiple) * multiple; +} + +size_t ComputeValueSizeEstimation(size_t total_size, size_t number_of_items) { + number_of_items = number_of_items ? 
number_of_items : 1; // just to avoid divide by zero + size_t estimation = std::ceil(static_cast(total_size) / number_of_items); + + return estimation; // average value size in bytes, rounded up (previously returned the bool `estimation == 0`) +} + +size_t EstimateBlockSize(size_t value_size_estimation) { + size_t estimated_number_of_items_per_block = 32; // just arbitrary value + + // do not pre-allocate too big blocks when expected values are big to minimize waste or when user explicitly requested not to + if (value_size_estimation > DEFAULT_BLOCK_SIZE || value_size_estimation == static_cast(clickhouse::ColumnString::NO_PREALLOCATE)) { + // for really big items do not pre-allocate blocks, and allowing later code to put 1 item per block + return 0; + } else if (value_size_estimation > static_cast(clickhouse::ColumnString::EstimatedValueSize::MEDIUM)) { + // for not so big items, create blocks that fit smaller number of items, reducing produced block size. + estimated_number_of_items_per_block = ceil(DEFAULT_BLOCK_SIZE / static_cast(value_size_estimation)); + } + + return std::max(DEFAULT_BLOCK_SIZE, RoundUp(value_size_estimation * estimated_number_of_items_per_block, DEFAULT_BLOCK_SIZE)); +} + +inline auto Validate(clickhouse::ColumnString::EstimatedValueSize value_size_estimation) { + if (static_cast(value_size_estimation) < 0) + throw clickhouse::ValidationError("ColumnString received negative number as value size estimation"); + + return value_size_estimation; +} + } namespace clickhouse { @@ -34,6 +70,11 @@ void ColumnFixedString::Reserve(size_t new_cap) { data_.reserve(string_size_ * new_cap); } +size_t ColumnFixedString::Capacity() const { + const auto data_cap = data_.capacity(); // bytes allocated; Reserve() allocates string_size_ * new_cap + return string_size_ ? data_cap / string_size_ : 0; // total number of strings the buffer can hold, consistent with other columns' Capacity() +} + void ColumnFixedString::Append(std::string_view str) { if (str.size() > string_size_) { throw ValidationError("Expected string of length not greater than " @@ -105,6 +146,10 @@ ColumnRef ColumnFixedString::Slice(size_t begin, size_t len) const { return result; } +size_t ColumnFixedString::MemoryUsage() const { + 
return data_.capacity(); +} + ColumnRef ColumnFixedString::CloneEmpty() const { return std::make_shared(string_size_); } @@ -157,28 +202,32 @@ struct ColumnString::Block std::unique_ptr data_; }; -ColumnString::ColumnString() +ColumnString::ColumnString(EstimatedValueSize value_size_estimation) : Column(Type::CreateString()) + , value_size_estimation_(static_cast(Validate(value_size_estimation))) + , next_block_size_(DEFAULT_BLOCK_SIZE) { } -ColumnString::ColumnString(size_t element_count) - : Column(Type::CreateString()) +ColumnString::ColumnString(size_t element_count, EstimatedValueSize value_size_estimation) + : ColumnString(value_size_estimation) { items_.reserve(element_count); - // 16 is arbitrary number, assumption that string values are about ~256 bytes long. - blocks_.reserve(std::max(1, element_count / 16)); + blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, RoundUp(element_count * value_size_estimation_, DEFAULT_BLOCK_SIZE))); } ColumnString::ColumnString(const std::vector& data) : ColumnString() { + const auto total_size = ComputeTotalSize(data); items_.reserve(data.size()); - blocks_.emplace_back(ComputeTotalSize(data)); + blocks_.emplace_back(total_size); for (const auto & s : data) { AppendUnsafe(s); } + + value_size_estimation_ = ComputeValueSizeEstimation(total_size, data.size()); } ColumnString::ColumnString(std::vector&& data) @@ -191,6 +240,8 @@ ColumnString::ColumnString(std::vector&& data) auto& last_data = append_data_.back(); items_.emplace_back(std::string_view{ last_data.data(),last_data.length() }); } + + value_size_estimation_ = ComputeValueSizeEstimation(ComputeTotalSize(items_), items_.size()); } ColumnString::~ColumnString() @@ -198,16 +249,29 @@ ColumnString::~ColumnString() void ColumnString::Reserve(size_t new_cap) { items_.reserve(new_cap); - // 16 is arbitrary number, assumption that string values are about ~256 bytes long. 
- blocks_.reserve(std::max(1, new_cap / 16)); + + if (blocks_.empty() || blocks_.back().GetAvailable() < value_size_estimation_) { + if (value_size_estimation_ != static_cast(NO_PREALLOCATE)) + blocks_.emplace_back(new_cap * value_size_estimation_); + } else { + // Estimate space required for items that woudn't fit into current Block. + const size_t estimated_items_in_next_block = value_size_estimation_ ? new_cap - blocks_.back().GetAvailable() / value_size_estimation_ : new_cap; + next_block_size_ = std::max(DEFAULT_BLOCK_SIZE, estimated_items_in_next_block * value_size_estimation_); + } +} + +size_t ColumnString::Capacity() const { + return items_.capacity(); +} + +void ColumnString::SetEstimatedValueSize(EstimatedValueSize value_size_estimation) { + value_size_estimation_ = static_cast(Validate(value_size_estimation)); } void ColumnString::Append(std::string_view str) { - if (blocks_.size() == 0 || blocks_.back().GetAvailable() < str.length()) { - blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size())); - } + auto & block = PrepareBlockWithSpaceForAtLeast(str.length()); - items_.emplace_back(blocks_.back().AppendUnsafe(str)); + items_.emplace_back(block.AppendUnsafe(str)); } void ColumnString::Append(const char* str) { @@ -228,6 +292,18 @@ void ColumnString::AppendUnsafe(std::string_view str) { items_.emplace_back(blocks_.back().AppendUnsafe(str)); } +ColumnString::Block & ColumnString::PrepareBlockWithSpaceForAtLeast(size_t minimum_required_bytes) { + if (blocks_.empty() || blocks_.back().GetAvailable() < minimum_required_bytes) { + if (next_block_size_ == 0) + next_block_size_ = DEFAULT_BLOCK_SIZE; + + blocks_.emplace_back(std::max(next_block_size_, minimum_required_bytes)); + next_block_size_ = EstimateBlockSize(value_size_estimation_); + } + + return blocks_.back(); +} + void ColumnString::Clear() { items_.clear(); blocks_.clear(); @@ -243,8 +319,7 @@ void ColumnString::Append(ColumnRef column) { const auto total_size = ComputeTotalSize(col->items_); 
// TODO: fill up existing block with some items and then add a new one for the rest of items - if (blocks_.size() == 0 || blocks_.back().GetAvailable() < total_size) - blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, total_size)); + PrepareBlockWithSpaceForAtLeast(total_size); // Intentionally not doing items_.reserve() since that cripples performance. for (size_t i = 0; i < column->Size(); ++i) { @@ -267,6 +342,8 @@ bool ColumnString::LoadBody(InputStream* input, size_t rows) { new_items.reserve(rows); // Suboptimzal if the first row string is >DEFAULT_BLOCK_SIZE, but that must be a very rare case. + // Not using next_block_size_ here since it set in Reserve() which doesn't know + // about number of items and estimated item size in InputStream. Block * block = &new_blocks.emplace_back(DEFAULT_BLOCK_SIZE); for (size_t i = 0; i < rows; ++i) { @@ -299,24 +376,42 @@ size_t ColumnString::Size() const { return items_.size(); } +size_t ColumnString::MemoryUsage() const { + auto vector_used_bytes = [](const auto & v) { + return sizeof(v[0]) * v.capacity(); + }; + + size_t result = ComputeTotalSize(append_data_) + sizeof(append_data_[0]) * append_data_.size(); + result += vector_used_bytes(items_); + result += vector_used_bytes(blocks_); + + for (const auto & b : blocks_) + result += b.capacity; + + return result; +} + ColumnRef ColumnString::Slice(size_t begin, size_t len) const { - auto result = std::make_shared(); + if (begin >= items_.size()) { + return this->CloneEmpty(); + } - if (begin < items_.size()) { - len = std::min(len, items_.size() - begin); - result->items_.reserve(len); + len = std::min(len, items_.size() - begin); - result->blocks_.emplace_back(ComputeTotalSize(items_, begin, len)); - for (size_t i = begin; i < begin + len; ++i) { - result->Append(items_[i]); - } + auto result = std::make_shared(EstimatedValueSize(value_size_estimation_)); + + result->items_.reserve(len); + result->PrepareBlockWithSpaceForAtLeast(ComputeTotalSize(items_, begin, len)); + 
+ for (size_t i = begin; i < begin + len; ++i) { + result->AppendUnsafe(items_[i]); } return result; } ColumnRef ColumnString::CloneEmpty() const { - return std::make_shared(); + return std::make_shared(EstimatedValueSize(value_size_estimation_)); } void ColumnString::Swap(Column& other) { diff --git a/clickhouse/columns/string.h b/clickhouse/columns/string.h index d6006556..839f5b9b 100644 --- a/clickhouse/columns/string.h +++ b/clickhouse/columns/string.h @@ -27,9 +27,6 @@ class ColumnFixedString : public Column { Append(v); } - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t) override; - /// Appends one element to the column. void Append(std::string_view str); @@ -45,6 +42,9 @@ class ColumnFixedString : public Column { public: /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t) override; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -60,6 +60,8 @@ class ColumnFixedString : public Column { /// Makes slice of the current column. ColumnRef Slice(size_t begin, size_t len) const override; + size_t MemoryUsage() const override; + ColumnRef CloneEmpty() const override; void Swap(Column& other) override; @@ -78,17 +80,36 @@ class ColumnString : public Column { // Type this column takes as argument of Append and returns with At() and operator[] using ValueType = std::string_view; - ColumnString(); - ~ColumnString(); - - explicit ColumnString(size_t element_count); + // Estimation on average size of the value in column, + // helps to reduce used memory and number of re-allocation. + // Choosing a bad estimation woudn't crash the program, + // but may cause more frequent smaller memory allocations, + // reducing overall performance. 
+ // int32_t to be able to validate againts (unintentional) negative values in ColumnString c-tor. + // Otherwise those just silently underflow unsigned type, + // resulting in attempt to allocate enormous amount of memory at run time. + enum class EstimatedValueSize : int32_t { + TINY = 8, + SMALL = 32, + MEDIUM = 128, + LARGE = 512, + }; + + // Memory for item storage is not pre-allocated on Reserve(), same as old behaviour. + static constexpr auto NO_PREALLOCATE = EstimatedValueSize(0); + + explicit ColumnString(EstimatedValueSize value_size_estimation = NO_PREALLOCATE); + explicit ColumnString(size_t element_count, EstimatedValueSize value_size_estimation = NO_PREALLOCATE); explicit ColumnString(const std::vector & data); explicit ColumnString(std::vector&& data); + + ~ColumnString(); + ColumnString& operator=(const ColumnString&) = delete; ColumnString(const ColumnString&) = delete; - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; + /// Change how memory is allocated for future Reserve() or Append() calls. Doesn't affect items that are already added to the column. + void SetEstimatedValueSize(EstimatedValueSize value_size_estimation); /// Appends one element to the column. void Append(std::string_view str); @@ -113,6 +134,12 @@ class ColumnString : public Column { /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + + /// Returns the capacity of the column + size_t Capacity() const override; + /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -125,6 +152,8 @@ class ColumnString : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; + /// Makes slice of the current column. 
ColumnRef Slice(size_t begin, size_t len) const override; ColumnRef CloneEmpty() const override; @@ -132,14 +161,19 @@ class ColumnString : public Column { ItemView GetItem(size_t) const override; private: + struct Block; + void AppendUnsafe(std::string_view); + Block & PrepareBlockWithSpaceForAtLeast(size_t minimum_required_bytes); private: - struct Block; std::vector items_; std::vector blocks_; std::deque append_data_; + + uint32_t value_size_estimation_ = 0; + size_t next_block_size_ = 0; }; } diff --git a/clickhouse/columns/tuple.cpp b/clickhouse/columns/tuple.cpp index 56858590..3174449b 100644 --- a/clickhouse/columns/tuple.cpp +++ b/clickhouse/columns/tuple.cpp @@ -20,12 +20,6 @@ size_t ColumnTuple::TupleSize() const { return columns_.size(); } -void ColumnTuple::Reserve(size_t new_cap) { - for (auto& column : columns_) { - column->Reserve(new_cap); - } -} - void ColumnTuple::Append(ColumnRef column) { if (!this->Type()->IsEqual(column->Type())) { throw ValidationError( @@ -37,10 +31,29 @@ void ColumnTuple::Append(ColumnRef column) { columns_[ci]->Append((*source_tuple_column)[ci]); } } + +void ColumnTuple::Reserve(size_t new_cap) { + for (auto& column : columns_) { + column->Reserve(new_cap); + } +} + +size_t ColumnTuple::Capacity() const { + return columns_.size() ? columns_[0]->Capacity() : 0; +} + size_t ColumnTuple::Size() const { return columns_.empty() ? 
0 : columns_[0]->Size(); } +size_t ColumnTuple::MemoryUsage() const { + size_t result = sizeof(columns_[0]) * columns_.capacity(); + for (const auto & c : columns_) + result += c->MemoryUsage(); + + return result; +} + ColumnRef ColumnTuple::Slice(size_t begin, size_t len) const { std::vector sliced_columns; sliced_columns.reserve(columns_.size()); diff --git a/clickhouse/columns/tuple.h b/clickhouse/columns/tuple.h index ebc1b895..307d9a42 100644 --- a/clickhouse/columns/tuple.h +++ b/clickhouse/columns/tuple.h @@ -26,11 +26,11 @@ class ColumnTuple : public Column { } public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column prefix from input stream. bool LoadPrefix(InputStream* input, size_t rows) override; @@ -50,6 +50,8 @@ class ColumnTuple : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; + /// Makes slice of the current column. 
ColumnRef Slice(size_t, size_t) const override; ColumnRef CloneEmpty() const override; diff --git a/clickhouse/columns/uuid.cpp b/clickhouse/columns/uuid.cpp index fbaff97d..6dd2af69 100644 --- a/clickhouse/columns/uuid.cpp +++ b/clickhouse/columns/uuid.cpp @@ -38,6 +38,10 @@ void ColumnUUID::Reserve(size_t new_cap) { data_->Reserve(new_cap); } +size_t ColumnUUID::Capacity() const { + return data_->Capacity(); +} + void ColumnUUID::Append(ColumnRef column) { if (auto col = column->As()) { data_->Append(col->data_); @@ -56,6 +60,10 @@ size_t ColumnUUID::Size() const { return data_->Size() / 2; } +size_t ColumnUUID::MemoryUsage() const { + return data_->MemoryUsage(); +} + ColumnRef ColumnUUID::Slice(size_t begin, size_t len) const { return std::make_shared(data_->Slice(begin * 2, len * 2)); } diff --git a/clickhouse/columns/uuid.h b/clickhouse/columns/uuid.h index ccd03f84..07547f58 100644 --- a/clickhouse/columns/uuid.h +++ b/clickhouse/columns/uuid.h @@ -26,11 +26,11 @@ class ColumnUUID : public Column { inline const UUID operator [] (size_t n) const { return At(n); } public: - /// Increase the capacity of the column for large block insertion. - void Reserve(size_t new_cap) override; - /// Appends content of given column to the end of current one. void Append(ColumnRef column) override; + /// Increase the capacity of the column for large block insertion. + void Reserve(size_t new_cap) override; + size_t Capacity() const override; /// Loads column data from input stream. bool LoadBody(InputStream* input, size_t rows) override; @@ -43,6 +43,7 @@ class ColumnUUID : public Column { /// Returns count of rows in the column. size_t Size() const override; + size_t MemoryUsage() const override; /// Makes slice of the current column. 
ColumnRef Slice(size_t begin, size_t len) const override; diff --git a/ut/CMakeLists.txt b/ut/CMakeLists.txt index 13ad51d8..e4e89c84 100644 --- a/ut/CMakeLists.txt +++ b/ut/CMakeLists.txt @@ -26,6 +26,9 @@ SET ( clickhouse-cpp-ut-src utils.cpp value_generators.cpp low_cardinality_nullable_tests.cpp + + ColumnString_ut.cpp + ColumnLowCardinalityT_ut.cpp ) IF (WITH_OPENSSL) diff --git a/ut/ColumnLowCardinalityT_ut.cpp b/ut/ColumnLowCardinalityT_ut.cpp new file mode 100644 index 00000000..73b3aeba --- /dev/null +++ b/ut/ColumnLowCardinalityT_ut.cpp @@ -0,0 +1,232 @@ +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace { + +using namespace clickhouse; +using namespace std::literals::string_view_literals; + +static const auto LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY = + "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00" + "\x09\x00\x00\x00\x00\x00\x00\x00\x00\x06\x46\x6f\x6f\x42\x61\x72" + "\x01\x31\x01\x32\x03\x46\x6f\x6f\x01\x34\x03\x42\x61\x72\x01\x37" + "\x01\x38\x0a\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06" + "\x04\x07\x08\x04"sv; +} + + +TEST(ColumnsCase, ColumnLowCardinalityString_Append_and_Read) { + const size_t items_count = 11; + ColumnLowCardinalityT col; + for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) { + col.Append(item); + } + + ASSERT_EQ(col.Size(), items_count); + ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item + + for (size_t i = 0; i < items_count; ++i) { + ASSERT_EQ(col.At(i), FooBarGenerator(i)) << " at pos: " << i; + ASSERT_EQ(col[i], FooBarGenerator(i)) << " at pos: " << i; + } +} + +TEST(ColumnsCase, ColumnLowCardinalityString_Clear_and_Append) { + const size_t items_count = 11; + ColumnLowCardinalityT col; + for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) + { + col.Append(item); + } + + col.Clear(); + ASSERT_EQ(col.Size(), 0u); + 
ASSERT_EQ(col.GetDictionarySize(), 1u); // null-item + + for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) + { + col.Append(item); + } + + ASSERT_EQ(col.Size(), items_count); + ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item +} + +TEST(ColumnsCase, ColumnLowCardinalityString_Load) { + const size_t items_count = 10; + ColumnLowCardinalityT col; + + const auto & data = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY; + ArrayInput buffer(data.data(), data.size()); + + ASSERT_TRUE(col.Load(&buffer, items_count)); + + for (size_t i = 0; i < items_count; ++i) { + EXPECT_EQ(col.At(i), FooBarGenerator(i)) << " at pos: " << i; + } +} + +// This is temporary disabled since we are not 100% compatitable with ClickHouse +// on how we serailize LC columns, but we check interoperability in other tests (see client_ut.cpp) +TEST(ColumnsCase, DISABLED_ColumnLowCardinalityString_Save) { + const size_t items_count = 10; + ColumnLowCardinalityT col; + for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) { + col.Append(item); + } + + ArrayOutput output(0, 0); + + const size_t expected_output_size = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY.size(); + // Enough space to account for possible overflow from both right and left sides. + std::string buffer(expected_output_size * 10, '\0');// = {'\0'}; + const char margin_content[sizeof(buffer)] = {'\0'}; + + const size_t left_margin_size = 10; + const size_t right_margin_size = sizeof(buffer) - left_margin_size - expected_output_size; + + // Since overflow from left side is less likely to happen, leave only tiny margin there. 
+ auto write_pos = buffer.data() + left_margin_size; + const auto left_margin = buffer.data(); + const auto right_margin = write_pos + expected_output_size; + + output.Reset(write_pos, expected_output_size); + + EXPECT_NO_THROW(col.Save(&output)); + + // Left margin should be blank + EXPECT_EQ(std::string_view(margin_content, left_margin_size), std::string_view(left_margin, left_margin_size)); + // Right margin should be blank too + EXPECT_EQ(std::string_view(margin_content, right_margin_size), std::string_view(right_margin, right_margin_size)); + + // TODO: right now LC columns do not write indexes in the most compact way possible, so binary representation is a bit different + // (there might be other inconsistances too) + EXPECT_EQ(LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY, std::string_view(write_pos, expected_output_size)); +} + +TEST(ColumnsCase, ColumnLowCardinalityString_SaveAndLoad) { + // Verify that we can load binary representation back + ColumnLowCardinalityT col; + + const auto items = GenerateVector(10, &FooBarGenerator); + for (const auto & item : items) { + col.Append(item); + } + + char buffer[256] = {'\0'}; // about 3 times more space than needed for this set of values. + { + ArrayOutput output(buffer, sizeof(buffer)); + EXPECT_NO_THROW(col.Save(&output)); + } + + col.Clear(); + + { + // Load the data back + ArrayInput input(buffer, sizeof(buffer)); + EXPECT_TRUE(col.Load(&input, items.size())); + } + + for (size_t i = 0; i < items.size(); ++i) { + EXPECT_EQ(col.At(i), items[i]) << " at pos: " << i; + } +} + +TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_1) { + // Verify that when empty string is added to a LC column it can be retrieved back as empty string. 
+ ColumnLowCardinalityT col; + const auto values = GenerateVector(10, AlternateGenerators(SameValueGenerator(""), FooBarGenerator)); + for (const auto & item : values) { + col.Append(item); + } + + for (size_t i = 0; i < values.size(); ++i) { + EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i; + } +} + +TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_2) { + // Verify that when empty string is added to a LC column it can be retrieved back as empty string. + // (Ver2): Make sure that outcome doesn't depend if empty values are on odd positions + ColumnLowCardinalityT col; + const auto values = GenerateVector(10, AlternateGenerators(FooBarGenerator, SameValueGenerator(""))); + for (const auto & item : values) { + col.Append(item); + } + + for (size_t i = 0; i < values.size(); ++i) { + EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i; + } +} + +TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_3) { + // When we have many leading empty strings and some non-empty values. + ColumnLowCardinalityT col; + const auto values = ConcatSequences(GenerateVector(100, SameValueGenerator("")), GenerateVector(5, FooBarGenerator)); + for (const auto & item : values) { + col.Append(item); + } + + for (size_t i = 0; i < values.size(); ++i) { + EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i; + } +} + +TEST(ColumnLowCardinalityString, WithSizeEstimation) { + const ColumnString::EstimatedValueSize value_size_estimations[] = { + ColumnString::EstimatedValueSize::TINY, + ColumnString::EstimatedValueSize::SMALL, + ColumnString::EstimatedValueSize::MEDIUM, + ColumnString::EstimatedValueSize::LARGE, + + // ColumnString::EstimatedValueSize(0), + ColumnString::EstimatedValueSize(1), + ColumnString::EstimatedValueSize(300), + ColumnString::EstimatedValueSize(10'000), + }; + + auto values = MakeStrings(); + + // How many times to append items from values to column. 
+ for (size_t count = 512; count <= 1024; count *= 2) + { + std::cerr << "\nNumber of values: " << values.size() * count << std::endl; + for (ColumnString::EstimatedValueSize estimation : value_size_estimations) { + SCOPED_TRACE(::testing::Message("with estimation: ") << estimation); + std::cerr << "Estimation " << estimation << std::endl; + + auto col = std::make_shared>(estimation); + + dumpMemoryUsage("After constructing with estimation", col); +//ASSERT_NO_FATAL_FAILURE + col->Reserve(values.size() * count); + dumpMemoryUsage("After Reserve()", col); + + size_t i = 0; + for (size_t j = 0; j < count; ++j) + { + for (const auto & v : values) { + col->Append(v); + + EXPECT_EQ(i + 1, col->Size()); + EXPECT_EQ(v, col->At(i)); + + ++i; + } + } + + dumpMemoryUsage("After appending all values", col); + } + } +} diff --git a/ut/ColumnString_ut.cpp b/ut/ColumnString_ut.cpp new file mode 100644 index 00000000..c6681d59 --- /dev/null +++ b/ut/ColumnString_ut.cpp @@ -0,0 +1,467 @@ +#include + +#include + +#include +#include +#include + +#include +#include +#include "gtest/gtest-param-test.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace clickhouse; + +namespace +{ +// TODO: convert to generator with container-like interface (for comparison with CompareRecursive), should implement size(), begin() and end(), iterator i with ++i and *i +size_t EstimateColumnStringMemoryUsage( + size_t number_of_items, // number of items in column + ColumnString::EstimatedValueSize item_estimated_size, // estimated item size + float value_to_estimation_average_size_ratio = 1.0, // expected real item size to estimated item size + std::optional total_items_size = std::nullopt // total length of all items + ) { + static const size_t COLUMN_STRING_DEFAULT_BLOCK_SIZE = 4096; + static const size_t COLUMN_STRING_MAX_EXPECTED_MEMORY_OVERHEAD = 4096; + const float max_estimation_error_factor = item_estimated_size == 
ColumnString::NO_PREALLOCATE ? 2.5 : 2; + + // space wasted in block since not all items can be fit perfectly, and there is some unused space at the end of the block. + const auto estimate_lost_space_in_block = (static_cast(item_estimated_size) != 0 + ? COLUMN_STRING_DEFAULT_BLOCK_SIZE % static_cast(static_cast(item_estimated_size) * std::max(1.0f, value_to_estimation_average_size_ratio)) + : COLUMN_STRING_DEFAULT_BLOCK_SIZE / 10); + + // if no estimation provided, use factual total size of all items + if (item_estimated_size == ColumnString::EstimatedValueSize{0} && total_items_size && number_of_items) + item_estimated_size = ColumnString::EstimatedValueSize(static_cast(*total_items_size) / number_of_items); + + const size_t estimated_total_item_size = number_of_items * static_cast(item_estimated_size) * value_to_estimation_average_size_ratio; + const auto estimated_number_of_blocks = std::max(1, estimated_total_item_size ? COLUMN_STRING_DEFAULT_BLOCK_SIZE / estimated_total_item_size : 1); + + return (number_of_items * sizeof(std::string_view) + + estimated_total_item_size + + estimate_lost_space_in_block * estimated_number_of_blocks + + COLUMN_STRING_DEFAULT_BLOCK_SIZE + // It is hard to compute overhead added by vector + // (mostly because we don't know number of ColumnString::Block instances from outside, and this number depends on many factors), + // so we just make a guess. 
+ + COLUMN_STRING_MAX_EXPECTED_MEMORY_OVERHEAD) + * max_estimation_error_factor; +} + +std::string ScaleString(std::string str, size_t required_size) { + if (required_size < str.length()) { + str.resize(required_size); + + return str; + } + + str.reserve(required_size); + while (str.length() < required_size) { + const size_t remaining_size = required_size - str.length(); + str.insert(str.length(), str.data(), std::min(str.length(), remaining_size)); + } + str.resize(required_size); + + return str; +} + +std::vector GenerateSizedStrings( + const std::vector& initial_values, // values from which result is generated + size_t required_number_of_items, // number of strings in result + size_t required_single_value_size, // how long should resulting strings be + const std::vector& scale_factors = {1.0} // length variations on resulting string, must be > 0 + ) { + std::vector result; + result.reserve(required_number_of_items); + + for (size_t i = 0; i < required_number_of_items; ++i) { + const auto & value = initial_values[i % initial_values.size()]; + const auto & scale_factor = scale_factors[i % scale_factors.size()]; + + size_t value_size = required_single_value_size; + if (value_size == 0) + value_size = value.length(); + value_size = std::lround(value_size * scale_factor); + + result.push_back(ScaleString(value, value_size)); + } + + return result; +} + +// class GenerateSizedStrings +// { +// const std::vector& initial_values; // values from which result is generated +// size_t required_single_value_size; // how long should resulting strings be +// const std::vector& scale_factors = {1.0}; // length variations on resulting string, must be > 0 + +// public: +// GenerateSizedStrings( +// const std::vector& initial_values, // values from which result is generated +// // size_t required_number_of_items, // number of strings in result +// size_t required_single_value_size, // how long should resulting strings be +// const std::vector& scale_factors = {1.0} // length 
variations on resulting string, must be > 0 +// ) +// : initial_values(initial_values) +// , required_single_value_size(required_single_value_size) +// , scale_factors(scale_factors) +// { +// } + +// std::string operator()(size_t i) const { +// const auto & value = initial_values[i % initial_values.size()]; +// const auto & scale_factor = scale_factors[i % scale_factors.size()]; + +// size_t value_size = required_single_value_size; +// if (value_size == 0) +// value_size = value.length(); +// value_size = std::lround(value_size * scale_factor); + +// return ScaleString(value, value_size); +// } +// }; + + + + +} + +TEST(ColumnString, ConstructorThatCopiesValues) { + auto values = MakeStrings(); + auto col = std::make_shared(values); + + ASSERT_EQ(col->Size(), values.size()); + ASSERT_EQ(col->At(1), "ab"); + ASSERT_EQ(col->At(3), "abcd"); +} + +TEST(ColumnString, ConstructorThatMovesValues) { + auto values = MakeStrings(); + auto copy = values; + auto col = ColumnString(std::move(copy)); + + EXPECT_TRUE(CompareRecursive(values, col)); +} + +TEST(ColumnString, Append) { + auto col = std::make_shared(); + const char* expected = "ufiudhf3493fyiudferyer3yrifhdflkdjfeuroe"; + std::string data(expected); + col->Append(data); + col->Append(std::move(data)); + col->Append("11"); + + ASSERT_EQ(col->Size(), 3u); + ASSERT_EQ(col->At(0), expected); + ASSERT_EQ(col->At(1), expected); + ASSERT_EQ(col->At(2), "11"); +} + +//TEST(ColumnString, DefaultSizeEstimation) { +// auto values = MakeStrings(); + +// const ColumnString::EstimatedValueSize value_size_estimations[] = { +// ColumnString::EstimatedValueSize::TINY, +// ColumnString::EstimatedValueSize::SMALL, +// ColumnString::EstimatedValueSize::MEDIUM, +// ColumnString::EstimatedValueSize::LARGE, +// ColumnString::EstimatedValueSize::HUGE, +// }; + +// for (auto estimation : value_size_estimations) { +// SCOPED_TRACE(::testing::Message("with estimation: ") << estimation); + +// auto col = std::make_shared(estimation); + +// 
col->Reserve(values.size()); + +// size_t i = 0; +// for (const auto & v : values) { +// col->Append(v); + +// EXPECT_EQ(i + 1, col->Size()); +// EXPECT_EQ(v, col->At(i)); + +// ++i; +// } +// } +//} + +TEST(ColumnString, InvalidSizeEstimation) { + // Negative values or values that are too big (> INTMAX) that are wrapped and implicitly converted to negative + // should cause an exception. + + EXPECT_THROW(std::make_shared(ColumnString::EstimatedValueSize(-1)), ValidationError); + EXPECT_THROW(std::make_shared(ColumnString::EstimatedValueSize(static_cast(std::numeric_limits::max()) + 1)), ValidationError); + EXPECT_THROW(std::make_shared(ColumnString::EstimatedValueSize(std::numeric_limits::max())), ValidationError); + + ColumnString col; + EXPECT_THROW(col.SetEstimatedValueSize(ColumnString::EstimatedValueSize(-1)), ValidationError); + EXPECT_THROW(col.SetEstimatedValueSize(ColumnString::EstimatedValueSize(static_cast(std::numeric_limits::max()) + 1)), ValidationError); + EXPECT_THROW(col.SetEstimatedValueSize(ColumnString::EstimatedValueSize(std::numeric_limits::max())), ValidationError); +} + +struct SizeRatio { + std::vector ratios; + float average; + + SizeRatio(std::vector ratios_) + : ratios(std::move(ratios_)) + { + float sum = 0; + for (const auto & r : ratios) { + sum += r; + } + + average = sum / ratios.size(); + } +}; + +std::ostream & operator<<(std::ostream& ostr, const SizeRatio & r) { + return ostr << "SizeRatio{ average: " << std::fixed << r.average << " } "; +} + +/** Make sure that setting value size estimates with ColumnString::EstimatedValueSize either via contructor or via SetEstimatedValueSize + * doesn't break ColumnString functionality and well-behaves with Reserve() and Append(). + * I.e. values are appended properly, nothing crashes and memory usage is not crazy-high if estimation is incorrect. 
+ */ +struct ColumnStringEstimatedValueSizeTest : public ::testing::TestWithParam> +{ + void SetUp() override { + const size_t MAX_MEMORY_USAGE = 100 * 1024 * 1024; + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + // Adjust number of items so the test doesn't use too much memory + if (static_cast(single_value_size_estimation) != 0 + // *2 since we store both reference values and values in column itself. + && EstimateColumnStringMemoryUsage(expected_number_of_items, single_value_size_estimation, size_ratio.average) > MAX_MEMORY_USAGE) { + const auto old_expected_number_of_items = expected_number_of_items; + expected_number_of_items = MAX_MEMORY_USAGE / (static_cast(single_value_size_estimation) * 2 * size_ratio.average); + + std::cerr << "To avoid using too much memory, reduced number of items in test" + << " from " << old_expected_number_of_items + << " to " << expected_number_of_items + << ", expected item size is " << single_value_size_estimation + << std::endl; + } + } + + size_t expected_number_of_items = 10000; + + void AppendStrings( + ColumnString & column, + size_t & total_values_size) const { + + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + const auto values = GenerateSizedStrings( + MakeStrings(), + expected_number_of_items, + static_cast(single_value_size_estimation), + size_ratio.ratios); + + total_values_size = 0; + for (const auto & v : values) { + total_values_size += v.size(); + + column.Append(v); + } + + ASSERT_TRUE(CompareRecursive(values, column)); + } + + size_t EstimateMemoryUsage(size_t total_values_size, float expected_number_of_items_multiplier = 1.0) { + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + return EstimateColumnStringMemoryUsage(expected_number_of_items * expected_number_of_items_multiplier, + single_value_size_estimation, + size_ratio.average, + total_values_size); + } +}; + +TEST_P(ColumnStringEstimatedValueSizeTest, ConstructorWithEstimation) { + 
const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + ColumnString col(single_value_size_estimation); + + // basically no memory pre-allocated + EXPECT_LT(col.MemoryUsage(), 1u); +} + +//TEST_P(ColumnStringEstimatedValueSizeTest, ConstructorWithEstimationAsInt) { +//// auto single_value_size_estimation = GetParam(); +// ColumnString col(1); + +// // basically no memory pre-allocated except for some constant factor +// EXPECT_LT(col.MemoryUsage(), 1u); +//} + +TEST_P(ColumnStringEstimatedValueSizeTest, ConstructorWithNumberOfItemsAndEstimation) { + // Constructor that receives both number of items and estimation pre-allocates memory for given number of items of estimated size. + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + ColumnString col(expected_number_of_items, single_value_size_estimation); + + // space for at least all the items, maybe more + EXPECT_GT(col.MemoryUsage(), static_cast(single_value_size_estimation) * expected_number_of_items); + + // but not too much + EXPECT_LT(col.MemoryUsage(), EstimateColumnStringMemoryUsage(expected_number_of_items, single_value_size_estimation)); +} + +TEST_P(ColumnStringEstimatedValueSizeTest, AppendNoReserve) +{ + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + auto col = ColumnString(single_value_size_estimation); + size_t total_values_size = 0; + + EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size)); + + // since there was no Reserve call prior, there could be more some overallocations, hence some estimation error + EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size)); +} + +TEST_P(ColumnStringEstimatedValueSizeTest, ReserveExactAndAppend) +{ + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + auto col = ColumnString(single_value_size_estimation); + size_t total_values_size = 0; + + EXPECT_NO_THROW(col.Reserve(expected_number_of_items)); + EXPECT_NO_FATAL_FAILURE(AppendStrings(col, 
total_values_size)); + + EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size)); +} + +TEST_P(ColumnStringEstimatedValueSizeTest, ReserveLessAndAppend) +{ + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + auto col = ColumnString(single_value_size_estimation); + size_t total_values_size = 0; + + EXPECT_NO_THROW(col.Reserve(expected_number_of_items * .8)); + EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size)); + + EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size)); +} + +TEST_P(ColumnStringEstimatedValueSizeTest, ReserveMoreAndAppend) +{ + const auto & [single_value_size_estimation, size_ratio] = GetParam(); + + auto col = ColumnString(single_value_size_estimation); + size_t total_values_size = 0; + + EXPECT_NO_THROW(col.Reserve(expected_number_of_items * 1.2)); + EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size)); + + EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size, 1.2)); +} + +/** TODO more tests + * "Basic tests": + * - first Reserve(), then Append() same number of items that were Reserved() + * - first Reserve(), then Append() more items than were Reserved() + * - first Reserve(), then Append() less items than were Reserved() + * + * "Extended tests": + * - Basic tests, but with items that are smaller than estimated + * - Basic tests, but with items that are exactly as estimated + * - Basic tests, but with items that are bigger than estimated + * + * "Non-empty column tests": + * - same as "Extended tests", but first Append() data below estimated item size + * - same as "Extended tests", but first Append() data above estimated item size + * - same as "Extended tests", but first Append() data above default block size + * + * "Re-estimation tests": do multiple SetEstimatedValueSize(), Reserve() calls + * - first smaller estimation, then larger estimation + * - first larger estimation, then smaller estimation + * - first no estimation (0), then some estimation + * + 
* Test all that groups of tests against various valid EstimatedValueSize values. + */ + +const auto SIZE_RATIOS = ::testing::ValuesIn(std::initializer_list{ + // estimation is about right + SizeRatio({0.9, 0.95, 1.0, 1.05, 1.1}), + // estimation is a bit high, real values are about 0.8 of estimated size + SizeRatio({0.75, 0.8, 0.85}), + // estimation is a bit low, real values are about 1.2 of estimated size + SizeRatio({1.25, 1.2, 1.25}), + // estimation is to high, real values are about 2.0 of estimated size + SizeRatio({1.9, 2, 2.1}), + // estimation is to low, real values are about 0.5 of estimated size + SizeRatio({0.4, 0.5, 0.6}), +}); + +INSTANTIATE_TEST_SUITE_P( + NO_PRE_ALLOCATE, ColumnStringEstimatedValueSizeTest, + ::testing::Combine( + ::testing::Values( + ColumnString::NO_PREALLOCATE + ), + SIZE_RATIOS + ) +); + +INSTANTIATE_TEST_SUITE_P( + EstimatedValueSize_Values, ColumnStringEstimatedValueSizeTest, + ::testing::Combine( + ::testing::Values( + ColumnString::EstimatedValueSize::TINY, + ColumnString::EstimatedValueSize::SMALL, + ColumnString::EstimatedValueSize::MEDIUM, + ColumnString::EstimatedValueSize::LARGE + ), + SIZE_RATIOS + ) +); + +// Because whone number of those does't fit in ColumnString::Block of default size, +// there are going to be some unused regions of memory in ColumnString::Block's, +// hitting various corner cases. 
+INSTANTIATE_TEST_SUITE_P( + Primes, ColumnStringEstimatedValueSizeTest, + ::testing::Combine( + ::testing::Values( + ColumnString::EstimatedValueSize(3), + ColumnString::EstimatedValueSize(5), + + ColumnString::EstimatedValueSize(503), + ColumnString::EstimatedValueSize(509), + + ColumnString::EstimatedValueSize(1009), + ColumnString::EstimatedValueSize(1013) + ), + SIZE_RATIOS + ) +); + +INSTANTIATE_TEST_SUITE_P( + Big, ColumnStringEstimatedValueSizeTest, + ::testing::Combine( + ::testing::Values( + // bigger than 1K + ColumnString::EstimatedValueSize(4 * 1024), + ColumnString::EstimatedValueSize(64 * 1024), + ColumnString::EstimatedValueSize(1024 * 1024), + ColumnString::EstimatedValueSize(4 * 1024 * 1024) + ), + SIZE_RATIOS + ) +); diff --git a/ut/Column_ut.cpp b/ut/Column_ut.cpp index bac666ec..ef458517 100644 --- a/ut/Column_ut.cpp +++ b/ut/Column_ut.cpp @@ -15,19 +15,21 @@ #include +#include +#include +#include +#include +#include + #include + #include +#include #include #include +#include #include -#include "gtest/internal/gtest-internal.h" -#include "ut/utils_comparison.h" -#include "ut/utils_meta.h" -#include "utils.h" -#include "roundtrip_column.h" -#include "value_generators.h" - namespace { using namespace clickhouse; } @@ -74,6 +76,7 @@ struct GenericColumnTestCase template class GenericColumnTest : public testing::Test { public: + using TestCase = T; using ColumnType = typename T::ColumnType; static auto MakeColumn() @@ -238,6 +241,8 @@ TYPED_TEST_SUITE(GenericColumnTest, TestCases); TYPED_TEST(GenericColumnTest, Construct) { auto column = this->MakeColumn(); ASSERT_EQ(0u, column->Size()); + + dumpMemoryUsage("Newly constructed column", column); } TYPED_TEST(GenericColumnTest, EmptyColumn) { @@ -275,6 +280,7 @@ TYPED_TEST(GenericColumnTest, Append) { } EXPECT_TRUE(CompareRecursive(values, *column)); + dumpMemoryUsage("After appending 10000 items ", column); } // To make some value types compatitable with Column::GetItem() @@ -335,6 +341,7 @@ 
TYPED_TEST(GenericColumnTest, Slice) { EXPECT_TRUE(CompareRecursive(values, *slice)); + dumpMemoryUsage("Memory usage of slice ", slice); // TODO: slices of different sizes } @@ -348,14 +355,62 @@ TYPED_TEST(GenericColumnTest, CloneEmpty) { EXPECT_EQ(0u, clone->Size()); EXPECT_EQ(column->GetType(), clone->GetType()); + + dumpMemoryUsage("Memory usage of empty clone ", clone); } TYPED_TEST(GenericColumnTest, Clear) { auto [column, values] = this->MakeColumnWithValues(10'000); EXPECT_EQ(values.size(), column->Size()); + dumpMemoryUsage("Memory usage before clear ", column); column->Clear(); EXPECT_EQ(0u, column->Size()); + dumpMemoryUsage("Memory usage after clear ", column); +} + +TYPED_TEST(GenericColumnTest, MemoryUsage) { + auto column = this->MakeColumn(); + const auto values = this->GenerateValues(10'000); + + auto max_memory_usage = sizeof(values.front()) * values.size(); + if (column->GetType().GetCode() == Type::Code::LowCardinality) { + // Low cardinality has a different memory usage profile: + // only unique values take space in the dictionary, + // rest are just indicies to said dictionary. 
+ + const auto unique_values = TestFixture::TestCase::generateValues(); + max_memory_usage = sizeof(unique_values.begin()) * unique_values.size() + + sizeof(int32_t) * values.size() // indices + + sizeof(uint64_t) * values.size() * 2; // hashes for uniques checks + } + + if constexpr (std::is_same_v>) { + const auto unique_values = TestFixture::TestCase::generateValues(); + const size_t total_size = std::accumulate(unique_values.begin(), unique_values.end(), 0, [](auto accumulator, auto i) { + return accumulator + i.size(); + }); + max_memory_usage = total_size / unique_values.size() * values.size(); + + if constexpr (std::is_same_v) { + column->SetEstimatedValueSize(ColumnString::EstimatedValueSize(total_size / unique_values.size())); + // There is some over-allocation for ColumnString + max_memory_usage *= 1.2; + } + } + + // Empty column should have low memory usage from the start, + // from 0 bytes to 1% of max estimated memory usage due to some pre-reservations. + EXPECT_NEAR(max_memory_usage * 0.01, column->MemoryUsage(), max_memory_usage * 0.01) + << "On empty column"; + + column->Reserve(values.size()); + EXPECT_GE(max_memory_usage, column->MemoryUsage()) + << "After reserve"; + + TestFixture::AppendValues(column, values); + EXPECT_GE(max_memory_usage, column->MemoryUsage()) + << " After appending " << values.size() << " items"; } TYPED_TEST(GenericColumnTest, Swap) { @@ -389,10 +444,15 @@ TYPED_TEST(GenericColumnTest, ReserveAndCapacity) { if constexpr (has_method_Reserve_v && has_method_Capacity_v) { auto column = this->MakeColumn(); - EXPECT_EQ(0u, column->Capacity()); + + EXPECT_GT(16u, column->Capacity()); // Column might have some non-zero capacity initially + EXPECT_NO_THROW(column->Reserve(100u)); + EXPECT_EQ(100u, column->Capacity()); EXPECT_EQ(0u, column->Size()); + + dumpMemoryUsage("After Reserving space for 100 items ", column); } else { COLUMN_DOESNT_IMPLEMENT("method Reserve() and Capacity()"); diff --git a/ut/abnormal_column_names_test.cpp 
b/ut/abnormal_column_names_test.cpp index 11868f73..0ff6fd7c 100644 --- a/ut/abnormal_column_names_test.cpp +++ b/ut/abnormal_column_names_test.cpp @@ -1,14 +1,12 @@ #include #include +#include -#include "utils.h" +#include #include -#include -#include - namespace { using namespace clickhouse; diff --git a/ut/array_of_low_cardinality_tests.cpp b/ut/array_of_low_cardinality_tests.cpp index 56171236..20ec8108 100644 --- a/ut/array_of_low_cardinality_tests.cpp +++ b/ut/array_of_low_cardinality_tests.cpp @@ -1,16 +1,16 @@ -#include -#include -#include -#include - #include #include #include -#include "clickhouse/block.h" -#include "clickhouse/client.h" -#include "utils.h" -#include "clickhouse/base/buffer.h" -#include "clickhouse/base/output.h" +#include +#include + +#include +#include + +#include + +#include +#include namespace { diff --git a/ut/block_ut.cpp b/ut/block_ut.cpp index 3828cb65..53dc2627 100644 --- a/ut/block_ut.cpp +++ b/ut/block_ut.cpp @@ -1,10 +1,20 @@ -#include -#include "readonly_client_test.h" -#include "connection_failed_client_test.h" -#include "utils.h" +#include +#include +#include +#include + +#include +#include +#include #include +#include +#include +#include +#include +#include + namespace { using namespace clickhouse; diff --git a/ut/column_array_ut.cpp b/ut/column_array_ut.cpp index 6fe0bd19..9cc67a9b 100644 --- a/ut/column_array_ut.cpp +++ b/ut/column_array_ut.cpp @@ -17,7 +17,6 @@ #include "utils.h" #include -#include #include namespace { diff --git a/ut/columns_ut.cpp b/ut/columns_ut.cpp index 623fdf0e..577f32f2 100644 --- a/ut/columns_ut.cpp +++ b/ut/columns_ut.cpp @@ -14,27 +14,24 @@ #include #include #include // for ipv4-ipv6 platform-specific stuff +#include + +#include +#include #include -#include "utils.h" -#include "value_generators.h" +#include +#include #include #include #include -#include namespace { using namespace clickhouse; using namespace std::literals::string_view_literals; -static const auto 
LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY = - "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00" - "\x09\x00\x00\x00\x00\x00\x00\x00\x00\x06\x46\x6f\x6f\x42\x61\x72" - "\x01\x31\x01\x32\x03\x46\x6f\x6f\x01\x34\x03\x42\x61\x72\x01\x37" - "\x01\x38\x0a\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06" - "\x04\x07\x08\x04"sv; } // TODO: add tests for ColumnDecimal. @@ -137,6 +134,7 @@ TEST(ColumnsCase, StringAppend) { ASSERT_EQ(col->At(2), "11"); } + TEST(ColumnsCase, TupleAppend){ auto tuple1 = std::make_shared(std::vector({ std::make_shared(), @@ -719,169 +717,6 @@ TEST(ColumnsCase, ColumnDecimal128_from_string_overflow) { #endif } -TEST(ColumnsCase, ColumnLowCardinalityString_Append_and_Read) { - const size_t items_count = 11; - ColumnLowCardinalityT col; - for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) { - col.Append(item); - } - - ASSERT_EQ(col.Size(), items_count); - ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item - - for (size_t i = 0; i < items_count; ++i) { - ASSERT_EQ(col.At(i), FooBarGenerator(i)) << " at pos: " << i; - ASSERT_EQ(col[i], FooBarGenerator(i)) << " at pos: " << i; - } -} - -TEST(ColumnsCase, ColumnLowCardinalityString_Clear_and_Append) { - const size_t items_count = 11; - ColumnLowCardinalityT col; - for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) - { - col.Append(item); - } - - col.Clear(); - ASSERT_EQ(col.Size(), 0u); - ASSERT_EQ(col.GetDictionarySize(), 1u); // null-item - - for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) - { - col.Append(item); - } - - ASSERT_EQ(col.Size(), items_count); - ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item -} - -TEST(ColumnsCase, ColumnLowCardinalityString_Load) { - const size_t items_count = 10; - ColumnLowCardinalityT col; - - const auto & data = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY; - ArrayInput buffer(data.data(), 
data.size()); - - ASSERT_TRUE(col.Load(&buffer, items_count)); - - for (size_t i = 0; i < items_count; ++i) { - EXPECT_EQ(col.At(i), FooBarGenerator(i)) << " at pos: " << i; - } -} - -// This is temporary disabled since we are not 100% compatitable with ClickHouse -// on how we serailize LC columns, but we check interoperability in other tests (see client_ut.cpp) -TEST(ColumnsCase, DISABLED_ColumnLowCardinalityString_Save) { - const size_t items_count = 10; - ColumnLowCardinalityT col; - for (const auto & item : GenerateVector(items_count, &FooBarGenerator)) { - col.Append(item); - } - - ArrayOutput output(0, 0); - - const size_t expected_output_size = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY.size(); - // Enough space to account for possible overflow from both right and left sides. - std::string buffer(expected_output_size * 10, '\0');// = {'\0'}; - const char margin_content[sizeof(buffer)] = {'\0'}; - - const size_t left_margin_size = 10; - const size_t right_margin_size = sizeof(buffer) - left_margin_size - expected_output_size; - - // Since overflow from left side is less likely to happen, leave only tiny margin there. 
- auto write_pos = buffer.data() + left_margin_size; - const auto left_margin = buffer.data(); - const auto right_margin = write_pos + expected_output_size; - - output.Reset(write_pos, expected_output_size); - - EXPECT_NO_THROW(col.Save(&output)); - - // Left margin should be blank - EXPECT_EQ(std::string_view(margin_content, left_margin_size), std::string_view(left_margin, left_margin_size)); - // Right margin should be blank too - EXPECT_EQ(std::string_view(margin_content, right_margin_size), std::string_view(right_margin, right_margin_size)); - - // TODO: right now LC columns do not write indexes in the most compact way possible, so binary representation is a bit different - // (there might be other inconsistances too) - EXPECT_EQ(LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY, std::string_view(write_pos, expected_output_size)); -} - -TEST(ColumnsCase, ColumnLowCardinalityString_SaveAndLoad) { - // Verify that we can load binary representation back - ColumnLowCardinalityT col; - - const auto items = GenerateVector(10, &FooBarGenerator); - for (const auto & item : items) { - col.Append(item); - } - - char buffer[256] = {'\0'}; // about 3 times more space than needed for this set of values. - { - ArrayOutput output(buffer, sizeof(buffer)); - EXPECT_NO_THROW(col.Save(&output)); - } - - col.Clear(); - - { - // Load the data back - ArrayInput input(buffer, sizeof(buffer)); - EXPECT_TRUE(col.Load(&input, items.size())); - } - - for (size_t i = 0; i < items.size(); ++i) { - EXPECT_EQ(col.At(i), items[i]) << " at pos: " << i; - } -} - -TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_1) { - // Verify that when empty string is added to a LC column it can be retrieved back as empty string. 
- ColumnLowCardinalityT col; - const auto values = GenerateVector(10, AlternateGenerators(SameValueGenerator(""), FooBarGenerator)); - for (const auto & item : values) { - col.Append(item); - } - - for (size_t i = 0; i < values.size(); ++i) { - EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i; - } -} - -TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_2) { - // Verify that when empty string is added to a LC column it can be retrieved back as empty string. - // (Ver2): Make sure that outcome doesn't depend if empty values are on odd positions - ColumnLowCardinalityT col; - const auto values = GenerateVector(10, AlternateGenerators(FooBarGenerator, SameValueGenerator(""))); - for (const auto & item : values) { - col.Append(item); - } - - for (size_t i = 0; i < values.size(); ++i) { - EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i; - } -} - -TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_3) { - // When we have many leading empty strings and some non-empty values. 
- ColumnLowCardinalityT col; - const auto values = ConcatSequences(GenerateVector(100, SameValueGenerator("")), GenerateVector(5, FooBarGenerator)); - for (const auto & item : values) { - col.Append(item); - } - - for (size_t i = 0; i < values.size(); ++i) { - EXPECT_EQ(values[i], col.At(i)) << " at pos: " << i; - } -} - -TEST(ColumnsCase, ColumnLowCardinalityFixedString_Type_Size_Eq) { - const size_t fixed_size = 10; - const auto col = std::make_shared>(fixed_size); - - ASSERT_EQ(fixed_size, col->GetNestedType()->As()->GetSize()); -} TEST(ColumnsCase, ColumnTupleT) { using TestTuple = ColumnTupleT; diff --git a/ut/connection_failed_client_test.cpp b/ut/connection_failed_client_test.cpp index 3da045a9..edc32138 100644 --- a/ut/connection_failed_client_test.cpp +++ b/ut/connection_failed_client_test.cpp @@ -1,5 +1,5 @@ -#include "connection_failed_client_test.h" -#include "utils.h" +#include +#include #include #include diff --git a/ut/low_cardinality_nullable_tests.cpp b/ut/low_cardinality_nullable_tests.cpp index 357f28a9..361f66b3 100644 --- a/ut/low_cardinality_nullable_tests.cpp +++ b/ut/low_cardinality_nullable_tests.cpp @@ -1,23 +1,25 @@ -#include #include -#include "clickhouse/columns/nullable.h" -#include "clickhouse/columns/lowcardinality.h" -#include "clickhouse/client.h" -#include "utils.h" -#include "clickhouse/base/wire_format.h" +#include +#include +#include +#include #include +#include + +#include + namespace { using namespace clickhouse; } static const auto localHostEndpoint = ClientOptions() - .SetHost( getEnvOrDefault("CLICKHOUSE_HOST", "localhost")) - .SetPort( getEnvOrDefault("CLICKHOUSE_PORT", "9000")) - .SetUser( getEnvOrDefault("CLICKHOUSE_USER", "default")) - .SetPassword( getEnvOrDefault("CLICKHOUSE_PASSWORD", "")) - .SetDefaultDatabase(getEnvOrDefault("CLICKHOUSE_DB", "default")); + .SetHost( getEnvOrDefault("CLICKHOUSE_HOST", "localhost")) + .SetPort( getEnvOrDefault("CLICKHOUSE_PORT", "9000")) + .SetUser( 
getEnvOrDefault("CLICKHOUSE_USER", "default")) + .SetPassword( getEnvOrDefault("CLICKHOUSE_PASSWORD", "")) + .SetDefaultDatabase(getEnvOrDefault("CLICKHOUSE_DB", "default")); ColumnRef buildTestColumn(const std::vector& rowsData, const std::vector& nulls) { diff --git a/ut/performance_tests.cpp b/ut/performance_tests.cpp index 74bd4f03..ebae858c 100644 --- a/ut/performance_tests.cpp +++ b/ut/performance_tests.cpp @@ -10,13 +10,13 @@ #include #include +#include +#include + #include #include -#include "utils.h" -#include "utils_performance.h" - using namespace clickhouse; inline std::uint64_t generate(const ColumnUInt64&, size_t index) { diff --git a/ut/utils.cpp b/ut/utils.cpp index bfa4872e..345a73a7 100644 --- a/ut/utils.cpp +++ b/ut/utils.cpp @@ -1,4 +1,4 @@ -#include "utils.h" +#include #include #include @@ -20,7 +20,9 @@ #include #include +#include #include +#include #include @@ -256,6 +258,48 @@ std::ostream& operator<<(std::ostream & ostr, const PrettyPrintBlock & pretty_pr return ostr; } +std::ostream& operator<<(std::ostream & ostr, const PrettyPrintByteSize & byte_size) { + static const std::pair FACTORS[] = { + { 1, "bytes" }, + { 1024, "KiB" }, + { 1024*1024, "MiB" }, + { 1024*1024*1024, "GiB" }, + }; + + auto p = std::find_if(std::begin(FACTORS), std::end(FACTORS), [&byte_size](const auto v) { + return byte_size.bytes < v.first; + }); + + if (p != std::begin(FACTORS)) { + --p; + } + + const float resulting_size = byte_size.bytes / static_cast(p->first); + + // Trim trailing zeroes after decimal point, if present. + { + std::stringstream sstr; + sstr << std::fixed << std::setprecision(byte_size.max_decimal_points) << resulting_size; + + auto s = sstr.str(); + + // here we completely ignore locales and just assume that '.' 
is used as decimal point + const auto decimal_point_position = s.find_last_of('.'); + const auto last_non_zero_decimal_number_pos = s.find_last_not_of('0'); + + if (decimal_point_position != s.npos && last_non_zero_decimal_number_pos != s.npos) { + if (decimal_point_position == last_non_zero_decimal_number_pos) + s.erase(decimal_point_position); + else + s.erase(std::max(decimal_point_position, last_non_zero_decimal_number_pos + 1)); + } + + ostr << s; + } + + return ostr << " " << p->second; +} + std::ostream& operator<<(std::ostream& ostr, const in_addr& addr) { char buf[INET_ADDRSTRLEN]; const char* ip_str = inet_ntop(AF_INET, &addr, buf, sizeof(buf)); @@ -332,6 +376,28 @@ std::ostream & operator<<(std::ostream & ostr, const Progress & progress) { << " written_bytes : " << progress.written_bytes; } +std::ostream & operator<<(std::ostream & ostr, const ColumnString::EstimatedValueSize & estimation) { + static const std::pair NAMES_OF_DEFAULT_VALUES[] = { + { ColumnString::NO_PREALLOCATE, "DO NOT PREALLOCATE" }, + { ColumnString::EstimatedValueSize::TINY, "TINY (8 bytes)" }, + { ColumnString::EstimatedValueSize::SMALL, "SMALL (32 bytes)" }, + { ColumnString::EstimatedValueSize::MEDIUM, "MEDIUM (128 bytes)" }, + { ColumnString::EstimatedValueSize::LARGE, "LARGE (512 bytes)" } + }; + + const auto p = std::find_if(std::begin(NAMES_OF_DEFAULT_VALUES), std::end(NAMES_OF_DEFAULT_VALUES), [&estimation](const auto v) { + return v.first == estimation; + }); + + ostr << "ColumnString::EstimatedValueSize{ "; + if (p != std::end(NAMES_OF_DEFAULT_VALUES)) + ostr << p->second; + else + ostr << PrettyPrintByteSize{static_cast(estimation)}; + + return ostr << " }"; +} + } uint64_t versionNumber(const ServerInfo & server_info) { @@ -348,3 +414,11 @@ std::string ToString(const clickhouse::UUID& v) { } return result; } + +void dumpMemoryUsage(const char * prefix, const clickhouse::ColumnRef col) { +#if defined(_NDEBUG) + return; +#else + std::cerr << prefix << " " << 
col->GetType().GetName() << " : " << PrettyPrintByteSize{col->MemoryUsage()} << std::endl; +#endif +} diff --git a/ut/utils.h b/ut/utils.h index 99216743..36306acb 100644 --- a/ut/utils.h +++ b/ut/utils.h @@ -3,6 +3,7 @@ #include #include +#include "clickhouse/columns/string.h" #include "clickhouse/query.h" #include "utils_meta.h" #include "utils_comparison.h" @@ -29,6 +30,8 @@ namespace clickhouse { struct Profile; struct QuerySettingsField; struct Progress; + + using ColumnRef = std::shared_ptr; } template @@ -138,15 +141,25 @@ struct PrettyPrintBlock { const clickhouse::Block & block; }; +// Print byte size in either in bytes, KiB, MiB, or GiB. +struct PrettyPrintByteSize { + size_t bytes; + size_t max_decimal_points = 2; +}; + namespace clickhouse { std::ostream& operator<<(std::ostream & ostr, const Block & block); std::ostream& operator<<(std::ostream & ostr, const Type & type); std::ostream & operator<<(std::ostream & ostr, const ServerInfo & server_info); std::ostream & operator<<(std::ostream & ostr, const Profile & profile); std::ostream & operator<<(std::ostream & ostr, const Progress & progress); +std::ostream & operator<<(std::ostream & ostr, const ColumnString::EstimatedValueSize & estimation); + } std::ostream& operator<<(std::ostream & ostr, const PrettyPrintBlock & block); +std::ostream& operator<<(std::ostream & ostr, const PrettyPrintByteSize & block); + std::ostream& operator<<(std::ostream& ostr, const in_addr& addr); std::ostream& operator<<(std::ostream& ostr, const in6_addr& addr); @@ -207,3 +220,5 @@ inline uint64_t versionNumber( uint64_t versionNumber(const clickhouse::ServerInfo & server_info); std::string ToString(const clickhouse::UUID& v); + +void dumpMemoryUsage(const char * prefix, const clickhouse::ColumnRef col); diff --git a/ut/utils_comparison.h b/ut/utils_comparison.h index e500e4f3..36b66b82 100644 --- a/ut/utils_comparison.h +++ b/ut/utils_comparison.h @@ -142,44 +142,44 @@ template struct PrintContainer; template 
-::testing::AssertionResult CompareRecursive(const Left & left, const Right & right) { +::testing::AssertionResult CompareRecursive(const Left & expected, const Right & actual) { if constexpr (!is_string_v && !is_string_v && (is_container_v || std::is_base_of_v>) && (is_container_v || std::is_base_of_v>) ) { - const auto & l = maybeWrapColumnAsContainer(left); - const auto & r = maybeWrapColumnAsContainer(right); + const auto & e = maybeWrapColumnAsContainer(expected); + const auto & a = maybeWrapColumnAsContainer(actual); - if (auto result = CompareCotainersRecursive(l, r)) + if (auto result = CompareCotainersRecursive(e, a)) return result; else - return result << "\nExpected container: " << PrintContainer{l} - << "\nActual container : " << PrintContainer{r}; + return result << "\nExpected container: " << PrintContainer{e} + << "\nActual container : " << PrintContainer{a}; } else { - if (left != right) { + if (expected != actual) { // Handle std::optional(nan) - // I'm too lazy to code comparison against std::nullopt, but this shpudn't be a problem in real life. + // I'm too lazy to code comparison against std::nullopt, but this shoudn't be a problem in real life. // RN comparing against std::nullopt, you'll receive an compilation error. 
if constexpr (is_instantiation_of::value && is_instantiation_of::value) { - if (left.has_value() && right.has_value()) - return CompareRecursive(*left, *right); + if (expected.has_value() && actual.has_value()) + return CompareRecursive(*expected, *actual); } else if constexpr (is_instantiation_of::value) { - if (left) - return CompareRecursive(*left, right); + if (expected) + return CompareRecursive(*expected, actual); } else if constexpr (is_instantiation_of::value) { - if (right) - return CompareRecursive(left, *right); + if (actual) + return CompareRecursive(expected, *actual); } else if constexpr (std::is_floating_point_v && std::is_floating_point_v) { - if (std::isnan(left) && std::isnan(right)) + if (std::isnan(expected) && std::isnan(actual)) return ::testing::AssertionSuccess(); } return ::testing::AssertionFailure() - << "\nExpected value: " << left - << "\nActual value : " << right; + << "\nExpected value: " << expected + << "\nActual value : " << actual; } return ::testing::AssertionSuccess(); diff --git a/ut/utils_ut.cpp b/ut/utils_ut.cpp index bd015751..d9cd4490 100644 --- a/ut/utils_ut.cpp +++ b/ut/utils_ut.cpp @@ -2,6 +2,7 @@ #include "ut/value_generators.h" #include "utils.h" +#include #include #include #include @@ -120,3 +121,57 @@ TEST(Generators, MakeArrays) { auto arrays = MakeArrays(); ASSERT_LT(0u, arrays.size()); } + +class OutputTest : public ::testing::Test { +public: + template + static std::string ToString(const T & t) { + std::stringstream sstr; + sstr << t; + + return sstr.str(); + } +}; + +TEST_F(OutputTest, PrettyPrintByteSize) +{ + EXPECT_EQ("3 bytes", ToString(PrettyPrintByteSize{3})); + + EXPECT_EQ("30 bytes", ToString(PrettyPrintByteSize{30})); + EXPECT_EQ("300 bytes", ToString(PrettyPrintByteSize{300})); + + EXPECT_EQ("123 bytes", ToString(PrettyPrintByteSize{123})); + + for (const auto & [base, base_name] : std::initializer_list>{ + // {1, "bytes"}, + {1024, "KiB"}, + {1024*1024, "MiB"}, + {1024*1024*1024, "GiB"}, + } ) + { + 
for (const auto & [value, value_str] : std::initializer_list>{ + {1, "1"}, + {1.01, "1.01"}, + {1.10, "1.1"}, + {1.5, "1.5"}, + {3, "3"}, + {3.25, "3.25"}, + {13.75, "13.75"}, + {135.5, "135.5"}, + {135.125, "135.12"}, + {10, "10"}, + {100, "100"}, + {1000, "1000"}, + }) + { + const auto bytes_value = static_cast(base * value); + const auto expected_str = std::string(value_str) + " " + base_name; + + EXPECT_EQ(expected_str, ToString(PrettyPrintByteSize{bytes_value, 2})) + << "\n\tbase: " << base + << "\n\tbase_name: " << base_name + << "\n\tvalue: " << value + << "\n\tvalue_str: " << value_str; + } + } +} diff --git a/ut/value_generators.cpp b/ut/value_generators.cpp index f6d7baf9..45cf5a1c 100644 --- a/ut/value_generators.cpp +++ b/ut/value_generators.cpp @@ -29,25 +29,26 @@ std::vector MakeFixedStrings(size_t string_size) { std::vector MakeStrings() { return { "a", "ab", "abc", "abcd", - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. 
Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " - "long string to test how those are handled. Here goes more text. " + "A a B b C c D d E e F f G g H h I i J j K k L l M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z", + "line 0: long string to test how those are handled. Here goes more text. \n" + "line 1: long string to test how those are handled. Here goes more text. \n" + "line 2: long string to test how those are handled. Here goes more text. \n" + "line 3: long string to test how those are handled. Here goes more text. \n" + "line 4: long string to test how those are handled. Here goes more text. \n" + "line 5: long string to test how those are handled. Here goes more text. \n" + "line 6: long string to test how those are handled. Here goes more text. \n" + "line 7: long string to test how those are handled. Here goes more text. \n" + "line 8: long string to test how those are handled. Here goes more text. \n" + "line 9: long string to test how those are handled. Here goes more text. \n" + "line 10: long string to test how those are handled. Here goes more text. \n" + "line 11: long string to test how those are handled. Here goes more text. \n" + "line 12: long string to test how those are handled. Here goes more text. \n" + "line 13: long string to test how those are handled. Here goes more text. \n" + "line 14: long string to test how those are handled. Here goes more text. \n" + "line 15: long string to test how those are handled. Here goes more text. \n" + "line 16: long string to test how those are handled. Here goes more text. \n" + "line 17: long string to test how those are handled. Here goes more text. \n" + "line 18: long string to test how those are handled. Here goes more text. " }; }