Skip to content

Commit 90414cc

Browse files
committed
Better estimation for ColumnLowCardinality::Reserve and ColumnString::Reserve
ColumnLowCardinality assumes that not all items are unique, hence the dictionary column can be reserved with a smaller capacity; ColumnString now allows setting an average value size estimation in the constructor or on an existing instance. If the estimation is close to the real average value size, then memory pre-allocations are close to optimum.
1 parent 0f8b396 commit 90414cc

32 files changed

+442
-33
lines changed

clickhouse/columns/array.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ size_t ColumnArray::Size() const {
110110
return offsets_->Size();
111111
}
112112

113+
size_t ColumnArray::MemoryUsage() const {
114+
return offsets_->MemoryUsage() + data_->MemoryUsage();
115+
}
116+
113117
void ColumnArray::Swap(Column& other) {
114118
auto & col = dynamic_cast<ColumnArray &>(other);
115119
data_.swap(col.data_);

clickhouse/columns/array.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ class ColumnArray : public Column {
7171
/// Returns count of rows in the column.
7272
size_t Size() const override;
7373

74+
size_t MemoryUsage() const override;
75+
7476
/// Makes slice of the current column.
7577
ColumnRef Slice(size_t, size_t) const override;
7678
ColumnRef CloneEmpty() const override;

clickhouse/columns/column.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ class Column : public std::enable_shared_from_this<Column> {
9090

9191
virtual void Swap(Column&) = 0;
9292

93+
/// Estimated RAM usage by the column in bytes.
94+
virtual size_t MemoryUsage() const = 0;
95+
9396
/// Get a view on raw item data if it is supported by column, will throw an exception if index is out of range.
9497
/// Please note that the view is invalidated once column items are added or deleted, or the column is loaded from a stream or destroyed.
9598
virtual ItemView GetItem(size_t) const {

clickhouse/columns/date.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ size_t ColumnDate::Size() const {
6767
return data_->Size();
6868
}
6969

70+
size_t ColumnDate::MemoryUsage() const {
71+
return data_->MemoryUsage();
72+
}
73+
7074
ColumnRef ColumnDate::Slice(size_t begin, size_t len) const {
7175
auto col = data_->Slice(begin, len)->As<ColumnUInt16>();
7276
auto result = std::make_shared<ColumnDate>();
@@ -154,6 +158,10 @@ size_t ColumnDate32::Size() const {
154158
return data_->Size();
155159
}
156160

161+
size_t ColumnDate32::MemoryUsage() const {
162+
return data_->MemoryUsage();
163+
}
164+
157165
ColumnRef ColumnDate32::Slice(size_t begin, size_t len) const {
158166
auto col = data_->Slice(begin, len)->As<ColumnInt32>();
159167
auto result = std::make_shared<ColumnDate32>();
@@ -244,6 +252,10 @@ size_t ColumnDateTime::Size() const {
244252
return data_->Size();
245253
}
246254

255+
size_t ColumnDateTime::MemoryUsage() const {
256+
return data_->MemoryUsage();
257+
}
258+
247259
void ColumnDateTime::Clear() {
248260
data_->Clear();
249261
}
@@ -330,6 +342,10 @@ size_t ColumnDateTime64::Size() const {
330342
return data_->Size();
331343
}
332344

345+
size_t ColumnDateTime64::MemoryUsage() const {
346+
return data_->MemoryUsage();
347+
}
348+
333349
ItemView ColumnDateTime64::GetItem(size_t index) const {
334350
return ItemView(Type::DateTime64, data_->GetItem(index));
335351
}

clickhouse/columns/date.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class ColumnDate : public Column {
5151

5252
/// Returns count of rows in the column.
5353
size_t Size() const override;
54+
size_t MemoryUsage() const override;
5455

5556
/// Makes slice of the current column.
5657
ColumnRef Slice(size_t begin, size_t len) const override;
@@ -109,6 +110,7 @@ class ColumnDate32 : public Column {
109110

110111
/// Returns count of rows in the column.
111112
size_t Size() const override;
113+
size_t MemoryUsage() const override;
112114

113115
/// Makes slice of the current column.
114116
ColumnRef Slice(size_t begin, size_t len) const override;
@@ -170,6 +172,7 @@ class ColumnDateTime : public Column {
170172

171173
/// Returns count of rows in the column.
172174
size_t Size() const override;
175+
size_t MemoryUsage() const override;
173176

174177
/// Makes slice of the current column.
175178
ColumnRef Slice(size_t begin, size_t len) const override;
@@ -223,6 +226,7 @@ class ColumnDateTime64 : public Column {
223226

224227
/// Returns count of rows in the column.
225228
size_t Size() const override;
229+
size_t MemoryUsage() const override;
226230

227231
/// Makes slice of the current column.
228232
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/decimal.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,10 @@ size_t ColumnDecimal::Size() const {
217217
return data_->Size();
218218
}
219219

220+
size_t ColumnDecimal::MemoryUsage() const {
221+
return data_->MemoryUsage();
222+
}
223+
220224
ColumnRef ColumnDecimal::Slice(size_t begin, size_t len) const {
221225
// couldn't use std::make_shared since this c-tor is private
222226
return ColumnRef{new ColumnDecimal(type_, data_->Slice(begin, len))};

clickhouse/columns/decimal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ColumnDecimal : public Column {
2828
void SaveBody(OutputStream* output) override;
2929
void Clear() override;
3030
size_t Size() const override;
31+
size_t MemoryUsage() const override;
3132
ColumnRef Slice(size_t begin, size_t len) const override;
3233
ColumnRef CloneEmpty() const override;
3334
void Swap(Column& other) override;

clickhouse/columns/enum.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ size_t ColumnEnum<T>::Size() const {
9696
return data_.size();
9797
}
9898

99+
template <typename T>
100+
size_t ColumnEnum<T>::MemoryUsage() const {
101+
return data_.capacity() * sizeof(*data_.begin());
102+
}
103+
99104
template <typename T>
100105
ColumnRef ColumnEnum<T>::Slice(size_t begin, size_t len) const {
101106
return std::make_shared<ColumnEnum<T>>(type_, SliceVector(data_, begin, len));

clickhouse/columns/enum.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class ColumnEnum : public Column {
4747

4848
/// Returns count of rows in the column.
4949
size_t Size() const override;
50+
size_t MemoryUsage() const override;
5051

5152
/// Makes slice of the current column.
5253
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/geo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,17 @@ void ColumnGeo<NestedColumnType, type_code>::SaveBody(OutputStream* output) {
7676
data_->SaveBody(output);
7777
}
7878

79+
7980
template <typename NestedColumnType, Type::Code type_code>
8081
size_t ColumnGeo<NestedColumnType, type_code>::Size() const {
8182
return data_->Size();
8283
}
8384

85+
template <typename NestedColumnType, Type::Code type_code>
86+
size_t ColumnGeo<NestedColumnType, type_code>::MemoryUsage() const {
87+
return data_->MemoryUsage();
88+
}
89+
8490
template <typename NestedColumnType, Type::Code type_code>
8591
ColumnRef ColumnGeo<NestedColumnType, type_code>::Slice(size_t begin, size_t len) const {
8692
return std::make_shared<ColumnGeo>(data_->Slice(begin, len));

clickhouse/columns/geo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class ColumnGeo : public Column {
4646

4747
/// Returns count of rows in the column.
4848
size_t Size() const override;
49+
size_t MemoryUsage() const override;
4950

5051
/// Makes slice of the current column.
5152
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/ip4.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ size_t ColumnIPv4::Size() const {
9696
return data_->Size();
9797
}
9898

99+
size_t ColumnIPv4::MemoryUsage() const {
100+
return data_->MemoryUsage();
101+
}
102+
99103
ColumnRef ColumnIPv4::Slice(size_t begin, size_t len) const {
100104
return std::make_shared<ColumnIPv4>(data_->Slice(begin, len));
101105
}

clickhouse/columns/ip4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class ColumnIPv4 : public Column {
5656

5757
/// Returns count of rows in the column.
5858
size_t Size() const override;
59+
size_t MemoryUsage() const override;
5960

6061
/// Makes slice of the current column.
6162
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/ip6.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ size_t ColumnIPv6::Size() const {
8787
return data_->Size();
8888
}
8989

90+
size_t ColumnIPv6::MemoryUsage() const {
91+
return data_->MemoryUsage();
92+
}
93+
9094
ColumnRef ColumnIPv6::Slice(size_t begin, size_t len) const {
9195
return std::make_shared<ColumnIPv6>(data_->Slice(begin, len));
9296
}

clickhouse/columns/ip6.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class ColumnIPv6 : public Column {
5252

5353
/// Returns count of rows in the column.
5454
size_t Size() const override;
55+
size_t MemoryUsage() const override;
5556

5657
/// Makes slice of the current column.
5758
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/lowcardinality.cpp

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,39 @@ ColumnLowCardinality::ColumnLowCardinality(std::shared_ptr<ColumnNullable> dicti
174174
ColumnLowCardinality::~ColumnLowCardinality()
175175
{}
176176

177+
namespace
{

/// Estimates the capacity to reserve for the dictionary column of a LowCardinality column,
/// given the capacity `new_cap` requested for the index column.
///
/// Heuristic:
///  - Small columns (fewer than `max_ratio_at` items) are assumed to consist mostly of unique
///    items, so the dictionary gets the same capacity as the index column.
///  - Large columns (`min_ratio_at` items or more) are assumed to contain at least 80% of
///    duplicates, so the dictionary gets `min_ratio` (0.20) of the index column capacity.
///  - For medium-sized columns the ratio falls linearly from `max_ratio` at `max_ratio_at`
///    down to `min_ratio` at `min_ratio_at`.
///
/// @param new_cap capacity requested for the index column, in items.
/// @return estimated dictionary capacity in items, rounded down; never greater than `new_cap`.
size_t EstimateDictionaryCapacity(size_t new_cap)
{
    constexpr double max_ratio = 1.0;
    constexpr double min_ratio = 0.20;
    // Thresholds are kept integral so that comparisons against `new_cap`
    // do not go through an implicit integer-to-floating-point conversion.
    constexpr size_t max_ratio_at = 128;
    constexpr size_t min_ratio_at = 512;

    if (new_cap < max_ratio_at)
        return new_cap;

    if (new_cap >= min_ratio_at)
        return static_cast<size_t>(static_cast<double>(new_cap) * min_ratio);

    // Here max_ratio_at <= new_cap < min_ratio_at, so the subtraction cannot underflow
    // and `progress` lies in [0, 1).
    const double progress = static_cast<double>(new_cap - max_ratio_at)
            / static_cast<double>(min_ratio_at - max_ratio_at);

    // Ratio of the dictionary capacity to the index column capacity.
    const double ratio = max_ratio - progress * (max_ratio - min_ratio);

    return static_cast<size_t>(static_cast<double>(new_cap) * ratio);
}

}
205+
177206
void ColumnLowCardinality::Reserve(size_t new_cap) {
178-
dictionary_column_->Reserve(new_cap);
179207
index_column_->Reserve(new_cap);
208+
209+
dictionary_column_->Reserve(EstimateDictionaryCapacity(new_cap));
180210
}
181211

182212
void ColumnLowCardinality::Setup(ColumnRef dictionary_column) {
@@ -379,6 +409,13 @@ size_t ColumnLowCardinality::Size() const {
379409
return index_column_->Size();
380410
}
381411

412+
size_t ColumnLowCardinality::MemoryUsage() const {
413+
return unique_items_map_.bucket_count() * unique_items_map_.max_load_factor()
414+
* (sizeof(unique_items_map_.begin()->first) + sizeof(unique_items_map_.begin()->second))
415+
+ index_column_->MemoryUsage()
416+
+ dictionary_column_->MemoryUsage();
417+
}
418+
382419
ColumnRef ColumnLowCardinality::Slice(size_t begin, size_t len) const {
383420
begin = std::min(begin, Size());
384421
len = std::min(len, Size() - begin);
@@ -451,15 +488,13 @@ void ColumnLowCardinality::AppendUnsafe(const ItemView & value) {
451488
}
452489
}
453490

454-
void ColumnLowCardinality::AppendNullItem()
455-
{
491+
void ColumnLowCardinality::AppendNullItem() {
456492
const auto null_item = GetNullItemForDictionary(dictionary_column_);
457493
AppendToDictionary(*dictionary_column_, null_item);
458494
unique_items_map_.emplace(computeHashKey(null_item), 0);
459495
}
460496

461-
void ColumnLowCardinality::AppendDefaultItem()
462-
{
497+
void ColumnLowCardinality::AppendDefaultItem() {
463498
const auto defaultItem = GetDefaultItemForDictionary(dictionary_column_);
464499
unique_items_map_.emplace(computeHashKey(defaultItem), dictionary_column_->Size());
465500
AppendToDictionary(*dictionary_column_, defaultItem);

clickhouse/columns/lowcardinality.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ class ColumnLowCardinality : public Column {
8787

8888
/// Returns count of rows in the column.
8989
size_t Size() const override;
90+
size_t MemoryUsage() const override;
9091

9192
/// Makes slice of current column, with compacted dictionary
9293
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/map.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ size_t ColumnMap::Size() const {
6767
return data_->Size();
6868
}
6969

70+
size_t ColumnMap::MemoryUsage() const {
71+
return data_->MemoryUsage();
72+
}
73+
7074
ColumnRef ColumnMap::Slice(size_t begin, size_t len) const {
7175
return std::make_shared<ColumnMap>(data_->Slice(begin, len));
7276
}

clickhouse/columns/map.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class ColumnMap : public Column {
4848

4949
/// Returns count of rows in the column.
5050
size_t Size() const override;
51+
size_t MemoryUsage() const override;
5152

5253
/// Makes slice of the current column.
5354
ColumnRef Slice(size_t, size_t) const override;

clickhouse/columns/nothing.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ class ColumnNothing : public Column {
7575
/// Returns count of rows in the column.
7676
size_t Size() const override { return size_; }
7777

78+
size_t MemoryUsage() const override { return 0; }
79+
7880
void Swap(Column& other) override {
7981
auto & col = dynamic_cast<ColumnNothing &>(other);
8082
std::swap(size_, col.size_);

clickhouse/columns/nullable.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ size_t ColumnNullable::Size() const {
8282
return nulls_->Size();
8383
}
8484

85+
size_t ColumnNullable::MemoryUsage() const {
86+
return nested_->MemoryUsage() + nulls_->MemoryUsage();
87+
}
88+
8589
ColumnRef ColumnNullable::Slice(size_t begin, size_t len) const {
8690
return std::make_shared<ColumnNullable>(nested_->Slice(begin, len), nulls_->Slice(begin, len));
8791
}

clickhouse/columns/nullable.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class ColumnNullable : public Column {
5050

5151
/// Returns count of rows in the column.
5252
size_t Size() const override;
53+
size_t MemoryUsage() const override;
5354

5455
/// Makes slice of the current column.
5556
ColumnRef Slice(size_t begin, size_t len) const override;

clickhouse/columns/numeric.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ size_t ColumnVector<T>::Size() const {
8787
return data_.size();
8888
}
8989

90+
template <typename T>
91+
size_t ColumnVector<T>::MemoryUsage() const {
92+
return data_.capacity() * sizeof(data_[0]);
93+
}
94+
9095
template <typename T>
9196
ColumnRef ColumnVector<T>::Slice(size_t begin, size_t len) const {
9297
return std::make_shared<ColumnVector<T>>(SliceVector(data_, begin, len));

clickhouse/columns/numeric.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class ColumnVector : public Column {
5757

5858
/// Makes slice of the current column.
5959
ColumnRef Slice(size_t begin, size_t len) const override;
60+
size_t MemoryUsage() const override;
6061
ColumnRef CloneEmpty() const override;
6162
void Swap(Column& other) override;
6263

0 commit comments

Comments
 (0)