From b9f4a958b326f5b02f8bf07679ab7e623b57a81d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:40:39 +0200 Subject: [PATCH 1/8] Overhaul StringPool, make pointer stable and better performance --- zasm/include/zasm/core/stringpool.hpp | 368 +++++++++++++++++++++----- 1 file changed, 302 insertions(+), 66 deletions(-) diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp index bd413cb..60c41f7 100644 --- a/zasm/include/zasm/core/stringpool.hpp +++ b/zasm/include/zasm/core/stringpool.hpp @@ -1,9 +1,11 @@ #pragma once #include +#include #include #include #include +#include #include #include #include @@ -16,34 +18,59 @@ namespace zasm class StringPool { + public: + enum class Id : std::int32_t + { + Invalid = -1, + }; + + static constexpr std::size_t kMaxStringSize = 0xFFFF; + static constexpr std::size_t kAverageStringSize = 24; + static constexpr std::size_t kStringsPerBlock = 4096; + static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock; + static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 }; + + // 16 bit prime number. + static constexpr std::size_t kMaxHashBuckets = 39119; + + private: struct Entry { std::size_t hash{}; - std::int32_t offset{}; + std::size_t blockIndex{}; + std::size_t sortedIndex{}; + std::int32_t offsetInBlock{}; std::int32_t len{}; std::int32_t capacity{}; std::int32_t refCount{}; }; std::vector _entries; - std::vector _data; + std::vector _freeEntries; + std::vector> _hashBuckets; - public: - enum class Id : std::int32_t + struct Block { - Invalid = -1, + std::size_t index{}; + std::size_t used{}; + std::array data{}; }; + std::vector> _blocks; + public: - // NOLINTNEXTLINE - template Id aquire(const char (&value)[N]) + StringPool() { - return aquire_(value, N); + _hashBuckets.resize(kMaxHashBuckets); } - Id aquire(const char* value) + Id aquire(const char* value, std::size_t size = kUnspecifiedSize) { - return aquire_(value, strlen(value)); + if (size == kUnspecifiedSize) + { + size = std::strlen(value); + } + return aquire_(value, size); } Id aquire(const std::string& val) @@ -60,14 +87,26 @@ namespace zasm } const auto newRefCount = --entry->refCount; + assert(newRefCount >= 0); + + if (newRefCount == 0) + { + _freeEntries.push_back(stringId); + + const auto bucketIndex = entry->hash % kMaxHashBuckets; + auto& bucket = _hashBuckets[bucketIndex]; + + bucket.erase(bucket.begin() + entry->sortedIndex); + } + return newRefCount; } - // NOLINTNEXTLINE - template Id find(const char (&value)[N]) const noexcept + Id find(const char* str) const noexcept { - const auto hash = getHash(value, N); - return find(value, N, hash); + const auto len = std::strlen(str); + const auto hash = getHash(str, len); + return find_(str, len, hash); } bool isValid(Id stringId) const noexcept @@ -84,7 +123,7 @@ namespace zasm return nullptr; } - return _data.data() + entry->offset; + return _blocks[entry->blockIndex]->data.data() + entry->offsetInBlock; } int32_t getLength(Id stringId) const noexcept @@ -111,12 +150,7 @@ namespace zasm void clear() noexcept { _entries.clear(); - _data.clear(); - } - - const char* data() const noexcept - { - return _data.data(); + _blocks.clear(); } std::size_t size() const noexcept @@ -126,6 +160,7 @@ namespace zasm Error save(IStream& stream) const { + // Serialize entries. const auto entryCount = static_cast(_entries.size()); if (auto len = stream.write(&entryCount, sizeof(entryCount)); len == 0) { @@ -138,7 +173,15 @@ namespace zasm { return ErrorCode::InvalidParameter; } - if (auto len = stream.write(entry.offset); len == 0) + if (auto len = stream.write(entry.blockIndex); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(entry.sortedIndex); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(entry.offsetInBlock); len == 0) { return ErrorCode::InvalidParameter; } @@ -156,15 +199,57 @@ namespace zasm } } - const auto dataSize = static_cast(_data.size()); - if (auto len = stream.write(dataSize); len == 0) + // Serialize free entries. + const auto freeEntryCount = static_cast(_freeEntries.size()); + if (auto len = stream.write(&freeEntryCount, sizeof(freeEntryCount)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + for (const auto& freeEntry : _freeEntries) + { + if (auto len = stream.write(freeEntry); len == 0) + { + return ErrorCode::InvalidParameter; + } + } + + // Serialize hash buckets. + for (const auto& bucket : _hashBuckets) + { + const auto bucketSize = static_cast(bucket.size()); + if (auto len = stream.write(&bucketSize, sizeof(bucketSize)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + for (const auto& id : bucket) + { + if (auto len = stream.write(id); len == 0) + { + return ErrorCode::InvalidParameter; + } + } + } + + // Serialize blocks. + const auto blockCount = static_cast(_blocks.size()); + if (auto len = stream.write(&blockCount, sizeof(blockCount)); len == 0) { return ErrorCode::InvalidParameter; } - if (dataSize > 0) + for (const auto& block : _blocks) { - if (auto len = stream.write(_data.data(), dataSize); len == 0) + if (auto len = stream.write(block->index); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(block->used); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(block->data.data(), block->used); len == 0) { return ErrorCode::InvalidParameter; } @@ -175,8 +260,7 @@ namespace zasm Error load(IStream& stream) { - clear(); - + // Deserialize entries. std::uint32_t entryCount{}; if (auto len = stream.read(&entryCount, sizeof(entryCount)); len == 0) { @@ -191,7 +275,15 @@ namespace zasm { return ErrorCode::InvalidParameter; } - if (auto len = stream.read(entry.offset); len == 0) + if (auto len = stream.read(entry.blockIndex); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(entry.sortedIndex); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(entry.offsetInBlock); len == 0) { return ErrorCode::InvalidParameter; } @@ -209,24 +301,76 @@ namespace zasm } } - std::uint32_t dataSize{}; - if (auto len = stream.read(&dataSize, sizeof(dataSize)); len == 0) + // Deserialize free entries. + std::uint32_t freeEntryCount{}; + if (auto len = stream.read(&freeEntryCount, sizeof(freeEntryCount)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + auto freeEntries = std::vector(); + freeEntries.resize(freeEntryCount); + for (auto& freeEntry : freeEntries) + { + if (auto len = stream.read(freeEntry); len == 0) + { + return ErrorCode::InvalidParameter; + } + } + + // Deserialize hash buckets. + auto hashBuckets = std::vector>(); + hashBuckets.resize(kMaxHashBuckets); + for (auto& bucket : hashBuckets) + { + std::uint32_t bucketSize{}; + if (auto len = stream.read(&bucketSize, sizeof(bucketSize)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + bucket.resize(bucketSize); + for (auto& id : bucket) + { + if (auto len = stream.read(id); len == 0) + { + return ErrorCode::InvalidParameter; + } + } + } + + // Deserialize blocks. + std::uint32_t blockCount{}; + if (auto len = stream.read(&blockCount, sizeof(blockCount)); len == 0) { return ErrorCode::InvalidParameter; } - std::vector loadedData; - if (dataSize > 0) + auto blocks = std::vector>(); + blocks.resize(blockCount); + for (auto& block : blocks) { - loadedData.resize(dataSize); - if (auto len = stream.read(loadedData.data(), dataSize); len == 0) + block = std::make_unique(); + + if (auto len = stream.read(block->index); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(block->used); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(block->data.data(), block->used); len == 0) { return ErrorCode::InvalidParameter; } } + // Swap state. _entries = std::move(loadedEntries); - _data = std::move(loadedData); + _freeEntries = std::move(freeEntries); + _hashBuckets = std::move(hashBuckets); + _blocks = std::move(blocks); return ErrorCode::None; } @@ -241,7 +385,13 @@ namespace zasm { return nullptr; } - if (self._entries[idx].refCount <= 0) + + const auto refCount = self._entries[idx].refCount; +#ifndef IN_TESTS + assert(refCount > 0); +#endif + + if (refCount <= 0) { return nullptr; } @@ -251,29 +401,80 @@ namespace zasm Id find_(const char* buf, std::size_t len, std::size_t hash) const noexcept { - auto it = std::find_if(std::begin(_entries), std::end(_entries), [&](const auto& entry) noexcept { - if (entry.refCount == 0 || entry.hash != hash || entry.len != len) - { - return false; - } - const char* str = _data.data() + entry.offset; - return memcmp(buf, str, len) == 0; + const auto bucketIndex = hash % kMaxHashBuckets; + const auto& bucket = _hashBuckets[bucketIndex]; + + // Find the lowest hash. + auto itLowest = std::lower_bound(bucket.begin(), bucket.end(), Id::Invalid, [&](Id id, Id) { + return _entries[static_cast(id)].hash < hash; }); - if (it != std::end(_entries)) + + // Iterate all matching hashes and compare the string. + for (auto it = itLowest; it != bucket.end(); ++it) { - const auto index = std::distance(_entries.begin(), it); - return static_cast(index); + const auto id = *it; + const auto& entry = _entries[static_cast(id)]; + if (entry.hash != hash) + { + break; + } + + if (entry.len != len) + { + continue; + } + + const auto* str = _blocks[entry.blockIndex]->data.data() + entry.offsetInBlock; + if (std::memcmp(str, buf, len) == 0) + { + return id; + } } + return Id::Invalid; } - Id aquire_(const char* buf, std::size_t len) + Block& getBlock(std::size_t len) { - constexpr std::int32_t kTerminatorLength = 1; + // If there are no blocks create a new one. + if (_blocks.empty()) + { + _blocks.emplace_back(std::make_unique()); + return *_blocks.back(); + } - const auto hash = getHash(buf, len); + // See if the last block has enough space. + auto& lastBlock = *_blocks.back(); + if (lastBlock.used + len < kBlockSize) + { + return lastBlock; + } + + // Create a new block. + auto newBlock = std::make_unique(); + newBlock->index = _blocks.size(); + _blocks.emplace_back(std::move(newBlock)); + + return *_blocks.back(); + } - auto stringId = find_(buf, len, hash); + Id aquire_(const char* inputStr, std::size_t len) + { + const auto hash = getHash(inputStr, len); + return aquire_(inputStr, len, hash); + } + + Id aquire_(const char* inputStr, std::size_t len, std::size_t hash) + { + // Strings can not be larger than kMaxStringSize. + if (len >= kMaxStringSize) + { + assert(len < kMaxStringSize); + return Id::Invalid; + } + + // Find existing string and increase ref count. + auto stringId = find_(inputStr, len, hash); if (stringId != Id::Invalid) { auto& entry = _entries[static_cast(stringId)]; @@ -282,48 +483,82 @@ namespace zasm } const auto len2 = static_cast(len); + constexpr std::int32_t kTerminatorLength = 1; // Use empty entry if any exist. - auto itEntry = std::find_if(_entries.begin(), _entries.end(), [&](auto&& entry) { + // TODO: Optimize this, we should prioritize finding entries at the end of the list. + // This would mean shrinking the vector has less to copy. + auto itEntry = std::find_if(_freeEntries.begin(), _freeEntries.end(), [&](Id id) { + auto& entry = _entries[static_cast(id)]; return entry.refCount <= 0 && entry.capacity >= len2 + kTerminatorLength; }); - if (itEntry != _entries.end()) - { - // Found empty space. - auto& entry = *itEntry; + const auto insertToHashBucket = [&](Entry& entry, Id id) { + const auto bucketIndex = entry.hash % kMaxHashBuckets; + auto& bucket = _hashBuckets[bucketIndex]; + + auto sortedIt = std::lower_bound(bucket.begin(), bucket.end(), id, [&](Id id, Id id2) { + return _entries[static_cast(id)].hash < _entries[static_cast(id2)].hash; + }); + + const auto sortedIndex = static_cast(std::distance(bucket.begin(), sortedIt)); + bucket.insert(sortedIt, id); - stringId = static_cast(std::distance(_entries.begin(), itEntry)); - std::memcpy(_data.data() + entry.offset, buf, len2); + return sortedIndex; + }; + const auto writeStringToBlock = [&](std::size_t blockOffset, Block& block) { + std::memcpy(block.data.data() + blockOffset, inputStr, len2); // Ensure null termination. - const auto termOffset = static_cast(entry.offset) + len2; - _data[termOffset] = '\0'; + block.data[blockOffset + len2] = '\0'; + }; + + if (itEntry != _freeEntries.end()) + { + // Found empty space. + stringId = *itEntry; + auto& entry = _entries[static_cast(stringId)]; + + auto& block = _blocks[entry.blockIndex]; + writeStringToBlock(entry.offsetInBlock, *block); entry.hash = hash; entry.len = len2; entry.refCount = 1; + entry.sortedIndex = insertToHashBucket(entry, stringId); + + _freeEntries.erase(itEntry); return stringId; } // New entry. - const auto offset = static_cast(_data.size()); - std::copy_n(buf, len, std::back_insert_iterator(_data)); + auto& block = getBlock(len2 + kTerminatorLength); + const auto offset = static_cast(block.used); - // Ensure null termination. - _data.push_back('\0'); + writeStringToBlock(offset, block); + block.used += len2 + kTerminatorLength; stringId = static_cast(_entries.size()); - _entries.push_back({ hash, offset, len2, len2 + kTerminatorLength, 1 }); + auto& entry = _entries.emplace_back(); + entry.hash = hash; + entry.blockIndex = block.index; + entry.offsetInBlock = offset; + entry.len = len2; + entry.capacity = len2 + kTerminatorLength; + entry.refCount = 1; + entry.sortedIndex = insertToHashBucket(entry, stringId); return stringId; } static constexpr std::size_t getHash(const char* buf, size_t len) noexcept { - if (buf == nullptr) + assert(buf != nullptr); + assert(len > 0); + + if (buf == nullptr || len == 0) { return 0; } @@ -340,6 +575,7 @@ namespace zasm result ^= static_cast(buf[i]); result *= prime; } + return result; } }; From 1bf84c17a04e4be9697e0d13fbe5f793111e259e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:41:00 +0200 Subject: [PATCH 2/8] Improve tests for StringPool --- tests/src/tests/tests.stringpool.cpp | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp index 5630da1..c05c513 100644 --- a/tests/src/tests/tests.stringpool.cpp +++ b/tests/src/tests/tests.stringpool.cpp @@ -1,4 +1,6 @@ #include + +#define IN_TESTS #include namespace zasm::tests @@ -87,4 +89,38 @@ namespace zasm::tests ASSERT_EQ(pool.getRefCount(id5), 1); } + TEST(StringPoolTests, TestManyStrings) + { + StringPool pool; + + std::vector ids; + + constexpr auto kTestSize = 1'000'000; + for (auto i = 0U; i < kTestSize; i++) + { + std::string str = std::string("hello") + std::to_string(i); + + const auto id = pool.aquire(str); + ASSERT_NE(id, StringPool::Id::Invalid); + ASSERT_EQ(pool.getLength(id), str.size()); + ASSERT_EQ(pool.getRefCount(id), 1); + + const auto* cstr = pool.get(id); + ASSERT_NE(cstr, nullptr); + ASSERT_EQ(strcmp(cstr, str.c_str()), 0); + + ids.push_back(id); + } + + // Validate that the contents still match. + for (auto i = 0U; i < kTestSize; i++) + { + std::string str = std::string("hello") + std::to_string(i); + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr); + ASSERT_EQ(strcmp(cstr, str.c_str()), 0); + } + } + } // namespace zasm::tests From 3bd31972f9892b173dbfdbe7703a0ac2f7e24a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:41:12 +0200 Subject: [PATCH 3/8] Improve benchmark for StringPool --- .../src/benchmarks/benchmark.stringpool.cpp | 83 +++++++++++++------ 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp index 2d82adc..03951a9 100644 --- a/benchmark/src/benchmarks/benchmark.stringpool.cpp +++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp @@ -1,73 +1,102 @@ #include +#include #include namespace zasm::benchmarks { - static const std::string TestStrings[] = { "hello", "world", "longer string", "even longer string", - "even longer longer string" }; + static constexpr auto kTestSize = 500'000; + static constexpr const char kChars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + + static const std::vector kInputStrings = []() { + std::vector strings; + + std::mt19937 prng(42); + for (int i = 0; i < kTestSize; ++i) + { + std::string str; + for (size_t i = 0; i < 4 + (prng() % 24); ++i) + { + str.push_back(kChars[prng() % (sizeof(kChars) - 1)]); + } + strings.push_back(std::move(str)); + } + return strings; + }(); static void BM_StringPool_Aquire(benchmark::State& state) { StringPool pool; for (auto _ : state) { - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - benchmark::DoNotOptimize(stringId); + auto stringId = pool.aquire(str); + benchmark::DoNotOptimize(stringId); + } } } - BENCHMARK(BM_StringPool_Aquire)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_Aquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_Release(benchmark::State& state) { StringPool pool; for (auto _ : state) { - state.PauseTiming(); - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + state.PauseTiming(); + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - state.ResumeTiming(); + auto stringId = pool.aquire(str); + state.ResumeTiming(); - auto refCount = pool.release(stringId); - benchmark::DoNotOptimize(refCount); + auto refCount = pool.release(stringId); + benchmark::DoNotOptimize(refCount); + } } } - BENCHMARK(BM_StringPool_Release)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_Release)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_Get(benchmark::State& state) { StringPool pool; for (auto _ : state) { - state.PauseTiming(); - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + state.PauseTiming(); + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - state.ResumeTiming(); + auto stringId = pool.aquire(str); + state.ResumeTiming(); - const char* res = pool.get(stringId); - benchmark::DoNotOptimize(res); + const char* res = pool.get(stringId); + benchmark::DoNotOptimize(res); + } } } - BENCHMARK(BM_StringPool_Get)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_Get)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_GetLength(benchmark::State& state) { StringPool pool; for (auto _ : state) { - state.PauseTiming(); - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + state.PauseTiming(); + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - state.ResumeTiming(); + auto stringId = pool.aquire(str); + state.ResumeTiming(); - auto strLen = pool.getLength(stringId); - benchmark::DoNotOptimize(strLen); + auto strLen = pool.getLength(stringId); + benchmark::DoNotOptimize(strLen); + } } } - BENCHMARK(BM_StringPool_GetLength)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_GetLength)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); } // namespace zasm::benchmarks From 07dfa78f67bd527619fd8b714d27879b03fddd73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:51:34 +0200 Subject: [PATCH 4/8] Actually clear the StringPool --- zasm/include/zasm/core/stringpool.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp index 60c41f7..79b55bd 100644 --- a/zasm/include/zasm/core/stringpool.hpp +++ b/zasm/include/zasm/core/stringpool.hpp @@ -151,6 +151,11 @@ namespace zasm { _entries.clear(); _blocks.clear(); + _freeEntries.clear(); + for (auto& bucket : _hashBuckets) + { + bucket.clear(); + } } std::size_t size() const noexcept From 85605fa83d32158c60ee0a28e6155b40fb4e0e3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:35:01 +0200 Subject: [PATCH 5/8] More refinements --- .../src/benchmarks/benchmark.stringpool.cpp | 34 ++++ tests/src/tests/tests.stringpool.cpp | 124 ++++++++++++-- zasm/include/zasm/core/stringpool.hpp | 159 +++++++++++------- 3 files changed, 242 insertions(+), 75 deletions(-) diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp index 03951a9..df5fdf4 100644 --- a/benchmark/src/benchmarks/benchmark.stringpool.cpp +++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp @@ -59,6 +59,40 @@ namespace zasm::benchmarks } BENCHMARK(BM_StringPool_Release)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); + static void BM_StringPool_Reuse(benchmark::State& state) + { + StringPool pool; + for (auto _ : state) + { + std::vector stringIds; + + state.PauseTiming(); + for (auto i = 0; i < state.range(0); ++i) + { + const auto& str = kInputStrings[i]; + + auto stringId = pool.aquire(str); + stringIds.push_back(stringId); + } + + // Clear. + for (auto i = 0; i < state.range(0); ++i) + { + auto stringId = pool.release(stringIds[i]); + } + state.ResumeTiming(); + + for (auto i = 0; i < state.range(0); ++i) + { + const auto& str = kInputStrings[i]; + + auto stringId = pool.aquire(str); + benchmark::DoNotOptimize(stringId); + } + } + } + BENCHMARK(BM_StringPool_Reuse)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); + static void BM_StringPool_Get(benchmark::State& state) { StringPool pool; diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp index c05c513..2439aa6 100644 --- a/tests/src/tests/tests.stringpool.cpp +++ b/tests/src/tests/tests.stringpool.cpp @@ -1,4 +1,5 @@ #include +#include #define IN_TESTS #include @@ -84,22 +85,36 @@ namespace zasm::tests const auto id5 = pool.aquire(str5); ASSERT_NE(id5, StringPool::Id::Invalid); - ASSERT_NE(id5, id1); + ASSERT_EQ(id5, id1); ASSERT_EQ(pool.getLength(id5), std::size(str5) - 1); ASSERT_EQ(pool.getRefCount(id5), 1); } - TEST(StringPoolTests, TestManyStrings) - { - StringPool pool; + constexpr auto kTestSize = 100'000; - std::vector ids; + static constexpr const char kChars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - constexpr auto kTestSize = 1'000'000; - for (auto i = 0U; i < kTestSize; i++) + static const std::vector kInputStrings = []() { + std::vector strings; + std::set uniqueStrings; + std::mt19937 prng(42); + while (uniqueStrings.size() < kTestSize) { - std::string str = std::string("hello") + std::to_string(i); + std::string str; + for (size_t i = 0; i < 4 + (prng() % 24); ++i) + { + str.push_back(kChars[prng() % (sizeof(kChars) - 1)]); + } + uniqueStrings.emplace(std::move(str)); + } + strings.insert(strings.end(), uniqueStrings.begin(), uniqueStrings.end()); + return strings; + }(); + const auto fillPool = [](StringPool& pool, std::vector& ids) { + ids.clear(); + for (const auto& str : kInputStrings) + { const auto id = pool.aquire(str); ASSERT_NE(id, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id), str.size()); @@ -111,16 +126,103 @@ namespace zasm::tests ids.push_back(id); } + ASSERT_EQ(ids.size(), kInputStrings.size()); + }; + + TEST(StringPoolTests, TestManyStrings) + { + StringPool pool; + std::vector ids; + + fillPool(pool, ids); // Validate that the contents still match. - for (auto i = 0U; i < kTestSize; i++) + for (size_t i = 0; i < kTestSize; i++) { - std::string str = std::string("hello") + std::to_string(i); - + const auto& str = kInputStrings[i]; const auto* cstr = pool.get(ids[i]); ASSERT_NE(cstr, nullptr); ASSERT_EQ(strcmp(cstr, str.c_str()), 0); } } + TEST(StringPoolTests, TestClearIndividually) + { + StringPool pool; + std::vector ids; + + fillPool(pool, ids); + + // Clear. + for (auto id : ids) + { + ASSERT_EQ(pool.release(id), 0); + } + + ASSERT_EQ(pool.size(), 0); + } + + TEST(StringPoolTests, TestReuseSingle) + { + StringPool pool; + std::vector ids; + + fillPool(pool, ids); + + // Validate strings. + for (auto i = 0U; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr) << "i = " << i; + ASSERT_EQ(strcmp(cstr, str.c_str()), 0) << "i = " << i; + } + + // Clear. + for (auto id : ids) + { + ASSERT_EQ(pool.release(id), 0); + } + + fillPool(pool, ids); + + // Validate strings. + for (auto i = 0U; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr) << "i = " << i; + ASSERT_EQ(strcmp(cstr, str.c_str()), 0) << "i = " << i; + } + } + + TEST(StringPoolTests, TestReuseMultiple) + { + StringPool pool; + std::vector ids; + + for (size_t i = 0; i < 5; i++) + { + fillPool(pool, ids); + + // Validate that the contents still match. + for (auto i = 0U; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr); + ASSERT_EQ(strcmp(cstr, str.c_str()), 0); + } + + // Clear. + for (auto id : ids) + { + ASSERT_EQ(pool.release(id), 0); + } + } + } + } // namespace zasm::tests diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp index 79b55bd..162e198 100644 --- a/zasm/include/zasm/core/stringpool.hpp +++ b/zasm/include/zasm/core/stringpool.hpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -29,6 +28,9 @@ namespace zasm static constexpr std::size_t kStringsPerBlock = 4096; static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock; static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 }; + static constexpr std::size_t kMinStringCapacity = 16; + + static_assert(kBlockSize >= kMaxStringSize, "Block size must be bigger than max string size."); // 16 bit prime number. static constexpr std::size_t kMaxHashBuckets = 39119; @@ -38,16 +40,17 @@ namespace zasm { std::size_t hash{}; std::size_t blockIndex{}; - std::size_t sortedIndex{}; std::int32_t offsetInBlock{}; std::int32_t len{}; std::int32_t capacity{}; std::int32_t refCount{}; + Id nextFreeId{ Id::Invalid }; }; std::vector _entries; - std::vector _freeEntries; std::vector> _hashBuckets; + Id _nextFreeId{ Id::Invalid }; + std::size_t _numFree{}; struct Block { @@ -91,12 +94,14 @@ namespace zasm if (newRefCount == 0) { - _freeEntries.push_back(stringId); + entry->nextFreeId = _nextFreeId; + _nextFreeId = stringId; + _numFree++; const auto bucketIndex = entry->hash % kMaxHashBuckets; auto& bucket = _hashBuckets[bucketIndex]; - bucket.erase(bucket.begin() + entry->sortedIndex); + bucket.erase(std::remove(bucket.begin(), bucket.end(), stringId), bucket.end()); } return newRefCount; @@ -150,8 +155,13 @@ namespace zasm void clear() noexcept { _entries.clear(); - _blocks.clear(); - _freeEntries.clear(); + if (_blocks.size() > 1) + { + _blocks.resize(1); + _blocks[0]->used = 0; + } + _nextFreeId = Id::Invalid; + _numFree = 0; for (auto& bucket : _hashBuckets) { bucket.clear(); @@ -160,7 +170,7 @@ namespace zasm std::size_t size() const noexcept { - return _entries.size(); + return _entries.size() - _numFree; } Error save(IStream& stream) const @@ -182,10 +192,6 @@ namespace zasm { return ErrorCode::InvalidParameter; } - if (auto len = stream.write(entry.sortedIndex); len == 0) - { - return ErrorCode::InvalidParameter; - } if (auto len = stream.write(entry.offsetInBlock); len == 0) { return ErrorCode::InvalidParameter; @@ -202,23 +208,18 @@ namespace zasm { return ErrorCode::InvalidParameter; } + if (auto len = stream.write(entry.nextFreeId); len == 0) + { + return ErrorCode::InvalidParameter; + } } // Serialize free entries. - const auto freeEntryCount = static_cast(_freeEntries.size()); - if (auto len = stream.write(&freeEntryCount, sizeof(freeEntryCount)); len == 0) + if (auto len = stream.write(&_nextFreeId, sizeof(_nextFreeId)); len == 0) { return ErrorCode::InvalidParameter; } - for (const auto& freeEntry : _freeEntries) - { - if (auto len = stream.write(freeEntry); len == 0) - { - return ErrorCode::InvalidParameter; - } - } - // Serialize hash buckets. for (const auto& bucket : _hashBuckets) { @@ -284,10 +285,6 @@ namespace zasm { return ErrorCode::InvalidParameter; } - if (auto len = stream.read(entry.sortedIndex); len == 0) - { - return ErrorCode::InvalidParameter; - } if (auto len = stream.read(entry.offsetInBlock); len == 0) { return ErrorCode::InvalidParameter; @@ -304,25 +301,19 @@ namespace zasm { return ErrorCode::InvalidParameter; } + if (auto len = stream.read(entry.nextFreeId); len == 0) + { + return ErrorCode::InvalidParameter; + } } // Deserialize free entries. - std::uint32_t freeEntryCount{}; - if (auto len = stream.read(&freeEntryCount, sizeof(freeEntryCount)); len == 0) + Id nextFreeEntry{ Id::Invalid }; + if (auto len = stream.read(&nextFreeEntry, sizeof(nextFreeEntry)); len == 0) { return ErrorCode::InvalidParameter; } - auto freeEntries = std::vector(); - freeEntries.resize(freeEntryCount); - for (auto& freeEntry : freeEntries) - { - if (auto len = stream.read(freeEntry); len == 0) - { - return ErrorCode::InvalidParameter; - } - } - // Deserialize hash buckets. auto hashBuckets = std::vector>(); hashBuckets.resize(kMaxHashBuckets); @@ -373,9 +364,17 @@ namespace zasm // Swap state. _entries = std::move(loadedEntries); - _freeEntries = std::move(freeEntries); _hashBuckets = std::move(hashBuckets); _blocks = std::move(blocks); + _nextFreeId = nextFreeEntry; + + // Compute free count. + _numFree = 0; + for (auto freeId = _nextFreeId; freeId != Id::Invalid; + freeId = _entries[static_cast(freeId)].nextFreeId) + { + _numFree++; + } return ErrorCode::None; } @@ -463,6 +462,42 @@ namespace zasm return *_blocks.back(); } + Id getFreeEntry(std::size_t requiredLength) + { + if (_nextFreeId == Id::Invalid) + { + return Id::Invalid; + } + + auto nextFreeId = _nextFreeId; + auto parentId = Id::Invalid; + + while (nextFreeId != Id::Invalid) + { + const auto& entry = _entries[static_cast(nextFreeId)]; + if (entry.capacity >= requiredLength) + { + if (parentId != Id::Invalid) + { + // Update the next free id of the parent. + auto& parentEntry = _entries[static_cast(parentId)]; + parentEntry.nextFreeId = entry.nextFreeId; + } + else + { + // Update the head of the free list. + _nextFreeId = entry.nextFreeId; + } + _numFree--; + return nextFreeId; + } + parentId = nextFreeId; + nextFreeId = entry.nextFreeId; + } + + return Id::Invalid; + } + Id aquire_(const char* inputStr, std::size_t len) { const auto hash = getHash(inputStr, len); @@ -487,16 +522,8 @@ namespace zasm return stringId; } - const auto len2 = static_cast(len); - constexpr std::int32_t kTerminatorLength = 1; - - // Use empty entry if any exist. - // TODO: Optimize this, we should prioritize finding entries at the end of the list. - // This would mean shrinking the vector has less to copy. - auto itEntry = std::find_if(_freeEntries.begin(), _freeEntries.end(), [&](Id id) { - auto& entry = _entries[static_cast(id)]; - return entry.refCount <= 0 && entry.capacity >= len2 + kTerminatorLength; - }); + const auto actualLength = static_cast(len); + const auto requiredLength = static_cast(len) + 1; const auto insertToHashBucket = [&](Entry& entry, Id id) { const auto bucketIndex = entry.hash % kMaxHashBuckets; @@ -506,43 +533,46 @@ namespace zasm return _entries[static_cast(id)].hash < _entries[static_cast(id2)].hash; }); - const auto sortedIndex = static_cast(std::distance(bucket.begin(), sortedIt)); bucket.insert(sortedIt, id); - - return sortedIndex; }; const auto writeStringToBlock = [&](std::size_t blockOffset, Block& block) { - std::memcpy(block.data.data() + blockOffset, inputStr, len2); + std::memcpy(block.data.data() + blockOffset, inputStr, actualLength); // Ensure null termination. - block.data[blockOffset + len2] = '\0'; + block.data[blockOffset + actualLength] = '\0'; }; - if (itEntry != _freeEntries.end()) + // Use empty entry if any exist. + stringId = getFreeEntry(requiredLength); + if (stringId != Id::Invalid) { // Found empty space. - stringId = *itEntry; auto& entry = _entries[static_cast(stringId)]; + assert(entry.refCount == 0); auto& block = _blocks[entry.blockIndex]; writeStringToBlock(entry.offsetInBlock, *block); entry.hash = hash; - entry.len = len2; + entry.len = actualLength; entry.refCount = 1; - entry.sortedIndex = insertToHashBucket(entry, stringId); + entry.nextFreeId = Id::Invalid; - _freeEntries.erase(itEntry); + insertToHashBucket(entry, stringId); return stringId; } // New entry. - auto& block = getBlock(len2 + kTerminatorLength); + + // We align the capacity to 8 bytes. + const auto capacity = std::max(kMinStringCapacity, (requiredLength + 7) & ~7); + + auto& block = getBlock(capacity); const auto offset = static_cast(block.used); writeStringToBlock(offset, block); - block.used += len2 + kTerminatorLength; + block.used += capacity; stringId = static_cast(_entries.size()); @@ -550,10 +580,11 @@ namespace zasm entry.hash = hash; entry.blockIndex = block.index; entry.offsetInBlock = offset; - entry.len = len2; - entry.capacity = len2 + kTerminatorLength; + entry.len = actualLength; + entry.capacity = capacity; entry.refCount = 1; - entry.sortedIndex = insertToHashBucket(entry, stringId); + + insertToHashBucket(entry, stringId); return stringId; } From 982f88d6bc5051e212d7e5e01177a78ac8259101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Fri, 8 Nov 2024 04:36:02 +0200 Subject: [PATCH 6/8] Code cleanup --- zasm/include/zasm/core/stringpool.hpp | 52 ++++++++++++++++----------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp index 162e198..5a9ffc3 100644 --- a/zasm/include/zasm/core/stringpool.hpp +++ b/zasm/include/zasm/core/stringpool.hpp @@ -27,8 +27,8 @@ namespace zasm static constexpr std::size_t kAverageStringSize = 24; static constexpr std::size_t kStringsPerBlock = 4096; static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock; - static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 }; static constexpr std::size_t kMinStringCapacity = 16; + static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 }; static_assert(kBlockSize >= kMaxStringSize, "Block size must be bigger than max string size."); @@ -36,13 +36,21 @@ namespace zasm static constexpr std::size_t kMaxHashBuckets = 39119; private: + using StringSize = std::conditional_t< + kMaxStringSize <= std::numeric_limits::max(), std::uint16_t, std::uint32_t>; + + using BlockOffset = std::conditional_t< + kBlockSize <= std::numeric_limits::max(), std::uint32_t, std::uint64_t>; + + using BlockIndex = std::uint32_t; + struct Entry { - std::size_t hash{}; - std::size_t blockIndex{}; - std::int32_t offsetInBlock{}; - std::int32_t len{}; - std::int32_t capacity{}; + std::uint64_t hash{}; + BlockIndex blockIndex{}; + BlockOffset offsetInBlock{}; + StringSize len{}; + StringSize capacity{}; std::int32_t refCount{}; Id nextFreeId{ Id::Invalid }; }; @@ -54,8 +62,8 @@ namespace zasm struct Block { - std::size_t index{}; - std::size_t used{}; + BlockIndex index{}; + std::uint32_t used{}; std::array data{}; }; @@ -76,6 +84,11 @@ namespace zasm return aquire_(value, size); } + Id aquire(std::string_view str) + { + return aquire_(str.data(), str.size()); + } + Id aquire(const std::string& val) { return aquire_(val.c_str(), val.size()); @@ -131,7 +144,7 @@ namespace zasm return _blocks[entry->blockIndex]->data.data() + entry->offsetInBlock; } - int32_t getLength(Id stringId) const noexcept + std::size_t getLength(Id stringId) const noexcept { const auto* entry = getEntry(*this, stringId); if (entry == nullptr) @@ -403,7 +416,7 @@ namespace zasm return &self._entries[idx]; } - Id find_(const char* buf, std::size_t len, std::size_t hash) const noexcept + Id find_(const char* buf, std::size_t len, std::uint64_t hash) const noexcept { const auto bucketIndex = hash % kMaxHashBuckets; const auto& bucket = _hashBuckets[bucketIndex]; @@ -456,7 +469,7 @@ namespace zasm // Create a new block. auto newBlock = std::make_unique(); - newBlock->index = _blocks.size(); + newBlock->index = static_cast(_blocks.size()); _blocks.emplace_back(std::move(newBlock)); return *_blocks.back(); @@ -504,7 +517,7 @@ namespace zasm return aquire_(inputStr, len, hash); } - Id aquire_(const char* inputStr, std::size_t len, std::size_t hash) + Id aquire_(const char* inputStr, std::size_t len, std::uint64_t hash) { // Strings can not be larger than kMaxStringSize. if (len >= kMaxStringSize) @@ -589,7 +602,7 @@ namespace zasm return stringId; } - static constexpr std::size_t getHash(const char* buf, size_t len) noexcept + static constexpr std::uint64_t getHash(const char* buf, size_t len) noexcept { assert(buf != nullptr); assert(len > 0); @@ -598,14 +611,11 @@ namespace zasm { return 0; } -#ifdef _M_AMD64 - constexpr std::size_t offset = 0xcbf29ce484222325ULL; - constexpr std::size_t prime = 0x00000100000001B3ULL; -#else - constexpr std::size_t offset = 0x811c9dc5U; - constexpr std::size_t prime = 0x01000193U; -#endif - std::size_t result = offset; + + constexpr std::uint64_t offset = 0xcbf29ce484222325ULL; + constexpr std::uint64_t prime = 0x00000100000001B3ULL; + + std::uint64_t result = offset; for (std::size_t i = 0; i < len; ++i) { result ^= static_cast(buf[i]); From 9e920e0f00c50e0d3ce628e33338ac56684085a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Fri, 8 Nov 2024 04:57:09 +0200 Subject: [PATCH 7/8] Fix typo --- .../src/benchmarks/benchmark.stringpool.cpp | 16 +++++++-------- tests/src/tests/tests.stringpool.cpp | 20 +++++++++---------- zasm/include/zasm/core/stringpool.hpp | 11 +++++++--- zasm/src/zasm/src/program/program.cpp | 8 ++++---- .../src/zasm/src/serialization/serializer.cpp | 2 +- 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp index df5fdf4..73106fa 100644 --- a/benchmark/src/benchmarks/benchmark.stringpool.cpp +++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp @@ -23,7 +23,7 @@ namespace zasm::benchmarks return strings; }(); - static void BM_StringPool_Aquire(benchmark::State& state) + static void BM_StringPool_Acquire(benchmark::State& state) { StringPool pool; for (auto _ : state) @@ -32,12 +32,12 @@ namespace zasm::benchmarks { const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); + auto stringId = pool.acquire(str); benchmark::DoNotOptimize(stringId); } } } - BENCHMARK(BM_StringPool_Aquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); + BENCHMARK(BM_StringPool_Acquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_Release(benchmark::State& state) { @@ -49,7 +49,7 @@ namespace zasm::benchmarks state.PauseTiming(); const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); + auto stringId = pool.acquire(str); state.ResumeTiming(); auto refCount = pool.release(stringId); @@ -71,7 +71,7 @@ namespace zasm::benchmarks { const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); + auto stringId = pool.acquire(str); stringIds.push_back(stringId); } @@ -86,7 +86,7 @@ namespace zasm::benchmarks { const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); + auto stringId = pool.acquire(str); benchmark::DoNotOptimize(stringId); } } @@ -103,7 +103,7 @@ namespace zasm::benchmarks state.PauseTiming(); const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); + auto stringId = pool.acquire(str); state.ResumeTiming(); const char* res = pool.get(stringId); @@ -123,7 +123,7 @@ namespace zasm::benchmarks state.PauseTiming(); const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); + auto stringId = pool.acquire(str); state.ResumeTiming(); auto strLen = pool.getLength(stringId); diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp index 2439aa6..83bc9a5 100644 --- a/tests/src/tests/tests.stringpool.cpp +++ b/tests/src/tests/tests.stringpool.cpp @@ -13,7 +13,7 @@ namespace zasm::tests constexpr const char str1[] = "hello"; constexpr const char str2[] = "Hello"; - const auto id0 = pool.aquire(str1); + const auto id0 = pool.acquire(str1); ASSERT_NE(id0, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1); ASSERT_EQ(pool.getRefCount(id0), 1); @@ -21,7 +21,7 @@ namespace zasm::tests ASSERT_NE(cstr0, nullptr); ASSERT_EQ(strcmp(cstr0, str1), 0); - const auto id1 = pool.aquire(str1); + const auto id1 = pool.acquire(str1); ASSERT_NE(id1, StringPool::Id::Invalid); ASSERT_EQ(id1, id0); ASSERT_EQ(pool.getLength(id1), std::size(str1) - 1); @@ -30,7 +30,7 @@ namespace zasm::tests ASSERT_EQ(cstr1, cstr0); ASSERT_EQ(strcmp(cstr1, str1), 0); - const auto id2 = pool.aquire(str2); + const auto id2 = pool.acquire(str2); ASSERT_NE(id2, StringPool::Id::Invalid); ASSERT_NE(id2, id1); ASSERT_EQ(pool.getLength(id2), std::size(str2) - 1); @@ -49,17 +49,17 @@ namespace zasm::tests constexpr const char str4[] = "hello4"; constexpr const char str5[] = "hello..."; - const auto id0 = pool.aquire(str1); + const auto id0 = pool.acquire(str1); ASSERT_NE(id0, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1); ASSERT_EQ(pool.getRefCount(id0), 1); - const auto id1 = pool.aquire(str2); + const auto id1 = pool.acquire(str2); ASSERT_NE(id1, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id1), std::size(str2) - 1); ASSERT_EQ(pool.getRefCount(id1), 1); - const auto id2 = pool.aquire(str3); + const auto id2 = pool.acquire(str3); ASSERT_NE(id2, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id2), std::size(str3) - 1); ASSERT_EQ(pool.getRefCount(id2), 1); @@ -68,13 +68,13 @@ namespace zasm::tests const auto* cstr0 = pool.get(id1); ASSERT_EQ(cstr0, nullptr); - const auto id3 = pool.aquire(str4); + const auto id3 = pool.acquire(str4); ASSERT_NE(id3, StringPool::Id::Invalid); ASSERT_EQ(id1, id3); ASSERT_EQ(pool.getLength(id3), std::size(str4) - 1); ASSERT_EQ(pool.getRefCount(id3), 1); - const auto id4 = pool.aquire(str4); + const auto id4 = pool.acquire(str4); ASSERT_NE(id4, StringPool::Id::Invalid); ASSERT_EQ(id4, id3); ASSERT_EQ(pool.getLength(id4), std::size(str4) - 1); @@ -83,7 +83,7 @@ namespace zasm::tests ASSERT_EQ(pool.release(id4), 1); ASSERT_EQ(pool.release(id4), 0); - const auto id5 = pool.aquire(str5); + const auto id5 = pool.acquire(str5); ASSERT_NE(id5, StringPool::Id::Invalid); ASSERT_EQ(id5, id1); ASSERT_EQ(pool.getLength(id5), std::size(str5) - 1); @@ -115,7 +115,7 @@ namespace zasm::tests ids.clear(); for (const auto& str : kInputStrings) { - const auto id = pool.aquire(str); + const auto id = pool.acquire(str); ASSERT_NE(id, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id), str.size()); ASSERT_EQ(pool.getRefCount(id), 1); diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp index 5a9ffc3..849f6f4 100644 --- a/zasm/include/zasm/core/stringpool.hpp +++ b/zasm/include/zasm/core/stringpool.hpp @@ -75,7 +75,7 @@ namespace zasm _hashBuckets.resize(kMaxHashBuckets); } - Id aquire(const char* value, std::size_t size = kUnspecifiedSize) + Id acquire(const char* value, std::size_t size = kUnspecifiedSize) { if (size == kUnspecifiedSize) { @@ -84,12 +84,12 @@ namespace zasm return aquire_(value, size); } - Id aquire(std::string_view str) + Id acquire(std::string_view str) { return aquire_(str.data(), str.size()); } - Id aquire(const std::string& val) + Id acquire(const std::string& val) { return aquire_(val.c_str(), val.size()); } @@ -167,6 +167,11 @@ namespace zasm void clear() noexcept { + if (_entries.empty()) + { + // Because clearing the buckets is expensive do nothing if already empty. + return; + } _entries.clear(); if (_blocks.size() > 1) { diff --git a/zasm/src/zasm/src/program/program.cpp b/zasm/src/zasm/src/program/program.cpp index ddcb651..ce5685c 100644 --- a/zasm/src/zasm/src/program/program.cpp +++ b/zasm/src/zasm/src/program/program.cpp @@ -154,7 +154,7 @@ namespace zasm if (name != nullptr) { - entry.nameId = _state->symbolNames.aquire(name); + entry.nameId = _state->symbolNames.acquire(name); } } @@ -594,7 +594,7 @@ namespace zasm { return StringPool::Id::Invalid; } - return state.symbolNames.aquire(str); + return state.symbolNames.acquire(str); } static Label createLabel_(detail::ProgramState& state, StringPool::Id nameId, StringPool::Id modId, LabelFlags flags) @@ -731,7 +731,7 @@ namespace zasm if (name != nullptr) { - entry.nameId = _state->symbolNames.aquire(name); + entry.nameId = _state->symbolNames.acquire(name); } return Section{ sectId }; @@ -800,7 +800,7 @@ namespace zasm entry->nameId = StringPool::Id::Invalid; } - entry->nameId = _state->symbolNames.aquire(name); + entry->nameId = _state->symbolNames.acquire(name); return ErrorCode::None; } diff --git a/zasm/src/zasm/src/serialization/serializer.cpp b/zasm/src/zasm/src/serialization/serializer.cpp index 05e57aa..548ceb0 100644 --- a/zasm/src/zasm/src/serialization/serializer.cpp +++ b/zasm/src/zasm/src/serialization/serializer.cpp @@ -475,7 +475,7 @@ namespace zasm defaultSect.attribs = Section::kDefaultAttribs; defaultSect.align = Section::kDefaultAlign; defaultSect.address = newBase; - defaultSect.nameId = programState.symbolNames.aquire(".text"); + defaultSect.nameId = programState.symbolNames.acquire(".text"); const auto serializePass = [&]() -> Error { state.buffer.clear(); From 9dc918ef608fb167a0bafb572bb9b80607b5d239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com> Date: Fri, 8 Nov 2024 05:04:54 +0200 Subject: [PATCH 8/8] Add another test for StringPool --- tests/src/tests/tests.stringpool.cpp | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp index 83bc9a5..2a9d738 100644 --- a/tests/src/tests/tests.stringpool.cpp +++ b/tests/src/tests/tests.stringpool.cpp @@ -39,6 +39,48 @@ namespace zasm::tests ASSERT_EQ(strcmp(cstr2, str2), 0); } + TEST(StringPoolTests, TestDuplicate) + { + StringPool pool; + + constexpr const char str1[] = "hello"; + constexpr const char str2[] = "hello"; + constexpr const char str3[] = "hello"; + constexpr const char str4[] = "hello1"; + + const auto id0 = pool.acquire(str1); + ASSERT_NE(id0, StringPool::Id::Invalid); + ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1); + ASSERT_EQ(pool.getRefCount(id0), 1); + const auto* cstr0 = pool.get(id0); + ASSERT_NE(cstr0, nullptr); + ASSERT_EQ(strcmp(cstr0, str1), 0); + + const auto id1 = pool.acquire(str2); + ASSERT_NE(id1, StringPool::Id::Invalid); + ASSERT_EQ(id1, id0); + ASSERT_EQ(pool.getLength(id1), std::size(str2) - 1); + ASSERT_EQ(pool.getRefCount(id1), 2); + const auto* cstr2 = pool.get(id1); + ASSERT_EQ(strcmp(cstr2, str2), 0); + + const auto id2 = pool.acquire(str3); + ASSERT_NE(id2, StringPool::Id::Invalid); + ASSERT_EQ(id2, id0); + ASSERT_EQ(pool.getLength(id2), std::size(str3) - 1); + ASSERT_EQ(pool.getRefCount(id2), 3); + const auto* cstr3 = pool.get(id2); + ASSERT_EQ(strcmp(cstr3, str3), 0); + + const auto id3 = pool.acquire(str4); + ASSERT_NE(id3, StringPool::Id::Invalid); + ASSERT_NE(id3, id0); + ASSERT_EQ(pool.getLength(id3), std::size(str4) - 1); + ASSERT_EQ(pool.getRefCount(id3), 1); + const auto* cstr4 = pool.get(id3); + ASSERT_EQ(strcmp(cstr4, str4), 0); + } + TEST(StringPoolTests, TestRelease) { StringPool pool;