From b9f4a958b326f5b02f8bf07679ab7e623b57a81d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Mon, 4 Nov 2024 11:40:39 +0200
Subject: [PATCH 1/8] Overhaul StringPool, make pointer stable and better
 performance

---
 zasm/include/zasm/core/stringpool.hpp | 368 +++++++++++++++++++++-----
 1 file changed, 302 insertions(+), 66 deletions(-)
diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp
index bd413cb..60c41f7 100644
--- a/zasm/include/zasm/core/stringpool.hpp
+++ b/zasm/include/zasm/core/stringpool.hpp
@@ -1,9 +1,11 @@
 #pragma once
 
 #include <algorithm>
+#include <array>
 #include <cstddef>
 #include <cstring>
 #include <iterator>
+#include <memory>
 #include <string.h>
 #include <string>
 #include <string_view>
@@ -16,34 +18,59 @@ namespace zasm
 
     class StringPool
     {
+    public:
+        enum class Id : std::int32_t
+        {
+            Invalid = -1,
+        };
+
+        static constexpr std::size_t kMaxStringSize = 0xFFFF;
+        static constexpr std::size_t kAverageStringSize = 24;
+        static constexpr std::size_t kStringsPerBlock = 4096;
+        static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock;
+        static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 };
+
+        // 16 bit prime number.
+        static constexpr std::size_t kMaxHashBuckets = 39119;
+
+    private:
         struct Entry
         {
             std::size_t hash{};
-            std::int32_t offset{};
+            std::size_t blockIndex{};
+            std::size_t sortedIndex{};
+            std::int32_t offsetInBlock{};
             std::int32_t len{};
             std::int32_t capacity{};
             std::int32_t refCount{};
         };
 
         std::vector<Entry> _entries;
-        std::vector<char> _data;
+        std::vector<Id> _freeEntries;
+        std::vector<std::vector<Id>> _hashBuckets;
 
-    public:
-        enum class Id : std::int32_t
+        struct Block
         {
-            Invalid = -1,
+            std::size_t index{};
+            std::size_t used{};
+            std::array<char, kBlockSize> data{};
         };
 
+        std::vector<std::unique_ptr<Block>> _blocks;
+
     public:
-        // NOLINTNEXTLINE
-        template<size_t N> Id aquire(const char (&value)[N])
+        StringPool()
         {
-            return aquire_(value, N);
+            _hashBuckets.resize(kMaxHashBuckets);
         }
 
-        Id aquire(const char* value)
+        Id aquire(const char* value, std::size_t size = kUnspecifiedSize)
         {
-            return aquire_(value, strlen(value));
+            if (size == kUnspecifiedSize)
+            {
+                size = std::strlen(value);
+            }
+            return aquire_(value, size);
         }
 
         Id aquire(const std::string& val)
@@ -60,14 +87,26 @@ namespace zasm
             }
 
             const auto newRefCount = --entry->refCount;
+            assert(newRefCount >= 0);
+
+            if (newRefCount == 0)
+            {
+                _freeEntries.push_back(stringId);
+
+                const auto bucketIndex = entry->hash % kMaxHashBuckets;
+                auto& bucket = _hashBuckets[bucketIndex];
+
+                bucket.erase(bucket.begin() + entry->sortedIndex);
+            }
+
             return newRefCount;
         }
 
-        // NOLINTNEXTLINE
-        template<std::size_t N> Id find(const char (&value)[N]) const noexcept
+        Id find(const char* str) const noexcept
         {
-            const auto hash = getHash(value, N);
-            return find(value, N, hash);
+            const auto len = std::strlen(str);
+            const auto hash = getHash(str, len);
+            return find_(str, len, hash);
         }
 
         bool isValid(Id stringId) const noexcept
@@ -84,7 +123,7 @@ namespace zasm
                 return nullptr;
             }
 
-            return _data.data() + entry->offset;
+            return _blocks[entry->blockIndex]->data.data() + entry->offsetInBlock;
         }
 
         int32_t getLength(Id stringId) const noexcept
@@ -111,12 +150,7 @@ namespace zasm
         void clear() noexcept
         {
             _entries.clear();
-            _data.clear();
-        }
-
-        const char* data() const noexcept
-        {
-            return _data.data();
+            _blocks.clear();
         }
 
         std::size_t size() const noexcept
@@ -126,6 +160,7 @@ namespace zasm
 
         Error save(IStream& stream) const
         {
+            // Serialize entries.
             const auto entryCount = static_cast<std::uint32_t>(_entries.size());
             if (auto len = stream.write(&entryCount, sizeof(entryCount)); len == 0)
             {
@@ -138,7 +173,15 @@ namespace zasm
                 {
                     return ErrorCode::InvalidParameter;
                 }
-                if (auto len = stream.write(entry.offset); len == 0)
+                if (auto len = stream.write(entry.blockIndex); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.write(entry.sortedIndex); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.write(entry.offsetInBlock); len == 0)
                 {
                     return ErrorCode::InvalidParameter;
                 }
@@ -156,15 +199,57 @@ namespace zasm
                 }
             }
 
-            const auto dataSize = static_cast<std::uint32_t>(_data.size());
-            if (auto len = stream.write(dataSize); len == 0)
+            // Serialize free entries.
+            const auto freeEntryCount = static_cast<std::uint32_t>(_freeEntries.size());
+            if (auto len = stream.write(&freeEntryCount, sizeof(freeEntryCount)); len == 0)
+            {
+                return ErrorCode::InvalidParameter;
+            }
+
+            for (const auto& freeEntry : _freeEntries)
+            {
+                if (auto len = stream.write(freeEntry); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+            }
+
+            // Serialize hash buckets.
+            for (const auto& bucket : _hashBuckets)
+            {
+                const auto bucketSize = static_cast<std::uint32_t>(bucket.size());
+                if (auto len = stream.write(&bucketSize, sizeof(bucketSize)); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+
+                for (const auto& id : bucket)
+                {
+                    if (auto len = stream.write(id); len == 0)
+                    {
+                        return ErrorCode::InvalidParameter;
+                    }
+                }
+            }
+
+            // Serialize blocks.
+            const auto blockCount = static_cast<std::uint32_t>(_blocks.size());
+            if (auto len = stream.write(&blockCount, sizeof(blockCount)); len == 0)
             {
                 return ErrorCode::InvalidParameter;
             }
 
-            if (dataSize > 0)
+            for (const auto& block : _blocks)
             {
-                if (auto len = stream.write(_data.data(), dataSize); len == 0)
+                if (auto len = stream.write(block->index); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.write(block->used); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.write(block->data.data(), block->used); len == 0)
                 {
                     return ErrorCode::InvalidParameter;
                 }
@@ -175,8 +260,7 @@ namespace zasm
 
         Error load(IStream& stream)
         {
-            clear();
-
+            // Deserialize entries.
             std::uint32_t entryCount{};
             if (auto len = stream.read(&entryCount, sizeof(entryCount)); len == 0)
             {
@@ -191,7 +275,15 @@ namespace zasm
                 {
                     return ErrorCode::InvalidParameter;
                 }
-                if (auto len = stream.read(entry.offset); len == 0)
+                if (auto len = stream.read(entry.blockIndex); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.read(entry.sortedIndex); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.read(entry.offsetInBlock); len == 0)
                 {
                     return ErrorCode::InvalidParameter;
                 }
@@ -209,24 +301,76 @@ namespace zasm
                 }
             }
 
-            std::uint32_t dataSize{};
-            if (auto len = stream.read(&dataSize, sizeof(dataSize)); len == 0)
+            // Deserialize free entries.
+            std::uint32_t freeEntryCount{};
+            if (auto len = stream.read(&freeEntryCount, sizeof(freeEntryCount)); len == 0)
+            {
+                return ErrorCode::InvalidParameter;
+            }
+
+            auto freeEntries = std::vector<Id>();
+            freeEntries.resize(freeEntryCount);
+            for (auto& freeEntry : freeEntries)
+            {
+                if (auto len = stream.read(freeEntry); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+            }
+
+            // Deserialize hash buckets.
+            auto hashBuckets = std::vector<std::vector<Id>>();
+            hashBuckets.resize(kMaxHashBuckets);
+            for (auto& bucket : hashBuckets)
+            {
+                std::uint32_t bucketSize{};
+                if (auto len = stream.read(&bucketSize, sizeof(bucketSize)); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+
+                bucket.resize(bucketSize);
+                for (auto& id : bucket)
+                {
+                    if (auto len = stream.read(id); len == 0)
+                    {
+                        return ErrorCode::InvalidParameter;
+                    }
+                }
+            }
+
+            // Deserialize blocks.
+            std::uint32_t blockCount{};
+            if (auto len = stream.read(&blockCount, sizeof(blockCount)); len == 0)
             {
                 return ErrorCode::InvalidParameter;
             }
 
-            std::vector<char> loadedData;
-            if (dataSize > 0)
+            auto blocks = std::vector<std::unique_ptr<Block>>();
+            blocks.resize(blockCount);
+            for (auto& block : blocks)
             {
-                loadedData.resize(dataSize);
-                if (auto len = stream.read(loadedData.data(), dataSize); len == 0)
+                block = std::make_unique<Block>();
+
+                if (auto len = stream.read(block->index); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.read(block->used); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
+                if (auto len = stream.read(block->data.data(), block->used); len == 0)
                 {
                     return ErrorCode::InvalidParameter;
                 }
             }
 
+            // Swap state.
             _entries = std::move(loadedEntries);
-            _data = std::move(loadedData);
+            _freeEntries = std::move(freeEntries);
+            _hashBuckets = std::move(hashBuckets);
+            _blocks = std::move(blocks);
 
             return ErrorCode::None;
         }
@@ -241,7 +385,13 @@ namespace zasm
             {
                 return nullptr;
             }
-            if (self._entries[idx].refCount <= 0)
+
+            const auto refCount = self._entries[idx].refCount;
+#ifndef IN_TESTS
+            assert(refCount > 0);
+#endif
+
+            if (refCount <= 0)
             {
                 return nullptr;
             }
@@ -251,29 +401,80 @@ namespace zasm
 
         Id find_(const char* buf, std::size_t len, std::size_t hash) const noexcept
         {
-            auto it = std::find_if(std::begin(_entries), std::end(_entries), [&](const auto& entry) noexcept {
-                if (entry.refCount == 0 || entry.hash != hash || entry.len != len)
-                {
-                    return false;
-                }
-                const char* str = _data.data() + entry.offset;
-                return memcmp(buf, str, len) == 0;
+            const auto bucketIndex = hash % kMaxHashBuckets;
+            const auto& bucket = _hashBuckets[bucketIndex];
+
+            // Find the lowest hash.
+            auto itLowest = std::lower_bound(bucket.begin(), bucket.end(), Id::Invalid, [&](Id id, Id) {
+                return _entries[static_cast<std::size_t>(id)].hash < hash;
             });
-            if (it != std::end(_entries))
+
+            // Iterate all matching hashes and compare the string.
+            for (auto it = itLowest; it != bucket.end(); ++it)
             {
-                const auto index = std::distance(_entries.begin(), it);
-                return static_cast<Id>(index);
+                const auto id = *it;
+                const auto& entry = _entries[static_cast<std::size_t>(id)];
+                if (entry.hash != hash)
+                {
+                    break;
+                }
+
+                if (entry.len != len)
+                {
+                    continue;
+                }
+
+                const auto* str = _blocks[entry.blockIndex]->data.data() + entry.offsetInBlock;
+                if (std::memcmp(str, buf, len) == 0)
+                {
+                    return id;
+                }
             }
+
             return Id::Invalid;
         }
 
-        Id aquire_(const char* buf, std::size_t len)
+        Block& getBlock(std::size_t len)
         {
-            constexpr std::int32_t kTerminatorLength = 1;
+            // If there are no blocks create a new one.
+            if (_blocks.empty())
+            {
+                _blocks.emplace_back(std::make_unique<Block>());
+                return *_blocks.back();
+            }
 
-            const auto hash = getHash(buf, len);
+            // See if the last block has enough space.
+            auto& lastBlock = *_blocks.back();
+            if (lastBlock.used + len < kBlockSize)
+            {
+                return lastBlock;
+            }
+
+            // Create a new block.
+            auto newBlock = std::make_unique<Block>();
+            newBlock->index = _blocks.size();
+            _blocks.emplace_back(std::move(newBlock));
+
+            return *_blocks.back();
+        }
 
-            auto stringId = find_(buf, len, hash);
+        Id aquire_(const char* inputStr, std::size_t len)
+        {
+            const auto hash = getHash(inputStr, len);
+            return aquire_(inputStr, len, hash);
+        }
+
+        Id aquire_(const char* inputStr, std::size_t len, std::size_t hash)
+        {
+            // Strings can not be larger than kMaxStringSize.
+            if (len >= kMaxStringSize)
+            {
+                assert(len < kMaxStringSize);
+                return Id::Invalid;
+            }
+
+            // Find existing string and increase ref count.
+            auto stringId = find_(inputStr, len, hash);
             if (stringId != Id::Invalid)
             {
                 auto& entry = _entries[static_cast<std::size_t>(stringId)];
@@ -282,48 +483,82 @@ namespace zasm
             }
 
             const auto len2 = static_cast<std::int32_t>(len);
+            constexpr std::int32_t kTerminatorLength = 1;
 
             // Use empty entry if any exist.
-            auto itEntry = std::find_if(_entries.begin(), _entries.end(), [&](auto&& entry) {
+            // TODO: Optimize this, we should prioritize finding entries at the end of the list.
+            //       This would mean shrinking the vector has less to copy.
+            auto itEntry = std::find_if(_freeEntries.begin(), _freeEntries.end(), [&](Id id) {
+                auto& entry = _entries[static_cast<std::size_t>(id)];
                 return entry.refCount <= 0 && entry.capacity >= len2 + kTerminatorLength;
             });
 
-            if (itEntry != _entries.end())
-            {
-                // Found empty space.
-                auto& entry = *itEntry;
+            const auto insertToHashBucket = [&](Entry& entry, Id id) {
+                const auto bucketIndex = entry.hash % kMaxHashBuckets;
+                auto& bucket = _hashBuckets[bucketIndex];
+
+                auto sortedIt = std::lower_bound(bucket.begin(), bucket.end(), id, [&](Id id, Id id2) {
+                    return _entries[static_cast<std::size_t>(id)].hash < _entries[static_cast<std::size_t>(id2)].hash;
+                });
+
+                const auto sortedIndex = static_cast<std::size_t>(std::distance(bucket.begin(), sortedIt));
+                bucket.insert(sortedIt, id);
 
-                stringId = static_cast<Id>(std::distance(_entries.begin(), itEntry));
-                std::memcpy(_data.data() + entry.offset, buf, len2);
+                return sortedIndex;
+            };
 
+            const auto writeStringToBlock = [&](std::size_t blockOffset, Block& block) {
+                std::memcpy(block.data.data() + blockOffset, inputStr, len2);
                 // Ensure null termination.
-                const auto termOffset = static_cast<std::size_t>(entry.offset) + len2;
-                _data[termOffset] = '\0';
+                block.data[blockOffset + len2] = '\0';
+            };
+
+            if (itEntry != _freeEntries.end())
+            {
+                // Found empty space.
+                stringId = *itEntry;
+                auto& entry = _entries[static_cast<std::size_t>(stringId)];
+
+                auto& block = _blocks[entry.blockIndex];
+                writeStringToBlock(entry.offsetInBlock, *block);
 
                 entry.hash = hash;
                 entry.len = len2;
                 entry.refCount = 1;
+                entry.sortedIndex = insertToHashBucket(entry, stringId);
+
+                _freeEntries.erase(itEntry);
 
                 return stringId;
             }
 
             // New entry.
-            const auto offset = static_cast<std::int32_t>(_data.size());
-            std::copy_n(buf, len, std::back_insert_iterator(_data));
+            auto& block = getBlock(len2 + kTerminatorLength);
+            const auto offset = static_cast<std::int32_t>(block.used);
 
-            // Ensure null termination.
-            _data.push_back('\0');
+            writeStringToBlock(offset, block);
+            block.used += len2 + kTerminatorLength;
 
             stringId = static_cast<Id>(_entries.size());
 
-            _entries.push_back({ hash, offset, len2, len2 + kTerminatorLength, 1 });
+            auto& entry = _entries.emplace_back();
+            entry.hash = hash;
+            entry.blockIndex = block.index;
+            entry.offsetInBlock = offset;
+            entry.len = len2;
+            entry.capacity = len2 + kTerminatorLength;
+            entry.refCount = 1;
+            entry.sortedIndex = insertToHashBucket(entry, stringId);
 
             return stringId;
         }
 
         static constexpr std::size_t getHash(const char* buf, size_t len) noexcept
         {
-            if (buf == nullptr)
+            assert(buf != nullptr);
+            assert(len > 0);
+
+            if (buf == nullptr || len == 0)
             {
                 return 0;
             }
@@ -340,6 +575,7 @@ namespace zasm
                 result ^= static_cast<unsigned char>(buf[i]);
                 result *= prime;
             }
+
             return result;
         }
     };

From 1bf84c17a04e4be9697e0d13fbe5f793111e259e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Mon, 4 Nov 2024 11:41:00 +0200
Subject: [PATCH 2/8] Improve tests for StringPool

---
 tests/src/tests/tests.stringpool.cpp | 36 ++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp
index 5630da1..c05c513 100644
--- a/tests/src/tests/tests.stringpool.cpp
+++ b/tests/src/tests/tests.stringpool.cpp
@@ -1,4 +1,6 @@
 #include <gtest/gtest.h>
+
+#define IN_TESTS
 #include <zasm/core/stringpool.hpp>
 
 namespace zasm::tests
@@ -87,4 +89,38 @@ namespace zasm::tests
         ASSERT_EQ(pool.getRefCount(id5), 1);
     }
 
+    TEST(StringPoolTests, TestManyStrings)
+    {
+        StringPool pool;
+
+        std::vector<StringPool::Id> ids;
+
+        constexpr auto kTestSize = 1'000'000;
+        for (auto i = 0U; i < kTestSize; i++)
+        {
+            std::string str = std::string("hello") + std::to_string(i);
+
+            const auto id = pool.aquire(str);
+            ASSERT_NE(id, StringPool::Id::Invalid);
+            ASSERT_EQ(pool.getLength(id), str.size());
+            ASSERT_EQ(pool.getRefCount(id), 1);
+
+            const auto* cstr = pool.get(id);
+            ASSERT_NE(cstr, nullptr);
+            ASSERT_EQ(strcmp(cstr, str.c_str()), 0);
+
+            ids.push_back(id);
+        }
+
+        // Validate that the contents still match.
+        for (auto i = 0U; i < kTestSize; i++)
+        {
+            std::string str = std::string("hello") + std::to_string(i);
+
+            const auto* cstr = pool.get(ids[i]);
+            ASSERT_NE(cstr, nullptr);
+            ASSERT_EQ(strcmp(cstr, str.c_str()), 0);
+        }
+    }
+
 } // namespace zasm::tests

From 3bd31972f9892b173dbfdbe7703a0ac2f7e24a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Mon, 4 Nov 2024 11:41:12 +0200
Subject: [PATCH 3/8] Improve benchmark for StringPool

---
 .../src/benchmarks/benchmark.stringpool.cpp   | 83 +++++++++++++------
 1 file changed, 56 insertions(+), 27 deletions(-)

diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp
index 2d82adc..03951a9 100644
--- a/benchmark/src/benchmarks/benchmark.stringpool.cpp
+++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp
@@ -1,73 +1,102 @@
 #include <benchmark/benchmark.h>
+#include <random>
 #include <zasm/core/stringpool.hpp>
 
 namespace zasm::benchmarks
 {
-    static const std::string TestStrings[] = { "hello", "world", "longer string", "even longer string",
-                                               "even longer longer string" };
+    static constexpr auto kTestSize = 500'000;
+    static constexpr const char kChars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+    static const std::vector<std::string> kInputStrings = []() {
+        std::vector<std::string> strings;
+
+        std::mt19937 prng(42);
+        for (int i = 0; i < kTestSize; ++i)
+        {
+            std::string str;
+            for (size_t i = 0; i < 4 + (prng() % 24); ++i)
+            {
+                str.push_back(kChars[prng() % (sizeof(kChars) - 1)]);
+            }
+            strings.push_back(std::move(str));
+        }
+        return strings;
+    }();
 
     static void BM_StringPool_Aquire(benchmark::State& state)
     {
         StringPool pool;
         for (auto _ : state)
         {
-            const auto& str = TestStrings[state.range(0)];
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                const auto& str = kInputStrings[i];
 
-            auto stringId = pool.aquire(str);
-            benchmark::DoNotOptimize(stringId);
+                auto stringId = pool.aquire(str);
+                benchmark::DoNotOptimize(stringId);
+            }
         }
     }
-    BENCHMARK(BM_StringPool_Aquire)->DenseRange(0, std::size(TestStrings) - 1);
+    BENCHMARK(BM_StringPool_Aquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
 
     static void BM_StringPool_Release(benchmark::State& state)
     {
         StringPool pool;
         for (auto _ : state)
         {
-            state.PauseTiming();
-            const auto& str = TestStrings[state.range(0)];
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                state.PauseTiming();
+                const auto& str = kInputStrings[i];
 
-            auto stringId = pool.aquire(str);
-            state.ResumeTiming();
+                auto stringId = pool.aquire(str);
+                state.ResumeTiming();
 
-            auto refCount = pool.release(stringId);
-            benchmark::DoNotOptimize(refCount);
+                auto refCount = pool.release(stringId);
+                benchmark::DoNotOptimize(refCount);
+            }
         }
     }
-    BENCHMARK(BM_StringPool_Release)->DenseRange(0, std::size(TestStrings) - 1);
+    BENCHMARK(BM_StringPool_Release)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
 
     static void BM_StringPool_Get(benchmark::State& state)
     {
         StringPool pool;
         for (auto _ : state)
         {
-            state.PauseTiming();
-            const auto& str = TestStrings[state.range(0)];
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                state.PauseTiming();
+                const auto& str = kInputStrings[i];
 
-            auto stringId = pool.aquire(str);
-            state.ResumeTiming();
+                auto stringId = pool.aquire(str);
+                state.ResumeTiming();
 
-            const char* res = pool.get(stringId);
-            benchmark::DoNotOptimize(res);
+                const char* res = pool.get(stringId);
+                benchmark::DoNotOptimize(res);
+            }
         }
     }
-    BENCHMARK(BM_StringPool_Get)->DenseRange(0, std::size(TestStrings) - 1);
+    BENCHMARK(BM_StringPool_Get)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
 
     static void BM_StringPool_GetLength(benchmark::State& state)
     {
         StringPool pool;
         for (auto _ : state)
         {
-            state.PauseTiming();
-            const auto& str = TestStrings[state.range(0)];
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                state.PauseTiming();
+                const auto& str = kInputStrings[i];
 
-            auto stringId = pool.aquire(str);
-            state.ResumeTiming();
+                auto stringId = pool.aquire(str);
+                state.ResumeTiming();
 
-            auto strLen = pool.getLength(stringId);
-            benchmark::DoNotOptimize(strLen);
+                auto strLen = pool.getLength(stringId);
+                benchmark::DoNotOptimize(strLen);
+            }
         }
     }
-    BENCHMARK(BM_StringPool_GetLength)->DenseRange(0, std::size(TestStrings) - 1);
+    BENCHMARK(BM_StringPool_GetLength)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
 
 } // namespace zasm::benchmarks

From 07dfa78f67bd527619fd8b714d27879b03fddd73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Mon, 4 Nov 2024 11:51:34 +0200
Subject: [PATCH 4/8] Actually clear the StringPool

---
 zasm/include/zasm/core/stringpool.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp
index 60c41f7..79b55bd 100644
--- a/zasm/include/zasm/core/stringpool.hpp
+++ b/zasm/include/zasm/core/stringpool.hpp
@@ -151,6 +151,11 @@ namespace zasm
         {
             _entries.clear();
             _blocks.clear();
+            _freeEntries.clear();
+            for (auto& bucket : _hashBuckets)
+            {
+                bucket.clear();
+            }
         }
 
         std::size_t size() const noexcept

From 85605fa83d32158c60ee0a28e6155b40fb4e0e3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Thu, 7 Nov 2024 16:35:01 +0200
Subject: [PATCH 5/8] More refinements

---
 .../src/benchmarks/benchmark.stringpool.cpp   |  34 ++++
 tests/src/tests/tests.stringpool.cpp          | 124 ++++++++++++--
 zasm/include/zasm/core/stringpool.hpp         | 159 +++++++++++-------
 3 files changed, 242 insertions(+), 75 deletions(-)

diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp
index 03951a9..df5fdf4 100644
--- a/benchmark/src/benchmarks/benchmark.stringpool.cpp
+++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp
@@ -59,6 +59,40 @@ namespace zasm::benchmarks
     }
     BENCHMARK(BM_StringPool_Release)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
 
+    static void BM_StringPool_Reuse(benchmark::State& state)
+    {
+        StringPool pool;
+        for (auto _ : state)
+        {
+            std::vector<StringPool::Id> stringIds;
+
+            state.PauseTiming();
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                const auto& str = kInputStrings[i];
+
+                auto stringId = pool.aquire(str);
+                stringIds.push_back(stringId);
+            }
+
+            // Clear.
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                auto stringId = pool.release(stringIds[i]);
+            }
+            state.ResumeTiming();
+
+            for (auto i = 0; i < state.range(0); ++i)
+            {
+                const auto& str = kInputStrings[i];
+
+                auto stringId = pool.aquire(str);
+                benchmark::DoNotOptimize(stringId);
+            }
+        }
+    }
+    BENCHMARK(BM_StringPool_Reuse)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
+
     static void BM_StringPool_Get(benchmark::State& state)
     {
         StringPool pool;
diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp
index c05c513..2439aa6 100644
--- a/tests/src/tests/tests.stringpool.cpp
+++ b/tests/src/tests/tests.stringpool.cpp
@@ -1,4 +1,5 @@
 #include <gtest/gtest.h>
+#include <random>
 
 #define IN_TESTS
 #include <zasm/core/stringpool.hpp>
@@ -84,22 +85,36 @@ namespace zasm::tests
 
         const auto id5 = pool.aquire(str5);
         ASSERT_NE(id5, StringPool::Id::Invalid);
-        ASSERT_NE(id5, id1);
+        ASSERT_EQ(id5, id1);
         ASSERT_EQ(pool.getLength(id5), std::size(str5) - 1);
         ASSERT_EQ(pool.getRefCount(id5), 1);
     }
 
-    TEST(StringPoolTests, TestManyStrings)
-    {
-        StringPool pool;
+    constexpr auto kTestSize = 100'000;
 
-        std::vector<StringPool::Id> ids;
+    static constexpr const char kChars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
 
-        constexpr auto kTestSize = 1'000'000;
-        for (auto i = 0U; i < kTestSize; i++)
+    static const std::vector<std::string> kInputStrings = []() {
+        std::vector<std::string> strings;
+        std::set<std::string> uniqueStrings;
+        std::mt19937 prng(42);
+        while (uniqueStrings.size() < kTestSize)
         {
-            std::string str = std::string("hello") + std::to_string(i);
+            std::string str;
+            for (size_t i = 0; i < 4 + (prng() % 24); ++i)
+            {
+                str.push_back(kChars[prng() % (sizeof(kChars) - 1)]);
+            }
+            uniqueStrings.emplace(std::move(str));
+        }
+        strings.insert(strings.end(), uniqueStrings.begin(), uniqueStrings.end());
+        return strings;
+    }();
 
+    const auto fillPool = [](StringPool& pool, std::vector<StringPool::Id>& ids) {
+        ids.clear();
+        for (const auto& str : kInputStrings)
+        {
             const auto id = pool.aquire(str);
             ASSERT_NE(id, StringPool::Id::Invalid);
             ASSERT_EQ(pool.getLength(id), str.size());
@@ -111,16 +126,103 @@ namespace zasm::tests
 
             ids.push_back(id);
         }
+        ASSERT_EQ(ids.size(), kInputStrings.size());
+    };
+
+    TEST(StringPoolTests, TestManyStrings)
+    {
+        StringPool pool;
+        std::vector<StringPool::Id> ids;
+
+        fillPool(pool, ids);
 
         // Validate that the contents still match.
-        for (auto i = 0U; i < kTestSize; i++)
+        for (size_t i = 0; i < kTestSize; i++)
         {
-            std::string str = std::string("hello") + std::to_string(i);
-
+            const auto& str = kInputStrings[i];
             const auto* cstr = pool.get(ids[i]);
             ASSERT_NE(cstr, nullptr);
             ASSERT_EQ(strcmp(cstr, str.c_str()), 0);
         }
     }
 
+    TEST(StringPoolTests, TestClearIndividually)
+    {
+        StringPool pool;
+        std::vector<StringPool::Id> ids;
+
+        fillPool(pool, ids);
+
+        // Clear.
+        for (auto id : ids)
+        {
+            ASSERT_EQ(pool.release(id), 0);
+        }
+
+        ASSERT_EQ(pool.size(), 0);
+    }
+
+    TEST(StringPoolTests, TestReuseSingle)
+    {
+        StringPool pool;
+        std::vector<StringPool::Id> ids;
+
+        fillPool(pool, ids);
+
+        // Validate strings.
+        for (auto i = 0U; i < kTestSize; i++)
+        {
+            const auto& str = kInputStrings[i];
+
+            const auto* cstr = pool.get(ids[i]);
+            ASSERT_NE(cstr, nullptr) << "i = " << i;
+            ASSERT_EQ(strcmp(cstr, str.c_str()), 0) << "i = " << i;
+        }
+
+        // Clear.
+        for (auto id : ids)
+        {
+            ASSERT_EQ(pool.release(id), 0);
+        }
+
+        fillPool(pool, ids);
+
+        // Validate strings.
+        for (auto i = 0U; i < kTestSize; i++)
+        {
+            const auto& str = kInputStrings[i];
+
+            const auto* cstr = pool.get(ids[i]);
+            ASSERT_NE(cstr, nullptr) << "i = " << i;
+            ASSERT_EQ(strcmp(cstr, str.c_str()), 0) << "i = " << i;
+        }
+    }
+
+    TEST(StringPoolTests, TestReuseMultiple)
+    {
+        StringPool pool;
+        std::vector<StringPool::Id> ids;
+
+        for (size_t i = 0; i < 5; i++)
+        {
+            fillPool(pool, ids);
+
+            // Validate that the contents still match.
+            for (auto i = 0U; i < kTestSize; i++)
+            {
+                const auto& str = kInputStrings[i];
+
+                const auto* cstr = pool.get(ids[i]);
+                ASSERT_NE(cstr, nullptr);
+                ASSERT_EQ(strcmp(cstr, str.c_str()), 0);
+            }
+
+            // Clear.
+            for (auto id : ids)
+            {
+                ASSERT_EQ(pool.release(id), 0);
+            }
+        }
+    }
+
 } // namespace zasm::tests
diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp
index 79b55bd..162e198 100644
--- a/zasm/include/zasm/core/stringpool.hpp
+++ b/zasm/include/zasm/core/stringpool.hpp
@@ -4,7 +4,6 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
-#include <iterator>
 #include <memory>
 #include <string.h>
 #include <string>
@@ -29,6 +28,9 @@ namespace zasm
         static constexpr std::size_t kStringsPerBlock = 4096;
         static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock;
         static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 };
+        static constexpr std::size_t kMinStringCapacity = 16;
+
+        static_assert(kBlockSize >= kMaxStringSize, "Block size must be bigger than max string size.");
 
         // 16 bit prime number.
         static constexpr std::size_t kMaxHashBuckets = 39119;
@@ -38,16 +40,17 @@ namespace zasm
         {
             std::size_t hash{};
             std::size_t blockIndex{};
-            std::size_t sortedIndex{};
             std::int32_t offsetInBlock{};
             std::int32_t len{};
             std::int32_t capacity{};
             std::int32_t refCount{};
+            Id nextFreeId{ Id::Invalid };
         };
 
         std::vector<Entry> _entries;
-        std::vector<Id> _freeEntries;
         std::vector<std::vector<Id>> _hashBuckets;
+        Id _nextFreeId{ Id::Invalid };
+        std::size_t _numFree{};
 
         struct Block
         {
@@ -91,12 +94,14 @@ namespace zasm
 
             if (newRefCount == 0)
             {
-                _freeEntries.push_back(stringId);
+                entry->nextFreeId = _nextFreeId;
+                _nextFreeId = stringId;
+                _numFree++;
 
                 const auto bucketIndex = entry->hash % kMaxHashBuckets;
                 auto& bucket = _hashBuckets[bucketIndex];
 
-                bucket.erase(bucket.begin() + entry->sortedIndex);
+                bucket.erase(std::remove(bucket.begin(), bucket.end(), stringId), bucket.end());
             }
 
             return newRefCount;
@@ -150,8 +155,13 @@ namespace zasm
         void clear() noexcept
         {
             _entries.clear();
-            _blocks.clear();
-            _freeEntries.clear();
+            if (_blocks.size() > 1)
+            {
+                _blocks.resize(1);
+                _blocks[0]->used = 0;
+            }
+            _nextFreeId = Id::Invalid;
+            _numFree = 0;
             for (auto& bucket : _hashBuckets)
             {
                 bucket.clear();
@@ -160,7 +170,7 @@ namespace zasm
 
         std::size_t size() const noexcept
         {
-            return _entries.size();
+            return _entries.size() - _numFree;
         }
 
         Error save(IStream& stream) const
@@ -182,10 +192,6 @@ namespace zasm
                 {
                     return ErrorCode::InvalidParameter;
                 }
-                if (auto len = stream.write(entry.sortedIndex); len == 0)
-                {
-                    return ErrorCode::InvalidParameter;
-                }
                 if (auto len = stream.write(entry.offsetInBlock); len == 0)
                 {
                     return ErrorCode::InvalidParameter;
@@ -202,23 +208,18 @@ namespace zasm
                 {
                     return ErrorCode::InvalidParameter;
                 }
+                if (auto len = stream.write(entry.nextFreeId); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
             }
 
             // Serialize free entries.
-            const auto freeEntryCount = static_cast<std::uint32_t>(_freeEntries.size());
-            if (auto len = stream.write(&freeEntryCount, sizeof(freeEntryCount)); len == 0)
+            if (auto len = stream.write(&_nextFreeId, sizeof(_nextFreeId)); len == 0)
             {
                 return ErrorCode::InvalidParameter;
             }
 
-            for (const auto& freeEntry : _freeEntries)
-            {
-                if (auto len = stream.write(freeEntry); len == 0)
-                {
-                    return ErrorCode::InvalidParameter;
-                }
-            }
-
             // Serialize hash buckets.
             for (const auto& bucket : _hashBuckets)
             {
@@ -284,10 +285,6 @@ namespace zasm
                 {
                     return ErrorCode::InvalidParameter;
                 }
-                if (auto len = stream.read(entry.sortedIndex); len == 0)
-                {
-                    return ErrorCode::InvalidParameter;
-                }
                 if (auto len = stream.read(entry.offsetInBlock); len == 0)
                 {
                     return ErrorCode::InvalidParameter;
@@ -304,25 +301,19 @@ namespace zasm
                 {
                     return ErrorCode::InvalidParameter;
                 }
+                if (auto len = stream.read(entry.nextFreeId); len == 0)
+                {
+                    return ErrorCode::InvalidParameter;
+                }
             }
 
             // Deserialize free entries.
-            std::uint32_t freeEntryCount{};
-            if (auto len = stream.read(&freeEntryCount, sizeof(freeEntryCount)); len == 0)
+            Id nextFreeEntry{ Id::Invalid };
+            if (auto len = stream.read(&nextFreeEntry, sizeof(nextFreeEntry)); len == 0)
             {
                 return ErrorCode::InvalidParameter;
             }
 
-            auto freeEntries = std::vector<Id>();
-            freeEntries.resize(freeEntryCount);
-            for (auto& freeEntry : freeEntries)
-            {
-                if (auto len = stream.read(freeEntry); len == 0)
-                {
-                    return ErrorCode::InvalidParameter;
-                }
-            }
-
             // Deserialize hash buckets.
             auto hashBuckets = std::vector<std::vector<Id>>();
             hashBuckets.resize(kMaxHashBuckets);
@@ -373,9 +364,17 @@ namespace zasm
 
             // Swap state.
             _entries = std::move(loadedEntries);
-            _freeEntries = std::move(freeEntries);
             _hashBuckets = std::move(hashBuckets);
             _blocks = std::move(blocks);
+            _nextFreeId = nextFreeEntry;
+
+            // Compute free count.
+            _numFree = 0;
+            for (auto freeId = _nextFreeId; freeId != Id::Invalid;
+                 freeId = _entries[static_cast<std::size_t>(freeId)].nextFreeId)
+            {
+                _numFree++;
+            }
 
             return ErrorCode::None;
         }
@@ -463,6 +462,42 @@ namespace zasm
             return *_blocks.back();
         }
 
+        Id getFreeEntry(std::size_t requiredLength)
+        {
+            if (_nextFreeId == Id::Invalid)
+            {
+                return Id::Invalid;
+            }
+
+            auto nextFreeId = _nextFreeId;
+            auto parentId = Id::Invalid;
+
+            while (nextFreeId != Id::Invalid)
+            {
+                const auto& entry = _entries[static_cast<std::size_t>(nextFreeId)];
+                if (entry.capacity >= requiredLength)
+                {
+                    if (parentId != Id::Invalid)
+                    {
+                        // Update the next free id of the parent.
+                        auto& parentEntry = _entries[static_cast<std::size_t>(parentId)];
+                        parentEntry.nextFreeId = entry.nextFreeId;
+                    }
+                    else
+                    {
+                        // Update the head of the free list.
+                        _nextFreeId = entry.nextFreeId;
+                    }
+                    _numFree--;
+                    return nextFreeId;
+                }
+                parentId = nextFreeId;
+                nextFreeId = entry.nextFreeId;
+            }
+
+            return Id::Invalid;
+        }
+
         Id aquire_(const char* inputStr, std::size_t len)
         {
             const auto hash = getHash(inputStr, len);
@@ -487,16 +522,8 @@ namespace zasm
                 return stringId;
             }
 
-            const auto len2 = static_cast<std::int32_t>(len);
-            constexpr std::int32_t kTerminatorLength = 1;
-
-            // Use empty entry if any exist.
-            // TODO: Optimize this, we should prioritize finding entries at the end of the list.
-            //       This would mean shrinking the vector has less to copy.
-            auto itEntry = std::find_if(_freeEntries.begin(), _freeEntries.end(), [&](Id id) {
-                auto& entry = _entries[static_cast<std::size_t>(id)];
-                return entry.refCount <= 0 && entry.capacity >= len2 + kTerminatorLength;
-            });
+            const auto actualLength = static_cast<std::int32_t>(len);
+            const auto requiredLength = static_cast<std::int32_t>(len) + 1;
 
             const auto insertToHashBucket = [&](Entry& entry, Id id) {
                 const auto bucketIndex = entry.hash % kMaxHashBuckets;
@@ -506,43 +533,46 @@ namespace zasm
                     return _entries[static_cast<std::size_t>(id)].hash < _entries[static_cast<std::size_t>(id2)].hash;
                 });
 
-                const auto sortedIndex = static_cast<std::size_t>(std::distance(bucket.begin(), sortedIt));
                 bucket.insert(sortedIt, id);
-
-                return sortedIndex;
             };
 
             const auto writeStringToBlock = [&](std::size_t blockOffset, Block& block) {
-                std::memcpy(block.data.data() + blockOffset, inputStr, len2);
+                std::memcpy(block.data.data() + blockOffset, inputStr, actualLength);
                 // Ensure null termination.
-                block.data[blockOffset + len2] = '\0';
+                block.data[blockOffset + actualLength] = '\0';
             };
 
-            if (itEntry != _freeEntries.end())
+            // Use empty entry if any exist.
+            stringId = getFreeEntry(requiredLength);
+            if (stringId != Id::Invalid)
             {
                 // Found empty space.
-                stringId = *itEntry;
                 auto& entry = _entries[static_cast<std::size_t>(stringId)];
+                assert(entry.refCount == 0);
 
                 auto& block = _blocks[entry.blockIndex];
                 writeStringToBlock(entry.offsetInBlock, *block);
 
                 entry.hash = hash;
-                entry.len = len2;
+                entry.len = actualLength;
                 entry.refCount = 1;
-                entry.sortedIndex = insertToHashBucket(entry, stringId);
+                entry.nextFreeId = Id::Invalid;
 
-                _freeEntries.erase(itEntry);
+                insertToHashBucket(entry, stringId);
 
                 return stringId;
             }
 
             // New entry.
-            auto& block = getBlock(len2 + kTerminatorLength);
+
+            // We align the capacity to 8 bytes.
+            const auto capacity = std::max<std::int32_t>(kMinStringCapacity, (requiredLength + 7) & ~7);
+
+            auto& block = getBlock(capacity);
             const auto offset = static_cast<std::int32_t>(block.used);
 
             writeStringToBlock(offset, block);
-            block.used += len2 + kTerminatorLength;
+            block.used += capacity;
 
             stringId = static_cast<Id>(_entries.size());
 
@@ -550,10 +580,11 @@ namespace zasm
             entry.hash = hash;
             entry.blockIndex = block.index;
             entry.offsetInBlock = offset;
-            entry.len = len2;
-            entry.capacity = len2 + kTerminatorLength;
+            entry.len = actualLength;
+            entry.capacity = capacity;
             entry.refCount = 1;
-            entry.sortedIndex = insertToHashBucket(entry, stringId);
+
+            insertToHashBucket(entry, stringId);
 
             return stringId;
         }

From 982f88d6bc5051e212d7e5e01177a78ac8259101 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Fri, 8 Nov 2024 04:36:02 +0200
Subject: [PATCH 6/8] Code cleanup

---
 zasm/include/zasm/core/stringpool.hpp | 52 ++++++++++++++++-----------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp
index 162e198..5a9ffc3 100644
--- a/zasm/include/zasm/core/stringpool.hpp
+++ b/zasm/include/zasm/core/stringpool.hpp
@@ -27,8 +27,8 @@ namespace zasm
         static constexpr std::size_t kAverageStringSize = 24;
         static constexpr std::size_t kStringsPerBlock = 4096;
         static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock;
-        static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 };
         static constexpr std::size_t kMinStringCapacity = 16;
+        static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 };
 
         static_assert(kBlockSize >= kMaxStringSize, "Block size must be bigger than max string size.");
 
@@ -36,13 +36,21 @@ namespace zasm
         static constexpr std::size_t kMaxHashBuckets = 39119;
 
     private:
+        using StringSize = std::conditional_t<
+            kMaxStringSize <= std::numeric_limits<std::uint16_t>::max(), std::uint16_t, std::uint32_t>;
+
+        using BlockOffset = std::conditional_t<
+            kBlockSize <= std::numeric_limits<std::uint32_t>::max(), std::uint32_t, std::uint64_t>;
+
+        using BlockIndex = std::uint32_t;
+
         struct Entry
         {
-            std::size_t hash{};
-            std::size_t blockIndex{};
-            std::int32_t offsetInBlock{};
-            std::int32_t len{};
-            std::int32_t capacity{};
+            std::uint64_t hash{};
+            BlockIndex blockIndex{};
+            BlockOffset offsetInBlock{};
+            StringSize len{};
+            StringSize capacity{};
             std::int32_t refCount{};
             Id nextFreeId{ Id::Invalid };
         };
@@ -54,8 +62,8 @@ namespace zasm
 
         struct Block
         {
-            std::size_t index{};
-            std::size_t used{};
+            BlockIndex index{};
+            std::uint32_t used{};
             std::array<char, kBlockSize> data{};
         };
 
@@ -76,6 +84,11 @@ namespace zasm
             return aquire_(value, size);
         }
 
+        Id aquire(std::string_view str)
+        {
+            return aquire_(str.data(), str.size());
+        }
+
         Id aquire(const std::string& val)
         {
             return aquire_(val.c_str(), val.size());
@@ -131,7 +144,7 @@ namespace zasm
             return _blocks[entry->blockIndex]->data.data() + entry->offsetInBlock;
         }
 
-        int32_t getLength(Id stringId) const noexcept
+        std::size_t getLength(Id stringId) const noexcept
         {
             const auto* entry = getEntry(*this, stringId);
             if (entry == nullptr)
@@ -403,7 +416,7 @@ namespace zasm
             return &self._entries[idx];
         }
 
-        Id find_(const char* buf, std::size_t len, std::size_t hash) const noexcept
+        Id find_(const char* buf, std::size_t len, std::uint64_t hash) const noexcept
         {
             const auto bucketIndex = hash % kMaxHashBuckets;
             const auto& bucket = _hashBuckets[bucketIndex];
@@ -456,7 +469,7 @@ namespace zasm
 
             // Create a new block.
             auto newBlock = std::make_unique<Block>();
-            newBlock->index = _blocks.size();
+            newBlock->index = static_cast<BlockIndex>(_blocks.size());
             _blocks.emplace_back(std::move(newBlock));
 
             return *_blocks.back();
@@ -504,7 +517,7 @@ namespace zasm
             return aquire_(inputStr, len, hash);
         }
 
-        Id aquire_(const char* inputStr, std::size_t len, std::size_t hash)
+        Id aquire_(const char* inputStr, std::size_t len, std::uint64_t hash)
         {
             // Strings can not be larger than kMaxStringSize.
             if (len >= kMaxStringSize)
@@ -589,7 +602,7 @@ namespace zasm
             return stringId;
         }
 
-        static constexpr std::size_t getHash(const char* buf, size_t len) noexcept
+        static constexpr std::uint64_t getHash(const char* buf, size_t len) noexcept
         {
             assert(buf != nullptr);
             assert(len > 0);
@@ -598,14 +611,11 @@ namespace zasm
             {
                 return 0;
             }
-#ifdef _M_AMD64
-            constexpr std::size_t offset = 0xcbf29ce484222325ULL;
-            constexpr std::size_t prime = 0x00000100000001B3ULL;
-#else
-            constexpr std::size_t offset = 0x811c9dc5U;
-            constexpr std::size_t prime = 0x01000193U;
-#endif
-            std::size_t result = offset;
+
+            constexpr std::uint64_t offset = 0xcbf29ce484222325ULL;
+            constexpr std::uint64_t prime = 0x00000100000001B3ULL;
+
+            std::uint64_t result = offset;
             for (std::size_t i = 0; i < len; ++i)
             {
                 result ^= static_cast<unsigned char>(buf[i]);

From 9e920e0f00c50e0d3ce628e33338ac56684085a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Fri, 8 Nov 2024 04:57:09 +0200
Subject: [PATCH 7/8] Fix typo

---
 .../src/benchmarks/benchmark.stringpool.cpp   | 16 +++++++--------
 tests/src/tests/tests.stringpool.cpp          | 20 +++++++++----------
 zasm/include/zasm/core/stringpool.hpp         | 11 +++++++---
 zasm/src/zasm/src/program/program.cpp         |  8 ++++----
 .../src/zasm/src/serialization/serializer.cpp |  2 +-
 5 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp
index df5fdf4..73106fa 100644
--- a/benchmark/src/benchmarks/benchmark.stringpool.cpp
+++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp
@@ -23,7 +23,7 @@ namespace zasm::benchmarks
         return strings;
     }();
 
-    static void BM_StringPool_Aquire(benchmark::State& state)
+    static void BM_StringPool_Acquire(benchmark::State& state)
     {
         StringPool pool;
         for (auto _ : state)
@@ -32,12 +32,12 @@ namespace zasm::benchmarks
             {
                 const auto& str = kInputStrings[i];
 
-                auto stringId = pool.aquire(str);
+                auto stringId = pool.acquire(str);
                 benchmark::DoNotOptimize(stringId);
             }
         }
     }
-    BENCHMARK(BM_StringPool_Aquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
+    BENCHMARK(BM_StringPool_Acquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond);
 
     static void BM_StringPool_Release(benchmark::State& state)
     {
@@ -49,7 +49,7 @@ namespace zasm::benchmarks
                 state.PauseTiming();
                 const auto& str = kInputStrings[i];
 
-                auto stringId = pool.aquire(str);
+                auto stringId = pool.acquire(str);
                 state.ResumeTiming();
 
                 auto refCount = pool.release(stringId);
@@ -71,7 +71,7 @@ namespace zasm::benchmarks
             {
                 const auto& str = kInputStrings[i];
 
-                auto stringId = pool.aquire(str);
+                auto stringId = pool.acquire(str);
                 stringIds.push_back(stringId);
             }
 
@@ -86,7 +86,7 @@ namespace zasm::benchmarks
             {
                 const auto& str = kInputStrings[i];
 
-                auto stringId = pool.aquire(str);
+                auto stringId = pool.acquire(str);
                 benchmark::DoNotOptimize(stringId);
             }
         }
@@ -103,7 +103,7 @@ namespace zasm::benchmarks
                 state.PauseTiming();
                 const auto& str = kInputStrings[i];
 
-                auto stringId = pool.aquire(str);
+                auto stringId = pool.acquire(str);
                 state.ResumeTiming();
 
                 const char* res = pool.get(stringId);
@@ -123,7 +123,7 @@ namespace zasm::benchmarks
                 state.PauseTiming();
                 const auto& str = kInputStrings[i];
 
-                auto stringId = pool.aquire(str);
+                auto stringId = pool.acquire(str);
                 state.ResumeTiming();
 
                 auto strLen = pool.getLength(stringId);
diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp
index 2439aa6..83bc9a5 100644
--- a/tests/src/tests/tests.stringpool.cpp
+++ b/tests/src/tests/tests.stringpool.cpp
@@ -13,7 +13,7 @@ namespace zasm::tests
         constexpr const char str1[] = "hello";
         constexpr const char str2[] = "Hello";
 
-        const auto id0 = pool.aquire(str1);
+        const auto id0 = pool.acquire(str1);
         ASSERT_NE(id0, StringPool::Id::Invalid);
         ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1);
         ASSERT_EQ(pool.getRefCount(id0), 1);
@@ -21,7 +21,7 @@ namespace zasm::tests
         ASSERT_NE(cstr0, nullptr);
         ASSERT_EQ(strcmp(cstr0, str1), 0);
 
-        const auto id1 = pool.aquire(str1);
+        const auto id1 = pool.acquire(str1);
         ASSERT_NE(id1, StringPool::Id::Invalid);
         ASSERT_EQ(id1, id0);
         ASSERT_EQ(pool.getLength(id1), std::size(str1) - 1);
@@ -30,7 +30,7 @@ namespace zasm::tests
         ASSERT_EQ(cstr1, cstr0);
         ASSERT_EQ(strcmp(cstr1, str1), 0);
 
-        const auto id2 = pool.aquire(str2);
+        const auto id2 = pool.acquire(str2);
         ASSERT_NE(id2, StringPool::Id::Invalid);
         ASSERT_NE(id2, id1);
         ASSERT_EQ(pool.getLength(id2), std::size(str2) - 1);
@@ -49,17 +49,17 @@ namespace zasm::tests
         constexpr const char str4[] = "hello4";
         constexpr const char str5[] = "hello...";
 
-        const auto id0 = pool.aquire(str1);
+        const auto id0 = pool.acquire(str1);
         ASSERT_NE(id0, StringPool::Id::Invalid);
         ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1);
         ASSERT_EQ(pool.getRefCount(id0), 1);
 
-        const auto id1 = pool.aquire(str2);
+        const auto id1 = pool.acquire(str2);
         ASSERT_NE(id1, StringPool::Id::Invalid);
         ASSERT_EQ(pool.getLength(id1), std::size(str2) - 1);
         ASSERT_EQ(pool.getRefCount(id1), 1);
 
-        const auto id2 = pool.aquire(str3);
+        const auto id2 = pool.acquire(str3);
         ASSERT_NE(id2, StringPool::Id::Invalid);
         ASSERT_EQ(pool.getLength(id2), std::size(str3) - 1);
         ASSERT_EQ(pool.getRefCount(id2), 1);
@@ -68,13 +68,13 @@ namespace zasm::tests
         const auto* cstr0 = pool.get(id1);
         ASSERT_EQ(cstr0, nullptr);
 
-        const auto id3 = pool.aquire(str4);
+        const auto id3 = pool.acquire(str4);
         ASSERT_NE(id3, StringPool::Id::Invalid);
         ASSERT_EQ(id1, id3);
         ASSERT_EQ(pool.getLength(id3), std::size(str4) - 1);
         ASSERT_EQ(pool.getRefCount(id3), 1);
 
-        const auto id4 = pool.aquire(str4);
+        const auto id4 = pool.acquire(str4);
         ASSERT_NE(id4, StringPool::Id::Invalid);
         ASSERT_EQ(id4, id3);
         ASSERT_EQ(pool.getLength(id4), std::size(str4) - 1);
@@ -83,7 +83,7 @@ namespace zasm::tests
         ASSERT_EQ(pool.release(id4), 1);
         ASSERT_EQ(pool.release(id4), 0);
 
-        const auto id5 = pool.aquire(str5);
+        const auto id5 = pool.acquire(str5);
         ASSERT_NE(id5, StringPool::Id::Invalid);
         ASSERT_EQ(id5, id1);
         ASSERT_EQ(pool.getLength(id5), std::size(str5) - 1);
@@ -115,7 +115,7 @@ namespace zasm::tests
         ids.clear();
         for (const auto& str : kInputStrings)
         {
-            const auto id = pool.aquire(str);
+            const auto id = pool.acquire(str);
             ASSERT_NE(id, StringPool::Id::Invalid);
             ASSERT_EQ(pool.getLength(id), str.size());
             ASSERT_EQ(pool.getRefCount(id), 1);
diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp
index 5a9ffc3..849f6f4 100644
--- a/zasm/include/zasm/core/stringpool.hpp
+++ b/zasm/include/zasm/core/stringpool.hpp
@@ -75,7 +75,7 @@ namespace zasm
             _hashBuckets.resize(kMaxHashBuckets);
         }
 
-        Id aquire(const char* value, std::size_t size = kUnspecifiedSize)
+        Id acquire(const char* value, std::size_t size = kUnspecifiedSize)
         {
             if (size == kUnspecifiedSize)
             {
@@ -84,12 +84,12 @@ namespace zasm
             return aquire_(value, size);
         }
 
-        Id aquire(std::string_view str)
+        Id acquire(std::string_view str)
         {
             return aquire_(str.data(), str.size());
         }
 
-        Id aquire(const std::string& val)
+        Id acquire(const std::string& val)
         {
             return aquire_(val.c_str(), val.size());
         }
@@ -167,6 +167,11 @@ namespace zasm
 
         void clear() noexcept
         {
+            if (_entries.empty())
+            {
+                // Because clearing the buckets is expensive do nothing if already empty.
+                return;
+            }
             _entries.clear();
             if (_blocks.size() > 1)
             {
diff --git a/zasm/src/zasm/src/program/program.cpp b/zasm/src/zasm/src/program/program.cpp
index ddcb651..ce5685c 100644
--- a/zasm/src/zasm/src/program/program.cpp
+++ b/zasm/src/zasm/src/program/program.cpp
@@ -154,7 +154,7 @@ namespace zasm
 
         if (name != nullptr)
         {
-            entry.nameId = _state->symbolNames.aquire(name);
+            entry.nameId = _state->symbolNames.acquire(name);
         }
     }
 
@@ -594,7 +594,7 @@ namespace zasm
         {
             return StringPool::Id::Invalid;
         }
-        return state.symbolNames.aquire(str);
+        return state.symbolNames.acquire(str);
     }
 
     static Label createLabel_(detail::ProgramState& state, StringPool::Id nameId, StringPool::Id modId, LabelFlags flags)
@@ -731,7 +731,7 @@ namespace zasm
 
         if (name != nullptr)
         {
-            entry.nameId = _state->symbolNames.aquire(name);
+            entry.nameId = _state->symbolNames.acquire(name);
         }
 
         return Section{ sectId };
@@ -800,7 +800,7 @@ namespace zasm
             entry->nameId = StringPool::Id::Invalid;
         }
 
-        entry->nameId = _state->symbolNames.aquire(name);
+        entry->nameId = _state->symbolNames.acquire(name);
         return ErrorCode::None;
     }
 
diff --git a/zasm/src/zasm/src/serialization/serializer.cpp b/zasm/src/zasm/src/serialization/serializer.cpp
index 05e57aa..548ceb0 100644
--- a/zasm/src/zasm/src/serialization/serializer.cpp
+++ b/zasm/src/zasm/src/serialization/serializer.cpp
@@ -475,7 +475,7 @@ namespace zasm
         defaultSect.attribs = Section::kDefaultAttribs;
         defaultSect.align = Section::kDefaultAlign;
         defaultSect.address = newBase;
-        defaultSect.nameId = programState.symbolNames.aquire(".text");
+        defaultSect.nameId = programState.symbolNames.acquire(".text");
 
         const auto serializePass = [&]() -> Error {
             state.buffer.clear();

From 9dc918ef608fb167a0bafb572bb9b80607b5d239 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=CE=B6eh=20Matt?= <5415177+ZehMatt@users.noreply.github.com>
Date: Fri, 8 Nov 2024 05:04:54 +0200
Subject: [PATCH 8/8] Add another test for StringPool

---
 tests/src/tests/tests.stringpool.cpp | 42 ++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp
index 83bc9a5..2a9d738 100644
--- a/tests/src/tests/tests.stringpool.cpp
+++ b/tests/src/tests/tests.stringpool.cpp
@@ -39,6 +39,48 @@ namespace zasm::tests
         ASSERT_EQ(strcmp(cstr2, str2), 0);
     }
 
+    TEST(StringPoolTests, TestDuplicate)
+    {
+        StringPool pool;
+
+        constexpr const char str1[] = "hello";
+        constexpr const char str2[] = "hello";
+        constexpr const char str3[] = "hello";
+        constexpr const char str4[] = "hello1";
+
+        const auto id0 = pool.acquire(str1);
+        ASSERT_NE(id0, StringPool::Id::Invalid);
+        ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1);
+        ASSERT_EQ(pool.getRefCount(id0), 1);
+        const auto* cstr0 = pool.get(id0);
+        ASSERT_NE(cstr0, nullptr);
+        ASSERT_EQ(strcmp(cstr0, str1), 0);
+
+        const auto id1 = pool.acquire(str2);
+        ASSERT_NE(id1, StringPool::Id::Invalid);
+        ASSERT_EQ(id1, id0);
+        ASSERT_EQ(pool.getLength(id1), std::size(str2) - 1);
+        ASSERT_EQ(pool.getRefCount(id1), 2);
+        const auto* cstr2 = pool.get(id1);
+        ASSERT_EQ(strcmp(cstr2, str2), 0);
+
+        const auto id2 = pool.acquire(str3);
+        ASSERT_NE(id2, StringPool::Id::Invalid);
+        ASSERT_EQ(id2, id0);
+        ASSERT_EQ(pool.getLength(id2), std::size(str3) - 1);
+        ASSERT_EQ(pool.getRefCount(id2), 3);
+        const auto* cstr3 = pool.get(id2);
+        ASSERT_EQ(strcmp(cstr3, str3), 0);
+
+        const auto id3 = pool.acquire(str4);
+        ASSERT_NE(id3, StringPool::Id::Invalid);
+        ASSERT_NE(id3, id0);
+        ASSERT_EQ(pool.getLength(id3), std::size(str4) - 1);
+        ASSERT_EQ(pool.getRefCount(id3), 1);
+        const auto* cstr4 = pool.get(id3);
+        ASSERT_EQ(strcmp(cstr4, str4), 0);
+    }
+
     TEST(StringPoolTests, TestRelease)
     {
         StringPool pool;