24958: A general fix for SBFDS index issues #490

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged

howsohazard merged 1 commit into main from 24958-sbfds-index-fix

Jan 21, 2026

src/Amalgam/SBFDSColumnData.cpp

-Original file line number
+Diff line change
@@ Expand Up @@
     			auto old_value_entry = sortedNumberValueEntries.find(old_number_value);
     			if(old_value_entry == end(sortedNumberValueEntries))
-    			{
-    				//value must have changed sizes, look in each size
-    				//note that this is inefficient -- if this ends up being a bottleneck,
-    				//an additional data structure will need to be built to maintain the previous size
-    				//TODO 24298: ensure index size is always correct and updated so entities won't be missed, remove this code and assert false if not found
-    				for(auto cur_id_entry = begin(sortedNumberValueEntries); cur_id_entry != end(sortedNumberValueEntries); ++cur_id_entry)
-    				{
-    					if(cur_id_entry->second.indicesWithValue.contains(index))
-    					{
-    						old_value_entry = cur_id_entry;
-    						break;
-    					}
-    				}
-    				//if not found anywhere, then there's index corruption
-    				if(old_value_entry == end(sortedNumberValueEntries))
-    					assert(false);
-    			}
+    				assert(false);
     			//if there are multiple entries for this number, just remove the id from the old value
     			if(old_value_entry->second.indicesWithValue.size() > 1)
@@ Expand Down Expand Up @@
     			size_t new_value_index = 0;
     			auto old_id_entry = stringIdValueEntries.find(old_sid_value);
     			if(old_id_entry == end(stringIdValueEntries))
-    			{
-    				//value must have changed sizes, look in each size
-    				//note that this is inefficient -- if this ends up being a bottleneck,
-    				//an additional data structure will need to be built to maintain the previous size
-    				//TODO 24298: ensure index size is always correct and updated so entities won't be missed, remove this code and assert false if not found
-    				for(auto cur_id_entry = begin(stringIdValueEntries); cur_id_entry != end(stringIdValueEntries); ++cur_id_entry)
-    				{
-    					if(cur_id_entry->second != nullptr && cur_id_entry->second->indicesWithValue.contains(index))
-    					{
-    						old_id_entry = cur_id_entry;
-    						break;
-    					}
-    				}
-    				//if not found anywhere, then there's index corruption
-    				if(old_id_entry == end(stringIdValueEntries))
-    					assert(false);
-    			}
+    				assert(false);
     			//if there are multiple entries for this string, just move the id
     			if(old_id_entry->second->indicesWithValue.size() > 1)
@@ Expand Down Expand Up @@
     				//need to emplace above before searching to ensure new_size_entry does not become invalidated
     				auto old_size_entry = valueCodeSizeToIndices.find(old_code_size);
     				if(old_size_entry == end(valueCodeSizeToIndices))
-    				{
-    					//value must have changed sizes, look in each size
-    					//note that this is inefficient -- if this ends up being a bottleneck,
-    					//an additional data structure will need to be built to maintain the previous size
-    					//TODO 24298: ensure index size is always correct and updated so entities won't be missed, remove this code and assert false if not found
-    					for(auto cur_id_entry = begin(valueCodeSizeToIndices); cur_id_entry != end(valueCodeSizeToIndices); ++cur_id_entry)
-    					{
-    						if(cur_id_entry->second != nullptr && cur_id_entry->second->contains(index))
-    						{
-    							old_size_entry = cur_id_entry;
-    							break;
-    						}
-    					}
-    					//if not found anywhere, then there's index corruption
-    					if(old_size_entry == end(valueCodeSizeToIndices))
-    						assert(false);
-    				}
+    					assert(false);
     				//if there are multiple entries for this string, just move the id
     				if(old_size_entry->second->size() > 1)
@@ Expand Down Expand Up @@
     		//look up value
     		auto value_entry = sortedNumberValueEntries.find(resolved_value.number);
     		if(value_entry == end(sortedNumberValueEntries))
-    		{
-    			//value must have changed sizes, look in each size
-    			//note that this is inefficient -- if this ends up being a bottleneck,
-    			//an additional data structure will need to be built to maintain the previous size
-    			//TODO 24298: ensure index size is always correct and updated so entities won't be missed, remove this code and assert false if not found
-    			for(auto cur_value_entry = begin(sortedNumberValueEntries); cur_value_entry != end(sortedNumberValueEntries); ++cur_value_entry)
-    			{
-    				if(cur_value_entry->second.indicesWithValue.contains(index))
-    				{
-    					value_entry = cur_value_entry;
-    					break;
-    				}
-    			}
-    			//if not found anywhere, then there's index corruption
-    			if(value_entry == end(sortedNumberValueEntries))
-    				assert(false);
-    		}
+    			assert(false);
     		//if the bucket has only one entry, we must delete the entire bucket
     		if(value_entry->second.indicesWithValue.size() == 1)
@@ Expand All @@
     		auto id_entry = stringIdValueEntries.find(resolved_value.stringID);
     		if(id_entry == end(stringIdValueEntries))
-    		{
-    			//value must have changed sizes, look in each size
-    			//note that this is inefficient -- if this ends up being a bottleneck,
-    			//an additional data structure will need to be built to maintain the previous size
-    			//TODO 24298: ensure index size is always correct and updated so entities won't be missed, remove this code and assert false if not found
-    			for(auto cur_id_entry = begin(stringIdValueEntries); cur_id_entry != end(stringIdValueEntries); ++cur_id_entry)
-    			{
-    				if(cur_id_entry->second->indicesWithValue.contains(index))
-    				{
-    					id_entry = cur_id_entry;
-    					break;
-    				}
-    			}
-    			//if not found anywhere, then there's index corruption
-    			if(id_entry == end(stringIdValueEntries))
-    				assert(false);
-    		}
+    			assert(false);
     		auto &entities = id_entry->second->indicesWithValue;
     		entities.erase(index);
@@ Expand All @@
     		size_t num_indices = EvaluableNode::GetDeepSize(value.code);
     		auto id_entry = valueCodeSizeToIndices.find(num_indices);
     		if(id_entry == end(valueCodeSizeToIndices))
-    		{
-    			//value must have changed sizes, look in each size
-    			//note that this is inefficient -- if this ends up being a bottleneck,
-    			//an additional data structure will need to be built to maintain the previous size
-    			//TODO 24298: ensure index size is always correct and updated so entities won't be missed, remove this code and assert false if not found
-    			for(auto cur_id_entry = begin(valueCodeSizeToIndices); cur_id_entry != end(valueCodeSizeToIndices); ++cur_id_entry)
-    			{
-    				if(cur_id_entry->second->contains(index))
-    				{
-    					id_entry = cur_id_entry;
-    					break;
-    				}
-    			}
-    			//if not found anywhere, then there's index corruption
-    			if(id_entry == end(valueCodeSizeToIndices))
-    				assert(false);
-    		}
+    			assert(false);
     		//remove the entity
     		auto &entities = *(id_entry->second);
@@ Expand All @@
     		valueEntries[index] = std::numeric_limits<double>::quiet_NaN();
     }
+    //removes index from the first cache found to contain it, regardless of the type of value stored for it
+    // the caches are checked in a fixed order: invalid, null, false, true, numbers, string ids, then code
+    // if none of these checks finds index, the function returns without removing anything from the number, string id, or code entries
+    void SBFDSColumnData::RemoveIndexFromCaches(size_t index)
+    {
+    	//simple index sets first; each EraseAndRetrieve is only attempted if every earlier one yielded false
+    	if(invalidIndices.EraseAndRetrieve(index)
+    			|| nullIndices.EraseAndRetrieve(index)
+    			|| falseBoolIndices.EraseAndRetrieve(index)
+    			|| trueBoolIndices.EraseAndRetrieve(index))
+    		return;
+    	//numbers: walk every number value entry looking for the one that holds index
+    	for(auto number_entry = begin(sortedNumberValueEntries); number_entry != end(sortedNumberValueEntries); ++number_entry)
+    	{
+    		auto &indices = number_entry->second.indicesWithValue;
+    		if(!indices.contains(index))
+    			continue;
+    		if(indices.size() != 1)
+    		{
+    			//other indices share this value, so only take index out of the entry
+    			indices.erase(index);
+    		}
+    		else
+    		{
+    			//index is the only one with this value, so release the interned value and drop the whole entry
+    			internedNumberValues.DeleteInternIndex(number_entry->second.valueInternIndex);
+    			sortedNumberValueEntries.erase(number_entry);
+    		}
+    		numberIndices.erase(index);
+    		//the iterator may no longer be valid after the erase above, so stop here
+    		return;
+    	}
+    	//string ids: walk every string id value entry looking for the one that holds index
+    	for(auto string_entry = begin(stringIdValueEntries); string_entry != end(stringIdValueEntries); ++string_entry)
+    	{
+    		auto &indices = string_entry->second->indicesWithValue;
+    		if(!indices.contains(index))
+    			continue;
+    		indices.erase(index);
+    		//once no index has this string, release its interned value and drop the entry
+    		if(indices.size() == 0)
+    		{
+    			internedStringIdValues.DeleteInternIndex(string_entry->second->valueInternIndex);
+    			stringIdValueEntries.erase(string_entry);
+    		}
+    		//the removed index was the one tracked as having the longest string, so find the new longest
+    		if(index == indexWithLongestString)
+    			RecomputeLongestString();
+    		stringIdIndices.erase(index);
+    		return;
+    	}
+    	//code: walk every code size entry looking for the one that holds index
+    	for(auto code_entry = begin(valueCodeSizeToIndices); code_entry != end(valueCodeSizeToIndices); ++code_entry)
+    	{
+    		auto &indices = *(code_entry->second);
+    		if(!indices.contains(index))
+    			continue;
+    		indices.erase(index);
+    		//once no index has code of this size, drop the entry
+    		if(indices.size() == 0)
+    			valueCodeSizeToIndices.erase(code_entry);
+    		//the removed index was the one tracked as having the largest code, so find the new largest
+    		if(index == indexWithLargestCode)
+    			RecomputeLargestCode();
+    		codeIndices.erase(index);
+    		return;
+    	}
+    }
     void SBFDSColumnData::Optimize()
     {
     #ifdef SBFDS_VERIFICATION
@@ Expand Down @@

src/Amalgam/SBFDSColumnData.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -167,6 +167,12 @@ class SBFDSColumnData @@
     	void DeleteIndexValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue value,
     		size_t index, bool remove_last_entity);
+    	//removes all of an index's data from the caches regardless of type
+    	// it should be followed up with an appropriate insert operation with the new value
+    	// to maintain cache consistency
+    	//TODO 24298: attempt to remove this and pass in the previous value for every label change
+    	void RemoveIndexFromCaches(size_t index);
     	//changes column to/from interning as would yield best performance
     	void Optimize();
@@ Expand Down Expand Up / @@ -443,6 +449,12 @@ class SBFDSColumnData @@
     	//used for debugging to make sure all entities are valid
     	inline void VerifyAllEntities(size_t max_num_entities = std::numeric_limits<size_t>::max())
     	{
+    		size_t num_entities = invalidIndices.size() + nullIndices.size() + falseBoolIndices.size() + trueBoolIndices.size()
+    			+ numberIndices.size() + stringIdIndices.size() + codeIndices.size();
+    		if(max_num_entities < std::numeric_limits<size_t>::max())
+    			assert(num_entities == max_num_entities);
     		for(auto &value_entry : sortedNumberValueEntries)
     		{
     			//ensure all interned values are valid
@@ Expand Down @@

src/Amalgam/SeparableBoxFilterDataStore.cpp

-Original file line number
+Diff line change
@@ Expand Up @@
     	for(auto &column_data : columnData)
     	{
+    		//TODO 24298: switch this to use ChangeIndexValue if possible
+    		column_data->RemoveIndexFromCaches(entity_index);
     		auto [value, found] = entity->GetValueAtLabelAsImmediateValue(column_data->stringId);
-    		column_data->ChangeIndexValue(value.nodeType, value.nodeValue, entity_index);
+    		column_data->InsertIndexValue(value.nodeType, value.nodeValue, entity_index);
     	}
     	//clean up any labels that aren't relevant
@@ Expand Down Expand Up @@
     	VerifyAllEntitiesForColumn(column_index);
     #endif
-    	//get the new value
+    	//TODO 24298: switch this to use ChangeIndexValue if possible
+    	column_data->RemoveIndexFromCaches(entity_index);
     	auto [value, found] = entity->GetValueAtLabelAsImmediateValue(column_data->stringId);
-    	column_data->ChangeIndexValue(value.nodeType, value.nodeValue, entity_index);
+    	column_data->InsertIndexValue(value.nodeType, value.nodeValue, entity_index);
     	//remove the label if no longer relevant
     	if(IsColumnIndexRemovable(column_index))
     		RemoveColumnIndex(column_index);
     	else
     		OptimizeColumn(column_index);
+    #ifdef SBFDS_VERIFICATION
+    	VerifyAllEntitiesForColumn(column_index);
+    #endif
     }
     //populates distances_out with all entities and their distances that have a distance to target less than max_dist
@@ Expand Down @@

src/Amalgam/SeparableBoxFilterDataStore.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -672,7 +672,7 @@ class SeparableBoxFilterDataStore @@
     	inline void VerifyAllEntitiesForAllColumns()
     	{
     		for(auto &column_data : columnData)
-    			column_data->VerifyAllEntities();
+    			column_data->VerifyAllEntities(numEntities);
     	}
     	//deletes the index and associated data
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

24958: A general fix for SBFDS index issues #490

Uh oh!

Diff view

Diff view

There are no files selected for viewing