memtable/skiplist: add a purpose-built skiplist #131
The first new file is the arena (bump) allocator that backs the skiplist:

@@ -0,0 +1,120 @@

```rust
// Copyright (c) 2024-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

use std::{
    alloc::Layout,
    mem::offset_of,
    sync::{
        atomic::{AtomicPtr, AtomicUsize, Ordering},
        Mutex,
    },
};

// DEFAULT_BUFFER_SIZE needs to be at least big enough for one fully-aligned node
// for the crate to work correctly. Anything larger than that will work.
//
// TODO: Justify this size.
const DEFAULT_BUFFER_SIZE: usize = (32 << 10) - size_of::<AtomicUsize>();
```
Contributor: Need to play with this a bit, but it should probably be much higher by default: 1 MB or so?

Author: Yeah, it should be bigger than 32k, but 1 MiB might be too big. The keys and values are not inline; it's just the metadata. The questions I'd ask are how expensive allocating a new block is and how expensive inserting into the skip map is. My guess is that the alloc is not likely worse than 10 µs (it's probably way less) and the inserts are ~100 ns. If you can fit 1000 in here (if we say the average links is 32 and the key and value are each 32 bytes), then you'll have spent at least 10x as long doing the inserting as doing the alloc. In practice I think the mallocs, even with zeroing, are a lot cheaper. The benchmarks I was playing with don't show much win above 256 KiB.
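For illustration, the amortization argument above worked through with the comment's own estimates (these are the commenter's guesses, not measurements from the PR):

```rust
fn main() {
    // Estimates from the discussion above, not measurements.
    let alloc_ns_per_block = 10_000.0; // ~10 µs to allocate and zero one block
    let insert_ns_per_node = 100.0;    // ~100 ns per skiplist insert
    let nodes_per_block = 1_000.0;     // rough node count per 32 KiB block

    // Share of total time spent allocating, amortized over one block's worth
    // of inserts.
    let alloc_share =
        alloc_ns_per_block / (alloc_ns_per_block + nodes_per_block * insert_ns_per_node);
    println!("allocation overhead: ~{:.0}% of the combined time", alloc_share * 100.0);
    // Prints ~9%, consistent with the benchmarks showing little win above 256 KiB:
    // larger buffers only shave an already-small fraction of the total cost.
}
```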
```rust
impl<const BUFFER_SIZE: usize> Default for Arenas<BUFFER_SIZE> {
    fn default() -> Self {
        Self::new()
    }
}

unsafe impl<const N: usize> Send for Arenas<N> {}
unsafe impl<const N: usize> Sync for Arenas<N> {}

pub(crate) struct Arenas<const BUFFER_SIZE: usize = DEFAULT_BUFFER_SIZE> {
```
Contributor: Eventually, for write transactions, the size should be much smaller (so that small transactions don't over-allocate too much), so this needs to be a non-generic parameter.

Author: Okay, I can do that.
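One possible shape for that change, sketched with hypothetical names (`RuntimeArenas`, `BufferHeader`) that are not part of the PR: the block size becomes a constructor argument, and each block would be a header followed by `buffer_size` bytes allocated with a runtime `Layout`.

```rust
use std::sync::{
    atomic::{AtomicPtr, AtomicUsize},
    Mutex,
};

// Hypothetical: header placed at the start of each heap-allocated block;
// the usable bytes follow it, and their length is `RuntimeArenas::buffer_size`.
struct BufferHeader {
    offset: AtomicUsize,
}

// Hypothetical runtime-sized variant: the size is a field instead of a
// const generic, so write transactions can construct small arenas.
pub(crate) struct RuntimeArenas {
    buffer_size: usize,
    arenas: Mutex<Vec<*mut BufferHeader>>,
    open_arena: AtomicPtr<BufferHeader>,
}

impl RuntimeArenas {
    pub(crate) fn new(buffer_size: usize) -> Self {
        Self {
            buffer_size,
            arenas: Mutex::default(),
            open_arena: AtomicPtr::default(),
        }
    }
}
```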
```rust
    // The current set of Arenas
    arenas: Mutex<Vec<*mut Buffer<BUFFER_SIZE>>>,
    // Cache of the currently open Arena. It'll be the last item in the buffers
    // vec. This atomic is only ever written while holding the buffers Mutex.
    open_arena: AtomicPtr<Buffer<BUFFER_SIZE>>,
}

impl<const BUFFER_SIZE: usize> Arenas<BUFFER_SIZE> {
    pub(crate) fn new() -> Self {
        Self {
            arenas: Mutex::default(),
            open_arena: AtomicPtr::default(),
        }
    }
}

impl<const BUFFER_SIZE: usize> Arenas<BUFFER_SIZE> {
    pub(crate) fn alloc(&self, layout: Layout) -> *mut u8 {
        loop {
            let buffer_tail = self.open_arena.load(Ordering::Acquire);
            if !buffer_tail.is_null() {
                if let Some(offset) = try_alloc(buffer_tail, layout) {
                    return offset;
                }
            }

            let mut buffers = self.arenas.lock().expect("lock is poisoned");
            let buffer = buffers.last().unwrap_or(&std::ptr::null_mut());
            if *buffer != buffer_tail {
                // Lost the race with somebody else.
                continue;
            }

            let new_buffer: Box<Buffer<BUFFER_SIZE>> = Box::default();
            let new_buffer = Box::into_raw(new_buffer);
            self.open_arena.store(new_buffer, Ordering::Release);
            buffers.push(new_buffer);
        }
    }
}
```
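A small usage sketch of the allocator above. `Node` here is a hypothetical stand-in (the real node layout presumably lives in the skipmap module), and the sketch assumes it runs inside the crate, since `Arenas` is `pub(crate)`:

```rust
use std::alloc::Layout;

// Hypothetical node metadata, just for the example.
#[repr(C)]
struct Node {
    key_ptr: *const u8,
    key_len: usize,
}

fn demo() {
    // A small arena for the sake of the example; real code uses the default size.
    let arenas = Arenas::<4096>::new();

    // Reserve space for one Node. The returned pointer stays valid until the
    // Arenas value is dropped, since buffers are only freed in Drop.
    let raw = arenas.alloc(Layout::new::<Node>());
    unsafe {
        raw.cast::<Node>()
            .write(Node { key_ptr: std::ptr::null(), key_len: 0 });
    }
}
```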
```rust
struct Buffer<const N: usize> {
    offset: AtomicUsize,
    data: [u8; N],
}

impl<const N: usize> Default for Buffer<N> {
    fn default() -> Self {
        Self {
            offset: AtomicUsize::default(),
            data: [0; N],
        }
    }
}

impl<const N: usize> Drop for Arenas<N> {
    fn drop(&mut self) {
        let mut buffers = self.arenas.lock().expect("lock is poisoned");

        for buffer in buffers.drain(..) {
            drop(unsafe { Box::from_raw(buffer) });
        }
    }
}
```
```rust
fn try_alloc<const N: usize>(buf: *mut Buffer<N>, layout: Layout) -> Option<*mut u8> {
    let mut cur_offset = unsafe { &(*buf).offset }.load(Ordering::Relaxed);

    loop {
        let buf_start = unsafe { buf.byte_add(offset_of!(Buffer<N>, data)) as *mut u8 };
        let free_start = unsafe { buf_start.byte_add(cur_offset) };
        let start_addr = unsafe { free_start.byte_add(free_start.align_offset(layout.align())) };
        let new_offset = ((start_addr as usize) + layout.size()) - (buf_start as usize);
        if new_offset > N {
            return None;
        }

        // Note that we can get away with using relaxed ordering here because we're not
        // asserting anything about the contents of the buffer. We're just trying to
        // allocate a new node.
        match unsafe { &(*buf).offset }.compare_exchange(
            cur_offset,
            new_offset,
            Ordering::Relaxed,
            Ordering::Relaxed,
        ) {
            Ok(_offset) => return Some(start_addr),
            Err(offset) => cur_offset = offset,
        }
    }
}
```
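A worked example of the offset arithmetic in `try_alloc`: with the bump offset at 13 and an 8-byte, 8-aligned request, the allocation is rounded up to offset 16 and the new offset becomes 24. The test sketch below assumes it sits in the same file as `try_alloc` (so the imports above are in scope) and that `data` ends up 8-byte aligned, which follows from the struct's `AtomicUsize` alignment:

```rust
#[test]
fn try_alloc_respects_alignment() {
    // Small buffer for the example.
    let buf: Box<Buffer<64>> = Box::default();
    let buf = Box::into_raw(buf);

    // Bump the offset to a deliberately misaligned value.
    unsafe { &(*buf).offset }.store(13, Ordering::Relaxed);

    // Ask for an 8-byte, 8-aligned allocation: it should start at offset 16.
    let ptr = try_alloc(buf, Layout::new::<u64>()).expect("fits in the buffer");
    let data_start = unsafe { buf.byte_add(offset_of!(Buffer<64>, data)) as usize };
    assert_eq!(ptr as usize - data_start, 16);
    assert_eq!(unsafe { &(*buf).offset }.load(Ordering::Relaxed), 24);

    // Clean up the manually leaked box.
    drop(unsafe { Box::from_raw(buf) });
}
```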
The second new file is the skiplist module root, which ties the pieces together:

@@ -0,0 +1,30 @@
```rust
// Copyright (c) 2024-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

// This implementation was heavily inspired by:
// * https://github.com/andy-kimball/arenaskl/tree/f7010085
// * https://github.com/crossbeam-rs/crossbeam/tree/983d56b6/crossbeam-skiplist

//! This mod is a purpose-built concurrent skiplist intended for use
//! by the memtable.
//!
//! Due to the requirements of the memtable, there are a number of notable
//! features it lacks:
//! - Updates
//! - Deletes
//! - Overwrites
//!
//! The main reasons for its existence are that it
//! - provides concurrent reads and inserts, and
//! - batches memory allocations
//!
//! Prior to this implementation, `crossbeam_skiplist` was used.

mod arena;
mod skipmap;

pub use skipmap::SkipMap;

#[cfg(test)]
mod test;
```