use std::{
    collections::{BTreeSet, HashMap},
+    num::NonZeroUsize,
    ops::Deref,
-    sync::Arc,
+    sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc, Mutex,
+    },
};

use async_lock::{Semaphore, SemaphoreGuard};
use dashmap::{mapref::entry::Entry, DashMap};
use futures::{future::join_all, FutureExt as _, StreamExt};
-use linera_base::ensure;
+use linera_base::{
+    data_types::{TimeDelta, Timestamp},
+    ensure,
+};
+use lru::LruCache;
use scylla::{
    client::{
        execution_profile::{ExecutionProfile, ExecutionProfileHandle},
        session::Session,
        session_builder::SessionBuilder,
    },
+    cluster::{ClusterState, Node, NodeRef},
    deserialize::{DeserializationError, TypeCheckError},
    errors::{
-        DbError, ExecutionError, IntoRowsResultError, NewSessionError, NextPageError, NextRowError,
-        PagerExecutionError, PrepareError, RequestAttemptError, RequestError, RowsError,
+        ClusterStateTokenError, DbError, ExecutionError, IntoRowsResultError, NewSessionError,
+        NextPageError, NextRowError, PagerExecutionError, PrepareError, RequestAttemptError,
+        RequestError, RowsError,
    },
    policies::{
-        load_balancing::{DefaultPolicy, LoadBalancingPolicy},
+        load_balancing::{DefaultPolicy, FallbackPlan, LoadBalancingPolicy, RoutingInfo},
        retry::DefaultRetryPolicy,
    },
    response::PagingState,
-    statement::{batch::BatchType, prepared::PreparedStatement, Consistency},
+    routing::{Shard, Token},
+    statement::{
+        batch::{Batch, BatchType},
+        prepared::PreparedStatement,
+        Consistency,
+    },
};
use serde::{Deserialize, Serialize};
use thiserror::Error;
@@ -97,6 +112,15 @@ const MAX_BATCH_SIZE: usize = 5000;
/// The keyspace to use for the ScyllaDB database.
const KEYSPACE: &str = "kv";

+/// The default size of the cache for the load balancing policies.
+const DEFAULT_LOAD_BALANCING_POLICY_CACHE_SIZE: usize = 50_000;
+
+enum LoadBalancingPolicyCacheEntry {
+    Ready(Arc<dyn LoadBalancingPolicy>),
+    // The timestamp of the last time the policy creation was attempted.
+    NotReady(Timestamp, Option<Token>),
+}
+
/// The client for ScyllaDB:
/// * The session allows to pass queries
/// * The namespace that is being assigned to the database
@@ -116,6 +140,7 @@ struct ScyllaDbClient {
    find_key_values_by_prefix_bounded: PreparedStatement,
    multi_key_values: DashMap<usize, PreparedStatement>,
    multi_keys: DashMap<usize, PreparedStatement>,
+    batch_load_balancing_policies: Mutex<LruCache<Vec<u8>, LoadBalancingPolicyCacheEntry>>,
}

impl ScyllaDbClient {
@@ -206,6 +231,10 @@ impl ScyllaDbClient {
            find_key_values_by_prefix_bounded,
            multi_key_values: DashMap::new(),
            multi_keys: DashMap::new(),
+            batch_load_balancing_policies: Mutex::new(LruCache::new(
+                NonZeroUsize::try_from(DEFAULT_LOAD_BALANCING_POLICY_CACHE_SIZE)
+                    .expect("DEFAULT_LOAD_BALANCING_POLICY_CACHE_SIZE should not be zero"),
+            )),
        })
    }
@@ -411,46 +440,122 @@ impl ScyllaDbClient {
        Ok(rows.next().is_some())
    }

+    fn attempt_sticky_shard_policy_creation(
+        &self,
+        partition_key: &[u8],
+        token: Option<Token>,
+        cache: &mut LruCache<Vec<u8>, LoadBalancingPolicyCacheEntry>,
+    ) -> Arc<dyn LoadBalancingPolicy> {
+        match StickyShardPolicy::new(
+            &self.session,
+            &self.namespace,
+            partition_key,
+            token,
+            ScyllaDbClient::build_default_policy(),
+        ) {
+            Ok(policy) => {
+                let policy = Arc::new(policy);
+                cache.push(
+                    partition_key.to_vec(),
+                    LoadBalancingPolicyCacheEntry::Ready(policy.clone()),
+                );
+                policy
+            }
+            Err(error) => {
+                // Cache that the policy creation failed, so we don't try again too soon, and don't
+                // recalculate the token if not needed.
+                let token = match error {
+                    ScyllaDbStoreInternalError::MissingTokenEndpoints(token) => Some(token),
+                    _ => None,
+                };
+                cache.push(
+                    partition_key.to_vec(),
+                    LoadBalancingPolicyCacheEntry::NotReady(Timestamp::now(), token),
+                );
+                ScyllaDbClient::build_default_policy()
+            }
+        }
+    }
+
+    // Returns a batch query with a sticky shard policy that always tries to route to the same
+    // ScyllaDB shard.
+    // Should be used only on batches where all statements are to the same partition key.
+    fn get_sticky_batch_query(
+        &self,
+        partition_key: &[u8],
+    ) -> Result<Batch, ScyllaDbStoreInternalError> {
+        // Since we assume this is all to the same partition key, we can use an unlogged batch.
+        // We could use a logged batch to get atomicity across different partitions, but that
+        // comes with a huge performance penalty (seems to double write latency).
+        let mut batch_query = Batch::new(BatchType::Unlogged);
+        // Getting the sticky shard policy does some serializing and hashing under the hood, so
+        // we cache the policy to avoid that extra work.
+        let policy = {
+            let mut cache = self
+                .batch_load_balancing_policies
+                .lock()
+                .map_err(|_| ScyllaDbStoreInternalError::PoisonedMutex)?;
+            if let Some(policy) = cache.get(partition_key) {
+                match policy {
+                    LoadBalancingPolicyCacheEntry::Ready(policy) => policy.clone(),
+                    LoadBalancingPolicyCacheEntry::NotReady(timestamp, token) => {
+                        if Timestamp::now().delta_since(*timestamp) > TimeDelta::from_secs(2) {
+                            self.attempt_sticky_shard_policy_creation(
+                                partition_key,
+                                *token,
+                                &mut cache,
+                            )
+                        } else {
+                            ScyllaDbClient::build_default_policy()
+                        }
+                    }
+                }
+            } else {
+                self.attempt_sticky_shard_policy_creation(partition_key, None, &mut cache)
+            }
+        };
+        let handle = Self::build_default_execution_profile_handle(policy);
+        batch_query.set_execution_profile_handle(Some(handle));
+
+        Ok(batch_query)
+    }
+
+    // Batches should always be to the same partition key. Batches across different partitions
+    // will not be atomic. If the caller wants atomicity, it's the caller's responsibility to
+    // make sure that the batch only has statements to the same partition key.
    async fn write_batch_internal(
        &self,
        root_key: &[u8],
        batch: UnorderedBatch,
    ) -> Result<(), ScyllaDbStoreInternalError> {
-        let session = &self.session;
-        let mut batch_query = scylla::statement::batch::Batch::new(BatchType::Unlogged);
-        let mut batch_values = Vec::new();
-        let query1 = &self.write_batch_delete_prefix_unbounded;
-        let query2 = &self.write_batch_delete_prefix_bounded;
        Self::check_batch_len(&batch)?;
+        let session = &self.session;
+        let mut batch_query = self.get_sticky_batch_query(root_key)?;
+        let mut batch_values = Vec::with_capacity(batch.len());
+
        for key_prefix in batch.key_prefix_deletions {
            Self::check_key_size(&key_prefix)?;
            match get_upper_bound_option(&key_prefix) {
                None => {
-                    let values = vec![root_key.to_vec(), key_prefix];
-                    batch_values.push(values);
-                    batch_query.append_statement(query1.clone());
+                    batch_query.append_statement(self.write_batch_delete_prefix_unbounded.clone());
+                    batch_values.push(vec![root_key.to_vec(), key_prefix]);
                }
                Some(upper_bound) => {
-                    let values = vec![root_key.to_vec(), key_prefix, upper_bound];
-                    batch_values.push(values);
-                    batch_query.append_statement(query2.clone());
+                    batch_query.append_statement(self.write_batch_delete_prefix_bounded.clone());
+                    batch_values.push(vec![root_key.to_vec(), key_prefix, upper_bound]);
                }
            }
        }
-        let query3 = &self.write_batch_deletion;
        for key in batch.simple_unordered_batch.deletions {
            Self::check_key_size(&key)?;
-            let values = vec![root_key.to_vec(), key];
-            batch_values.push(values);
-            batch_query.append_statement(query3.clone());
+            batch_query.append_statement(self.write_batch_deletion.clone());
+            batch_values.push(vec![root_key.to_vec(), key]);
        }
-        let query4 = &self.write_batch_insertion;
        for (key, value) in batch.simple_unordered_batch.insertions {
            Self::check_key_size(&key)?;
            Self::check_value_size(&value)?;
-            let values = vec![root_key.to_vec(), key, value];
-            batch_values.push(values);
-            batch_query.append_statement(query4.clone());
+            batch_query.append_statement(self.write_batch_insertion.clone());
+            batch_values.push(vec![root_key.to_vec(), key, value]);
        }
        session.batch(&batch_query, batch_values).await?;
        Ok(())
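To illustrate the contract documented above write_batch_internal, here is a minimal caller-side sketch (not part of this change): every statement is keyed under the same root_key, so the whole batch targets one partition and the sticky shard policy can pin it to a single shard. The helper name put_and_prune is hypothetical, and the UnorderedBatch field layout and Default impl are assumed from the accesses shown above.

async fn put_and_prune(
    client: &ScyllaDbClient,
    root_key: &[u8],
) -> Result<(), ScyllaDbStoreInternalError> {
    // All keys below live under the same `root_key`, i.e. the same partition.
    let mut batch = UnorderedBatch::default();
    batch
        .simple_unordered_batch
        .insertions
        .push((b"counter".to_vec(), b"42".to_vec()));
    batch
        .simple_unordered_batch
        .deletions
        .push(b"stale".to_vec());
    // Deleting a whole key prefix is also scoped to this partition.
    batch.key_prefix_deletions.push(b"tmp/".to_vec());
    client.write_batch_internal(root_key, batch).await
}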
@@ -523,6 +628,83 @@ impl ScyllaDbClient {
    }
}

+// Batch statements in ScyllaDB are currently not token aware. The batch gets sent to a random
+// node: https://rust-driver.docs.scylladb.com/stable/statements/batch.html#performance
+// However, for batches where all statements are to the same partition key, we can use a sticky
+// shard policy to route to the same shard and make batches token aware.
+//
+// This is a policy that always tries to route to the ScyllaDB shards that contain the token, in a
+// round-robin fashion.
+#[derive(Debug)]
+struct StickyShardPolicy {
+    replicas: Vec<(Arc<Node>, Shard)>,
+    current_replica_index: AtomicUsize,
+    fallback: Arc<dyn LoadBalancingPolicy>,
+}
+
+impl StickyShardPolicy {
+    fn new(
+        session: &Session,
+        namespace: &str,
+        partition_key: &[u8],
+        token: Option<Token>,
+        fallback: Arc<dyn LoadBalancingPolicy>,
+    ) -> Result<Self, ScyllaDbStoreInternalError> {
+        let cluster = session.get_cluster_state();
+        let token = if let Some(token) = token {
+            token
+        } else {
+            cluster.compute_token(KEYSPACE, namespace, &(partition_key,))?
+        };
+        let replicas = cluster.get_token_endpoints(KEYSPACE, namespace, token);
+        if replicas.is_empty() {
+            // The driver won't always have all the token information available,
+            // but we can try again later.
+            return Err(ScyllaDbStoreInternalError::MissingTokenEndpoints(token));
+        }
+        Ok(Self {
+            replicas,
+            current_replica_index: AtomicUsize::new(0),
+            fallback,
+        })
+    }
+}
+
+impl LoadBalancingPolicy for StickyShardPolicy {
+    fn name(&self) -> String {
+        "StickyShardPolicy".to_string()
+    }
+
+    // Always try first to route to the sticky shard.
+    fn pick<'a>(
+        &'a self,
+        _request: &'a RoutingInfo<'a>,
+        _cluster: &'a ClusterState,
+    ) -> Option<(NodeRef<'a>, Option<Shard>)> {
+        let current_replica_index = self.current_replica_index.load(Ordering::Acquire);
+        let new_replica_index = (current_replica_index + 1) % self.replicas.len();
+        self.current_replica_index
+            .compare_exchange(
+                current_replica_index,
+                new_replica_index,
+                Ordering::Release,
+                Ordering::Relaxed,
+            )
+            .ok()?;
+        let (node, shard) = &self.replicas[new_replica_index];
+        Some((node, Some(*shard)))
+    }
+
+    // Fallback to the default policy.
+    fn fallback<'a>(
+        &'a self,
+        request: &'a RoutingInfo,
+        cluster: &'a ClusterState,
+    ) -> FallbackPlan<'a> {
+        self.fallback.fallback(request, cluster)
+    }
+}
+
/// The client itself and the keeping of the count of active connections.
#[derive(Clone)]
pub struct ScyllaDbStoreInternal {
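A note on the round-robin pick above: the replica index advances with a lock-free compare-exchange, and when two requests race, the loser's pick returns None (via .ok()?), so the driver falls through to the fallback plan. A standalone sketch of that rotation using only std atomics and a hypothetical next_replica helper (not the driver API, and not part of this change):

use std::sync::atomic::{AtomicUsize, Ordering};

// Returns the replica slot to use, or `None` when another caller won the race,
// mirroring how `pick` defers to the fallback plan on contention.
fn next_replica(index: &AtomicUsize, replica_count: usize) -> Option<usize> {
    let current = index.load(Ordering::Acquire);
    let next = (current + 1) % replica_count;
    index
        .compare_exchange(current, next, Ordering::Release, Ordering::Relaxed)
        .ok()?;
    Some(next)
}

fn main() {
    let index = AtomicUsize::new(0);
    // With three replicas the successful picks rotate 1, 2, 0, 1, ...
    assert_eq!(next_replica(&index, 3), Some(1));
    assert_eq!(next_replica(&index, 3), Some(2));
    assert_eq!(next_replica(&index, 3), Some(0));
}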
@@ -594,6 +776,18 @@ pub enum ScyllaDbStoreInternalError {
    /// A next row error in ScyllaDB
    #[error(transparent)]
    NextRowError(#[from] NextRowError),
+
+    /// A token error in ScyllaDB
+    #[error(transparent)]
+    ClusterStateTokenError(#[from] ClusterStateTokenError),
+
+    /// The token endpoint information is currently missing from the driver
+    #[error("The token endpoint information is currently missing from the driver")]
+    MissingTokenEndpoints(Token),
+
+    /// The mutex is poisoned
+    #[error("The mutex is poisoned")]
+    PoisonedMutex,
}

impl KeyValueStoreError for ScyllaDbStoreInternalError {
@@ -705,6 +899,9 @@ impl DirectWritableKeyValueStore for ScyllaDbStoreInternal {
    // https://github.com/scylladb/scylladb/blob/master/docs/dev/timestamp-conflict-resolution.md
    type Batch = UnorderedBatch;

+    // Batches should always be to the same partition key. Batches across different partitions
+    // will not be atomic. If the caller wants atomicity, it's the caller's responsibility to
+    // make sure that the batch only has statements to the same partition key.
    async fn write_batch(&self, batch: Self::Batch) -> Result<(), ScyllaDbStoreInternalError> {
        let store = self.store.deref();
        let _guard = self.acquire().await;