Skip to content

plan: Handle no leader and invalidate store region #484

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ jobs:
CARGO_INCREMENTAL: 0
NEXTEST_PROFILE: ci
TIKV_VERSION: v8.5.1
RUST_LOG: debug
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand Down
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ ALL_FEATURES := integration-tests

NEXTEST_ARGS := --config-file $(shell pwd)/config/nextest.toml

INTEGRATION_TEST_ARGS := --features "integration-tests" --test-threads 1
#INTEGRATION_TEST_ARGS := --features "integration-tests" --test-threads 1
INTEGRATION_TEST_ARGS := --features "integration-tests" --no-capture

RUN_INTEGRATION_TEST := cargo nextest run ${NEXTEST_ARGS} --all ${INTEGRATION_TEST_ARGS}

Expand All @@ -29,7 +30,7 @@ unit-test: generate
integration-test: integration-test-txn integration-test-raw

integration-test-txn: generate
$(RUN_INTEGRATION_TEST) txn_
$(RUN_INTEGRATION_TEST) txn_split_batch

integration-test-raw: generate
$(RUN_INTEGRATION_TEST) raw_
Expand Down
5 changes: 3 additions & 2 deletions src/common/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::result;
use thiserror::Error;

use crate::proto::kvrpcpb;
use crate::region::RegionVerId;
use crate::BoundRange;

/// An error originating from the TiKV client or dependencies.
Expand Down Expand Up @@ -90,8 +91,8 @@ pub enum Error {
#[error("Region {} is not found in the response", region_id)]
RegionNotFoundInResponse { region_id: u64 },
/// No leader is found for the given id.
#[error("Leader of region {} is not found", region_id)]
LeaderNotFound { region_id: u64 },
#[error("Leader of region {} is not found", region.id)]
LeaderNotFound { region: RegionVerId },
/// Scan limit exceeds the maximum
#[error("Limit {} exceeds max scan limit {}", limit, max_limit)]
MaxScanLimitExceeded { limit: u32, max_limit: u32 },
Expand Down
2 changes: 2 additions & 0 deletions src/mock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ impl PdClient for MockPdClient {

async fn invalidate_region_cache(&self, _ver_id: crate::region::RegionVerId) {}

async fn invalidate_store_cache(&self, _store_id: crate::region::StoreId) {}

async fn load_keyspace(&self, _keyspace: &str) -> Result<keyspacepb::KeyspaceMeta> {
unimplemented!()
}
Expand Down
7 changes: 7 additions & 0 deletions src/pd/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use crate::proto::metapb;
use crate::region::RegionId;
use crate::region::RegionVerId;
use crate::region::RegionWithLeader;
use crate::region::StoreId;
use crate::region_cache::RegionCache;
use crate::store::KvConnect;
use crate::store::RegionStore;
Expand Down Expand Up @@ -205,6 +206,8 @@ pub trait PdClient: Send + Sync + 'static {
async fn update_leader(&self, ver_id: RegionVerId, leader: metapb::Peer) -> Result<()>;

async fn invalidate_region_cache(&self, ver_id: RegionVerId);

async fn invalidate_store_cache(&self, store_id: StoreId);
}

/// This client converts requests for the logical TiKV cluster into requests
Expand Down Expand Up @@ -271,6 +274,10 @@ impl<KvC: KvConnect + Send + Sync + 'static> PdClient for PdRpcClient<KvC> {
self.region_cache.invalidate_region_cache(ver_id).await
}

async fn invalidate_store_cache(&self, store_id: StoreId) {
self.region_cache.invalidate_store_cache(store_id).await
}

async fn load_keyspace(&self, keyspace: &str) -> Result<keyspacepb::KeyspaceMeta> {
self.pd.load_keyspace(keyspace).await
}
Expand Down
2 changes: 1 addition & 1 deletion src/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ impl RegionWithLeader {
.as_ref()
.cloned()
.ok_or_else(|| Error::LeaderNotFound {
region_id: self.id(),
region: self.ver_id(),
})
.map(|s| s.store_id)
}
Expand Down
5 changes: 5 additions & 0 deletions src/region_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,11 @@ impl<C: RetryClientTrait> RegionCache<C> {
}
}

pub async fn invalidate_store_cache(&self, store_id: StoreId) {
let mut cache = self.store_cache.write().await;
cache.remove(&store_id);
}

pub async fn read_through_all_stores(&self) -> Result<Vec<Store>> {
let stores = self
.inner_client
Expand Down
79 changes: 64 additions & 15 deletions src/request/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ use crate::pd::PdClient;
use crate::proto::errorpb;
use crate::proto::errorpb::EpochNotMatch;
use crate::proto::kvrpcpb;
use crate::region::RegionVerId;
use crate::region::StoreId;
use crate::request::shard::HasNextBatch;
use crate::request::NextBatch;
use crate::request::Shardable;
Expand Down Expand Up @@ -114,15 +116,13 @@ where
preserve_region_results: bool,
) -> Result<<Self as Plan>::Result> {
let shards = current_plan.shards(&pd_client).collect::<Vec<_>>().await;
debug!("single_plan_handler, shards: {}", shards.len());
let mut handles = Vec::new();
for shard in shards {
let (shard, region_store) = shard?;
let mut clone = current_plan.clone();
clone.apply_shard(shard, &region_store)?;
let handle = tokio::spawn(Self::single_shard_handler(
pd_client.clone(),
clone,
region_store,
current_plan.clone(),
shard,
backoff.clone(),
permits.clone(),
preserve_region_results,
Expand Down Expand Up @@ -153,12 +153,40 @@ where
#[async_recursion]
async fn single_shard_handler(
pd_client: Arc<PdC>,
plan: P,
region_store: RegionStore,
mut plan: P,
shard: Result<(<P as Shardable>::Shard, RegionStore)>,
mut backoff: Backoff,
permits: Arc<Semaphore>,
preserve_region_results: bool,
) -> Result<<Self as Plan>::Result> {
debug!("single_shard_handler");
let region_store = match shard.and_then(|(shard, region_store)| {
plan.apply_shard(shard, &region_store).map(|_| region_store)
}) {
Ok(region_store) => region_store,
Err(Error::LeaderNotFound { region }) => {
debug!(
"single_shard_handler::sharding: leader not found: {:?}",
region
);
return Self::handle_other_error(
pd_client,
plan,
region.clone(),
None,
backoff,
permits,
preserve_region_results,
Error::LeaderNotFound { region },
)
.await;
}
Err(err) => {
debug!("single_shard_handler::sharding, error: {:?}", err);
return Err(err);
}
};

// limit concurrent requests
let permit = permits.acquire().await.unwrap();
let res = plan.execute().await;
Expand All @@ -167,23 +195,30 @@ where
let mut resp = match res {
Ok(resp) => resp,
Err(e) if is_grpc_error(&e) => {
return Self::handle_grpc_error(
debug!("single_shard_handler:execute: grpc error: {:?}", e);
return Self::handle_other_error(
pd_client,
plan,
region_store,
region_store.region_with_leader.ver_id(),
region_store.region_with_leader.get_store_id().ok(),
backoff,
permits,
preserve_region_results,
e,
)
.await;
}
Err(e) => return Err(e),
Err(e) => {
debug!("single_shard_handler:execute: error: {:?}", e);
return Err(e);
}
};

if let Some(e) = resp.key_errors() {
debug!("single_shard_handler:execute: key errors: {:?}", e);
Ok(vec![Err(Error::MultipleKeyErrors(e))])
} else if let Some(e) = resp.region_error() {
debug!("single_shard_handler:execute: region error: {:?}", e);
match backoff.next_delay_duration() {
Some(duration) => {
let region_error_resolved =
Expand All @@ -208,18 +243,24 @@ where
}
}

async fn handle_grpc_error(
#[allow(clippy::too_many_arguments)]
async fn handle_other_error(
pd_client: Arc<PdC>,
plan: P,
region_store: RegionStore,
region: RegionVerId,
store: Option<StoreId>,
mut backoff: Backoff,
permits: Arc<Semaphore>,
preserve_region_results: bool,
e: Error,
) -> Result<<Self as Plan>::Result> {
debug!("handle grpc error: {:?}", e);
let ver_id = region_store.region_with_leader.ver_id();
pd_client.invalidate_region_cache(ver_id).await;
debug!("handle_other_error: {:?}", e);
pd_client.invalidate_region_cache(region).await;
if is_grpc_error(&e) {
if let Some(store_id) = store {
pd_client.invalidate_store_cache(store_id).await;
}
}
match backoff.next_delay_duration() {
Some(duration) => {
sleep(duration).await;
Expand All @@ -246,7 +287,9 @@ pub(crate) async fn handle_region_error<PdC: PdClient>(
e: errorpb::Error,
region_store: RegionStore,
) -> Result<bool> {
debug!("handle_region_error: {:?}", e);
let ver_id = region_store.region_with_leader.ver_id();
let store_id = region_store.region_with_leader.get_store_id();
if let Some(not_leader) = e.not_leader {
if let Some(leader) = not_leader.leader {
match pd_client
Expand All @@ -269,6 +312,9 @@ pub(crate) async fn handle_region_error<PdC: PdClient>(
}
} else if e.store_not_match.is_some() {
pd_client.invalidate_region_cache(ver_id).await;
if let Ok(store_id) = store_id {
pd_client.invalidate_store_cache(store_id).await;
}
Ok(false)
} else if e.epoch_not_match.is_some() {
on_region_epoch_not_match(pd_client.clone(), region_store, e.epoch_not_match.unwrap()).await
Expand All @@ -284,6 +330,9 @@ pub(crate) async fn handle_region_error<PdC: PdClient>(
// TODO: pass the logger around
// info!("unknwon region error: {:?}", e);
pd_client.invalidate_region_cache(ver_id).await;
if let Ok(store_id) = store_id {
pd_client.invalidate_store_cache(store_id).await;
}
Ok(false)
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/store/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ macro_rules! impl_request {
fn set_leader(&mut self, leader: &RegionWithLeader) -> Result<()> {
let ctx = self.context.get_or_insert(kvrpcpb::Context::default());
let leader_peer = leader.leader.as_ref().ok_or(Error::LeaderNotFound {
region_id: leader.region.id,
region: leader.ver_id(),
})?;
ctx.region_id = leader.region.id;
ctx.region_epoch = leader.region.region_epoch.clone();
Expand Down
4 changes: 2 additions & 2 deletions src/transaction/snapshot.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.

use derive_new::new;
use log::debug;
use log::{debug, trace};

use crate::BoundRange;
use crate::Key;
Expand All @@ -25,7 +25,7 @@ pub struct Snapshot {
impl Snapshot {
/// Get the value associated with the given key.
pub async fn get(&mut self, key: impl Into<Key>) -> Result<Option<Value>> {
debug!("invoking get request on snapshot");
trace!("invoking get request on snapshot");
self.transaction.get(key).await
}

Expand Down
6 changes: 3 additions & 3 deletions src/transaction/transaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use std::time::Instant;
use derive_new::new;
use fail::fail_point;
use futures::prelude::*;
use log::debug;
use log::warn;
use log::{debug, trace};
use tokio::time::Duration;

use crate::backoff::Backoff;
Expand Down Expand Up @@ -132,7 +132,7 @@ impl<PdC: PdClient> Transaction<PdC> {
/// # });
/// ```
pub async fn get(&mut self, key: impl Into<Key>) -> Result<Option<Value>> {
debug!("invoking transactional get request");
trace!("invoking transactional get request");
self.check_allow_operation().await?;
let timestamp = self.timestamp.clone();
let rpc = self.rpc.clone();
Expand Down Expand Up @@ -461,7 +461,7 @@ impl<PdC: PdClient> Transaction<PdC> {
/// # });
/// ```
pub async fn put(&mut self, key: impl Into<Key>, value: impl Into<Value>) -> Result<()> {
debug!("invoking transactional put request");
trace!("invoking transactional put request");
self.check_allow_operation().await?;
let key = key.into().encode_keyspace(self.keyspace, KeyMode::Txn);
if self.is_pessimistic() {
Expand Down
2 changes: 2 additions & 0 deletions tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ pub async fn clear_tikv() {
// To test with multiple regions, prewrite some data. Tests that hope to test
// with multiple regions should use keys in the corresponding ranges.
pub async fn init() -> Result<()> {
let _ = env_logger::try_init();

if env::var(ENV_ENABLE_MULIT_REGION).is_ok() {
// 1000 keys: 0..1000
let keys_1 = std::iter::successors(Some(0u32), |x| Some(x + 1))
Expand Down
Loading