From 630ddc059a19dfee704d3ba80afa6e6f6e7483ba Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 23 Aug 2023 21:09:34 -0700 Subject: [PATCH 001/220] server: add back heap profile HTTP API and make it secure (#15408) close tikv/tikv#11161 Add back heap profile HTTP API and make it secure. The API is removed by #11162 due to a secure issue that can visit arbitrary files on the server. This PR makes it only show the file name instead of the absolute path, and adds a paranoid check to make sure the passed file name is in the set of heap profiles. Signed-off-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/server/status_server/mod.rs | 41 +++++++++++++++++++++++------ src/server/status_server/profile.rs | 17 +++++++++--- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 679f21fdf6c..b49fdce12af 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -40,8 +40,9 @@ use openssl::{ }; use pin_project::pin_project; pub use profile::{ - activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, list_heap_profiles, - read_file, start_one_cpu_profile, start_one_heap_profile, + activate_heap_profile, deactivate_heap_profile, heap_profiles_dir, jeprof_heap_profile, + list_heap_profiles, read_file, start_one_cpu_profile, start_one_heap_profile, + HEAP_PROFILE_REGEX, }; use prometheus::TEXT_FORMAT; use regex::Regex; @@ -207,10 +208,34 @@ where let use_jeprof = query_pairs.get("jeprof").map(|x| x.as_ref()) == Some("true"); let result = if let Some(name) = query_pairs.get("name") { - if use_jeprof { - jeprof_heap_profile(name) + let re = Regex::new(HEAP_PROFILE_REGEX).unwrap(); + if !re.is_match(name) { + let errmsg = format!("heap profile name {} is invalid", name); + return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); + } + let profiles = match list_heap_profiles() { + Ok(s) => s, + Err(e) => 
return Ok(make_response(StatusCode::INTERNAL_SERVER_ERROR, e)), + }; + if profiles.iter().any(|(f, _)| f == name) { + let dir = match heap_profiles_dir() { + Some(path) => path, + None => { + return Ok(make_response( + StatusCode::INTERNAL_SERVER_ERROR, + "heap profile is not active", + )); + } + }; + let path = dir.join(name.as_ref()); + if use_jeprof { + jeprof_heap_profile(path.to_str().unwrap()) + } else { + read_file(path.to_str().unwrap()) + } } else { - read_file(name) + let errmsg = format!("heap profile {} not found", name); + return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); } } else { let mut seconds = 10; @@ -649,9 +674,9 @@ where (Method::GET, "/debug/pprof/heap_deactivate") => { Self::deactivate_heap_prof(req) } - // (Method::GET, "/debug/pprof/heap") => { - // Self::dump_heap_prof_to_resp(req).await - // } + (Method::GET, "/debug/pprof/heap") => { + Self::dump_heap_prof_to_resp(req).await + } (Method::GET, "/config") => { Self::get_config(req, &cfg_controller).await } diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index b3d91d3bea6..dd49c394046 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -31,7 +31,8 @@ pub use self::test_utils::TEST_PROFILE_MUTEX; use self::test_utils::{activate_prof, deactivate_prof, dump_prof}; // File name suffix for periodically dumped heap profiles. -const HEAP_PROFILE_SUFFIX: &str = ".heap"; +pub const HEAP_PROFILE_SUFFIX: &str = ".heap"; +pub const HEAP_PROFILE_REGEX: &str = r"^[0-9]{6,6}\.heap$"; lazy_static! { // If it's locked it means there are already a heap or CPU profiling. 
@@ -244,9 +245,17 @@ pub fn jeprof_heap_profile(path: &str) -> Result, String> { Ok(output.stdout) } +pub fn heap_profiles_dir() -> Option { + PROFILE_ACTIVE + .lock() + .unwrap() + .as_ref() + .map(|(_, dir)| dir.path().to_owned()) +} + pub fn list_heap_profiles() -> Result, String> { - let path = match &*PROFILE_ACTIVE.lock().unwrap() { - Some((_, ref dir)) => dir.path().to_str().unwrap().to_owned(), + let path = match heap_profiles_dir() { + Some(path) => path.into_os_string().into_string().unwrap(), None => return Ok(vec![]), }; @@ -257,7 +266,7 @@ pub fn list_heap_profiles() -> Result, String> { Ok(x) => x, _ => continue, }; - let f = item.path().to_str().unwrap().to_owned(); + let f = item.file_name().to_str().unwrap().to_owned(); if !f.ends_with(HEAP_PROFILE_SUFFIX) { continue; } From 6560d758f9143dc5125b0c5c3b0eaadbfecffa3c Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 24 Aug 2023 13:59:04 +0800 Subject: [PATCH 002/220] raftstore-v2: fix compact range bugs that causes false positive clean tablet (#15332) ref tikv/tikv#12842 - Fix a bug of compact range that causes a dirty tablet being reported as clean. - Added an additional check to ensure trim's correctness. - Fix a bug that some tablets are not destroyed and block peer destroy progress. 
Signed-off-by: tabokie Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +- components/engine_panic/src/compact.rs | 4 ++ components/engine_rocks/src/compact.rs | 4 ++ components/engine_traits/src/compact.rs | 3 + .../operation/command/admin/compact_log.rs | 42 +++++++++++--- components/raftstore-v2/src/operation/life.rs | 6 +- .../src/operation/ready/snapshot.rs | 2 + components/raftstore-v2/src/worker/tablet.rs | 12 ++++ components/test_raftstore/src/util.rs | 9 ++- tests/failpoints/cases/test_sst_recovery.rs | 4 +- .../raftstore/test_compact_after_delete.rs | 6 +- tests/integrations/raftstore/test_snap.rs | 55 ++++++++++++++++++- 12 files changed, 131 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc8233ed509..abe174e638f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3107,7 +3107,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git#d861ede96cc2aae3c2ed5ea1c1c71454130a325e" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3126,7 +3126,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git#d861ede96cc2aae3c2ed5ea1c1c71454130a325e" dependencies = [ "bzip2-sys", "cc", @@ -5100,7 +5100,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git#d861ede96cc2aae3c2ed5ea1c1c71454130a325e" dependencies = [ "libc 0.2.146", "librocksdb_sys", diff --git a/components/engine_panic/src/compact.rs b/components/engine_panic/src/compact.rs index 988bec790de..f64c97ff5b0 100644 --- 
a/components/engine_panic/src/compact.rs +++ b/components/engine_panic/src/compact.rs @@ -44,6 +44,10 @@ impl CompactExt for PanicEngine { ) -> Result<()> { panic!() } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + panic!() + } } pub struct PanicCompactedEvent; diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 199b7d9f3be..f64c9a7d49e 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -121,6 +121,10 @@ impl CompactExt for RocksEngine { db.compact_files_cf(handle, &opts, &files, output_level) .map_err(r2e) } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.as_inner().check_in_range(start, end).map_err(r2e) + } } #[cfg(test)] diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index 05590a1ff32..2a4341a6788 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -71,6 +71,9 @@ pub trait CompactExt: CfNamesExt { max_subcompactions: u32, exclude_l0: bool, ) -> Result<()>; + + // Check all data is in the range [start, end). + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()>; } pub trait CompactedEvent: Send { diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 8920ea97e1d..93876475f5f 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -13,7 +13,13 @@ //! Updates truncated index, and compacts logs if the corresponding changes have //! been persisted in kvdb. 
-use std::path::PathBuf; +use std::{ + path::PathBuf, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, +}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; @@ -50,6 +56,10 @@ pub struct CompactLogContext { /// persisted. When persisted_apply is advanced, we need to notify tablet /// worker to destroy them. tombstone_tablets_wait_index: Vec, + /// Sometimes a tombstone tablet can be registered after tablet index is + /// advanced. We should not consider it as an active tablet otherwise it + /// might block peer destroy progress. + persisted_tablet_index: Arc, } impl CompactLogContext { @@ -60,6 +70,7 @@ impl CompactLogContext { last_applying_index, last_compacted_idx: 0, tombstone_tablets_wait_index: vec![], + persisted_tablet_index: AtomicU64::new(0).into(), } } @@ -379,7 +390,9 @@ impl Peer { )); } - /// Returns if there's any tombstone being removed. + /// Returns if there's any tombstone being removed. `persisted` state may + /// not be persisted yet, caller is responsible for actually destroying the + /// physical tablets afterwards. #[inline] pub fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { let compact_log_context = self.compact_log_context_mut(); @@ -398,11 +411,21 @@ impl Peer { } } + /// User can only increase this counter. + #[inline] + pub fn remember_persisted_tablet_index(&self) -> Arc { + self.compact_log_context().persisted_tablet_index.clone() + } + + /// Returns whether there's any tombstone tablet newer than persisted tablet + /// index. They might still be referenced by inflight apply and cannot be + /// destroyed. 
pub fn has_pending_tombstone_tablets(&self) -> bool { - !self - .compact_log_context() - .tombstone_tablets_wait_index - .is_empty() + let ctx = self.compact_log_context(); + let persisted = ctx.persisted_tablet_index.load(Ordering::Relaxed); + ctx.tombstone_tablets_wait_index + .iter() + .any(|i| *i > persisted) } #[inline] @@ -411,6 +434,8 @@ impl Peer { ctx: &StoreContext, task: &mut WriteTask, ) { + let applied_index = self.entry_storage().applied_index(); + self.remove_tombstone_tablets(applied_index); assert!( !self.has_pending_tombstone_tablets(), "{} all tombstone should be cleared before being destroyed.", @@ -421,7 +446,6 @@ impl Peer { None => return, }; let region_id = self.region_id(); - let applied_index = self.entry_storage().applied_index(); let sched = ctx.schedulers.tablet.clone(); let _ = sched.schedule(tablet::Task::prepare_destroy( tablet, @@ -557,13 +581,17 @@ impl Peer { } if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet.clone(); + let counter = self.remember_persisted_tablet_index(); if !task.has_snapshot { task.persisted_cbs.push(Box::new(move || { let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + // Writer guarantees no race between different callbacks. + counter.store(new_persisted, Ordering::Relaxed); })); } else { // In snapshot, the index is persisted, tablet can be destroyed directly. let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + counter.store(new_persisted, Ordering::Relaxed); } } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 52f00d137f8..e0e7f63785d 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -795,9 +795,13 @@ impl Peer { } // Wait for critical commands like split. 
if self.has_pending_tombstone_tablets() { + let applied_index = self.entry_storage().applied_index(); + let last_index = self.entry_storage().last_index(); info!( self.logger, - "postpone destroy because there're pending tombstone tablets" + "postpone destroy because there're pending tombstone tablets"; + "applied_index" => applied_index, + "last_index" => last_index, ); return true; } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 17deed333c1..9e0ed449cef 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -343,10 +343,12 @@ impl Peer { } self.schedule_apply_fsm(ctx); if self.remove_tombstone_tablets(snapshot_index) { + let counter = self.remember_persisted_tablet_index(); let _ = ctx .schedulers .tablet .schedule(tablet::Task::destroy(region_id, snapshot_index)); + counter.store(snapshot_index, Ordering::Relaxed); } if let Some(msg) = self.split_pending_append_mut().take_append_message() { let _ = ctx.router.send_raft_message(msg); diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs index 183bb33cd34..7c330353836 100644 --- a/components/raftstore-v2/src/worker/tablet.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -298,6 +298,8 @@ impl Runner { .spawn(async move { let range1 = Range::new(&[], &start_key); let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + // Note: Refer to https://github.com/facebook/rocksdb/pull/11468. There's could be + // some files missing from compaction if dynamic_level_bytes is off. for r in [range1, range2] { // When compaction filter is present, trivial move is disallowed. 
if let Err(e) = @@ -323,6 +325,16 @@ impl Runner { return; } } + if let Err(e) = tablet.check_in_range(Some(&start_key), Some(&end_key)) { + debug_assert!(false, "check_in_range failed {:?}, is titan enabled?", e); + error!( + logger, + "trim did not remove all dirty data"; + "path" => tablet.path(), + "err" => %e, + ); + return; + } // drop before callback. drop(tablet); fail_point!("tablet_trimmed_finished"); diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 02a74136bb6..f63c69f9631 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -81,15 +81,14 @@ pub fn must_get( } debug!("last try to get {}", log_wrappers::hex_encode_upper(key)); let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); - if value.is_none() && res.is_none() - || value.is_some() && res.is_some() && value.unwrap() == &*res.unwrap() - { + if value == res.as_ref().map(|r| r.as_ref()) { return; } panic!( - "can't get value {:?} for key {}", + "can't get value {:?} for key {}, actual={:?}", value.map(escape), - log_wrappers::hex_encode_upper(key) + log_wrappers::hex_encode_upper(key), + res ) } diff --git a/tests/failpoints/cases/test_sst_recovery.rs b/tests/failpoints/cases/test_sst_recovery.rs index a4c1f10b5ae..da5a3da1a32 100644 --- a/tests/failpoints/cases/test_sst_recovery.rs +++ b/tests/failpoints/cases/test_sst_recovery.rs @@ -105,7 +105,7 @@ fn test_sst_recovery_overlap_range_sst_exist() { must_get_equal(&engine1, b"7", b"val_1"); // Validate the damaged sst has been deleted. - compact_files_to_target_level(&engine1, true, 3).unwrap(); + compact_files_to_target_level(&engine1, true, 6).unwrap(); let files = engine1.as_inner().get_live_files(); assert_eq!(files.get_files_count(), 1); @@ -252,7 +252,7 @@ fn create_tikv_cluster_with_one_node_damaged() disturb_sst_file(&sst_path); // The sst file is damaged, so this action will fail. 
- assert_corruption(compact_files_to_target_level(&engine1, true, 3)); + assert_corruption(compact_files_to_target_level(&engine1, true, 6)); (cluster, pd_client, engine1) } diff --git a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index 6ba405bb918..a79fdfd4425 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -98,7 +98,8 @@ fn test_node_compact_after_delete_v2() { // disable it cluster.cfg.raft_store.region_compact_min_redundant_rows = 10000000; cluster.cfg.raft_store.region_compact_check_step = Some(2); - cluster.cfg.rocksdb.titan.enabled = true; + // TODO: v2 doesn't support titan. + // cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); let region = cluster.get_region(b""); @@ -169,7 +170,8 @@ fn test_node_compact_after_update_v2() { cluster.cfg.raft_store.region_compact_redundant_rows_percent = 40; cluster.cfg.raft_store.region_compact_min_redundant_rows = 50; cluster.cfg.raft_store.region_compact_check_step = Some(2); - cluster.cfg.rocksdb.titan.enabled = true; + // TODO: titan is not supported in v2. 
+ // cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); let region = cluster.get_region(b""); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 9eda281e9e4..0b71978f63b 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -227,8 +227,6 @@ fn test_server_snap_gc() { #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] -#[test_case(test_raftstore_v2::new_server_cluster)] fn test_concurrent_snap() { let mut cluster = new_cluster(0, 3); // Test that the handling of snapshot is correct when there are multiple @@ -279,6 +277,59 @@ fn test_concurrent_snap() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_concurrent_snap_v2() { + let mut cluster = new_cluster(0, 3); + // TODO: v2 doesn't support titan. + // Test that the handling of snapshot is correct when there are multiple + // snapshots which have overlapped region ranges arrive at the same + // raftstore. + // cluster.cfg.rocksdb.titan.enabled = true; + // Disable raft log gc in this test case. + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); + // For raftstore v2, after split, follower delays first messages (see + // is_first_message() for details), so leader does not send snapshot to + // follower and CollectSnapshotFilter holds parent region snapshot forever. + // We need to set a short wait duration so that leader can send snapshot + // in time and thus CollectSnapshotFilter can send parent region snapshot. + cluster.cfg.raft_store.snap_wait_split_duration = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. 
+ pd_client.disable_default_operator(); + + let r1 = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + // Force peer 2 to be followers all the way. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 2) + .msg_type(MessageType::MsgRequestVote) + .direction(Direction::Send), + )); + cluster.must_transfer_leader(r1, new_peer(1, 1)); + cluster.must_put(b"k3", b"v3"); + // Pile up snapshots of overlapped region ranges and deliver them all at once. + let (tx, rx) = mpsc::channel(); + cluster.add_recv_filter_on_node(3, Box::new(CollectSnapshotFilter::new(tx))); + pd_client.must_add_peer(r1, new_peer(3, 3)); + let region = cluster.get_region(b"k1"); + // Ensure the snapshot of range ("", "") is sent and piled in filter. + if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { + panic!("the snapshot is not sent before split, e: {:?}", e); + } + // Split the region range and then there should be another snapshot for the + // split ranges. + cluster.must_split(®ion, b"k2"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Ensure the regions work after split. 
+ cluster.must_put(b"k11", b"v11"); + must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); + cluster.must_put(b"k4", b"v4"); + must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); +} + #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] From 3ae1fb4320737c71a1c9d3f8ee6a3b7a9af6f6ea Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 24 Aug 2023 16:36:35 +0800 Subject: [PATCH 003/220] scheduler: not panic in the case of unexepected dropped channel when shutting dowm (#15426) ref tikv/tikv#15202 not panic in the case of unexepected dropped channel when shutting dowm Signed-off-by: SpadeA-Tang --- src/storage/txn/scheduler.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 4df7033c21a..3c6a66c3941 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1665,10 +1665,15 @@ impl TxnScheduler { // it may break correctness. // However, not release latch will cause deadlock which may ultimately block all // following txns, so we panic here. - panic!( - "response channel is unexpectedly dropped, tag {:?}, cid {}", - tag, cid - ); + // + // todo(spadea): Now, we only panic if it's not shutting down, although even in + // close, this behavior is not acceptable. 
+ if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { + panic!( + "response channel is unexpectedly dropped, tag {:?}, cid {}", + tag, cid + ); + } } /// Returns whether it succeeds to write pessimistic locks to the in-memory From 8a44a2c4c11b3da9d776d2877f631922d3833933 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 24 Aug 2023 19:50:06 +0800 Subject: [PATCH 004/220] raftstore: disable duplicated mvcc key compaction check by default (#15427) close tikv/tikv#15282 disable duplicated mvcc key check compaction by default Signed-off-by: SpadeA-Tang --- components/raftstore-v2/src/operation/misc.rs | 2 +- components/raftstore/src/store/config.rs | 27 ++++++++++++++++--- components/raftstore/src/store/fsm/store.rs | 2 +- etc/config-template.toml | 9 +++++++ src/config/mod.rs | 3 +++ tests/integrations/config/mod.rs | 2 +- .../raftstore/test_compact_after_delete.rs | 4 ++- 7 files changed, 41 insertions(+), 8 deletions(-) diff --git a/components/raftstore-v2/src/operation/misc.rs b/components/raftstore-v2/src/operation/misc.rs index 867b4192dac..fafca29ea85 100644 --- a/components/raftstore-v2/src/operation/misc.rs +++ b/components/raftstore-v2/src/operation/misc.rs @@ -102,7 +102,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { self.store_ctx.cfg.region_compact_min_tombstones, self.store_ctx.cfg.region_compact_tombstones_percent, self.store_ctx.cfg.region_compact_min_redundant_rows, - self.store_ctx.cfg.region_compact_redundant_rows_percent, + self.store_ctx.cfg.region_compact_redundant_rows_percent(), ), })) { diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 817be7eb969..257480b4c25 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -140,7 +140,7 @@ pub struct Config { pub region_compact_min_redundant_rows: u64, /// Minimum percentage of redundant rows to trigger manual 
compaction. /// Should between 1 and 100. - pub region_compact_redundant_rows_percent: u64, + pub region_compact_redundant_rows_percent: Option, pub pd_heartbeat_tick_interval: ReadableDuration, pub pd_store_heartbeat_tick_interval: ReadableDuration, pub snap_mgr_gc_tick_interval: ReadableDuration, @@ -429,7 +429,7 @@ impl Default for Config { region_compact_min_tombstones: 10000, region_compact_tombstones_percent: 30, region_compact_min_redundant_rows: 50000, - region_compact_redundant_rows_percent: 20, + region_compact_redundant_rows_percent: None, pd_heartbeat_tick_interval: ReadableDuration::minutes(1), pd_store_heartbeat_tick_interval: ReadableDuration::secs(10), notify_capacity: 40960, @@ -581,6 +581,10 @@ impl Config { self.region_compact_check_step.unwrap() } + pub fn region_compact_redundant_rows_percent(&self) -> u64 { + self.region_compact_redundant_rows_percent.unwrap() + } + #[inline] pub fn warmup_entry_cache_enabled(&self) -> bool { self.max_entry_cache_warmup_duration.0 != Duration::from_secs(0) @@ -604,8 +608,11 @@ impl Config { if self.region_compact_check_step.is_none() { if raft_kv_v2 { self.region_compact_check_step = Some(5); + self.region_compact_redundant_rows_percent = Some(20); } else { self.region_compact_check_step = Some(100); + // Disable redundant rows check in default for v1. 
+ self.region_compact_redundant_rows_percent = Some(100); } } @@ -766,6 +773,15 @@ impl Config { )); } + let region_compact_redundant_rows_percent = + self.region_compact_redundant_rows_percent.unwrap(); + if !(1..=100).contains(®ion_compact_redundant_rows_percent) { + return Err(box_err!( + "region-compact-redundant-rows-percent must between 1 and 100, current value is {}", + region_compact_redundant_rows_percent + )); + } + if self.local_read_batch_size == 0 { return Err(box_err!("local-read-batch-size must be greater than 0")); } @@ -992,8 +1008,11 @@ impl Config { .with_label_values(&["region_compact_min_redundant_rows"]) .set(self.region_compact_min_redundant_rows as f64); CONFIG_RAFTSTORE_GAUGE - .with_label_values(&["region_compact_tombstones_percent"]) - .set(self.region_compact_tombstones_percent as f64); + .with_label_values(&["region_compact_redundant_rows_percent"]) + .set( + self.region_compact_redundant_rows_percent + .unwrap_or_default() as f64, + ); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["pd_heartbeat_tick_interval"]) .set(self.pd_heartbeat_tick_interval.as_secs_f64()); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c21ea65a589..df11ba51fc8 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -2525,7 +2525,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.cfg.region_compact_min_tombstones, self.ctx.cfg.region_compact_tombstones_percent, self.ctx.cfg.region_compact_min_redundant_rows, - self.ctx.cfg.region_compact_redundant_rows_percent, + self.ctx.cfg.region_compact_redundant_rows_percent(), ), }, )) { diff --git a/etc/config-template.toml b/etc/config-template.toml index 36d8d25d883..3c8a6015910 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -437,6 +437,15 @@ ## exceeds `region-compact-tombstones-percent`. 
# region-compact-tombstones-percent = 30 +## The minimum number of duplicated MVCC keys to trigger manual compaction. +# region-compact-min-redundant-rows = 50000 + +## The minimum percentage of duplicated MVCC keys to trigger manual compaction. +## It should be set between 1 and 100. Manual compaction is only triggered when the number of +## duplicated MVCC keys exceeds `region-compact-min-redundant-rows` and the percentage of duplicated MVCC keys +## exceeds `region-compact-redundant-rows-percent`. +# region-compact-redundant-rows-percent = 100 + ## Interval to check whether to start a manual compaction for Lock Column Family. ## If written bytes reach `lock-cf-compact-bytes-threshold` for Lock Column Family, TiKV will ## trigger a manual compaction for Lock Column Family. diff --git a/src/config/mod.rs b/src/config/mod.rs index 5c7f1424c38..ecb31c8aec6 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -5966,6 +5966,9 @@ mod tests { default_cfg .server .optimize_for(default_cfg.coprocessor.region_split_size()); + default_cfg + .raft_store + .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); default_cfg.security.redact_info_log = Some(false); default_cfg.coprocessor.region_max_size = Some(default_cfg.coprocessor.region_max_size()); default_cfg.coprocessor.region_max_keys = Some(default_cfg.coprocessor.region_max_keys()); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a65d4cfb46c..8fdbaa00f25 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -200,7 +200,7 @@ fn test_serde_custom_tikv_config() { region_compact_min_tombstones: 999, region_compact_tombstones_percent: 33, region_compact_min_redundant_rows: 999, - region_compact_redundant_rows_percent: 33, + region_compact_redundant_rows_percent: Some(33), pd_heartbeat_tick_interval: ReadableDuration::minutes(12), pd_store_heartbeat_tick_interval: ReadableDuration::secs(12), notify_capacity: 12_345, diff --git 
a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index a79fdfd4425..24034c83192 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -36,6 +36,7 @@ fn test_compact_after_delete(cluster: &mut Cluster) { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); cluster.cfg.raft_store.region_compact_min_tombstones = 500; cluster.cfg.raft_store.region_compact_tombstones_percent = 50; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(1); cluster.cfg.raft_store.region_compact_check_step = Some(1); cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); @@ -97,6 +98,7 @@ fn test_node_compact_after_delete_v2() { cluster.cfg.raft_store.region_compact_tombstones_percent = 50; // disable it cluster.cfg.raft_store.region_compact_min_redundant_rows = 10000000; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(100); cluster.cfg.raft_store.region_compact_check_step = Some(2); // TODO: v2 doesn't support titan. // cluster.cfg.rocksdb.titan.enabled = true; @@ -167,7 +169,7 @@ fn test_node_compact_after_update_v2() { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); // disable it cluster.cfg.raft_store.region_compact_min_tombstones = 1000000; - cluster.cfg.raft_store.region_compact_redundant_rows_percent = 40; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(40); cluster.cfg.raft_store.region_compact_min_redundant_rows = 50; cluster.cfg.raft_store.region_compact_check_step = Some(2); // TODO: titan is not supported in v2. From 25959655f33ac27985962887d25a0da593fd62c8 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Thu, 24 Aug 2023 22:48:35 +0800 Subject: [PATCH 005/220] server: fix memory trace's leak metrics (#15353) close tikv/tikv#15357 Correct the raft_router/apply_router's alive and leak metrics. 
Signed-off-by: tonyxuqqi --- components/server/src/memory.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/components/server/src/memory.rs b/components/server/src/memory.rs index 303ff257a78..fadf18f7534 100644 --- a/components/server/src/memory.rs +++ b/components/server/src/memory.rs @@ -19,9 +19,24 @@ impl MemoryTraceManager { for id in ids { let sub_trace = provider.sub_trace(id); let sub_trace_name = sub_trace.name(); - MEM_TRACE_SUM_GAUGE - .with_label_values(&[&format!("{}-{}", provider_name, sub_trace_name)]) - .set(sub_trace.sum() as i64) + let leaf_ids = sub_trace.get_children_ids(); + if leaf_ids.is_empty() { + MEM_TRACE_SUM_GAUGE + .with_label_values(&[&format!("{}-{}", provider_name, sub_trace_name)]) + .set(sub_trace.sum() as i64); + } else { + for leaf_id in leaf_ids { + let leaf = sub_trace.sub_trace(leaf_id); + MEM_TRACE_SUM_GAUGE + .with_label_values(&[&format!( + "{}-{}-{}", + provider_name, + sub_trace_name, + leaf.name(), + )]) + .set(leaf.sum() as i64); + } + } } MEM_TRACE_SUM_GAUGE From bea230d98c61de9847121a0f0bb9c4588b20e4de Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 25 Aug 2023 12:35:07 +0800 Subject: [PATCH 006/220] raftstore: fix unwrap panic of region_compact_redundant_rows_percent (#15440) close tikv/tikv#15438 fix unwrap panic of region_compact_redundant_rows_percent Signed-off-by: SpadeA-Tang --- components/raftstore/src/store/config.rs | 8 ++- src/config/mod.rs | 63 ++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 257480b4c25..f96ed2b7a45 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -608,9 +608,15 @@ impl Config { if self.region_compact_check_step.is_none() { if raft_kv_v2 { self.region_compact_check_step = Some(5); - 
self.region_compact_redundant_rows_percent = Some(20); } else { self.region_compact_check_step = Some(100); + } + } + + if self.region_compact_redundant_rows_percent.is_none() { + if raft_kv_v2 { + self.region_compact_redundant_rows_percent = Some(20); + } else { // Disable redundant rows check in default for v1. self.region_compact_redundant_rows_percent = Some(100); } diff --git a/src/config/mod.rs b/src/config/mod.rs index ecb31c8aec6..f7c338379ef 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -6428,4 +6428,67 @@ mod tests { Some(ReadableSize::gb(1)) ); } + + #[test] + fn test_compact_check_default() { + let content = r#" + [raftstore] + region-compact-check-step = 50 + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 50); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 100 + ); + + let content = r#" + [raftstore] + region-compact-check-step = 50 + [storage] + engine = "partitioned-raft-kv" + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 50); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 20 + ); + + let content = r#" + [raftstore] + region-compact-redundant-rows-percent = 50 + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 100); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 50 + ); + + let content = r#" + [raftstore] + region-compact-redundant-rows-percent = 50 + [storage] + engine = "partitioned-raft-kv" + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 5); + assert_eq!( + cfg.raft_store + 
.region_compact_redundant_rows_percent + .unwrap(), + 50 + ); + } } From 40440210d81ea1770d5921475a51350f0bee50cd Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 25 Aug 2023 00:14:05 -0700 Subject: [PATCH 007/220] batch-system: use concurrent hashmap to avoid router cache (#15431) close tikv/tikv#15430 Use concurrent hashmap to avoid router cache occupying too much memory Signed-off-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 5 +- components/batch-system/Cargo.toml | 1 + components/batch-system/src/router.rs | 171 +++++------------- components/batch-system/tests/cases/router.rs | 20 +- components/raftstore/src/store/fsm/store.rs | 6 - components/tikv_util/src/mpsc/mod.rs | 25 ++- 6 files changed, 65 insertions(+), 163 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index abe174e638f..3c44a639e38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -652,6 +652,7 @@ dependencies = [ "collections", "criterion", "crossbeam", + "dashmap", "derive_more", "fail", "file_system", @@ -1449,9 +1450,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" +checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" dependencies = [ "cfg-if 1.0.0", "num_cpus", diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index ac69d544a21..bd1ae6c56b4 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -10,6 +10,7 @@ test-runner = ["derive_more"] [dependencies] collections = { workspace = true } crossbeam = "0.8" +dashmap = "5.2" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 119b7875506..4f886fe3b3d 100644 --- 
a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -1,21 +1,17 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use std::{ - cell::Cell, - mem, - sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, Mutex, - }, +use std::sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, }; -use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; +use dashmap::DashMap; +use tikv_util::{debug, info, time::Instant, Either}; use crate::{ - fsm::{Fsm, FsmScheduler, FsmState}, + fsm::{Fsm, FsmScheduler}, mailbox::{BasicMailbox, Mailbox}, metrics::*, }; @@ -27,18 +23,14 @@ pub struct RouterTrace { pub leak: usize, } -struct NormalMailMap { - map: HashMap>, - // Count of Mailboxes that is stored in `map`. - alive_cnt: Arc, -} - enum CheckDoResult { NotExist, Invalid, Valid(T), } +const ROUTER_SHRINK_SIZE: usize = 1000; + /// Router routes messages to its target FSM's mailbox. /// /// In our abstract model, every batch system has two different kind of @@ -54,8 +46,7 @@ enum CheckDoResult { /// Normal FSM and control FSM can have different scheduler, but this is not /// required. pub struct Router { - normals: Arc>>, - caches: Cell>>, + normals: Arc>>, pub(super) control_box: BasicMailbox, // TODO: These two schedulers should be unified as single one. However // it's not possible to write FsmScheduler + FsmScheduler @@ -85,11 +76,7 @@ where state_cnt: Arc, ) -> Router { Router { - normals: Arc::new(Mutex::new(NormalMailMap { - map: HashMap::default(), - alive_cnt: Arc::default(), - })), - caches: Cell::new(LruCache::with_capacity_and_sample(1024, 7)), + normals: Arc::new(DashMap::default()), control_box, normal_scheduler, control_scheduler, @@ -106,72 +93,32 @@ where /// A helper function that tries to unify a common access pattern to /// mailbox. 
/// - /// Generally, when sending a message to a mailbox, cache should be - /// check first, if not found, lock should be acquired. - /// /// Returns None means there is no mailbox inside the normal registry. /// Some(None) means there is expected mailbox inside the normal registry /// but it returns None after apply the given function. Some(Some) means - /// the given function returns Some and cache is updated if it's invalid. + /// the given function returns Some. #[inline] fn check_do(&self, addr: u64, mut f: F) -> CheckDoResult where F: FnMut(&BasicMailbox) -> Option, { - let caches = unsafe { &mut *self.caches.as_ptr() }; - let mut connected = true; - if let Some(mailbox) = caches.get(&addr) { - match f(mailbox) { - Some(r) => return CheckDoResult::Valid(r), - None => { - connected = false; - } - } - } - - let (cnt, mailbox) = { - let mut boxes = self.normals.lock().unwrap(); - let cnt = boxes.map.len(); - let b = match boxes.map.get_mut(&addr) { - Some(mailbox) => mailbox.clone(), - None => { - drop(boxes); - if !connected { - caches.remove(&addr); - } - return CheckDoResult::NotExist; - } - }; - (cnt, b) - }; - if cnt > caches.capacity() || cnt < caches.capacity() / 2 { - caches.resize(cnt); - } - - let res = f(&mailbox); - match res { - Some(r) => { - caches.insert(addr, mailbox); - CheckDoResult::Valid(r) - } + let mailbox = match self.normals.get_mut(&addr) { + Some(mailbox) => mailbox, None => { - if !connected { - caches.remove(&addr); - } - CheckDoResult::Invalid + return CheckDoResult::NotExist; } + }; + match f(&mailbox) { + Some(r) => CheckDoResult::Valid(r), + None => CheckDoResult::Invalid, } } /// Register a mailbox with given address. 
pub fn register(&self, addr: u64, mailbox: BasicMailbox) { - let mut normals = self.normals.lock().unwrap(); - if let Some(mailbox) = normals.map.insert(addr, mailbox) { + if let Some(mailbox) = self.normals.insert(addr, mailbox) { mailbox.close(); } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); } /// Same as send a message and then register the mailbox. @@ -183,32 +130,22 @@ where mailbox: BasicMailbox, msg: N::Message, ) -> Result<(), (BasicMailbox, N::Message)> { - let mut normals = self.normals.lock().unwrap(); - // Send has to be done within lock, otherwise the message may be handled - // before the mailbox is register. + if let Some(mailbox) = self.normals.insert(addr, mailbox.clone()) { + mailbox.close(); + } if let Err(SendError(m)) = mailbox.force_send(msg, &self.normal_scheduler) { + self.normals.remove(&addr); return Err((mailbox, m)); } - if let Some(mailbox) = normals.map.insert(addr, mailbox) { - mailbox.close(); - } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); Ok(()) } pub fn register_all(&self, mailboxes: Vec<(u64, BasicMailbox)>) { - let mut normals = self.normals.lock().unwrap(); - normals.map.reserve(mailboxes.len()); for (addr, mailbox) in mailboxes { - if let Some(m) = normals.map.insert(addr, mailbox) { + if let Some(m) = self.normals.insert(addr, mailbox) { m.close(); } } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); } /// Get the mailbox of specified address. 
@@ -280,13 +217,11 @@ where pub fn force_send(&self, addr: u64, msg: N::Message) -> Result<(), SendError> { match self.send(addr, msg) { Ok(()) => Ok(()), - Err(TrySendError::Full(m)) => { - let caches = unsafe { &mut *self.caches.as_ptr() }; - caches - .get(&addr) - .unwrap() - .force_send(m, &self.normal_scheduler) - } + Err(TrySendError::Full(m)) => self + .normals + .get(&addr) + .unwrap() + .force_send(m, &self.normal_scheduler), Err(TrySendError::Disconnected(m)) => { if self.is_shutdown() { Ok(()) @@ -321,10 +256,9 @@ where /// Try to notify all normal FSMs a message. pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { let timer = Instant::now_coarse(); - let mailboxes = self.normals.lock().unwrap(); - for mailbox in mailboxes.map.values() { + self.normals.iter().for_each(|mailbox| { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); - } + }); BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } @@ -332,12 +266,13 @@ where pub fn broadcast_shutdown(&self) { info!("broadcasting shutdown"); self.shutdown.store(true, Ordering::SeqCst); - unsafe { &mut *self.caches.as_ptr() }.clear(); - let mut mailboxes = self.normals.lock().unwrap(); - for (addr, mailbox) in mailboxes.map.drain() { + for e in self.normals.iter() { + let addr = e.key(); + let mailbox = e.value(); debug!("[region {}] shutdown mailbox", addr); mailbox.close(); } + self.normals.clear(); self.control_box.close(); self.normal_scheduler.shutdown(); self.control_scheduler.shutdown(); @@ -346,51 +281,32 @@ where /// Close the mailbox of address. 
pub fn close(&self, addr: u64) { info!("shutdown mailbox"; "region_id" => addr); - unsafe { &mut *self.caches.as_ptr() }.remove(&addr); - let mut mailboxes = self.normals.lock().unwrap(); - if let Some(mb) = mailboxes.map.remove(&addr) { + if let Some((_, mb)) = self.normals.remove(&addr) { mb.close(); } - mailboxes - .alive_cnt - .store(mailboxes.map.len(), Ordering::Relaxed); - } - - pub fn clear_cache(&self) { - unsafe { &mut *self.caches.as_ptr() }.clear(); + if self.normals.capacity() - self.normals.len() > ROUTER_SHRINK_SIZE { + self.normals.shrink_to_fit(); + } } pub fn state_cnt(&self) -> &Arc { &self.state_cnt } - pub fn alive_cnt(&self) -> Arc { - self.normals.lock().unwrap().alive_cnt.clone() + pub fn alive_cnt(&self) -> usize { + self.normals.len() } pub fn trace(&self) -> RouterTrace { - let alive = self.normals.lock().unwrap().alive_cnt.clone(); + let alive = self.alive_cnt(); let total = self.state_cnt.load(Ordering::Relaxed); - let alive = alive.load(Ordering::Relaxed); // 1 represents the control fsm. let leak = if total > alive + 1 { total - alive - 1 } else { 0 }; - let mailbox_unit = mem::size_of::<(u64, BasicMailbox)>(); - let state_unit = mem::size_of::>(); - // Every message in crossbeam sender needs 8 bytes to store state. - let message_unit = mem::size_of::() + 8; - // crossbeam unbounded channel sender has a list of blocks. Every block has 31 - // unit and every sender has at least one sender. - let sender_block_unit = 31; - RouterTrace { - alive: (mailbox_unit * 8 / 7 // hashmap uses 7/8 of allocated memory. - + state_unit + message_unit * sender_block_unit) - * alive, - leak: (state_unit + message_unit * sender_block_unit) * leak, - } + RouterTrace { alive, leak } } } @@ -398,7 +314,6 @@ impl Clone for Router { fn clone(&self) -> Router { Router { normals: self.normals.clone(), - caches: Cell::new(LruCache::with_capacity_and_sample(1024, 7)), control_box: self.control_box.clone(), // These two schedulers should be unified as single one. 
However // it's not possible to write FsmScheduler + FsmScheduler diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index d746dfad5cb..66d0770d544 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -143,25 +143,19 @@ fn test_router_trace() { router.close(addr); }; - let router_clone = router.clone(); + let mut mailboxes = vec![]; for i in 0..10 { register_runner(i); - // Read mailbox to cache. - router_clone.mailbox(i).unwrap(); + mailboxes.push(router.mailbox(i).unwrap()); } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 10); + assert_eq!(router.alive_cnt(), 10); assert_eq!(router.state_cnt().load(Ordering::Relaxed), 11); - // Routers closed but exist in the cache. for i in 0..10 { close_runner(i); } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 0); + assert_eq!(router.alive_cnt(), 0); assert_eq!(router.state_cnt().load(Ordering::Relaxed), 11); - for i in 0..1024 { - register_runner(i); - // Read mailbox to cache, closed routers should be evicted. 
- router_clone.mailbox(i).unwrap(); - } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 1024); - assert_eq!(router.state_cnt().load(Ordering::Relaxed), 1025); + drop(mailboxes); + assert_eq!(router.alive_cnt(), 0); + assert_eq!(router.state_cnt().load(Ordering::Relaxed), 1); } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index df11ba51fc8..11167a4c395 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -468,10 +468,6 @@ where self.update_trace(); } - pub fn clear_cache(&self) { - self.router.clear_cache(); - } - fn update_trace(&self) { let router_trace = self.router.trace(); MEMTRACE_RAFT_ROUTER_ALIVE.trace(TraceEvent::Reset(router_trace.alive)); @@ -1847,8 +1843,6 @@ impl RaftBatchSystem { warn!("set thread priority for raftstore failed"; "error" => ?e); } self.workers = Some(workers); - // This router will not be accessed again, free all caches. - self.router.clear_cache(); Ok(()) } diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 700691f1189..9a71dbc0c5e 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -8,9 +8,8 @@ pub mod future; pub mod priority_queue; use std::{ - cell::Cell, sync::{ - atomic::{AtomicBool, AtomicIsize, Ordering}, + atomic::{AtomicBool, AtomicIsize, AtomicUsize, Ordering}, Arc, }, time::Duration, @@ -208,7 +207,7 @@ const CHECK_INTERVAL: usize = 8; /// A sender of channel that limits the maximun pending messages count loosely. pub struct LooseBoundedSender { sender: Sender, - tried_cnt: Cell, + tried_cnt: AtomicUsize, limit: usize, } @@ -230,25 +229,23 @@ impl LooseBoundedSender { /// Send a message regardless its capacity limit. 
#[inline] pub fn force_send(&self, t: T) -> Result<(), SendError> { - let cnt = self.tried_cnt.get(); - self.tried_cnt.set(cnt + 1); + self.tried_cnt.fetch_add(1, Ordering::AcqRel); self.sender.send(t) } /// Attempts to send a message into the channel without blocking. #[inline] pub fn try_send(&self, t: T) -> Result<(), TrySendError> { - let cnt = self.tried_cnt.get(); let check_interval = || { fail_point!("loose_bounded_sender_check_interval", |_| 0); CHECK_INTERVAL }; - if cnt < check_interval() { - self.tried_cnt.set(cnt + 1); - } else if self.len() < self.limit { - self.tried_cnt.set(1); - } else { - return Err(TrySendError::Full(t)); + if self.tried_cnt.fetch_add(1, Ordering::AcqRel) >= check_interval() { + if self.len() < self.limit { + self.tried_cnt.store(1, Ordering::Release); + } else { + return Err(TrySendError::Full(t)); + } } match self.sender.send(t) { @@ -275,7 +272,7 @@ impl Clone for LooseBoundedSender { fn clone(&self) -> LooseBoundedSender { LooseBoundedSender { sender: self.sender.clone(), - tried_cnt: self.tried_cnt.clone(), + tried_cnt: AtomicUsize::new(0), limit: self.limit, } } @@ -287,7 +284,7 @@ pub fn loose_bounded(cap: usize) -> (LooseBoundedSender, Receiver) { ( LooseBoundedSender { sender, - tried_cnt: Cell::new(0), + tried_cnt: AtomicUsize::new(0), limit: cap, }, receiver, From 40b225f70c92db96baae7b85891c193c1674d2d4 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Fri, 25 Aug 2023 15:29:05 +0800 Subject: [PATCH 008/220] raftstore: fix meta inconsistency issue (#15423) close tikv/tikv#13311 Fix the possible meta inconsistency issue. 
Signed-off-by: cfzjywxk Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/peer.rs | 62 ++++++++------ components/raftstore/src/store/fsm/store.rs | 3 +- .../raftstore/src/store/peer_storage.rs | 3 + tests/failpoints/cases/test_split_region.rs | 80 ++++++++++++++++++- 4 files changed, 121 insertions(+), 27 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index da91e26eb09..62a3a2650de 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -97,7 +97,7 @@ use crate::{ UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, }, util, - util::{is_region_initialized, KeysInfoFormatter, LeaseState}, + util::{KeysInfoFormatter, LeaseState}, worker::{ Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, @@ -322,6 +322,7 @@ where "replicate peer"; "region_id" => region_id, "peer_id" => peer.get_id(), + "store_id" => store_id, ); let mut region = metapb::Region::default(); @@ -2460,6 +2461,7 @@ where } }); + let is_initialized_peer = self.fsm.peer.is_initialized(); debug!( "handle raft message"; "region_id" => self.region_id(), @@ -2467,6 +2469,7 @@ where "message_type" => %util::MsgType(&msg), "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), + "is_initialized_peer" => is_initialized_peer, ); if self.fsm.peer.pending_remove || self.fsm.stopped { @@ -3664,14 +3667,7 @@ where } let region_id = self.region_id(); - let is_initialized = self.fsm.peer.is_initialized(); - info!( - "starts destroy"; - "region_id" => region_id, - "peer_id" => self.fsm.peer_id(), - "merged_by_target" => merged_by_target, - "is_initialized" => is_initialized, - ); + let is_peer_initialized = self.fsm.peer.is_initialized(); // We can't destroy a peer which is handling snapshot. 
assert!(!self.fsm.peer.is_handling_snapshot()); @@ -3688,27 +3684,40 @@ where .snapshot_recovery_maybe_finish_wait_apply(/* force= */ true); } + (|| { + fail_point!( + "before_destroy_peer_on_peer_1003", + self.fsm.peer.peer_id() == 1003, + |_| {} + ); + })(); let mut meta = self.ctx.store_meta.lock().unwrap(); - let is_region_initialized_in_meta = meta - .regions - .get(®ion_id) - .map_or(false, |region| is_region_initialized(region)); - if !is_initialized && is_region_initialized_in_meta { - let region_in_meta = meta.regions.get(®ion_id).unwrap(); - error!( - "peer is destroyed inconsistently"; - "region_id" => region_id, + let is_latest_initialized = { + if let Some(latest_region_info) = meta.regions.get(®ion_id) { + util::is_region_initialized(latest_region_info) + } else { + false + } + }; + + if !is_peer_initialized && is_latest_initialized { + info!("skip destroy uninitialized peer as it's already initialized in meta"; + "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), - "peers" => ?self.region().get_peers(), "merged_by_target" => merged_by_target, - "is_initialized" => is_initialized, - "is_region_initialized_in_meta" => is_region_initialized_in_meta, - "start_key_in_meta" => log_wrappers::Value::key(region_in_meta.get_start_key()), - "end_key_in_meta" => log_wrappers::Value::key(region_in_meta.get_end_key()), - "peers_in_meta" => ?region_in_meta.get_peers(), ); + return false; } + info!( + "starts destroy"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "merged_by_target" => merged_by_target, + "is_peer_initialized" => is_peer_initialized, + "is_latest_initialized" => is_latest_initialized, + ); + if meta.atomic_snap_regions.contains_key(&self.region_id()) { drop(meta); panic!( @@ -3764,7 +3773,7 @@ where self.ctx.router.close(region_id); self.fsm.stop(); - if is_initialized + if is_peer_initialized && !merged_by_target && meta .region_ranges @@ -3773,6 +3782,7 @@ where { panic!("{} meta corruption 
detected", self.fsm.peer.tag); } + if meta.regions.remove(®ion_id).is_none() && !merged_by_target { panic!("{} meta corruption detected", self.fsm.peer.tag) } @@ -4139,6 +4149,7 @@ where // Insert new regions and validation let mut is_uninitialized_peer_exist = false; + let self_store_id = self.ctx.store.get_id(); if let Some(r) = meta.regions.get(&new_region_id) { // Suppose a new node is added by conf change and the snapshot comes slowly. // Then, the region splits and the first vote message comes to the new node @@ -4160,6 +4171,7 @@ where "region_id" => new_region_id, "region" => ?new_region, "is_uninitialized_peer_exist" => is_uninitialized_peer_exist, + "store_id" => self_store_id, ); let (sender, mut new_peer) = match PeerFsm::create( diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 11167a4c395..53559bbe1b8 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1955,7 +1955,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } info!( "region doesn't exist yet, wait for it to be split"; - "region_id" => region_id + "region_id" => region_id, + "to_peer_id" => msg.get_to_peer().get_id(), ); return Ok(CheckMsgStatus::FirstRequest); } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index d89eafc3a46..a888929ca98 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1017,6 +1017,9 @@ where // The `region` is updated after persisting in order to stay consistent with the // one in `StoreMeta::regions` (will be updated soon). // See comments in `apply_snapshot` for more details. 
+ (|| { + fail_point!("before_set_region_on_peer_3", self.peer_id == 3, |_| {}); + })(); self.set_region(res.region.clone()); } } diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 3520de4e3ad..dfd7002495c 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1,5 +1,4 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - use std::{ sync::{ atomic::{AtomicBool, Ordering}, @@ -41,6 +40,85 @@ use tikv_util::{ }; use txn_types::{Key, LastChange, PessimisticLock, TimeStamp}; +#[test] +fn test_meta_inconsistency() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.apply_batch_system.pool_size = 2; + cluster.cfg.raft_store.apply_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.hibernate_regions = false; + cluster.cfg.raft_store.raft_log_gc_threshold = 1000; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + cluster.must_transfer_leader(region_id, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + + // Add new peer on node 3, its snapshot apply is paused. + fail::cfg("before_set_region_on_peer_3", "pause").unwrap(); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + + // Let only heartbeat msg to pass so a replicate peer could be created on node 3 + // for peer 1003. + let region_packet_filter_region_1000_peer_1003 = + RegionPacketFilter::new(1000, 3).skip(MessageType::MsgHeartbeat); + cluster + .sim + .wl() + .add_recv_filter(3, Box::new(region_packet_filter_region_1000_peer_1003)); + + // Trigger a region split to create region 1000 with peer 1001, 1002 and 1003. 
+ let region = cluster.get_region(b""); + cluster.must_split(®ion, b"k5"); + + // Scheduler a larger peed id heartbeat msg to trigger peer destroy for peer + // 1003, pause it before the meta.lock operation so new region insertions by + // region split could go first. + // Thus a inconsistency could happen because the destroy is handled + // by a uninitialized peer but the new initialized region info is inserted into + // the meta by region split. + fail::cfg("before_destroy_peer_on_peer_1003", "pause").unwrap(); + let new_region = cluster.get_region(b"k4"); + let mut larger_id_msg = Box::::default(); + larger_id_msg.set_region_id(1000); + larger_id_msg.set_to_peer(new_peer(3, 1113)); + larger_id_msg.set_region_epoch(new_region.get_region_epoch().clone()); + larger_id_msg + .mut_region_epoch() + .set_conf_ver(new_region.get_region_epoch().get_conf_ver() + 1); + larger_id_msg.set_from_peer(new_peer(1, 1001)); + let raft_message = larger_id_msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(1001); + raft_message.set_to(1113); + raft_message.set_term(6); + cluster.sim.wl().send_raft_msg(*larger_id_msg).unwrap(); + thread::sleep(Duration::from_millis(500)); + + // Let snapshot apply continue on peer 3 from region 0, then region split would + // be applied too. + fail::remove("before_set_region_on_peer_3"); + thread::sleep(Duration::from_millis(2000)); + + // Let self destroy continue after the region split is finished. 
+ fail::remove("before_destroy_peer_on_peer_1003"); + sleep_ms(1000); + + // Clear the network partition nemesis, trigger a new region split, panic would + // be encountered The thread 'raftstore-3-1::test_message_order_3' panicked + // at 'meta corrupted: no region for 1000 7A6B35 when creating 1004 + // region_id: 1004 from_peer { id: 1005 store_id: 1 } to_peer { id: 1007 + // store_id: 3 } message { msg_type: MsgRequestPreVote to: 1007 from: 1005 + // term: 6 log_term: 5 index: 5 commit: 5 commit_term: 5 } region_epoch { + // conf_ver: 3 version: 3 } end_key: 6B32'. + cluster.sim.wl().clear_recv_filters(3); + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + cluster.must_put(b"k1", b"v1"); +} + #[test] fn test_follower_slow_split() { let mut cluster = new_node_cluster(0, 3); From 503648f18312b8978f19b17f4e58b3f011bb3cb0 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 25 Aug 2023 18:42:35 +0800 Subject: [PATCH 009/220] *: add memory quota to resolved_ts::Resolver (#15400) ref tikv/tikv#14864 This is the first PR to fix OOM caused by Resolver tracking large txns. Resolver checks memory quota before tracking a lock, and returns false if it exceeds memory quota. 
Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../backup-stream/src/subscription_track.rs | 19 ++- components/cdc/src/channel.rs | 87 ++------------ components/cdc/src/delegate.rs | 23 ++-- components/cdc/src/endpoint.rs | 49 ++++---- components/cdc/src/initializer.rs | 14 ++- components/cdc/src/lib.rs | 2 +- components/cdc/src/service.rs | 10 +- components/cdc/tests/mod.rs | 7 +- components/resolved_ts/src/endpoint.rs | 100 ++++++++++------ components/resolved_ts/src/resolver.rs | 87 +++++++++++--- components/server/src/server.rs | 9 +- components/server/src/server2.rs | 9 +- components/tikv_util/src/memory.rs | 113 +++++++++++++++++- 13 files changed, 347 insertions(+), 182 deletions(-) diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index e92759bc2b2..ef6e24d9d8f 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -9,7 +9,7 @@ use dashmap::{ use kvproto::metapb::Region; use raftstore::coprocessor::*; use resolved_ts::Resolver; -use tikv_util::{info, warn}; +use tikv_util::{info, memory::MemoryQuota, warn}; use txn_types::TimeStamp; use crate::{debug, metrics::TRACK_REGION, utils}; @@ -401,7 +401,7 @@ impl<'a> SubscriptionRef<'a> { } } -/// This enhanced version of `Resolver` allow some unordered lock events. +/// This enhanced version of `Resolver` allow some unordered lock events. /// The name "2-phase" means this is used for 2 *concurrency* phases of /// observing a region: /// 1. Doing the initial scanning. @@ -479,7 +479,8 @@ impl TwoPhaseResolver { if !self.in_phase_one() { warn!("backup stream tracking lock as if in phase one"; "start_ts" => %start_ts, "key" => %utils::redact(&key)) } - self.resolver.track_lock(start_ts, key, None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
+ assert!(self.resolver.track_lock(start_ts, key, None)); } pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec) { @@ -487,7 +488,8 @@ impl TwoPhaseResolver { self.future_locks.push(FutureLock::Lock(key, start_ts)); return; } - self.resolver.track_lock(start_ts, key, None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + assert!(self.resolver.track_lock(start_ts, key, None)); } pub fn untrack_lock(&mut self, key: &[u8]) { @@ -501,7 +503,10 @@ impl TwoPhaseResolver { fn handle_future_lock(&mut self, lock: FutureLock) { match lock { - FutureLock::Lock(key, ts) => self.resolver.track_lock(ts, key, None), + FutureLock::Lock(key, ts) => { + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + assert!(self.resolver.track_lock(ts, key, None)); + } FutureLock::Unlock(key) => self.resolver.untrack_lock(&key, None), } } @@ -523,8 +528,10 @@ impl TwoPhaseResolver { } pub fn new(region_id: u64, stable_ts: Option) -> Self { + // TODO: limit the memory usage of the resolver. + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); Self { - resolver: Resolver::new(region_id), + resolver: Resolver::new(region_id, memory_quota), future_locks: Default::default(), stable_ts, } diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index b11799d87c1..6a8c3d5c3aa 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -1,13 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - fmt, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; +use std::{fmt, sync::Arc, time::Duration}; use futures::{ channel::mpsc::{ @@ -20,7 +13,9 @@ use futures::{ use grpcio::WriteFlags; use kvproto::cdcpb::{ChangeDataEvent, Event, ResolvedTs}; use protobuf::Message; -use tikv_util::{future::block_on_timeout, impl_display_as_debug, time::Instant, warn}; +use tikv_util::{ + future::block_on_timeout, impl_display_as_debug, memory::MemoryQuota, time::Instant, warn, +}; use crate::metrics::*; @@ -185,71 +180,7 @@ impl EventBatcher { } } -#[derive(Clone)] -pub struct MemoryQuota { - capacity: Arc, - in_use: Arc, -} - -impl MemoryQuota { - pub fn new(capacity: usize) -> MemoryQuota { - MemoryQuota { - capacity: Arc::new(AtomicUsize::new(capacity)), - in_use: Arc::new(AtomicUsize::new(0)), - } - } - - pub fn in_use(&self) -> usize { - self.in_use.load(Ordering::Relaxed) - } - - pub(crate) fn capacity(&self) -> usize { - self.capacity.load(Ordering::Acquire) - } - - pub(crate) fn set_capacity(&self, capacity: usize) { - self.capacity.store(capacity, Ordering::Release) - } - - fn alloc(&self, bytes: usize) -> bool { - let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); - let capacity = self.capacity.load(Ordering::Acquire); - loop { - if in_use_bytes + bytes > capacity { - return false; - } - let new_in_use_bytes = in_use_bytes + bytes; - match self.in_use.compare_exchange_weak( - in_use_bytes, - new_in_use_bytes, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => return true, - Err(current) => in_use_bytes = current, - } - } - } - - fn free(&self, bytes: usize) { - let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); - loop { - // Saturating at the numeric bounds instead of overflowing. 
- let new_in_use_bytes = in_use_bytes - std::cmp::min(bytes, in_use_bytes); - match self.in_use.compare_exchange_weak( - in_use_bytes, - new_in_use_bytes, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => return, - Err(current) => in_use_bytes = current, - } - } - } -} - -pub fn channel(buffer: usize, memory_quota: MemoryQuota) -> (Sink, Drain) { +pub fn channel(buffer: usize, memory_quota: Arc) -> (Sink, Drain) { let (unbounded_sender, unbounded_receiver) = unbounded(); let (bounded_sender, bounded_receiver) = bounded(buffer); ( @@ -304,7 +235,7 @@ impl_from_future_send_error! { pub struct Sink { unbounded_sender: UnboundedSender<(CdcEvent, usize)>, bounded_sender: Sender<(CdcEvent, usize)>, - memory_quota: MemoryQuota, + memory_quota: Arc, } impl Sink { @@ -354,7 +285,7 @@ impl Sink { pub struct Drain { unbounded_receiver: UnboundedReceiver<(CdcEvent, usize)>, bounded_receiver: Receiver<(CdcEvent, usize)>, - memory_quota: MemoryQuota, + memory_quota: Arc, } impl<'a> Drain { @@ -451,7 +382,7 @@ mod tests { type Send = Box Result<(), SendError>>; fn new_test_channel(buffer: usize, capacity: usize, force_send: bool) -> (Send, Drain) { - let memory_quota = MemoryQuota::new(capacity); + let memory_quota = Arc::new(MemoryQuota::new(capacity)); let (mut tx, rx) = channel(buffer, memory_quota); let mut flag = true; let send = move |event| { @@ -599,7 +530,7 @@ mod tests { // 1KB let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size(); - let memory_quota = MemoryQuota::new(max_pending_bytes as _); + let memory_quota = Arc::new(MemoryQuota::new(max_pending_bytes as _)); let (tx, _rx) = channel(buffer as _, memory_quota); for _ in 0..buffer { tx.unbounded_send(CdcEvent::Event(e.clone()), false) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 4c8b2226f49..da5c26aad30 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -414,7 +414,10 @@ impl Delegate { for lock in mem::take(&mut 
pending.locks) { match lock { - PendingLock::Track { key, start_ts } => resolver.track_lock(start_ts, key, None), + PendingLock::Track { key, start_ts } => { + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + assert!(resolver.track_lock(start_ts, key, None)); + } PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), } } @@ -822,7 +825,8 @@ impl Delegate { // In order to compute resolved ts, we must track inflight txns. match self.resolver { Some(ref mut resolver) => { - resolver.track_lock(row.start_ts.into(), row.key.clone(), None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + assert!(resolver.track_lock(row.start_ts.into(), row.key.clone(), None)); } None => { assert!(self.pending.is_some(), "region resolver not ready"); @@ -1151,9 +1155,10 @@ mod tests { use api_version::RawValue; use futures::{executor::block_on, stream::StreamExt}; use kvproto::{errorpb::Error as ErrorHeader, metapb::Region}; + use tikv_util::memory::MemoryQuota; use super::*; - use crate::channel::{channel, recv_timeout, MemoryQuota}; + use crate::channel::{channel, recv_timeout}; #[test] fn test_error() { @@ -1165,7 +1170,7 @@ mod tests { region.mut_region_epoch().set_conf_ver(2); let region_epoch = region.get_region_epoch().clone(); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (sink, mut drain) = crate::channel::channel(1, quota); let rx = drain.drain(); let request_id = 123; @@ -1182,7 +1187,8 @@ mod tests { let mut delegate = Delegate::new(region_id, Default::default()); delegate.subscribe(downstream).unwrap(); assert!(delegate.handle.is_observing()); - let resolver = Resolver::new(region_id); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(region_id, memory_quota); assert!(delegate.on_region_ready(resolver, region).is_empty()); assert!(delegate.downstreams()[0].observed_range.all_key_covered); @@ 
-1333,7 +1339,8 @@ mod tests { region.mut_region_epoch().set_conf_ver(1); region.mut_region_epoch().set_version(1); { - let failures = delegate.on_region_ready(Resolver::new(1), region); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let failures = delegate.on_region_ready(Resolver::new(1, memory_quota), region); assert_eq!(failures.len(), 1); let id = failures[0].0.id; delegate.unsubscribe(id, None); @@ -1456,7 +1463,7 @@ mod tests { } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, @@ -1529,7 +1536,7 @@ mod tests { } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 23a3e410467..72042bb5aec 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -41,6 +41,7 @@ use tikv::{ }; use tikv_util::{ debug, defer, error, impl_display_as_debug, info, + memory::MemoryQuota, mpsc::bounded, slow_log, sys::thread::ThreadBuildWrapper, @@ -56,7 +57,7 @@ use tokio::{ use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ - channel::{CdcEvent, MemoryQuota, SendError}, + channel::{CdcEvent, SendError}, delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, initializer::Initializer, metrics::*, @@ -370,7 +371,7 @@ pub struct Endpoint { scan_speed_limiter: Limiter, max_scan_batch_bytes: usize, max_scan_batch_size: usize, - sink_memory_quota: MemoryQuota, + sink_memory_quota: Arc, old_value_cache: OldValueCache, resolved_region_heap: RefCell, @@ -401,7 +402,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, security_mgr: Arc, - sink_memory_quota: MemoryQuota, + 
sink_memory_quota: Arc, causal_ts_provider: Option>, ) -> Endpoint { let workers = Builder::new_multi_thread() @@ -1455,7 +1456,7 @@ mod tests { ConcurrencyManager::new(1.into()), env, security_mgr, - MemoryQuota::new(usize::MAX), + Arc::new(MemoryQuota::new(usize::MAX)), causal_ts_provider, ); @@ -1476,7 +1477,7 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -1732,7 +1733,7 @@ mod tests { #[test] fn test_raftstore_is_busy() { - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, _rx) = channel::channel(1, quota); let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); @@ -1785,7 +1786,7 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -1966,7 +1967,7 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); let mut region = Region::default(); @@ -1999,7 +2000,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(1); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(1, memory_quota); let observe_id = suite.endpoint.capture_regions[&1].handle.id; suite.on_region_ready(observe_id, resolver, region.clone()); suite.run(Task::MinTs { @@ -2035,7 +2037,8 @@ mod tests { downstream, conn_id, }); - let resolver = 
Resolver::new(2); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(2, memory_quota); region.set_id(2); let observe_id = suite.endpoint.capture_regions[&2].handle.id; suite.on_region_ready(observe_id, resolver, region); @@ -2056,7 +2059,7 @@ mod tests { } // Register region 3 to another conn which is not support batch resolved ts. - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx2) = channel::channel(1, quota); let mut rx2 = rx2.drain(); let mut region = Region::default(); @@ -2084,7 +2087,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(3); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(3, memory_quota); region.set_id(3); let observe_id = suite.endpoint.capture_regions[&3].handle.id; suite.on_region_ready(observe_id, resolver, region); @@ -2127,7 +2131,7 @@ mod tests { fn test_deregister() { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -2279,7 +2283,7 @@ mod tests { // Open two connections a and b, registers region 1, 2 to conn a and // region 3 to conn b. 
let mut conn_rxs = vec![]; - let quota = channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); for region_ids in vec![vec![1, 2], vec![3]] { let (tx, rx) = channel::channel(1, quota.clone()); conn_rxs.push(rx); @@ -2311,7 +2315,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(region_id); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(region_id, memory_quota); let observe_id = suite.endpoint.capture_regions[®ion_id].handle.id; let mut region = Region::default(); region.set_id(region_id); @@ -2392,7 +2397,7 @@ mod tests { fn test_deregister_conn_then_delegate() { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); // Open conn a let (tx1, _rx1) = channel::channel(1, quota.clone()); @@ -2470,10 +2475,11 @@ mod tests { let mut region = Region::default(); region.id = 1; region.set_region_epoch(region_epoch_2); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); suite.run(Task::ResolverReady { observe_id, region: region.clone(), - resolver: Resolver::new(1), + resolver: Resolver::new(1, memory_quota), }); // Deregister deletgate due to epoch not match for conn b. 
@@ -2557,7 +2563,7 @@ mod tests { ..Default::default() }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -2596,8 +2602,9 @@ mod tests { conn_id, }); - let mut resolver = Resolver::new(id); - resolver.track_lock(TimeStamp::compose(0, id), vec![], None); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(id, memory_quota); + assert!(resolver.track_lock(TimeStamp::compose(0, id), vec![], None)); let mut region = Region::default(); region.id = id; region.set_region_epoch(region_epoch); @@ -2646,7 +2653,7 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 2c0884bb303..44b564ce663 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -36,6 +36,7 @@ use tikv_util::{ box_err, codec::number, debug, error, info, + memory::MemoryQuota, sys::inspector::{self_thread_inspector, ThreadInspector}, time::{Instant, Limiter}, warn, @@ -215,7 +216,9 @@ impl Initializer { "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); let mut resolver = if self.build_resolver { - Some(Resolver::new(region_id)) + // TODO: limit the memory usage of the resolver. 
+ let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + Some(Resolver::new(region_id, memory_quota)) } else { None }; @@ -418,7 +421,11 @@ impl Initializer { let key = Key::from_encoded_slice(encoded_key).into_raw().unwrap(); let lock = Lock::parse(value)?; match lock.lock_type { - LockType::Put | LockType::Delete => resolver.track_lock(lock.ts, key, None), + LockType::Put | LockType::Delete => { + // TODO: handle memory quota exceed, for now, quota is set to + // usize::MAX. + assert!(resolver.track_lock(lock.ts, key, None)); + } _ => (), }; } @@ -587,6 +594,7 @@ mod tests { TestEngineBuilder, }; use tikv_util::{ + memory::MemoryQuota, sys::thread::ThreadBuildWrapper, worker::{LazyWorker, Runnable}, }; @@ -629,7 +637,7 @@ mod tests { crate::channel::Drain, ) { let (receiver_worker, rx) = new_receiver_worker(); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (sink, drain) = crate::channel::channel(buffer, quota); let pool = Builder::new_multi_thread() diff --git a/components/cdc/src/lib.rs b/components/cdc/src/lib.rs index c913cefb92e..64f110f5c45 100644 --- a/components/cdc/src/lib.rs +++ b/components/cdc/src/lib.rs @@ -15,7 +15,7 @@ mod old_value; mod service; mod txn_source; -pub use channel::{recv_timeout, CdcEvent, MemoryQuota}; +pub use channel::{recv_timeout, CdcEvent}; pub use config::CdcConfigManager; pub use delegate::Delegate; pub use endpoint::{CdcTxnExtraScheduler, Endpoint, Task, Validate}; diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index d07b5283380..7478e3afbad 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -16,10 +16,10 @@ use kvproto::{ }, kvrpcpb::ApiVersion, }; -use tikv_util::{error, info, warn, worker::*}; +use tikv_util::{error, info, memory::MemoryQuota, warn, worker::*}; use crate::{ - channel::{channel, MemoryQuota, Sink, CDC_CHANNLE_CAPACITY}, + channel::{channel, Sink, CDC_CHANNLE_CAPACITY}, 
delegate::{Downstream, DownstreamId, DownstreamState, ObservedRange}, endpoint::{Deregister, Task}, }; @@ -244,14 +244,14 @@ impl EventFeedHeaders { #[derive(Clone)] pub struct Service { scheduler: Scheduler, - memory_quota: MemoryQuota, + memory_quota: Arc, } impl Service { /// Create a ChangeData service. /// /// It requires a scheduler of an `Endpoint` in order to schedule tasks. - pub fn new(scheduler: Scheduler, memory_quota: MemoryQuota) -> Service { + pub fn new(scheduler: Scheduler, memory_quota: Arc) -> Service { Service { scheduler, memory_quota, @@ -518,7 +518,7 @@ mod tests { use crate::channel::{recv_timeout, CdcEvent}; fn new_rpc_suite(capacity: usize) -> (Server, ChangeDataClient, ReceiverWrapper) { - let memory_quota = MemoryQuota::new(capacity); + let memory_quota = Arc::new(MemoryQuota::new(capacity)); let (scheduler, rx) = dummy_scheduler(); let cdc_service = Service::new(scheduler, memory_quota); let env = Arc::new(EnvBuilder::new().build()); diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index d2c4519d50d..ec479909793 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -6,7 +6,7 @@ use std::{ }; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, MemoryQuota, Task, Validate}; +use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, Task, Validate}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; @@ -26,6 +26,7 @@ use test_raftstore::*; use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID, storage::kv::LocalTablets}; use tikv_util::{ config::ReadableDuration, + memory::MemoryQuota, worker::{LazyWorker, Runnable}, HandyRwLock, }; @@ -183,7 +184,7 @@ impl TestSuiteBuilder { .push(Box::new(move || { create_change_data(cdc::Service::new( scheduler.clone(), - MemoryQuota::new(memory_quota), + Arc::new(MemoryQuota::new(memory_quota)), )) })); sim.txn_extra_schedulers.insert( @@ -223,7 +224,7 @@ impl 
TestSuiteBuilder { cm.clone(), env, sim.security_mgr.clone(), - MemoryQuota::new(usize::MAX), + Arc::new(MemoryQuota::new(usize::MAX)), sim.get_causal_ts_provider(*id), ); let mut updated_cfg = cfg.clone(); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 5d0dbdcd689..36cd3030d2a 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -28,6 +28,7 @@ use raftstore::{ use security::SecurityManager; use tikv::config::ResolvedTsConfig; use tikv_util::{ + memory::MemoryQuota, warn, worker::{Runnable, RunnableWithTimer, Scheduler}, }; @@ -80,9 +81,9 @@ struct ObserveRegion { } impl ObserveRegion { - fn new(meta: Region, rrp: Arc) -> Self { + fn new(meta: Region, rrp: Arc, memory_quota: Arc) -> Self { ObserveRegion { - resolver: Resolver::with_read_progress(meta.id, Some(rrp)), + resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota), meta, handle: ObserveHandle::new(), resolver_status: ResolverStatus::Pending { @@ -93,8 +94,8 @@ impl ObserveRegion { } } - fn read_progress(&self) -> &RegionReadProgress { - self.resolver.read_progress.as_ref().unwrap() + fn read_progress(&self) -> &Arc { + self.resolver.read_progress().unwrap() } fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> std::result::Result<(), String> { @@ -192,21 +193,29 @@ impl ObserveRegion { } }, ChangeLog::Rows { rows, index } => { - rows.iter().for_each(|row| match row { - ChangeRow::Prewrite { key, start_ts, .. } => self - .resolver - .track_lock(*start_ts, key.to_raw().unwrap(), Some(*index)), - ChangeRow::Commit { key, .. } => self - .resolver - .untrack_lock(&key.to_raw().unwrap(), Some(*index)), - // One pc command do not contains any lock, so just skip it - ChangeRow::OnePc { .. } => { - self.resolver.update_tracked_index(*index); - } - ChangeRow::IngestSsT => { - self.resolver.update_tracked_index(*index); + for row in rows { + match row { + ChangeRow::Prewrite { key, start_ts, .. 
} => { + if !self.resolver.track_lock( + *start_ts, + key.to_raw().unwrap(), + Some(*index), + ) { + return Err("memory quota exceed".to_owned()); + } + } + ChangeRow::Commit { key, .. } => self + .resolver + .untrack_lock(&key.to_raw().unwrap(), Some(*index)), + // One pc command do not contains any lock, so just skip it + ChangeRow::OnePc { .. } => { + self.resolver.update_tracked_index(*index); + } + ChangeRow::IngestSsT => { + self.resolver.update_tracked_index(*index); + } } - }); + } } } } @@ -215,7 +224,10 @@ impl ObserveRegion { Ok(()) } - fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) { + /// Track locks in incoming scan entries. + /// Return false if resolver exceeds memory quota. + #[must_use] + fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) -> bool { for es in entries { match es { ScanEntry::Lock(locks) => { @@ -223,8 +235,13 @@ impl ObserveRegion { panic!("region {:?} resolver has ready", self.meta.id) } for (key, lock) in locks { - self.resolver - .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index)); + if !self.resolver.track_lock( + lock.ts, + key.to_raw().unwrap(), + Some(apply_index), + ) { + return false; + } } } ScanEntry::None => { @@ -237,18 +254,25 @@ impl ObserveRegion { tracked_index, .. } => { - locks.into_iter().for_each(|lock| match lock { - PendingLock::Track { key, start_ts } => { - self.resolver.track_lock( - start_ts, - key.to_raw().unwrap(), - Some(tracked_index), - ) + for lock in locks { + match lock { + PendingLock::Track { key, start_ts } => { + if !self.resolver.track_lock( + start_ts, + key.to_raw().unwrap(), + Some(tracked_index), + ) { + return false; + } + } + PendingLock::Untrack { key, .. } => { + self.resolver.untrack_lock( + &key.to_raw().unwrap(), + Some(tracked_index), + ) + } } - PendingLock::Untrack { key, .. 
} => self - .resolver - .untrack_lock(&key.to_raw().unwrap(), Some(tracked_index)), - }); + } tracked_index } ResolverStatus::Ready => { @@ -266,12 +290,14 @@ impl ObserveRegion { ScanEntry::TxnEntry(_) => panic!("unexpected entry type"), } } + true } } pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, + memory_quota: Arc, advance_notify: Arc, store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, @@ -321,6 +347,8 @@ where let ep = Self { store_id: Some(store_id), cfg: cfg.clone(), + // TODO: add memory quota to config. + memory_quota: Arc::new(MemoryQuota::new(std::usize::MAX)), advance_notify: Arc::new(Notify::new()), scheduler, store_meta, @@ -343,7 +371,7 @@ where "register observe region"; "region" => ?region ); - ObserveRegion::new(region.clone(), read_progress) + ObserveRegion::new(region.clone(), read_progress, self.memory_quota.clone()) } else { warn!( "try register unexit region"; @@ -537,6 +565,7 @@ where if observe_region.handle.id == observe_id { let logs = ChangeLog::encode_change_log(region_id, batch); if let Err(e) = observe_region.track_change_log(&logs) { + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. drop(observe_region); self.re_register_region(region_id, observe_id, e); } @@ -561,7 +590,8 @@ where match self.regions.get_mut(®ion_id) { Some(observe_region) => { if observe_region.handle.id == observe_id { - observe_region.track_scan_locks(entries, apply_index); + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
+ assert!(observe_region.track_scan_locks(entries, apply_index)); } } None => { @@ -904,7 +934,7 @@ where .next() .cloned() .map(TimeStamp::into_inner); - lock_num = Some(ob.resolver.locks_by_key.len()); + lock_num = Some(ob.resolver.num_locks()); } info!( "the max gap of safe-ts is large"; diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 799c5584723..4b04bf02322 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -4,7 +4,10 @@ use std::{cmp, collections::BTreeMap, sync::Arc}; use collections::{HashMap, HashSet}; use raftstore::store::RegionReadProgress; -use tikv_util::time::Instant; +use tikv_util::{ + memory::{HeapSize, MemoryQuota}, + time::Instant, +}; use txn_types::TimeStamp; use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; @@ -16,7 +19,7 @@ const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; pub struct Resolver { region_id: u64, // key -> start_ts - pub(crate) locks_by_key: HashMap, TimeStamp>, + locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. lock_ts_heap: BTreeMap>>, // The timestamps that guarantees no more commit will happen before. @@ -24,11 +27,14 @@ pub struct Resolver { // The highest index `Resolver` had been tracked tracked_index: u64, // The region read progress used to utilize `resolved_ts` to serve stale read request - pub(crate) read_progress: Option>, + read_progress: Option>, // The timestamps that advance the resolved_ts when there is no more write. min_ts: TimeStamp, // Whether the `Resolver` is stopped stopped: bool, + + // The memory quota for the `Resolver` and its lock keys and timestamps. + memory_quota: Arc, } impl std::fmt::Debug for Resolver { @@ -39,27 +45,38 @@ impl std::fmt::Debug for Resolver { if let Some((ts, keys)) = far_lock { dt.field(&format_args!( - "far_lock={:?}", + "oldest_lock={:?}", keys.iter() // We must use Display format here or the redact won't take effect. 
.map(|k| format!("{}", log_wrappers::Value::key(k))) .collect::>() )); - dt.field(&format_args!("far_lock_ts={:?}", ts)); + dt.field(&format_args!("oldest_lock_ts={:?}", ts)); } dt.finish() } } +impl Drop for Resolver { + fn drop(&mut self) { + // Free memory quota used by locks_by_key. + for key in self.locks_by_key.keys() { + let bytes = key.heap_size(); + self.memory_quota.free(bytes); + } + } +} + impl Resolver { - pub fn new(region_id: u64) -> Resolver { - Resolver::with_read_progress(region_id, None) + pub fn new(region_id: u64, memory_quota: Arc) -> Resolver { + Resolver::with_read_progress(region_id, None, memory_quota) } pub fn with_read_progress( region_id: u64, read_progress: Option>, + memory_quota: Arc, ) -> Resolver { Resolver { region_id, @@ -70,6 +87,7 @@ impl Resolver { tracked_index: 0, min_ts: TimeStamp::zero(), stopped: false, + memory_quota, } } @@ -87,11 +105,9 @@ impl Resolver { pub fn size(&self) -> usize { self.locks_by_key.keys().map(|k| k.len()).sum::() - + self - .lock_ts_heap - .values() - .map(|h| h.iter().map(|k| k.len()).sum::()) - .sum::() + + self.locks_by_key.len() * std::mem::size_of::() + + self.lock_ts_heap.len() + * (std::mem::size_of::() + std::mem::size_of::>>()) } pub fn locks(&self) -> &BTreeMap>> { @@ -115,7 +131,8 @@ impl Resolver { self.tracked_index = index; } - pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) { + #[must_use] + pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) -> bool { if let Some(index) = index { self.update_tracked_index(index); } @@ -125,9 +142,14 @@ impl Resolver { start_ts, self.region_id ); + let bytes = key.as_slice().heap_size(); + if !self.memory_quota.alloc(bytes) { + return false; + } let key: Arc<[u8]> = key.into_boxed_slice().into(); self.locks_by_key.insert(key.clone(), start_ts); self.lock_ts_heap.entry(start_ts).or_default().insert(key); + true } pub fn untrack_lock(&mut self, key: &[u8], index: Option) { @@ -135,6 +157,8 @@ impl 
Resolver { self.update_tracked_index(index); } let start_ts = if let Some(start_ts) = self.locks_by_key.remove(key) { + let bytes = key.heap_size(); + self.memory_quota.free(bytes); start_ts } else { debug!("untrack a lock that was not tracked before"; "key" => &log_wrappers::Value::key(key)); @@ -230,6 +254,10 @@ impl Resolver { pub(crate) fn num_transactions(&self) -> u64 { self.lock_ts_heap.len() as u64 } + + pub(crate) fn read_progress(&self) -> Option<&Arc> { + self.read_progress.as_ref() + } } #[cfg(test)] @@ -300,11 +328,16 @@ mod tests { ]; for (i, case) in cases.into_iter().enumerate() { - let mut resolver = Resolver::new(1); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); for e in case.clone() { match e { Event::Lock(start_ts, key) => { - resolver.track_lock(start_ts.into(), key.into_raw().unwrap(), None) + assert!(resolver.track_lock( + start_ts.into(), + key.into_raw().unwrap(), + None + )); } Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), Event::Resolve(min_ts, expect) => { @@ -319,4 +352,28 @@ mod tests { } } } + + #[test] + fn test_memory_quota() { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let mut resolver = Resolver::new(1, memory_quota.clone()); + let mut key = vec![0; 77]; + let mut ts = TimeStamp::default(); + while resolver.track_lock(ts, key.clone(), None) { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + } + let remain = 1024 % key.len(); + assert_eq!(memory_quota.in_use(), 1024 - remain); + + let mut ts = TimeStamp::default(); + for _ in 0..5 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + assert_eq!(memory_quota.in_use(), 1024 - 5 * key.len() - remain); + drop(resolver); + assert_eq!(memory_quota.in_use(), 0); + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 7ff51474d7d..57afb85d5b5 100644 
--- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -28,7 +28,7 @@ use backup_stream::{ BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; -use cdc::{CdcConfigManager, MemoryQuota}; +use cdc::CdcConfigManager; use concurrency_manager::ConcurrencyManager; use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; @@ -108,6 +108,7 @@ use tikv::{ use tikv_util::{ check_environment_variables, config::VersionTrack, + memory::MemoryQuota, mpsc as TikvMpsc, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, @@ -266,7 +267,7 @@ struct Servers { node: Node, importer: Arc, cdc_scheduler: tikv_util::worker::Scheduler, - cdc_memory_quota: MemoryQuota, + cdc_memory_quota: Arc, rsmeter_pubsub_service: resource_metering::PubSubService, backup_stream_scheduler: Option>, debugger: DebuggerImpl>, LockManager, F>, @@ -986,7 +987,9 @@ where } // Start CDC. 
- let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = Arc::new(MemoryQuota::new( + self.core.config.cdc.sink_memory_quota.0 as _, + )); let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index fe2b685313e..32d7ab14da9 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -32,7 +32,7 @@ use backup_stream::{ BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; -use cdc::{CdcConfigManager, MemoryQuota}; +use cdc::CdcConfigManager; use concurrency_manager::ConcurrencyManager; use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine, TabletRegistry, CF_DEFAULT, CF_WRITE}; @@ -106,6 +106,7 @@ use tikv::{ use tikv_util::{ check_environment_variables, config::VersionTrack, + memory::MemoryQuota, mpsc as TikvMpsc, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, @@ -243,7 +244,7 @@ struct TikvServer { env: Arc, cdc_worker: Option>>, cdc_scheduler: Option>, - cdc_memory_quota: Option, + cdc_memory_quota: Option>, backup_stream_scheduler: Option>, sst_worker: Option>>, quota_limiter: Arc, @@ -637,7 +638,9 @@ where Box::new(CdcConfigManager(cdc_scheduler.clone())), ); // Start cdc endpoint. 
- let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = Arc::new(MemoryQuota::new( + self.core.config.cdc.sink_memory_quota.0 as _, + )); let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 0a2f49461c5..17b6b23cf78 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -1,6 +1,9 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::mem; +use std::{ + mem, + sync::atomic::{AtomicUsize, Ordering}, +}; use kvproto::{ encryptionpb::EncryptionMeta, @@ -28,6 +31,12 @@ pub trait HeapSize { } } +impl HeapSize for [u8] { + fn heap_size(&self) -> usize { + self.len() * mem::size_of::() + } +} + impl HeapSize for Region { fn heap_size(&self) -> usize { let mut size = self.start_key.capacity() + self.end_key.capacity(); @@ -65,3 +74,105 @@ impl HeapSize for RaftCmdRequest { + mem::size_of_val(&self.status_request) } } + +pub struct MemoryQuota { + capacity: AtomicUsize, + in_use: AtomicUsize, +} + +impl MemoryQuota { + pub fn new(capacity: usize) -> MemoryQuota { + MemoryQuota { + capacity: AtomicUsize::new(capacity), + in_use: AtomicUsize::new(0), + } + } + + pub fn in_use(&self) -> usize { + self.in_use.load(Ordering::Relaxed) + } + + pub fn capacity(&self) -> usize { + self.capacity.load(Ordering::Acquire) + } + + pub fn set_capacity(&self, capacity: usize) { + self.capacity.store(capacity, Ordering::Release) + } + + pub fn alloc(&self, bytes: usize) -> bool { + let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); + let capacity = self.capacity.load(Ordering::Acquire); + loop { + if in_use_bytes + bytes > capacity { + return false; + } + let new_in_use_bytes = in_use_bytes + bytes; + match self.in_use.compare_exchange_weak( + in_use_bytes, + new_in_use_bytes, + Ordering::Acquire, + Ordering::Relaxed, + ) { + 
Ok(_) => return true, + Err(current) => in_use_bytes = current, + } + } + } + + pub fn free(&self, bytes: usize) { + let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); + loop { + // Saturating at the numeric bounds instead of overflowing. + let new_in_use_bytes = in_use_bytes - std::cmp::min(bytes, in_use_bytes); + match self.in_use.compare_exchange_weak( + in_use_bytes, + new_in_use_bytes, + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => return, + Err(current) => in_use_bytes = current, + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_quota() { + let quota = MemoryQuota::new(100); + assert!(quota.alloc(10)); + assert_eq!(quota.in_use(), 10); + assert!(!quota.alloc(100)); + assert_eq!(quota.in_use(), 10); + quota.free(5); + assert_eq!(quota.in_use(), 5); + assert!(quota.alloc(95)); + assert_eq!(quota.in_use(), 100); + quota.free(95); + assert_eq!(quota.in_use(), 5); + } + + #[test] + fn test_resize_memory_quota() { + let quota = MemoryQuota::new(100); + assert!(quota.alloc(10)); + assert_eq!(quota.in_use(), 10); + assert!(!quota.alloc(100)); + assert_eq!(quota.in_use(), 10); + quota.set_capacity(200); + assert!(quota.alloc(100)); + assert_eq!(quota.in_use(), 110); + quota.set_capacity(50); + assert!(!quota.alloc(100)); + assert_eq!(quota.in_use(), 110); + quota.free(100); + assert_eq!(quota.in_use(), 10); + assert!(quota.alloc(40)); + assert_eq!(quota.in_use(), 50); + } +} From f3b5bf51e9105fb5685ef23e454301f48fd27caf Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 28 Aug 2023 11:30:36 +0800 Subject: [PATCH 010/220] config: support changed adjust max-background-compactions dynamically (#15425) close tikv/tikv#15424 Signed-off-by: glorv Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/server/src/common.rs | 6 ++++- components/server/src/server.rs | 6 ++++- components/server/src/server2.rs | 6 ++++- src/config/mod.rs | 42 +++++++++++++++++++++++++------- 4 
files changed, 48 insertions(+), 12 deletions(-) diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 165a1c8509e..c8cf879d905 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -762,7 +762,11 @@ impl ConfiguredRaftEngine for RocksEngine { fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + self.clone(), + DbType::Raft, + )), ); } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 57afb85d5b5..72f7b936956 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1552,7 +1552,11 @@ impl TikvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + kv_engine.clone(), + DbType::Kv, + )), ); let reg = TabletRegistry::new( Box::new(SingletonFactory::new(kv_engine)), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 32d7ab14da9..1289ffe848d 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1485,7 +1485,11 @@ impl TikvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(registry.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + registry.clone(), + DbType::Kv, + )), ); self.tablet_registry = Some(registry.clone()); raft_engine.register_config(cfg_controller); diff --git a/src/config/mod.rs b/src/config/mod.rs index f7c338379ef..38369b3ee93 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1903,13 +1903,14 @@ 
pub enum DbType { } pub struct DbConfigManger { + cfg: DbConfig, db: D, db_type: DbType, } impl DbConfigManger { - pub fn new(db: D, db_type: DbType) -> Self { - DbConfigManger { db, db_type } + pub fn new(cfg: DbConfig, db: D, db_type: DbType) -> Self { + DbConfigManger { cfg, db, db_type } } } @@ -1944,10 +1945,31 @@ impl DbConfigManger { _ => Err(format!("invalid cf {:?} for db {:?}", cf, self.db_type).into()), } } + + fn update_background_cfg( + &self, + max_background_jobs: i32, + max_background_flushes: i32, + ) -> Result<(), Box> { + assert!(max_background_jobs > 0 && max_background_flushes > 0); + let max_background_compacts = + std::cmp::max(max_background_jobs - max_background_flushes, 1); + self.db + .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?; + self.db.set_db_config(&[( + "max_background_flushes", + &max_background_flushes.to_string(), + )])?; + self.db.set_db_config(&[( + "max_background_compactions", + &max_background_compacts.to_string(), + )]) + } } impl ConfigManager for DbConfigManger { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { + self.cfg.update(change.clone())?; let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); let cf_config = change.drain_filter(|(name, _)| name.ends_with("cf")); @@ -2011,8 +2033,7 @@ impl ConfigManager for DbConfigManger { .next() { let max_background_jobs: i32 = background_jobs_config.1.into(); - self.db - .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?; + self.update_background_cfg(max_background_jobs, self.cfg.max_background_flushes)?; } if let Some(background_subcompactions_config) = change @@ -2029,10 +2050,7 @@ impl ConfigManager for DbConfigManger { .next() { let max_background_flushes: i32 = background_flushes_config.1.into(); - self.db.set_db_config(&[( - "max_background_flushes", - &max_background_flushes.to_string(), - )])?; + 
self.update_background_cfg(self.cfg.max_background_jobs, max_background_flushes)?; } if !change.is_empty() { @@ -4958,7 +4976,11 @@ mod tests { let cfg_controller = ConfigController::new(cfg); cfg_controller.register( Module::Rocksdb, - Box::new(DbConfigManger::new(engine.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + engine.clone(), + DbType::Kv, + )), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( @@ -5108,6 +5130,7 @@ mod tests { .update_config("rocksdb.max-background-jobs", "8") .unwrap(); assert_eq!(db.get_db_options().get_max_background_jobs(), 8); + assert_eq!(db.get_db_options().get_max_background_compactions(), 6); // update max_background_flushes, set to a bigger value assert_eq!(db.get_db_options().get_max_background_flushes(), 2); @@ -5116,6 +5139,7 @@ mod tests { .update_config("rocksdb.max-background-flushes", "5") .unwrap(); assert_eq!(db.get_db_options().get_max_background_flushes(), 5); + assert_eq!(db.get_db_options().get_max_background_compactions(), 3); // update rate_bytes_per_sec assert_eq!( From e5efbe697455bd7814c6979df06a8ccf0189909a Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Mon, 28 Aug 2023 15:53:06 +0800 Subject: [PATCH 011/220] raftstore-v2: enable failpoint for raftstore v2 in stale-peer (#15421) ref tikv/tikv#15409 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- .../src/operation/command/admin/conf_change.rs | 9 +++++++++ components/raftstore/src/store/fsm/apply.rs | 4 ++-- tests/failpoints/cases/test_stale_peer.rs | 18 +++++++++++++----- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 2bd06fca6c2..c7b8481aa7c 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -10,6 +10,7 @@ use std::time::Instant; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use fail::fail_point; use kvproto::{ metapb::{self, PeerRole}, raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, @@ -392,6 +393,14 @@ impl Apply { match change_type { ConfChangeType::AddNode => { + let add_node_fp = || { + fail_point!( + "apply_on_add_node_1_2", + self.peer_id() == 2 && self.region_id() == 1, + |_| {} + ) + }; + add_node_fp(); PEER_ADMIN_CMD_COUNTER_VEC .with_label_values(&["add_peer", "all"]) .inc(); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index e2b1cedc88d..0bc1ccf7d85 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -2114,14 +2114,14 @@ where match change_type { ConfChangeType::AddNode => { - let add_ndoe_fp = || { + let add_node_fp = || { fail_point!( "apply_on_add_node_1_2", self.id() == 2 && self.region_id() == 1, |_| {} ) }; - add_ndoe_fp(); + add_node_fp(); PEER_ADMIN_CMD_COUNTER_VEC .with_label_values(&["add_peer", "all"]) diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 39fa09ef014..80c73f03a16 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -12,6 +12,7 @@ use kvproto::raft_serverpb::{PeerState, RaftLocalState, RaftMessage}; use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; #[test] @@ -44,7 +45,8 @@ fn test_one_node_leader_missing() { fail::remove(check_stale_state); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_update_localreader_after_removed() { let mut cluster = new_node_cluster(0, 6); let 
pd_client = cluster.pd_client.clone(); @@ -90,7 +92,8 @@ fn test_node_update_localreader_after_removed() { cluster.must_region_not_exist(r1, 2); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_stale_learner_restart() { let mut cluster = new_node_cluster(0, 2); cluster.pd_client.disable_default_operator(); @@ -133,9 +136,11 @@ fn test_stale_learner_restart() { must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); } +/// pass /// Test if a peer can be destroyed through tombstone msg when applying /// snapshot. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_stale_peer_destroy_when_apply_snapshot() { let mut cluster = new_node_cluster(0, 3); configure_for_snapshot(&mut cluster.cfg); @@ -210,9 +215,11 @@ fn test_stale_peer_destroy_when_apply_snapshot() { must_get_none(&cluster.get_engine(3), b"k1"); } +/// pass /// Test if destroy a uninitialized peer through tombstone msg would allow a /// staled peer be created again. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_destroy_uninitialized_peer_when_there_exists_old_peer() { // 4 stores cluster. let mut cluster = new_node_cluster(0, 4); @@ -291,7 +298,8 @@ fn test_destroy_uninitialized_peer_when_there_exists_old_peer() { /// Logs scan are now moved to raftlog gc threads. The case is to test if logs /// are still cleaned up when there is stale logs before first index during /// destroy. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_destroy_clean_up_logs_with_unfinished_log_gc() { let mut cluster = new_node_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(15); From c66bfe87c17a2892c5d7440cd30d17147b3fff15 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 29 Aug 2023 17:03:38 +0800 Subject: [PATCH 012/220] resolved_ts: re-register region if memory quota exceeded (#15411) close tikv/tikv#14864 Fix resolved ts OOM caused by Resolver tracking large txns. `ObserveRegion` is deregistered if it exceeds memory quota. It may cause higher CPU usage because of scanning locks, but it's better than OOM. Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/resolved_ts/src/endpoint.rs | 98 ++++++++++++------- components/resolved_ts/src/errors.rs | 53 +--------- components/resolved_ts/src/resolver.rs | 57 ++++++++--- components/resolved_ts/src/scanner.rs | 61 +++++++----- .../resolved_ts/tests/integrations/mod.rs | 92 ++++++++++++++++- components/resolved_ts/tests/mod.rs | 15 ++- src/config/mod.rs | 2 + tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 9 files changed, 254 insertions(+), 126 deletions(-) diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 36cd3030d2a..3c1ad9d8c8d 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -41,10 +41,12 @@ use crate::{ metrics::*, resolver::Resolver, scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, + Error, Result, }; /// grace period for logging safe-ts and resolved-ts gap in slow log const SLOW_LOG_GRACE_PERIOD_MS: u64 = 1000; +const MEMORY_QUOTA_EXCEEDED_BACKOFF: Duration = Duration::from_secs(30); enum ResolverStatus { Pending { @@ -98,7 +100,7 @@ impl ObserveRegion { self.resolver.read_progress().unwrap() } - fn 
track_change_log(&mut self, change_logs: &[ChangeLog]) -> std::result::Result<(), String> { + fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> Result<()> { match &mut self.resolver_status { ResolverStatus::Pending { locks, @@ -119,7 +121,7 @@ impl ObserveRegion { // TODO: for admin cmd that won't change the region meta like peer list // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to // return error - return Err(format!( + return Err(box_err!( "region met admin command {:?} while initializing resolver", req_type )); @@ -201,7 +203,7 @@ impl ObserveRegion { key.to_raw().unwrap(), Some(*index), ) { - return Err("memory quota exceed".to_owned()); + return Err(Error::MemoryQuotaExceeded); } } ChangeRow::Commit { key, .. } => self @@ -225,9 +227,7 @@ impl ObserveRegion { } /// Track locks in incoming scan entries. - /// Return false if resolver exceeds memory quota. - #[must_use] - fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) -> bool { + fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) -> Result<()> { for es in entries { match es { ScanEntry::Lock(locks) => { @@ -240,7 +240,7 @@ impl ObserveRegion { key.to_raw().unwrap(), Some(apply_index), ) { - return false; + return Err(Error::MemoryQuotaExceeded); } } } @@ -262,7 +262,7 @@ impl ObserveRegion { key.to_raw().unwrap(), Some(tracked_index), ) { - return false; + return Err(Error::MemoryQuotaExceeded); } } PendingLock::Untrack { key, .. } => { @@ -290,7 +290,7 @@ impl ObserveRegion { ScanEntry::TxnEntry(_) => panic!("unexpected entry type"), } } - true + Ok(()) } } @@ -347,8 +347,7 @@ where let ep = Self { store_id: Some(store_id), cfg: cfg.clone(), - // TODO: add memory quota to config. 
- memory_quota: Arc::new(MemoryQuota::new(std::usize::MAX)), + memory_quota: Arc::new(MemoryQuota::new(cfg.memory_quota.0 as usize)), advance_notify: Arc::new(Notify::new()), scheduler, store_meta, @@ -362,7 +361,7 @@ where ep } - fn register_region(&mut self, region: Region) { + fn register_region(&mut self, region: Region, backoff: Option) { let region_id = region.get_id(); assert!(self.regions.get(®ion_id).is_none()); let observe_region = { @@ -390,7 +389,7 @@ where .update_advance_resolved_ts_notify(self.advance_notify.clone()); self.regions.insert(region_id, observe_region); - let scan_task = self.build_scan_task(region, observe_handle, cancelled); + let scan_task = self.build_scan_task(region, observe_handle, cancelled, backoff); self.scanner_pool.spawn_task(scan_task); RTS_SCAN_TASKS.with_label_values(&["total"]).inc(); } @@ -400,6 +399,7 @@ where region: Region, observe_handle: ObserveHandle, cancelled: Arc, + backoff: Option, ) -> ScanTask { let scheduler = self.scheduler.clone(); let scheduler_error = self.scheduler.clone(); @@ -411,6 +411,7 @@ where mode: ScanMode::LockOnly, region, checkpoint_ts: TimeStamp::zero(), + backoff, is_cancelled: Box::new(move || cancelled.load(Ordering::Acquire)), send_entries: Box::new(move |entries, apply_index| { scheduler @@ -424,13 +425,16 @@ where RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); }), on_error: Some(Box::new(move |observe_id, _region, e| { - scheduler_error - .schedule(Task::ReRegisterRegion { - region_id, - observe_id, - cause: format!("met error while handle scan task {:?}", e), - }) - .unwrap_or_else(|schedule_err| warn!("schedule re-register task failed"; "err" => ?schedule_err, "re_register_cause" => ?e)); + if let Err(e) = scheduler_error.schedule(Task::ReRegisterRegion { + region_id, + observe_id, + cause: e, + }) { + warn!("schedule re-register task failed"; + "region_id" => region_id, + "observe_id" => ?observe_id, + "error" => ?e); + } RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); })), 
} @@ -476,7 +480,7 @@ where // the `Resolver`'s lock heap // - `PrepareMerge` and `RollbackMerge`, the key range is unchanged self.deregister_region(region_id); - self.register_region(incoming_region); + self.register_region(incoming_region, None); } } @@ -507,7 +511,13 @@ where } // Deregister current observed region and try to register it again. - fn re_register_region(&mut self, region_id: u64, observe_id: ObserveId, cause: String) { + fn re_register_region( + &mut self, + region_id: u64, + observe_id: ObserveId, + cause: Error, + backoff: Option, + ) { if let Some(observe_region) = self.regions.get(®ion_id) { if observe_region.handle.id != observe_id { warn!("resolved ts deregister region failed due to observe_id not match"); @@ -518,7 +528,7 @@ where "register region again"; "region_id" => region_id, "observe_id" => ?observe_id, - "cause" => cause + "cause" => ?cause ); self.deregister_region(region_id); let region; @@ -529,7 +539,7 @@ where None => return, } } - self.register_region(region); + self.register_region(region, backoff); } } @@ -565,9 +575,12 @@ where if observe_region.handle.id == observe_id { let logs = ChangeLog::encode_change_log(region_id, batch); if let Err(e) = observe_region.track_change_log(&logs) { - // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. drop(observe_region); - self.re_register_region(region_id, observe_id, e); + let backoff = match e { + Error::MemoryQuotaExceeded => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), + Error::Other(_) => None, + }; + self.re_register_region(region_id, observe_id, e, backoff); } } else { debug!("resolved ts CmdBatch discarded"; @@ -587,16 +600,23 @@ where entries: Vec, apply_index: u64, ) { - match self.regions.get_mut(®ion_id) { - Some(observe_region) => { - if observe_region.handle.id == observe_id { - // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
- assert!(observe_region.track_scan_locks(entries, apply_index)); + let mut is_memory_quota_exceeded = false; + if let Some(observe_region) = self.regions.get_mut(®ion_id) { + if observe_region.handle.id == observe_id { + if let Err(Error::MemoryQuotaExceeded) = + observe_region.track_scan_locks(entries, apply_index) + { + is_memory_quota_exceeded = true; } } - None => { - debug!("scan locks region not exist"; "region_id" => region_id, "observe_id" => ?observe_id); - } + } else { + debug!("scan locks region not exist"; + "region_id" => region_id, + "observe_id" => ?observe_id); + } + if is_memory_quota_exceeded { + let backoff = Some(MEMORY_QUOTA_EXCEEDED_BACKOFF); + self.re_register_region(region_id, observe_id, Error::MemoryQuotaExceeded, backoff); } } @@ -616,6 +636,8 @@ where warn!("resolved-ts config fails"; "error" => ?e); } else { self.advance_notify.notify_waiters(); + self.memory_quota + .set_capacity(self.cfg.memory_quota.0 as usize); info!( "resolved-ts config changed"; "prev" => prev, @@ -668,7 +690,7 @@ pub enum Task { ReRegisterRegion { region_id: u64, observe_id: ObserveId, - cause: String, + cause: Error, }, AdvanceResolvedTs { leader_resolver: LeadershipResolver, @@ -780,13 +802,13 @@ where match task { Task::RegionDestroyed(region) => self.region_destroyed(region), Task::RegionUpdated(region) => self.region_updated(region), - Task::RegisterRegion { region } => self.register_region(region), + Task::RegisterRegion { region } => self.register_region(region, None), Task::DeRegisterRegion { region_id } => self.deregister_region(region_id), Task::ReRegisterRegion { region_id, observe_id, cause, - } => self.re_register_region(region_id, observe_id, cause), + } => self.re_register_region(region_id, observe_id, cause, None), Task::AdvanceResolvedTs { leader_resolver } => { self.handle_advance_resolved_ts(leader_resolver) } @@ -897,7 +919,7 @@ where unresolved_count += 1; } ResolverStatus::Ready { .. 
} => { - lock_heap_size += observe_region.resolver.size(); + lock_heap_size += observe_region.resolver.approximate_heap_bytes(); resolved_count += 1; } } diff --git a/components/resolved_ts/src/errors.rs b/components/resolved_ts/src/errors.rs index d9845440c07..b4a59a2c7a0 100644 --- a/components/resolved_ts/src/errors.rs +++ b/components/resolved_ts/src/errors.rs @@ -1,62 +1,13 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::io::Error as IoError; - -use engine_traits::Error as EngineTraitsError; -use kvproto::errorpb::Error as ErrorHeader; -use raftstore::Error as RaftstoreError; use thiserror::Error; -use tikv::storage::{ - kv::{Error as KvError, ErrorInner as EngineErrorInner}, - mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, - txn::{Error as TxnError, ErrorInner as TxnErrorInner}, -}; -use txn_types::Error as TxnTypesError; #[derive(Debug, Error)] pub enum Error { - #[error("IO error {0}")] - Io(#[from] IoError), - #[error("Engine error {0}")] - Kv(#[from] KvError), - #[error("Transaction error {0}")] - Txn(#[from] TxnError), - #[error("Mvcc error {0}")] - Mvcc(#[from] MvccError), - #[error("Request error {0:?}")] - Request(Box), - #[error("Engine traits error {0}")] - EngineTraits(#[from] EngineTraitsError), - #[error("Txn types error {0}")] - TxnTypes(#[from] TxnTypesError), - #[error("Raftstore error {0}")] - Raftstore(#[from] RaftstoreError), + #[error("Memory quota exceeded")] + MemoryQuotaExceeded, #[error("Other error {0}")] Other(#[from] Box), } -impl Error { - pub fn request(err: ErrorHeader) -> Error { - Error::Request(Box::new(err)) - } - - pub fn extract_error_header(self) -> ErrorHeader { - match self { - Error::Kv(KvError(box EngineErrorInner::Request(e))) - | Error::Txn(TxnError(box TxnErrorInner::Engine(KvError( - box EngineErrorInner::Request(e), - )))) - | Error::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::Kv( - KvError(box EngineErrorInner::Request(e)), - ))))) - | 
Error::Request(box e) => e, - other => { - let mut e = ErrorHeader::default(); - e.set_message(format!("{:?}", other)); - e - } - } - } -} - pub type Result = std::result::Result; diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 4b04bf02322..1b0a07bf8e2 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -13,6 +13,7 @@ use txn_types::TimeStamp; use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; +const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. @@ -61,10 +62,19 @@ impl std::fmt::Debug for Resolver { impl Drop for Resolver { fn drop(&mut self) { // Free memory quota used by locks_by_key. + let mut bytes = 0; + let num_locks = self.num_locks(); for key in self.locks_by_key.keys() { - let bytes = key.heap_size(); - self.memory_quota.free(bytes); + bytes += self.lock_heap_size(key); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("drop huge resolver"; + "region_id" => self.region_id, + "bytes" => bytes, + "num_locks" => num_locks, + ); } + self.memory_quota.free(bytes); } } @@ -103,13 +113,6 @@ impl Resolver { self.stopped } - pub fn size(&self) -> usize { - self.locks_by_key.keys().map(|k| k.len()).sum::() - + self.locks_by_key.len() * std::mem::size_of::() - + self.lock_ts_heap.len() - * (std::mem::size_of::() + std::mem::size_of::>>()) - } - pub fn locks(&self) -> &BTreeMap>> { &self.lock_ts_heap } @@ -131,6 +134,33 @@ impl Resolver { self.tracked_index = index; } + // Return an approximate heap memory usage in bytes. + pub fn approximate_heap_bytes(&self) -> usize { + // memory used by locks_by_key. + let memory_quota_in_use = self.memory_quota.in_use(); + + // memory used by lock_ts_heap. 
+ let memory_lock_ts_heap = self.lock_ts_heap.len() + * (std::mem::size_of::() + std::mem::size_of::>>()) + // memory used by HashSet> + + self.locks_by_key.len() * std::mem::size_of::>(); + + memory_quota_in_use + memory_lock_ts_heap + } + + fn lock_heap_size(&self, key: &[u8]) -> usize { + // A resolver has + // * locks_by_key: HashMap, TimeStamp> + // * lock_ts_heap: BTreeMap>> + // + // We only count memory used by locks_by_key. Because the majority of + // memory is consumed by keys, locks_by_key and lock_ts_heap shares + // the same Arc<[u8]>, so lock_ts_heap is negligible. Also, it's hard to + // track accurate memory usage of lock_ts_heap as a timestamp may have + // many keys. + key.heap_size() + std::mem::size_of::() + } + #[must_use] pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) -> bool { if let Some(index) = index { @@ -142,7 +172,7 @@ impl Resolver { start_ts, self.region_id ); - let bytes = key.as_slice().heap_size(); + let bytes = self.lock_heap_size(&key); if !self.memory_quota.alloc(bytes) { return false; } @@ -157,7 +187,7 @@ impl Resolver { self.update_tracked_index(index); } let start_ts = if let Some(start_ts) = self.locks_by_key.remove(key) { - let bytes = key.heap_size(); + let bytes = self.lock_heap_size(key); self.memory_quota.free(bytes); start_ts } else { @@ -358,12 +388,13 @@ mod tests { let memory_quota = Arc::new(MemoryQuota::new(1024)); let mut resolver = Resolver::new(1, memory_quota.clone()); let mut key = vec![0; 77]; + let lock_size = resolver.lock_heap_size(&key); let mut ts = TimeStamp::default(); while resolver.track_lock(ts, key.clone(), None) { ts.incr(); key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); } - let remain = 1024 % key.len(); + let remain = 1024 % lock_size; assert_eq!(memory_quota.in_use(), 1024 - remain); let mut ts = TimeStamp::default(); @@ -372,7 +403,7 @@ mod tests { key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); resolver.untrack_lock(&key, None); } - 
assert_eq!(memory_quota.in_use(), 1024 - 5 * key.len() - remain); + assert_eq!(memory_quota.in_use(), 1024 - 5 * lock_size - remain); drop(resolver); assert_eq!(memory_quota.in_use(), 0); } diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 0ca74bda29d..e8665e9d860 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -45,6 +45,7 @@ pub struct ScanTask { pub mode: ScanMode, pub region: Region, pub checkpoint_ts: TimeStamp, + pub backoff: Option, pub is_cancelled: IsCancelledCallback, pub send_entries: OnEntriesCallback, pub on_error: Option, @@ -84,6 +85,18 @@ impl, E: KvEngine> ScannerPool { pub fn spawn_task(&self, mut task: ScanTask) { let cdc_handle = self.cdc_handle.clone(); let fut = async move { + if let Some(backoff) = task.backoff { + if let Err(e) = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + backoff) + .compat() + .await + { + error!("failed to backoff"; "err" => ?e); + } + if (task.is_cancelled)() { + return; + } + } let snap = match Self::get_snapshot(&mut task, cdc_handle).await { Ok(snap) => snap, Err(e) => { @@ -193,37 +206,36 @@ impl, E: KvEngine> ScannerPool { error!("failed to backoff"; "err" => ?e); } if (task.is_cancelled)() { - return Err(Error::Other("scan task cancelled".into())); + return Err(box_err!("scan task cancelled")); } } let (cb, fut) = tikv_util::future::paired_future_callback(); let change_cmd = ChangeObserver::from_rts(task.region.id, task.handle.clone()); - cdc_handle.capture_change( - task.region.id, - task.region.get_region_epoch().clone(), - change_cmd, - Callback::read(Box::new(cb)), - )?; + cdc_handle + .capture_change( + task.region.id, + task.region.get_region_epoch().clone(), + change_cmd, + Callback::read(Box::new(cb)), + ) + .map_err(|e| Error::Other(box_err!("{:?}", e)))?; let mut resp = box_try!(fut.await); if resp.response.get_header().has_error() { let err = resp.response.take_header().take_error(); // These two 
errors can't handled by retrying since the epoch and observe id is // unchanged if err.has_epoch_not_match() || err.get_message().contains("stale observe id") { - return Err(Error::request(err)); + return Err(box_err!("get snapshot failed: {:?}", err)); } last_err = Some(err) } else { return Ok(resp.snapshot.unwrap()); } } - Err(Error::Other( - format!( - "backoff timeout after {} try, last error: {:?}", - GET_SNAPSHOT_RETRY_TIME, - last_err.unwrap() - ) - .into(), + Err(box_err!( + "backoff timeout after {} try, last error: {:?}", + GET_SNAPSHOT_RETRY_TIME, + last_err.unwrap() )) } @@ -232,12 +244,14 @@ impl, E: KvEngine> ScannerPool { start: Option<&Key>, _checkpoint_ts: TimeStamp, ) -> Result<(Vec<(Key, Lock)>, bool)> { - let (locks, has_remaining) = reader.scan_locks( - start, - None, - |lock| matches!(lock.lock_type, LockType::Put | LockType::Delete), - DEFAULT_SCAN_BATCH_SIZE, - )?; + let (locks, has_remaining) = reader + .scan_locks( + start, + None, + |lock| matches!(lock.lock_type, LockType::Put | LockType::Delete), + DEFAULT_SCAN_BATCH_SIZE, + ) + .map_err(|e| Error::Other(box_err!("{:?}", e)))?; Ok((locks, has_remaining)) } @@ -245,7 +259,10 @@ impl, E: KvEngine> ScannerPool { let mut entries = Vec::with_capacity(DEFAULT_SCAN_BATCH_SIZE); let mut has_remaining = true; while entries.len() < entries.capacity() { - match scanner.next_entry()? { + match scanner + .next_entry() + .map_err(|e| Error::Other(box_err!("{:?}", e)))? 
+ { Some(entry) => { entries.push(entry); } diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index 7802108b92b..634aa66c601 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -2,11 +2,12 @@ #[path = "../mod.rs"] mod testsuite; -use std::time::Duration; +use std::{sync::mpsc::channel, time::Duration}; use futures::executor::block_on; use kvproto::{kvrpcpb::*, metapb::RegionEpoch}; use pd_client::PdClient; +use resolved_ts::Task; use tempfile::Builder; use test_raftstore::sleep_ms; use test_sst_importer::*; @@ -141,3 +142,92 @@ fn test_dynamic_change_advance_ts_interval() { suite.stop(); } + +#[test] +fn test_change_log_memory_quota_exceeded() { + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + suite.must_get_rts_ge( + region.id, + block_on(suite.cluster.pd_client.get_tso()).unwrap(), + ); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Resolved ts should not advance. 
+ let (tx, rx) = channel(); + suite.must_schedule_task( + 1, + Task::GetDiagnosisInfo { + region_id: 1, + log_locks: false, + min_start_ts: u64::MAX, + callback: Box::new(move |res| { + tx.send(res).unwrap(); + }), + }, + ); + let res = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + assert_eq!(res.unwrap().1, 0, "{:?}", res); + + suite.stop(); +} + +#[test] +fn test_scan_log_memory_quota_exceeded() { + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + suite.must_get_rts_ge( + region.id, + block_on(suite.cluster.pd_client.get_tso()).unwrap(), + ); + + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + // Split region + suite.cluster.must_split(®ion, k); + + let r1 = suite.cluster.get_region(&[]); + let r2 = suite.cluster.get_region(k); + let current_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + // Wait for scan log. + sleep_ms(500); + // Resolved ts of region1 should be advanced + suite.must_get_rts_ge(r1.id, current_ts); + + // Resolved ts should not advance. 
+ let (tx, rx) = channel(); + suite.must_schedule_task( + r2.id, + Task::GetDiagnosisInfo { + region_id: r2.id, + log_locks: false, + min_start_ts: u64::MAX, + callback: Box::new(move |res| { + tx.send(res).unwrap(); + }), + }, + ); + let res = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + assert_eq!(res.unwrap().1, 0, "{:?}", res); + + suite.stop(); +} diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 4e6226f8935..830e2156e9f 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -122,8 +122,21 @@ impl TestSuite { ); c }; + self.must_schedule_task(store_id, Task::ChangeConfig { change }); + } + + pub fn must_change_memory_quota(&self, store_id: u64, bytes: u64) { + let change = { + let mut c = std::collections::HashMap::default(); + c.insert("memory_quota".to_owned(), ConfigValue::Size(bytes)); + c + }; + self.must_schedule_task(store_id, Task::ChangeConfig { change }); + } + + pub fn must_schedule_task(&self, store_id: u64, task: Task) { let scheduler = self.endpoints.get(&store_id).unwrap().scheduler(); - scheduler.schedule(Task::ChangeConfig { change }).unwrap(); + scheduler.schedule(task).unwrap(); } pub fn must_kv_prewrite( diff --git a/src/config/mod.rs b/src/config/mod.rs index 38369b3ee93..d9b9263e928 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2955,6 +2955,7 @@ pub struct ResolvedTsConfig { pub advance_ts_interval: ReadableDuration, #[online_config(skip)] pub scan_lock_pool_size: usize, + pub memory_quota: ReadableSize, } impl ResolvedTsConfig { @@ -2975,6 +2976,7 @@ impl Default for ResolvedTsConfig { enable: true, advance_ts_interval: ReadableDuration::secs(20), scan_lock_pool_size: 2, + memory_quota: ReadableSize::mb(256), } } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 8fdbaa00f25..87b1830e4f6 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -851,6 +851,7 @@ fn 
test_serde_custom_tikv_config() { enable: true, advance_ts_interval: ReadableDuration::secs(5), scan_lock_pool_size: 1, + memory_quota: ReadableSize::mb(1), }; value.causal_ts = CausalTsConfig { renew_interval: ReadableDuration::millis(100), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 053e7c45939..94f9ef1ecf1 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -705,6 +705,7 @@ sink-memory-quota = "7MB" enable = true advance-ts-interval = "5s" scan-lock-pool-size = 1 +memory-quota = "1MB" [split] detect-times = 10 From 517522b5e77b8e0aae667790b2961d88fb61a23b Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 29 Aug 2023 18:57:37 +0800 Subject: [PATCH 013/220] raftstore-v2: support column family based write buffer manager (#15453) ref tikv/tikv#12842 support column family based write buffer manager Signed-off-by: SpadeA-Tang --- Cargo.lock | 6 +-- components/engine_traits/src/flush.rs | 5 ++ src/config/mod.rs | 49 ++++++++++++++++++-- tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_engine.rs | 53 ++++++++++++++++++++++ tests/integrations/config/mod.rs | 5 ++ tests/integrations/config/test-custom.toml | 1 + 7 files changed, 114 insertions(+), 6 deletions(-) create mode 100644 tests/failpoints/cases/test_engine.rs diff --git a/Cargo.lock b/Cargo.lock index 3c44a639e38..162d1f3ae07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3108,7 +3108,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#d861ede96cc2aae3c2ed5ea1c1c71454130a325e" +source = "git+https://github.com/tikv/rust-rocksdb.git#b68565569d711d78f8ae0d24e2d2b59f0fd03ef1" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3127,7 +3127,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = 
"git+https://github.com/tikv/rust-rocksdb.git#d861ede96cc2aae3c2ed5ea1c1c71454130a325e" +source = "git+https://github.com/tikv/rust-rocksdb.git#b68565569d711d78f8ae0d24e2d2b59f0fd03ef1" dependencies = [ "bzip2-sys", "cc", @@ -5101,7 +5101,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#d861ede96cc2aae3c2ed5ea1c1c71454130a325e" +source = "git+https://github.com/tikv/rust-rocksdb.git#b68565569d711d78f8ae0d24e2d2b59f0fd03ef1" dependencies = [ "libc 0.2.146", "librocksdb_sys", diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index eebf0e7c32a..d0f9f892f34 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -204,6 +204,11 @@ impl PersistenceListener { /// /// `smallest_seqno` should be the smallest seqno of the memtable. pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64) { + (|| { + fail_point!("on_memtable_sealed", |t| { + assert_eq!(t.unwrap().as_str(), cf); + }) + })(); // The correctness relies on the assumption that there will be only one // thread writting to the DB and increasing apply index. // Apply index will be set within DB lock, so it's correct even with manual diff --git a/src/config/mod.rs b/src/config/mod.rs index d9b9263e928..2494e84dfbd 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -110,6 +110,7 @@ const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.2; // Too large will increase Raft Engine memory usage. const WRITE_BUFFER_MEMORY_LIMIT_MAX: u64 = ReadableSize::gb(8).0; +const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: u64 = ReadableSize::mb(32).0; /// Configs that actually took effect in the last run pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; @@ -311,6 +312,7 @@ macro_rules! 
cf_config { #[online_config(skip)] pub compression_per_level: [DBCompressionType; 7], pub write_buffer_size: Option, + pub write_buffer_limit: Option, pub max_write_buffer_number: i32, #[online_config(skip)] pub min_write_buffer_number_to_merge: i32, @@ -668,6 +670,7 @@ macro_rules! build_cf_opt { pub struct CfResources { pub cache: Cache, pub compaction_thread_limiters: HashMap<&'static str, ConcurrentTaskLimiter>, + pub write_buffer_managers: HashMap<&'static str, Arc>, } cf_config!(DefaultCfConfig); @@ -734,6 +737,7 @@ impl Default for DefaultCfConfig { ttl: None, periodic_compaction_seconds: None, titan: TitanCfConfig::default(), + write_buffer_limit: None, } } } @@ -832,6 +836,9 @@ impl DefaultCfConfig { } } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_DEFAULT) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -906,6 +913,7 @@ impl Default for WriteCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -962,6 +970,9 @@ impl WriteCfConfig { .unwrap(); } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_WRITE) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -1028,6 +1039,7 @@ impl Default for LockCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -1062,6 +1074,9 @@ impl LockCfConfig { .unwrap(); } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_LOCK) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -1127,6 +1142,7 @@ impl Default for RaftCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -1385,9 +1401,12 @@ impl DbConfig { // strategy is consistent with single RocksDB. 
self.defaultcf.max_compactions.get_or_insert(1); self.writecf.max_compactions.get_or_insert(1); - if self.lockcf.write_buffer_size.is_none() { - self.lockcf.write_buffer_size = Some(ReadableSize::mb(4)); - } + self.lockcf + .write_buffer_size + .get_or_insert(ReadableSize::mb(4)); + self.lockcf + .write_buffer_limit + .get_or_insert(ReadableSize::mb(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT)); } } } @@ -1510,9 +1529,29 @@ impl DbConfig { ConcurrentTaskLimiter::new(CF_RAFT, n), ); } + let mut write_buffer_managers = HashMap::default(); + self.lockcf.write_buffer_limit.map(|limit| { + write_buffer_managers.insert( + CF_LOCK, + Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)), + ) + }); + self.defaultcf.write_buffer_limit.map(|limit| { + write_buffer_managers.insert( + CF_DEFAULT, + Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)), + ) + }); + self.writecf.write_buffer_limit.map(|limit| { + write_buffer_managers.insert( + CF_WRITE, + Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)), + ) + }); CfResources { cache, compaction_thread_limiters, + write_buffer_managers, } } @@ -1556,6 +1595,9 @@ impl DbConfig { self.writecf.validate()?; self.raftcf.validate()?; self.titan.validate()?; + if self.raftcf.write_buffer_limit.is_some() { + return Err("raftcf does not support cf based write buffer manager".into()); + } if self.enable_unordered_write { if self.titan.enabled { return Err("RocksDB.unordered_write does not support Titan".into()); @@ -1660,6 +1702,7 @@ impl Default for RaftDefaultCfConfig { ttl: None, periodic_compaction_seconds: None, titan: TitanCfConfig::default(), + write_buffer_limit: None, } } } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 9c90211c073..9baa04d0b4f 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -10,6 +10,7 @@ mod test_coprocessor; mod test_disk_full; mod test_early_apply; mod test_encryption; +mod test_engine; mod test_gc_metrics; mod 
test_gc_worker; mod test_hibernate; diff --git a/tests/failpoints/cases/test_engine.rs b/tests/failpoints/cases/test_engine.rs new file mode 100644 index 00000000000..93d1c96597b --- /dev/null +++ b/tests/failpoints/cases/test_engine.rs @@ -0,0 +1,53 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; +use tikv_util::config::ReadableSize; + +fn dummy_string(len: usize) -> String { + String::from_utf8(vec![0; len]).unwrap() +} + +#[test] +fn test_write_buffer_manager() { + use test_raftstore_v2::*; + let count = 1; + let mut cluster = new_node_cluster(0, count); + cluster.cfg.rocksdb.lockcf.write_buffer_limit = Some(ReadableSize::kb(10)); + cluster.cfg.rocksdb.defaultcf.write_buffer_limit = Some(ReadableSize::kb(10)); + cluster.cfg.rocksdb.write_buffer_limit = Some(ReadableSize::kb(30)); + + // Let write buffer size small to make memtable request fewer memories. + // Otherwise, one single memory request can exceeds the write buffer limit set + // above. 
+ cluster.cfg.rocksdb.lockcf.write_buffer_size = Some(ReadableSize::kb(64)); + cluster.cfg.rocksdb.writecf.write_buffer_size = Some(ReadableSize::kb(64)); + cluster.cfg.rocksdb.defaultcf.write_buffer_size = Some(ReadableSize::kb(64)); + cluster.run(); + + let dummy = dummy_string(500); + let fp = "on_memtable_sealed"; + fail::cfg(fp, "return(lock)").unwrap(); + + for i in 0..10 { + let key = format!("key-{:03}", i); + for cf in &[CF_WRITE, CF_LOCK] { + cluster.must_put_cf(cf, key.as_bytes(), dummy.as_bytes()); + } + } + + fail::cfg(fp, "return(default)").unwrap(); + + for i in 0..10 { + let key = format!("key-{:03}", i); + for cf in &[CF_WRITE, CF_DEFAULT] { + cluster.must_put_cf(cf, key.as_bytes(), dummy.as_bytes()); + } + } + + fail::cfg(fp, "return(write)").unwrap(); + let dummy = dummy_string(1000); + for i in 0..10 { + let key = format!("key-{:03}", i); + cluster.must_put_cf(CF_WRITE, key.as_bytes(), dummy.as_bytes()); + } +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 87b1830e4f6..d3091e30eed 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -388,6 +388,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, writecf: WriteCfConfig { block_size: ReadableSize::kb(12), @@ -461,6 +462,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, lockcf: LockCfConfig { block_size: ReadableSize::kb(12), @@ -534,6 +536,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: Some(ReadableSize::mb(16)), }, raftcf: RaftCfConfig { block_size: ReadableSize::kb(12), @@ -607,6 
+610,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, titan: titan_db_config.clone(), }; @@ -695,6 +699,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: None, periodic_compaction_seconds: None, + write_buffer_limit: None, }, titan: titan_db_config, }; diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 94f9ef1ecf1..653c3d2daef 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -435,6 +435,7 @@ compression-per-level = [ "lz4", ] write-buffer-size = "1MB" +write-buffer-limit = "16MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" From 4b3e33e6c208e445388c43a99a5707d03421f7bd Mon Sep 17 00:00:00 2001 From: ShuNing Date: Wed, 30 Aug 2023 10:21:37 +0800 Subject: [PATCH 014/220] pd_client: add backoff for the reconnect retries (#15429) ref tikv/pd#6556, close tikv/tikv#15428 pc_client: add store-level backoff for the reconnect retries Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/pd_client/src/client_v2.rs | 14 ++-- components/pd_client/src/metrics.rs | 27 +++++-- components/pd_client/src/util.rs | 100 +++++++++++++++++++------- 3 files changed, 99 insertions(+), 42 deletions(-) diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 5b0d563f2b8..97b2702fc39 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -117,7 +117,7 @@ impl RawClient { /// Returns Ok(true) when a new connection is established. 
async fn maybe_reconnect(&mut self, ctx: &ConnectContext, force: bool) -> Result { - PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + PD_RECONNECT_COUNTER_VEC.try_connect.inc(); let start = Instant::now(); let members = self.members.clone(); @@ -135,21 +135,15 @@ impl RawClient { .await { Err(e) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["failure"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.failure.inc(); return Err(e); } Ok(None) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["no-need"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.no_need.inc(); return Ok(false); } Ok(Some(tuple)) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["success"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.success.inc(); tuple } }; diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index d92e334396a..4e185658f15 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,7 +2,7 @@ use lazy_static::lazy_static; use prometheus::*; -use prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; +use prometheus_static_metric::*; make_static_metric! { pub label_enum PDRequestEventType { @@ -40,9 +40,20 @@ make_static_metric! { meta_storage_watch, } + pub label_enum PDReconnectEventKind { + success, + failure, + no_need, + cancel, + try_connect, + } + pub struct PDRequestEventHistogramVec: Histogram { "type" => PDRequestEventType, } + pub struct PDReconnectEventCounterVec: IntCounter { + "type" => PDReconnectEventKind, + } } lazy_static! { @@ -66,12 +77,14 @@ lazy_static! 
{ &["type"] ) .unwrap(); - pub static ref PD_RECONNECT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( - "tikv_pd_reconnect_total", - "Total number of PD reconnections.", - &["type"] - ) - .unwrap(); + pub static ref PD_RECONNECT_COUNTER_VEC: PDReconnectEventCounterVec = + register_static_int_counter_vec!( + PDReconnectEventCounterVec, + "tikv_pd_reconnect_total", + "Total number of PD reconnections.", + &["type"] + ) + .unwrap(); pub static ref PD_PENDING_HEARTBEAT_GAUGE: IntGauge = register_int_gauge!( "tikv_pd_pending_heartbeat_total", "Total number of pending region heartbeat" diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 5491a51c047..66b084d4998 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -50,6 +50,7 @@ const MAX_RETRY_TIMES: u64 = 5; // The max duration when retrying to connect to leader. No matter if the // MAX_RETRY_TIMES is reached. const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); +const MAX_BACKOFF: Duration = Duration::from_secs(3); // FIXME: Use a request-independent way to handle reconnection. pub const REQUEST_RECONNECT_INTERVAL: Duration = Duration::from_secs(1); // 1s @@ -116,6 +117,7 @@ pub struct Inner { pub rg_resp: Option>, last_try_reconnect: Instant, + bo: ExponentialBackoff, } impl Inner { @@ -168,7 +170,6 @@ pub struct Client { pub(crate) inner: RwLock, pub feature_gate: FeatureGate, enable_forwarding: bool, - retry_interval: Duration, } impl Client { @@ -219,6 +220,7 @@ impl Client { pending_heartbeat: Arc::default(), pending_buckets: Arc::default(), last_try_reconnect: Instant::now(), + bo: ExponentialBackoff::new(retry_interval), tso, meta_storage, rg_sender: Either::Left(Some(rg_sender)), @@ -226,7 +228,6 @@ impl Client { }), feature_gate: FeatureGate::default(), enable_forwarding, - retry_interval, } } @@ -363,17 +364,15 @@ impl Client { /// Note: Retrying too quickly will return an error due to cancellation. 
/// Please always try to reconnect after sending the request first. pub async fn reconnect(&self, force: bool) -> Result<()> { - PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + PD_RECONNECT_COUNTER_VEC.try_connect.inc(); let start = Instant::now(); let future = { let inner = self.inner.rl(); - if start.saturating_duration_since(inner.last_try_reconnect) < self.retry_interval { + if start.saturating_duration_since(inner.last_try_reconnect) < inner.bo.get_interval() { // Avoid unnecessary updating. // Prevent a large number of reconnections in a short time. - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["cancel"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.cancel.inc(); return Err(box_err!("cancel reconnection due to too small interval")); } let connector = PdConnector::new(inner.env.clone(), inner.security_mgr.clone()); @@ -394,36 +393,38 @@ impl Client { { let mut inner = self.inner.wl(); - if start.saturating_duration_since(inner.last_try_reconnect) < self.retry_interval { + if start.saturating_duration_since(inner.last_try_reconnect) < inner.bo.get_interval() { // There may be multiple reconnections that pass the read lock at the same time. // Check again in the write lock to avoid unnecessary updating. 
- PD_RECONNECT_COUNTER_VEC - .with_label_values(&["cancel"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.cancel.inc(); return Err(box_err!("cancel reconnection due to too small interval")); } inner.last_try_reconnect = start; + inner.bo.next_backoff(); } slow_log!(start.saturating_elapsed(), "try reconnect pd"); let (client, target_info, members, tso) = match future.await { Err(e) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["failure"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.failure.inc(); return Err(e); } - Ok(None) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["no-need"]) - .inc(); - return Ok(()); - } - Ok(Some(tuple)) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["success"]) - .inc(); - tuple + Ok(res) => { + // Reset the retry count. + { + let mut inner = self.inner.wl(); + inner.bo.reset() + } + match res { + None => { + PD_RECONNECT_COUNTER_VEC.no_need.inc(); + return Ok(()); + } + Some(tuple) => { + PD_RECONNECT_COUNTER_VEC.success.inc(); + tuple + } + } } }; @@ -900,6 +901,33 @@ impl PdConnector { } } +/// Simple backoff strategy. 
+struct ExponentialBackoff { + base: Duration, + interval: Duration, +} + +impl ExponentialBackoff { + pub fn new(base: Duration) -> Self { + Self { + base, + interval: base, + } + } + pub fn next_backoff(&mut self) -> Duration { + self.interval = std::cmp::min(self.interval * 2, MAX_BACKOFF); + self.interval + } + + pub fn get_interval(&self) -> Duration { + self.interval + } + + pub fn reset(&mut self) { + self.interval = self.base; + } +} + pub fn trim_http_prefix(s: &str) -> &str { s.trim_start_matches("http://") .trim_start_matches("https://") @@ -1045,8 +1073,11 @@ pub fn merge_bucket_stats, I: AsRef<[u8]>>( mod test { use kvproto::metapb::BucketStats; + use super::*; use crate::{merge_bucket_stats, util::find_bucket_index}; + const BASE_BACKOFF: Duration = Duration::from_millis(100); + #[test] fn test_merge_bucket_stats() { #[allow(clippy::type_complexity)] @@ -1162,4 +1193,23 @@ mod test { assert_eq!(find_bucket_index(b"k7", &keys), Some(4)); assert_eq!(find_bucket_index(b"k8", &keys), Some(4)); } + + #[test] + fn test_exponential_backoff() { + let mut backoff = ExponentialBackoff::new(BASE_BACKOFF); + assert_eq!(backoff.get_interval(), BASE_BACKOFF); + + assert_eq!(backoff.next_backoff(), 2 * BASE_BACKOFF); + assert_eq!(backoff.next_backoff(), Duration::from_millis(400)); + assert_eq!(backoff.get_interval(), Duration::from_millis(400)); + + // Should not exceed MAX_BACKOFF + for _ in 0..20 { + backoff.next_backoff(); + } + assert_eq!(backoff.get_interval(), MAX_BACKOFF); + + backoff.reset(); + assert_eq!(backoff.get_interval(), BASE_BACKOFF); + } } From 0bb270621f6d561560156c38cc21240ceae97c00 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Wed, 30 Aug 2023 14:44:08 +0800 Subject: [PATCH 015/220] coprocessor: skip transient read request (#15406) close tikv/tikv#15405 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- src/coprocessor/metrics.rs | 6 ++ 
src/coprocessor/tracker.rs | 175 +++++++++++++++++++++---------------- 2 files changed, 104 insertions(+), 77 deletions(-) diff --git a/src/coprocessor/metrics.rs b/src/coprocessor/metrics.rs index 64905b3dfba..02f45d35311 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -208,6 +208,12 @@ impl CopLocalMetrics { pub fn local_read_stats(&self) -> &ReadStats { &self.local_read_stats } + + #[cfg(test)] + pub fn clear(&mut self) { + self.local_read_stats.region_infos.clear(); + self.local_read_stats.region_buckets.clear(); + } } thread_local! { diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 18eaa0b6e98..71d84388c3b 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -350,20 +350,24 @@ impl Tracker { false }; - tls_collect_query( - region_id, - peer, - start_key.as_encoded(), - end_key.as_encoded(), - reverse_scan, - ); - tls_collect_read_flow( - self.req_ctx.context.get_region_id(), - Some(start_key.as_encoded()), - Some(end_key.as_encoded()), - &total_storage_stats, - self.buckets.as_ref(), - ); + // only collect metrics for select and index, exclude transient read flow such + // like analyze and checksum. 
+ if self.req_ctx.tag == ReqTag::select || self.req_ctx.tag == ReqTag::index { + tls_collect_query( + region_id, + peer, + start_key.as_encoded(), + end_key.as_encoded(), + reverse_scan, + ); + tls_collect_read_flow( + self.req_ctx.context.get_region_id(), + Some(start_key.as_encoded()), + Some(end_key.as_encoded()), + &total_storage_stats, + self.buckets.as_ref(), + ); + } self.current_stage = TrackerState::Tracked; } @@ -443,69 +447,86 @@ mod tests { #[test] fn test_track() { - let mut context = kvrpcpb::Context::default(); - context.set_region_id(1); - - let mut req_ctx = ReqContext::new( - ReqTag::test, - context, - vec![], - Duration::from_secs(0), - None, - None, - TimeStamp::max(), - None, - PerfLevel::EnableCount, - ); - req_ctx.lower_bound = vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 67, - ]; - req_ctx.upper_bound = vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 167, - ]; - let mut track: Tracker = Tracker::new(req_ctx, Duration::default()); - let mut bucket = BucketMeta::default(); - bucket.region_id = 1; - bucket.version = 1; - bucket.keys = vec![ - vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 255, 179, 95, 114, 128, 0, 0, 0, 0, 255, 0, 175, 155, - 0, 0, 0, 0, 0, 250, - ], - vec![ - 116, 128, 0, 255, 255, 255, 255, 255, 255, 254, 0, 0, 0, 0, 0, 0, 0, 248, - ], - ]; - bucket.sizes = vec![10]; - track.buckets = Some(Arc::new(bucket)); - - let mut stat = Statistics::default(); - stat.write.flow_stats.read_keys = 10; - track.total_storage_stats = stat; - - track.track(); - drop(track); - TLS_COP_METRICS.with(|m| { - assert_eq!( - 10, - m.borrow() - .local_read_stats() - .region_infos - .get(&1) - .unwrap() - .flow - .read_keys - ); - assert_eq!( - vec![10], - m.borrow() - .local_read_stats() - .region_buckets - .get(&1) - .unwrap() - .stats - .read_keys + let check = move |tag: ReqTag, flow: u64| { + let mut context = kvrpcpb::Context::default(); + context.set_region_id(1); + let mut req_ctx = ReqContext::new( 
+ tag, + context, + vec![], + Duration::from_secs(0), + None, + None, + TimeStamp::max(), + None, + PerfLevel::EnableCount, ); - }); + + req_ctx.lower_bound = vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 67, + ]; + req_ctx.upper_bound = vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 167, + ]; + let mut track: Tracker = Tracker::new(req_ctx, Duration::default()); + let mut bucket = BucketMeta::default(); + bucket.region_id = 1; + bucket.version = 1; + bucket.keys = vec![ + vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 255, 179, 95, 114, 128, 0, 0, 0, 0, 255, 0, 175, + 155, 0, 0, 0, 0, 0, 250, + ], + vec![ + 116, 128, 0, 255, 255, 255, 255, 255, 255, 254, 0, 0, 0, 0, 0, 0, 0, 248, + ], + ]; + bucket.sizes = vec![10]; + track.buckets = Some(Arc::new(bucket)); + + let mut stat = Statistics::default(); + stat.write.flow_stats.read_keys = 10; + track.total_storage_stats = stat; + + track.track(); + drop(track); + TLS_COP_METRICS.with(|m| { + if flow > 0 { + assert_eq!( + flow as usize, + m.borrow() + .local_read_stats() + .region_infos + .get(&1) + .unwrap() + .flow + .read_keys + ); + assert_eq!( + flow, + m.borrow() + .local_read_stats() + .region_buckets + .get(&1) + .unwrap() + .stats + .read_keys[0] + ); + } else { + assert!(m.borrow().local_read_stats().region_infos.get(&1).is_none()); + assert!( + m.borrow() + .local_read_stats() + .region_buckets + .get(&1) + .is_none() + ); + } + + m.borrow_mut().clear(); + }); + }; + check(ReqTag::select, 10); + check(ReqTag::analyze_full_sampling, 0); } } From fb9a40d20dcfb9ceb7cecba9d471fa8575c05913 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 30 Aug 2023 15:18:38 +0800 Subject: [PATCH 016/220] raftstore-v2: init persisted_tablet_index on startup (#15441) ref tikv/tikv#12842 - Initialize `persisted_apply_index` on startup. 
Signed-off-by: tabokie --- .../raftstore-v2/src/operation/command/admin/compact_log.rs | 4 ++-- components/raftstore-v2/src/operation/life.rs | 4 ++++ components/raftstore-v2/src/raft/peer.rs | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 93876475f5f..d054234b46f 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -63,14 +63,14 @@ pub struct CompactLogContext { } impl CompactLogContext { - pub fn new(last_applying_index: u64) -> CompactLogContext { + pub fn new(last_applying_index: u64, persisted_applied: u64) -> CompactLogContext { CompactLogContext { skipped_ticks: 0, approximate_log_size: 0, last_applying_index, last_compacted_idx: 0, tombstone_tablets_wait_index: vec![], - persisted_tablet_index: AtomicU64::new(0).into(), + persisted_tablet_index: AtomicU64::new(persisted_applied).into(), } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index e0e7f63785d..8fe1d2a07b3 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -797,11 +797,15 @@ impl Peer { if self.has_pending_tombstone_tablets() { let applied_index = self.entry_storage().applied_index(); let last_index = self.entry_storage().last_index(); + let persisted = self + .remember_persisted_tablet_index() + .load(std::sync::atomic::Ordering::Relaxed); info!( self.logger, "postpone destroy because there're pending tombstone tablets"; "applied_index" => applied_index, "last_index" => last_index, + "persisted_applied" => persisted, ); return true; } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 2f3a3376fe9..c3a80e3756c 100644 --- a/components/raftstore-v2/src/raft/peer.rs 
+++ b/components/raftstore-v2/src/raft/peer.rs @@ -158,6 +158,7 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); let merge_context = MergeContext::from_region_state(&logger, storage.region_state()); + let persisted_applied = storage.apply_trace().persisted_apply_index(); let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); @@ -184,7 +185,7 @@ impl Peer { self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), - compact_log_context: CompactLogContext::new(applied_index), + compact_log_context: CompactLogContext::new(applied_index, persisted_applied), merge_context: merge_context.map(|c| Box::new(c)), last_sent_snapshot_index: 0, raw_write_encoder: None, From 69b8ac5717119290ba721fae61edb894440a80fc Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 30 Aug 2023 17:30:39 +0800 Subject: [PATCH 017/220] raftstore-v2: consider unmatch between region range and tablet range for mvcc scan (#15455) ref tikv/tikv#14654 consider unmatch between region range and tablet range for mvcc scan --- components/engine_rocks/src/util.rs | 8 + src/server/debug2.rs | 240 ++++++------------------ tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_debugger.rs | 147 +++++++++++++++ 4 files changed, 216 insertions(+), 180 deletions(-) create mode 100644 tests/failpoints/cases/test_debugger.rs diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 225cd1d7f06..e4991419eed 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -3,6 +3,7 @@ use std::{ffi::CString, fs, path::Path, str::FromStr, sync::Arc}; use engine_traits::{Engines, Range, Result, CF_DEFAULT}; +use fail::fail_point; use rocksdb::{ load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, 
CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, @@ -462,6 +463,13 @@ pub struct RangeCompactionFilterFactory(Arc); impl RangeCompactionFilterFactory { pub fn new(start_key: Box<[u8]>, end_key: Box<[u8]>) -> Self { + fail_point!("unlimited_range_compaction_filter", |_| { + let range = OwnedRange { + start_key: keys::data_key(b"").into_boxed_slice(), + end_key: keys::data_end_key(b"").into_boxed_slice(), + }; + Self(Arc::new(range)) + }); let range = OwnedRange { start_key, end_key }; Self(Arc::new(range)) } diff --git a/src/server/debug2.rs b/src/server/debug2.rs index e914b353760..cf17aea81eb 100644 --- a/src/server/debug2.rs +++ b/src/server/debug2.rs @@ -10,7 +10,7 @@ use engine_traits::{ TabletRegistry, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use futures::future::Future; -use keys::{data_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; +use keys::{data_key, enc_end_key, enc_start_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; use kvproto::{ debugpb::Db as DbType, kvrpcpb::MvccInfo, @@ -36,6 +36,34 @@ use crate::{ storage::mvcc::{MvccInfoCollector, MvccInfoScanner}, }; +// `key1` and `key2` should both be start_key or end_key. +fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], is_end_key: bool) -> &'a [u8] { + if is_end_key && key1.is_empty() { + return key2; + } + if is_end_key && key2.is_empty() { + return key1; + } + if key1 < key2 { + return key1; + } + key2 +} + +// `key1` and `key2` should both be start_key or end_key. 
+fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], is_end_key: bool) -> &'a [u8] { + if is_end_key && key1.is_empty() { + return key1; + } + if is_end_key && key2.is_empty() { + return key2; + } + if key1 < key2 { + return key2; + } + key1 +} + // return the region containing the seek_key or the next region if not existed fn seek_region( seek_key: &[u8], @@ -98,11 +126,16 @@ impl MvccInfoIteratorV2 { )?; let tablet = tablet_cache.latest().unwrap(); + let region_start_key = enc_start_key(first_region_state.get_region()); + let region_end_key = enc_end_key(first_region_state.get_region()); + let iter_start = larger_key(start, ®ion_start_key, false); + let iter_end = smaller_key(end, ®ion_end_key, true); + assert!(!iter_start.is_empty() && !iter_start.is_empty()); let scanner = Some( MvccInfoScanner::new( |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), - if start.is_empty() { None } else { Some(start) }, - if end.is_empty() { None } else { Some(end) }, + Some(iter_start), + Some(iter_end), MvccInfoCollector::default(), ) .map_err(|e| -> Error { box_err!(e) })?, @@ -171,19 +204,16 @@ impl Iterator for MvccInfoIteratorV2 { ) .unwrap(); let tablet = tablet_cache.latest().unwrap(); + let region_start_key = enc_start_key(&self.cur_region); + let region_end_key = enc_end_key(&self.cur_region); + let iter_start = larger_key(&self.start, ®ion_start_key, false); + let iter_end = smaller_key(&self.end, ®ion_end_key, true); + assert!(!iter_start.is_empty() && !iter_start.is_empty()); self.scanner = Some( MvccInfoScanner::new( |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), - if self.start.is_empty() { - None - } else { - Some(self.start.as_bytes()) - }, - if self.end.is_empty() { - None - } else { - Some(self.end.as_bytes()) - }, + Some(iter_start), + Some(iter_end), MvccInfoCollector::default(), ) .unwrap(), @@ -1154,38 +1184,28 @@ fn deivde_regions_for_concurrency( Ok(regions_groups) } -// `key1` and `key2` should both be start_key or 
end_key. -fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { - if end_key && key1.is_empty() { - return key2; - } - if end_key && key2.is_empty() { - return key1; - } - if key1 < key2 { - return key1; - } - key2 -} +#[cfg(any(test, feature = "testexport"))] +pub fn new_debugger(path: &std::path::Path) -> DebuggerImplV2 { + use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; -// `key1` and `key2` should both be start_key or end_key. -fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { - if end_key && key1.is_empty() { - return key1; - } - if end_key && key2.is_empty() { - return key2; - } - if key1 < key2 { - return key2; - } - key1 + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + let cache = cfg.storage.block_cache.build_shared_cache(); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = + raft_log_engine::RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + + DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) } #[cfg(test)] mod tests { - use std::path::Path; - use collections::HashMap; use engine_traits::{ RaftEngineReadOnly, RaftLogBatch, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_WRITE, @@ -1196,36 +1216,13 @@ mod tests { raft_serverpb::*, }; use raft::prelude::EntryType; - use raft_log_engine::RaftLogEngine; use raftstore::store::RAFT_INIT_LOG_INDEX; use tikv_util::store::new_peer; use super::*; - use crate::{ - config::TikvConfig, - server::KvEngineFactoryBuilder, - storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, - }; - const INITIAL_TABLET_INDEX: u64 = 5; const 
INITIAL_APPLY_INDEX: u64 = 5; - fn new_debugger(path: &Path) -> DebuggerImplV2 { - let mut cfg = TikvConfig::default(); - cfg.storage.data_dir = path.to_str().unwrap().to_string(); - cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); - cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); - let cache = cfg.storage.block_cache.build_shared_cache(); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); - let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); - - let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); - - DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) - } - impl DebuggerImplV2 { fn set_store_id(&self, store_id: u64) { let mut ident = self.get_store_ident().unwrap_or_default(); @@ -1458,123 +1455,6 @@ mod tests { debugger.region_size(region_id, cfs.clone()).unwrap_err(); } - // For simplicity, the format of the key is inline with data in - // prepare_data_on_disk - fn extract_key(key: &[u8]) -> &[u8] { - &key[1..4] - } - - // Prepare some data - // Data for each region: - // Region 1: k00 .. k04 - // Region 2: k05 .. k09 - // Region 3: k10 .. k14 - // Region 4: k15 .. k19 - // Region 5: k20 .. k24 - // Region 6: k26 .. 
k28 - fn prepare_data_on_disk(path: &Path) { - let mut cfg = TikvConfig::default(); - cfg.storage.data_dir = path.to_str().unwrap().to_string(); - cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); - cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); - cfg.gc.enable_compaction_filter = false; - let cache = cfg.storage.block_cache.build_shared_cache(); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); - let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); - - let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); - let mut wb = raft_engine.log_batch(5); - for i in 0..6 { - let mut region = metapb::Region::default(); - let start_key = format!("k{:02}", i * 5); - let end_key = format!("k{:02}", (i + 1) * 5); - region.set_id(i + 1); - region.set_start_key(start_key.into_bytes()); - region.set_end_key(end_key.into_bytes()); - let mut region_state = RegionLocalState::default(); - region_state.set_tablet_index(INITIAL_TABLET_INDEX); - if region.get_id() == 4 { - region_state.set_state(PeerState::Tombstone); - } else if region.get_id() == 6 { - region.set_start_key(b"k26".to_vec()); - region.set_end_key(b"k28".to_vec()); - } - region_state.set_region(region); - - let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); - // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to - // use prewrite method to prepare mvcc data - let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); - for i in i * 5..(i + 1) * 5 { - let key = format!("zk{:02}", i); - let val = format!("val{:02}", i); - // Use prewrite only is enough for preparing mvcc data - must_prewrite_put( - &mut engine, - key.as_bytes(), - val.as_bytes(), - key.as_bytes(), - 10, - ); - } - - wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, ®ion_state) - .unwrap(); - } - raft_engine.consume(&mut wb, 
true).unwrap(); - } - - #[test] - fn test_scan_mvcc() { - let dir = test_util::temp_dir("test-debugger", false); - prepare_data_on_disk(dir.path()); - let debugger = new_debugger(dir.path()); - // Test scan with bad start, end or limit. - assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); - assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); - - let verify_scanner = - |range, scanner: &mut dyn Iterator, MvccInfo)>>| { - for i in range { - let key = format!("k{:02}", i).into_bytes(); - assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); - } - }; - - // full scann - let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); - verify_scanner(0..15, &mut scanner); - verify_scanner(20..25, &mut scanner); - verify_scanner(26..28, &mut scanner); - assert!(scanner.next().is_none()); - - // Range has more elements than limit - let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); - verify_scanner(1..6, &mut scanner); - assert!(scanner.next().is_none()); - - // Range has less elements than limit - let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); - verify_scanner(7..10, &mut scanner); - assert!(scanner.next().is_none()); - - // Start from the key where no region contains it - let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); - verify_scanner(20..25, &mut scanner); - verify_scanner(26..28, &mut scanner); - assert!(scanner.next().is_none()); - - // Scan a range not existed in the cluster - let mut scanner = debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); - assert!(scanner.next().is_none()); - - // The end key is less than the start_key of the first region - let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); - assert!(scanner.next().is_none()); - } - #[test] fn test_compact() { let dir = test_util::temp_dir("test-debugger", false); diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 9baa04d0b4f..a9dbd36a81a 100644 --- a/tests/failpoints/cases/mod.rs +++ 
b/tests/failpoints/cases/mod.rs @@ -7,6 +7,7 @@ mod test_bootstrap; mod test_cmd_epoch_checker; mod test_conf_change; mod test_coprocessor; +mod test_debugger; mod test_disk_full; mod test_early_apply; mod test_encryption; diff --git a/tests/failpoints/cases/test_debugger.rs b/tests/failpoints/cases/test_debugger.rs new file mode 100644 index 00000000000..f70ebcb6d32 --- /dev/null +++ b/tests/failpoints/cases/test_debugger.rs @@ -0,0 +1,147 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::path::Path; + +use engine_traits::{RaftEngine, RaftLogBatch, TabletRegistry}; +use kvproto::{ + kvrpcpb::MvccInfo, + metapb, + raft_serverpb::{PeerState, RegionLocalState}, +}; +use raft_log_engine::RaftLogEngine; +use test_raftstore::new_peer; +use tikv::{ + config::TikvConfig, + server::{debug::Debugger, debug2::new_debugger, KvEngineFactoryBuilder}, + storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, +}; + +const INITIAL_TABLET_INDEX: u64 = 5; +const INITIAL_APPLY_INDEX: u64 = 5; + +// Prepare some data +// Region meta range and rocksdb range of each region: +// Region 1: k01 .. k04 rocksdb: zk00 .. zk04 +// Region 2: k05 .. k09 rocksdb: zk05 .. zk09 +// Region 3: k10 .. k14 rocksdb: zk10 .. zk14 +// Region 4: k15 .. k19 rocksdb: zk15 .. zk19 +// Region 5: k20 .. k24 rocksdb: zk20 .. zk24 +// Region 6: k26 .. k27 rocksdb: zk25 .. 
zk29 +fn prepare_data_on_disk(path: &Path) { + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + cfg.gc.enable_compaction_filter = false; + let cache = cfg.storage.block_cache.build_shared_cache(); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + let mut wb = raft_engine.log_batch(5); + for i in 0..6 { + let mut region = metapb::Region::default(); + let start_key = if i != 0 { + format!("k{:02}", i * 5) + } else { + String::from("k01") + }; + let end_key = format!("k{:02}", (i + 1) * 5); + region.set_id(i + 1); + region.set_start_key(start_key.into_bytes()); + region.set_end_key(end_key.into_bytes()); + let mut region_state = RegionLocalState::default(); + region_state.set_tablet_index(INITIAL_TABLET_INDEX); + if region.get_id() == 4 { + region_state.set_state(PeerState::Tombstone); + } else if region.get_id() == 6 { + region.set_start_key(b"k26".to_vec()); + region.set_end_key(b"k28".to_vec()); + } + // add dummy peer to pass verification + region.mut_peers().push(new_peer(0, 0)); + region_state.set_region(region); + + let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); + // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to + // use prewrite method to prepare mvcc data + let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); + for i in i * 5..(i + 1) * 5 { + let key = format!("zk{:02}", i); + let val = format!("val{:02}", i); + // Use prewrite only is enough for preparing mvcc data + must_prewrite_put( + &mut engine, + key.as_bytes(), + val.as_bytes(), + 
key.as_bytes(), + 10, + ); + } + + wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, ®ion_state) + .unwrap(); + } + raft_engine.consume(&mut wb, true).unwrap(); +} + +// For simplicity, the format of the key is inline with data in +// prepare_data_on_disk +fn extract_key(key: &[u8]) -> &[u8] { + &key[1..4] +} + +#[test] +fn test_scan_mvcc() { + // We deliberately make region meta not match with rocksdb, set unlimited range + // compaction filter to avoid trim operation. + fail::cfg("unlimited_range_compaction_filter", "return").unwrap(); + + let dir = test_util::temp_dir("test-debugger", false); + prepare_data_on_disk(dir.path()); + let debugger = new_debugger(dir.path()); + // Test scan with bad start, end or limit. + assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); + assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); + + let verify_scanner = + |range, scanner: &mut dyn Iterator, MvccInfo)>>| { + for i in range { + let key = format!("k{:02}", i).into_bytes(); + assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); + } + }; + + // full scan + let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); + verify_scanner(1..15, &mut scanner); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has more elements than limit + let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); + verify_scanner(1..6, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has less elements than limit + let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); + verify_scanner(7..10, &mut scanner); + assert!(scanner.next().is_none()); + + // Start from the key where no region contains it + let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Scan a range not existed in the cluster + let mut scanner = 
debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); + assert!(scanner.next().is_none()); + + // The end key is less than the start_key of the first region + let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); + assert!(scanner.next().is_none()); +} From 1669a72fac8176cc7a2be7fe10f43f1657d4c21f Mon Sep 17 00:00:00 2001 From: ekexium Date: Wed, 30 Aug 2023 17:45:40 +0800 Subject: [PATCH 018/220] txn: add logs for assertion failure (#12305) close tikv/tikv#12304 Add logs for assertion failure Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/storage/txn/actions/prewrite.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 90f739b8705..64e22a13585 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -239,6 +239,7 @@ impl LockStatus { } /// A single mutation to be prewritten. +#[derive(Debug)] struct PrewriteMutation<'a> { key: Key, value: Option, @@ -677,6 +678,12 @@ impl<'a> PrewriteMutation<'a> { if self.skip_constraint_check() { self.check_for_newer_version(reader)?; } + let (write, commit_ts) = write + .as_ref() + .map(|(w, ts)| (Some(w), Some(ts))) + .unwrap_or((None, None)); + error!("assertion failure"; "assertion" => ?self.assertion, "write" => ?write, + "commit_ts" => commit_ts, "mutation" => ?self); assertion_err?; } From b507aad3be0eaa6c96033ef7300605bda833bf54 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Thu, 31 Aug 2023 16:05:09 +0800 Subject: [PATCH 019/220] config: make split config can update (#15473) close tikv/tikv#15403 1. split config support to update dynamic. In past, the `optimize_for` function will set the config immutable. 
Signed-off-by: bufferflies <1045931706@qq.com> --- components/raftstore/src/store/worker/pd.rs | 2 +- .../src/store/worker/split_config.rs | 58 ++++++++++++++----- .../src/store/worker/split_controller.rs | 45 +++++++------- src/config/mod.rs | 12 ++-- tests/integrations/config/mod.rs | 6 +- tests/integrations/raftstore/test_stats.rs | 2 +- 6 files changed, 81 insertions(+), 44 deletions(-) diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index d812830569a..e8c8e2f575b 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -658,7 +658,7 @@ where // Register the region CPU records collector. if auto_split_controller .cfg - .region_cpu_overload_threshold_ratio + .region_cpu_overload_threshold_ratio() > 0.0 { region_cpu_records_collector = diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 8fec853bb00..2d29bd21a89 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -68,18 +68,18 @@ pub fn get_sample_num() -> usize { #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct SplitConfig { - pub qps_threshold: usize, + pub qps_threshold: Option, pub split_balance_score: f64, pub split_contained_score: f64, pub detect_times: u64, pub sample_num: usize, pub sample_threshold: u64, - pub byte_threshold: usize, + pub byte_threshold: Option, #[doc(hidden)] pub grpc_thread_cpu_overload_threshold_ratio: f64, #[doc(hidden)] pub unified_read_pool_thread_cpu_overload_threshold_ratio: f64, - pub region_cpu_overload_threshold_ratio: f64, + pub region_cpu_overload_threshold_ratio: Option, // deprecated. 
#[online_config(skip)] #[doc(hidden)] @@ -95,18 +95,18 @@ pub struct SplitConfig { impl Default for SplitConfig { fn default() -> SplitConfig { SplitConfig { - qps_threshold: DEFAULT_QPS_THRESHOLD, + qps_threshold: None, split_balance_score: DEFAULT_SPLIT_BALANCE_SCORE, split_contained_score: DEFAULT_SPLIT_CONTAINED_SCORE, detect_times: DEFAULT_DETECT_TIMES, sample_num: DEFAULT_SAMPLE_NUM, sample_threshold: DEFAULT_SAMPLE_THRESHOLD, - byte_threshold: DEFAULT_BYTE_THRESHOLD, + byte_threshold: None, grpc_thread_cpu_overload_threshold_ratio: DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, unified_read_pool_thread_cpu_overload_threshold_ratio: DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, - region_cpu_overload_threshold_ratio: REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + region_cpu_overload_threshold_ratio: None, size_threshold: None, // deprecated. key_threshold: None, // deprecated. } @@ -124,7 +124,7 @@ impl SplitConfig { ("split_balance_score or split_contained_score should be between 0 and 1.").into(), ); } - if self.sample_num >= self.qps_threshold { + if self.sample_num >= self.qps_threshold() { return Err( ("sample_num should be less than qps_threshold for load-base-split.").into(), ); @@ -133,20 +133,52 @@ impl SplitConfig { || self.grpc_thread_cpu_overload_threshold_ratio < 0.0 || self.unified_read_pool_thread_cpu_overload_threshold_ratio > 1.0 || self.unified_read_pool_thread_cpu_overload_threshold_ratio < 0.0 - || self.region_cpu_overload_threshold_ratio > 1.0 - || self.region_cpu_overload_threshold_ratio < 0.0 + || self.region_cpu_overload_threshold_ratio() > 1.0 + || self.region_cpu_overload_threshold_ratio() < 0.0 { return Err(("threshold ratio should be between 0 and 1.").into()); } Ok(()) } + pub fn qps_threshold(&self) -> usize { + self.qps_threshold.unwrap_or(DEFAULT_QPS_THRESHOLD) + } + + pub fn byte_threshold(&self) -> usize { + self.byte_threshold.unwrap_or(DEFAULT_BYTE_THRESHOLD) + } + + pub fn 
region_cpu_overload_threshold_ratio(&self) -> f64 { + self.region_cpu_overload_threshold_ratio + .unwrap_or(REGION_CPU_OVERLOAD_THRESHOLD_RATIO) + } + pub fn optimize_for(&mut self, region_size: ReadableSize) { const LARGE_REGION_SIZE_IN_MB: u64 = 4096; - if region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB { - self.qps_threshold = DEFAULT_BIG_REGION_QPS_THRESHOLD; - self.region_cpu_overload_threshold_ratio = BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO; - self.byte_threshold = DEFAULT_BIG_REGION_BYTE_THRESHOLD; + let big_size = region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB; + if self.qps_threshold.is_none() { + self.qps_threshold = Some(if big_size { + DEFAULT_BIG_REGION_QPS_THRESHOLD + } else { + DEFAULT_QPS_THRESHOLD + }); + } + + if self.byte_threshold.is_none() { + self.byte_threshold = Some(if big_size { + DEFAULT_BIG_REGION_BYTE_THRESHOLD + } else { + DEFAULT_BYTE_THRESHOLD + }); + } + + if self.region_cpu_overload_threshold_ratio.is_none() { + self.region_cpu_overload_threshold_ratio = Some(if big_size { + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO + } else { + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + }); } } } diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index d432f264e01..4bbcc773763 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -608,7 +608,7 @@ impl AutoSplitController { } fn should_check_region_cpu(&self) -> bool { - self.cfg.region_cpu_overload_threshold_ratio > 0.0 + self.cfg.region_cpu_overload_threshold_ratio() > 0.0 } fn is_grpc_poll_busy(&self, avg_grpc_thread_usage: f64) -> bool { @@ -643,7 +643,7 @@ impl AutoSplitController { return false; } region_cpu_usage / unified_read_pool_thread_usage - >= self.cfg.region_cpu_overload_threshold_ratio + >= self.cfg.region_cpu_overload_threshold_ratio() } // collect the read stats from read_stats_vec and dispatch them to a Region @@ -787,9 +787,9 
@@ impl AutoSplitController { debug!("load base split params"; "region_id" => region_id, "qps" => qps, - "qps_threshold" => self.cfg.qps_threshold, + "qps_threshold" => self.cfg.qps_threshold(), "byte" => byte, - "byte_threshold" => self.cfg.byte_threshold, + "byte_threshold" => self.cfg.byte_threshold(), "cpu_usage" => cpu_usage, "is_region_busy" => is_region_busy, ); @@ -800,8 +800,8 @@ impl AutoSplitController { // 1. If the QPS or the byte does not meet the threshold, skip. // 2. If the Unified Read Pool or the region is not hot enough, skip. - if qps < self.cfg.qps_threshold - && byte < self.cfg.byte_threshold + if qps < self.cfg.qps_threshold() + && byte < self.cfg.byte_threshold() && (!is_unified_read_pool_busy || !is_region_busy) { self.recorders.remove_entry(®ion_id); @@ -917,13 +917,13 @@ impl AutoSplitController { pub fn refresh_and_check_cfg(&mut self) -> SplitConfigChange { let mut cfg_change = SplitConfigChange::Noop; if let Some(incoming) = self.cfg_tracker.any_new() { - if self.cfg.region_cpu_overload_threshold_ratio <= 0.0 - && incoming.region_cpu_overload_threshold_ratio > 0.0 + if self.cfg.region_cpu_overload_threshold_ratio() <= 0.0 + && incoming.region_cpu_overload_threshold_ratio() > 0.0 { cfg_change = SplitConfigChange::UpdateRegionCpuCollector(true); } - if self.cfg.region_cpu_overload_threshold_ratio > 0.0 - && incoming.region_cpu_overload_threshold_ratio <= 0.0 + if self.cfg.region_cpu_overload_threshold_ratio() > 0.0 + && incoming.region_cpu_overload_threshold_ratio() <= 0.0 { cfg_change = SplitConfigChange::UpdateRegionCpuCollector(false); } @@ -943,12 +943,12 @@ impl AutoSplitController { mod tests { use online_config::{ConfigChange, ConfigManager, ConfigValue}; use resource_metering::{RawRecord, TagInfos}; - use tikv_util::config::VersionTrack; + use tikv_util::config::{ReadableSize, VersionTrack}; use txn_types::Key; use super::*; use crate::store::worker::split_config::{ - DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + 
BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_SAMPLE_NUM, }; enum Position { @@ -1193,7 +1193,7 @@ mod tests { fn check_split_key(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 1; + hub.cfg.qps_threshold = Some(1); hub.cfg.sample_threshold = 0; for i in 0..10 { @@ -1226,7 +1226,7 @@ mod tests { ) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 1; + hub.cfg.qps_threshold = Some(1); hub.cfg.sample_threshold = 0; for i in 0..10 { @@ -1291,7 +1291,7 @@ mod tests { #[test] fn test_sample_key_num() { let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 2000; + hub.cfg.qps_threshold = Some(2000); hub.cfg.sample_num = 2000; hub.cfg.sample_threshold = 0; @@ -1608,7 +1608,8 @@ mod tests { #[test] fn test_refresh_and_check_cfg() { - let split_config = SplitConfig::default(); + let mut split_config = SplitConfig::default(); + split_config.optimize_for(ReadableSize::mb(5000)); let mut split_cfg_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(split_config))); let mut auto_split_controller = @@ -1620,8 +1621,8 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO + .region_cpu_overload_threshold_ratio(), + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); // Set to zero. 
dispatch_split_cfg_change( @@ -1636,7 +1637,7 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, + .region_cpu_overload_threshold_ratio(), 0.0 ); assert_eq!( @@ -1647,7 +1648,7 @@ mod tests { dispatch_split_cfg_change( &mut split_cfg_manager, "region_cpu_overload_threshold_ratio", - ConfigValue::F64(REGION_CPU_OVERLOAD_THRESHOLD_RATIO), + ConfigValue::F64(0.1), ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), @@ -1656,8 +1657,8 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO + .region_cpu_overload_threshold_ratio(), + 0.1 ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), diff --git a/src/config/mod.rs b/src/config/mod.rs index 2494e84dfbd..8c0c04957b1 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -6139,12 +6139,12 @@ mod tests { assert_eq!(default_cfg.coprocessor.region_split_size(), SPLIT_SIZE); assert!(!default_cfg.coprocessor.enable_region_bucket()); - assert_eq!(default_cfg.split.qps_threshold, DEFAULT_QPS_THRESHOLD); + assert_eq!(default_cfg.split.qps_threshold(), DEFAULT_QPS_THRESHOLD); assert_eq!( - default_cfg.split.region_cpu_overload_threshold_ratio, + default_cfg.split.region_cpu_overload_threshold_ratio(), REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); - assert_eq!(default_cfg.split.byte_threshold, DEFAULT_BYTE_THRESHOLD); + assert_eq!(default_cfg.split.byte_threshold(), DEFAULT_BYTE_THRESHOLD); let mut default_cfg = TikvConfig::default(); default_cfg.storage.engine = EngineType::RaftKv2; @@ -6154,15 +6154,15 @@ mod tests { RAFTSTORE_V2_SPLIT_SIZE ); assert_eq!( - default_cfg.split.qps_threshold, + default_cfg.split.qps_threshold(), DEFAULT_BIG_REGION_QPS_THRESHOLD ); assert_eq!( - default_cfg.split.region_cpu_overload_threshold_ratio, + default_cfg.split.region_cpu_overload_threshold_ratio(), BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); assert_eq!( - default_cfg.split.byte_threshold, + 
default_cfg.split.byte_threshold(), DEFAULT_BIG_REGION_BYTE_THRESHOLD ); assert!(default_cfg.coprocessor.enable_region_bucket()); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index d3091e30eed..c6e98e95c05 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -864,10 +864,14 @@ fn test_serde_custom_tikv_config() { renew_batch_max_size: 8192, alloc_ahead_buffer: ReadableDuration::millis(3000), }; + value + .split + .optimize_for(value.coprocessor.region_max_size()); value.resource_control = ResourceControlConfig { enabled: false }; let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); - let load = toml::from_str(&custom).unwrap(); + let mut load: TikvConfig = toml::from_str(&custom).unwrap(); + load.split.optimize_for(load.coprocessor.region_max_size()); assert_eq_debug(&value, &load); let dump = toml::to_string_pretty(&load).unwrap(); diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 67e5e261dab..d61d6a59182 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -575,7 +575,7 @@ pub fn test_rollback() { fn test_query_num(query: Box, is_raw_kv: bool) { let (mut cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); - cluster.cfg.split.qps_threshold = 0; + cluster.cfg.split.qps_threshold = Some(0); cluster.cfg.split.split_balance_score = 2.0; cluster.cfg.split.split_contained_score = 2.0; cluster.cfg.split.detect_times = 1; From 251df183b0d089d01e629791124f70c3cbb6fdbf Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:24:39 +0800 Subject: [PATCH 020/220] raftstore-v2: reuse failpoint tests in async_io_test.rs (#15476) ref tikv/tikv#15409 reuse failpoint tests in async_io_test 
Signed-off-by: SpadeA-Tang --- .../raftstore-v2/src/operation/command/mod.rs | 1 + components/test_raftstore-v2/src/cluster.rs | 25 ++++++++++++++++++- components/test_raftstore/src/cluster.rs | 22 ++++++++-------- tests/failpoints/cases/test_async_io.rs | 17 ++++++++++--- .../cases/test_cmd_epoch_checker.rs | 9 +++---- tests/failpoints/cases/test_disk_full.rs | 20 +++++++-------- tests/failpoints/cases/test_merge.rs | 6 ++--- .../raftstore/test_joint_consensus.rs | 16 ++++-------- tests/integrations/raftstore/test_merge.rs | 6 ++--- 9 files changed, 74 insertions(+), 48 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0fd88cc987b..c39f2412f32 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -583,6 +583,7 @@ impl Apply { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); fail::fail_point!("on_handle_apply_1003", self.peer_id() == 1003, |_| {}); fail::fail_point!("on_handle_apply_2", self.peer_id() == 2, |_| {}); + fail::fail_point!("on_handle_apply", |_| {}); fail::fail_point!("on_handle_apply_store_1", self.store_id() == 1, |_| {}); let now = std::time::Instant::now(); let apply_wait_time = APPLY_TASK_WAIT_TIME_HISTOGRAM.local(); diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 08de4cc3aa1..8ede3290167 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -51,7 +51,7 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{ check_raft_cmd_request, is_error_response, new_admin_request, new_delete_cmd, - new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd, + new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd, new_put_cmd, new_region_detail_cmd, new_region_leader_cmd, new_request, new_status_request, 
new_store, new_tikv_config_with_api_ver, new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, RawEngine, @@ -1263,6 +1263,29 @@ impl, EK: KvEngine> Cluster { panic!("find no region for {}", log_wrappers::hex_encode_upper(key)); } + pub fn async_request( + &mut self, + mut req: RaftCmdRequest, + ) -> BoxFuture<'static, RaftCmdResponse> { + let region_id = req.get_header().get_region_id(); + let leader = self.leader_of_region(region_id).unwrap(); + req.mut_header().set_peer(leader.clone()); + self.sim + .wl() + .async_command_on_node(leader.get_store_id(), req) + } + + pub fn async_put( + &mut self, + key: &[u8], + value: &[u8], + ) -> Result> { + let mut region = self.get_region(key); + let reqs = vec![new_put_cmd(key, value)]; + let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false); + Ok(self.async_request(put)) + } + pub fn must_put(&mut self, key: &[u8], value: &[u8]) { self.must_put_cf(CF_DEFAULT, key, value); } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 23edf0efab1..e65028fe968 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -19,7 +19,7 @@ use engine_traits::{ WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; -use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture}; +use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture, StreamExt}; use kvproto::{ errorpb::Error as PbError, kvrpcpb::{ApiVersion, Context, DiskFullOpt}, @@ -51,7 +51,6 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; use tikv_util::{ - mpsc::future, thread_group::GroupProperties, time::{Instant, ThreadReadId}, worker::LazyWorker, @@ -969,7 +968,7 @@ impl Cluster { pub fn async_request( &mut self, req: RaftCmdRequest, - ) -> Result> { + ) -> Result> { self.async_request_with_opts(req, 
Default::default()) } @@ -977,21 +976,24 @@ impl Cluster { &mut self, mut req: RaftCmdRequest, opts: RaftCmdExtraOpts, - ) -> Result> { + ) -> Result> { let region_id = req.get_header().get_region_id(); let leader = self.leader_of_region(region_id).unwrap(); req.mut_header().set_peer(leader.clone()); - let (cb, rx) = make_cb(&req); + let (cb, mut rx) = make_cb(&req); self.sim .rl() .async_command_on_node_with_opts(leader.get_store_id(), req, cb, opts)?; - Ok(rx) + Ok(Box::pin(async move { + let fut = rx.next(); + fut.await.unwrap() + })) } pub fn async_exit_joint( &mut self, region_id: u64, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1007,7 +1009,7 @@ impl Cluster { &mut self, key: &[u8], value: &[u8], - ) -> Result> { + ) -> Result> { let mut region = self.get_region(key); let reqs = vec![new_put_cmd(key, value)]; let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false); @@ -1018,7 +1020,7 @@ impl Cluster { &mut self, region_id: u64, peer: metapb::Peer, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1031,7 +1033,7 @@ impl Cluster { &mut self, region_id: u64, peer: metapb::Peer, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); diff --git a/tests/failpoints/cases/test_async_io.rs b/tests/failpoints/cases/test_async_io.rs index 3d53b9c5f14..8ce349805b0 100644 --- a/tests/failpoints/cases/test_async_io.rs +++ b/tests/failpoints/cases/test_async_io.rs @@ -8,13 +8,15 @@ use std::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::HandyRwLock; // Test if the entries can be committed and applied on followers even when // leader's io is paused. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_async_io_commit_without_leader_persist() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.cmd_batch_concurrent_ready_max_count = 0; cluster.cfg.raft_store.store_io_pool_size = 2; let pd_client = Arc::clone(&cluster.pd_client); @@ -49,9 +51,10 @@ fn test_async_io_commit_without_leader_persist() { /// Test if the leader delays its destroy after applying conf change to /// remove itself. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_async_io_delay_destroy_after_conf_change() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.store_io_pool_size = 2; let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -93,6 +96,9 @@ fn test_async_io_delay_destroy_after_conf_change() { /// Test if the peer can be destroyed when it receives a tombstone msg and /// its snapshot is persisting. +/// +/// Note: snapshot flow is changed, so partitioned-raft-kv does not support this +/// test. #[test] fn test_async_io_cannot_destroy_when_persist_snapshot() { let mut cluster = new_node_cluster(0, 3); @@ -176,6 +182,9 @@ fn test_async_io_cannot_destroy_when_persist_snapshot() { } /// Test if the peer can handle ready when its snapshot is persisting. +/// +/// Note: snapshot flow is changed, so partitioned-raft-kv does not support this +/// test.
#[test] fn test_async_io_cannot_handle_ready_when_persist_snapshot() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index 73bc741d9bb..8af8e29f3ac 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -10,7 +10,7 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; -use tikv_util::{mpsc::future, HandyRwLock}; +use tikv_util::{future::block_on_timeout, mpsc::future, HandyRwLock}; struct CbReceivers { proposed: mpsc::Receiver<()>, @@ -399,9 +399,7 @@ fn test_accept_proposal_during_conf_change() { let conf_change_fp = "apply_on_conf_change_all_1"; fail::cfg(conf_change_fp, "pause").unwrap(); let mut add_peer_rx = cluster.async_add_peer(r, new_peer(2, 2)).unwrap(); - add_peer_rx - .recv_timeout(Duration::from_millis(100)) - .unwrap_err(); + block_on_timeout(add_peer_rx.as_mut(), Duration::from_millis(100)).unwrap_err(); // Conf change doesn't affect proposals. let write_req = make_write_req(&mut cluster, b"k"); @@ -419,8 +417,7 @@ fn test_accept_proposal_during_conf_change() { fail::remove(conf_change_fp); assert!( - !add_peer_rx - .recv_timeout(Duration::from_secs(1)) + !block_on_timeout(add_peer_rx, Duration::from_secs(1)) .unwrap() .get_header() .has_error() diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index bd4271be12d..217269bb5b8 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -86,8 +86,8 @@ fn test_disk_full_leader_behaviors(usage: DiskUsage) { // Test new normal proposals won't be allowed when disk is full. 
let old_last_index = cluster.raft_local_state(1, 1).last_index; - let mut rx = cluster.async_put(b"k2", b"v2").unwrap(); - assert_disk_full(&rx.recv_timeout(Duration::from_secs(2)).unwrap()); + let rx = cluster.async_put(b"k2", b"v2").unwrap(); + assert_disk_full(&block_on_timeout(rx, Duration::from_secs(2)).unwrap()); let new_last_index = cluster.raft_local_state(1, 1).last_index; assert_eq!(old_last_index, new_last_index); @@ -299,8 +299,8 @@ fn test_majority_disk_full() { } // Normal proposals will be rejected because of majority peers' disk full. - let mut ch = cluster.async_put(b"k2", b"v2").unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); + let ch = cluster.async_put(b"k2", b"v2").unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); // Proposals with special `DiskFullOpt`s can be accepted even if all peers are @@ -310,8 +310,8 @@ fn test_majority_disk_full() { let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); + let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap(); assert!(!resp.get_header().has_error()); // Reset disk full status for peer 2 and 3. 
2 follower reads must success @@ -335,8 +335,8 @@ fn test_majority_disk_full() { let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(10)).unwrap(); + let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(10)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); // Peer 2 disk usage changes from already full to almost full. @@ -354,8 +354,8 @@ fn test_majority_disk_full() { let put = new_request(1, epoch, reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); + let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![3]); for i in 0..3 { diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 3cc72d44da1..eb6b8a235e1 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -21,7 +21,7 @@ use raft::eraftpb::MessageType; use raftstore::store::*; use test_raftstore::*; use tikv::storage::{kv::SnapshotExt, Snapshot}; -use tikv_util::{config::*, time::Instant, HandyRwLock}; +use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is rollback as expected. 
@@ -1532,7 +1532,7 @@ fn test_retry_pending_prepare_merge_fail() { let mut rx = cluster.async_put(b"k1", b"v11").unwrap(); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); - rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); + block_on_timeout(rx.as_mut(), Duration::from_millis(200)).unwrap_err(); // Then, start merging. PrepareMerge should become pending because applied_index // is smaller than proposed_index. @@ -1546,7 +1546,7 @@ fail::cfg("disk_already_full_peer_1", "return").unwrap(); fail::cfg("disk_already_full_peer_2", "return").unwrap(); fail::remove("on_handle_apply"); - let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + let res = block_on_timeout(rx, Duration::from_secs(1)).unwrap(); assert!(!res.get_header().has_error(), "{:?}", res); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); diff --git a/tests/integrations/raftstore/test_joint_consensus.rs b/tests/integrations/raftstore/test_joint_consensus.rs index 282d0d0525c..55def7a099b 100644 --- a/tests/integrations/raftstore/test_joint_consensus.rs +++ b/tests/integrations/raftstore/test_joint_consensus.rs @@ -10,7 +10,7 @@ use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use raftstore::Result; use test_raftstore::*; -use tikv_util::{mpsc::future, store::find_peer}; +use tikv_util::{future::block_on_timeout, store::find_peer}; /// Tests multiple confchange commands can be done by one request #[test] @@ -164,24 +164,18 @@ fn test_request_in_joint_state() { // Isolated peer 2, so the old configuration can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(2)); - let mut rx = cluster + let rx = cluster .async_request(put_request(&region, 1, b"k3", b"v3")) .unwrap(); - assert_eq!( - rx.recv_timeout(Duration::from_millis(100)), - Err(future::RecvTimeoutError::Timeout) - ); + block_on_timeout(rx, Duration::from_millis(100)).unwrap_err(); cluster.clear_send_filters(); // Isolated peer 3, so the new
configuration can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(3)); - let mut rx = cluster + let rx = cluster .async_request(put_request(&region, 1, b"k4", b"v4")) .unwrap(); - assert_eq!( - rx.recv_timeout(Duration::from_millis(100)), - Err(future::RecvTimeoutError::Timeout) - ); + block_on_timeout(rx, Duration::from_millis(100)).unwrap_err(); cluster.clear_send_filters(); // Leave joint diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index afc0c9afab4..ceb888a2b22 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -14,7 +14,7 @@ use raftstore::store::{Callback, LocksStatus}; use test_raftstore::*; use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; -use tikv_util::{config::*, HandyRwLock}; +use tikv_util::{config::*, future::block_on_timeout, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is working as expected in a general condition. 
- let mut res = cluster.async_put(b"k1", b"new_val").unwrap(); + let res = cluster.async_put(b"k1", b"new_val").unwrap(); cluster.clear_send_filters(); - res.recv_timeout(Duration::from_secs(5)).unwrap(); + block_on_timeout(res, Duration::from_secs(5)).unwrap(); assert_eq!(cluster.must_get(b"k1").unwrap(), b"new_val"); } From 437a68d7daff44ad243d24cb5caeee9fc29b3a5a Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 1 Sep 2023 10:14:09 +0800 Subject: [PATCH 021/220] storage: avoid duplicated Instant::now (#15489) close tikv/tikv#15490 avoid duplicated Instant::now Signed-off-by: SpadeA-Tang --- src/storage/mod.rs | 171 ++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0d4679fbe18..cb4057bfd7e 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -97,7 +97,7 @@ use tikv_util::{ deadline::Deadline, future::try_poll, quota_limiter::QuotaLimiter, - time::{duration_to_ms, Instant, ThreadReadId}, + time::{duration_to_ms, duration_to_sec, Instant, ThreadReadId}, }; use tracker::{ clear_tls_tracker_token, set_tls_tracker_token, with_tls_tracker, TrackedFuture, TrackerToken, }; @@ -645,7 +645,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [key.as_encoded()])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); // The bypass_locks and access_locks set will be checked at most once. // `TsSet::vec` is more efficient here. 
@@ -697,12 +697,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); let read_bytes = key.len() + result @@ -765,7 +768,7 @@ impl Storage { ids: Vec, trackers: Vec, consumer: P, - begin_instant: tikv_util::time::Instant, + begin_instant: Instant, ) -> impl Future> { const CMD: CommandKind = CommandKind::batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read @@ -805,7 +808,7 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(requests.len() as f64); - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let read_id = Some(ThreadReadId::new()); let mut statistics = Statistics::default(); let mut req_snaps = vec![]; @@ -1019,7 +1022,7 @@ impl Storage { keys.iter().map(Key::as_encoded), )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1086,12 +1089,15 @@ impl Storage { (result, stats) }); metrics::tls_collect_scan_details(CMD, &stats); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); let 
read_bytes = stats.cf_statistics(CF_DEFAULT).flow_stats.read_bytes + stats.cf_statistics(CF_LOCK).flow_stats.read_bytes @@ -1217,7 +1223,7 @@ impl Storage { if reverse_scan { std::mem::swap(&mut start_key, &mut end_key); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1296,12 +1302,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); res.map_err(Error::from).map(|results| { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC @@ -1383,7 +1392,7 @@ impl Storage { // which resolves locks on regions, and boundary of regions will be out of range // of TiDB keys. 
- let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); concurrency_manager.update_max_ts(max_ts); let begin_instant = Instant::now(); @@ -1455,12 +1464,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(locks) }) @@ -1669,7 +1681,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1704,12 +1716,15 @@ impl Storage { &stats, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); r } } @@ -1776,7 +1791,7 @@ impl Storage { .map_err(Error::from)?; } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let read_id = Some(ThreadReadId::new()); let mut snaps = vec![]; for (mut req, id) in gets.into_iter().zip(ids) { @@ -1845,12 +1860,15 @@ impl Storage { } } + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - 
.observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(()) } .in_resource_metering_tag(resource_tag), @@ -1896,7 +1914,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, &keys)?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1947,12 +1965,15 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(stats.data.flow_stats.read_keys as f64); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(result) } } @@ -2028,7 +2049,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2140,7 +2161,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2205,7 +2226,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = 
Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2266,7 +2287,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let start_key = F::encode_raw_key_owned(start_key, None); let end_key = F::encode_raw_key_owned(end_key, None); @@ -2314,7 +2335,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2400,7 +2421,7 @@ impl Storage { [(Some(&start_key), end_key.as_ref())], )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2477,12 +2498,15 @@ impl Storage { .get(CMD) .observe(statistics.data.flow_stats.read_keys as f64); metrics::tls_collect_scan_details(CMD, &statistics); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); result } @@ -2542,7 +2566,7 @@ impl Storage { .map(|range| (Some(range.get_start_key()), Some(range.get_end_key()))), )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2640,12 +2664,15 @@ impl Storage { .get(CMD) .observe(statistics.data.flow_stats.read_keys as f64); metrics::tls_collect_scan_details(CMD, &statistics); + let now = 
Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(result) } } @@ -2690,7 +2717,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2725,12 +2752,15 @@ impl Storage { &stats, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); r } } @@ -2887,7 +2917,7 @@ impl Storage { range.set_end_key(end_key.into_encoded()); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2898,7 +2928,7 @@ impl Storage { let store = RawStore::new(snapshot, api_version); let cf = Self::rawkv_cf("", api_version)?; - let begin_instant = tikv_util::time::Instant::now(); + let begin_instant = Instant::now(); let mut stats = Vec::with_capacity(ranges.len()); let ret = store .raw_checksum_ranges(cf, &ranges, &mut stats) @@ -2913,12 +2943,15 @@ impl Storage { buckets.as_ref(), ); }); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed().as_secs_f64()); - 
SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed().as_secs_f64()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); ret } @@ -3695,7 +3728,7 @@ pub mod test_util { &self, id: u64, res: Result<(Option>, Statistics)>, - _: tikv_util::time::Instant, + _: Instant, _source: String, ) { self.data.lock().unwrap().push(GetResult { @@ -3706,13 +3739,7 @@ } impl ResponseBatchConsumer>> for GetConsumer { - fn consume( - &self, - id: u64, - res: Result>>, - _: tikv_util::time::Instant, - _source: String, - ) { + fn consume(&self, id: u64, res: Result>>, _: Instant, _source: String) { self.data.lock().unwrap().push(GetResult { id, res }); } } From 87b2fe35aefc0d12e53ea0a471b5d9a7cb8606c9 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 1 Sep 2023 10:29:09 +0800 Subject: [PATCH 022/220] resolved_ts: shrink resolver lock map (#15484) close tikv/tikv#15458 Resolver owns a hash map to track locks and unlock events, and uses it for calculating resolved ts. However, it does not shrink the map even after all locks are removed, which may result in OOM if there are transactions that modify many rows across many regions. The total memory usage is proportional to the number of modified rows. 
Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/resolved_ts/src/lib.rs | 1 + components/resolved_ts/src/resolver.rs | 129 ++++++++++++++++++++++++- 2 files changed, 127 insertions(+), 3 deletions(-) diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index eef1211a580..f9eeb7c8b70 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -14,6 +14,7 @@ #![feature(box_patterns)] #![feature(result_flattening)] +#![feature(let_chains)] #[macro_use] extern crate tikv_util; diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 1b0a07bf8e2..6bee5efd2f6 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp, collections::BTreeMap, sync::Arc}; +use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration}; use collections::{HashMap, HashSet}; use raftstore::store::RegionReadProgress; @@ -23,6 +23,8 @@ pub struct Resolver { locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. lock_ts_heap: BTreeMap>>, + // The last shrink time. + last_aggressive_shrink_time: Instant, // The timestamps that guarantees no more commit will happen before. resolved_ts: TimeStamp, // The highest index `Resolver` had been tracked @@ -93,6 +95,7 @@ impl Resolver { resolved_ts: TimeStamp::zero(), locks_by_key: HashMap::default(), lock_ts_heap: BTreeMap::new(), + last_aggressive_shrink_time: Instant::now_coarse(), read_progress, tracked_index: 0, min_ts: TimeStamp::zero(), @@ -161,6 +164,23 @@ impl Resolver { key.heap_size() + std::mem::size_of::() } + fn shrink_ratio(&mut self, ratio: usize, timestamp: Option) { + // HashMap load factor is 87% approximately, leave some margin to avoid + // frequent rehash. 
+ // + // See https://github.com/rust-lang/hashbrown/blob/v0.14.0/src/raw/mod.rs#L208-L220 + const MIN_SHRINK_RATIO: usize = 2; + if self.locks_by_key.capacity() + > self.locks_by_key.len() * cmp::max(MIN_SHRINK_RATIO, ratio) + { + self.locks_by_key.shrink_to_fit(); + } + if let Some(ts) = timestamp && let Some(lock_set) = self.lock_ts_heap.get_mut(&ts) + && lock_set.capacity() > lock_set.len() * cmp::max(MIN_SHRINK_RATIO, ratio) { + lock_set.shrink_to_fit(); + } + } + #[must_use] pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) -> bool { if let Some(index) = index { @@ -201,13 +221,22 @@ impl Resolver { self.region_id, ); - let entry = self.lock_ts_heap.get_mut(&start_ts); - if let Some(locked_keys) = entry { + let mut shrink_ts = None; + if let Some(locked_keys) = self.lock_ts_heap.get_mut(&start_ts) { + // Only shrink large set, because committing a small transaction is + // fast and shrink adds unnecessary overhead. + const SHRINK_SET_CAPACITY: usize = 256; + if locked_keys.capacity() > SHRINK_SET_CAPACITY { + shrink_ts = Some(start_ts); + } locked_keys.remove(key); if locked_keys.is_empty() { self.lock_ts_heap.remove(&start_ts); } } + // Use a large ratio to amortize the cost of rehash. + let shrink_ratio = 8; + self.shrink_ratio(shrink_ratio, shrink_ts); } /// Try to advance resolved ts. @@ -215,11 +244,20 @@ impl Resolver { /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. pub fn resolve(&mut self, min_ts: TimeStamp, now: Option) -> TimeStamp { + // Use a small ratio to shrink the memory usage aggressively. 
+ const AGGRESSIVE_SHRINK_RATIO: usize = 2; + const AGGRESSIVE_SHRINK_INTERVAL: Duration = Duration::from_secs(10); + if self.last_aggressive_shrink_time.saturating_elapsed() > AGGRESSIVE_SHRINK_INTERVAL { + self.shrink_ratio(AGGRESSIVE_SHRINK_RATIO, None); + self.last_aggressive_shrink_time = Instant::now_coarse(); + } + // The `Resolver` is stopped, not need to advance, just return the current // `resolved_ts` if self.stopped { return self.resolved_ts; } + // Find the min start ts. let min_lock = self.lock_ts_heap.keys().next().cloned(); let has_lock = min_lock.is_some(); @@ -407,4 +445,89 @@ mod tests { drop(resolver); assert_eq!(memory_quota.in_use(), 0); } + + #[test] + fn test_untrack_lock_shrink_ratio() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); + let mut key = vec![0; 16]; + let mut ts = TimeStamp::default(); + for _ in 0..1000 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + assert!( + resolver.locks_by_key.capacity() >= 1000, + "{}", + resolver.locks_by_key.capacity() + ); + + let mut ts = TimeStamp::default(); + for _ in 0..901 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + // shrink_to_fit may reserve some space in accordance with the resize + // policy, but it is expected to be less than 500. + assert!( + resolver.locks_by_key.capacity() < 500, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + + for _ in 0..99 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + assert!( + resolver.locks_by_key.capacity() < 100, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + + // Trigger aggressive shrink. 
+ resolver.last_aggressive_shrink_time = Instant::now_coarse() - Duration::from_secs(600); + resolver.resolve(TimeStamp::new(0), None); + assert!( + resolver.locks_by_key.capacity() == 0, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + } + + #[test] + fn test_untrack_lock_set_shrink_ratio() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); + let mut key = vec![0; 16]; + let ts = TimeStamp::new(1); + for i in 0..1000usize { + key[0..8].copy_from_slice(&i.to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + assert!( + resolver.lock_ts_heap[&ts].capacity() >= 1000, + "{}", + resolver.lock_ts_heap[&ts].capacity() + ); + + for i in 0..990usize { + key[0..8].copy_from_slice(&i.to_be_bytes()); + resolver.untrack_lock(&key, None); + } + // shrink_to_fit may reserve some space in accordance with the resize + // policy, but it is expected to be less than 100. + assert!( + resolver.lock_ts_heap[&ts].capacity() < 500, + "{}, {}", + resolver.lock_ts_heap[&ts].capacity(), + resolver.lock_ts_heap[&ts].len(), + ); + } } From 32c030dcdb54e81718bce98b79f056a38cde9a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Fri, 1 Sep 2023 10:45:39 +0800 Subject: [PATCH 023/220] raftstore: don't return is_witness while region not found (#15475) close tikv/tikv#15468 Return `RegionNotFound` while cannot find peer in the current store. 
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/worker/read.rs | 19 ++++++-- tests/failpoints/cases/test_witness.rs | 47 +++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 0c4641770be..5d6ede9c193 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -824,10 +824,21 @@ where return Ok(None); } - // Check witness - if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::IsWitness(region_id)); + match find_peer_by_id(&delegate.region, delegate.peer_id) { + // Check witness + Some(peer) => { + if peer.is_witness { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::IsWitness(region_id)); + } + } + // This (rarely) happen in witness disabled clusters while the conf change applied but + // region not removed. We shouldn't return `IsWitness` here because our client back off + // for a long time while encountering that. + None => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + return Err(Error::RegionNotFound(region_id)); + } } // Check non-witness hasn't finish applying snapshot yet. 
diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index 02411ba1b76..33a62f0532b 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -16,6 +16,7 @@ fn test_witness_update_region_in_local_reader() { cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); + assert_eq!(nodes[2], 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -64,6 +65,52 @@ fn test_witness_update_region_in_local_reader() { fail::remove("change_peer_after_update_region_store_3"); } +// This case is almost the same as `test_witness_update_region_in_local_reader`, +// but this omitted changing the peer to witness, for ensuring `peer_is_witness` +// won't be returned in a cluster without witnesses. +#[test] +fn test_witness_not_reported_while_disabled() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + assert_eq!(nodes[2], 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + + cluster.must_put(b"k0", b"v0"); + + // update region but the peer is not destroyed yet + fail::cfg("change_peer_after_update_region_store_3", "pause").unwrap(); + + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3.clone()); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + 
+ let resp = cluster + .read(None, request.clone(), Duration::from_millis(100)) + .unwrap(); + assert!(resp.get_header().has_error()); + assert!(!resp.get_header().get_error().has_is_witness()); + fail::remove("change_peer_after_update_region_store_3"); +} + // Test the case witness pull voter_replicated_index when has pending compact // cmd. #[test] From fa3892be7ff7acad80cdac19bbe2f5bb1423f8ac Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 1 Sep 2023 11:54:39 +0800 Subject: [PATCH 024/220] server: track grpc threads memory throughput (#15488) ref tikv/tikv#8235 Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/server/src/server.rs | 8 ++++++++ components/server/src/server2.rs | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 72f7b936956..8d44890e5a6 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -105,6 +105,7 @@ use tikv::{ Engine, Storage, }, }; +use tikv_alloc::{add_thread_memory_accessor, remove_thread_memory_accessor}; use tikv_util::{ check_environment_variables, config::VersionTrack, @@ -294,6 +295,13 @@ where EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .after_start(|| { + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. 
+ unsafe { add_thread_memory_accessor() }; + }) + .before_stop(|| { + remove_thread_memory_accessor(); + }) .build(), ); let pd_client = TikvServerCore::connect_to_pd_cluster( diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 1289ffe848d..2593035618d 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -103,6 +103,7 @@ use tikv::{ Engine, Storage, }, }; +use tikv_alloc::{add_thread_memory_accessor, remove_thread_memory_accessor}; use tikv_util::{ check_environment_variables, config::VersionTrack, @@ -289,6 +290,13 @@ where EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .after_start(|| { + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. + unsafe { add_thread_memory_accessor() }; + }) + .before_stop(|| { + remove_thread_memory_accessor(); + }) .build(), ); let pd_client = TikvServerCore::connect_to_pd_cluster( From a56fe6abdccdf98657eb880f1b55792bbabb29ac Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 4 Sep 2023 12:53:13 +0800 Subject: [PATCH 025/220] raftstore-v2: fix panic of dynamic changing write-buffer-limit (#15504) close tikv/tikv#15503 fix panic of dynamic changing write-buffer-limit Signed-off-by: SpadeA-Tang --- components/engine_panic/src/db_options.rs | 4 ++++ components/engine_rocks/src/db_options.rs | 8 ++++++++ components/engine_traits/src/db_options.rs | 1 + src/config/mod.rs | 13 ++++++++++--- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index c081a5c1d12..05147ca06fb 100644 --- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -40,6 +40,10 @@ impl DbOptions for PanicDbOptions { panic!() } + fn get_flush_size(&self) -> Result { + panic!() + } + fn set_rate_limiter_auto_tuned(&mut self, 
rate_limiter_auto_tuned: bool) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index c9ef2cfda98..38587663084 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -100,6 +100,14 @@ impl DbOptions for RocksDbOptions { Ok(()) } + fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { if let Some(m) = self.0.get_write_buffer_manager() { m.set_flush_oldest_first(f); diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index 2c6e9c3d4e8..9713c406978 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -21,6 +21,7 @@ pub trait DbOptions { fn get_rate_limiter_auto_tuned(&self) -> Option; fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()>; fn set_flush_size(&mut self, f: usize) -> Result<()>; + fn get_flush_size(&self) -> Result; fn set_flush_oldest_first(&mut self, f: bool) -> Result<()>; fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); } diff --git a/src/config/mod.rs b/src/config/mod.rs index 8c0c04957b1..be2a52d9b07 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -110,7 +110,7 @@ const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.2; // Too large will increase Raft Engine memory usage. 
const WRITE_BUFFER_MEMORY_LIMIT_MAX: u64 = ReadableSize::gb(8).0; -const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: u64 = ReadableSize::mb(32).0; +const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(32); /// Configs that actually took effect in the last run pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; @@ -1406,7 +1406,7 @@ impl DbConfig { .get_or_insert(ReadableSize::mb(4)); self.lockcf .write_buffer_limit - .get_or_insert(ReadableSize::mb(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT)); + .get_or_insert(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT); } } } @@ -2061,7 +2061,8 @@ impl ConfigManager for DbConfigManger { .drain_filter(|(name, _)| name == "write_buffer_limit") .next() { - self.db.set_flush_size(size.1.into())?; + let size: ReadableSize = size.1.into(); + self.db.set_flush_size(size.0 as usize)?; } if let Some(f) = change @@ -5200,6 +5201,12 @@ mod tests { ReadableSize::mb(128).0 as i64 ); + cfg_controller + .update_config("rocksdb.write-buffer-limit", "10MB") + .unwrap(); + let flush_size = db.get_db_options().get_flush_size().unwrap(); + assert_eq!(flush_size, ReadableSize::mb(10).0); + // update some configs on default cf let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), false); From 280b39c1fa0ec4bf85dae06561f2f792bf826e6a Mon Sep 17 00:00:00 2001 From: qupeng Date: Mon, 4 Sep 2023 15:44:13 +0800 Subject: [PATCH 026/220] cdc: enhance deregister protocol (#15485) close tikv/tikv#15487 Signed-off-by: qupeng --- components/cdc/src/endpoint.rs | 97 ++++++++++++++++++++++++++++++++-- components/cdc/src/service.rs | 23 ++++++-- 2 files changed, 112 insertions(+), 8 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 72042bb5aec..969d0cba0d9 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -80,6 +80,11 @@ pub enum Deregister { conn_id: ConnId, request_id: u64, }, + Region { + conn_id: ConnId, + request_id: u64, + region_id: u64, + 
}, Downstream { conn_id: ConnId, request_id: u64, @@ -112,6 +117,16 @@ impl fmt::Debug for Deregister { .field("conn_id", conn_id) .field("request_id", request_id) .finish(), + Deregister::Region { + ref conn_id, + ref request_id, + ref region_id, + } => de + .field("deregister", &"region") + .field("conn_id", conn_id) + .field("request_id", request_id) + .field("region_id", region_id) + .finish(), Deregister::Downstream { ref conn_id, ref request_id, @@ -583,8 +598,20 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { let conn = self.connections.get_mut(&conn_id).unwrap(); - for (region, downstream) in conn.unsubscribe_request(request_id) { - self.deregister_downstream(region, downstream, None); + for (region_id, downstream) in conn.unsubscribe_request(request_id) { + let err = Some(Error::Other("region not found".into())); + self.deregister_downstream(region_id, downstream, err); + } + } + Deregister::Region { + conn_id, + request_id, + region_id, + } => { + let conn = self.connections.get_mut(&conn_id).unwrap(); + if let Some(downstream) = conn.unsubscribe(request_id, region_id) { + let err = Some(Error::Other("region not found".into())); + self.deregister_downstream(region_id, downstream, err); } } Deregister::Downstream { @@ -1248,13 +1275,12 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable for Endpoint { fn on_timeout(&mut self) { - CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); - // Reclaim resolved_region_heap memory. 
self.resolved_region_heap .borrow_mut() .reset_and_shrink_to(self.capture_regions.len()); + CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); CDC_CAPTURED_REGION_COUNT.set(self.capture_regions.len() as i64); CDC_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["unresolved"]) @@ -1262,6 +1288,7 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable CDC_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["resolved"]) .set(self.resolved_region_count as _); + if self.min_resolved_ts != TimeStamp::max() { CDC_MIN_RESOLVED_TS_REGION.set(self.min_ts_region_id as i64); CDC_MIN_RESOLVED_TS.set(self.min_resolved_ts.physical() as i64); @@ -2841,5 +2868,67 @@ mod tests { })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); assert_eq!(suite.capture_regions.len(), 0); + for _ in 0..2 { + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + let check = matches!(cdc_event.0, CdcEvent::Event(e) if { + matches!(e.event, Some(Event_oneof_event::Error(ref err)) if { + err.has_region_not_found() + }) + }); + assert!(check); + } + + // Resubscribe the region. + suite.add_region(2, 100); + for i in 1..=2 { + req.set_request_id(1); + req.set_region_id(i); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + 1, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + }); + assert_eq!(suite.connections[&conn_id].downstreams_count(), i as usize); + } + + // Deregister regions one by one in the request. 
+ suite.run(Task::Deregister(Deregister::Region { + conn_id, + request_id: 1, + region_id: 1, + })); + assert_eq!(suite.connections[&conn_id].downstreams_count(), 1); + assert_eq!(suite.capture_regions.len(), 1); + + suite.run(Task::Deregister(Deregister::Region { + conn_id, + request_id: 1, + region_id: 2, + })); + assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); + assert_eq!(suite.capture_regions.len(), 0); + + for _ in 0..2 { + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + let check = matches!(cdc_event.0, CdcEvent::Event(e) if { + matches!(e.event, Some(Event_oneof_event::Error(ref err)) if { + err.has_region_not_found() + }) + }); + assert!(check); + } } } diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index 7478e3afbad..7cbf268f2b7 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -304,6 +304,13 @@ impl Service { scheduler.schedule(task).map_err(|e| format!("{:?}", e)) } + // ### Command types: + // * Register registers a region. 1) both `request_id` and `region_id` must be + // specified; 2) `request_id` can be 0 but `region_id` can not. + // * Deregister deregisters some regions in one same `request_id` or just one + // region. 1) if both `request_id` and `region_id` are specified, just + // deregister the region; 2) if only `request_id` is specified, all region + // subscriptions with the same `request_id` will be deregistered. 
fn handle_request( scheduler: &Scheduler, peer: &str, @@ -361,10 +368,18 @@ impl Service { request: ChangeDataRequest, conn_id: ConnId, ) -> Result<(), String> { - let task = Task::Deregister(Deregister::Request { - conn_id, - request_id: request.request_id, - }); + let task = if request.region_id != 0 { + Task::Deregister(Deregister::Region { + conn_id, + request_id: request.request_id, + region_id: request.region_id, + }) + } else { + Task::Deregister(Deregister::Request { + conn_id, + request_id: request.request_id, + }) + }; scheduler.schedule(task).map_err(|e| format!("{:?}", e)) } From 1cd6dda7d351ed969811ebdea1a52f30c97d7094 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 4 Sep 2023 16:14:15 +0800 Subject: [PATCH 027/220] raftstore-v2: reuse failpoint tests in test_early_apply.rs (#15501) ref tikv/tikv#15409 reuse failpoint tests in test_early_apply Signed-off-by: SpadeA-Tang --- components/test_raftstore/src/util.rs | 8 +++---- tests/failpoints/cases/test_early_apply.rs | 22 +++++++++++++------ tests/failpoints/cases/test_split_region.rs | 2 +- tests/failpoints/cases/test_stale_read.rs | 2 +- .../raftstore/test_early_apply.rs | 4 ++-- .../integrations/raftstore/test_lease_read.rs | 2 +- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index f63c69f9631..e88df1fb0ca 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -668,11 +668,11 @@ pub fn create_test_engine( ) } -pub fn configure_for_request_snapshot(cluster: &mut Cluster) { +pub fn configure_for_request_snapshot(config: &mut Config) { // We don't want to generate snapshots due to compact log. 
- cluster.cfg.raft_store.raft_log_gc_threshold = 1000; - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); - cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); + config.raft_store.raft_log_gc_threshold = 1000; + config.raft_store.raft_log_gc_count_limit = Some(1000); + config.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); } pub fn configure_for_hibernate(config: &mut Config) { diff --git a/tests/failpoints/cases/test_early_apply.rs b/tests/failpoints/cases/test_early_apply.rs index a194ef74d8f..bf403fb4668 100644 --- a/tests/failpoints/cases/test_early_apply.rs +++ b/tests/failpoints/cases/test_early_apply.rs @@ -7,14 +7,16 @@ use std::sync::{ use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; // Test if a singleton can apply a log before persisting it. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_singleton_cannot_early_apply() { - let mut cluster = new_node_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); // Put one key first to cache leader. @@ -33,13 +35,14 @@ fn test_singleton_cannot_early_apply() { must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_multi_early_apply() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.pd_client.disable_default_operator(); cluster.cfg.raft_store.store_batch_system.pool_size = 1; // So compact log will not be triggered automatically. 
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run_conf_change(); // Check mixed regions can be scheduled correctly. @@ -68,9 +71,11 @@ fn test_multi_early_apply() { })), )); cluster.async_put(b"k4", b"v4").unwrap(); - // Sleep a while so that follower will send append response. + // Sleep a while so that follower will send append response sleep_ms(100); cluster.async_put(b"k11", b"v22").unwrap(); + // Sleep a while so that follower will send append response. + sleep_ms(100); // Now the store thread of store 1 pauses on `store_1_fp`. // Set `store_1_fp` again to make this store thread does not pause on it. // Then leader 1 will receive the append response and commit the log. @@ -92,6 +97,9 @@ fn test_multi_early_apply() { /// the peer to fix this issue. /// For simplicity, this test uses region merge to ensure that the apply state /// will be written to kv db before crash. +/// +/// Note: partitioned-raft-kv does not need this due to change in disk +/// persistence logic #[test] fn test_early_apply_yield_followed_with_many_entries() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index dfd7002495c..ed01386b528 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -752,7 +752,7 @@ impl Filter for CollectSnapshotFilter { #[test] fn test_split_duplicated_batch() { let mut cluster = new_node_cluster(0, 3); - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); // Disable raft log gc in this test case. cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); // Use one thread to make it more possible to be fetched into one batch. 
diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs index 523bb54f7cb..a9c6fa5d6e6 100644 --- a/tests/failpoints/cases/test_stale_read.rs +++ b/tests/failpoints/cases/test_stale_read.rs @@ -325,7 +325,7 @@ fn test_read_index_when_transfer_leader_2() { // Increase the election tick to make this test case running reliably. configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); // Stop log compaction to transfer leader with filter easier. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); let max_lease = Duration::from_secs(2); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); diff --git a/tests/integrations/raftstore/test_early_apply.rs b/tests/integrations/raftstore/test_early_apply.rs index b30a861e2fe..44537e8b409 100644 --- a/tests/integrations/raftstore/test_early_apply.rs +++ b/tests/integrations/raftstore/test_early_apply.rs @@ -109,7 +109,7 @@ fn test_early_apply(mode: DataLost) { let mut cluster = new_node_cluster(0, 3); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); if mode == DataLost::LeaderCommit || mode == DataLost::AllLost { cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -175,7 +175,7 @@ fn test_update_internal_apply_index() { let mut cluster = new_node_cluster(0, 4); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. 
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(3, 3)); cluster.must_put(b"k1", b"v1"); diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 60c87fd4e00..abf17e01e9d 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -481,7 +481,7 @@ fn test_read_index_stale_in_suspect_lease() { configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let max_lease = Duration::from_secs(2); // Stop log compaction to transfer leader with filter easier. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); cluster.pd_client.disable_default_operator(); From 640143a2daba90bfcc9a3848d19887a7a2f39170 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Mon, 4 Sep 2023 17:48:43 +0800 Subject: [PATCH 028/220] raftstore: region initial size depends on the split resource . (#15456) close tikv/tikv#15457 there are three triggers will split the regions: 1. load split include sizekeys, load etc. In this cases, the new region should contains the data after split. 2. tidb split tables or partition table, such like `create table test.t1(id int,b int) shard_row_id_bits=4 partition by hash(id) partitions 2000`. In this cases , the new region shouldn't contains any data after split. 
Signed-off-by: bufferflies <1045931706@qq.com> --- Cargo.lock | 2 +- .../src/operation/command/admin/split.rs | 33 ++++++++-- components/raftstore-v2/src/operation/pd.rs | 2 + components/raftstore-v2/src/router/imp.rs | 2 +- components/raftstore-v2/src/router/message.rs | 3 + components/raftstore-v2/src/worker/pd/mod.rs | 11 +++- .../raftstore-v2/src/worker/pd/region.rs | 1 + .../raftstore-v2/src/worker/pd/split.rs | 8 +++ components/raftstore/src/router.rs | 1 + components/raftstore/src/store/fsm/apply.rs | 7 ++ components/raftstore/src/store/fsm/peer.rs | 43 ++++++++++--- components/raftstore/src/store/msg.rs | 1 + components/raftstore/src/store/worker/pd.rs | 19 ++++++ components/test_raftstore/src/cluster.rs | 1 + src/server/raftkv/raft_extension.rs | 1 + src/server/raftkv2/raft_extension.rs | 2 +- tests/failpoints/cases/test_split_region.rs | 64 +++++++++++++++++++ 17 files changed, 181 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 162d1f3ae07..4cd0882628b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2979,7 +2979,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#7b612d935bf96f9daf7a537db379bcc88b4644e0" +source = "git+https://github.com/pingcap/kvproto.git#ecdbf1f8c130089392a9bb5f86f7577deddfbed5" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index c744c1b9161..0f9cae7218d 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -75,6 +75,9 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, + // new regions will share the region size if it's true. + // otherwise, the new region's size will be 0. 
+ pub share_source_region_size: bool, // Hack: in common case we should use generic, but split is an infrequent // event that performance is not critical. And using `Any` can avoid polluting // all existing code. @@ -148,6 +151,9 @@ pub struct RequestSplit { pub epoch: RegionEpoch, pub split_keys: Vec>, pub source: Cow<'static, str>, + // new regions will share the region size if it's true. + // otherwise, the new region's size will be 0. + pub share_source_region_size: bool, } #[derive(Debug)] @@ -235,6 +241,7 @@ impl Peer { { return true; } + fail_point!("on_split_region_check_tick", |_| true); if ctx.schedulers.split_check.is_busy() { return false; } @@ -336,7 +343,7 @@ impl Peer { ch.set_result(cmd_resp::new_error(e)); return; } - self.ask_batch_split_pd(ctx, rs.split_keys, ch); + self.ask_batch_split_pd(ctx, rs.split_keys, rs.share_source_region_size, ch); } pub fn on_request_half_split( @@ -479,6 +486,7 @@ impl Apply { let derived_req = &[derived_req]; let right_derive = split_reqs.get_right_derive(); + let share_source_region_size = split_reqs.get_share_source_region_size(); let reqs = if right_derive { split_reqs.get_requests().iter().chain(derived_req) } else { @@ -615,6 +623,7 @@ impl Apply { derived_index, tablet_index: log_index, tablet: Box::new(tablet), + share_source_region_size, }), )) } @@ -665,6 +674,7 @@ impl Peer { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); let derived = &res.regions[res.derived_index]; + let share_source_region_size = res.share_source_region_size; let region_id = derived.get_id(); let region_locks = self.txn_context().split(&res.regions, derived); @@ -695,8 +705,14 @@ impl Peer { let new_region_count = res.regions.len() as u64; let control = self.split_flow_control_mut(); - let estimated_size = control.approximate_size.map(|v| v / new_region_count); - let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + // if share_source_region_size is true, it means the new region contains any 
+ // data from the origin region. + let mut share_size = None; + let mut share_keys = None; + if share_source_region_size { + share_size = control.approximate_size.map(|v| v / new_region_count); + share_keys = control.approximate_keys.map(|v| v / new_region_count); + } self.post_split(); @@ -714,8 +730,11 @@ impl Peer { // After split, the peer may need to update its metrics. let control = self.split_flow_control_mut(); control.may_skip_split_check = false; - control.approximate_size = estimated_size; - control.approximate_keys = estimated_keys; + if share_source_region_size { + control.approximate_size = share_size; + control.approximate_keys = share_keys; + } + self.add_pending_tick(PeerTick::SplitRegionCheck); } self.storage_mut().set_has_dirty_data(true); @@ -760,8 +779,8 @@ impl Peer { derived_region_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, - approximate_size: estimated_size, - approximate_keys: estimated_keys, + approximate_size: share_size, + approximate_keys: share_keys, locks, })); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 817b3aa6eb6..9bce8f3ba02 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -215,6 +215,7 @@ impl Peer { &self, ctx: &StoreContext, split_keys: Vec>, + share_source_region_size: bool, ch: CmdResChannel, ) { let task = pd::Task::AskBatchSplit { @@ -222,6 +223,7 @@ impl Peer { split_keys, peer: self.peer().clone(), right_derive: ctx.cfg.right_derive_when_split, + share_source_region_size, ch, }; if let Err(e) = ctx.schedulers.pd.schedule(task) { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 9c6cca96ae4..23a8a3c7d4e 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -58,7 +58,7 @@ impl raftstore::coprocessor::StoreHandle for Store split_keys: Vec>, source: 
Cow<'static, str>, ) { - let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string()); + let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string(), true); let res = self.send(region_id, msg); if let Err(e) = res { warn!( diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index f09314b4f17..2d364af44e1 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -315,6 +315,7 @@ impl PeerMsg { epoch: metapb::RegionEpoch, split_keys: Vec>, source: String, + share_source_region_size: bool, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -323,6 +324,7 @@ impl PeerMsg { epoch, split_keys, source: source.into(), + share_source_region_size, }, ch, }, @@ -344,6 +346,7 @@ impl PeerMsg { epoch, split_keys, source: source.into(), + share_source_region_size: false, }, ch, }, diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index f89ea75b604..061a5ad5126 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -70,6 +70,7 @@ pub enum Task { split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: CmdResChannel, }, ReportBatchSplit { @@ -324,7 +325,15 @@ where peer, right_derive, ch, - } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), + share_source_region_size, + } => self.handle_ask_batch_split( + region, + split_keys, + peer, + right_derive, + share_source_region_size, + ch, + ), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), Task::UpdateMaxTimestamp { diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index e825dd54c32..763e12fff07 100644 --- 
a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -288,6 +288,7 @@ where epoch, split_keys: split_region.take_keys().into(), source: "pd".into(), + share_source_region_size: false, }, ch, } diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index bf13e01120a..7fec5a31bb6 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -17,10 +17,13 @@ fn new_batch_split_region_request( split_keys: Vec>, ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::BatchSplit); req.mut_splits().set_right_derive(right_derive); + req.mut_splits() + .set_share_source_region_size(share_source_region_size); let mut requests = Vec::with_capacity(ids.len()); for (mut id, key) in ids.into_iter().zip(split_keys) { let mut split = SplitRequest::default(); @@ -46,6 +49,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: CmdResChannel, ) { Self::ask_batch_split_imp( @@ -57,6 +61,7 @@ where split_keys, peer, right_derive, + share_source_region_size, Some(ch), ); } @@ -70,6 +75,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: Option, ) { if split_keys.is_empty() { @@ -98,6 +104,7 @@ where split_keys, resp.take_ids().into(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -148,6 +155,7 @@ where vec![split_key], split_info.peer, true, + false, None, ); // Try to split the region on half within the given key diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 3a76a5ad26f..09f389a2230 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -331,6 +331,7 @@ impl 
crate::coprocessor::StoreHandle for RaftRoute split_keys, callback: Callback::None, source, + share_source_region_size: true, }, ) { warn!( diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 0bc1ccf7d85..c170e5a35f9 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -272,6 +272,7 @@ pub enum ExecResult { regions: Vec, derived: Region, new_split_regions: HashMap, + share_source_region_size: bool, }, PrepareMerge { region: Region, @@ -2516,6 +2517,9 @@ where admin_req .mut_splits() .set_right_derive(split.get_right_derive()); + admin_req + .mut_split() + .set_share_source_region_size(split.get_share_source_region_size()); admin_req.mut_splits().mut_requests().push(split); // This method is executed only when there are unapplied entries after being // restarted. So there will be no callback, it's OK to return a response @@ -2560,6 +2564,7 @@ where derived.mut_region_epoch().set_version(new_version); let right_derive = split_reqs.get_right_derive(); + let share_source_region_size = split_reqs.get_share_source_region_size(); let mut regions = Vec::with_capacity(new_region_cnt + 1); // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. 
@@ -2724,6 +2729,7 @@ where regions, derived, new_split_regions, + share_source_region_size, }), )) } @@ -7088,6 +7094,7 @@ mod tests { regions, derived: _, new_split_regions: _, + share_source_region_size: _, } = apply_res.exec_res.front().unwrap() { let r8 = regions.get(0).unwrap(); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 62a3a2650de..9f7934e806e 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1049,8 +1049,15 @@ where split_keys, callback, source, + share_source_region_size, } => { - self.on_prepare_split_region(region_epoch, split_keys, callback, &source); + self.on_prepare_split_region( + region_epoch, + split_keys, + callback, + &source, + share_source_region_size, + ); } CasualMessage::ComputeHashResult { index, @@ -4042,6 +4049,7 @@ where derived: metapb::Region, regions: Vec, new_split_regions: HashMap, + share_source_region_size: bool, ) { fail_point!("on_split", self.ctx.store_id() == 3, |_| {}); @@ -4063,8 +4071,15 @@ where // Roughly estimate the size and keys for new regions. 
let new_region_count = regions.len() as u64; - let estimated_size = self.fsm.peer.approximate_size.map(|v| v / new_region_count); - let estimated_keys = self.fsm.peer.approximate_keys.map(|v| v / new_region_count); + let mut share_size = None; + let mut share_keys = None; + // if share_source_region_size is true, it means the new regions may contain + // data from the origin region + if share_source_region_size { + share_size = self.fsm.peer.approximate_size.map(|v| v / new_region_count); + share_keys = self.fsm.peer.approximate_keys.map(|v| v / new_region_count); + } + let mut meta = self.ctx.store_meta.lock().unwrap(); meta.set_region( &self.ctx.coprocessor_host, @@ -4079,8 +4094,10 @@ where let is_leader = self.fsm.peer.is_leader(); if is_leader { - self.fsm.peer.approximate_size = estimated_size; - self.fsm.peer.approximate_keys = estimated_keys; + if share_source_region_size { + self.fsm.peer.approximate_size = share_size; + self.fsm.peer.approximate_keys = share_keys; + } self.fsm.peer.heartbeat_pd(self.ctx); // Notify pd immediately to let it update the region meta. info!( @@ -4215,8 +4232,8 @@ where new_peer.has_ready |= campaigned; if is_leader { - new_peer.peer.approximate_size = estimated_size; - new_peer.peer.approximate_keys = estimated_keys; + new_peer.peer.approximate_size = share_size; + new_peer.peer.approximate_keys = share_keys; + *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. 
@@ -5043,7 +5060,13 @@ where derived, regions, new_split_regions, - } => self.on_ready_split_region(derived, regions, new_split_regions), + share_source_region_size, + } => self.on_ready_split_region( + derived, + regions, + new_split_regions, + share_source_region_size, + ), ExecResult::PrepareMerge { region, state } => { self.on_ready_prepare_merge(region, state) } @@ -5768,7 +5791,7 @@ where return; } - fail_point!("on_split_region_check_tick"); + fail_point!("on_split_region_check_tick", |_| {}); self.register_split_region_check_tick(); // To avoid frequent scan, we only add new scan tasks if all previous tasks @@ -5828,6 +5851,7 @@ where split_keys: Vec>, cb: Callback, source: &str, + share_source_region_size: bool, ) { info!( "on split"; @@ -5873,6 +5897,7 @@ where split_keys, peer: self.fsm.peer.peer.clone(), right_derive: self.ctx.cfg.right_derive_when_split, + share_source_region_size, callback: cb, }; if let Err(ScheduleError::Stopped(t)) = self.ctx.pd_scheduler.schedule(task) { diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 1ed8934e0f0..64c5be6d7e1 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -545,6 +545,7 @@ pub enum CasualMessage { split_keys: Vec>, callback: Callback, source: Cow<'static, str>, + share_source_region_size: bool, }, /// Hash result of ComputeHash command. diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index e8c8e2f575b..32fbdbc3145 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -141,6 +141,7 @@ where peer: metapb::Peer, // If true, right Region derives origin region_id. right_derive: bool, + share_source_region_size: bool, callback: Callback, }, AskBatchSplit { @@ -149,6 +150,7 @@ where peer: metapb::Peer, // If true, right Region derives origin region_id. 
right_derive: bool, + share_source_region_size: bool, callback: Callback, }, AutoSplit { @@ -1066,6 +1068,7 @@ where split_key: Vec, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, callback: Callback, task: String, ) { @@ -1087,6 +1090,7 @@ where resp.get_new_region_id(), resp.take_new_peer_ids(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -1121,6 +1125,7 @@ where mut split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, callback: Callback, task: String, remote: Remote, @@ -1146,6 +1151,7 @@ where split_keys, resp.take_ids().into(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -1174,6 +1180,7 @@ where split_key: split_keys.pop().unwrap(), peer, right_derive, + share_source_region_size, callback, }; if let Err(ScheduleError::Stopped(t)) = scheduler.schedule(task) { @@ -1645,6 +1652,7 @@ where split_keys: split_region.take_keys().into(), callback: Callback::None, source: "pd".into(), + share_source_region_size: false, } } else { CasualMessage::HalfSplitRegion { @@ -2048,12 +2056,14 @@ where split_key, peer, right_derive, + share_source_region_size, callback, } => self.handle_ask_split( region, split_key, peer, right_derive, + share_source_region_size, callback, String::from("ask_split"), ), @@ -2062,6 +2072,7 @@ where split_keys, peer, right_derive, + share_source_region_size, callback, } => Self::handle_ask_batch_split( self.router.clone(), @@ -2071,6 +2082,7 @@ where split_keys, peer, right_derive, + share_source_region_size, callback, String::from("batch_split"), self.remote.clone(), @@ -2095,6 +2107,7 @@ where vec![split_key], split_info.peer, true, + false, Callback::None, String::from("auto_split"), remote.clone(), @@ -2385,6 +2398,7 @@ fn new_split_region_request( new_region_id: u64, peer_ids: Vec, right_derive: bool, + share_source_region_size: 
bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::Split); @@ -2392,6 +2406,8 @@ fn new_split_region_request( req.mut_split().set_new_region_id(new_region_id); req.mut_split().set_new_peer_ids(peer_ids); req.mut_split().set_right_derive(right_derive); + req.mut_split() + .set_share_source_region_size(share_source_region_size); req } @@ -2399,10 +2415,13 @@ fn new_batch_split_region_request( split_keys: Vec>, ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::BatchSplit); req.mut_splits().set_right_derive(right_derive); + req.mut_splits() + .set_share_source_region_size(share_source_region_size); let mut requests = Vec::with_capacity(ids.len()); for (mut id, key) in ids.into_iter().zip(split_keys) { let mut split = SplitRequest::default(); diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index e65028fe968..26fa2a47d5f 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1463,6 +1463,7 @@ impl Cluster { split_keys: vec![split_key], callback: cb, source: "test".into(), + share_source_region_size: false, }, ) .unwrap(); diff --git a/src/server/raftkv/raft_extension.rs b/src/server/raftkv/raft_extension.rs index d3178842489..733d60c838c 100644 --- a/src/server/raftkv/raft_extension.rs +++ b/src/server/raftkv/raft_extension.rs @@ -121,6 +121,7 @@ where split_keys, callback: raftstore::store::Callback::write(cb), source: source.into(), + share_source_region_size: false, }; let res = self.router.send_casual_msg(region_id, req); Box::pin(async move { diff --git a/src/server/raftkv2/raft_extension.rs b/src/server/raftkv2/raft_extension.rs index f2f433999b9..f6bb66e9e11 100644 --- a/src/server/raftkv2/raft_extension.rs +++ b/src/server/raftkv2/raft_extension.rs @@ -71,7 +71,7 @@ impl tikv_kv::RaftExtension for Extension split_keys: 
Vec>, source: String, ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { - let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source); + let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source, true); let res = self.router.check_send(region_id, msg); Box::pin(async move { res?; diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index ed01386b528..65c50793d7a 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -17,6 +17,7 @@ use kvproto::{ Mutation, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, }, metapb::Region, + pdpb::CheckPolicy, raft_serverpb::{PeerState, RaftMessage}, tikvpb::TikvClient, }; @@ -31,6 +32,7 @@ use raftstore::{ Result, }; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{ config::{ReadableDuration, ReadableSize}, @@ -346,6 +348,68 @@ impl Filter for PrevoteRangeFilter { } } +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_region_size_after_split() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_size = 1440; + let region_split_size = 960; + cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // insert 20 key value pairs into the cluster. 
+ // from 000000001 to 000000020 + let mut range = 1..; + put_till_size(&mut cluster, region_max_size - 100, &mut range); + sleep_ms(100); + // disable check split. + fail::cfg("on_split_region_check_tick", "return").unwrap(); + let max_key = put_till_size(&mut cluster, region_max_size, &mut range); + // split by use key, split region 1 to region 1 and region 2. + // region 1: ["000000010",""] + // region 2: ["","000000010") + let region = pd_client.get_region(&max_key).unwrap(); + cluster.must_split(®ion, b"000000010"); + let size = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert!(size >= region_max_size - 100, "{}", size); + + let region = pd_client.get_region(b"000000009").unwrap(); + let size1 = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert_eq!(0, size1, "{}", size1); + + // split region by size check, the region 1 will be split to region 1 and region + // 3. and the region3 will contains one half region size data. + let region = pd_client.get_region(&max_key).unwrap(); + pd_client.split_region(region.clone(), CheckPolicy::Scan, vec![]); + sleep_ms(200); + let size2 = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert!(size > size2, "{}:{}", size, size2); + fail::remove("on_split_region_check_tick"); + + let region = pd_client.get_region(b"000000010").unwrap(); + let size3 = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert!(size3 > 0, "{}", size3); +} + // Test if a peer is created from splitting when another initialized peer with // the same region id has already existed. 
In previous implementation, it can be // created and panic will happen because there are two initialized peer with the From 02061bec4b8c2520eb2d5b003c064e3cd1a76a21 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 5 Sep 2023 15:09:43 +0800 Subject: [PATCH 029/220] raftstore-v2: limit the flush times during server stop (#15511) ref tikv/tikv#15461 limit the flush times during server stop Signed-off-by: SpadeA-Tang --- components/engine_traits/src/flush.rs | 2 +- .../src/operation/ready/apply_trace.rs | 15 ++++++++-- .../integrations/raftstore/test_bootstrap.rs | 30 +++++++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index d0f9f892f34..9344e84bb4e 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -234,7 +234,7 @@ impl PersistenceListener { /// /// `largest_seqno` should be the largest seqno of the generated file. pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64, file_no: u64) { - fail_point!("on_flush_completed"); + fail_point!("on_flush_completed", |_| {}); // Maybe we should hook the compaction to avoid the file is compacted before // being recorded. 
let offset = data_cf_offset(cf); diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d4743448d07..1601e1f01dd 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -46,7 +46,7 @@ use kvproto::{ use raftstore::store::{ util, ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::{info, trace, Logger}; +use slog::{info, trace, warn, Logger}; use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ @@ -619,7 +619,18 @@ impl Peer { // flush the oldest cf one by one until we are under the replay count threshold loop { let replay_count = self.storage().estimate_replay_count(); - if replay_count < flush_threshold { + if replay_count < flush_threshold || tried_count == 3 { + // Ideally, the replay count should be 0 after three flush_oldest_cf. If not, + // there may exist bug, but it's not desireable to block here, so we at most try + // three times. 
+ if replay_count >= flush_threshold && tried_count == 3 { + warn!( + self.logger, + "after three flush_oldest_cf, the expected replay count still exceeds the threshold"; + "replay_count" => replay_count, + "threshold" => flush_threshold, + ); + } if flushed { let admin_flush = self.storage_mut().apply_trace_mut().admin.flushed; let (_, _, tablet_index) = ctx diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index b43a3d00d16..056641e1e3f 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -252,6 +252,36 @@ fn test_flush_before_stop() { .unwrap(); } +// test flush_before_close will not flush forever +#[test] +fn test_flush_before_stop2() { + use test_raftstore_v2::*; + + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + fail::cfg("flush_before_cluse_threshold", "return(10)").unwrap(); + fail::cfg("on_flush_completed", "return").unwrap(); + + for i in 0..20 { + let key = format!("k{:03}", i); + cluster.must_put_cf(CF_WRITE, key.as_bytes(), b"val"); + cluster.must_put_cf(CF_LOCK, key.as_bytes(), b"val"); + } + + let router = cluster.get_router(1).unwrap(); + let raft_engine = cluster.get_raft_engine(1); + + let (tx, rx) = sync_channel(1); + let msg = PeerMsg::FlushBeforeClose { tx }; + router.force_send(1, msg).unwrap(); + + rx.recv().unwrap(); + + let admin_flush = raft_engine.get_flushed_index(1, CF_RAFT).unwrap().unwrap(); + assert!(admin_flush < 10); +} + // We cannot use a flushed index to call `maybe_advance_admin_flushed` // consider a case: // 1. lock `k` with index 6 From 1c21d07f2bfb181993838f2ae3ed34dceff1b6cb Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 5 Sep 2023 15:41:11 +0800 Subject: [PATCH 030/220] resolved_ts: track pending lock memory usage (#15452) ref tikv/tikv#14864 * Fix resolved ts OOM caused by adding large txns locks to `ResolverStatus`. * Add initial scan backoff duration metrics. 
Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Co-authored-by: Connor --- components/resolved_ts/src/endpoint.rs | 340 +++++++++++------- components/resolved_ts/src/metrics.rs | 6 + components/resolved_ts/src/resolver.rs | 4 +- components/resolved_ts/src/scanner.rs | 4 +- .../resolved_ts/tests/failpoints/mod.rs | 45 +++ metrics/grafana/tikv_details.json | 73 ++++ 6 files changed, 339 insertions(+), 133 deletions(-) diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 3c1ad9d8c8d..fc3e24de1e4 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -28,7 +28,7 @@ use raftstore::{ use security::SecurityManager; use tikv::config::ResolvedTsConfig; use tikv_util::{ - memory::MemoryQuota, + memory::{HeapSize, MemoryQuota}, warn, worker::{Runnable, RunnableWithTimer, Scheduler}, }; @@ -41,7 +41,7 @@ use crate::{ metrics::*, resolver::Resolver, scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, - Error, Result, + Error, Result, ON_DROP_WARN_HEAP_SIZE, }; /// grace period for logging safe-ts and resolved-ts gap in slow log @@ -53,10 +53,102 @@ enum ResolverStatus { tracked_index: u64, locks: Vec, cancelled: Arc, + memory_quota: Arc, }, Ready, } +impl Drop for ResolverStatus { + fn drop(&mut self) { + let ResolverStatus::Pending { + locks, + memory_quota, + .. + } = self else { + return; + }; + if locks.is_empty() { + return; + } + + // Free memory quota used by pending locks and unlocks. 
+ let mut bytes = 0; + let num_locks = locks.len(); + for lock in locks { + bytes += lock.heap_size(); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("drop huge ResolverStatus"; + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => memory_quota.in_use(), + "memory_quota_capacity" => memory_quota.capacity(), + ); + } + memory_quota.free(bytes); + } +} + +impl ResolverStatus { + fn push_pending_lock(&mut self, lock: PendingLock, region_id: u64) -> Result<()> { + let ResolverStatus::Pending { + locks, + memory_quota, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + // Check if adding a new lock or unlock will exceed the memory + // quota. + if !memory_quota.alloc(lock.heap_size()) { + fail::fail_point!("resolved_ts_on_pending_locks_memory_quota_exceeded"); + return Err(Error::MemoryQuotaExceeded); + } + locks.push(lock); + Ok(()) + } + + fn update_tracked_index(&mut self, index: u64, region_id: u64) { + let ResolverStatus::Pending { + tracked_index, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + assert!( + *tracked_index < index, + "region {}, tracked_index: {}, incoming index: {}", + region_id, + *tracked_index, + index + ); + *tracked_index = index; + } + + fn drain_pending_locks( + &mut self, + region_id: u64, + ) -> (u64, impl Iterator + '_) { + let ResolverStatus::Pending { + locks, + memory_quota, + tracked_index, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + // Must take locks, otherwise it may double free memory quota on drop. + let locks = std::mem::take(locks); + ( + *tracked_index, + locks.into_iter().map(|lock| { + memory_quota.free(lock.heap_size()); + lock + }), + ) + } +} + #[allow(dead_code)] enum PendingLock { Track { @@ -70,6 +162,16 @@ enum PendingLock { }, } +impl HeapSize for PendingLock { + fn heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key, .. 
} => { + key.as_encoded().heap_size() + } + } + } +} + // Records information related to observed region. // observe_id is used for avoiding ABA problems in incremental scan task, // advance resolved ts task, and command observing. @@ -85,13 +187,14 @@ struct ObserveRegion { impl ObserveRegion { fn new(meta: Region, rrp: Arc, memory_quota: Arc) -> Self { ObserveRegion { - resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota), + resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota.clone()), meta, handle: ObserveHandle::new(), resolver_status: ResolverStatus::Pending { tracked_index: 0, locks: vec![], cancelled: Arc::new(AtomicBool::new(false)), + memory_quota, }, } } @@ -101,122 +204,109 @@ impl ObserveRegion { } fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> Result<()> { - match &mut self.resolver_status { - ResolverStatus::Pending { - locks, - tracked_index, - .. - } => { - for log in change_logs { - match log { - ChangeLog::Error(e) => { - debug!( - "skip change log error"; - "region" => self.meta.id, - "error" => ?e, - ); - continue; - } - ChangeLog::Admin(req_type) => { - // TODO: for admin cmd that won't change the region meta like peer list - // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to - // return error - return Err(box_err!( - "region met admin command {:?} while initializing resolver", - req_type - )); - } - ChangeLog::Rows { rows, index } => { - rows.iter().for_each(|row| match row { - ChangeRow::Prewrite { key, start_ts, .. } => { - locks.push(PendingLock::Track { - key: key.clone(), - start_ts: *start_ts, - }) - } + if matches!(self.resolver_status, ResolverStatus::Pending { .. 
}) { + for log in change_logs { + match log { + ChangeLog::Error(e) => { + debug!( + "skip change log error"; + "region" => self.meta.id, + "error" => ?e, + ); + continue; + } + ChangeLog::Admin(req_type) => { + // TODO: for admin cmd that won't change the region meta like peer list + // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to + // return error + return Err(box_err!( + "region met admin command {:?} while initializing resolver", + req_type + )); + } + ChangeLog::Rows { rows, index } => { + for row in rows { + let lock = match row { + ChangeRow::Prewrite { key, start_ts, .. } => PendingLock::Track { + key: key.clone(), + start_ts: *start_ts, + }, ChangeRow::Commit { key, start_ts, commit_ts, .. - } => locks.push(PendingLock::Untrack { + } => PendingLock::Untrack { key: key.clone(), start_ts: *start_ts, commit_ts: *commit_ts, - }), + }, // One pc command do not contains any lock, so just skip it - ChangeRow::OnePc { .. } => {} - ChangeRow::IngestSsT => {} - }); - assert!( - *tracked_index < *index, - "region {}, tracked_index: {}, incoming index: {}", - self.meta.id, - *tracked_index, - *index - ); - *tracked_index = *index; + ChangeRow::OnePc { .. 
} | ChangeRow::IngestSsT => continue, + }; + self.resolver_status.push_pending_lock(lock, self.meta.id)?; } + self.resolver_status + .update_tracked_index(*index, self.meta.id); } } } - ResolverStatus::Ready => { - for log in change_logs { - match log { - ChangeLog::Error(e) => { + } else { + for log in change_logs { + match log { + ChangeLog::Error(e) => { + debug!( + "skip change log error"; + "region" => self.meta.id, + "error" => ?e, + ); + continue; + } + ChangeLog::Admin(req_type) => match req_type { + AdminCmdType::Split + | AdminCmdType::BatchSplit + | AdminCmdType::PrepareMerge + | AdminCmdType::RollbackMerge + | AdminCmdType::CommitMerge => { + info!( + "region met split/merge command, stop tracking since key range changed, wait for re-register"; + "req_type" => ?req_type, + ); + // Stop tracking so that `tracked_index` larger than the split/merge + // command index won't be published until `RegionUpdate` event + // trigger the region re-register and re-scan the new key range + self.resolver.stop_tracking(); + } + _ => { debug!( - "skip change log error"; + "skip change log admin"; "region" => self.meta.id, - "error" => ?e, + "req_type" => ?req_type, ); - continue; } - ChangeLog::Admin(req_type) => match req_type { - AdminCmdType::Split - | AdminCmdType::BatchSplit - | AdminCmdType::PrepareMerge - | AdminCmdType::RollbackMerge - | AdminCmdType::CommitMerge => { - info!( - "region met split/merge command, stop tracking since key range changed, wait for re-register"; - "req_type" => ?req_type, - ); - // Stop tracking so that `tracked_index` larger than the split/merge - // command index won't be published until `RegionUpdate` event - // trigger the region re-register and re-scan the new key range - self.resolver.stop_tracking(); - } - _ => { - debug!( - "skip change log admin"; - "region" => self.meta.id, - "req_type" => ?req_type, - ); - } - }, - ChangeLog::Rows { rows, index } => { - for row in rows { - match row { - ChangeRow::Prewrite { key, start_ts, 
.. } => { - if !self.resolver.track_lock( - *start_ts, - key.to_raw().unwrap(), - Some(*index), - ) { - return Err(Error::MemoryQuotaExceeded); - } - } - ChangeRow::Commit { key, .. } => self - .resolver - .untrack_lock(&key.to_raw().unwrap(), Some(*index)), - // One pc command do not contains any lock, so just skip it - ChangeRow::OnePc { .. } => { - self.resolver.update_tracked_index(*index); - } - ChangeRow::IngestSsT => { - self.resolver.update_tracked_index(*index); + }, + ChangeLog::Rows { rows, index } => { + for row in rows { + match row { + ChangeRow::Prewrite { key, start_ts, .. } => { + if !self.resolver.track_lock( + *start_ts, + key.to_raw().unwrap(), + Some(*index), + ) { + return Err(Error::MemoryQuotaExceeded); } } + ChangeRow::Commit { key, .. } => self + .resolver + .untrack_lock(&key.to_raw().unwrap(), Some(*index)), + // One pc command do not contains any lock, so just skip it + ChangeRow::OnePc { .. } => { + self.resolver.update_tracked_index(*index); + } + ChangeRow::IngestSsT => { + self.resolver.update_tracked_index(*index); + } } } } @@ -247,38 +337,26 @@ impl ObserveRegion { ScanEntry::None => { // Update the `tracked_index` to the snapshot's `apply_index` self.resolver.update_tracked_index(apply_index); - let pending_tracked_index = - match std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready) { - ResolverStatus::Pending { - locks, - tracked_index, - .. - } => { - for lock in locks { - match lock { - PendingLock::Track { key, start_ts } => { - if !self.resolver.track_lock( - start_ts, - key.to_raw().unwrap(), - Some(tracked_index), - ) { - return Err(Error::MemoryQuotaExceeded); - } - } - PendingLock::Untrack { key, .. 
} => { - self.resolver.untrack_lock( - &key.to_raw().unwrap(), - Some(tracked_index), - ) - } - } + let mut resolver_status = + std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready); + let (pending_tracked_index, pending_locks) = + resolver_status.drain_pending_locks(self.meta.id); + for lock in pending_locks { + match lock { + PendingLock::Track { key, start_ts } => { + if !self.resolver.track_lock( + start_ts, + key.to_raw().unwrap(), + Some(pending_tracked_index), + ) { + return Err(Error::MemoryQuotaExceeded); } - tracked_index } - ResolverStatus::Ready => { - panic!("region {:?} resolver has ready", self.meta.id) - } - }; + PendingLock::Untrack { key, .. } => self + .resolver + .untrack_lock(&key.to_raw().unwrap(), Some(pending_tracked_index)), + } + } info!( "Resolver initialized"; "region" => self.meta.id, @@ -457,7 +535,7 @@ where // Stop observing data handle.stop_observing(); // Stop scanning data - if let ResolverStatus::Pending { cancelled, .. } = resolver_status { + if let ResolverStatus::Pending { ref cancelled, .. } = resolver_status { cancelled.store(true, Ordering::Release); } } else { diff --git a/components/resolved_ts/src/metrics.rs b/components/resolved_ts/src/metrics.rs index 15b3463f70e..74da743952c 100644 --- a/components/resolved_ts/src/metrics.rs +++ b/components/resolved_ts/src/metrics.rs @@ -138,4 +138,10 @@ lazy_static! 
{ "The minimal (non-zero) resolved ts gap for observe leader peers" ) .unwrap(); + pub static ref RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM: Histogram = register_histogram!( + "tikv_resolved_ts_initial_scan_backoff_duration_seconds", + "Bucketed histogram of resolved-ts initial scan backoff duration", + exponential_buckets(0.1, 2.0, 16).unwrap(), + ) + .unwrap(); } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 6bee5efd2f6..405138d41cf 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -13,7 +13,7 @@ use txn_types::TimeStamp; use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; -const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB +pub(crate) const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. @@ -74,6 +74,8 @@ impl Drop for Resolver { "region_id" => self.region_id, "bytes" => bytes, "num_locks" => num_locks, + "memory_quota_in_use" => self.memory_quota.in_use(), + "memory_quota_capacity" => self.memory_quota.capacity(), ); } self.memory_quota.free(bytes); diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index e8665e9d860..615819db799 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -21,7 +21,7 @@ use txn_types::{Key, Lock, LockType, TimeStamp}; use crate::{ errors::{Error, Result}, - metrics::RTS_SCAN_DURATION_HISTOGRAM, + metrics::*, }; const DEFAULT_SCAN_BATCH_SIZE: usize = 1024; @@ -86,6 +86,7 @@ impl, E: KvEngine> ScannerPool { let cdc_handle = self.cdc_handle.clone(); let fut = async move { if let Some(backoff) = task.backoff { + RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM.observe(backoff.as_secs_f64()); if let Err(e) = GLOBAL_TIMER_HANDLE .delay(std::time::Instant::now() + backoff) .compat() @@ -113,6 
+114,7 @@ impl, E: KvEngine> ScannerPool { return; } }; + fail::fail_point!("resolved_ts_after_scanner_get_snapshot"); let start = Instant::now(); let apply_index = snap.get_apply_index().unwrap(); let mut entries = vec![]; diff --git a/components/resolved_ts/tests/failpoints/mod.rs b/components/resolved_ts/tests/failpoints/mod.rs index 808f5ed62ff..0c594ab1d1d 100644 --- a/components/resolved_ts/tests/failpoints/mod.rs +++ b/components/resolved_ts/tests/failpoints/mod.rs @@ -2,6 +2,11 @@ #[path = "../mod.rs"] mod testsuite; +use std::{ + sync::{mpsc::channel, Mutex}, + time::Duration, +}; + use futures::executor::block_on; use kvproto::kvrpcpb::*; use pd_client::PdClient; @@ -128,3 +133,43 @@ fn test_report_min_resolved_ts_disable() { fail::remove("mock_min_resolved_ts_interval_disable"); suite.stop(); } + +#[test] +fn test_pending_locks_memory_quota_exceeded() { + // Pause scan lock so that locks will be put in pending locks. + fail::cfg("resolved_ts_after_scanner_get_snapshot", "pause").unwrap(); + // Check if memory quota exceeded is triggered. + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback( + "resolved_ts_on_pending_locks_memory_quota_exceeded", + move || { + let sender = tx.lock().unwrap(); + sender.send(()).unwrap(); + }, + ) + .unwrap(); + + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + // Must not trigger memory quota exceeded. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Must trigger memory quota exceeded. 
+ rx.recv_timeout(Duration::from_secs(5)).unwrap(); + + fail::remove("resolved_ts_after_scanner_get_snapshot"); + fail::remove("resolved_ts_on_pending_locks_memory_quota_exceeded"); + suite.stop(); +} diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index d327041cd8a..c78540c601a 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -39471,6 +39471,79 @@ "yBucketNumber": null, "yBucketSize": null }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The backoff duration before starting initial scan", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 70 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573950, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_resolved_ts_initial_scan_backoff_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Initial scan backoff duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, From 
6b91e4a2284296887c1a0eb32865e5d8ab90ebb7 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 5 Sep 2023 16:45:42 +0800 Subject: [PATCH 031/220] cdc: deregister delegate if memory quota exceeded (#15486) close tikv/tikv#15412 Similar to resolved-ts endpoint, cdc endpoint maintains resolvers for subscribed regions. These resolvers also need memory quota, otherwise they may cause OOM. This commit lets cdc endpoint deregister regions if they exceed memory quota. Signed-off-by: Neil Shen --- components/cdc/src/channel.rs | 3 + components/cdc/src/delegate.rs | 183 ++++++++--- components/cdc/src/endpoint.rs | 40 ++- components/cdc/src/errors.rs | 2 + components/cdc/src/initializer.rs | 56 +++- components/cdc/tests/failpoints/mod.rs | 1 + .../cdc/tests/failpoints/test_memory_quota.rs | 289 ++++++++++++++++++ components/cdc/tests/mod.rs | 11 +- components/resolved_ts/src/resolver.rs | 25 +- 9 files changed, 517 insertions(+), 93 deletions(-) create mode 100644 components/cdc/tests/failpoints/test_memory_quota.rs diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index 6a8c3d5c3aa..a3ddeeb9030 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -52,6 +52,9 @@ pub enum CdcEvent { impl CdcEvent { pub fn size(&self) -> u32 { + fail::fail_point!("cdc_event_size", |size| size + .map(|s| s.parse::().unwrap()) + .unwrap_or(0)); match self { CdcEvent::ResolvedTs(ref r) => { // For region id, it is unlikely to exceed 100,000,000 which is diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index da5c26aad30..e109b3368b4 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -28,9 +28,13 @@ use raftstore::{ store::util::compare_region_epoch, Error as RaftStoreError, }; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, ON_DROP_WARN_HEAP_SIZE}; use tikv::storage::{txn::TxnEntry, Statistics}; -use tikv_util::{debug, info, warn}; +use tikv_util::{ + debug, info, + 
memory::{HeapSize, MemoryQuota}, + warn, +}; use txn_types::{Key, Lock, LockType, TimeStamp, WriteBatchFlags, WriteRef, WriteType}; use crate::{ @@ -226,16 +230,77 @@ impl Downstream { } } -#[derive(Default)] struct Pending { - pub downstreams: Vec, - pub locks: Vec, - pub pending_bytes: usize, + downstreams: Vec, + locks: Vec, + pending_bytes: usize, + memory_quota: Arc, +} + +impl Pending { + fn new(memory_quota: Arc) -> Pending { + Pending { + downstreams: vec![], + locks: vec![], + pending_bytes: 0, + memory_quota, + } + } + + fn push_pending_lock(&mut self, lock: PendingLock) -> Result<()> { + let bytes = lock.heap_size(); + if !self.memory_quota.alloc(bytes) { + return Err(Error::MemoryQuotaExceeded); + } + self.locks.push(lock); + self.pending_bytes += bytes; + CDC_PENDING_BYTES_GAUGE.add(bytes as i64); + Ok(()) + } + + fn on_region_ready(&mut self, resolver: &mut Resolver) -> Result<()> { + fail::fail_point!("cdc_pending_on_region_ready", |_| Err( + Error::MemoryQuotaExceeded + )); + // Must take locks, otherwise it may double free memory quota on drop. + for lock in mem::take(&mut self.locks) { + self.memory_quota.free(lock.heap_size()); + match lock { + PendingLock::Track { key, start_ts } => { + if !resolver.track_lock(start_ts, key, None) { + return Err(Error::MemoryQuotaExceeded); + } + } + PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), + } + } + Ok(()) + } } impl Drop for Pending { fn drop(&mut self) { CDC_PENDING_BYTES_GAUGE.sub(self.pending_bytes as i64); + let locks = mem::take(&mut self.locks); + if locks.is_empty() { + return; + } + + // Free memory quota used by pending locks and unlocks. 
+ let mut bytes = 0; + let num_locks = locks.len(); + for lock in locks { + bytes += lock.heap_size(); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("cdc drop huge Pending"; + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => self.memory_quota.in_use(), + "memory_quota_capacity" => self.memory_quota.capacity(), + ); + } + self.memory_quota.free(bytes); } } @@ -244,6 +309,14 @@ enum PendingLock { Untrack { key: Vec }, } +impl HeapSize for PendingLock { + fn heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key } => key.heap_size(), + } + } +} + /// A CDC delegate of a raftstore region peer. /// /// It converts raft commands into CDC events and broadcast to downstreams. @@ -265,14 +338,18 @@ pub struct Delegate { impl Delegate { /// Create a Delegate the given region. - pub fn new(region_id: u64, txn_extra_op: Arc>) -> Delegate { + pub fn new( + region_id: u64, + txn_extra_op: Arc>, + memory_quota: Arc, + ) -> Delegate { Delegate { region_id, handle: ObserveHandle::new(), resolver: None, region: None, resolved_downstreams: Vec::new(), - pending: Some(Pending::default()), + pending: Some(Pending::new(memory_quota)), txn_extra_op, failed: false, } @@ -395,7 +472,7 @@ impl Delegate { &mut self, mut resolver: Resolver, region: Region, - ) -> Vec<(&Downstream, Error)> { + ) -> Result> { assert!( self.resolver.is_none(), "region {} resolver should not be ready", @@ -408,29 +485,24 @@ impl Delegate { } // Mark the delegate as initialized. - let mut pending = self.pending.take().unwrap(); - self.region = Some(region); info!("cdc region is ready"; "region_id" => self.region_id); + // Downstreams in pending must be moved to resolved_downstreams + // immediately and must not return in the middle, otherwise the delegate + // loses downstreams. 
+ let mut pending = self.pending.take().unwrap(); + self.resolved_downstreams = mem::take(&mut pending.downstreams); - for lock in mem::take(&mut pending.locks) { - match lock { - PendingLock::Track { key, start_ts } => { - // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. - assert!(resolver.track_lock(start_ts, key, None)); - } - PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), - } - } + pending.on_region_ready(&mut resolver)?; self.resolver = Some(resolver); + self.region = Some(region); - self.resolved_downstreams = mem::take(&mut pending.downstreams); let mut failed_downstreams = Vec::new(); for downstream in self.downstreams() { if let Err(e) = self.check_epoch_on_ready(downstream) { failed_downstreams.push((downstream, e)); } } - failed_downstreams + Ok(failed_downstreams) } /// Try advance and broadcast resolved ts. @@ -611,16 +683,14 @@ impl Delegate { let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); let mut raw_rows: Vec = Vec::new(); for mut req in requests { - match req.get_cmd_type() { - CmdType::Put => { - self.sink_put( - req.take_put(), - is_one_pc, - &mut txn_rows, - &mut raw_rows, - &mut read_old_value, - )?; - } + let res = match req.get_cmd_type() { + CmdType::Put => self.sink_put( + req.take_put(), + is_one_pc, + &mut txn_rows, + &mut raw_rows, + &mut read_old_value, + ), CmdType::Delete => self.sink_delete(req.take_delete()), _ => { debug!( @@ -628,7 +698,12 @@ impl Delegate { "region_id" => self.region_id, "command" => ?req, ); + Ok(()) } + }; + if res.is_err() { + self.mark_failed(); + return res; } } @@ -825,18 +900,17 @@ impl Delegate { // In order to compute resolved ts, we must track inflight txns. match self.resolver { Some(ref mut resolver) => { - // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
- assert!(resolver.track_lock(row.start_ts.into(), row.key.clone(), None)); + if !resolver.track_lock(row.start_ts.into(), row.key.clone(), None) { + return Err(Error::MemoryQuotaExceeded); + } } None => { assert!(self.pending.is_some(), "region resolver not ready"); let pending = self.pending.as_mut().unwrap(); - pending.locks.push(PendingLock::Track { + pending.push_pending_lock(PendingLock::Track { key: row.key.clone(), start_ts: row.start_ts.into(), - }); - pending.pending_bytes += row.key.len(); - CDC_PENDING_BYTES_GAUGE.add(row.key.len() as i64); + })?; } } @@ -858,7 +932,7 @@ impl Delegate { Ok(()) } - fn sink_delete(&mut self, mut delete: DeleteRequest) { + fn sink_delete(&mut self, mut delete: DeleteRequest) -> Result<()> { match delete.cf.as_str() { "lock" => { let raw_key = Key::from_encoded(delete.take_key()).into_raw().unwrap(); @@ -866,11 +940,8 @@ impl Delegate { Some(ref mut resolver) => resolver.untrack_lock(&raw_key, None), None => { assert!(self.pending.is_some(), "region resolver not ready"); - let key_len = raw_key.len(); let pending = self.pending.as_mut().unwrap(); - pending.locks.push(PendingLock::Untrack { key: raw_key }); - pending.pending_bytes += key_len; - CDC_PENDING_BYTES_GAUGE.add(key_len as i64); + pending.push_pending_lock(PendingLock::Untrack { key: raw_key })?; } } } @@ -879,6 +950,7 @@ impl Delegate { panic!("invalid cf {}", other); } } + Ok(()) } fn sink_admin(&mut self, request: AdminRequest, mut response: AdminResponse) -> Result<()> { @@ -949,7 +1021,7 @@ impl Delegate { } fn stop_observing(&self) { - info!("stop observing"; "region_id" => self.region_id, "failed" => self.failed); + info!("cdc stop observing"; "region_id" => self.region_id, "failed" => self.failed); // Stop observe further events. self.handle.stop_observing(); // To inform transaction layer no more old values are required for the region. 
@@ -1184,12 +1256,18 @@ mod tests { ObservedRange::default(), ); downstream.set_sink(sink); - let mut delegate = Delegate::new(region_id, Default::default()); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + let mut delegate = Delegate::new(region_id, Default::default(), memory_quota); delegate.subscribe(downstream).unwrap(); assert!(delegate.handle.is_observing()); let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); let resolver = Resolver::new(region_id, memory_quota); - assert!(delegate.on_region_ready(resolver, region).is_empty()); + assert!( + delegate + .on_region_ready(resolver, region) + .unwrap() + .is_empty() + ); assert!(delegate.downstreams()[0].observed_range.all_key_covered); let rx_wrap = Cell::new(Some(rx)); @@ -1313,8 +1391,9 @@ mod tests { }; // Create a new delegate. + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op.clone()); + let mut delegate = Delegate::new(1, txn_extra_op.clone(), memory_quota); assert_eq!(txn_extra_op.load(), TxnExtraOp::Noop); assert!(delegate.handle.is_observing()); @@ -1340,7 +1419,9 @@ mod tests { region.mut_region_epoch().set_version(1); { let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let failures = delegate.on_region_ready(Resolver::new(1, memory_quota), region); + let failures = delegate + .on_region_ready(Resolver::new(1, memory_quota), region) + .unwrap(); assert_eq!(failures.len(), 1); let id = failures[0].0.id; delegate.unsubscribe(id, None); @@ -1431,8 +1512,9 @@ mod tests { Key::from_raw(b"d").into_encoded(), ) .unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op); + let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); assert!(delegate.handle.is_observing()); let mut map = HashMap::default(); @@ 
-1500,8 +1582,9 @@ mod tests { Key::from_raw(b"f").into_encoded(), ) .unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op); + let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); assert!(delegate.handle.is_observing()); let mut map = HashMap::default(); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 969d0cba0d9..2b314f22443 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -736,7 +736,11 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint e.into_mut(), HashMapEntry::Vacant(e) => { is_new_delegate = true; - e.insert(Delegate::new(region_id, txn_extra_op)) + e.insert(Delegate::new( + region_id, + txn_extra_op, + self.sink_memory_quota.clone(), + )) } }; @@ -802,10 +806,11 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { @@ -858,18 +863,26 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { + for (downstream, e) in fails { + deregisters.push(Deregister::Downstream { + conn_id: downstream.get_conn_id(), + request_id: downstream.get_req_id(), + region_id, + downstream_id: downstream.get_id(), + err: Some(e), + }); + } + } + Err(e) => deregisters.push(Deregister::Delegate { region_id, - downstream_id: downstream.get_id(), - err: Some(e), - }); + observe_id, + err: e, + }), } } else { debug!("cdc stale region ready"; @@ -883,7 +896,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint Initializer { change_observer: ChangeObserver, cdc_handle: T, concurrency_semaphore: Arc, + memory_quota: Arc, ) -> Result<()> { fail_point!("cdc_before_initialize"); let _permit = concurrency_semaphore.acquire().await; @@ -173,7 +174,7 @@ impl Initializer { } match fut.await { - Ok(resp) => self.on_change_cmd_response(resp).await, + Ok(resp) => self.on_change_cmd_response(resp, memory_quota).await, Err(e) => Err(Error::Other(box_err!(e))), } } @@ -181,11 +182,13 @@ impl 
Initializer { pub(crate) async fn on_change_cmd_response( &mut self, mut resp: ReadResponse, + memory_quota: Arc, ) -> Result<()> { if let Some(region_snapshot) = resp.snapshot { assert_eq!(self.region_id, region_snapshot.get_region().get_id()); let region = region_snapshot.get_region().clone(); - self.async_incremental_scan(region_snapshot, region).await + self.async_incremental_scan(region_snapshot, region, memory_quota) + .await } else { assert!( resp.response.get_header().has_error(), @@ -201,6 +204,7 @@ impl Initializer { &mut self, snap: S, region: Region, + memory_quota: Arc, ) -> Result<()> { let downstream_id = self.downstream_id; let region_id = region.get_id(); @@ -216,8 +220,6 @@ impl Initializer { "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); let mut resolver = if self.build_resolver { - // TODO: limit the memory usage of the resolver. - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); Some(Resolver::new(region_id, memory_quota)) } else { None @@ -422,9 +424,9 @@ impl Initializer { let lock = Lock::parse(value)?; match lock.lock_type { LockType::Put | LockType::Delete => { - // TODO: handle memory quota exceed, for now, quota is set to - // usize::MAX. 
- assert!(resolver.track_lock(lock.ts, key, None)); + if !resolver.track_lock(lock.ts, key, None) { + return Err(Error::MemoryQuotaExceeded); + } } _ => (), }; @@ -745,21 +747,37 @@ mod tests { } }); - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); check_result(); initializer .downstream_state .store(DownstreamState::Initializing); initializer.max_scan_batch_bytes = total_bytes; - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); check_result(); initializer .downstream_state .store(DownstreamState::Initializing); initializer.build_resolver = false; - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); loop { let task = rx.recv_timeout(Duration::from_millis(100)); @@ -772,7 +790,8 @@ mod tests { // Test cancellation. initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.async_incremental_scan(snap.clone(), region)).unwrap_err(); + block_on(initializer.async_incremental_scan(snap.clone(), region, memory_quota.clone())) + .unwrap_err(); // Cancel error should trigger a deregsiter. let mut region = Region::default(); @@ -784,14 +803,15 @@ mod tests { response: Default::default(), txn_extra_op: Default::default(), }; - block_on(initializer.on_change_cmd_response(resp.clone())).unwrap_err(); + block_on(initializer.on_change_cmd_response(resp.clone(), memory_quota.clone())) + .unwrap_err(); // Disconnect sink by dropping runtime (it also drops drain). 
drop(pool); initializer .downstream_state .store(DownstreamState::Initializing); - block_on(initializer.on_change_cmd_response(resp)).unwrap_err(); + block_on(initializer.on_change_cmd_response(resp, memory_quota)).unwrap_err(); worker.stop(); } @@ -819,8 +839,9 @@ mod tests { filter_loop, ); let th = pool.spawn(async move { + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default()) + .async_incremental_scan(snap, Region::default(), memory_quota) .await .unwrap(); }); @@ -904,8 +925,9 @@ mod tests { let snap = engine.snapshot(Default::default()).unwrap(); let th = pool.spawn(async move { + let memory_qutoa = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default()) + .async_incremental_scan(snap, Region::default(), memory_qutoa) .await .unwrap(); }); @@ -1017,12 +1039,14 @@ mod tests { let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); let concurrency_semaphore = Arc::new(Semaphore::new(1)); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer.downstream_state.store(DownstreamState::Stopped); block_on(initializer.initialize( change_cmd, raft_router.clone(), concurrency_semaphore.clone(), + memory_quota.clone(), )) .unwrap_err(); @@ -1048,7 +1072,7 @@ mod tests { &concurrency_semaphore, ); let res = initializer - .initialize(change_cmd, raft_router, concurrency_semaphore) + .initialize(change_cmd, raft_router, concurrency_semaphore, memory_quota) .await; tx1.send(res).unwrap(); }); diff --git a/components/cdc/tests/failpoints/mod.rs b/components/cdc/tests/failpoints/mod.rs index 082b1c15f67..619ee200985 100644 --- a/components/cdc/tests/failpoints/mod.rs +++ b/components/cdc/tests/failpoints/mod.rs @@ -4,6 +4,7 @@ #![test_runner(test_util::run_failpoint_tests)] mod test_endpoint; +mod test_memory_quota; mod test_observe; mod test_register; mod test_resolve; diff 
--git a/components/cdc/tests/failpoints/test_memory_quota.rs b/components/cdc/tests/failpoints/test_memory_quota.rs new file mode 100644 index 00000000000..5b564ba61ec --- /dev/null +++ b/components/cdc/tests/failpoints/test_memory_quota.rs @@ -0,0 +1,289 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{sync::*, time::Duration}; + +use cdc::{Task, Validate}; +use futures::{executor::block_on, SinkExt}; +use grpcio::WriteFlags; +use kvproto::{cdcpb::*, kvrpcpb::*}; +use pd_client::PdClient; +use test_raftstore::*; + +use crate::{new_event_feed, TestSuiteBuilder}; + +#[test] +fn test_resolver_track_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + + // Client must receive messages when there is no congest error. 
+ let key_size = memory_quota / 2; + let (k, v) = (vec![1; key_size], vec![5]); + // Prewrite + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Prewrite); + } + other => panic!("unknown event {:?}", other), + } + + // Trigger congest error. + let key_size = memory_quota * 2; + let (k, v) = (vec![2; key_size], vec![5]); + // Prewrite + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. 
+ let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + suite.stop(); +} + +#[test] +fn test_pending_on_region_ready_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Trigger memory quota exceeded error. + fail::cfg("cdc_pending_on_region_ready", "return").unwrap(); + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + // MemoryQuotaExceeded error is triggered on_region_ready. + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. 
+ assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + fail::remove("cdc_incremental_scan_start"); + suite.stop(); +} + +#[test] +fn test_pending_push_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Pause scan so that no region can be initialized, and all locks will be + // put in pending locks. + fail::cfg("cdc_incremental_scan_start", "pause").unwrap(); + + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + // Trigger congest error. 
+ let key_size = memory_quota * 2; + let (k, v) = (vec![1; key_size], vec![5]); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + fail::remove("cdc_incremental_scan_start"); + suite.stop(); +} + +#[test] +fn test_scan_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Put a lock that exceeds memory quota. 
+ let key_size = memory_quota * 2; + let (k, v) = (vec![1; key_size], vec![5]); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + + // No region can be initialized. + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + suite.stop(); +} diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index ec479909793..afd209af2d3 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -168,6 +168,7 @@ impl TestSuiteBuilder { let count = cluster.count; let pd_cli = cluster.pd_client.clone(); let mut endpoints = HashMap::default(); + let mut quotas = HashMap::default(); let mut obs = HashMap::default(); let mut concurrency_managers = HashMap::default(); // Hack! node id are generated from 1..count+1. @@ -177,15 +178,14 @@ impl TestSuiteBuilder { let mut sim = cluster.sim.wl(); // Register cdc service to gRPC server. 
+ let memory_quota = Arc::new(MemoryQuota::new(memory_quota)); + let memory_quota_ = memory_quota.clone(); let scheduler = worker.scheduler(); sim.pending_services .entry(id) .or_default() .push(Box::new(move || { - create_change_data(cdc::Service::new( - scheduler.clone(), - Arc::new(MemoryQuota::new(memory_quota)), - )) + create_change_data(cdc::Service::new(scheduler.clone(), memory_quota_.clone())) })); sim.txn_extra_schedulers.insert( id, @@ -200,6 +200,7 @@ impl TestSuiteBuilder { }, )); endpoints.insert(id, worker); + quotas.insert(id, memory_quota); } runner(&mut cluster); @@ -224,7 +225,7 @@ impl TestSuiteBuilder { cm.clone(), env, sim.security_mgr.clone(), - Arc::new(MemoryQuota::new(usize::MAX)), + quotas[id].clone(), sim.get_causal_ts_provider(*id), ); let mut updated_cfg = cfg.clone(); diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 405138d41cf..ef257ad4762 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -13,7 +13,7 @@ use txn_types::TimeStamp; use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; -pub(crate) const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB +pub const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. 
@@ -188,13 +188,16 @@ impl Resolver { if let Some(index) = index { self.update_tracked_index(index); } + let bytes = self.lock_heap_size(&key); debug!( - "track lock {}@{}, region {}", + "track lock {}@{}", &log_wrappers::Value::key(&key), - start_ts, - self.region_id + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), + "memory_capacity" => self.memory_quota.capacity(), + "key_heap_size" => bytes, ); - let bytes = self.lock_heap_size(&key); if !self.memory_quota.alloc(bytes) { return false; } @@ -213,14 +216,18 @@ impl Resolver { self.memory_quota.free(bytes); start_ts } else { - debug!("untrack a lock that was not tracked before"; "key" => &log_wrappers::Value::key(key)); + debug!("untrack a lock that was not tracked before"; + "key" => &log_wrappers::Value::key(key), + "region_id" => self.region_id, + ); return; }; debug!( - "untrack lock {}@{}, region {}", + "untrack lock {}@{}", &log_wrappers::Value::key(key), - start_ts, - self.region_id, + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), ); let mut shrink_ts = None; From 9bf96f921637f1823f8507f822a215dff55d50e1 Mon Sep 17 00:00:00 2001 From: ekexium Date: Wed, 6 Sep 2023 07:20:12 +0800 Subject: [PATCH 032/220] metrics: more logs and metrics for resolved-ts (#15416) ref tikv/tikv#15082 Add more logs and metrics for resolved-ts. 
Signed-off-by: ekexium Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../backup-stream/src/subscription_track.rs | 6 +- components/cdc/src/delegate.rs | 4 +- components/cdc/src/initializer.rs | 4 +- components/concurrency_manager/src/lib.rs | 17 + .../concurrency_manager/src/lock_table.rs | 8 + components/raftstore/src/store/util.rs | 1 - components/resolved_ts/src/advance.rs | 9 +- components/resolved_ts/src/endpoint.rs | 563 +++++++++++++----- components/resolved_ts/src/metrics.rs | 68 ++- components/resolved_ts/src/resolver.rs | 112 +++- metrics/grafana/tikv_details.json | 12 +- 11 files changed, 615 insertions(+), 189 deletions(-) diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index ef6e24d9d8f..d6d49f0cf1c 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -8,7 +8,7 @@ use dashmap::{ }; use kvproto::metapb::Region; use raftstore::coprocessor::*; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, TsSource}; use tikv_util::{info, memory::MemoryQuota, warn}; use txn_types::TimeStamp; @@ -516,7 +516,7 @@ impl TwoPhaseResolver { return min_ts.min(stable_ts); } - self.resolver.resolve(min_ts, None) + self.resolver.resolve(min_ts, None, TsSource::BackupStream) } pub fn resolved_ts(&self) -> TimeStamp { @@ -548,7 +548,7 @@ impl TwoPhaseResolver { // advance the internal resolver. // the start ts of initial scanning would be a safe ts for min ts // -- because is used to be a resolved ts. 
- self.resolver.resolve(ts, None); + self.resolver.resolve(ts, None, TsSource::BackupStream); } None => { warn!("BUG: a two-phase resolver is executing phase_one_done when not in phase one"; "resolver" => ?self) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index e109b3368b4..f7125aa8882 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -28,7 +28,7 @@ use raftstore::{ store::util::compare_region_epoch, Error as RaftStoreError, }; -use resolved_ts::{Resolver, ON_DROP_WARN_HEAP_SIZE}; +use resolved_ts::{Resolver, TsSource, ON_DROP_WARN_HEAP_SIZE}; use tikv::storage::{txn::TxnEntry, Statistics}; use tikv_util::{ debug, info, @@ -514,7 +514,7 @@ impl Delegate { } debug!("cdc try to advance ts"; "region_id" => self.region_id, "min_ts" => min_ts); let resolver = self.resolver.as_mut().unwrap(); - let resolved_ts = resolver.resolve(min_ts, None); + let resolved_ts = resolver.resolve(min_ts, None, TsSource::Cdc); debug!("cdc resolved ts updated"; "region_id" => self.region_id, "resolved_ts" => resolved_ts); Some(resolved_ts) diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 25b7175a08d..ef0b15caab9 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -23,7 +23,7 @@ use raftstore::{ msg::{Callback, ReadResponse}, }, }; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, TsSource}; use tikv::storage::{ kv::Snapshot, mvcc::{DeltaScanner, ScannerBuilder}, @@ -467,7 +467,7 @@ impl Initializer { fn finish_building_resolver(&self, mut resolver: Resolver, region: Region) { let observe_id = self.observe_id; - let rts = resolver.resolve(TimeStamp::zero(), None); + let rts = resolver.resolve(TimeStamp::zero(), None, TsSource::Cdc); info!( "cdc resolver initialized and schedule resolver ready"; "region_id" => region.get_id(), diff --git a/components/concurrency_manager/src/lib.rs b/components/concurrency_manager/src/lib.rs index 
ce77cb87a42..1c6bdb8dbf1 100644 --- a/components/concurrency_manager/src/lib.rs +++ b/components/concurrency_manager/src/lib.rs @@ -124,6 +124,23 @@ impl ConcurrencyManager { }); min_lock_ts } + + pub fn global_min_lock(&self) -> Option<(TimeStamp, Key)> { + let mut min_lock: Option<(TimeStamp, Key)> = None; + // TODO: The iteration looks not so efficient. It's better to be optimized. + self.lock_table.for_each_kv(|key, handle| { + if let Some(curr_ts) = handle.with_lock(|lock| lock.as_ref().map(|l| l.ts)) { + if min_lock + .as_ref() + .map(|(ts, _)| ts > &curr_ts) + .unwrap_or(true) + { + min_lock = Some((curr_ts, key.clone())); + } + } + }); + min_lock + } } #[cfg(test)] diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index db6995fa1d0..8f4fb8952c3 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -115,6 +115,14 @@ impl LockTable { } } + pub fn for_each_kv(&self, mut f: impl FnMut(&Key, Arc)) { + for entry in self.0.iter() { + if let Some(handle) = entry.value().upgrade() { + f(entry.key(), handle); + } + } + } + /// Removes the key and its key handle from the map. 
pub fn remove(&self, key: &Key) { self.0.remove(key); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 880a394fdae..3f34fe691ee 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1439,7 +1439,6 @@ impl RegionReadProgress { self.safe_ts() } - // Dump the `LeaderInfo` and the peer list pub fn get_core(&self) -> MutexGuard<'_, RegionReadProgressCore> { self.core.lock().unwrap() } diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 4428ed01a35..59478f5affb 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -43,7 +43,7 @@ use tokio::{ }; use txn_types::TimeStamp; -use crate::{endpoint::Task, metrics::*}; +use crate::{endpoint::Task, metrics::*, TsSource}; pub(crate) const DEFAULT_CHECK_LEADER_TIMEOUT_DURATION: Duration = Duration::from_secs(5); // 5s const DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL: usize = 2; @@ -57,7 +57,7 @@ pub struct AdvanceTsWorker { scheduler: Scheduler, /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. - concurrency_manager: ConcurrencyManager, + pub(crate) concurrency_manager: ConcurrencyManager, // cache the last pd tso, used to approximate the next timestamp w/o an actual TSO RPC pub(crate) last_pd_tso: Arc>>, @@ -114,15 +114,17 @@ impl AdvanceTsWorker { if let Ok(mut last_pd_tso) = last_pd_tso.try_lock() { *last_pd_tso = Some((min_ts, Instant::now())); } + let mut ts_source = TsSource::PdTso; // Sync with concurrency manager so that it can work correctly when // optimizations like async commit is enabled. // Note: This step must be done before scheduling `Task::MinTs` task, and the // resolver must be checked in or after `Task::MinTs`' execution. 
cm.update_max_ts(min_ts); - if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { + if let Some((min_mem_lock_ts, lock)) = cm.global_min_lock() { if min_mem_lock_ts < min_ts { min_ts = min_mem_lock_ts; + ts_source = TsSource::MemoryLock(lock); } } @@ -131,6 +133,7 @@ impl AdvanceTsWorker { if let Err(e) = scheduler.schedule(Task::ResolvedTsAdvanced { regions, ts: min_ts, + ts_source, }) { info!("failed to schedule advance event"; "err" => ?e); } diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index fc3e24de1e4..e2d2aec4f70 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -1,12 +1,13 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + cmp::min, collections::HashMap, fmt, marker::PhantomData, sync::{ atomic::{AtomicBool, Ordering}, - Arc, Mutex, + Arc, Mutex, MutexGuard, }, time::Duration, }; @@ -14,7 +15,7 @@ use std::{ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use grpcio::Environment; -use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; +use kvproto::{kvrpcpb::LeaderInfo, metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; use pd_client::PdClient; use raftstore::{ @@ -22,7 +23,9 @@ use raftstore::{ router::CdcHandle, store::{ fsm::store::StoreRegionMeta, - util::{self, RegionReadProgress, RegionReadProgressRegistry}, + util::{ + self, ReadState, RegionReadProgress, RegionReadProgressCore, RegionReadProgressRegistry, + }, }, }; use security::SecurityManager; @@ -39,12 +42,12 @@ use crate::{ advance::{AdvanceTsWorker, LeadershipResolver, DEFAULT_CHECK_LEADER_TIMEOUT_DURATION}, cmd::{ChangeLog, ChangeRow}, metrics::*, - resolver::Resolver, + resolver::{LastAttempt, Resolver}, scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, - Error, Result, ON_DROP_WARN_HEAP_SIZE, + Error, Result, TsSource, ON_DROP_WARN_HEAP_SIZE, }; -/// grace period for 
logging safe-ts and resolved-ts gap in slow log +/// grace period for identifying identifying slow resolved-ts and safe-ts. const SLOW_LOG_GRACE_PERIOD_MS: u64 = 1000; const MEMORY_QUOTA_EXCEEDED_BACKOFF: Duration = Duration::from_secs(30); @@ -386,6 +389,265 @@ pub struct Endpoint { _phantom: PhantomData<(T, E)>, } +// methods that are used for metrics and logging +impl Endpoint +where + T: 'static + CdcHandle, + E: KvEngine, + S: StoreRegionMeta, +{ + fn is_leader(&self, store_id: Option, leader_store_id: Option) -> bool { + store_id.is_some() && store_id == leader_store_id + } + + fn collect_stats(&mut self) -> Stats { + let store_id = self.get_or_init_store_id(); + let mut stats = Stats::default(); + self.region_read_progress.with(|registry| { + for (region_id, read_progress) in registry { + let (leader_info, leader_store_id) = read_progress.dump_leader_info(); + let core = read_progress.get_core(); + let resolved_ts = leader_info.get_read_state().get_safe_ts(); + let safe_ts = core.read_state().ts; + + if resolved_ts == 0 { + stats.zero_ts_count += 1; + continue; + } + + if self.is_leader(store_id, leader_store_id) { + // leader resolved-ts + if resolved_ts < stats.min_leader_resolved_ts.resolved_ts { + let resolver = self.regions.get(region_id).map(|x| &x.resolver); + stats + .min_leader_resolved_ts + .set(*region_id, resolver, &core, &leader_info); + } + } else { + // follower safe-ts + if safe_ts > 0 && safe_ts < stats.min_follower_safe_ts.safe_ts { + stats.min_follower_safe_ts.set(*region_id, &core); + } + + // follower resolved-ts + if resolved_ts < stats.min_follower_resolved_ts.resolved_ts { + stats.min_follower_resolved_ts.set(*region_id, &core); + } + } + } + }); + + stats.resolver = self.collect_resolver_stats(); + stats.cm_min_lock = self.advance_worker.concurrency_manager.global_min_lock(); + stats + } + + fn collect_resolver_stats(&mut self) -> ResolverStats { + let mut stats = ResolverStats::default(); + for observed_region in 
self.regions.values() { + match &observed_region.resolver_status { + ResolverStatus::Pending { locks, .. } => { + for l in locks { + match l { + PendingLock::Track { key, .. } => stats.heap_size += key.len() as i64, + PendingLock::Untrack { key, .. } => stats.heap_size += key.len() as i64, + } + } + stats.unresolved_count += 1; + } + ResolverStatus::Ready { .. } => { + stats.heap_size += observed_region.resolver.approximate_heap_bytes() as i64; + stats.resolved_count += 1; + } + } + } + stats + } + + fn update_metrics(&self, stats: &Stats) { + let now = self.approximate_now_tso(); + // general + if stats.min_follower_resolved_ts.resolved_ts < stats.min_leader_resolved_ts.resolved_ts { + RTS_MIN_RESOLVED_TS.set(stats.min_follower_resolved_ts.resolved_ts as i64); + RTS_MIN_RESOLVED_TS_GAP.set(now.saturating_sub( + TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_RESOLVED_TS_REGION.set(stats.min_follower_resolved_ts.region_id as i64); + } else { + RTS_MIN_RESOLVED_TS.set(stats.min_leader_resolved_ts.resolved_ts as i64); + RTS_MIN_RESOLVED_TS_GAP.set(now.saturating_sub( + TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_RESOLVED_TS_REGION.set(stats.min_leader_resolved_ts.region_id as i64); + } + RTS_ZERO_RESOLVED_TS.set(stats.zero_ts_count); + + RTS_LOCK_HEAP_BYTES_GAUGE.set(stats.resolver.heap_size); + RTS_REGION_RESOLVE_STATUS_GAUGE_VEC + .with_label_values(&["resolved"]) + .set(stats.resolver.resolved_count); + RTS_REGION_RESOLVE_STATUS_GAUGE_VEC + .with_label_values(&["unresolved"]) + .set(stats.resolver.unresolved_count); + + CONCURRENCY_MANAGER_MIN_LOCK_TS.set( + stats + .cm_min_lock + .clone() + .map(|(ts, _)| ts.into_inner()) + .unwrap_or_default() as i64, + ); + + // min follower safe ts + RTS_MIN_FOLLOWER_SAFE_TS_REGION.set(stats.min_follower_safe_ts.region_id as i64); + RTS_MIN_FOLLOWER_SAFE_TS.set(stats.min_follower_safe_ts.safe_ts as i64); + 
RTS_MIN_FOLLOWER_SAFE_TS_GAP.set( + now.saturating_sub(TimeStamp::from(stats.min_follower_safe_ts.safe_ts).physical()) + as i64, + ); + RTS_MIN_FOLLOWER_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER.set( + stats + .min_follower_safe_ts + .duration_to_last_consume_leader + .map(|x| x as i64) + .unwrap_or(-1), + ); + + // min leader resolved ts + RTS_MIN_LEADER_RESOLVED_TS.set(stats.min_leader_resolved_ts.resolved_ts as i64); + RTS_MIN_LEADER_RESOLVED_TS_REGION.set(stats.min_leader_resolved_ts.region_id as i64); + RTS_MIN_LEADER_RESOLVED_TS_REGION_MIN_LOCK_TS.set( + stats + .min_leader_resolved_ts + .min_lock + .as_ref() + .map(|(ts, _)| (*ts).into_inner() as i64) + .unwrap_or(-1), + ); + RTS_MIN_LEADER_RESOLVED_TS_GAP + .set(now.saturating_sub( + TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_LEADER_DUATION_TO_LAST_UPDATE_SAFE_TS.set( + stats + .min_leader_resolved_ts + .duration_to_last_update_ms + .map(|x| x as i64) + .unwrap_or(-1), + ); + + // min follower resolved ts + RTS_MIN_FOLLOWER_RESOLVED_TS.set(stats.min_follower_resolved_ts.resolved_ts as i64); + RTS_MIN_FOLLOWER_RESOLVED_TS_REGION.set(stats.min_follower_resolved_ts.region_id as i64); + RTS_MIN_FOLLOWER_RESOLVED_TS_GAP.set( + now.saturating_sub( + TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical(), + ) as i64, + ); + RTS_MIN_FOLLOWER_RESOLVED_TS_DURATION_TO_LAST_CONSUME_LEADER.set( + stats + .min_follower_resolved_ts + .duration_to_last_consume_leader + .map(|x| x as i64) + .unwrap_or(-1), + ); + } + + // Approximate a TSO from PD. It is better than local timestamp when clock skew + // exists. + // Returns the physical part. 
+ fn approximate_now_tso(&self) -> u64 { + self.advance_worker + .last_pd_tso + .try_lock() + .map(|opt| { + opt.map(|(pd_ts, instant)| { + pd_ts.physical() + instant.saturating_elapsed().as_millis() as u64 + }) + .unwrap_or_else(|| TimeStamp::physical_now()) + }) + .unwrap_or_else(|_| TimeStamp::physical_now()) + } + + fn log_slow_regions(&self, stats: &Stats) { + let expected_interval = min( + self.cfg.advance_ts_interval.as_millis(), + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION.as_millis() as u64, + ) + self.cfg.advance_ts_interval.as_millis(); + let leader_threshold = expected_interval + SLOW_LOG_GRACE_PERIOD_MS; + let follower_threshold = 2 * expected_interval + SLOW_LOG_GRACE_PERIOD_MS; + let now = self.approximate_now_tso(); + + // min leader resolved ts + let min_leader_resolved_ts_gap = now + .saturating_sub(TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical()); + if min_leader_resolved_ts_gap > leader_threshold { + info!( + "the max gap of leader resolved-ts is large"; + "region_id" => stats.min_leader_resolved_ts.region_id, + "gap" => format!("{}ms", min_leader_resolved_ts_gap), + "read_state" => ?stats.min_leader_resolved_ts.read_state, + "applied_index" => stats.min_leader_resolved_ts.applied_index, + "min_lock" => ?stats.min_leader_resolved_ts.min_lock, + "lock_num" => stats.min_leader_resolved_ts.lock_num, + "txn_num" => stats.min_leader_resolved_ts.txn_num, + "min_memory_lock" => ?stats.cm_min_lock, + "duration_to_last_update_safe_ts" => match stats.min_leader_resolved_ts.duration_to_last_update_ms { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "last_resolve_attempt" => &stats.min_leader_resolved_ts.last_resolve_attempt, + ); + } + + // min follower safe ts + let min_follower_safe_ts_gap = + now.saturating_sub(TimeStamp::from(stats.min_follower_safe_ts.safe_ts).physical()); + if min_follower_safe_ts_gap > follower_threshold { + info!( + "the max gap of follower safe-ts is large"; + "region_id" => 
stats.min_follower_safe_ts.region_id, + "gap" => format!("{}ms", min_follower_safe_ts_gap), + "safe_ts" => stats.min_follower_safe_ts.safe_ts, + "resolved_ts" => stats.min_follower_safe_ts.resolved_ts, + "duration_to_last_consume_leader" => match stats.min_follower_safe_ts.duration_to_last_consume_leader { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "applied_index" => stats.min_follower_safe_ts.applied_index, + "latest_candidate" => ?stats.min_follower_safe_ts.latest_candidate, + "oldest_candidate" => ?stats.min_follower_safe_ts.oldest_candidate, + ); + } + + // min follower resolved ts + let min_follower_resolved_ts_gap = now + .saturating_sub(TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical()); + if min_follower_resolved_ts_gap > follower_threshold { + if stats.min_follower_resolved_ts.region_id == stats.min_follower_safe_ts.region_id { + info!( + "the max gap of follower resolved-ts is large; it's the same region that has the min safe-ts" + ); + } else { + info!( + "the max gap of follower resolved-ts is large"; + "region_id" => stats.min_follower_resolved_ts.region_id, + "gap" => format!("{}ms", min_follower_resolved_ts_gap), + "safe_ts" => stats.min_follower_resolved_ts.safe_ts, + "resolved_ts" => stats.min_follower_resolved_ts.resolved_ts, + "duration_to_last_consume_leader" => match stats.min_follower_resolved_ts.duration_to_last_consume_leader { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "applied_index" => stats.min_follower_resolved_ts.applied_index, + "latest_candidate" => ?stats.min_follower_resolved_ts.latest_candidate, + "oldest_candidate" => ?stats.min_follower_resolved_ts.oldest_candidate, + ); + } + } + } +} + impl Endpoint where T: 'static + CdcHandle, @@ -623,7 +885,12 @@ where // Update advanced resolved ts. // Must ensure all regions are leaders at the point of ts. 
- fn handle_resolved_ts_advanced(&mut self, regions: Vec, ts: TimeStamp) { + fn handle_resolved_ts_advanced( + &mut self, + regions: Vec, + ts: TimeStamp, + ts_source: TsSource, + ) { if regions.is_empty() { return; } @@ -631,7 +898,9 @@ where for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let _ = observe_region.resolver.resolve(ts, Some(now)); + let _ = observe_region + .resolver + .resolve(ts, Some(now), ts_source.clone()); } } } @@ -776,6 +1045,7 @@ pub enum Task { ResolvedTsAdvanced { regions: Vec, ts: TimeStamp, + ts_source: TsSource, }, ChangeLog { cmd_batch: Vec, @@ -830,10 +1100,12 @@ impl fmt::Debug for Task { Task::ResolvedTsAdvanced { ref regions, ref ts, + ref ts_source, } => de .field("name", &"advance_resolved_ts") .field("regions", ®ions) .field("ts", &ts) + .field("ts_source", &ts_source.label()) .finish(), Task::ChangeLog { .. } => de.field("name", &"change_log").finish(), Task::ScanLocks { @@ -890,9 +1162,11 @@ where Task::AdvanceResolvedTs { leader_resolver } => { self.handle_advance_resolved_ts(leader_resolver) } - Task::ResolvedTsAdvanced { regions, ts } => { - self.handle_resolved_ts_advanced(regions, ts) - } + Task::ResolvedTsAdvanced { + regions, + ts, + ts_source, + } => self.handle_resolved_ts_advanced(regions, ts, ts_source), Task::ChangeLog { cmd_batch } => self.handle_change_log(cmd_batch), Task::ScanLocks { region_id, @@ -928,6 +1202,138 @@ impl ConfigManager for ResolvedTsConfigManager { } } +#[derive(Default)] +struct Stats { + // stats for metrics + zero_ts_count: i64, + min_leader_resolved_ts: LeaderStats, + min_follower_safe_ts: FollowerStats, + min_follower_resolved_ts: FollowerStats, + resolver: ResolverStats, + // we don't care about min_safe_ts_leader, because safe_ts should be equal to resolved_ts in + // leaders + // The min memory lock in concurrency manager. 
+ cm_min_lock: Option<(TimeStamp, Key)>, +} + +struct LeaderStats { + region_id: u64, + resolved_ts: u64, + read_state: ReadState, + duration_to_last_update_ms: Option, + last_resolve_attempt: Option, + applied_index: u64, + // min lock in LOCK CF + min_lock: Option<(TimeStamp, Key)>, + lock_num: Option, + txn_num: Option, +} + +impl Default for LeaderStats { + fn default() -> Self { + Self { + region_id: 0, + resolved_ts: u64::MAX, + read_state: ReadState::default(), + duration_to_last_update_ms: None, + applied_index: 0, + last_resolve_attempt: None, + min_lock: None, + lock_num: None, + txn_num: None, + } + } +} + +impl LeaderStats { + fn set( + &mut self, + region_id: u64, + resolver: Option<&Resolver>, + region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, + leader_info: &LeaderInfo, + ) { + *self = LeaderStats { + region_id, + resolved_ts: leader_info.get_read_state().get_safe_ts(), + read_state: region_read_progress.read_state().clone(), + duration_to_last_update_ms: region_read_progress + .last_instant_of_update_ts() + .map(|i| i.saturating_elapsed().as_millis() as u64), + last_resolve_attempt: resolver.and_then(|r| r.last_attempt.clone()), + min_lock: resolver.and_then(|r| { + r.oldest_transaction().map(|(ts, keys)| { + ( + *ts, + keys.iter() + .next() + .map(|k| Key::from_encoded_slice(k.as_ref())) + .unwrap_or_else(|| Key::from_encoded_slice("no_keys_found".as_ref())), + ) + }) + }), + applied_index: region_read_progress.applied_index(), + lock_num: resolver.map(|r| r.num_locks()), + txn_num: resolver.map(|r| r.num_transactions()), + }; + } +} + +struct FollowerStats { + region_id: u64, + resolved_ts: u64, + safe_ts: u64, + latest_candidate: Option, + oldest_candidate: Option, + applied_index: u64, + duration_to_last_consume_leader: Option, +} + +impl Default for FollowerStats { + fn default() -> Self { + Self { + region_id: 0, + safe_ts: u64::MAX, + resolved_ts: u64::MAX, + latest_candidate: None, + oldest_candidate: None, + applied_index: 0, 
+ duration_to_last_consume_leader: None, + } + } +} + +impl FollowerStats { + fn set( + &mut self, + region_id: u64, + region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, + ) { + let read_state = region_read_progress.read_state(); + *self = FollowerStats { + region_id, + resolved_ts: region_read_progress + .get_leader_info() + .get_read_state() + .get_safe_ts(), + safe_ts: read_state.ts, + applied_index: region_read_progress.applied_index(), + latest_candidate: region_read_progress.pending_items().back().cloned(), + oldest_candidate: region_read_progress.pending_items().front().cloned(), + duration_to_last_consume_leader: region_read_progress + .last_instant_of_consume_leader() + .map(|i| i.saturating_elapsed().as_millis() as u64), + }; + } +} + +#[derive(Default)] +struct ResolverStats { + resolved_count: i64, + unresolved_count: i64, + heap_size: i64, +} + const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s impl RunnableWithTimer for Endpoint @@ -937,138 +1343,9 @@ where S: StoreRegionMeta, { fn on_timeout(&mut self) { - let store_id = self.get_or_init_store_id(); - let (mut oldest_ts, mut oldest_region, mut zero_ts_count) = (u64::MAX, 0, 0); - let (mut oldest_leader_ts, mut oldest_leader_region) = (u64::MAX, 0); - let (mut oldest_safe_ts, mut oldest_safe_ts_region) = (u64::MAX, 0); - let mut oldest_duration_to_last_update_ms = 0; - let mut oldest_duration_to_last_consume_leader_ms = 0; - self.region_read_progress.with(|registry| { - for (region_id, read_progress) in registry { - let safe_ts = read_progress.safe_ts(); - if safe_ts > 0 && safe_ts < oldest_safe_ts { - oldest_safe_ts = safe_ts; - oldest_safe_ts_region = *region_id; - } - - let (leader_info, leader_store_id) = read_progress.dump_leader_info(); - // this is maximum resolved-ts pushed to region_read_progress, namely candidates - // of safe_ts. 
It may not be the safe_ts yet - let ts = leader_info.get_read_state().get_safe_ts(); - if ts == 0 { - zero_ts_count += 1; - continue; - } - if ts < oldest_ts { - oldest_ts = ts; - oldest_region = *region_id; - // use -1 to denote none. - oldest_duration_to_last_update_ms = read_progress - .get_core() - .last_instant_of_consume_leader() - .map(|t| t.saturating_elapsed().as_millis() as i64) - .unwrap_or(-1); - oldest_duration_to_last_consume_leader_ms = read_progress - .get_core() - .last_instant_of_consume_leader() - .map(|t| t.saturating_elapsed().as_millis() as i64) - .unwrap_or(-1); - } - - if let (Some(store_id), Some(leader_store_id)) = (store_id, leader_store_id) { - if leader_store_id == store_id && ts < oldest_leader_ts { - oldest_leader_ts = ts; - oldest_leader_region = *region_id; - } - } - } - }); - let mut lock_heap_size = 0; - let (mut resolved_count, mut unresolved_count) = (0, 0); - for observe_region in self.regions.values() { - match &observe_region.resolver_status { - ResolverStatus::Pending { locks, .. } => { - for l in locks { - match l { - PendingLock::Track { key, .. } => lock_heap_size += key.len(), - PendingLock::Untrack { key, .. } => lock_heap_size += key.len(), - } - } - unresolved_count += 1; - } - ResolverStatus::Ready { .. } => { - lock_heap_size += observe_region.resolver.approximate_heap_bytes(); - resolved_count += 1; - } - } - } - // approximate a TSO from PD. It is better than local timestamp when clock skew - // exists. 
- let now: u64 = self - .advance_worker - .last_pd_tso - .try_lock() - .map(|opt| { - opt.map(|(pd_ts, instant)| { - pd_ts.physical() + instant.saturating_elapsed().as_millis() as u64 - }) - .unwrap_or_else(|| TimeStamp::physical_now()) - }) - .unwrap_or_else(|_| TimeStamp::physical_now()); - - RTS_MIN_SAFE_TS.set(oldest_safe_ts as i64); - RTS_MIN_SAFE_TS_REGION.set(oldest_safe_ts_region as i64); - let safe_ts_gap = now.saturating_sub(TimeStamp::from(oldest_safe_ts).physical()); - if safe_ts_gap - > self.cfg.advance_ts_interval.as_millis() - + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION.as_millis() as u64 - + SLOW_LOG_GRACE_PERIOD_MS - { - let mut lock_num = None; - let mut min_start_ts = None; - if let Some(ob) = self.regions.get(&oldest_safe_ts_region) { - min_start_ts = ob - .resolver - .locks() - .keys() - .next() - .cloned() - .map(TimeStamp::into_inner); - lock_num = Some(ob.resolver.num_locks()); - } - info!( - "the max gap of safe-ts is large"; - "gap" => safe_ts_gap, - "oldest_safe_ts" => ?oldest_safe_ts, - "region_id" => oldest_safe_ts_region, - "advance_ts_interval" => ?self.cfg.advance_ts_interval, - "lock_num" => lock_num, - "min_start_ts" => min_start_ts, - ); - } - RTS_MIN_SAFE_TS_GAP.set(safe_ts_gap as i64); - RTS_MIN_SAFE_TS_DUATION_TO_UPDATE_SAFE_TS.set(oldest_duration_to_last_update_ms); - RTS_MIN_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER - .set(oldest_duration_to_last_consume_leader_ms); - - RTS_MIN_RESOLVED_TS_REGION.set(oldest_region as i64); - RTS_MIN_RESOLVED_TS.set(oldest_ts as i64); - RTS_ZERO_RESOLVED_TS.set(zero_ts_count as i64); - RTS_MIN_RESOLVED_TS_GAP - .set(now.saturating_sub(TimeStamp::from(oldest_ts).physical()) as i64); - - RTS_MIN_LEADER_RESOLVED_TS_REGION.set(oldest_leader_region as i64); - RTS_MIN_LEADER_RESOLVED_TS.set(oldest_leader_ts as i64); - RTS_MIN_LEADER_RESOLVED_TS_GAP - .set(now.saturating_sub(TimeStamp::from(oldest_leader_ts).physical()) as i64); - - RTS_LOCK_HEAP_BYTES_GAUGE.set(lock_heap_size as i64); - 
RTS_REGION_RESOLVE_STATUS_GAUGE_VEC - .with_label_values(&["resolved"]) - .set(resolved_count as _); - RTS_REGION_RESOLVE_STATUS_GAUGE_VEC - .with_label_values(&["unresolved"]) - .set(unresolved_count as _); + let stats = self.collect_stats(); + self.update_metrics(&stats); + self.log_slow_regions(&stats); } fn get_interval(&self) -> Duration { diff --git a/components/resolved_ts/src/metrics.rs b/components/resolved_ts/src/metrics.rs index 74da743952c..02bb92f7887 100644 --- a/components/resolved_ts/src/metrics.rs +++ b/components/resolved_ts/src/metrics.rs @@ -38,7 +38,7 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_resolved_ts_gap_millis", - "The minimal (non-zero) resolved ts gap for observed regions" + "The gap between now() and the minimal (non-zero) resolved ts" ) .unwrap(); pub static ref RTS_RESOLVED_FAIL_ADVANCE_VEC: IntCounterVec = register_int_counter_vec!( @@ -69,29 +69,29 @@ lazy_static! { "The minimal (non-zero) resolved ts for observed regions" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_REGION: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_region", - "The region which has minimal safe ts" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_REGION: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_region", + "The region id of the follower that has minimal safe ts" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts", - "The minimal (non-zero) safe ts for observed regions" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts", + "The minimal (non-zero) safe ts for followers" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_GAP: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_gap_millis", - "The minimal (non-zero) safe ts gap for observed regions" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_GAP: IntGauge = 
register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_gap_millis", + "The gap between now() and the minimal (non-zero) safe ts for followers" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_DUATION_TO_UPDATE_SAFE_TS: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_duration_to_update_safe_ts", - "The duration since last update_safe_ts() called by resolved-ts routine. -1 denotes None." + pub static ref RTS_MIN_LEADER_DUATION_TO_LAST_UPDATE_SAFE_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_leader_min_resolved_ts_duration_to_last_update_safe_ts", + "The duration since last update_safe_ts() called by resolved-ts routine in the leader with min resolved ts. -1 denotes None." ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_duration_to_last_consume_leader", - "The duration since last check_leader(). -1 denotes None." + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_duration_to_last_consume_leader", + "The duration since last check_leader() in the follower region with min safe ts. -1 denotes None." ) .unwrap(); pub static ref RTS_ZERO_RESOLVED_TS: IntGauge = register_int_gauge!( @@ -125,7 +125,17 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS_REGION: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_leader_resolved_ts_region", - "The region which its leader peer has minimal resolved ts" + "The region whose leader peer has minimal resolved ts" + ) + .unwrap(); + pub static ref RTS_MIN_LEADER_RESOLVED_TS_REGION_MIN_LOCK_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_leader_resolved_ts_region_min_lock_ts", + "The minimal lock ts for the region whose leader peer has minimal resolved ts. 0 means no lock. -1 means no region found." 
+ ) + .unwrap(); + pub static ref CONCURRENCY_MANAGER_MIN_LOCK_TS: IntGauge = register_int_gauge!( + "tikv_concurrency_manager_min_lock_ts", + "The minimal lock ts in concurrency manager. 0 means no lock." ) .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS: IntGauge = register_int_gauge!( @@ -135,7 +145,29 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_leader_resolved_ts_gap_millis", - "The minimal (non-zero) resolved ts gap for observe leader peers" + "The gap between now() and the minimal (non-zero) resolved ts for leader peers" + ) + .unwrap(); + + // for min_follower_resolved_ts + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_REGION: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_region", + "The region id of the follower has minimal resolved ts" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts", + "The minimal (non-zero) resolved ts for follower regions" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_gap_millis", + "The max gap of now() and the minimal (non-zero) resolved ts for follower regions" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_duration_to_last_consume_leader", + "The duration since last check_leader() in the follower region with min resolved ts. -1 denotes None." 
) .unwrap(); pub static ref RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM: Histogram = register_histogram!( diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index ef257ad4762..e0814176a92 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -8,13 +8,46 @@ use tikv_util::{ memory::{HeapSize, MemoryQuota}, time::Instant, }; -use txn_types::TimeStamp; +use txn_types::{Key, TimeStamp}; use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; pub const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB +#[derive(Clone)] +pub enum TsSource { + // A lock in LOCK CF + Lock(Arc<[u8]>), + // A memory lock in concurrency manager + MemoryLock(Key), + PdTso, + // The following sources can also come from PD or memory lock, but we care more about sources + // in resolved-ts. + BackupStream, + Cdc, +} + +impl TsSource { + pub fn label(&self) -> &str { + match self { + TsSource::Lock(_) => "lock", + TsSource::MemoryLock(_) => "rts_cm_min_lock", + TsSource::PdTso => "pd_tso", + TsSource::BackupStream => "backup_stream", + TsSource::Cdc => "cdc", + } + } + + pub fn key(&self) -> Option { + match self { + TsSource::Lock(k) => Some(Key::from_encoded_slice(k)), + TsSource::MemoryLock(k) => Some(k.clone()), + _ => None, + } + } +} + // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. pub struct Resolver { @@ -22,7 +55,7 @@ pub struct Resolver { // key -> start_ts locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. - lock_ts_heap: BTreeMap>>, + pub(crate) lock_ts_heap: BTreeMap>>, // The last shrink time. last_aggressive_shrink_time: Instant, // The timestamps that guarantees no more commit will happen before. @@ -35,14 +68,42 @@ pub struct Resolver { min_ts: TimeStamp, // Whether the `Resolver` is stopped stopped: bool, - // The memory quota for the `Resolver` and its lock keys and timestamps. 
memory_quota: Arc, + // The last attempt of resolve(), used for diagnosis. + pub(crate) last_attempt: Option, +} + +#[derive(Clone)] +pub(crate) struct LastAttempt { + success: bool, + ts: TimeStamp, + reason: TsSource, +} + +impl slog::Value for LastAttempt { + fn serialize( + &self, + _record: &slog::Record<'_>, + key: slog::Key, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + serializer.emit_arguments( + key, + &format_args!( + "{{ success={}, ts={}, reason={}, key={:?} }}", + self.success, + self.ts, + self.reason.label(), + self.reason.key(), + ), + ) + } } impl std::fmt::Debug for Resolver { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let far_lock = self.lock_ts_heap.iter().next(); + let far_lock = self.oldest_transaction(); let mut dt = f.debug_tuple("Resolver"); dt.field(&format_args!("region={}", self.region_id)); @@ -103,6 +164,7 @@ impl Resolver { min_ts: TimeStamp::zero(), stopped: false, memory_quota, + last_attempt: None, } } @@ -252,7 +314,12 @@ impl Resolver { /// /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. - pub fn resolve(&mut self, min_ts: TimeStamp, now: Option) -> TimeStamp { + pub fn resolve( + &mut self, + min_ts: TimeStamp, + now: Option, + source: TsSource, + ) -> TimeStamp { // Use a small ratio to shrink the memory usage aggressively. const AGGRESSIVE_SHRINK_RATIO: usize = 2; const AGGRESSIVE_SHRINK_INTERVAL: Duration = Duration::from_secs(10); @@ -268,17 +335,36 @@ impl Resolver { } // Find the min start ts. - let min_lock = self.lock_ts_heap.keys().next().cloned(); + let min_lock = self + .oldest_transaction() + .and_then(|(ts, locks)| locks.iter().next().map(|lock| (*ts, lock))); let has_lock = min_lock.is_some(); - let min_start_ts = min_lock.unwrap_or(min_ts); + let min_start_ts = min_lock.map(|(ts, _)| ts).unwrap_or(min_ts); // No more commit happens before the ts. 
let new_resolved_ts = cmp::min(min_start_ts, min_ts); + // reason is the min source of the new resolved ts. + let reason = match (min_lock, min_ts) { + (Some(lock), min_ts) if lock.0 < min_ts => TsSource::Lock(lock.1.clone()), + (Some(_), _) => source, + (None, _) => source, + }; + if self.resolved_ts >= new_resolved_ts { - let label = if has_lock { "has_lock" } else { "stale_ts" }; RTS_RESOLVED_FAIL_ADVANCE_VEC - .with_label_values(&[label]) + .with_label_values(&[reason.label()]) .inc(); + self.last_attempt = Some(LastAttempt { + success: false, + ts: new_resolved_ts, + reason, + }); + } else { + self.last_attempt = Some(LastAttempt { + success: true, + ts: new_resolved_ts, + reason, + }) } // Resolved ts never decrease. @@ -335,6 +421,10 @@ impl Resolver { pub(crate) fn read_progress(&self) -> Option<&Arc> { self.read_progress.as_ref() } + + pub(crate) fn oldest_transaction(&self) -> Option<(&TimeStamp, &HashSet>)> { + self.lock_ts_heap.iter().next() + } } #[cfg(test)] @@ -419,7 +509,7 @@ mod tests { Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), Event::Resolve(min_ts, expect) => { assert_eq!( - resolver.resolve(min_ts.into(), None), + resolver.resolve(min_ts.into(), None, TsSource::PdTso), expect.into(), "case {}", i @@ -501,7 +591,7 @@ mod tests { // Trigger aggressive shrink. 
resolver.last_aggressive_shrink_time = Instant::now_coarse() - Duration::from_secs(600); - resolver.resolve(TimeStamp::new(0), None); + resolver.resolve(TimeStamp::new(0), None, TsSource::PdTso); assert!( resolver.locks_by_key.capacity() == 0, "{}, {}", diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index c78540c601a..ceed5c6314c 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -39068,7 +39068,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The gap between safe ts and current time", + "description": "The gap between now() and the minimal (non-zero) safe ts for followers", "editable": true, "error": false, "fieldConfig": { @@ -39119,7 +39119,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_safe_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "expr": "sum(tikv_resolved_ts_min_follower_safe_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -39132,7 +39132,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Max gap of safe-ts", + "title": "Max gap of follower safe-ts", "tooltip": { "msResolution": false, "shared": true, @@ -39292,7 +39292,7 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The region that has minimal safe ts", + "description": "The region id of the follower that has minimal safe ts", "editable": true, "error": false, "fieldConfig": { @@ -39348,7 +39348,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_safe_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by (instance)", + "expr": "sum(tikv_resolved_ts_min_follower_safe_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by 
(instance)", "format": "time_series", "hide": false, "interval": "", @@ -39362,7 +39362,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Min Safe TS Region", + "title": "Min Safe TS Follower Region", "tooltip": { "msResolution": false, "shared": true, From 1abc220dca85950a728c7be06f469870373fb463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 6 Sep 2023 14:48:43 +0800 Subject: [PATCH 033/220] coprocessor: add SQL statement tracing in tikv slow log (#15514) close tikv/tikv#15513 coprocessor: add SQL statement tracing in tikv slow log Signed-off-by: Chao Wang Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/coprocessor/tracker.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 71d84388c3b..bb32a3a0e03 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -264,8 +264,11 @@ impl Tracker { .unwrap_or_default() }); + let source_stmt = self.req_ctx.context.get_source_stmt(); with_tls_tracker(|tracker| { info!(#"slow_log", "slow-query"; + "connection_id" => source_stmt.get_connection_id(), + "session_alias" => source_stmt.get_session_alias(), "region_id" => &self.req_ctx.context.get_region_id(), "remote_host" => &self.req_ctx.peer, "total_lifetime" => ?self.req_lifetime, From fd896513d1c1bf274cf11acae1a09b6034b3c149 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 7 Sep 2023 15:00:44 +0800 Subject: [PATCH 034/220] engine_rocks: trace all memtables including pinned (#15547) close tikv/tikv#15546 Signed-off-by: Neil Shen --- components/engine_rocks/src/rocks_metrics.rs | 15 ++++++++++++--- components/engine_rocks/src/rocks_metrics_defs.rs | 1 + metrics/grafana/tikv_details.json | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 522696cb150..2b32af111ec 100644 --- 
a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -920,6 +920,7 @@ struct CfStats { blob_cache_size: Option, readers_mem: Option, mem_tables: Option, + mem_tables_all: Option, num_keys: Option, pending_compaction_bytes: Option, num_immutable_mem_table: Option, @@ -978,6 +979,9 @@ impl StatisticsReporter for RocksStatisticsReporter { if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { *cf_stats.mem_tables.get_or_insert_default() += v; } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables_all.get_or_insert_default() += v; + } // TODO: add cache usage and pinned usage. if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { *cf_stats.num_keys.get_or_insert_default() += v; @@ -1119,6 +1123,11 @@ impl StatisticsReporter for RocksStatisticsReporter { .with_label_values(&[&self.name, cf, "mem-tables"]) .set(v as i64); } + if let Some(v) = cf_stats.mem_tables_all { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables-all"]) + .set(v as i64); + } if let Some(v) = cf_stats.num_keys { STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC .with_label_values(&[&self.name, cf]) @@ -1538,9 +1547,9 @@ lazy_static! 
{ "Number of times titan blob file sync is done", &["db"] ).unwrap(); - pub static ref STORE_ENGINE_BLOB_FILE_SYNCED: SimpleEngineTickerMetrics = - auto_flush_from!(STORE_ENGINE_BLOB_FILE_SYNCED_VEC, SimpleEngineTickerMetrics); - + pub static ref STORE_ENGINE_BLOB_FILE_SYNCED: SimpleEngineTickerMetrics = + auto_flush_from!(STORE_ENGINE_BLOB_FILE_SYNCED_VEC, SimpleEngineTickerMetrics); + pub static ref STORE_ENGINE_BLOB_CACHE_EFFICIENCY_VEC: IntCounterVec = register_int_counter_vec!( "tikv_engine_blob_cache_efficiency", "Efficiency of titan's blob cache", diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index 042949f1c09..5bbc6245c72 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -5,6 +5,7 @@ use rocksdb::{DBStatisticsHistogramType as HistType, DBStatisticsTickerType as T pub const ROCKSDB_TOTAL_SST_FILES_SIZE: &str = "rocksdb.total-sst-files-size"; pub const ROCKSDB_TABLE_READERS_MEM: &str = "rocksdb.estimate-table-readers-mem"; pub const ROCKSDB_CUR_SIZE_ALL_MEM_TABLES: &str = "rocksdb.cur-size-all-mem-tables"; +pub const ROCKSDB_SIZE_ALL_MEM_TABLES: &str = "rocksdb.size-all-mem-tables"; pub const ROCKSDB_ESTIMATE_NUM_KEYS: &str = "rocksdb.estimate-num-keys"; pub const ROCKSDB_PENDING_COMPACTION_BYTES: &str = "rocksdb.\ estimate-pending-compaction-bytes"; diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index ceed5c6314c..c31ee12b27b 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -31941,7 +31941,7 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"mem-tables\"}) by (cf)", + "expr": "avg(tikv_engine_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", 
type=\"mem-tables-all\"}) by (cf)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cf}}", From 23c89b3fd2d0395d868b76deb0a0c820c3e48aab Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 7 Sep 2023 15:15:44 +0800 Subject: [PATCH 035/220] *: let alloc API return result (#15529) ref tikv/tikv#15412 MemoryQuota alloc API returns result, make it more ergonomic. Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../backup-stream/src/subscription_track.rs | 6 +-- components/cdc/src/channel.rs | 30 ++++++++----- components/cdc/src/delegate.rs | 14 ++---- components/cdc/src/endpoint.rs | 4 +- components/cdc/src/errors.rs | 3 +- components/cdc/src/initializer.rs | 4 +- components/resolved_ts/src/endpoint.rs | 36 +++++++--------- components/resolved_ts/src/errors.rs | 3 +- components/resolved_ts/src/resolver.rs | 26 +++++------ components/tikv_util/src/memory.rs | 43 +++++++++++-------- 10 files changed, 87 insertions(+), 82 deletions(-) diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index d6d49f0cf1c..4f44ec46853 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -480,7 +480,7 @@ impl TwoPhaseResolver { warn!("backup stream tracking lock as if in phase one"; "start_ts" => %start_ts, "key" => %utils::redact(&key)) } // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. - assert!(self.resolver.track_lock(start_ts, key, None)); + self.resolver.track_lock(start_ts, key, None).unwrap(); } pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec) { @@ -489,7 +489,7 @@ impl TwoPhaseResolver { return; } // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
- assert!(self.resolver.track_lock(start_ts, key, None)); + self.resolver.track_lock(start_ts, key, None).unwrap(); } pub fn untrack_lock(&mut self, key: &[u8]) { @@ -505,7 +505,7 @@ impl TwoPhaseResolver { match lock { FutureLock::Lock(key, ts) => { // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. - assert!(self.resolver.track_lock(ts, key, None)); + self.resolver.track_lock(ts, key, None).unwrap(); } FutureLock::Unlock(key) => self.resolver.untrack_lock(&key, None), } diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index a3ddeeb9030..b386c3561bb 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -14,7 +14,11 @@ use grpcio::WriteFlags; use kvproto::cdcpb::{ChangeDataEvent, Event, ResolvedTs}; use protobuf::Message; use tikv_util::{ - future::block_on_timeout, impl_display_as_debug, memory::MemoryQuota, time::Instant, warn, + future::block_on_timeout, + impl_display_as_debug, + memory::{MemoryQuota, MemoryQuotaExceeded}, + time::Instant, + warn, }; use crate::metrics::*; @@ -234,6 +238,12 @@ impl_from_future_send_error! { TrySendError<(CdcEvent, usize)>, } +impl From for SendError { + fn from(_: MemoryQuotaExceeded) -> Self { + SendError::Congested + } +} + #[derive(Clone)] pub struct Sink { unbounded_sender: UnboundedSender<(CdcEvent, usize)>, @@ -245,8 +255,8 @@ impl Sink { pub fn unbounded_send(&self, event: CdcEvent, force: bool) -> Result<(), SendError> { // Try it's best to send error events. 
let bytes = if !force { event.size() as usize } else { 0 }; - if bytes != 0 && !self.memory_quota.alloc(bytes) { - return Err(SendError::Congested); + if bytes != 0 { + self.memory_quota.alloc(bytes)?; } match self.unbounded_sender.unbounded_send((event, bytes)) { Ok(_) => Ok(()), @@ -265,9 +275,7 @@ impl Sink { let bytes = event.size(); total_bytes += bytes; } - if !self.memory_quota.alloc(total_bytes as _) { - return Err(SendError::Congested); - } + self.memory_quota.alloc(total_bytes as _)?; for event in events { let bytes = event.size() as usize; if let Err(e) = self.bounded_sender.feed((event, bytes)).await { @@ -570,9 +578,9 @@ mod tests { } } let memory_quota = rx.memory_quota.clone(); - assert_eq!(memory_quota.alloc(event.size() as _), false,); + memory_quota.alloc(event.size() as _).unwrap_err(); drop(rx); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); } // Make sure memory quota is freed when tx is dropped before rx. { @@ -587,10 +595,10 @@ mod tests { } } let memory_quota = rx.memory_quota.clone(); - assert_eq!(memory_quota.alloc(event.size() as _), false,); + memory_quota.alloc(event.size() as _).unwrap_err(); drop(send); drop(rx); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); } // Make sure sending message to a closed channel does not leak memory quota. { @@ -602,7 +610,7 @@ mod tests { send(CdcEvent::Event(e.clone())).unwrap_err(); } assert_eq!(memory_quota.in_use(), 0); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); // Freeing bytes should not cause overflow. 
memory_quota.free(1024); diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index f7125aa8882..c82c4cb6f13 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -249,9 +249,7 @@ impl Pending { fn push_pending_lock(&mut self, lock: PendingLock) -> Result<()> { let bytes = lock.heap_size(); - if !self.memory_quota.alloc(bytes) { - return Err(Error::MemoryQuotaExceeded); - } + self.memory_quota.alloc(bytes)?; self.locks.push(lock); self.pending_bytes += bytes; CDC_PENDING_BYTES_GAUGE.add(bytes as i64); @@ -260,16 +258,14 @@ impl Pending { fn on_region_ready(&mut self, resolver: &mut Resolver) -> Result<()> { fail::fail_point!("cdc_pending_on_region_ready", |_| Err( - Error::MemoryQuotaExceeded + Error::MemoryQuotaExceeded(tikv_util::memory::MemoryQuotaExceeded) )); // Must take locks, otherwise it may double free memory quota on drop. for lock in mem::take(&mut self.locks) { self.memory_quota.free(lock.heap_size()); match lock { PendingLock::Track { key, start_ts } => { - if !resolver.track_lock(start_ts, key, None) { - return Err(Error::MemoryQuotaExceeded); - } + resolver.track_lock(start_ts, key, None)?; } PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), } @@ -900,9 +896,7 @@ impl Delegate { // In order to compute resolved ts, we must track inflight txns. 
match self.resolver { Some(ref mut resolver) => { - if !resolver.track_lock(row.start_ts.into(), row.key.clone(), None) { - return Err(Error::MemoryQuotaExceeded); - } + resolver.track_lock(row.start_ts.into(), row.key.clone(), None)?; } None => { assert!(self.pending.is_some(), "region resolver not ready"); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 2b314f22443..a5f00a08028 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -2644,7 +2644,9 @@ mod tests { let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); let mut resolver = Resolver::new(id, memory_quota); - assert!(resolver.track_lock(TimeStamp::compose(0, id), vec![], None)); + resolver + .track_lock(TimeStamp::compose(0, id), vec![], None) + .unwrap(); let mut region = Region::default(); region.id = id; region.set_region_epoch(region_epoch); diff --git a/components/cdc/src/errors.rs b/components/cdc/src/errors.rs index e44c39e3999..e7bd7605e7d 100644 --- a/components/cdc/src/errors.rs +++ b/components/cdc/src/errors.rs @@ -10,6 +10,7 @@ use tikv::storage::{ mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, }; +use tikv_util::memory::MemoryQuotaExceeded; use txn_types::Error as TxnTypesError; use crate::channel::SendError; @@ -36,7 +37,7 @@ pub enum Error { #[error("Sink send error {0:?}")] Sink(#[from] SendError), #[error("Memory quota exceeded")] - MemoryQuotaExceeded, + MemoryQuotaExceeded(#[from] MemoryQuotaExceeded), } macro_rules! 
impl_from { diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index ef0b15caab9..31cda4b9e72 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -424,9 +424,7 @@ impl Initializer { let lock = Lock::parse(value)?; match lock.lock_type { LockType::Put | LockType::Delete => { - if !resolver.track_lock(lock.ts, key, None) { - return Err(Error::MemoryQuotaExceeded); - } + resolver.track_lock(lock.ts, key, None)?; } _ => (), }; diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index e2d2aec4f70..2a2f56eaadd 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -103,10 +103,10 @@ impl ResolverStatus { }; // Check if adding a new lock or unlock will exceed the memory // quota. - if !memory_quota.alloc(lock.heap_size()) { + memory_quota.alloc(lock.heap_size()).map_err(|e| { fail::fail_point!("resolved_ts_on_pending_locks_memory_quota_exceeded"); - return Err(Error::MemoryQuotaExceeded); - } + Error::MemoryQuotaExceeded(e) + })?; locks.push(lock); Ok(()) } @@ -292,13 +292,11 @@ impl ObserveRegion { for row in rows { match row { ChangeRow::Prewrite { key, start_ts, .. } => { - if !self.resolver.track_lock( + self.resolver.track_lock( *start_ts, key.to_raw().unwrap(), Some(*index), - ) { - return Err(Error::MemoryQuotaExceeded); - } + )?; } ChangeRow::Commit { key, .. 
} => self .resolver @@ -328,13 +326,11 @@ impl ObserveRegion { panic!("region {:?} resolver has ready", self.meta.id) } for (key, lock) in locks { - if !self.resolver.track_lock( + self.resolver.track_lock( lock.ts, key.to_raw().unwrap(), Some(apply_index), - ) { - return Err(Error::MemoryQuotaExceeded); - } + )?; } } ScanEntry::None => { @@ -347,13 +343,11 @@ impl ObserveRegion { for lock in pending_locks { match lock { PendingLock::Track { key, start_ts } => { - if !self.resolver.track_lock( + self.resolver.track_lock( start_ts, key.to_raw().unwrap(), Some(pending_tracked_index), - ) { - return Err(Error::MemoryQuotaExceeded); - } + )?; } PendingLock::Untrack { key, .. } => self .resolver @@ -924,7 +918,7 @@ where if let Err(e) = observe_region.track_change_log(&logs) { drop(observe_region); let backoff = match e { - Error::MemoryQuotaExceeded => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), + Error::MemoryQuotaExceeded(_) => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), Error::Other(_) => None, }; self.re_register_region(region_id, observe_id, e, backoff); @@ -947,13 +941,13 @@ where entries: Vec, apply_index: u64, ) { - let mut is_memory_quota_exceeded = false; + let mut memory_quota_exceeded = None; if let Some(observe_region) = self.regions.get_mut(®ion_id) { if observe_region.handle.id == observe_id { - if let Err(Error::MemoryQuotaExceeded) = + if let Err(Error::MemoryQuotaExceeded(e)) = observe_region.track_scan_locks(entries, apply_index) { - is_memory_quota_exceeded = true; + memory_quota_exceeded = Some(Error::MemoryQuotaExceeded(e)); } } } else { @@ -961,9 +955,9 @@ where "region_id" => region_id, "observe_id" => ?observe_id); } - if is_memory_quota_exceeded { + if let Some(e) = memory_quota_exceeded { let backoff = Some(MEMORY_QUOTA_EXCEEDED_BACKOFF); - self.re_register_region(region_id, observe_id, Error::MemoryQuotaExceeded, backoff); + self.re_register_region(region_id, observe_id, e, backoff); } } diff --git a/components/resolved_ts/src/errors.rs 
b/components/resolved_ts/src/errors.rs index b4a59a2c7a0..4e14c1d78d9 100644 --- a/components/resolved_ts/src/errors.rs +++ b/components/resolved_ts/src/errors.rs @@ -1,11 +1,12 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use thiserror::Error; +use tikv_util::memory::MemoryQuotaExceeded; #[derive(Debug, Error)] pub enum Error { #[error("Memory quota exceeded")] - MemoryQuotaExceeded, + MemoryQuotaExceeded(#[from] MemoryQuotaExceeded), #[error("Other error {0}")] Other(#[from] Box), } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index e0814176a92..9a62a0eea98 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -5,7 +5,7 @@ use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration}; use collections::{HashMap, HashSet}; use raftstore::store::RegionReadProgress; use tikv_util::{ - memory::{HeapSize, MemoryQuota}, + memory::{HeapSize, MemoryQuota, MemoryQuotaExceeded}, time::Instant, }; use txn_types::{Key, TimeStamp}; @@ -245,8 +245,12 @@ impl Resolver { } } - #[must_use] - pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) -> bool { + pub fn track_lock( + &mut self, + start_ts: TimeStamp, + key: Vec, + index: Option, + ) -> Result<(), MemoryQuotaExceeded> { if let Some(index) = index { self.update_tracked_index(index); } @@ -260,13 +264,11 @@ impl Resolver { "memory_capacity" => self.memory_quota.capacity(), "key_heap_size" => bytes, ); - if !self.memory_quota.alloc(bytes) { - return false; - } + self.memory_quota.alloc(bytes)?; let key: Arc<[u8]> = key.into_boxed_slice().into(); self.locks_by_key.insert(key.clone(), start_ts); self.lock_ts_heap.entry(start_ts).or_default().insert(key); - true + Ok(()) } pub fn untrack_lock(&mut self, key: &[u8], index: Option) { @@ -500,11 +502,9 @@ mod tests { for e in case.clone() { match e { Event::Lock(start_ts, key) => { - assert!(resolver.track_lock( - start_ts.into(), - 
key.into_raw().unwrap(), - None - )); + resolver + .track_lock(start_ts.into(), key.into_raw().unwrap(), None) + .unwrap(); } Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), Event::Resolve(min_ts, expect) => { @@ -527,7 +527,7 @@ mod tests { let mut key = vec![0; 77]; let lock_size = resolver.lock_heap_size(&key); let mut ts = TimeStamp::default(); - while resolver.track_lock(ts, key.clone(), None) { + while resolver.track_lock(ts, key.clone(), None).is_ok() { ts.incr(); key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); } diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 17b6b23cf78..291254c5227 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -75,16 +75,23 @@ impl HeapSize for RaftCmdRequest { } } +#[derive(Debug)] +pub struct MemoryQuotaExceeded; + +impl std::error::Error for MemoryQuotaExceeded {} + +impl_display_as_debug!(MemoryQuotaExceeded); + pub struct MemoryQuota { - capacity: AtomicUsize, in_use: AtomicUsize, + capacity: AtomicUsize, } impl MemoryQuota { pub fn new(capacity: usize) -> MemoryQuota { MemoryQuota { - capacity: AtomicUsize::new(capacity), in_use: AtomicUsize::new(0), + capacity: AtomicUsize::new(capacity), } } @@ -93,28 +100,28 @@ impl MemoryQuota { } pub fn capacity(&self) -> usize { - self.capacity.load(Ordering::Acquire) + self.capacity.load(Ordering::Relaxed) } pub fn set_capacity(&self, capacity: usize) { - self.capacity.store(capacity, Ordering::Release) + self.capacity.store(capacity, Ordering::Relaxed); } - pub fn alloc(&self, bytes: usize) -> bool { + pub fn alloc(&self, bytes: usize) -> Result<(), MemoryQuotaExceeded> { + let capacity = self.capacity.load(Ordering::Relaxed); let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); - let capacity = self.capacity.load(Ordering::Acquire); loop { if in_use_bytes + bytes > capacity { - return false; + return Err(MemoryQuotaExceeded); } let new_in_use_bytes = 
in_use_bytes + bytes; match self.in_use.compare_exchange_weak( in_use_bytes, new_in_use_bytes, - Ordering::Acquire, + Ordering::Relaxed, Ordering::Relaxed, ) { - Ok(_) => return true, + Ok(_) => return Ok(()), Err(current) => in_use_bytes = current, } } @@ -128,7 +135,7 @@ impl MemoryQuota { match self.in_use.compare_exchange_weak( in_use_bytes, new_in_use_bytes, - Ordering::Acquire, + Ordering::Relaxed, Ordering::Relaxed, ) { Ok(_) => return, @@ -145,13 +152,13 @@ mod tests { #[test] fn test_memory_quota() { let quota = MemoryQuota::new(100); - assert!(quota.alloc(10)); + quota.alloc(10).unwrap(); assert_eq!(quota.in_use(), 10); - assert!(!quota.alloc(100)); + quota.alloc(100).unwrap_err(); assert_eq!(quota.in_use(), 10); quota.free(5); assert_eq!(quota.in_use(), 5); - assert!(quota.alloc(95)); + quota.alloc(95).unwrap(); assert_eq!(quota.in_use(), 100); quota.free(95); assert_eq!(quota.in_use(), 5); @@ -160,19 +167,19 @@ mod tests { #[test] fn test_resize_memory_quota() { let quota = MemoryQuota::new(100); - assert!(quota.alloc(10)); + quota.alloc(10).unwrap(); assert_eq!(quota.in_use(), 10); - assert!(!quota.alloc(100)); + quota.alloc(100).unwrap_err(); assert_eq!(quota.in_use(), 10); quota.set_capacity(200); - assert!(quota.alloc(100)); + quota.alloc(100).unwrap(); assert_eq!(quota.in_use(), 110); quota.set_capacity(50); - assert!(!quota.alloc(100)); + quota.alloc(100).unwrap_err(); assert_eq!(quota.in_use(), 110); quota.free(100); assert_eq!(quota.in_use(), 10); - assert!(quota.alloc(40)); + quota.alloc(40).unwrap(); assert_eq!(quota.in_use(), 50); } } From 87d0f7cf143524222b4b0d80a4a8c5e02d11cf67 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:44:15 +0800 Subject: [PATCH 036/220] raftstore-v2: supplement read track metrics (#15508) ref tikv/tikv#15409 supplement read track metrics Signed-off-by: SpadeA-Tang Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- 
components/raftstore-v2/src/fsm/peer.rs | 21 ++++++++++++------- .../raftstore-v2/src/operation/query/local.rs | 4 ++++ .../cases/test_read_execution_tracker.rs | 15 +++++++------ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index f6b9217ecbf..d51d8eedb2a 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -9,7 +9,7 @@ use crossbeam::channel::TryRecvError; use encryption_export::DataKeyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; -use raftstore::store::{Config, TabletSnapManager, Transport}; +use raftstore::store::{Config, ReadCallback, TabletSnapManager, Transport}; use slog::{debug, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -17,6 +17,7 @@ use tikv_util::{ slog_panic, time::{duration_to_sec, Instant}, }; +use tracker::{TrackerToken, GLOBAL_TRACKERS}; use crate::{ batch::StoreContext, @@ -206,11 +207,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } #[inline] - fn on_receive_command(&self, send_time: Instant) { + fn on_receive_command(&self, send_time: Instant, read_token: Option) { + let propose_wait_time = send_time.saturating_elapsed(); self.store_ctx .raft_metrics .propose_wait_time - .observe(duration_to_sec(send_time.saturating_elapsed())); + .observe(duration_to_sec(propose_wait_time)); + if let Some(token) = read_token { + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.read_index_propose_wait_nanos = propose_wait_time.as_nanos() as u64; + }); + } } fn on_tick(&mut self, tick: PeerTick) { @@ -243,17 +250,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm.peer.on_raft_message(self.store_ctx, msg); } PeerMsg::RaftQuery(cmd) => { - self.on_receive_command(cmd.send_time); + self.on_receive_command(cmd.send_time, 
cmd.ch.read_tracker()); self.on_query(cmd.request, cmd.ch) } PeerMsg::AdminCommand(cmd) => { - self.on_receive_command(cmd.send_time); + self.on_receive_command(cmd.send_time, None); self.fsm .peer_mut() .on_admin_command(self.store_ctx, cmd.request, cmd.ch) } PeerMsg::SimpleWrite(write) => { - self.on_receive_command(write.send_time); + self.on_receive_command(write.send_time, None); self.fsm.peer_mut().on_simple_write( self.store_ctx, write.header, @@ -262,7 +269,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, ); } PeerMsg::UnsafeWrite(write) => { - self.on_receive_command(write.send_time); + self.on_receive_command(write.send_time, None); self.fsm .peer_mut() .on_unsafe_write(self.store_ctx, write.data); diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 36dbb26e4c7..2f074fdc04d 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -580,6 +580,10 @@ impl<'r> SnapRequestInspector<'r> { )); } + fail::fail_point!("perform_read_index", |_| Ok(ReadRequestPolicy::ReadIndex)); + + fail::fail_point!("perform_read_local", |_| Ok(ReadRequestPolicy::ReadLocal)); + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { return Ok(ReadRequestPolicy::StaleRead); diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs index c5ff93a70c1..7351044b297 100644 --- a/tests/failpoints/cases/test_read_execution_tracker.rs +++ b/tests/failpoints/cases/test_read_execution_tracker.rs @@ -2,13 +2,13 @@ use kvproto::kvrpcpb::*; use test_coprocessor::{init_with_data, DagSelect, ProductTable}; -use test_raftstore::{ - kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite, must_new_cluster_and_kv_client, -}; +use test_raftstore::{kv_batch_read, kv_read, must_kv_commit, 
must_kv_prewrite}; +use test_raftstore_macro::test_case; -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_read_execution_tracking() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k1, v1) = (b"k1".to_vec(), b"v1".to_vec()); let (k2, v2) = (b"k2".to_vec(), b"v2".to_vec()); @@ -104,18 +104,21 @@ fn test_read_execution_tracking() { ); }; - fail::cfg("perform_read_index", "return()").unwrap(); + // return read_index twich: one for local reader and one for raftstore + fail::cfg("perform_read_index", "2*return()").unwrap(); // should perform read index let resp = kv_read(&client, ctx.clone(), k1.clone(), 100); read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + fail::cfg("perform_read_index", "2*return()").unwrap(); // should perform read index let resp = kv_batch_read(&client, ctx, vec![k1, k2], 100); read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + fail::cfg("perform_read_index", "2*return()").unwrap(); // should perform read index let resp = client.coprocessor(&coprocessor_request).unwrap(); From 98eb383b41695b11a03e3d1ce471181f02bfc741 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 11 Sep 2023 17:06:14 +0800 Subject: [PATCH 037/220] raftstore-v2: fix chaos between on_memtable_sealed and on_flush_completed (#15543) close tikv/tikv#15534 fix chaos between on_memtable_sealed and on_flush_completed Signed-off-by: SpadeA-Tang --- Cargo.lock | 6 +- components/engine_rocks/src/event_listener.rs | 11 ++- components/engine_traits/src/flush.rs | 31 +++++-- tests/failpoints/cases/test_engine.rs | 88 ++++++++++++++++++- 4 files changed, 124 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4cd0882628b..7e09c3d2979 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3108,7 +3108,7 @@ dependencies = [ 
[[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#b68565569d711d78f8ae0d24e2d2b59f0fd03ef1" +source = "git+https://github.com/SpadeA-Tang/rust-rocksdb.git?branch=fix-sealed-chaos#f5121f48a1543c5d576ad7964c617f30f79a3d66" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3127,7 +3127,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#b68565569d711d78f8ae0d24e2d2b59f0fd03ef1" +source = "git+https://github.com/SpadeA-Tang/rust-rocksdb.git?branch=fix-sealed-chaos#f5121f48a1543c5d576ad7964c617f30f79a3d66" dependencies = [ "bzip2-sys", "cc", @@ -5101,7 +5101,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#b68565569d711d78f8ae0d24e2d2b59f0fd03ef1" +source = "git+https://github.com/SpadeA-Tang/rust-rocksdb.git?branch=fix-sealed-chaos#f5121f48a1543c5d576ad7964c617f30f79a3d66" dependencies = [ "libc 0.2.146", "librocksdb_sys", diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 9628c61c23f..03a40d005c8 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -194,8 +194,15 @@ impl rocksdb::EventListener for RocksPersistenceListener { fn on_memtable_sealed(&self, info: &MemTableInfo) { // Note: first_seqno is effectively the smallest seqno of memtable. // earliest_seqno has ambiguous semantics. 
- self.0 - .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); + self.0.on_memtable_sealed( + info.cf_name().to_string(), + info.first_seqno(), + info.largest_seqno(), + ); + } + + fn on_flush_begin(&self, _: &FlushJobInfo) { + fail::fail_point!("on_flush_begin"); } fn on_flush_completed(&self, job: &FlushJobInfo) { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 9344e84bb4e..8590236e126 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -18,14 +18,17 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, RwLock, }, + time::Duration, }; use kvproto::import_sstpb::SstMeta; -use slog_global::info; -use tikv_util::set_panic_mark; +use slog_global::{info, warn}; +use tikv_util::{set_panic_mark, time::Instant}; use crate::{data_cf_offset, RaftEngine, RaftLogBatch, DATA_CFS_LEN}; +const HEAVY_WORKER_THRESHOLD: Duration = Duration::from_millis(25); + #[derive(Debug)] pub struct ApplyProgress { cf: String, @@ -203,7 +206,11 @@ impl PersistenceListener { /// Called when memtable is frozen. /// /// `smallest_seqno` should be the smallest seqno of the memtable. - pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64) { + /// + /// Note: After https://github.com/tikv/rocksdb/pull/347, rocksdb global lock will + /// be held during this method, so we should avoid do heavy things in it. 
+ pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64, largest_seqno: u64) { + let t = Instant::now_coarse(); (|| { fail_point!("on_memtable_sealed", |t| { assert_eq!(t.unwrap().as_str(), cf); @@ -219,8 +226,9 @@ impl PersistenceListener { let flushed = prs.last_flushed[offset]; if flushed > smallest_seqno { panic!( - "sealed seqno has been flushed {} {} {} <= {}", - cf, apply_index, smallest_seqno, flushed + "sealed seqno conflict with latest flushed index, cf {}, + sealed smallest_seqno {}, sealed largest_seqno {}, last_flushed {}, apply_index {}", + cf, smallest_seqno, largest_seqno, flushed, apply_index, ); } prs.prs.push_back(ApplyProgress { @@ -228,6 +236,11 @@ impl PersistenceListener { apply_index, smallest_seqno, }); + if t.saturating_elapsed() > HEAVY_WORKER_THRESHOLD { + warn!( + "heavy work in on_memtable_sealed, the code should be reviewed"; + ); + } } /// Called a memtable finished flushing. @@ -244,7 +257,13 @@ impl PersistenceListener { if flushed >= largest_seqno { // According to facebook/rocksdb#11183, it's possible OnFlushCompleted can be // called out of order. But it's guaranteed files are installed in order. - info!("flush complete reorder found"; "flushed" => flushed, "largest_seqno" => largest_seqno, "file_no" => file_no, "cf" => cf); + info!( + "flush complete reorder found"; + "flushed" => flushed, + "largest_seqno" => largest_seqno, + "file_no" => file_no, + "cf" => cf + ); return; } prs.last_flushed[offset] = largest_seqno; diff --git a/tests/failpoints/cases/test_engine.rs b/tests/failpoints/cases/test_engine.rs index 93d1c96597b..073f7276419 100644 --- a/tests/failpoints/cases/test_engine.rs +++ b/tests/failpoints/cases/test_engine.rs @@ -1,6 +1,11 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; +use std::{ + sync::{mpsc::channel, Mutex}, + time::Duration, +}; + +use engine_traits::{MiscExt, CF_DEFAULT, CF_LOCK, CF_WRITE}; use tikv_util::config::ReadableSize; fn dummy_string(len: usize) -> String { @@ -51,3 +56,84 @@ fn test_write_buffer_manager() { cluster.must_put_cf(CF_WRITE, key.as_bytes(), dummy.as_bytes()); } } + +// The test mocks the senario before https://github.com/tikv/rocksdb/pull/347: +// note: before rocksdb/pull/347, lock is called before on_memtable_sealed. +// Case: +// Assume FlushMemtable cf1 (schedule flush task) and BackgroundCallFlush cf1 +// (execute flush task) are performed concurrently. +// t FlushMemtable cf1 BackgroundCallFlush cf1 +// 1. lock +// 2. convert memtable t2(seqno. 10-20) +// to immemtable +// 3. unlock +// 4. lock +// 5. pick memtables to flush: +// t1(0-10), t2(10-20) +// flush job(0-20) +// 6. finish flush +// 7. unlock +// 8. on_flush_completed: +// update last_flushed to 20 +// 9. on_memtable_sealed +// 10 > 20 *panic* +#[test] +fn test_rocksdb_listener() { + use test_raftstore_v2::*; + let count = 1; + let mut cluster = new_node_cluster(0, count); + // make flush thread num 1 to be easy to construct the case + cluster.cfg.rocksdb.max_background_flushes = 1; + cluster.run(); + + let r = cluster.get_region(b"k10"); + cluster.must_split(&r, b"k10"); + + for i in 0..20 { + let k = format!("k{:02}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + + let r1 = cluster.get_region(b"k00").get_id(); + let r2 = cluster.get_region(b"k15").get_id(); + + let engine = cluster.get_engine(1); + let tablet1 = engine.get_tablet_by_id(r1).unwrap(); + let tablet2 = engine.get_tablet_by_id(r2).unwrap(); + + fail::cfg("on_flush_begin", "1*pause").unwrap(); + tablet1.flush_cf("default", false).unwrap(); // call flush 1 + std::thread::sleep(Duration::from_secs(1)); + + tablet2.flush_cf("default", false).unwrap(); // call flush 2 + for i in 20..30 { + let k = format!("k{:02}", i); 
+ cluster.must_put(k.as_bytes(), b"val"); + } + fail::cfg("on_memtable_sealed", "pause").unwrap(); + + let h = std::thread::spawn(move || { + tablet2.flush_cf("default", true).unwrap(); + }); + + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_flush_completed", move || { + let _ = tx.lock().unwrap().send(true); // call flush 3 + }) + .unwrap(); + fail::remove("on_flush_begin"); + + let _ = rx.recv(); // flush 1 done + // Now, flush 1 has done, flush 3 is blocked at on_memtable_sealed. + // Before https://github.com/tikv/rocksdb/pull/347, unlock will be called + // before calling on_memtable_sealed, so flush 2 can pick the memtable sealed by + // flush 3 and thus make the order chaos. + // Now, unlock will not be called, so we have to remove failpoint to avoid + // deadlock. 2 seconds is long enough to make the test failed before + // rocksdb/pull/347. + std::thread::sleep(Duration::from_secs(2)); + fail::remove("on_memtable_sealed"); + + h.join().unwrap(); +} From 6f0d84e911a86837263b914e8b1ddba9a1da5232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 12 Sep 2023 08:49:39 +0800 Subject: [PATCH 038/220] sst_importer: don't cache rewritten files (#15502) close tikv/tikv#15483 The rewrite step of sst_importer::apply has been delayed to while iterating the file. 
Signed-off-by: hillium Co-authored-by: 3pointer --- components/sst_importer/src/sst_importer.rs | 42 +++--- .../tikv_util/src/codec/stream_event.rs | 109 ++++++++++++++-- src/import/sst_service.rs | 5 +- tests/integrations/import/mod.rs | 1 + tests/integrations/import/test_apply_log.rs | 72 ++++++++++ tests/integrations/import/util.rs | 123 +++++++++++++++++- 6 files changed, 322 insertions(+), 30 deletions(-) create mode 100644 tests/integrations/import/test_apply_log.rs diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 33f3c691a26..181f9d67b2f 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -642,7 +642,6 @@ impl SstImporter { async fn exec_download( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, ext_storage: Arc, speed_limiter: &Limiter, ) -> Result { @@ -690,9 +689,8 @@ impl SstImporter { .with_label_values(&["exec_download"]) .observe(start.saturating_elapsed().as_secs_f64()); - let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; Ok(LoadedFile { - content: Arc::from(rewrite_buff.into_boxed_slice()), + content: Arc::from(buff.into_boxed_slice()), permit, }) } @@ -700,7 +698,6 @@ impl SstImporter { pub async fn do_read_kv_file( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, ext_storage: Arc, speed_limiter: &Limiter, ) -> Result { @@ -741,7 +738,7 @@ impl SstImporter { } cache - .get_or_try_init(|| self.exec_download(meta, rewrite_rule, ext_storage, speed_limiter)) + .get_or_try_init(|| self.exec_download(meta, ext_storage, speed_limiter)) .await?; Ok(CacheKvFile::Mem(cache)) } @@ -814,7 +811,6 @@ impl SstImporter { pub async fn read_from_kv_file( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, ext_storage: Arc, backend: &StorageBackend, speed_limiter: &Limiter, @@ -823,7 +819,7 @@ impl SstImporter { self.do_download_kv_file(meta, backend, speed_limiter) .await? 
} else { - self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter) + self.do_read_kv_file(meta, ext_storage, speed_limiter) .await? }; match c { @@ -841,8 +837,7 @@ impl SstImporter { let mut buffer = Vec::new(); reader.read_to_end(&mut buffer)?; - let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; - Ok(Arc::from(rewrite_buff.into_boxed_slice())) + Ok(Arc::from(buffer.into_boxed_slice())) } } } @@ -940,7 +935,11 @@ impl SstImporter { // perform iteration and key rewrite. let mut new_buff = Vec::with_capacity(file_buff.len()); - let mut event_iter = EventIterator::new(file_buff.as_slice()); + let mut event_iter = EventIterator::with_rewriting( + file_buff.as_slice(), + rewrite_rule.get_old_key_prefix(), + rewrite_rule.get_new_key_prefix(), + ); let mut key = new_prefix.to_vec(); let new_prefix_data_key_len = key.len(); @@ -983,9 +982,14 @@ impl SstImporter { start_ts: u64, restore_ts: u64, file_buff: Arc<[u8]>, + rewrite_rule: &RewriteRule, mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { - let mut event_iter = EventIterator::new(file_buff.as_ref()); + let mut event_iter = EventIterator::with_rewriting( + file_buff.as_ref(), + rewrite_rule.get_old_key_prefix(), + rewrite_rule.get_new_key_prefix(), + ); let mut smallest_key = None; let mut largest_key = None; let mut total_key = 0; @@ -1001,6 +1005,16 @@ impl SstImporter { event_iter.next()?; INPORTER_APPLY_COUNT.with_label_values(&["key_meet"]).inc(); + if !event_iter + .key() + .starts_with(rewrite_rule.get_new_key_prefix()) + { + return Err(Error::WrongKeyPrefix { + what: "do_apply_kv_file", + key: event_iter.key().to_vec(), + prefix: rewrite_rule.get_old_key_prefix().to_vec(), + }); + } let key = event_iter.key().to_vec(); let value = event_iter.value().to_vec(); let ts = Key::decode_ts_from(&key)?; @@ -1028,7 +1042,7 @@ impl SstImporter { largest_key = largest_key .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.max(key.clone()))); } - if total_key != not_in_range { + if 
not_in_range != 0 || ts_not_expected != 0 { info!("build download request file done"; "total_keys" => %total_key, "ts_filtered_keys" => %ts_not_expected, @@ -2050,10 +2064,8 @@ mod tests { }; // test do_read_kv_file() - let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); let output = block_on_external_io(importer.do_read_kv_file( &kv_meta, - rewrite_rule, ext_storage, &Limiter::new(f64::INFINITY), )) @@ -2163,7 +2175,6 @@ mod tests { }; let importer = SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1, false).unwrap(); - let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); let ext_storage = { importer.wrap_kms( importer.external_storage_or_cache(&backend, "").unwrap(), @@ -2181,7 +2192,6 @@ mod tests { assert!(importer.import_support_download()); let output = block_on_external_io(importer.read_from_kv_file( &kv_meta, - rewrite_rule, ext_storage, &backend, &Limiter::new(f64::INFINITY), diff --git a/components/tikv_util/src/codec/stream_event.rs b/components/tikv_util/src/codec/stream_event.rs index 5b00cad6372..3c1a04f77e3 100644 --- a/components/tikv_util/src/codec/stream_event.rs +++ b/components/tikv_util/src/codec/stream_event.rs @@ -6,6 +6,13 @@ use bytes::{Buf, Bytes}; use crate::{codec::Result, Either}; +// Note: maybe allow them to be different lifetime. +// But not necessary for now, so keep it simple...? 
+pub struct Rewrite<'a> { + from: &'a [u8], + to: &'a [u8], +} + pub trait Iterator { fn next(&mut self) -> Result<()>; @@ -19,10 +26,12 @@ pub trait Iterator { pub struct EventIterator<'a> { buf: &'a [u8], offset: usize, - key_offset: usize, value_offset: usize, - key_len: usize, value_len: usize, + + key_buf: Vec, + + rewrite_rule: Option>, } impl EventIterator<'_> { @@ -30,10 +39,21 @@ impl EventIterator<'_> { EventIterator { buf, offset: 0, - key_offset: 0, - key_len: 0, + key_buf: vec![], value_offset: 0, value_len: 0, + rewrite_rule: None, + } + } + + pub fn with_rewriting<'a>(buf: &'a [u8], from: &'a [u8], to: &'a [u8]) -> EventIterator<'a> { + EventIterator { + buf, + offset: 0, + key_buf: vec![], + value_offset: 0, + value_len: 0, + rewrite_rule: Some(Rewrite { from, to }), } } @@ -42,14 +62,47 @@ impl EventIterator<'_> { self.offset += 4; result } + + fn consume_key_with_len(&mut self, key_len: usize) { + self.key_buf.clear(); + self.key_buf.reserve(key_len); + self.key_buf + .extend_from_slice(&self.buf[self.offset..self.offset + key_len]); + self.offset += key_len; + } + + fn move_to_next_key_with_rewrite(&mut self) { + let key_len = self.get_size() as usize; + let rewrite = self.rewrite_rule.as_ref().expect("rewrite rule not set"); + if key_len < rewrite.from.len() + || &self.buf[self.offset..self.offset + rewrite.from.len()] != rewrite.from + { + self.consume_key_with_len(key_len); + return; + } + self.key_buf.clear(); + self.key_buf + .reserve(rewrite.to.len() + key_len - rewrite.from.len()); + self.key_buf.extend_from_slice(rewrite.to); + self.key_buf + .extend_from_slice(&self.buf[self.offset + rewrite.from.len()..self.offset + key_len]); + self.offset += key_len; + } + + fn fetch_key_buffer_and_move_to_value(&mut self) { + if self.rewrite_rule.is_some() { + self.move_to_next_key_with_rewrite() + } else { + let key_len = self.get_size() as usize; + self.consume_key_with_len(key_len); + } + } } impl Iterator for EventIterator<'_> { fn next(&mut 
self) -> Result<()> { if self.valid() { - self.key_len = self.get_size() as usize; - self.key_offset = self.offset; - self.offset += self.key_len; + self.fetch_key_buffer_and_move_to_value(); self.value_len = self.get_size() as usize; self.value_offset = self.offset; @@ -63,7 +116,7 @@ impl Iterator for EventIterator<'_> { } fn key(&self) -> &[u8] { - &self.buf[self.key_offset..self.key_offset + self.key_len] + &self.key_buf[..] } fn value(&self) -> &[u8] { @@ -155,4 +208,44 @@ mod tests { } assert_eq!(count, index); } + + #[test] + fn test_rewrite() { + let mut rng = rand::thread_rng(); + let mut event = vec![]; + let mut keys = vec![]; + let mut vals = vec![]; + let count = 20; + + for _i in 0..count { + let should_rewrite = rng.gen::(); + let mut key: Vec = std::iter::once(if should_rewrite { b'k' } else { b'l' }) + .chain((0..100).map(|_| rng.gen_range(0..255))) + .collect(); + let val: Vec = (0..100).map(|_| rng.gen_range(0..255)).collect(); + let e = EventEncoder::encode_event(&key, &val); + for s in e { + event.extend_from_slice(s.as_ref()); + } + if should_rewrite { + key[0] = b'r'; + } + keys.push(key); + vals.push(val); + } + + let mut iter = EventIterator::with_rewriting(&event, b"k", b"r"); + + let mut index = 0_usize; + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + assert_eq!(iter.key(), keys[index]); + assert_eq!(iter.value(), vals[index]); + index += 1; + } + assert_eq!(count, index); + } } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 0c81873c130..6d40ffe959c 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -165,6 +165,9 @@ impl RequestCollector { } fn accept_kv(&mut self, cf: &str, is_delete: bool, k: Vec, v: Vec) { + debug!("Accepting KV."; "cf" => %cf, + "key" => %log_wrappers::Value::key(&k), + "value" => %log_wrappers::Value::key(&v)); // Need to skip the empty key/value that could break the transaction or cause // data corruption. 
see details at https://github.com/pingcap/tiflow/issues/5468. if k.is_empty() || (!is_delete && v.is_empty()) { @@ -567,7 +570,6 @@ impl ImportSstService { let buff = importer .read_from_kv_file( meta, - rule, ext_storage.clone(), req.get_storage_backend(), &limiter, @@ -579,6 +581,7 @@ impl ImportSstService { meta.get_start_ts(), meta.get_restore_ts(), buff, + rule, |k, v| collector.accept_kv(meta.get_cf(), meta.get_is_delete(), k, v), )? { if let Some(range) = range.as_mut() { diff --git a/tests/integrations/import/mod.rs b/tests/integrations/import/mod.rs index 96e2c655e18..4de0fa26472 100644 --- a/tests/integrations/import/mod.rs +++ b/tests/integrations/import/mod.rs @@ -1,4 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. +mod test_apply_log; mod test_sst_service; mod util; diff --git a/tests/integrations/import/test_apply_log.rs b/tests/integrations/import/test_apply_log.rs new file mode 100644 index 00000000000..3d8cf85b02c --- /dev/null +++ b/tests/integrations/import/test_apply_log.rs @@ -0,0 +1,72 @@ +use engine_traits::CF_DEFAULT; +use external_storage_export::LocalStorage; +use kvproto::import_sstpb::ApplyRequest; +use tempfile::TempDir; + +use crate::import::util; + +#[test] +fn test_basic_apply() { + let (_cluster, ctx, tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [ + (b"k1", b"v1", 1), + (b"k2", b"v2", 2), + (b"k3", b"v3", 3), + (b"k4", b"v4", 4), + ]; + let default_rewritten = [(b"r1", b"v1", 1), (b"r2", b"v2", 2), (b"r3", b"v3", 3)]; + let mut sst_meta = util::make_plain_file(&storage, "file1.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k3a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx.clone()); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta].into()); + 
req.set_storage_backend(util::local_storage(&tmp)); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf(&tikv, &ctx, CF_DEFAULT, default_rewritten.into_iter()); +} + +#[test] +fn test_apply_twice() { + let (_cluster, ctx, tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [( + b"k1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + let default_fst = [( + b"r1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + let default_snd = [( + b"z1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + + let mut sst_meta = util::make_plain_file(&storage, "file2.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k1a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx.clone()); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta.clone()].into()); + req.set_storage_backend(util::local_storage(&tmp)); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf(&tikv, &ctx, CF_DEFAULT, default_fst.into_iter()); + + util::register_range_for(&mut sst_meta, b"k1", b"k1a"); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"z")].into()); + req.set_metas(vec![sst_meta].into()); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf( + &tikv, + &ctx, + CF_DEFAULT, + default_fst.into_iter().chain(default_snd.into_iter()), + ); +} diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs index cc5d22d517d..d8a11d50746 100644 --- a/tests/integrations/import/util.rs +++ b/tests/integrations/import/util.rs @@ -1,16 +1,31 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, thread, time::Duration}; - +use std::{ + io::{Cursor, Write}, + sync::Arc, + thread, + time::Duration, +}; + +use collections::HashMap; use engine_rocks::RocksEngine; -use futures::{executor::block_on, stream, SinkExt}; +use engine_traits::CF_DEFAULT; +use external_storage_export::{ExternalStorage, UnpinReader}; +use futures::{executor::block_on, io::Cursor as AsyncCursor, stream, SinkExt}; use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; -use kvproto::{import_sstpb::*, kvrpcpb::*, tikvpb::*}; +use kvproto::{ + brpb::{Local, StorageBackend}, + import_sstpb::{KvMeta, *}, + kvrpcpb::*, + tikvpb::*, +}; use security::SecurityConfig; +use tempfile::TempDir; use test_raftstore::*; use test_raftstore_v2::{Cluster as ClusterV2, ServerCluster as ServerClusterV2}; use tikv::config::TikvConfig; -use tikv_util::HandyRwLock; +use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io, HandyRwLock}; +use txn_types::Key; use uuid::Uuid; const CLEANUP_SST_MILLIS: u64 = 10; @@ -246,6 +261,40 @@ pub fn check_ingested_kvs_cf(tikv: &TikvClient, ctx: &Context, cf: &str, sst_ran } } +#[track_caller] +pub fn check_applied_kvs_cf, V: AsRef<[u8]> + std::fmt::Debug>( + tikv: &TikvClient, + ctx: &Context, + cf: &str, + entries: impl Iterator, +) { + let mut get = RawBatchGetRequest::default(); + get.set_cf(cf.to_owned()); + get.set_context(ctx.clone()); + let mut keymap = HashMap::default(); + for (key, value, ts) in entries { + let the_key = Key::from_raw(key.as_ref()) + .append_ts(ts.into()) + .into_encoded(); + keymap.insert(the_key.clone(), value); + get.mut_keys().push(the_key); + } + for pair in tikv.raw_batch_get(&get).unwrap().get_pairs() { + let entry = keymap.remove(pair.get_key()).expect("unexpected key"); + assert_eq!( + entry.as_ref(), + pair.get_value(), + "key is {:?}", + pair.get_key() + ); + } + assert!( + keymap.is_empty(), + "not all keys consumed, remained {:?}", + keymap + ); +} + pub fn check_ingested_txn_kvs( 
tikv: &TikvClient, ctx: &Context, @@ -273,3 +322,67 @@ pub fn check_sst_deleted(client: &ImportSstClient, meta: &SstMeta, data: &[u8]) } send_upload_sst(client, meta, data).unwrap(); } + +pub fn make_plain_file(storage: &dyn ExternalStorage, name: &str, kvs: I) -> KvMeta +where + I: Iterator, + K: AsRef<[u8]>, + V: AsRef<[u8]>, +{ + let mut buf = vec![]; + let mut file = Cursor::new(&mut buf); + let mut start_ts: Option = None; + for (key, value, ts) in kvs { + let the_key = Key::from_raw(key.as_ref()) + .append_ts(ts.into()) + .into_encoded(); + start_ts = Some(start_ts.map_or(ts, |ts0| ts0.min(ts))); + for segment in EventEncoder::encode_event(&the_key, value.as_ref()) { + file.write_all(segment.as_ref()).unwrap(); + } + } + file.flush().unwrap(); + let len = buf.len() as u64; + block_on_external_io(storage.write(name, UnpinReader(Box::new(AsyncCursor::new(buf))), len)) + .unwrap(); + let mut meta = KvMeta::new(); + meta.set_start_ts(start_ts.unwrap_or_default()); + meta.set_length(len); + meta.set_restore_ts(u64::MAX); + meta.set_compression_type(kvproto::brpb::CompressionType::Unknown); + meta.set_name(name.to_owned()); + meta.set_cf(CF_DEFAULT.to_owned()); + meta +} + +pub fn rewrite_for(meta: &mut KvMeta, old_prefix: &[u8], new_prefix: &[u8]) -> RewriteRule { + assert_eq!(old_prefix.len(), new_prefix.len()); + fn rewrite(key: &mut Vec, old_prefix: &[u8], new_prefix: &[u8]) { + assert!(key.starts_with(old_prefix)); + let len = old_prefix.len(); + key.splice(..len, new_prefix.iter().cloned()); + } + rewrite(meta.mut_start_key(), old_prefix, new_prefix); + rewrite(meta.mut_end_key(), old_prefix, new_prefix); + let mut rule = RewriteRule::default(); + rule.set_old_key_prefix(old_prefix.to_vec()); + rule.set_new_key_prefix(new_prefix.to_vec()); + rule +} + +pub fn register_range_for(meta: &mut KvMeta, start: &[u8], end: &[u8]) { + let start = Key::from_raw(start); + let end = Key::from_raw(end); + meta.set_start_key(start.into_encoded()); + 
meta.set_end_key(end.into_encoded()); +} + +pub fn local_storage(tmp: &TempDir) -> StorageBackend { + let mut backend = StorageBackend::default(); + backend.set_local({ + let mut local = Local::default(); + local.set_path(tmp.path().to_str().unwrap().to_owned()); + local + }); + backend +} From d830a58335839fe02434727f2d8b252a02ba386d Mon Sep 17 00:00:00 2001 From: lucasliang Date: Tue, 12 Sep 2023 18:04:41 +0800 Subject: [PATCH 039/220] [Dynamic Region] Supply extra test cases for `gc`. (#15544) ref tikv/tikv#15409 Supply extra test cases, including integration tests and unit tests for raftstore-v2 on `gc`. Signed-off-by: lucasliang --- tests/failpoints/cases/test_gc_worker.rs | 105 +++++++++++------------ tests/integrations/server/gc_worker.rs | 18 ++-- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index d24ec85f040..50b71b59f47 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -14,9 +14,10 @@ use raftstore::coprocessor::{ RegionInfo, RegionInfoCallback, RegionInfoProvider, Result as CopResult, SeekRegionCallback, }; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::{ server::gc_worker::{ - AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner, + sync_gc, AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner, }, storage::{ kv::TestEngineBuilder, @@ -61,11 +62,38 @@ fn test_error_in_compaction_filter() { fail::remove(fp); } +#[derive(Clone)] +struct MockSafePointProvider; +impl GcSafePointProvider for MockSafePointProvider { + fn get_safe_point(&self) -> GcWorkerResult { + Ok(TimeStamp::from(0)) + } +} + +#[derive(Clone)] +struct MockRegionInfoProvider; +impl RegionInfoProvider for MockRegionInfoProvider { + fn seek_region(&self, _: &[u8], _: SeekRegionCallback) -> CopResult<()> { + Ok(()) + } + fn find_region_by_id( + &self, + _: u64, 
+ _: RegionInfoCallback>, + ) -> CopResult<()> { + Ok(()) + } + fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> CopResult> { + Ok(vec![]) + } +} + // Test GC worker can receive and handle orphan versions emit from write CF's // compaction filter correctly. -#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster)] fn test_orphan_versions_from_compaction_filter() { - let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { + let (cluster, leader, ctx) = new_cluster(|cluster| { cluster.cfg.gc.enable_compaction_filter = true; cluster.cfg.gc.compaction_filter_skip_version_check = true; cluster.pd_client.disable_default_operator(); @@ -76,8 +104,20 @@ fn test_orphan_versions_from_compaction_filter() { let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader_store)); let client = TikvClient::new(channel); - init_compaction_filter(&cluster, leader_store); - let engine = cluster.engines.get(&leader_store).unwrap(); + // Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine + // compaction filter and GC worker so that GC worker can help to process orphan + // versions on default CF. 
+ { + let sim = cluster.sim.rl(); + let gc_worker = sim.get_gc_worker(leader_store); + gc_worker + .start_auto_gc( + AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1), + Arc::new(AtomicU64::new(0)), + ) + .unwrap(); + } + let engine = cluster.get_engine(leader_store); let pk = b"k1".to_vec(); let large_value = vec![b'x'; 300]; @@ -91,22 +131,23 @@ fn test_orphan_versions_from_compaction_filter() { if start_ts < 40 { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_some()); + assert!(engine.get_value(&key).unwrap().is_some()); } } let fp = "write_compaction_filter_flush_write_batch"; fail::cfg(fp, "return").unwrap(); - let mut gc_runner = TestGcRunner::new(100); - gc_runner.gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler(); - gc_runner.gc(&engine.kv); + let gc_safe_ponit = TimeStamp::from(100); + let gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler(); + let region = cluster.get_region(&pk); + sync_gc(&gc_scheduler, region, gc_safe_ponit).unwrap(); 'IterKeys: for &start_ts in &[10, 20, 30] { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); for _ in 0..100 { - if engine.kv.get_value(&key).unwrap().is_some() { + if engine.get_value(&key).unwrap().is_some() { thread::sleep(Duration::from_millis(20)); continue; } @@ -117,47 +158,3 @@ fn test_orphan_versions_from_compaction_filter() { fail::remove(fp); } - -// Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine -// compaction filter and GC worker so that GC worker can help to process orphan -// versions on default CF. 
-fn init_compaction_filter(cluster: &Cluster, store_id: u64) { - #[derive(Clone)] - struct MockSafePointProvider; - impl GcSafePointProvider for MockSafePointProvider { - fn get_safe_point(&self) -> GcWorkerResult { - Ok(TimeStamp::from(0)) - } - } - - #[derive(Clone)] - struct MockRegionInfoProvider; - impl RegionInfoProvider for MockRegionInfoProvider { - fn seek_region(&self, _: &[u8], _: SeekRegionCallback) -> CopResult<()> { - Ok(()) - } - fn find_region_by_id( - &self, - _: u64, - _: RegionInfoCallback>, - ) -> CopResult<()> { - Ok(()) - } - fn get_regions_in_range( - &self, - _start_key: &[u8], - _end_key: &[u8], - ) -> CopResult> { - Ok(vec![]) - } - } - - let sim = cluster.sim.rl(); - let gc_worker = sim.get_gc_worker(store_id); - gc_worker - .start_auto_gc( - AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1), - Arc::new(AtomicU64::new(0)), - ) - .unwrap(); -} diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index cfadde84405..238102df6b6 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -7,15 +7,17 @@ use grpcio::{ChannelBuilder, Environment}; use keys::data_key; use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::server::gc_worker::sync_gc; use tikv_util::HandyRwLock; use txn_types::Key; // Since v5.0 GC bypasses Raft, which means GC scans/deletes records with // `keys::DATA_PREFIX`. This case ensures it's performed correctly. 
-#[test] +#[test_case(test_raftstore::must_new_cluster_mul)] +#[test_case(test_raftstore_v2::must_new_cluster_mul)] fn test_gc_bypass_raft() { - let (cluster, leader, ctx) = must_new_cluster_mul(2); + let (cluster, leader, ctx) = new_cluster(2); cluster.pd_client.disable_default_operator(); let env = Arc::new(Environment::new(1)); @@ -25,7 +27,7 @@ fn test_gc_bypass_raft() { let pk = b"k1".to_vec(); let value = vec![b'x'; 300]; - let engine = cluster.engines.get(&leader_store).unwrap(); + let engine = cluster.get_engine(leader_store); for &start_ts in &[10, 20, 30, 40] { let commit_ts = start_ts + 5; @@ -37,11 +39,11 @@ fn test_gc_bypass_raft() { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_some()); + assert!(engine.get_value(&key).unwrap().is_some()); let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_some()); + assert!(engine.get_value_cf(CF_WRITE, &key).unwrap().is_some()); } let node_ids = cluster.get_node_ids(); @@ -53,16 +55,16 @@ fn test_gc_bypass_raft() { region.set_end_key(b"k2".to_vec()); sync_gc(&gc_sched, region, 200.into()).unwrap(); - let engine = cluster.engines.get(&store_id).unwrap(); + let engine = cluster.get_engine(store_id); for &start_ts in &[10, 20, 30] { let commit_ts = start_ts + 5; let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_none()); + assert!(engine.get_value(&key).unwrap().is_none()); let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_none()); + assert!(engine.get_value_cf(CF_WRITE, &key).unwrap().is_none()); } } } From db0304e65045fdc6701e8fe0db80416a0210e412 Mon Sep 17 00:00:00 2001 From: Spade A 
<71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 13 Sep 2023 07:43:38 +0800 Subject: [PATCH 040/220] *: update cargo.lock (#15573) close tikv/tikv#15579 update cargo.lock Signed-off-by: SpadeA-Tang --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e09c3d2979..fb5e711d34d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3108,7 +3108,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/SpadeA-Tang/rust-rocksdb.git?branch=fix-sealed-chaos#f5121f48a1543c5d576ad7964c617f30f79a3d66" +source = "git+https://github.com/tikv/rust-rocksdb.git#fc38a5b427e6c9b351f835c641e2ee95b8ff8306" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3127,7 +3127,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/SpadeA-Tang/rust-rocksdb.git?branch=fix-sealed-chaos#f5121f48a1543c5d576ad7964c617f30f79a3d66" +source = "git+https://github.com/tikv/rust-rocksdb.git#fc38a5b427e6c9b351f835c641e2ee95b8ff8306" dependencies = [ "bzip2-sys", "cc", @@ -5101,7 +5101,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/SpadeA-Tang/rust-rocksdb.git?branch=fix-sealed-chaos#f5121f48a1543c5d576ad7964c617f30f79a3d66" +source = "git+https://github.com/tikv/rust-rocksdb.git#fc38a5b427e6c9b351f835c641e2ee95b8ff8306" dependencies = [ "libc 0.2.146", "librocksdb_sys", From d5d89ba60b07e508e4073b5460b192680c272213 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Wed, 13 Sep 2023 14:10:38 +0800 Subject: [PATCH 041/220] coprocessor: use the deadline in kvrpcpb::Context (#15564) close tikv/tikv#15565 Signed-off-by: lance6716 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/coprocessor/mod.rs | 46 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/coprocessor/mod.rs 
b/src/coprocessor/mod.rs index 140d3c0476e..fcd16f9b947 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -159,7 +159,11 @@ impl ReqContext { cache_match_version: Option, perf_level: PerfLevel, ) -> Self { - let deadline = Deadline::from_now(max_handle_duration); + let mut deadline_duration = max_handle_duration; + if context.max_execution_duration_ms > 0 { + deadline_duration = Duration::from_millis(context.max_execution_duration_ms); + } + let deadline = Deadline::from_now(deadline_duration); let bypass_locks = TsSet::from_u64s(context.take_resolved_locks()); let access_locks = TsSet::from_u64s(context.take_committed_locks()); let lower_bound = match ranges.first().as_ref() { @@ -235,6 +239,23 @@ lazy_static! { mod tests { use super::*; + fn default_req_ctx_with_ctx_duration( + context: kvrpcpb::Context, + max_handle_duration: Duration, + ) -> ReqContext { + ReqContext::new( + ReqTag::test, + context, + Vec::new(), + max_handle_duration, + None, + None, + TimeStamp::max(), + None, + PerfLevel::EnableCount, + ) + } + #[test] fn test_build_task_id() { let mut ctx = ReqContext::default_for_test(); @@ -246,4 +267,27 @@ mod tests { ctx.context.set_task_id(0); assert_eq!(ctx.build_task_id(), start_ts); } + + #[test] + fn test_deadline_from_req_ctx() { + let ctx = kvrpcpb::Context::default(); + let max_handle_duration = Duration::from_millis(100); + let req_ctx = default_req_ctx_with_ctx_duration(ctx, max_handle_duration); + // sleep at least 100ms + std::thread::sleep(Duration::from_millis(200)); + req_ctx + .deadline + .check() + .expect_err("deadline should exceed"); + + let mut ctx = kvrpcpb::Context::default(); + ctx.max_execution_duration_ms = 100_000; + let req_ctx = default_req_ctx_with_ctx_duration(ctx, max_handle_duration); + // sleep at least 100ms + std::thread::sleep(Duration::from_millis(200)); + req_ctx + .deadline + .check() + .expect("deadline should not exceed"); + } } From b75f55901e5defd5c87a10de2ca7088749c16b7f Mon Sep 17 00:00:00 
2001 From: YangKeao Date: Wed, 13 Sep 2023 17:19:38 +0800 Subject: [PATCH 042/220] tidb_query_datatype,collation: remove utf8mb4_0900_bin from need_restored_data (#15572) close tikv/tikv#15571 Signed-off-by: Yang Keao --- .../tidb_query_datatype/src/def/field_type.rs | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index 06f4454b36d..8a56ac5ac68 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -140,7 +140,10 @@ impl Collation { } pub fn is_bin_collation(&self) -> bool { - matches!(self, Collation::Utf8Mb4Bin | Collation::Latin1Bin) + matches!( + self, + Collation::Utf8Mb4Bin | Collation::Latin1Bin | Collation::Utf8Mb40900Bin + ) } } @@ -333,6 +336,10 @@ pub trait FieldTypeAccessor { .map(|col| col.is_bin_collation()) .unwrap_or(false) || self.is_varchar_like()) + && self + .collation() + .map(|col| col != Collation::Utf8Mb40900Bin) + .unwrap_or(false) } } @@ -455,6 +462,7 @@ mod tests { use std::i32; use super::*; + use crate::builder::FieldTypeBuilder; fn field_types() -> Vec { vec![ @@ -583,4 +591,31 @@ mod tests { } } } + + #[test] + fn test_need_restored_data() { + let cases = vec![ + (FieldTypeTp::String, Collation::Binary, false), + (FieldTypeTp::VarString, Collation::Binary, false), + (FieldTypeTp::String, Collation::Utf8Mb4Bin, false), + (FieldTypeTp::VarString, Collation::Utf8Mb4Bin, true), + (FieldTypeTp::String, Collation::Utf8Mb4GeneralCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb4GeneralCi, true), + (FieldTypeTp::String, Collation::Utf8Mb4UnicodeCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb4UnicodeCi, true), + (FieldTypeTp::String, Collation::Utf8Mb40900AiCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb40900AiCi, true), + (FieldTypeTp::String, Collation::Utf8Mb40900Bin, false), + (FieldTypeTp::VarString, 
Collation::Utf8Mb40900Bin, false), + (FieldTypeTp::String, Collation::GbkBin, true), + (FieldTypeTp::VarString, Collation::GbkBin, true), + (FieldTypeTp::String, Collation::GbkChineseCi, true), + (FieldTypeTp::VarString, Collation::GbkChineseCi, true), + ]; + + for (tp, collation, result) in cases { + let ft = FieldTypeBuilder::new().tp(tp).collation(collation).build(); + assert_eq!(ft.need_restored_data(), result) + } + } } From 063c9cd64c8bcf0c2373358354994499d9edeb0b Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 14 Sep 2023 00:52:38 +0800 Subject: [PATCH 043/220] raftstore-v2: persist applied index after ingset sst (#15538) close tikv/tikv#15461 Signed-off-by: glorv Co-authored-by: tonyxuqqi --- .../raftstore-v2/src/operation/command/mod.rs | 6 + .../src/operation/command/write/ingest.rs | 12 +- .../src/operation/ready/apply_trace.rs | 244 ++++++++++++++++-- .../raftstore-v2/src/operation/ready/mod.rs | 10 +- components/raftstore-v2/src/raft/apply.rs | 14 +- .../src/router/internal_message.rs | 7 + components/raftstore-v2/src/router/mod.rs | 2 +- tests/failpoints/cases/test_import_service.rs | 76 +++++- 8 files changed, 341 insertions(+), 30 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index c39f2412f32..e579d22c6da 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -455,6 +455,11 @@ impl Peer { if is_leader { self.retry_pending_prepare_merge(ctx, apply_res.applied_index); } + if !apply_res.sst_applied_index.is_empty() { + self.storage_mut() + .apply_trace_mut() + .on_sst_ingested(&apply_res.sst_applied_index); + } self.on_data_modified(apply_res.modifications); self.handle_read_on_apply( ctx, @@ -866,6 +871,7 @@ impl Apply { apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); apply_res.bucket_stat = self.buckets.clone(); + 
apply_res.sst_applied_index = self.take_sst_applied_index(); let written_bytes = apply_res.metrics.written_bytes; let skip_report = || -> bool { diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 7e8ed381ad0..92f5923d167 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -2,7 +2,7 @@ use collections::HashMap; use crossbeam::channel::TrySendError; -use engine_traits::{data_cf_offset, KvEngine, RaftEngine}; +use engine_traits::{data_cf_offset, KvEngine, RaftEngine, DATA_CFS_LEN}; use kvproto::import_sstpb::SstMeta; use raftstore::{ store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, @@ -16,7 +16,7 @@ use crate::{ batch::StoreContext, fsm::{ApplyResReporter, Store, StoreFsmDelegate}, raft::{Apply, Peer}, - router::{PeerMsg, StoreTick}, + router::{PeerMsg, SstApplyIndex, StoreTick}, worker::tablet, }; @@ -107,10 +107,12 @@ impl Peer { impl Apply { #[inline] pub fn apply_ingest(&mut self, index: u64, ssts: Vec) -> Result<()> { + fail::fail_point!("on_apply_ingest"); PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); let mut infos = Vec::with_capacity(ssts.len()); let mut size: i64 = 0; let mut keys: u64 = 0; + let mut cf_indexes = [u64::MAX; DATA_CFS_LEN]; for sst in &ssts { // This may not be enough as ingest sst may not trigger flush at all. let off = data_cf_offset(sst.get_cf_name()); @@ -138,6 +140,7 @@ impl Apply { slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); } } + cf_indexes[off] = index; } if !infos.is_empty() { // Unlike v1, we can't batch ssts accross regions. 
@@ -154,6 +157,11 @@ impl Apply { self.metrics.size_diff_hint += size; self.metrics.written_bytes += size as u64; self.metrics.written_keys += keys; + for (cf_index, index) in cf_indexes.into_iter().enumerate() { + if index != u64::MAX { + self.push_sst_applied_index(SstApplyIndex { cf_index, index }); + } + } Ok(()) } } diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 1601e1f01dd..af0257e763f 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -29,6 +29,7 @@ use std::{ cmp, + collections::VecDeque, path::Path, sync::{atomic::Ordering, mpsc::SyncSender, Mutex}, }; @@ -56,7 +57,7 @@ use crate::{ ready::snapshot::{install_tablet, recv_snap_path}, }, raft::{Peer, Storage}, - router::PeerMsg, + router::{PeerMsg, SstApplyIndex}, worker::tablet, Result, StoreRouter, }; @@ -138,7 +139,7 @@ impl engine_traits::StateStorage for StateStorage< /// Mapping from data cf to an u64 index. pub type DataTrace = [u64; DATA_CFS_LEN]; -#[derive(Clone, Copy, Default, Debug)] +#[derive(Clone, Default, Debug)] struct Progress { flushed: u64, /// The index of last entry that has modification to the CF. The value @@ -146,6 +147,20 @@ struct Progress { /// /// If `flushed` == `last_modified`, then all data in the CF is persisted. last_modified: u64, + // applied indexes ranges that represent sst is ingested but not flushed indexes. + pending_sst_ranges: VecDeque, +} + +// A range representing [start, end], upper bound inclusive for handling +// convenience. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct IndexRange(u64, u64); + +#[derive(Debug)] +// track the global flushed index related to the write task. +struct ReadyFlushedIndex { + ready_number: u64, + flushed_index: u64, } /// `ApplyTrace` is used to track the indexes of modifications and flushes. 
@@ -178,6 +193,9 @@ pub struct ApplyTrace { last_flush_trigger: u64, /// `true` means the raft cf record should be persisted in next ready. try_persist: bool, + // Because we persist the global flushed in the write task, so we should track + // the task and handle sst cleanup after the write task finished. + flushed_index_queue: VecDeque, } impl ApplyTrace { @@ -230,6 +248,25 @@ impl ApplyTrace { self.admin.last_modified = index; } + pub fn on_sst_ingested(&mut self, sst_applied_index: &[SstApplyIndex]) { + use std::cmp::Ordering; + for &SstApplyIndex { cf_index, index } in sst_applied_index { + let p = &mut self.data_cfs[cf_index]; + if p.flushed < index { + let max_idx = p.pending_sst_ranges.iter().last().map(|r| r.1).unwrap_or(0) + 1; + match max_idx.cmp(&index) { + Ordering::Less => { + p.pending_sst_ranges.push_back(IndexRange(index, index)); + } + Ordering::Equal => { + p.pending_sst_ranges.iter_mut().last().unwrap().1 = index; + } + _ => {} + } + } + } + } + pub fn persisted_apply_index(&self) -> u64 { self.persisted_applied } @@ -283,17 +320,45 @@ impl ApplyTrace { } }) .min(); + // At best effort, we can only advance the index to `mem_index`. let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX)); + // try advance the index if there are any sst ingestion next to the flushed + // index, and always trigger a flush if there is any sst ingestion. + let (candidate, has_ingested_sst) = self.advance_flushed_index_for_ingest(candidate); if candidate > self.admin.flushed { self.admin.flushed = candidate; - if self.admin.flushed > self.persisted_applied + 100 { + if has_ingested_sst || (self.admin.flushed > self.persisted_applied + 100) { self.try_persist = true; } } // TODO: persist admin.flushed every 10 minutes. 
} + fn advance_flushed_index_for_ingest(&mut self, mut max_index: u64) -> (u64, bool) { + let mut has_ingest = false; + loop { + let mut has_change = false; + for p in self.data_cfs.iter_mut() { + while let Some(r) = p.pending_sst_ranges.front_mut() { + if r.0 > max_index + 1 { + break; + } else if r.1 > max_index { + max_index = r.1; + has_change = true; + } + p.pending_sst_ranges.pop_front(); + has_ingest = true; + } + } + if !has_change { + break; + } + } + + (max_index, has_ingest) + } + /// Get the flushed indexes of all data CF that is needed when recoverying /// logs. /// @@ -348,6 +413,38 @@ impl ApplyTrace { fail_point!("should_persist_apply_trace", |_| true); self.try_persist } + + #[inline] + pub fn register_flush_task(&mut self, ready_number: u64, flushed_index: u64) { + assert!( + self.flushed_index_queue + .iter() + .last() + .map(|f| f.ready_number) + .unwrap_or(0) + < ready_number + ); + self.flushed_index_queue.push_back(ReadyFlushedIndex { + ready_number, + flushed_index, + }); + } + + #[inline] + pub fn take_flush_index(&mut self, ready_number: u64) -> Option { + use std::cmp::Ordering; + while let Some(r) = self.flushed_index_queue.pop_front() { + match r.ready_number.cmp(&ready_number) { + Ordering::Equal => return Some(r.flushed_index), + Ordering::Greater => { + self.flushed_index_queue.push_front(r); + break; + } + _ => {} + } + } + None + } } impl Storage { @@ -546,6 +643,7 @@ impl Storage { .unwrap(); trace.try_persist = false; trace.persisted_applied = trace.admin.flushed; + trace.register_flush_task(write_task.ready_number(), trace.admin.flushed); } } @@ -566,24 +664,7 @@ impl Peer { let apply_trace = self.storage_mut().apply_trace_mut(); apply_trace.on_flush(cf, index); apply_trace.maybe_advance_admin_flushed(apply_index); - let stale_ssts = self.sst_apply_state().stale_ssts(cf, index); - if stale_ssts.is_empty() { - return; - } - info!( - self.logger, - "schedule delete stale ssts after flush"; - "stale_ssts" => ?stale_ssts, - 
"apply_index" => apply_index, - "cf" => cf, - "flushed_index" => index, - ); - let _ = ctx - .schedulers - .tablet - .schedule(tablet::Task::CleanupImportSst( - stale_ssts.into_boxed_slice(), - )); + self.cleanup_stale_ssts(ctx, &[cf], index, apply_index); } pub fn on_data_modified(&mut self, modification: DataTrace) { @@ -598,6 +679,38 @@ impl Peer { apply_trace.maybe_advance_admin_flushed(apply_index); } + pub fn cleanup_stale_ssts( + &mut self, + ctx: &mut StoreContext, + cfs: &[&str], + index: u64, + apply_index: u64, + ) { + let mut stale_ssts = vec![]; + for cf in cfs { + let ssts = self.sst_apply_state().stale_ssts(cf, index); + if !ssts.is_empty() { + info!( + self.logger, + "schedule delete stale ssts after flush"; + "stale_ssts" => ?stale_ssts, + "apply_index" => apply_index, + "cf" => cf, + "flushed_index" => index, + ); + stale_ssts.extend(ssts); + } + } + if !stale_ssts.is_empty() { + _ = ctx + .schedulers + .tablet + .schedule(tablet::Task::CleanupImportSst( + stale_ssts.into_boxed_slice(), + )); + } + } + pub fn flush_before_close(&mut self, ctx: &StoreContext, tx: SyncSender<()>) { info!( self.logger, @@ -689,7 +802,7 @@ impl Peer { #[cfg(test)] mod tests { - use engine_traits::RaftEngineReadOnly; + use engine_traits::{CfName, RaftEngineReadOnly}; use kvproto::metapb::Peer; use tempfile::TempDir; @@ -809,6 +922,93 @@ mod tests { // Because modify is recorded, so we know there should be no admin // modification and index can be advanced. 
assert_eq!(5, trace.admin.flushed); + + fn range_equals(trace: &ApplyTrace, cf: &str, expected: Vec) { + let pending_ranges = &trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges; + assert_eq!( + pending_ranges.len(), + expected.len(), + "actual: {:?}, expected: {:?}", + pending_ranges, + &expected + ); + pending_ranges + .iter() + .zip(expected.iter()) + .for_each(|(r, e)| { + assert_eq!(r, e); + }); + } + + trace.on_modify(CF_DEFAULT, 8); + let ingested_ssts_idx = + make_sst_apply_index(vec![(CF_DEFAULT, 6), (CF_WRITE, 6), (CF_WRITE, 7)]); + trace.on_sst_ingested(&ingested_ssts_idx); + range_equals(&trace, CF_DEFAULT, vec![IndexRange(6, 6)]); + range_equals(&trace, CF_WRITE, vec![IndexRange(6, 7)]); + trace.maybe_advance_admin_flushed(8); + assert_eq!(7, trace.admin.flushed); + for cf in [CF_DEFAULT, CF_WRITE] { + assert_eq!( + trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges.len(), + 0 + ); + } + trace.on_modify(CF_DEFAULT, 10); + let ingested_ssts_idx = make_sst_apply_index(vec![(CF_DEFAULT, 10)]); + trace.on_sst_ingested(&ingested_ssts_idx); + trace.on_flush(CF_DEFAULT, 8); + trace.maybe_advance_admin_flushed(10); + assert_eq!(8, trace.admin.flushed); + range_equals(&trace, CF_DEFAULT, vec![IndexRange(10, 10)]); + + trace.on_modify(CF_DEFAULT, 16); + let ingested_ssts_idx = make_sst_apply_index(vec![ + (CF_DEFAULT, 11), + (CF_WRITE, 12), + (CF_LOCK, 13), + (CF_DEFAULT, 14), + (CF_WRITE, 14), + (CF_WRITE, 15), + (CF_LOCK, 16), + ]); + trace.on_sst_ingested(&ingested_ssts_idx); + range_equals( + &trace, + CF_DEFAULT, + vec![IndexRange(10, 11), IndexRange(14, 14)], + ); + range_equals( + &trace, + CF_WRITE, + vec![IndexRange(12, 12), IndexRange(14, 15)], + ); + range_equals( + &trace, + CF_LOCK, + vec![IndexRange(13, 13), IndexRange(16, 16)], + ); + trace.maybe_advance_admin_flushed(16); + assert_eq!(8, trace.admin.flushed); + + trace.on_flush(CF_DEFAULT, 9); + trace.maybe_advance_admin_flushed(16); + assert_eq!(16, trace.admin.flushed); + for cf in 
DATA_CFS { + assert_eq!( + trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges.len(), + 0 + ); + } + } + + fn make_sst_apply_index(data: Vec<(CfName, u64)>) -> Vec { + data.into_iter() + .map(|d| SstApplyIndex { + cf_index: data_cf_offset(d.0), + index: d.1, + }) + .collect() } #[test] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index b985fd69c27..ba7170ac8c8 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,7 +31,7 @@ use std::{ time::Instant, }; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, DATA_CFS}; use error_code::ErrorCodeExt; use kvproto::{ raft_cmdpb::AdminCmdType, @@ -896,6 +896,14 @@ impl Peer { self.storage_mut() .entry_storage_mut() .update_cache_persisted(persisted_index); + if let Some(idx) = self + .storage_mut() + .apply_trace_mut() + .take_flush_index(ready_number) + { + let apply_index = self.flush_state().applied_index(); + self.cleanup_stale_ssts(ctx, DATA_CFS, idx, apply_index); + } if self.is_in_force_leader() { // forward commit index, the committed entries will be applied in diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 3e660c4549c..f3aa5a541c1 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -20,7 +20,7 @@ use tikv_util::{log::SlogFormat, worker::Scheduler, yatp_pool::FuturePool}; use crate::{ operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, - router::CmdResChannel, + router::{CmdResChannel, SstApplyIndex}, TabletTask, }; @@ -64,6 +64,7 @@ pub struct Apply { admin_cmd_result: Vec, flush_state: Arc, sst_apply_state: SstApplyState, + sst_applied_index: Vec, /// The flushed indexes of each column family before being restarted. 
/// /// If an apply index is less than the flushed index, the log can be @@ -138,6 +139,7 @@ impl Apply { res_reporter, flush_state, sst_apply_state, + sst_applied_index: vec![], log_recovery, metrics: ApplyMetrics::default(), buckets, @@ -308,6 +310,16 @@ impl Apply { &self.sst_apply_state } + #[inline] + pub fn push_sst_applied_index(&mut self, sst_index: SstApplyIndex) { + self.sst_applied_index.push(sst_index); + } + + #[inline] + pub fn take_sst_applied_index(&mut self) -> Vec { + mem::take(&mut self.sst_applied_index) + } + #[inline] pub fn log_recovery(&self) -> &Option> { &self.log_recovery diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 6c8d1136b3a..7ac86c3f8c7 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -25,4 +25,11 @@ pub struct ApplyRes { pub modifications: DataTrace, pub metrics: ApplyMetrics, pub bucket_stat: Option, + pub sst_applied_index: Vec, +} + +#[derive(Copy, Clone, Debug)] +pub struct SstApplyIndex { + pub cf_index: usize, + pub index: u64, } diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 7630e35c2a5..83a2497b331 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -12,7 +12,7 @@ pub use self::response_channel::FlushChannel; pub use self::response_channel::FlushSubscriber; pub use self::{ imp::{RaftRouter, UnsafeRecoveryRouter}, - internal_message::ApplyRes, + internal_message::{ApplyRes, SstApplyIndex}, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ build_any_channel, AnyResChannel, AnyResSubscriber, BaseSubscriber, CmdResChannel, diff --git a/tests/failpoints/cases/test_import_service.rs b/tests/failpoints/cases/test_import_service.rs index a2487456108..e51c9862e47 100644 --- a/tests/failpoints/cases/test_import_service.rs +++ 
b/tests/failpoints/cases/test_import_service.rs @@ -7,10 +7,10 @@ use std::{ use file_system::calc_crc32; use futures::{executor::block_on, stream, SinkExt}; -use grpcio::{Result, WriteFlags}; -use kvproto::import_sstpb::*; +use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; +use kvproto::{import_sstpb::*, tikvpb_grpc::TikvClient}; use tempfile::{Builder, TempDir}; -use test_raftstore::Simulator; +use test_raftstore::{must_raw_put, Simulator}; use test_sst_importer::*; use tikv::config::TikvConfig; use tikv_util::{config::ReadableSize, HandyRwLock}; @@ -455,3 +455,73 @@ fn sst_file_count(paths: &Vec) -> u64 { } count } + +#[test] +fn test_flushed_applied_index_after_ingset() { + // disable data flushed + fail::cfg("on_flush_completed", "return()").unwrap(); + // disable data flushed + let (mut cluster, ctx, _tikv, import) = open_cluster_and_tikv_import_client_v2(None); + let temp_dir = Builder::new().prefix("test_ingest_sst").tempdir().unwrap(); + let sst_path = temp_dir.path().join("test.sst"); + + // Create clients. + let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(1)); + let client = TikvClient::new(channel); + + for i in 0..5 { + let sst_range = (i * 20, (i + 1) * 20); + let (mut meta, data) = gen_sst_file(sst_path.clone(), sst_range); + // No region id and epoch. + send_upload_sst(&import, &meta, &data).unwrap(); + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(meta.clone()); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + send_upload_sst(&import, &meta, &data).unwrap(); + ingest.set_sst(meta.clone()); + let resp = import.ingest(&ingest).unwrap(); + assert!(!resp.has_error(), "{:?}", resp.get_error()); + } + + // only 1 sst left because there is no more event to trigger a raft ready flush. 
+ let count = sst_file_count(&cluster.paths); + assert_eq!(1, count); + + for i in 5..8 { + let sst_range = (i * 20, (i + 1) * 20); + let (mut meta, data) = gen_sst_file(sst_path.clone(), sst_range); + // No region id and epoch. + send_upload_sst(&import, &meta, &data).unwrap(); + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(meta.clone()); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + send_upload_sst(&import, &meta, &data).unwrap(); + ingest.set_sst(meta.clone()); + let resp = import.ingest(&ingest).unwrap(); + assert!(!resp.has_error(), "{:?}", resp.get_error()); + } + + // ingest more sst files, unflushed index still be 1. + let count = sst_file_count(&cluster.paths); + assert_eq!(1, count); + + // file a write to trigger ready flush, even if the write is not flushed. + must_raw_put(&client, ctx, b"key1".to_vec(), b"value1".to_vec()); + let count = sst_file_count(&cluster.paths); + assert_eq!(0, count); + + // restart node, should not tirgger any ingest + fail::cfg("on_apply_ingest", "panic").unwrap(); + cluster.stop_node(1); + cluster.start().unwrap(); + let count = sst_file_count(&cluster.paths); + assert_eq!(0, count); + + fail::remove("on_apply_ingest"); + fail::remove("on_flush_completed"); +} From b172835345cb015572faabb2bc164d532ba8d62f Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Wed, 13 Sep 2023 19:57:08 -0700 Subject: [PATCH 044/220] add option to update config without persist (#15587) close tikv/tikv#15588 add option to update TiKV config without persist in status API "POST /config?persist=false|true" Signed-off-by: tonyxuqqi --- src/server/status_server/mod.rs | 88 ++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index b49fdce12af..98077d9e93f 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -313,6 
+313,18 @@ where req: Request, ) -> hyper::Result> { let mut body = Vec::new(); + let mut persist = true; + if let Some(query) = req.uri().query() { + let query_pairs: HashMap<_, _> = + url::form_urlencoded::parse(query.as_bytes()).collect(); + persist = match query_pairs.get("persist") { + Some(val) => match val.parse() { + Ok(val) => val, + Err(err) => return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())), + }, + None => true, + }; + } req.into_body() .try_for_each(|bytes| { body.extend(bytes); @@ -320,7 +332,11 @@ where }) .await?; Ok(match decode_json(&body) { - Ok(change) => match cfg_controller.update(change) { + Ok(change) => match if persist { + cfg_controller.update(change) + } else { + cfg_controller.update_without_persist(change) + } { Err(e) => { if let Some(e) = e.downcast_ref::() { make_response( @@ -1227,6 +1243,76 @@ mod tests { status_server.stop(); } + #[test] + fn test_update_config_endpoint() { + let test_config = |persist: bool| { + let temp_dir = tempfile::TempDir::new().unwrap(); + let mut config = TikvConfig::default(); + config.cfg_path = temp_dir + .path() + .join("tikv.toml") + .to_str() + .unwrap() + .to_string(); + let mut status_server = StatusServer::new( + 1, + ConfigController::new(config), + Arc::new(SecurityConfig::default()), + MockRouter, + temp_dir.path().to_path_buf(), + None, + GrpcServiceManager::dummy(), + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let client = Client::new(); + let uri = if persist { + Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/config") + .build() + .unwrap() + } else { + Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/config?persist=false") + .build() + .unwrap() + }; + let mut req = Request::new(Body::from("{\"coprocessor.region-split-size\": \"1GB\"}")); + *req.method_mut() = Method::POST; + 
*req.uri_mut() = uri.clone(); + let handle = status_server.thread_pool.spawn(async move { + let resp = client.request(req).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + }); + block_on(handle).unwrap(); + + let client = Client::new(); + let handle2 = status_server.thread_pool.spawn(async move { + let resp = client.get(uri).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let mut v = Vec::new(); + resp.into_body() + .try_for_each(|bytes| { + v.extend(bytes); + ok(()) + }) + .await + .unwrap(); + let resp_json = String::from_utf8_lossy(&v).to_string(); + assert!(resp_json.contains("\"region-split-size\":\"1GiB\"")); + }); + block_on(handle2).unwrap(); + status_server.stop(); + }; + test_config(true); + test_config(false); + } + #[cfg(feature = "failpoints")] #[test] fn test_status_service_fail_endpoints() { From 905ecd79ee9a30bcd8b9b1949c430062c4c3fd07 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 14 Sep 2023 12:07:39 +0800 Subject: [PATCH 045/220] tracker: add a warn log for deadline exceeded query (#15577) ref tikv/tikv#15566 Signed-off-by: glorv Co-authored-by: tonyxuqqi --- src/coprocessor/tracker.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index bb32a3a0e03..f6502c2459e 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -434,6 +434,36 @@ impl Drop for Tracker { if let TrackerState::ItemFinished(_) = self.current_stage { self.on_finish_all_items(); } + + if self.current_stage != TrackerState::AllItemFinished + && self.req_ctx.deadline.check().is_err() + { + // record deadline exceeded error log. 
+ let total_lifetime = self.request_begin_at.saturating_elapsed(); + let source_stmt = self.req_ctx.context.get_source_stmt(); + let first_range = self.req_ctx.ranges.first(); + let some_table_id = first_range.as_ref().map(|range| { + tidb_query_datatype::codec::table::decode_table_id(range.get_start()) + .unwrap_or_default() + }); + warn!("query deadline exceeded"; + "current_stage" => ?self.current_stage, + "connection_id" => source_stmt.get_connection_id(), + "session_alias" => source_stmt.get_session_alias(), + "region_id" => &self.req_ctx.context.get_region_id(), + "remote_host" => &self.req_ctx.peer, + "total_lifetime" => ?total_lifetime, + "wait_time" => ?self.wait_time, + "wait_time.schedule" => ?self.schedule_wait_time, + "wait_time.snapshot" => ?self.snapshot_wait_time, + "handler_build_time" => ?self.handler_build_time, + "total_process_time" => ?self.total_process_time, + "total_suspend_time" => ?self.total_suspend_time, + "txn_start_ts" => self.req_ctx.txn_start_ts, + "table_id" => some_table_id, + "tag" => self.req_ctx.tag.get_str(), + ); + } } } From 62c17991fd73269929bdfbd8e408710078e53351 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Wed, 13 Sep 2023 21:42:09 -0700 Subject: [PATCH 046/220] unsafe recovery: Enable force leader to rollback merge (#15578) close tikv/tikv#15580 Enable force leader to rollback merges when they are not able to proceed, previously, only regions with quorum can do this. 
Signed-off-by: Yang Zhang Co-authored-by: tonyxuqqi --- components/raftstore/src/store/fsm/peer.rs | 17 ++- components/raftstore/src/store/peer.rs | 4 +- .../failpoints/cases/test_unsafe_recovery.rs | 110 ++++++++++++++++++ 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 9f7934e806e..d61e6784295 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -4434,6 +4434,9 @@ where fn schedule_merge(&mut self) -> Result<()> { fail_point!("on_schedule_merge", |_| Ok(())); + fail_point!("on_schedule_merge_ret_err", |_| Err(Error::RegionNotFound( + 1 + ))); let (request, target_id) = { let state = self.fsm.peer.pending_merge_state.as_ref().unwrap(); let expect_region = state.get_target(); @@ -4557,6 +4560,17 @@ where "error_code" => %e.error_code(), ); self.rollback_merge(); + } else if let Some(ForceLeaderState::ForceLeader { .. }) = + &self.fsm.peer.force_leader + { + info!( + "failed to schedule merge, rollback in force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + "error_code" => %e.error_code(), + ); + self.rollback_merge(); } } else if !is_learner(&self.fsm.peer.peer) { info!( @@ -5228,7 +5242,8 @@ where // error-prone if !(msg.has_admin_request() && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer - || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) + || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2 + || msg.get_admin_request().get_cmd_type() == AdminCmdType::RollbackMerge)) { return Err(Error::RecoveryInProgress(self.region_id())); } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8c1a7ef61e9..8ef857bfa12 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4237,7 +4237,9 @@ where // 
Should not propose normal in force leader state. // In `pre_propose_raft_command`, it rejects all the requests expect conf-change // if in force leader state. - if self.force_leader.is_some() { + if self.force_leader.is_some() + && req.get_admin_request().get_cmd_type() != AdminCmdType::RollbackMerge + { poll_ctx.raft_metrics.invalid_proposal.force_leader.inc(); panic!( "{} propose normal in force leader state {:?}", diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index cc33a01ff03..978489b5cd6 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -440,3 +440,113 @@ fn test_unsafe_recovery_demotion_reentrancy() { assert_eq!(demoted, true); fail::remove("on_handle_apply_store_1"); } + +#[test_case(test_raftstore::new_node_cluster)] +fn test_unsafe_recovery_rollback_merge() { + let mut cluster = new_cluster(0, 3); + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + for i in 0..10 { + cluster.must_put(format!("k{}", i).as_bytes(), b"v"); + } + + // Block merge commit, let go of the merge prepare. + fail::cfg("on_schedule_merge_ret_err", "return()").unwrap(); + + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Makes the leadership definite. 
+ let left_peer_2 = find_peer(&left, nodes[2]).unwrap().to_owned(); + let right_peer_2 = find_peer(&right, nodes[2]).unwrap().to_owned(); + cluster.must_transfer_leader(left.get_id(), left_peer_2); + cluster.must_transfer_leader(right.get_id(), right_peer_2); + cluster.must_try_merge(left.get_id(), right.get_id()); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); + } + + cluster.must_enter_force_leader(left.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + cluster.must_enter_force_leader(right.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let left_demote_peers: Vec = left + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut left_demote = pdpb::DemoteFailedVoters::default(); + left_demote.set_region_id(left.get_id()); + left_demote.set_failed_voters(left_demote_peers.into()); + let right_demote_peers: Vec = right + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut right_demote = pdpb::DemoteFailedVoters::default(); + right_demote.set_region_id(right.get_id()); + right_demote.set_failed_voters(right_demote_peers.into()); + plan.mut_demotes().push(left_demote); + plan.mut_demotes().push(right_demote); + + // Triggers the unsafe recovery plan execution. 
+ pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = false; + for _ in 0..10 { + let new_left = block_on(pd_client.get_region_by_id(left.get_id())) + .unwrap() + .unwrap(); + let new_right = block_on(pd_client.get_region_by_id(right.get_id())) + .unwrap() + .unwrap(); + assert_eq!(new_left.get_peers().len(), 3); + assert_eq!(new_right.get_peers().len(), 3); + demoted = new_left + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner) + && new_right + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + if demoted { + break; + } + sleep_ms(100); + } + assert_eq!(demoted, true); + + fail::remove("on_schedule_merge_ret_err"); +} From e43a157c4a35034dfd705bdd94fac6d958e8a1ff Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 14 Sep 2023 16:10:39 +0800 Subject: [PATCH 047/220] resolved_ts: limit scanner memory usage (#15523) ref tikv/tikv#14864 * Break resolved ts scan entry into multiple tasks. * Limit concurrent resolved ts scan tasks. * Remove resolved ts dead code. 
Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/resolved_ts/src/endpoint.rs | 190 ++++++++---------- components/resolved_ts/src/metrics.rs | 5 + components/resolved_ts/src/resolver.rs | 27 ++- components/resolved_ts/src/scanner.rs | 222 +++++++++------------ src/config/mod.rs | 2 + tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 7 files changed, 198 insertions(+), 250 deletions(-) diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 2a2f56eaadd..34f00672fa7 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -5,15 +5,13 @@ use std::{ collections::HashMap, fmt, marker::PhantomData, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, Mutex, MutexGuard, - }, + sync::{Arc, Mutex, MutexGuard}, time::Duration, }; use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; +use futures::channel::oneshot::{channel, Receiver, Sender}; use grpcio::Environment; use kvproto::{kvrpcpb::LeaderInfo, metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; @@ -35,7 +33,7 @@ use tikv_util::{ warn, worker::{Runnable, RunnableWithTimer, Scheduler}, }; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore}; use txn_types::{Key, TimeStamp}; use crate::{ @@ -43,7 +41,7 @@ use crate::{ cmd::{ChangeLog, ChangeRow}, metrics::*, resolver::{LastAttempt, Resolver}, - scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, + scanner::{ScanEntries, ScanTask, ScannerPool}, Error, Result, TsSource, ON_DROP_WARN_HEAP_SIZE, }; @@ -55,7 +53,7 @@ enum ResolverStatus { Pending { tracked_index: u64, locks: Vec, - cancelled: Arc, + cancelled: Option>, memory_quota: Arc, }, Ready, @@ -188,7 +186,12 @@ struct ObserveRegion { } impl ObserveRegion { - fn new(meta: Region, rrp: Arc, memory_quota: Arc) -> Self { + fn new( + 
meta: Region, + rrp: Arc, + memory_quota: Arc, + cancelled: Sender<()>, + ) -> Self { ObserveRegion { resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota.clone()), meta, @@ -196,7 +199,7 @@ impl ObserveRegion { resolver_status: ResolverStatus::Pending { tracked_index: 0, locks: vec![], - cancelled: Arc::new(AtomicBool::new(false)), + cancelled: Some(cancelled), memory_quota, }, } @@ -318,51 +321,45 @@ impl ObserveRegion { } /// Track locks in incoming scan entries. - fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) -> Result<()> { - for es in entries { - match es { - ScanEntry::Lock(locks) => { - if let ResolverStatus::Ready = self.resolver_status { - panic!("region {:?} resolver has ready", self.meta.id) - } - for (key, lock) in locks { - self.resolver.track_lock( - lock.ts, - key.to_raw().unwrap(), - Some(apply_index), - )?; - } + fn track_scan_locks(&mut self, entries: ScanEntries, apply_index: u64) -> Result<()> { + match entries { + ScanEntries::Lock(locks) => { + if let ResolverStatus::Ready = self.resolver_status { + panic!("region {:?} resolver has ready", self.meta.id) } - ScanEntry::None => { - // Update the `tracked_index` to the snapshot's `apply_index` - self.resolver.update_tracked_index(apply_index); - let mut resolver_status = - std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready); - let (pending_tracked_index, pending_locks) = - resolver_status.drain_pending_locks(self.meta.id); - for lock in pending_locks { - match lock { - PendingLock::Track { key, start_ts } => { - self.resolver.track_lock( - start_ts, - key.to_raw().unwrap(), - Some(pending_tracked_index), - )?; - } - PendingLock::Untrack { key, .. 
} => self - .resolver - .untrack_lock(&key.to_raw().unwrap(), Some(pending_tracked_index)), + for (key, lock) in locks { + self.resolver + .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index))?; + } + } + ScanEntries::None => { + // Update the `tracked_index` to the snapshot's `apply_index` + self.resolver.update_tracked_index(apply_index); + let mut resolver_status = + std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready); + let (pending_tracked_index, pending_locks) = + resolver_status.drain_pending_locks(self.meta.id); + for lock in pending_locks { + match lock { + PendingLock::Track { key, start_ts } => { + self.resolver.track_lock( + start_ts, + key.to_raw().unwrap(), + Some(pending_tracked_index), + )?; } + PendingLock::Untrack { key, .. } => self + .resolver + .untrack_lock(&key.to_raw().unwrap(), Some(pending_tracked_index)), } - info!( - "Resolver initialized"; - "region" => self.meta.id, - "observe_id" => ?self.handle.id, - "snapshot_index" => apply_index, - "pending_data_index" => pending_tracked_index, - ); } - ScanEntry::TxnEntry(_) => panic!("unexpected entry type"), + info!( + "Resolver initialized"; + "region" => self.meta.id, + "observe_id" => ?self.handle.id, + "snapshot_index" => apply_index, + "pending_data_index" => pending_tracked_index, + ); } } Ok(()) @@ -378,6 +375,7 @@ pub struct Endpoint { region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, + scan_concurrency_semaphore: Arc, scheduler: Scheduler, advance_worker: AdvanceTsWorker, _phantom: PhantomData<(T, E)>, @@ -442,10 +440,7 @@ where match &observed_region.resolver_status { ResolverStatus::Pending { locks, .. } => { for l in locks { - match l { - PendingLock::Track { key, .. } => stats.heap_size += key.len() as i64, - PendingLock::Untrack { key, .. 
} => stats.heap_size += key.len() as i64, - } + stats.heap_size += l.heap_size() as i64; } stats.unresolved_count += 1; } @@ -477,6 +472,7 @@ where RTS_ZERO_RESOLVED_TS.set(stats.zero_ts_count); RTS_LOCK_HEAP_BYTES_GAUGE.set(stats.resolver.heap_size); + RTS_LOCK_QUOTA_IN_USE_BYTES_GAUGE.set(self.memory_quota.in_use() as i64); RTS_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["resolved"]) .set(stats.resolver.resolved_count); @@ -678,6 +674,7 @@ where region_read_progress.clone(), store_resolver_gc_interval, ); + let scan_concurrency_semaphore = Arc::new(Semaphore::new(cfg.incremental_scan_concurrency)); let ep = Self { store_id: Some(store_id), cfg: cfg.clone(), @@ -688,6 +685,7 @@ where region_read_progress, advance_worker, scanner_pool, + scan_concurrency_semaphore, regions: HashMap::default(), _phantom: PhantomData::default(), }; @@ -698,33 +696,28 @@ where fn register_region(&mut self, region: Region, backoff: Option) { let region_id = region.get_id(); assert!(self.regions.get(®ion_id).is_none()); - let observe_region = { - if let Some(read_progress) = self.region_read_progress.get(®ion_id) { - info!( - "register observe region"; - "region" => ?region - ); - ObserveRegion::new(region.clone(), read_progress, self.memory_quota.clone()) - } else { - warn!( - "try register unexit region"; - "region" => ?region, - ); - return; - } + let Some(read_progress) = self.region_read_progress.get(®ion_id) else { + warn!("try register nonexistent region"; "region" => ?region); + return; }; + info!("register observe region"; "region" => ?region); + let (cancelled_tx, cancelled_rx) = channel(); + let observe_region = ObserveRegion::new( + region.clone(), + read_progress, + self.memory_quota.clone(), + cancelled_tx, + ); let observe_handle = observe_region.handle.clone(); - let cancelled = match observe_region.resolver_status { - ResolverStatus::Pending { ref cancelled, .. 
} => cancelled.clone(), - ResolverStatus::Ready => panic!("resolved ts illeagal created observe region"), - }; observe_region .read_progress() .update_advance_resolved_ts_notify(self.advance_notify.clone()); self.regions.insert(region_id, observe_region); - let scan_task = self.build_scan_task(region, observe_handle, cancelled, backoff); - self.scanner_pool.spawn_task(scan_task); + let scan_task = self.build_scan_task(region, observe_handle, cancelled_rx, backoff); + let concurrency_semaphore = self.scan_concurrency_semaphore.clone(); + self.scanner_pool + .spawn_task(scan_task, concurrency_semaphore); RTS_SCAN_TASKS.with_label_values(&["total"]).inc(); } @@ -732,45 +725,17 @@ where &self, region: Region, observe_handle: ObserveHandle, - cancelled: Arc, + cancelled: Receiver<()>, backoff: Option, ) -> ScanTask { let scheduler = self.scheduler.clone(); - let scheduler_error = self.scheduler.clone(); - let region_id = region.id; - let observe_id = observe_handle.id; ScanTask { handle: observe_handle, - tag: String::new(), - mode: ScanMode::LockOnly, region, checkpoint_ts: TimeStamp::zero(), backoff, - is_cancelled: Box::new(move || cancelled.load(Ordering::Acquire)), - send_entries: Box::new(move |entries, apply_index| { - scheduler - .schedule(Task::ScanLocks { - region_id, - observe_id, - entries, - apply_index, - }) - .unwrap_or_else(|e| warn!("schedule resolved ts task failed"; "err" => ?e)); - RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); - }), - on_error: Some(Box::new(move |observe_id, _region, e| { - if let Err(e) = scheduler_error.schedule(Task::ReRegisterRegion { - region_id, - observe_id, - cause: e, - }) { - warn!("schedule re-register task failed"; - "region_id" => region_id, - "observe_id" => ?observe_id, - "error" => ?e); - } - RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); - })), + cancelled, + scheduler, } } @@ -778,7 +743,7 @@ where if let Some(observe_region) = self.regions.remove(®ion_id) { let ObserveRegion { handle, - 
resolver_status, + mut resolver_status, .. } = observe_region; @@ -791,8 +756,11 @@ where // Stop observing data handle.stop_observing(); // Stop scanning data - if let ResolverStatus::Pending { ref cancelled, .. } = resolver_status { - cancelled.store(true, Ordering::Release); + if let ResolverStatus::Pending { + ref mut cancelled, .. + } = resolver_status + { + let _ = cancelled.take(); } } else { debug!("deregister unregister region"; "region_id" => region_id); @@ -938,7 +906,7 @@ where &mut self, region_id: u64, observe_id: ObserveId, - entries: Vec, + entries: ScanEntries, apply_index: u64, ) { let mut memory_quota_exceeded = None; @@ -979,6 +947,8 @@ where self.advance_notify.notify_waiters(); self.memory_quota .set_capacity(self.cfg.memory_quota.0 as usize); + self.scan_concurrency_semaphore = + Arc::new(Semaphore::new(self.cfg.incremental_scan_concurrency)); info!( "resolved-ts config changed"; "prev" => prev, @@ -1047,7 +1017,7 @@ pub enum Task { ScanLocks { region_id: u64, observe_id: ObserveId, - entries: Vec, + entries: ScanEntries, apply_index: u64, }, ChangeConfig { diff --git a/components/resolved_ts/src/metrics.rs b/components/resolved_ts/src/metrics.rs index 02bb92f7887..fb751491d10 100644 --- a/components/resolved_ts/src/metrics.rs +++ b/components/resolved_ts/src/metrics.rs @@ -104,6 +104,11 @@ lazy_static! 
{ "Total bytes in memory of resolved-ts observed regions's lock heap" ) .unwrap(); + pub static ref RTS_LOCK_QUOTA_IN_USE_BYTES_GAUGE: IntGauge = register_int_gauge!( + "tikv_resolved_ts_memory_quota_in_use_bytes", + "Total bytes in memory of resolved-ts observed regions's lock heap" + ) + .unwrap(); pub static ref RTS_REGION_RESOLVE_STATUS_GAUGE_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_resolved_ts_region_resolve_status", "The status of resolved-ts observed regions", diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 9a62a0eea98..85e7acff4a4 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -10,7 +10,7 @@ use tikv_util::{ }; use txn_types::{Key, TimeStamp}; -use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; +use crate::metrics::*; const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; pub const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB @@ -203,16 +203,23 @@ impl Resolver { // Return an approximate heap memory usage in bytes. pub fn approximate_heap_bytes(&self) -> usize { - // memory used by locks_by_key. - let memory_quota_in_use = self.memory_quota.in_use(); - - // memory used by lock_ts_heap. 
- let memory_lock_ts_heap = self.lock_ts_heap.len() - * (std::mem::size_of::() + std::mem::size_of::>>()) - // memory used by HashSet> - + self.locks_by_key.len() * std::mem::size_of::>(); + if self.locks_by_key.is_empty() { + return 0; + } - memory_quota_in_use + memory_lock_ts_heap + const SAMPLE_COUNT: usize = 8; + let mut key_count = 0; + let mut key_bytes = 0; + for key in self.locks_by_key.keys() { + key_count += 1; + key_bytes += key.len(); + if key_count >= SAMPLE_COUNT { + break; + } + } + self.locks_by_key.len() * (key_bytes / key_count + std::mem::size_of::()) + + self.lock_ts_heap.len() + * (std::mem::size_of::() + std::mem::size_of::>>()) } fn lock_heap_size(&self, key: &[u8]) -> usize { diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 615819db799..6c8c90dc38f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -3,57 +3,79 @@ use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::KvEngine; -use futures::compat::Future01CompatExt; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb::Region}; +use futures::{channel::oneshot::Receiver, compat::Future01CompatExt, FutureExt}; +use kvproto::metapb::Region; use raftstore::{ - coprocessor::{ObserveHandle, ObserveId}, + coprocessor::ObserveHandle, router::CdcHandle, store::{fsm::ChangeObserver, msg::Callback, RegionSnapshot}, }; use tikv::storage::{ kv::{ScanMode as MvccScanMode, Snapshot}, - mvcc::{DeltaScanner, MvccReader, ScannerBuilder}, - txn::{TxnEntry, TxnEntryScanner}, + mvcc::MvccReader, +}; +use tikv_util::{ + sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE, worker::Scheduler, +}; +use tokio::{ + runtime::{Builder, Runtime}, + sync::Semaphore, }; -use tikv_util::{sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE}; -use tokio::runtime::{Builder, Runtime}; use txn_types::{Key, Lock, LockType, TimeStamp}; use crate::{ errors::{Error, 
Result}, metrics::*, + Task, }; -const DEFAULT_SCAN_BATCH_SIZE: usize = 1024; +const DEFAULT_SCAN_BATCH_SIZE: usize = 128; const GET_SNAPSHOT_RETRY_TIME: u32 = 3; const GET_SNAPSHOT_RETRY_BACKOFF_STEP: Duration = Duration::from_millis(100); -pub type BeforeStartCallback = Box; -pub type OnErrorCallback = Box; -pub type OnEntriesCallback = Box, u64) + Send>; -pub type IsCancelledCallback = Box bool + Send>; - -pub enum ScanMode { - LockOnly, - All, - AllWithOldValue, -} - pub struct ScanTask { pub handle: ObserveHandle, - pub tag: String, - pub mode: ScanMode, pub region: Region, pub checkpoint_ts: TimeStamp, pub backoff: Option, - pub is_cancelled: IsCancelledCallback, - pub send_entries: OnEntriesCallback, - pub on_error: Option, + pub cancelled: Receiver<()>, + pub scheduler: Scheduler, +} + +impl ScanTask { + async fn send_entries(&self, entries: ScanEntries, apply_index: u64) { + let task = Task::ScanLocks { + region_id: self.region.get_id(), + observe_id: self.handle.id, + entries, + apply_index, + }; + if let Err(e) = self.scheduler.schedule(task) { + warn!("resolved_ts scheduler send entries failed"; "err" => ?e); + } + } + + fn is_cancelled(&mut self) -> bool { + matches!(self.cancelled.try_recv(), Err(_) | Ok(Some(_))) + } + + fn on_error(&self, err: Error) { + if let Err(e) = self.scheduler.schedule(Task::ReRegisterRegion { + region_id: self.region.get_id(), + observe_id: self.handle.id, + cause: err, + }) { + warn!("schedule re-register task failed"; + "region_id" => self.region.get_id(), + "observe_id" => ?self.handle.id, + "error" => ?e); + } + RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); + } } #[derive(Debug)] -pub enum ScanEntry { - TxnEntry(Vec), +pub enum ScanEntries { Lock(Vec<(Key, Lock)>), None, } @@ -82,109 +104,66 @@ impl, E: KvEngine> ScannerPool { } } - pub fn spawn_task(&self, mut task: ScanTask) { + pub fn spawn_task(&self, mut task: ScanTask, concurrency_semaphore: Arc) { let cdc_handle = self.cdc_handle.clone(); let fut = async 
move { + tikv_util::defer!({ + RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); + }); if let Some(backoff) = task.backoff { RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM.observe(backoff.as_secs_f64()); - if let Err(e) = GLOBAL_TIMER_HANDLE + let mut backoff = GLOBAL_TIMER_HANDLE .delay(std::time::Instant::now() + backoff) .compat() - .await - { - error!("failed to backoff"; "err" => ?e); + .fuse(); + futures::select! { + res = backoff => if let Err(e) = res { + error!("failed to backoff"; "err" => ?e); + }, + _ = &mut task.cancelled => {} } - if (task.is_cancelled)() { + if task.is_cancelled() { return; } } + let _permit = concurrency_semaphore.acquire().await; + if task.is_cancelled() { + return; + } + fail::fail_point!("resolved_ts_before_scanner_get_snapshot"); let snap = match Self::get_snapshot(&mut task, cdc_handle).await { Ok(snap) => snap, Err(e) => { warn!("resolved_ts scan get snapshot failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. - } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } + task.on_error(e); return; } }; fail::fail_point!("resolved_ts_after_scanner_get_snapshot"); let start = Instant::now(); let apply_index = snap.get_apply_index().unwrap(); - let mut entries = vec![]; - match task.mode { - ScanMode::All | ScanMode::AllWithOldValue => { - let txn_extra_op = if let ScanMode::AllWithOldValue = task.mode { - TxnExtraOp::ReadOldValue - } else { - TxnExtraOp::Noop - }; - let mut scanner = ScannerBuilder::new(snap, TimeStamp::max()) - .range(None, None) - .build_delta_scanner(task.checkpoint_ts, txn_extra_op) - .unwrap(); - let mut done = false; - while !done && !(task.is_cancelled)() { - let (es, has_remaining) = match Self::scan_delta(&mut scanner) { - Ok(rs) => rs, - Err(e) => { - warn!("resolved_ts scan delta failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. 
- } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } - return; - } - }; - done = !has_remaining; - entries.push(ScanEntry::TxnEntry(es)); - } - } - ScanMode::LockOnly => { - let mut reader = MvccReader::new(snap, Some(MvccScanMode::Forward), false); - let mut done = false; - let mut start = None; - while !done && !(task.is_cancelled)() { - let (locks, has_remaining) = - match Self::scan_locks(&mut reader, start.as_ref(), task.checkpoint_ts) - { - Ok(rs) => rs, - Err(e) => { - warn!("resolved_ts scan lock failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. - } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } - return; - } - }; - done = !has_remaining; - if has_remaining { - start = Some(locks.last().unwrap().0.clone()) + let mut reader = MvccReader::new(snap, Some(MvccScanMode::Forward), false); + let mut done = false; + let mut start_key = None; + while !done && !task.is_cancelled() { + let (locks, has_remaining) = + match Self::scan_locks(&mut reader, start_key.as_ref(), task.checkpoint_ts) { + Ok(rs) => rs, + Err(e) => { + warn!("resolved_ts scan lock failed"; "err" => ?e); + task.on_error(e); + return; } - entries.push(ScanEntry::Lock(locks)); - } + }; + done = !has_remaining; + if has_remaining { + start_key = Some(locks.last().unwrap().0.clone()) } + task.send_entries(ScanEntries::Lock(locks), apply_index) + .await; } - entries.push(ScanEntry::None); RTS_SCAN_DURATION_HISTOGRAM.observe(start.saturating_elapsed().as_secs_f64()); - (task.send_entries)(entries, apply_index); + task.send_entries(ScanEntries::None, apply_index).await; }; self.workers.spawn(fut); } @@ -196,18 +175,21 @@ impl, E: KvEngine> ScannerPool { let mut last_err = None; for retry_times in 0..=GET_SNAPSHOT_RETRY_TIME { if retry_times != 0 { - if let Err(e) = GLOBAL_TIMER_HANDLE + let mut backoff = GLOBAL_TIMER_HANDLE .delay( std::time::Instant::now() + GET_SNAPSHOT_RETRY_BACKOFF_STEP 
.mul_f64(10_f64.powi(retry_times as i32 - 1)), ) .compat() - .await - { - error!("failed to backoff"; "err" => ?e); + .fuse(); + futures::select! { + res = backoff => if let Err(e) = res { + error!("failed to backoff"; "err" => ?e); + }, + _ = &mut task.cancelled => {} } - if (task.is_cancelled)() { + if task.is_cancelled() { return Err(box_err!("scan task cancelled")); } } @@ -256,24 +238,4 @@ impl, E: KvEngine> ScannerPool { .map_err(|e| Error::Other(box_err!("{:?}", e)))?; Ok((locks, has_remaining)) } - - fn scan_delta(scanner: &mut DeltaScanner) -> Result<(Vec, bool)> { - let mut entries = Vec::with_capacity(DEFAULT_SCAN_BATCH_SIZE); - let mut has_remaining = true; - while entries.len() < entries.capacity() { - match scanner - .next_entry() - .map_err(|e| Error::Other(box_err!("{:?}", e)))? - { - Some(entry) => { - entries.push(entry); - } - None => { - has_remaining = false; - break; - } - } - } - Ok((entries, has_remaining)) - } } diff --git a/src/config/mod.rs b/src/config/mod.rs index be2a52d9b07..4f9a9a01b4a 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3000,6 +3000,7 @@ pub struct ResolvedTsConfig { #[online_config(skip)] pub scan_lock_pool_size: usize, pub memory_quota: ReadableSize, + pub incremental_scan_concurrency: usize, } impl ResolvedTsConfig { @@ -3021,6 +3022,7 @@ impl Default for ResolvedTsConfig { advance_ts_interval: ReadableDuration::secs(20), scan_lock_pool_size: 2, memory_quota: ReadableSize::mb(256), + incremental_scan_concurrency: 6, } } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index c6e98e95c05..c6f787df9a7 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -857,6 +857,7 @@ fn test_serde_custom_tikv_config() { advance_ts_interval: ReadableDuration::secs(5), scan_lock_pool_size: 1, memory_quota: ReadableSize::mb(1), + incremental_scan_concurrency: 7, }; value.causal_ts = CausalTsConfig { renew_interval: ReadableDuration::millis(100), diff --git 
a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 653c3d2daef..ece8cabae49 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -707,6 +707,7 @@ enable = true advance-ts-interval = "5s" scan-lock-pool-size = 1 memory-quota = "1MB" +incremental-scan-concurrency = 7 [split] detect-times = 10 From 32f58924b825230d159714db63bed344e913a58a Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 15 Sep 2023 13:16:39 +0800 Subject: [PATCH 048/220] *: update rust-toolchain (#15584) close tikv/tikv#15581 Signed-off-by: glorv --- cmd/tikv-ctl/src/fork_readonly_tikv.rs | 1 + cmd/tikv-ctl/src/main.rs | 2 +- components/backup-stream/src/errors.rs | 4 +- .../backup-stream/src/metadata/client.rs | 5 +- components/backup-stream/src/router.rs | 4 +- .../backup-stream/src/subscription_track.rs | 2 +- components/backup-stream/src/utils.rs | 4 +- components/backup/src/endpoint.rs | 6 +-- components/batch-system/src/fsm.rs | 8 +++- components/case_macros/src/lib.rs | 10 ++-- components/cdc/src/delegate.rs | 2 +- components/cdc/src/endpoint.rs | 6 +-- .../concurrency_manager/src/lock_table.rs | 4 +- components/coprocessor_plugin_api/src/util.rs | 4 ++ components/encryption/src/config.rs | 9 +--- components/engine_rocks/src/logger.rs | 2 - components/engine_rocks/src/properties.rs | 15 +++--- .../engine_tirocks/src/properties/mvcc.rs | 2 +- .../engine_tirocks/src/properties/range.rs | 10 ++-- components/engine_traits/src/flush.rs | 2 +- components/engine_traits/src/lib.rs | 4 +- components/engine_traits/src/tablet.rs | 2 +- .../online_config_derive/src/lib.rs | 14 ++---- components/raftstore-v2/src/batch/store.rs | 6 ++- components/raftstore-v2/src/lib.rs | 1 + .../operation/command/admin/merge/prepare.rs | 4 +- .../src/operation/command/admin/split.rs | 4 +- .../command/admin/transfer_leader.rs | 20 ++++---- components/raftstore-v2/src/operation/life.rs | 8 +++- 
.../raftstore-v2/src/operation/query/local.rs | 4 +- .../src/operation/ready/apply_trace.rs | 2 +- .../src/operation/ready/snapshot.rs | 14 +++--- .../raftstore-v2/src/operation/txn_ext.rs | 4 +- .../src/operation/unsafe_recovery/demote.rs | 5 +- .../src/worker/cleanup/compact.rs | 16 +++++-- .../raftstore-v2/src/worker/pd/region.rs | 15 ++---- .../raftstore-v2/src/worker/pd/split.rs | 6 ++- components/raftstore-v2/src/worker/tablet.rs | 13 ++++- .../tests/integrations/cluster.rs | 4 +- .../raftstore/src/coprocessor/dispatcher.rs | 5 +- components/raftstore/src/errors.rs | 2 +- components/raftstore/src/lib.rs | 4 +- .../raftstore/src/store/async_io/write.rs | 6 ++- .../raftstore/src/store/entry_storage.rs | 8 ++-- components/raftstore/src/store/fsm/apply.rs | 18 +++---- components/raftstore/src/store/fsm/peer.rs | 8 ++-- components/raftstore/src/store/msg.rs | 24 +++++++--- components/raftstore/src/store/peer.rs | 48 +++++++++---------- .../raftstore/src/store/peer_storage.rs | 2 +- .../raftstore/src/store/region_snapshot.rs | 6 +-- .../raftstore/src/store/simple_write.rs | 24 +++++++--- components/raftstore/src/store/snap.rs | 4 +- components/raftstore/src/store/snap/io.rs | 4 +- components/raftstore/src/store/txn_ext.rs | 2 +- components/raftstore/src/store/util.rs | 3 +- components/raftstore/src/store/worker/pd.rs | 20 +++----- components/raftstore/src/store/worker/read.rs | 3 +- .../raftstore/src/store/worker/region.rs | 4 +- .../raftstore/src/store/worker/split_check.rs | 8 ++-- .../src/store/worker/split_controller.rs | 11 ++--- components/resolved_ts/src/cmd.rs | 6 +-- components/resolved_ts/src/endpoint.rs | 20 ++++---- components/resolved_ts/src/scanner.rs | 3 +- .../resource_control/src/resource_group.rs | 4 +- components/resource_metering/src/lib.rs | 2 +- components/resource_metering/src/model.rs | 2 +- .../src/recorder/sub_recorder/cpu.rs | 4 +- .../resource_metering/tests/recorder_test.rs | 12 ++--- components/server/src/common.rs | 4 +- 
components/snap_recovery/src/leader_keeper.rs | 4 +- components/sst_importer/src/import_mode2.rs | 2 +- components/sst_importer/src/sst_importer.rs | 17 +++---- components/sst_importer/src/util.rs | 3 +- components/test_coprocessor/src/store.rs | 2 +- .../example_plugin/src/lib.rs | 2 +- components/test_pd/src/server.rs | 8 +--- components/test_pd_client/src/pd.rs | 2 +- components/test_raftstore-v2/src/cluster.rs | 3 +- components/test_raftstore-v2/src/lib.rs | 2 + components/test_raftstore-v2/src/node.rs | 2 +- components/test_raftstore-v2/src/server.rs | 14 +++++- components/test_raftstore/src/lib.rs | 2 + components/test_raftstore/src/node.rs | 2 +- components/test_raftstore/src/server.rs | 8 +++- .../tidb_query_codegen/src/rpn_function.rs | 35 +++++++------- .../src/codec/collation/mod.rs | 2 +- .../tidb_query_datatype/src/codec/convert.rs | 12 ++--- .../src/codec/data_type/mod.rs | 2 +- .../src/codec/data_type/scalar.rs | 17 ++++--- .../tidb_query_datatype/src/codec/datum.rs | 8 ++-- .../src/codec/mysql/decimal.rs | 2 +- .../src/codec/mysql/duration.rs | 4 +- .../src/codec/mysql/json/comparison.rs | 4 +- .../src/codec/mysql/json/jcodec.rs | 8 ++-- .../src/codec/mysql/json/json_modify.rs | 2 +- .../src/codec/mysql/time/mod.rs | 10 ++-- .../src/codec/row/v2/row_slice.rs | 2 +- .../tidb_query_datatype/src/codec/table.rs | 2 +- .../src/index_scan_executor.rs | 4 +- components/tidb_query_executors/src/runner.rs | 18 +++---- .../src/selection_executor.rs | 4 +- .../src/util/aggr_executor.rs | 4 +- .../tidb_query_executors/src/util/mod.rs | 4 +- components/tidb_query_expr/src/impl_cast.rs | 2 +- .../tidb_query_expr/src/impl_miscellaneous.rs | 5 +- components/tidb_query_expr/src/impl_string.rs | 6 +-- components/tidb_query_expr/src/lib.rs | 2 + .../tidb_query_expr/src/types/expr_eval.rs | 11 ++--- components/tikv_kv/src/cursor.rs | 2 +- components/tikv_kv/src/lib.rs | 1 + components/tikv_util/src/logger/formatter.rs | 6 +-- components/tikv_util/src/lru.rs | 2 +- 
components/tikv_util/src/memory.rs | 2 +- .../src/metrics/allocator_metrics.rs | 2 +- components/tikv_util/src/mpsc/future.rs | 2 + components/tikv_util/src/sys/cpu_time.rs | 2 +- components/tikv_util/src/timer.rs | 4 +- components/txn_types/src/timestamp.rs | 10 +--- components/txn_types/src/types.rs | 18 ++----- rust-toolchain | 2 +- src/config/mod.rs | 20 ++++---- src/coprocessor/metrics.rs | 2 +- src/coprocessor/mod.rs | 2 + src/import/sst_service.rs | 6 +-- src/lib.rs | 3 +- src/server/debug2.rs | 2 +- src/server/gc_worker/compaction_filter.rs | 1 + src/server/gc_worker/gc_manager.rs | 8 ++-- src/server/gc_worker/gc_worker.rs | 14 ++---- src/server/lock_manager/deadlock.rs | 9 +--- src/server/raftkv/mod.rs | 5 +- src/server/raftkv2/mod.rs | 4 +- src/server/raftkv2/node.rs | 4 +- src/server/service/debug.rs | 1 - src/server/service/diagnostics/log.rs | 18 ++++--- src/server/service/diagnostics/sys.rs | 2 +- src/server/service/kv.rs | 1 - src/storage/lock_manager/lock_wait_context.rs | 12 ++--- .../lock_manager/lock_waiting_queue.rs | 7 +-- src/storage/metrics.rs | 2 +- src/storage/mod.rs | 32 ++++++------- src/storage/mvcc/reader/point_getter.rs | 2 +- src/storage/mvcc/reader/reader.rs | 21 ++++---- src/storage/mvcc/reader/scanner/forward.rs | 4 +- src/storage/raw/raw_mvcc.rs | 2 +- src/storage/txn/actions/prewrite.rs | 2 - src/storage/txn/commands/atomic_store.rs | 4 +- src/storage/txn/commands/prewrite.rs | 26 +++++----- src/storage/txn/latch.rs | 20 ++++---- src/storage/txn/sched_pool.rs | 2 +- .../benches/coprocessor_executors/util/mod.rs | 2 +- tests/benches/hierarchy/mvcc/mod.rs | 2 +- .../misc/coprocessor/codec/chunk/chunk.rs | 2 +- tests/benches/misc/raftkv/mod.rs | 2 + tests/benches/raftstore/mod.rs | 2 +- tests/failpoints/cases/mod.rs | 3 ++ tests/failpoints/cases/test_disk_full.rs | 8 ++-- tests/failpoints/cases/test_engine.rs | 1 + tests/failpoints/cases/test_hibernate.rs | 1 + tests/failpoints/cases/test_pd_client.rs | 1 + 
.../failpoints/cases/test_pd_client_legacy.rs | 1 + tests/failpoints/cases/test_rawkv.rs | 2 +- .../cases/test_read_execution_tracker.rs | 11 +++-- tests/failpoints/cases/test_split_region.rs | 3 +- tests/failpoints/cases/test_storage.rs | 4 +- tests/failpoints/cases/test_transaction.rs | 2 +- .../failpoints/cases/test_transfer_leader.rs | 4 +- tests/integrations/backup/mod.rs | 1 + tests/integrations/import/test_apply_log.rs | 2 +- tests/integrations/mod.rs | 2 + .../integrations/raftstore/test_bootstrap.rs | 4 +- .../raftstore/test_compact_lock_cf.rs | 4 +- tests/integrations/raftstore/test_stats.rs | 1 + 173 files changed, 584 insertions(+), 534 deletions(-) diff --git a/cmd/tikv-ctl/src/fork_readonly_tikv.rs b/cmd/tikv-ctl/src/fork_readonly_tikv.rs index ef3ae7f8023..d1a917f5624 100644 --- a/cmd/tikv-ctl/src/fork_readonly_tikv.rs +++ b/cmd/tikv-ctl/src/fork_readonly_tikv.rs @@ -265,6 +265,7 @@ where .map_err(|e| format!("copy({}, {}): {}", src.display(), dst.display(), e)) } +#[allow(clippy::permissions_set_readonly_false)] fn add_write_permission>(path: P) -> Result<(), String> { let path = path.as_ref(); let mut pmt = std::fs::metadata(path) diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 6baa1fe6c39..c1ab11cc507 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. -#![feature(once_cell)] #![feature(let_chains)] +#![feature(lazy_cell)] #[macro_use] extern crate log; diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index c3cc91da9ff..cc720d5aecc 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -158,7 +158,7 @@ where /// Like `errors.Annotate` in Go. /// Wrap an unknown error with [`Error::Other`]. -#[macro_export(crate)] +#[macro_export] macro_rules! 
annotate { ($inner: expr, $message: expr) => { { @@ -242,6 +242,7 @@ mod test { #[bench] // 2,685 ns/iter (+/- 194) + #[allow(clippy::unnecessary_literal_unwrap)] fn contextual_add_format_strings_directly(b: &mut test::Bencher) { b.iter(|| { let err = Error::Io(io::Error::new( @@ -305,6 +306,7 @@ mod test { #[bench] // 773 ns/iter (+/- 8) + #[allow(clippy::unnecessary_literal_unwrap)] fn baseline(b: &mut test::Bencher) { b.iter(|| { let err = Error::Io(io::Error::new( diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 1fdc1b3b1e8..df8f0f025b1 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -663,11 +663,10 @@ impl MetadataClient { let cp = match r.len() { 0 => { let global_cp = self.global_checkpoint_of(task).await?; - let cp = match global_cp { + match global_cp { None => self.get_task_start_ts_checkpoint(task).await?, Some(cp) => cp, - }; - cp + } } _ => Checkpoint::from_kv(&r[0])?, }; diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index b0d3453c958..6ce8486109f 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -950,7 +950,9 @@ impl StreamTaskInfo { .last_flush_time .swap(Box::into_raw(Box::new(Instant::now())), Ordering::SeqCst); // manual gc last instant - unsafe { Box::from_raw(ptr) }; + unsafe { + let _ = Box::from_raw(ptr); + } } pub fn should_flush(&self, flush_interval: &Duration) -> bool { diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 4f44ec46853..0803ba1b99a 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -309,7 +309,7 @@ impl SubscriptionTracer { } }; - let mut subscription = sub.value_mut(); + let subscription = sub.value_mut(); let old_epoch = 
subscription.meta.get_region_epoch(); let new_epoch = new_region.get_region_epoch(); diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 1b150eaa1f0..52b6f0e9391 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -342,7 +342,7 @@ pub fn request_to_triple(mut req: Request) -> Either<(Vec, Vec, CfName), /// `try_send!(s: Scheduler, task: T)` tries to send a task to the scheduler, /// once meet an error, would report it, with the current file and line (so it /// is made as a macro). returns whether it success. -#[macro_export(crate)] +#[macro_export] macro_rules! try_send { ($s:expr, $task:expr) => { match $s.schedule($task) { @@ -366,7 +366,7 @@ macro_rules! try_send { /// `backup_stream_debug`. because once we enable debug log for all crates, it /// would soon get too verbose to read. using this macro now we can enable debug /// log level for the crate only (even compile time...). -#[macro_export(crate)] +#[macro_export] macro_rules! 
debug { ($($t: tt)+) => { if cfg!(feature = "backup-stream-debug") { diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index a4efc162092..d6330f49966 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -2493,8 +2493,8 @@ pub mod tests { fn test_backup_file_name() { let region = metapb::Region::default(); let store_id = 1; - let test_cases = vec!["s3", "local", "gcs", "azure", "hdfs"]; - let test_target = vec![ + let test_cases = ["s3", "local", "gcs", "azure", "hdfs"]; + let test_target = [ "1/0_0_000", "1/0_0_000", "1_0_0_000", @@ -2513,7 +2513,7 @@ pub mod tests { assert_eq!(target.to_string(), prefix_arr.join(delimiter)); } - let test_target = vec!["1/0_0", "1/0_0", "1_0_0", "1_0_0", "1_0_0"]; + let test_target = ["1/0_0", "1/0_0", "1_0_0", "1_0_0", "1_0_0"]; for (storage_name, target) in test_cases.iter().zip(test_target.iter()) { let key = None; let filename = backup_file_name(store_id, ®ion, key, storage_name); diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 3fa5ad15a64..16113dde8e2 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -149,7 +149,9 @@ impl FsmState { Ok(_) => return, Err(Self::NOTIFYSTATE_DROP) => { let ptr = self.data.swap(ptr::null_mut(), Ordering::AcqRel); - unsafe { Box::from_raw(ptr) }; + unsafe { + let _ = Box::from_raw(ptr); + } return; } Err(s) => s, @@ -179,7 +181,9 @@ impl Drop for FsmState { fn drop(&mut self) { let ptr = self.data.swap(ptr::null_mut(), Ordering::SeqCst); if !ptr.is_null() { - unsafe { Box::from_raw(ptr) }; + unsafe { + let _ = Box::from_raw(ptr); + } } self.state_cnt.fetch_sub(1, Ordering::Relaxed); } diff --git a/components/case_macros/src/lib.rs b/components/case_macros/src/lib.rs index 057b68065d2..b779373a59d 100644 --- a/components/case_macros/src/lib.rs +++ b/components/case_macros/src/lib.rs @@ -5,12 +5,12 @@ use proc_macro::{Group, Literal, TokenStream, 
TokenTree}; macro_rules! transform_idents_in_stream_to_string { - ($stream:ident, $transform:expr) => { + ($stream:ident, $transform:ident) => { $stream .into_iter() .map(|token_tree| match token_tree { TokenTree::Ident(ref ident) => { - Literal::string(&$transform(ident.to_string())).into() + Literal::string(&$transform(&ident.to_string())).into() } // find all idents in `TokenGroup` apply and reconstruct the group TokenTree::Group(ref group) => TokenTree::Group(Group::new( @@ -20,7 +20,7 @@ macro_rules! transform_idents_in_stream_to_string { .into_iter() .map(|group_token_tree| { if let TokenTree::Ident(ref ident) = group_token_tree { - Literal::string(&$transform(ident.to_string())).into() + Literal::string(&$transform(&ident.to_string())).into() } else { group_token_tree } @@ -53,7 +53,7 @@ fn to_snake(s: &str) -> String { /// e.g. `HelloWorld` -> `hello-world` #[proc_macro] pub fn kebab_case(stream: TokenStream) -> TokenStream { - transform_idents_in_stream_to_string!(stream, |s: String| to_kebab(&s)) + transform_idents_in_stream_to_string!(stream, to_kebab) } /// Expands idents in the input stream as snake-case string literal @@ -61,5 +61,5 @@ pub fn kebab_case(stream: TokenStream) -> TokenStream { /// e.g. 
`HelloWorld` -> `hello_world` #[proc_macro] pub fn snake_case(stream: TokenStream) -> TokenStream { - transform_idents_in_stream_to_string!(stream, |s: String| to_snake(&s)) + transform_idents_in_stream_to_string!(stream, to_snake) } diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index c82c4cb6f13..18528fd08e9 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -1437,7 +1437,7 @@ mod tests { #[test] fn test_observed_range() { - for case in vec![ + for case in [ (b"".as_slice(), b"".as_slice(), false), (b"a", b"", false), (b"", b"b", false), diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index a5f00a08028..9d5601eba84 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1015,10 +1015,10 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint( - &'m self, + pub fn find_first( + &self, start_key: Option<&Key>, end_key: Option<&Key>, mut pred: impl FnMut(Arc) -> Option, diff --git a/components/coprocessor_plugin_api/src/util.rs b/components/coprocessor_plugin_api/src/util.rs index 31d75610d75..06e8847402f 100644 --- a/components/coprocessor_plugin_api/src/util.rs +++ b/components/coprocessor_plugin_api/src/util.rs @@ -19,10 +19,14 @@ pub type PluginConstructorSignature = /// Type signature of the exported function with symbol /// [`PLUGIN_GET_BUILD_INFO_SYMBOL`]. +// emit this warn because to fix it need to change the data type which is a breaking change. +#[allow(improper_ctypes_definitions)] pub type PluginGetBuildInfoSignature = extern "C" fn() -> BuildInfo; /// Type signature of the exported function with symbol /// [`PLUGIN_GET_PLUGIN_INFO_SYMBOL`]. +// emit this warn because to fix it need to change the data type which is a breaking change. 
+#[allow(improper_ctypes_definitions)] pub type PluginGetPluginInfoSignature = extern "C" fn() -> PluginInfo; /// Automatically collected build information about the plugin that is exposed diff --git a/components/encryption/src/config.rs b/components/encryption/src/config.rs index 23e049e0df4..4455e4ce7cc 100644 --- a/components/encryption/src/config.rs +++ b/components/encryption/src/config.rs @@ -134,11 +134,12 @@ impl KmsConfig { } } -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "kebab-case", tag = "type")] pub enum MasterKeyConfig { // Store encryption metadata as plaintext. Data still get encrypted. Not allowed to use if // encryption is enabled. (i.e. when encryption_config.method != Plaintext). + #[default] Plaintext, // Pass master key from a file, with key encoded as a readable hex string. The file should end @@ -156,12 +157,6 @@ pub enum MasterKeyConfig { }, } -impl Default for MasterKeyConfig { - fn default() -> Self { - MasterKeyConfig::Plaintext - } -} - mod encryption_method_serde { use std::fmt; diff --git a/components/engine_rocks/src/logger.rs b/components/engine_rocks/src/logger.rs index 85f4de713ac..185411dcacf 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -3,7 +3,6 @@ use rocksdb::{DBInfoLogLevel as InfoLogLevel, Logger}; use tikv_util::{crit, debug, error, info, warn}; // TODO(yiwu): abstract the Logger interface. 
-#[derive(Default)] pub struct RocksdbLogger; impl Logger for RocksdbLogger { @@ -44,7 +43,6 @@ impl Logger for TabletLogger { } } -#[derive(Default)] pub struct RaftDbLogger; impl Logger for RaftDbLogger { diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 87ccab9e5ab..700d7621dc6 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -144,10 +144,7 @@ pub struct RangeProperties { impl RangeProperties { pub fn get(&self, key: &[u8]) -> &RangeOffsets { - let idx = self - .offsets - .binary_search_by_key(&key, |&(ref k, _)| k) - .unwrap(); + let idx = self.offsets.binary_search_by_key(&key, |(k, _)| k).unwrap(); &self.offsets[idx].1 } @@ -205,11 +202,11 @@ impl RangeProperties { if start == end { return (0, 0); } - let start_offset = match self.offsets.binary_search_by_key(&start, |&(ref k, _)| k) { + let start_offset = match self.offsets.binary_search_by_key(&start, |(k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; - let end_offset = match self.offsets.binary_search_by_key(&end, |&(ref k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end, |(k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; @@ -227,7 +224,7 @@ impl RangeProperties { ) -> Vec<(Vec, RangeOffsets)> { let start_offset = match self .offsets - .binary_search_by_key(&start_key, |&(ref k, _)| k) + .binary_search_by_key(&start_key, |(ref k, _)| k) { Ok(idx) => { if idx == self.offsets.len() - 1 { @@ -239,7 +236,7 @@ impl RangeProperties { Err(next_idx) => next_idx, }; - let end_offset = match self.offsets.binary_search_by_key(&end_key, |&(ref k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end_key, |(ref k, _)| k) { Ok(idx) => { if idx == 0 { return vec![]; @@ -869,7 +866,7 @@ mod tests { let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); b.iter(|| { - for &(ref k, ref v) in &entries { + 
for (k, v) in &entries { collector.add(k, v, DBEntryType::Put, 0, 0); } }); diff --git a/components/engine_tirocks/src/properties/mvcc.rs b/components/engine_tirocks/src/properties/mvcc.rs index 1ca170f33d5..66c96284ea3 100644 --- a/components/engine_tirocks/src/properties/mvcc.rs +++ b/components/engine_tirocks/src/properties/mvcc.rs @@ -356,7 +356,7 @@ mod tests { let mut collector = MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Txn); b.iter(|| { - for &(ref k, ref v) in &entries { + for (k, v) in &entries { collector.add(k, v, EntryType::kEntryPut, 0, 0).unwrap(); } }); diff --git a/components/engine_tirocks/src/properties/range.rs b/components/engine_tirocks/src/properties/range.rs index 59b9e68a6bb..e8a3411b02f 100644 --- a/components/engine_tirocks/src/properties/range.rs +++ b/components/engine_tirocks/src/properties/range.rs @@ -53,7 +53,7 @@ impl RangeProperties { pub fn get(&self, key: &[u8]) -> &RangeOffsets { let idx = self .offsets - .binary_search_by_key(&key, |&(ref k, _)| k) + .binary_search_by_key(&key, |(k, _)| k) .unwrap(); &self.offsets[idx].1 } @@ -112,11 +112,11 @@ impl RangeProperties { if start == end { return (0, 0); } - let start_offset = match self.offsets.binary_search_by_key(&start, |&(ref k, _)| k) { + let start_offset = match self.offsets.binary_search_by_key(&start, |(k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; - let end_offset = match self.offsets.binary_search_by_key(&end, |&(ref k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end, |(k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; @@ -134,7 +134,7 @@ impl RangeProperties { ) -> Vec<(Vec, RangeOffsets)> { let start_offset = match self .offsets - .binary_search_by_key(&start_key, |&(ref k, _)| k) + .binary_search_by_key(&start_key, |(k, _)| k) { Ok(idx) => { if idx == self.offsets.len() - 1 { @@ -146,7 +146,7 @@ impl RangeProperties { Err(next_idx) => next_idx, 
}; - let end_offset = match self.offsets.binary_search_by_key(&end_key, |&(ref k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end_key, |(k, _)| k) { Ok(idx) => { if idx == 0 { return vec![]; diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 8590236e126..6449399cef8 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -119,7 +119,7 @@ impl SstApplyState { for sst in ssts { let cf_index = data_cf_offset(sst.get_cf_name()); if let Some(metas) = sst_list.get_mut(cf_index) { - metas.drain_filter(|entry| entry.sst.get_uuid() == sst.get_uuid()); + let _ = metas.extract_if(|entry| entry.sst.get_uuid() == sst.get_uuid()); } } } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index e09b1b52733..0f89776e7fd 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -253,8 +253,8 @@ #![feature(assert_matches)] #![feature(linked_list_cursors)] #![feature(let_chains)] -#![feature(str_split_as_str)] -#![feature(drain_filter)] +#![feature(str_split_remainder)] +#![feature(extract_if)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index c88f1548513..64e6dcbd4b4 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -241,7 +241,7 @@ impl TabletRegistry { let mut parts = name.rsplit('_'); let suffix = parts.next()?.parse().ok()?; let id = parts.next()?.parse().ok()?; - let prefix = parts.as_str(); + let prefix = parts.remainder().unwrap_or(""); Some((prefix, id, suffix)) } diff --git a/components/online_config/online_config_derive/src/lib.rs b/components/online_config/online_config_derive/src/lib.rs index bb37aad5924..e48a540c6b8 100644 --- a/components/online_config/online_config_derive/src/lib.rs +++ b/components/online_config/online_config_derive/src/lib.rs @@ 
-330,15 +330,11 @@ fn is_option_type(ty: &Type) -> bool { // TODO store (with lazy static) the vec of string // TODO maybe optimization, reverse the order of segments fn extract_option_segment(path: &Path) -> Option<&PathSegment> { - let idents_of_path = path - .segments - .iter() - .into_iter() - .fold(String::new(), |mut acc, v| { - acc.push_str(&v.ident.to_string()); - acc.push('|'); - acc - }); + let idents_of_path = path.segments.iter().fold(String::new(), |mut acc, v| { + acc.push_str(&v.ident.to_string()); + acc.push('|'); + acc + }); vec!["Option|", "std|option|Option|", "core|option|Option|"] .into_iter() .find(|s| idents_of_path == *s) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 4c142a43abf..5f036c61020 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -489,7 +489,11 @@ impl StorePollerBuilder { self.remove_dir(&path)?; continue; } - let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; + let Some((prefix, region_id, tablet_index)) = + self.tablet_registry.parse_tablet_name(&path) + else { + continue; + }; if prefix == MERGE_SOURCE_PREFIX { continue; } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 5b5e132b9ce..697d0525169 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -27,6 +27,7 @@ #![feature(box_into_inner)] #![feature(assert_matches)] #![feature(option_get_or_insert_default)] +#![allow(clippy::needless_pass_by_ref_mut)] mod batch; mod bootstrap; diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index d3d1896287c..76b71a8906c 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -343,7 
+343,9 @@ impl Peer { entry.get_data(), entry.get_index(), entry.get_term(), - ) else { continue }; + ) else { + continue; + }; let cmd_type = cmd.get_admin_request().get_cmd_type(); match cmd_type { AdminCmdType::TransferLeader diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 0f9cae7218d..2fe2b4b5735 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -1098,7 +1098,9 @@ mod test { } } - let AdminCmdResult::SplitRegion(SplitResult { tablet, .. }) = apply_res else { panic!() }; + let AdminCmdResult::SplitRegion(SplitResult { tablet, .. }) = apply_res else { + panic!() + }; // update cache let mut cache = apply.tablet_registry().get(parent_id).unwrap(); cache.set(*tablet.downcast().unwrap()); diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 4cdeba3bc41..f60b9828bbb 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -50,21 +50,21 @@ impl Peer { /// to target follower first to ensures it's ready to become leader. /// After that the real transfer leader process begin. /// - /// 1. pre_transfer_leader on leader: - /// Leader will send a MsgTransferLeader to follower. - /// 2. execute_transfer_leader on follower - /// If follower passes all necessary checks, it will reply an - /// ACK with type MsgTransferLeader and its promised applied index. - /// 3. ready_to_transfer_leader on leader: - /// Leader checks if it's appropriate to transfer leadership. If it - /// does, it calls raft transfer_leader API to do the remaining work. + /// 1. pre_transfer_leader on leader: Leader will send a MsgTransferLeader + /// to follower. + /// 2. 
execute_transfer_leader on follower If follower passes all necessary + /// checks, it will reply an ACK with type MsgTransferLeader and its + /// promised applied index. + /// 3. ready_to_transfer_leader on leader: Leader checks if it's appropriate + /// to transfer leadership. If it does, it calls raft transfer_leader API + /// to do the remaining work. /// /// Additional steps when there are remaining pessimistic /// locks to propose (detected in function on_transfer_leader_msg). /// 1. Leader firstly proposes pessimistic locks and then proposes a /// TransferLeader command. - /// 2. The follower applies the TransferLeader command and replies an - /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// 2. The follower applies the TransferLeader command and replies an ACK + /// with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. /// /// See also: tikv/rfcs#37. pub fn propose_transfer_leader( diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 8fe1d2a07b3..395774e17f1 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -637,8 +637,12 @@ impl Peer { let check_peer_id = check.get_check_peer().get_id(); let records = self.storage().region_state().get_merged_records(); let Some(record) = records.iter().find(|r| { - r.get_source_peers().iter().any(|p| p.get_id() == check_peer_id) - }) else { return }; + r.get_source_peers() + .iter() + .any(|p| p.get_id() == check_peer_id) + }) else { + return; + }; let source_index = record.get_source_index(); forward_destroy_to_source_peer(msg, |m| { let source_checkpoint = super::merge_source_path( diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 2f074fdc04d..5f6d589eca6 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -345,7 +345,9 @@ where 
match fut.await? { Some(query_res) => { if query_res.read().is_none() { - let QueryResult::Response(res) = query_res else { unreachable!() }; + let QueryResult::Response(res) = query_res else { + unreachable!() + }; // Get an error explicitly in header, // or leader reports KeyIsLocked error via read index. assert!( diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index af0257e763f..2b6c9c666e6 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -774,7 +774,7 @@ impl Peer { flushed = true; let flush_state = self.flush_state().clone(); - let mut apply_trace = self.storage_mut().apply_trace_mut(); + let apply_trace = self.storage_mut().apply_trace_mut(); let flushed_indexes = flush_state.as_ref().flushed_index(); for i in 0..flushed_indexes.len() { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 9e0ed449cef..15caf5f0c84 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -570,10 +570,9 @@ impl Storage { pub fn cancel_generating_snap_due_to_compacted(&self, compact_to: u64) { let mut states = self.snap_states.borrow_mut(); states.retain(|id, state| { - let SnapState::Generating { - ref index, - .. - } = *state else { return true; }; + let SnapState::Generating { ref index, .. } = *state else { + return true; + }; let snap_index = index.load(Ordering::SeqCst); if snap_index == 0 || compact_to <= snap_index + 1 { return true; @@ -600,10 +599,9 @@ impl Storage { } let (mut snapshot, to_peer_id) = *res.unwrap(); if let Some(state) = self.snap_states.borrow_mut().get_mut(&to_peer_id) { - let SnapState::Generating { - ref index, - .. - } = *state else { return false }; + let SnapState::Generating { ref index, .. 
} = *state else { + return false; + }; if snapshot.get_metadata().get_index() < index.load(Ordering::SeqCst) { warn!( self.logger(), diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 272b2526b39..6c3a9269a7f 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -266,7 +266,9 @@ impl Peer { self.logger, "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else { + unreachable!() + }; self.on_simple_write(ctx, write.header, write.data, write.ch); true } diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs index 37962a45452..e7b3c8e62b8 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs @@ -100,7 +100,10 @@ impl Peer { failed_voters, target_index, demote_after_exit, - }) = self.unsafe_recovery_state() else { return }; + }) = self.unsafe_recovery_state() + else { + return; + }; if self.raft_group().raft.raft_log.applied < *target_index { return; diff --git a/components/raftstore-v2/src/worker/cleanup/compact.rs b/components/raftstore-v2/src/worker/cleanup/compact.rs index 7acdb943b91..feb519a04ad 100644 --- a/components/raftstore-v2/src/worker/cleanup/compact.rs +++ b/components/raftstore-v2/src/worker/cleanup/compact.rs @@ -97,8 +97,12 @@ where ) { Ok(mut region_ids) => { for region_id in region_ids.drain(..) 
{ - let Some(mut tablet_cache) = self.tablet_registry.get(region_id) else {continue}; - let Some(tablet) = tablet_cache.latest() else {continue}; + let Some(mut tablet_cache) = self.tablet_registry.get(region_id) else { + continue; + }; + let Some(tablet) = tablet_cache.latest() else { + continue; + }; for cf in &cf_names { if let Err(e) = tablet.compact_range_cf(cf, None, None, false, 1 /* threads */) @@ -143,8 +147,12 @@ fn collect_regions_to_compact( ); let mut regions_to_compact = vec![]; for id in region_ids { - let Some(mut tablet_cache) = reg.get(id) else {continue}; - let Some(tablet) = tablet_cache.latest() else {continue}; + let Some(mut tablet_cache) = reg.get(id) else { + continue; + }; + let Some(tablet) = tablet_cache.latest() else { + continue; + }; if tablet.auto_compactions_is_disabled().expect("cf") { info!( logger, diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index 763e12fff07..999eccb4962 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -113,10 +113,7 @@ where let approximate_keys = task.approximate_keys.unwrap_or_default(); let region_id = task.region.get_id(); - let peer_stat = self - .region_peers - .entry(region_id) - .or_insert_with(PeerStat::default); + let peer_stat = self.region_peers.entry(region_id).or_default(); peer_stat.approximate_size = approximate_size; peer_stat.approximate_keys = approximate_keys; @@ -373,10 +370,7 @@ where pub fn handle_update_read_stats(&mut self, mut stats: ReadStats) { for (region_id, region_info) in stats.region_infos.iter_mut() { - let peer_stat = self - .region_peers - .entry(*region_id) - .or_insert_with(PeerStat::default); + let peer_stat = self.region_peers.entry(*region_id).or_default(); peer_stat.read_bytes += region_info.flow.read_bytes as u64; peer_stat.read_keys += region_info.flow.read_keys as u64; self.store_stat.engine_total_bytes_read += 
region_info.flow.read_bytes as u64; @@ -398,10 +392,7 @@ where pub fn handle_update_write_stats(&mut self, mut stats: WriteStats) { for (region_id, region_info) in stats.region_infos.iter_mut() { - let peer_stat = self - .region_peers - .entry(*region_id) - .or_insert_with(PeerStat::default); + let peer_stat = self.region_peers.entry(*region_id).or_default(); peer_stat.query_stats.add_query_stats(®ion_info.0); self.store_stat .engine_total_query_num diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index 7fec5a31bb6..7bafb6c442a 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -142,8 +142,10 @@ where let f = async move { for split_info in split_infos { - let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await else { continue }; + let Ok(Some(region)) = pd_client.get_region_by_id(split_info.region_id).await + else { + continue; + }; // Try to split the region with the given split key. 
if let Some(split_key) = split_info.split_key { Self::ask_batch_split_imp( diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs index 7c330353836..0b0429eb8d1 100644 --- a/components/raftstore-v2/src/worker/tablet.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -467,7 +467,8 @@ impl Runner { let Some(Some(tablet)) = self .tablet_registry .get(region_id) - .map(|mut cache| cache.latest().cloned()) else { + .map(|mut cache| cache.latest().cloned()) + else { warn!( self.logger, "flush memtable failed to acquire tablet"; @@ -555,7 +556,15 @@ impl Runner { } fn delete_range(&self, delete_range: Task) { - let Task::DeleteRange { region_id, tablet, cf, start_key, end_key, cb } = delete_range else { + let Task::DeleteRange { + region_id, + tablet, + cf, + start_key, + end_key, + cb, + } = delete_range + else { slog_panic!(self.logger, "unexpected task"; "task" => format!("{}", delete_range)) }; diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 5b3cc5feb93..a949725090d 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -137,7 +137,9 @@ impl TestRouter { match res { Ok(_) => return block_on(sub.result()).is_some(), Err(TrySendError::Disconnected(m)) => { - let PeerMsg::WaitFlush(ch) = m else { unreachable!() }; + let PeerMsg::WaitFlush(ch) = m else { + unreachable!() + }; match self .store_router() .send_control(StoreMsg::WaitFlush { region_id, ch }) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index d082013cd2c..756b7dc399e 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -471,10 +471,7 @@ impl CoprocessorHost { BoxSplitCheckObserver::new(KeysCheckObserver::new(ch)), ); registry.register_split_check_observer(100, 
BoxSplitCheckObserver::new(HalfCheckObserver)); - registry.register_split_check_observer( - 400, - BoxSplitCheckObserver::new(TableCheckObserver::default()), - ); + registry.register_split_check_observer(400, BoxSplitCheckObserver::new(TableCheckObserver)); registry.register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); CoprocessorHost { registry, cfg } } diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index d1597a77121..6cf83a6cf84 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -223,7 +223,7 @@ impl From for errorpb::Error { .mut_proposal_in_merging_mode() .set_region_id(region_id); } - Error::Transport(reason) if reason == DiscardReason::Full => { + Error::Transport(DiscardReason::Full) => { let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(RAFTSTORE_IS_BUSY.to_owned()); errorpb.set_server_is_busy(server_is_busy_err); diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 1db5f79d226..197eaefeac7 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -5,11 +5,13 @@ #![feature(div_duration)] #![feature(min_specialization)] #![feature(box_patterns)] -#![feature(hash_drain_filter)] +#![feature(hash_extract_if)] #![feature(let_chains)] #![feature(assert_matches)] #![feature(type_alias_impl_trait)] +#![feature(impl_trait_in_assoc_type)] #![recursion_limit = "256"] +#![allow(clippy::needless_pass_by_ref_mut)] #[cfg(test)] extern crate test; diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index eedd5052bbb..12617bc28a2 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -419,7 +419,11 @@ where } self.state_size = 0; if let ExtraBatchWrite::V2(_) = self.extra_batch_write { - let ExtraBatchWrite::V2(lb) = mem::replace(&mut self.extra_batch_write, 
ExtraBatchWrite::None) else { unreachable!() }; + let ExtraBatchWrite::V2(lb) = + mem::replace(&mut self.extra_batch_write, ExtraBatchWrite::None) + else { + unreachable!() + }; wb.merge(lb).unwrap(); } } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index c91c68538dd..95f099f77a7 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1338,14 +1338,14 @@ pub mod tests { // Test the initial data structure size. let (tx, rx) = mpsc::sync_channel(8); let mut cache = EntryCache::new_with_cb(move |c: i64| tx.send(c).unwrap()); - assert_eq!(rx.try_recv().unwrap(), 896); + assert_eq!(rx.try_recv().unwrap(), 0); cache.append( 0, 0, &[new_padded_entry(101, 1, 1), new_padded_entry(102, 1, 2)], ); - assert_eq!(rx.try_recv().unwrap(), 3); + assert_eq!(rx.try_recv().unwrap(), 419); cache.prepend(vec![new_padded_entry(100, 1, 1)]); assert_eq!(rx.try_recv().unwrap(), 1); @@ -1371,7 +1371,7 @@ pub mod tests { // Test trace a dangle entry. let cached_entries = CachedEntries::new(vec![new_padded_entry(100, 1, 1)]); cache.trace_cached_entries(cached_entries); - assert_eq!(rx.try_recv().unwrap(), 1); + assert_eq!(rx.try_recv().unwrap(), 97); // Test trace an entry which is still in cache. let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); @@ -1398,7 +1398,7 @@ pub mod tests { assert_eq!(rx.try_recv().unwrap(), -7); drop(cache); - assert_eq!(rx.try_recv().unwrap(), -896); + assert_eq!(rx.try_recv().unwrap(), -512); } #[test] diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index c170e5a35f9..406c8d79d18 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1262,9 +1262,9 @@ where apply_ctx.host.on_empty_cmd(&self.region, index, term); // 1. When a peer become leader, it will send an empty entry. - // 2. 
When a leader tries to read index during transferring leader, - // it will also propose an empty entry. But that entry will not contain - // any associated callback. So no need to clear callback. + // 2. When a leader tries to read index during transferring leader, it will also + // propose an empty entry. But that entry will not contain any associated + // callback. So no need to clear callback. while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { if let Some(cb) = cmd.cb.take() { apply_ctx @@ -4787,12 +4787,12 @@ where // command may not read the writes of previous commands and break ACID. If // it's still leader, there are two possibility that mailbox is closed: // 1. The process is shutting down. - // 2. The leader is destroyed. A leader won't propose to destroy itself, so - // it should either destroyed by older leaders or newer leaders. Leader - // won't respond to read until it has applied to current term, so no - // command will be proposed until command from older leaders have applied, - // which will then stop it from accepting proposals. If the command is - // proposed by new leader, then it won't be able to propose new proposals. + // 2. The leader is destroyed. A leader won't propose to destroy itself, so it + // should either destroyed by older leaders or newer leaders. Leader won't + // respond to read until it has applied to current term, so no command will + // be proposed until command from older leaders have applied, which will then + // stop it from accepting proposals. If the command is proposed by new + // leader, then it won't be able to propose new proposals. // So only shutdown needs to be checked here. if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { for p in apply.cbs.drain(..) 
{ diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d61e6784295..36c4c7e8e5f 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1015,10 +1015,10 @@ where // in snapshot recovery after we stopped all conf changes from PD. // if the follower slow than leader and has the pending conf change. // that's means - // 1. if the follower didn't finished the conf change - // => it cannot be chosen to be leader during recovery. - // 2. if the follower has been chosen to be leader - // => it already apply the pending conf change already. + // 1. if the follower didn't finished the conf change => it cannot be chosen to + // be leader during recovery. + // 2. if the follower has been chosen to be leader => it already apply the + // pending conf change already. return; } debug!( diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 64c5be6d7e1..a858b5afddd 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -169,19 +169,25 @@ where } pub fn has_proposed_cb(&self) -> bool { - let Callback::Write { proposed_cb, .. } = self else { return false; }; + let Callback::Write { proposed_cb, .. } = self else { + return false; + }; proposed_cb.is_some() } pub fn invoke_proposed(&mut self) { - let Callback::Write { proposed_cb, .. } = self else { return; }; + let Callback::Write { proposed_cb, .. } = self else { + return; + }; if let Some(cb) = proposed_cb.take() { cb(); } } pub fn invoke_committed(&mut self) { - let Callback::Write { committed_cb, .. } = self else { return; }; + let Callback::Write { committed_cb, .. } = self else { + return; + }; if let Some(cb) = committed_cb.take() { cb(); } @@ -195,12 +201,16 @@ where } pub fn take_proposed_cb(&mut self) -> Option { - let Callback::Write { proposed_cb, .. } = self else { return None; }; + let Callback::Write { proposed_cb, .. 
} = self else { + return None; + }; proposed_cb.take() } pub fn take_committed_cb(&mut self) -> Option { - let Callback::Write { committed_cb, .. } = self else { return None; }; + let Callback::Write { committed_cb, .. } = self else { + return None; + }; committed_cb.take() } } @@ -258,7 +268,9 @@ impl ReadCallback for Callback { } fn read_tracker(&self) -> Option { - let Callback::Read { tracker, .. } = self else { return None; }; + let Callback::Read { tracker, .. } = self else { + return None; + }; Some(*tracker) } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8ef857bfa12..aafd2f9695b 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2314,14 +2314,14 @@ where CheckApplyingSnapStatus::Applying => { // If this peer is applying snapshot, we should not get a new ready. // There are two reasons in my opinion: - // 1. If we handle a new ready and persist the data(e.g. entries), - // we can not tell raft-rs that this ready has been persisted because - // the ready need to be persisted one by one from raft-rs's view. - // 2. When this peer is applying snapshot, the response msg should not - // be sent to leader, thus the leader will not send new entries to - // this peer. Although it's possible a new leader may send a AppendEntries - // msg to this peer, this possibility is very low. In most cases, there - // is no msg need to be handled. + // 1. If we handle a new ready and persist the data(e.g. entries), we can not + // tell raft-rs that this ready has been persisted because the ready need + // to be persisted one by one from raft-rs's view. + // 2. When this peer is applying snapshot, the response msg should not be sent + // to leader, thus the leader will not send new entries to this peer. + // Although it's possible a new leader may send a AppendEntries msg to this + // peer, this possibility is very low. In most cases, there is no msg need + // to be handled. 
// So we choose to not get a new ready which makes the logic more clear. debug!( "still applying snapshot, skip further handling"; @@ -4467,27 +4467,25 @@ where /// to target follower first to ensures it's ready to become leader. /// After that the real transfer leader process begin. /// - /// 1. pre_transfer_leader on leader: - /// Leader will send a MsgTransferLeader to follower. - /// 2. pre_ack_transfer_leader_msg on follower: - /// If follower passes all necessary checks, it will try to warmup - /// the entry cache. - /// 3. ack_transfer_leader_msg on follower: - /// When the entry cache has been warmed up or the operator is timeout, - /// the follower reply an ACK with type MsgTransferLeader and - /// its promised persistent index. + /// 1. pre_transfer_leader on leader: Leader will send a MsgTransferLeader + /// to follower. + /// 2. pre_ack_transfer_leader_msg on follower: If follower passes all + /// necessary checks, it will try to warmup the entry cache. + /// 3. ack_transfer_leader_msg on follower: When the entry cache has been + /// warmed up or the operator is timeout, the follower reply an ACK with + /// type MsgTransferLeader and its promised persistent index. /// /// Additional steps when there are remaining pessimistic /// locks to propose (detected in function on_transfer_leader_msg). /// 1. Leader firstly proposes pessimistic locks and then proposes a /// TransferLeader command. - /// 2. ack_transfer_leader_msg on follower again: - /// The follower applies the TransferLeader command and replies an - /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// 2. ack_transfer_leader_msg on follower again: The follower applies + /// the TransferLeader command and replies an ACK with special context + /// TRANSFER_LEADER_COMMAND_REPLY_CTX. /// - /// 4. ready_to_transfer_leader on leader: - /// Leader checks if it's appropriate to transfer leadership. If it - /// does, it calls raft transfer_leader API to do the remaining work. + /// 4. 
ready_to_transfer_leader on leader: Leader checks if it's appropriate + /// to transfer leadership. If it does, it calls raft transfer_leader API + /// to do the remaining work. /// /// See also: tikv/rfcs#37. fn propose_transfer_leader( @@ -5820,7 +5818,7 @@ mod tests { admin_req.clear_transfer_leader(); req.clear_admin_request(); - for (op, policy) in vec![ + for (op, policy) in [ (CmdType::Get, RequestPolicy::ReadLocal), (CmdType::Snap, RequestPolicy::ReadLocal), (CmdType::Put, RequestPolicy::ProposeNormal), @@ -5973,7 +5971,7 @@ mod tests { // (1, 4) and (1, 5) is not committed let entries = vec![(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (2, 6), (2, 7)]; - let committed = vec![(1, 1), (1, 2), (1, 3), (2, 6), (2, 7)]; + let committed = [(1, 1), (1, 2), (1, 3), (2, 6), (2, 7)]; for (index, term) in entries.clone() { if term != 1 { continue; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index a888929ca98..1556338e9c0 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -96,7 +96,7 @@ impl PartialEq for SnapState { (&SnapState::Relax, &SnapState::Relax) | (&SnapState::ApplyAborted, &SnapState::ApplyAborted) | (&SnapState::Generating { .. }, &SnapState::Generating { .. 
}) => true, - (&SnapState::Applying(ref b1), &SnapState::Applying(ref b2)) => { + (SnapState::Applying(b1), SnapState::Applying(b2)) => { b1.load(Ordering::Relaxed) == b2.load(Ordering::Relaxed) } _ => false, diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index bc22dfbf586..40168707f6a 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -438,7 +438,7 @@ mod tests { (b"a9".to_vec(), b"v9".to_vec()), ]; - for &(ref k, ref v) in &base_data { + for (k, v) in &base_data { engines.kv.put(&data_key(k), v).unwrap(); } let store = new_peer_storage(engines, &r); @@ -482,11 +482,11 @@ mod tests { let mut data = vec![]; { let db = &engines.kv; - for &(ref k, level) in &levels { + for (k, level) in &levels { db.put(&data_key(k), k).unwrap(); db.flush_cfs(&[], true).unwrap(); data.push((k.to_vec(), k.to_vec())); - db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(level)) + db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(*level)) .unwrap(); } } diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index a303a586935..1d8341c1c0b 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -579,13 +579,17 @@ mod tests { SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let write = decoder.next().unwrap(); - let SimpleWrite::Put(put) = write else { panic!("should be put") }; + let SimpleWrite::Put(put) = write else { + panic!("should be put") + }; assert_eq!(put.cf, CF_DEFAULT); assert_eq!(put.key, b"key"); assert_eq!(put.value, b""); let write = decoder.next().unwrap(); - let SimpleWrite::Delete(delete) = write else { panic!("should be delete") }; + let SimpleWrite::Delete(delete) = write else { + panic!("should be delete") + }; 
assert_eq!(delete.cf, CF_WRITE); assert_eq!(delete.key, &delete_key); assert_matches!(decoder.next(), None); @@ -593,14 +597,18 @@ mod tests { let (bytes, _) = req_encoder2.encode(); decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); - let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; + let SimpleWrite::DeleteRange(dr) = write else { + panic!("should be delete range") + }; assert_eq!(dr.cf, CF_LOCK); assert_eq!(dr.start_key, b"key"); assert_eq!(dr.end_key, b"key"); assert!(dr.notify_only); let write = decoder.next().unwrap(); - let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; + let SimpleWrite::DeleteRange(dr) = write else { + panic!("should be delete range") + }; assert_eq!(dr.cf, "cf"); assert_eq!(dr.start_key, b"key"); assert_eq!(dr.end_key, b"key"); @@ -626,7 +634,9 @@ mod tests { let mut decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); - let SimpleWrite::Ingest(ssts) = write else { panic!("should be ingest") }; + let SimpleWrite::Ingest(ssts) = write else { + panic!("should be ingest") + }; assert_eq!(exp, ssts); assert_matches!(decoder.next(), None); } @@ -715,7 +725,9 @@ mod tests { SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let req = decoder.next().unwrap(); - let SimpleWrite::Put(put) = req else { panic!("should be put") }; + let SimpleWrite::Put(put) = req else { + panic!("should be put") + }; assert_eq!(put.cf, CF_DEFAULT); assert_eq!(put.key, b"key"); assert_eq!(put.value, b""); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 6fe21fe9750..dcb98dd9cb2 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1323,7 +1323,7 @@ impl Write for Snapshot { } 
assert!(cf_file.size[self.cf_file_index] != 0); - let mut file_for_recving = cf_file + let file_for_recving = cf_file .file_for_recving .get_mut(self.cf_file_index) .unwrap(); @@ -2162,7 +2162,7 @@ impl TabletSnapManager { .stats .lock() .unwrap() - .drain_filter(|_, (_, stat)| stat.get_region_id() > 0) + .extract_if(|_, (_, stat)| stat.get_region_id() > 0) .map(|(_, (_, stat))| stat) .filter(|stat| stat.get_total_duration_sec() > 1) .collect(); diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 3cdee1e40f1..8fcaf826c6a 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -327,7 +327,7 @@ mod tests { for db_creater in db_creaters { let (_enc_dir, enc_opts) = gen_db_options_with_encryption("test_cf_build_and_apply_plain_files_enc"); - for db_opt in vec![None, Some(enc_opts)] { + for db_opt in [None, Some(enc_opts)] { let dir = Builder::new().prefix("test-snap-cf-db").tempdir().unwrap(); let db: KvTestEngine = db_creater(dir.path(), db_opt.clone(), None).unwrap(); // Collect keys via the key_callback into a collection. 
@@ -408,7 +408,7 @@ mod tests { for db_creater in db_creaters { let (_enc_dir, enc_opts) = gen_db_options_with_encryption("test_cf_build_and_apply_sst_files_enc"); - for db_opt in vec![None, Some(enc_opts)] { + for db_opt in [None, Some(enc_opts)] { let dir = Builder::new().prefix("test-snap-cf-db").tempdir().unwrap(); let db = db_creater(dir.path(), db_opt.clone(), None).unwrap(); let snap_cf_dir = Builder::new().prefix("test-snap-cf").tempdir().unwrap(); diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 0091fd4e7bb..9c73be2b9eb 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -244,7 +244,7 @@ impl PeerPessimisticLocks { // Locks that are marked deleted still need to be moved to the new regions, // and the deleted mark should also be cleared. // Refer to the comment in `PeerPessimisticLocks` for details. - let removed_locks = self.map.drain_filter(|key, _| { + let removed_locks = self.map.extract_if(|key, _| { let key = &**key.as_encoded(); let (start_key, end_key) = (derived.get_start_key(), derived.get_end_key()); key < start_key || (!end_key.is_empty() && key >= end_key) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 3f34fe691ee..ed2c70822c9 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -124,8 +124,7 @@ pub fn is_vote_msg(msg: &eraftpb::Message) -> bool { /// peer or not. // There could be two cases: // 1. Target peer already exists but has not established communication with leader yet -// 2. Target peer is added newly due to member change or region split, but it's not -// created yet +// 2. 
Target peer is added newly due to member change or region split, but it's not created yet // For both cases the region start key and end key are attached in RequestVote and // Heartbeat message for the store of that peer to check whether to create a new peer // when receiving these messages, or just to wait for a pending region split to perform diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 32fbdbc3145..cb067ca840b 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1710,10 +1710,7 @@ where fn handle_read_stats(&mut self, mut read_stats: ReadStats) { for (region_id, region_info) in read_stats.region_infos.iter_mut() { - let peer_stat = self - .region_peers - .entry(*region_id) - .or_insert_with(PeerStat::default); + let peer_stat = self.region_peers.entry(*region_id).or_default(); peer_stat.read_bytes += region_info.flow.read_bytes as u64; peer_stat.read_keys += region_info.flow.read_keys as u64; self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; @@ -1735,10 +1732,7 @@ where fn handle_write_stats(&mut self, mut write_stats: WriteStats) { for (region_id, region_info) in write_stats.region_infos.iter_mut() { - let peer_stat = self - .region_peers - .entry(*region_id) - .or_insert_with(PeerStat::default); + let peer_stat = self.region_peers.entry(*region_id).or_default(); peer_stat.query_stats.add_query_stats(®ion_info.0); self.store_stat .engine_total_query_num @@ -2096,7 +2090,10 @@ where let f = async move { for split_info in split_infos { let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await else { continue }; + pd_client.get_region_by_id(split_info.region_id).await + else { + continue; + }; // Try to split the region with the given split key. 
if let Some(split_key) = split_info.split_key { Self::handle_ask_batch_split( @@ -2161,10 +2158,7 @@ where cpu_usage, ) = { let region_id = hb_task.region.get_id(); - let peer_stat = self - .region_peers - .entry(region_id) - .or_insert_with(PeerStat::default); + let peer_stat = self.region_peers.entry(region_id).or_default(); peer_stat.approximate_size = approximate_size; peer_stat.approximate_keys = approximate_keys; diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 5d6ede9c193..5a6e641f5dc 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -2155,11 +2155,12 @@ mod tests { let (notify_tx, notify_rx) = channel(); let (wait_spawn_tx, wait_spawn_rx) = channel(); let runtime = tokio::runtime::Runtime::new().unwrap(); - let _ = runtime.spawn(async move { + let handler = runtime.spawn(async move { wait_spawn_tx.send(()).unwrap(); notify.notified().await; notify_tx.send(()).unwrap(); }); + drop(handler); wait_spawn_rx.recv().unwrap(); thread::sleep(std::time::Duration::from_millis(500)); // Prevent lost notify. must_not_redirect(&mut reader, &rx, task); diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 068904b2a67..7a675646f5c 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -179,7 +179,7 @@ impl PendingDeleteRanges { ) -> Vec<(u64, Vec, Vec, u64)> { let ranges = self.find_overlap_ranges(start_key, end_key); - for &(_, ref s_key, ..) in &ranges { + for (_, s_key, ..) 
in &ranges { self.ranges.remove(s_key).unwrap(); } ranges @@ -1293,7 +1293,7 @@ pub(crate) mod tests { } }; - #[allow(dead_code)] + #[cfg(feature = "failpoints")] let must_not_finish = |ids: &[u64]| { for id in ids { let region_key = keys::region_state_key(*id); diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 4ff853f70a0..468c06febd4 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -64,14 +64,14 @@ impl KeyEntry { impl PartialOrd for KeyEntry { fn partial_cmp(&self, rhs: &KeyEntry) -> Option { - // BinaryHeap is max heap, so we have to reverse order to get a min heap. - Some(self.key.cmp(&rhs.key).reverse()) + Some(self.cmp(rhs)) } } impl Ord for KeyEntry { fn cmp(&self, rhs: &KeyEntry) -> Ordering { - self.partial_cmp(rhs).unwrap() + // BinaryHeap is max heap, so we have to reverse order to get a min heap. + self.key.cmp(&rhs.key).reverse() } } @@ -287,7 +287,7 @@ impl Runner { region: &Region, bucket_ranges: &Vec, ) { - for (mut bucket, bucket_range) in &mut buckets.iter_mut().zip(bucket_ranges) { + for (bucket, bucket_range) in &mut buckets.iter_mut().zip(bucket_ranges) { let mut bucket_region = region.clone(); bucket_region.set_start_key(bucket_range.0.clone()); bucket_region.set_end_key(bucket_range.1.clone()); diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 4bbcc773763..9cf534c62b0 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -178,7 +178,7 @@ impl Samples { // evaluate the samples according to the given key range, it will update the // sample's left, right and contained counter. 
fn evaluate(&mut self, key_range: &KeyRange) { - for mut sample in self.0.iter_mut() { + for sample in self.0.iter_mut() { let order_start = if key_range.start_key.is_empty() { Ordering::Greater } else { @@ -496,10 +496,7 @@ pub struct WriteStats { impl WriteStats { pub fn add_query_num(&mut self, region_id: u64, kind: QueryKind) { - let query_stats = self - .region_infos - .entry(region_id) - .or_insert_with(QueryStats::default); + let query_stats = self.region_infos.entry(region_id).or_default(); query_stats.add_query_num(kind, 1); } @@ -988,8 +985,8 @@ mod tests { #[test] fn test_prefix_sum() { - let v = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - let expect = vec![1, 3, 6, 10, 15, 21, 28, 36, 45]; + let v = [1, 2, 3, 4, 5, 6, 7, 8, 9]; + let expect = [1, 3, 6, 10, 15, 21, 28, 36, 45]; let pre = prefix_sum(v.iter(), |x| *x); for i in 0..v.len() { assert_eq!(expect[i], pre[i]); diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 47d14304112..328f725edaa 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -213,13 +213,13 @@ fn group_row_changes(requests: Vec) -> (HashMap, bool) CF_WRITE => { if let Ok(ts) = key.decode_ts() { let key = key.truncate_ts().unwrap(); - let mut row = changes.entry(key).or_default(); + let row = changes.entry(key).or_default(); assert!(row.write.is_none()); row.write = Some(KeyOp::Put(Some(ts), value)); } } CF_LOCK => { - let mut row = changes.entry(key).or_default(); + let row = changes.entry(key).or_default(); assert!(row.lock.is_none()); row.lock = Some(KeyOp::Put(None, value)); } @@ -239,7 +239,7 @@ fn group_row_changes(requests: Vec) -> (HashMap, bool) match delete.cf.as_str() { CF_LOCK => { let key = Key::from_encoded(delete.take_key()); - let mut row = changes.entry(key).or_default(); + let row = changes.entry(key).or_default(); row.lock = Some(KeyOp::Delete); } "" | CF_WRITE | CF_DEFAULT => {} diff --git a/components/resolved_ts/src/endpoint.rs 
b/components/resolved_ts/src/endpoint.rs index 34f00672fa7..600da207ec4 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -65,7 +65,8 @@ impl Drop for ResolverStatus { locks, memory_quota, .. - } = self else { + } = self + else { return; }; if locks.is_empty() { @@ -96,7 +97,8 @@ impl ResolverStatus { locks, memory_quota, .. - } = self else { + } = self + else { panic!("region {:?} resolver has ready", region_id) }; // Check if adding a new lock or unlock will exceed the memory @@ -110,10 +112,7 @@ impl ResolverStatus { } fn update_tracked_index(&mut self, index: u64, region_id: u64) { - let ResolverStatus::Pending { - tracked_index, - .. - } = self else { + let ResolverStatus::Pending { tracked_index, .. } = self else { panic!("region {:?} resolver has ready", region_id) }; assert!( @@ -135,7 +134,8 @@ impl ResolverStatus { memory_quota, tracked_index, .. - } = self else { + } = self + else { panic!("region {:?} resolver has ready", region_id) }; // Must take locks, otherwise it may double free memory quota on drop. @@ -687,7 +687,7 @@ where scanner_pool, scan_concurrency_semaphore, regions: HashMap::default(), - _phantom: PhantomData::default(), + _phantom: PhantomData, }; ep.handle_advance_resolved_ts(leader_resolver); ep @@ -870,7 +870,6 @@ where // Tracking or untracking locks with incoming commands that corresponding // observe id is valid. 
- #[allow(clippy::drop_ref)] fn handle_change_log(&mut self, cmd_batch: Vec) { let size = cmd_batch.iter().map(|b| b.size()).sum::(); RTS_CHANNEL_PENDING_CMD_BYTES.sub(size as i64); @@ -884,7 +883,6 @@ where if observe_region.handle.id == observe_id { let logs = ChangeLog::encode_change_log(region_id, batch); if let Err(e) = observe_region.track_change_log(&logs) { - drop(observe_region); let backoff = match e { Error::MemoryQuotaExceeded(_) => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), Error::Other(_) => None, @@ -930,7 +928,7 @@ where } fn handle_advance_resolved_ts(&self, leader_resolver: LeadershipResolver) { - let regions = self.regions.keys().into_iter().copied().collect(); + let regions = self.regions.keys().copied().collect(); self.advance_worker.advance_ts_for_regions( regions, leader_resolver, diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 6c8c90dc38f..ad052338fa2 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -100,7 +100,7 @@ impl, E: KvEngine> ScannerPool { Self { workers, cdc_handle, - _phantom: PhantomData::default(), + _phantom: PhantomData, } } @@ -168,6 +168,7 @@ impl, E: KvEngine> ScannerPool { self.workers.spawn(fut); } + #[allow(clippy::needless_pass_by_ref_mut)] async fn get_snapshot( task: &mut ScanTask, cdc_handle: T, diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index a356d30a7ac..0e40255b354 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -307,8 +307,8 @@ pub struct ResourceController { // 1. the priority factor is calculate based on read/write RU settings. // 2. 
for read request, we increase a constant virtual time delta at each `get_priority` call // because the cost can't be calculated at start, so we only increase a constant delta and - // increase the real cost after task is executed; but don't increase it at write because - // the cost is known so we just pre-consume it. + // increase the real cost after task is executed; but don't increase it at write because the + // cost is known so we just pre-consume it. is_read: bool, // Track the maximum ru quota used to calculate the factor of each resource group. // factor = max_ru_quota / group_ru_quota * 10.0 diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index ba8e2174e19..7b437ea4303 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -2,7 +2,7 @@ // TODO(mornyx): crate doc. -#![feature(hash_drain_filter)] +#![feature(hash_extract_if)] #![feature(core_intrinsics)] use std::{ diff --git a/components/resource_metering/src/model.rs b/components/resource_metering/src/model.rs index 6f7118ef9e1..03cd500eb2e 100644 --- a/components/resource_metering/src/model.rs +++ b/components/resource_metering/src/model.rs @@ -87,7 +87,7 @@ impl RawRecords { pdqselect::select_by(&mut buf, k, |a, b| b.cmp(a)); let kth = buf[k]; // Evict records with cpu time less or equal than `kth` - let evicted_records = self.records.drain_filter(|_, r| r.cpu_time <= kth); + let evicted_records = self.records.extract_if(|_, r| r.cpu_time <= kth); // Record evicted into others for (_, record) in evicted_records { others.merge(&record); diff --git a/components/resource_metering/src/recorder/sub_recorder/cpu.rs b/components/resource_metering/src/recorder/sub_recorder/cpu.rs index 8c4053a80ab..08675bb6153 100644 --- a/components/resource_metering/src/recorder/sub_recorder/cpu.rs +++ b/components/resource_metering/src/recorder/sub_recorder/cpu.rs @@ -9,7 +9,7 @@ use crate::{ localstorage::{LocalStorage, 
SharedTagInfos}, SubRecorder, }, - RawRecord, RawRecords, + RawRecords, }; /// An implementation of [SubRecorder] for collecting cpu statistics. @@ -37,7 +37,7 @@ impl SubRecorder for CpuRecorder { if *last_stat != cur_stat { let delta_ms = (cur_stat.total_cpu_time() - last_stat.total_cpu_time()) * 1_000.; - let record = records.entry(cur_tag).or_insert_with(RawRecord::default); + let record = records.entry(cur_tag).or_default(); record.cpu_time += delta_ms as u32; } thread_stat.stat = cur_stat; diff --git a/components/resource_metering/tests/recorder_test.rs b/components/resource_metering/tests/recorder_test.rs index daa371e7477..6e164b8e5e8 100644 --- a/components/resource_metering/tests/recorder_test.rs +++ b/components/resource_metering/tests/recorder_test.rs @@ -55,7 +55,7 @@ mod tests { if let Some(tag) = self.current_ctx { self.records .entry(tag.as_bytes().to_vec()) - .or_insert_with(RawRecord::default) + .or_default() .cpu_time += ms; } self.ops.push(op); @@ -140,7 +140,7 @@ mod tests { if let Ok(mut r) = self.records.lock() { for (tag, record) in records.records.iter() { r.entry(tag.extra_attachment.to_vec()) - .or_insert_with(RawRecord::default) + .or_default() .merge(record); } } @@ -156,10 +156,10 @@ mod tests { let mut records = self.records.lock().unwrap(); for k in expected.keys() { - records.entry(k.clone()).or_insert_with(RawRecord::default); + records.entry(k.clone()).or_default(); } for k in records.keys() { - expected.entry(k.clone()).or_insert_with(RawRecord::default); + expected.entry(k.clone()).or_default(); } for (k, expected_value) in expected { let value = records.get(&k).unwrap(); @@ -324,10 +324,10 @@ mod tests { fn merge( maps: impl IntoIterator, RawRecord>>, ) -> HashMap, RawRecord> { - let mut map = HashMap::default(); + let mut map: HashMap, RawRecord> = HashMap::default(); for m in maps { for (k, v) in m { - map.entry(k).or_insert_with(RawRecord::default).merge(&v); + map.entry(k).or_default().merge(&v); } } map diff --git 
a/components/server/src/common.rs b/components/server/src/common.rs index c8cf879d905..43b0314cbbe 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -558,7 +558,9 @@ impl EnginesResourceInfo { }); for (_, cache) in cached_latest_tablets.iter_mut() { - let Some(tablet) = cache.latest() else { continue }; + let Some(tablet) = cache.latest() else { + continue; + }; for cf in DATA_CFS { fetch_engine_cf(tablet, cf); } diff --git a/components/snap_recovery/src/leader_keeper.rs b/components/snap_recovery/src/leader_keeper.rs index 417d5becca3..48344fe5012 100644 --- a/components/snap_recovery/src/leader_keeper.rs +++ b/components/snap_recovery/src/leader_keeper.rs @@ -206,7 +206,7 @@ mod test { #[test] fn test_basic() { - let leaders = vec![1, 2, 3]; + let leaders = [1, 2, 3]; let mut store = MockStore::default(); store.regions = leaders.iter().copied().collect(); let mut lk = LeaderKeeper::::new(store, leaders); @@ -217,7 +217,7 @@ mod test { #[test] fn test_failure() { - let leaders = vec![1, 2, 3]; + let leaders = [1, 2, 3]; let mut store = MockStore::default(); store.regions = leaders.iter().copied().collect(); let mut lk = LeaderKeeper::::new(store, vec![1, 2, 3, 4]); diff --git a/components/sst_importer/src/import_mode2.rs b/components/sst_importer/src/import_mode2.rs index 70b7d7fac5e..4db29c47a6f 100644 --- a/components/sst_importer/src/import_mode2.rs +++ b/components/sst_importer/src/import_mode2.rs @@ -139,7 +139,7 @@ impl ImportModeSwitcherV2 { pub fn ranges_in_import(&self) -> HashSet { let inner = self.inner.lock().unwrap(); - HashSet::from_iter(inner.import_mode_ranges.keys().into_iter().cloned()) + HashSet::from_iter(inner.import_mode_ranges.keys().cloned()) } } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 181f9d67b2f..502a81ff6a6 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -367,8 +367,8 @@ 
impl SstImporter { // This method is blocking. It performs the following transformations before // writing to disk: // - // 1. only KV pairs in the *inclusive* range (`[start, end]`) are used. - // (set the range to `["", ""]` to import everything). + // 1. only KV pairs in the *inclusive* range (`[start, end]`) are used. (set + // the range to `["", ""]` to import everything). // 2. keys are rewritten according to the given rewrite rule. // // Both the range and rewrite keys are specified using origin keys. However, @@ -1541,7 +1541,7 @@ mod tests { let env = get_env(key_manager.clone(), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), &[CF_DEFAULT], env); - let cases = vec![(0, 10), (5, 15), (10, 20), (0, 100)]; + let cases = [(0, 10), (5, 15), (10, 20), (0, 100)]; let mut ingested = Vec::new(); @@ -2055,13 +2055,10 @@ mod tests { false, ) .unwrap(); - let ext_storage = { - let inner = importer.wrap_kms( - importer.external_storage_or_cache(&backend, "").unwrap(), - false, - ); - inner - }; + let ext_storage = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); // test do_read_kv_file() let output = block_on_external_io(importer.do_read_kv_file( diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index ff7526172d5..654971b0d41 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -97,7 +97,8 @@ pub fn copy_sst_for_ingestion, Q: AsRef>( let mut pmts = file_system::metadata(clone)?.permissions(); if pmts.readonly() { - pmts.set_readonly(false); + use std::os::unix::fs::PermissionsExt; + pmts.set_mode(0o644); file_system::set_permissions(clone, pmts)?; } diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index 96f405d8f39..6763ea7bb1a 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -203,7 +203,7 @@ 
impl Store { } pub fn put(&mut self, ctx: Context, mut kv: Vec<(Vec, Vec)>) { - self.handles.extend(kv.iter().map(|&(ref k, _)| k.clone())); + self.handles.extend(kv.iter().map(|(k, _)| k.clone())); let pk = kv[0].0.clone(); let kv = kv .drain(..) diff --git a/components/test_coprocessor_plugin/example_plugin/src/lib.rs b/components/test_coprocessor_plugin/example_plugin/src/lib.rs index afcaa4962b9..d383797c069 100644 --- a/components/test_coprocessor_plugin/example_plugin/src/lib.rs +++ b/components/test_coprocessor_plugin/example_plugin/src/lib.rs @@ -18,4 +18,4 @@ impl CoprocessorPlugin for ExamplePlugin { } } -declare_plugin!(ExamplePlugin::default()); +declare_plugin!(ExamplePlugin); diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 90a420fbba0..02833e030eb 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -128,12 +128,8 @@ impl Server { } #[allow(unused_mut)] -fn hijack_unary( - mock: &mut PdMock, - ctx: RpcContext<'_>, - sink: UnarySink, - f: F, -) where +fn hijack_unary(mock: &PdMock, ctx: RpcContext<'_>, sink: UnarySink, f: F) +where R: Send + 'static, F: Fn(&dyn PdMocker) -> Option>, { diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index c81230f6a16..58df5998758 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -1438,7 +1438,7 @@ impl TestPdClient { pub fn switch_replication_mode(&self, state: DrAutoSyncState, available_stores: Vec) { let mut cluster = self.cluster.wl(); let status = cluster.replication_status.as_mut().unwrap(); - let mut dr = status.mut_dr_auto_sync(); + let dr = status.mut_dr_auto_sync(); dr.state_id += 1; dr.set_state(state); dr.available_stores = available_stores; diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 8ede3290167..346813e7d1f 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ 
b/components/test_raftstore-v2/src/cluster.rs @@ -220,7 +220,7 @@ pub trait Simulator { None => { error!("call_query_on_node receives none response"; "request" => ?request); // Do not unwrap here, sometimes raftstore v2 may return none. - return Err(box_err!("receives none response {:?}", request)); + Err(box_err!("receives none response {:?}", request)) } } } @@ -1612,6 +1612,7 @@ impl, EK: KvEngine> Cluster { ) } + #[allow(clippy::let_underscore_future)] pub fn merge_region(&mut self, source: u64, target: u64, _cb: Callback) { // FIXME: callback is ignored. let mut req = self.new_prepare_merge(source, target); diff --git a/components/test_raftstore-v2/src/lib.rs b/components/test_raftstore-v2/src/lib.rs index 685affe45d0..45642df1e7f 100644 --- a/components/test_raftstore-v2/src/lib.rs +++ b/components/test_raftstore-v2/src/lib.rs @@ -3,6 +3,8 @@ #![feature(type_alias_impl_trait)] #![feature(return_position_impl_trait_in_trait)] #![feature(let_chains)] +#![allow(clippy::needless_pass_by_ref_mut)] +#![allow(clippy::arc_with_non_send_sync)] mod cluster; mod node; diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index d63ca0aa2f2..70b6ccb1407 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -258,7 +258,7 @@ impl Simulator for NodeCluster { ) } else { let trans = self.trans.core.lock().unwrap(); - let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; + let (snap_mgr, _) = &trans.snap_paths[&node_id]; (snap_mgr.clone(), None) }; self.snap_mgrs.insert(node_id, snap_mgr.clone()); diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 7b5d501a59f..a7d64591fe1 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -1006,7 +1006,18 @@ pub fn must_new_cluster_and_kv_client_mul( TikvClient, Context, ) { - let (cluster, leader, ctx) = must_new_cluster_mul(count); + 
must_new_cluster_with_cfg_and_kv_client_mul(count, |_| {}) +} + +pub fn must_new_cluster_with_cfg_and_kv_client_mul( + count: usize, + configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + TikvClient, + Context, +) { + let (cluster, leader, ctx) = must_new_and_configure_cluster_mul(count, configure); let env = Arc::new(Environment::new(1)); let channel = @@ -1015,6 +1026,7 @@ pub fn must_new_cluster_and_kv_client_mul( (cluster, client, ctx) } + pub fn must_new_cluster_mul( count: usize, ) -> ( diff --git a/components/test_raftstore/src/lib.rs b/components/test_raftstore/src/lib.rs index 04dfbd24de1..6f48c17190a 100644 --- a/components/test_raftstore/src/lib.rs +++ b/components/test_raftstore/src/lib.rs @@ -1,6 +1,8 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. #![feature(let_chains)] +#![allow(clippy::needless_pass_by_ref_mut)] +#![allow(clippy::arc_with_non_send_sync)] #[macro_use] extern crate lazy_static; diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index f429f27ff8b..8a9969c1913 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -281,7 +281,7 @@ impl Simulator for NodeCluster { (snap_mgr, Some(tmp)) } else { let trans = self.trans.core.lock().unwrap(); - let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; + let (snap_mgr, _) = &trans.snap_paths[&node_id]; (snap_mgr.clone(), None) }; diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 8d26bae968d..0df44b4e784 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -918,8 +918,14 @@ pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, pub fn must_new_cluster_and_kv_client_mul( count: usize, ) -> (Cluster, TikvClient, Context) { - let (cluster, leader, ctx) = must_new_cluster_mul(count); + must_new_cluster_with_cfg_and_kv_client_mul(count, |_| {}) +} +pub fn 
must_new_cluster_with_cfg_and_kv_client_mul( + count: usize, + configure: impl FnMut(&mut Cluster), +) -> (Cluster, TikvClient, Context) { + let (cluster, leader, ctx) = must_new_and_configure_cluster_mul(count, configure); let env = Arc::new(Environment::new(1)); let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); diff --git a/components/tidb_query_codegen/src/rpn_function.rs b/components/tidb_query_codegen/src/rpn_function.rs index 33976939c83..ea3017d5d02 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -1739,27 +1739,24 @@ mod tests_normal { /// Compare TokenStream with all white chars trimmed. fn assert_token_stream_equal(l: TokenStream, r: TokenStream) { - let result = l - .clone() - .into_iter() - .eq_by(r.clone().into_iter(), |x, y| match x { - TokenTree::Ident(x) => matches!(y, TokenTree::Ident(y) if x == y), - TokenTree::Literal(x) => { - matches!(y, TokenTree::Literal(y) if x.to_string() == y.to_string()) - } - TokenTree::Punct(x) => { - matches!(y, TokenTree::Punct(y) if x.to_string() == y.to_string()) - } - TokenTree::Group(x) => { - if let TokenTree::Group(y) = y { - assert_token_stream_equal(x.stream(), y.stream()); + let result = l.clone().into_iter().eq_by(r.clone(), |x, y| match x { + TokenTree::Ident(x) => matches!(y, TokenTree::Ident(y) if x == y), + TokenTree::Literal(x) => { + matches!(y, TokenTree::Literal(y) if x.to_string() == y.to_string()) + } + TokenTree::Punct(x) => { + matches!(y, TokenTree::Punct(y) if x.to_string() == y.to_string()) + } + TokenTree::Group(x) => { + if let TokenTree::Group(y) = y { + assert_token_stream_equal(x.stream(), y.stream()); - true - } else { - false - } + true + } else { + false } - }); + } + }); assert!(result, "expect: {:#?}, actual: {:#?}", &l, &r); } diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index 
22127e62f49..738e0020de7 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -251,7 +251,7 @@ where { #[inline] fn partial_cmp(&self, other: &Self) -> Option { - C::sort_compare(self.inner.as_ref(), other.inner.as_ref()).ok() + Some(self.cmp(other)) } } diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 418841547ca..d2bbee78078 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -574,13 +574,13 @@ pub fn bytes_to_int_without_context(bytes: &[u8]) -> Result { if let Some(&c) = trimed.next() { if c == b'-' { negative = true; - } else if (b'0'..=b'9').contains(&c) { + } else if c.is_ascii_digit() { r = Some(i64::from(c) - i64::from(b'0')); } else if c != b'+' { return Ok(0); } - for c in trimed.take_while(|&c| (b'0'..=b'9').contains(c)) { + for c in trimed.take_while(|&c| c.is_ascii_digit()) { let cur = i64::from(*c - b'0'); r = r.and_then(|r| r.checked_mul(10)).and_then(|r| { if negative { @@ -605,13 +605,13 @@ pub fn bytes_to_uint_without_context(bytes: &[u8]) -> Result { let mut trimed = bytes.iter().skip_while(|&&b| b == b' ' || b == b'\t'); let mut r = Some(0u64); if let Some(&c) = trimed.next() { - if (b'0'..=b'9').contains(&c) { + if c.is_ascii_digit() { r = Some(u64::from(c) - u64::from(b'0')); } else if c != b'+' { return Ok(0); } - for c in trimed.take_while(|&c| (b'0'..=b'9').contains(c)) { + for c in trimed.take_while(|&c| c.is_ascii_digit()) { r = r .and_then(|r| r.checked_mul(10)) .and_then(|r| r.checked_add(u64::from(*c - b'0'))); @@ -856,7 +856,7 @@ pub fn get_valid_int_prefix_helper<'a>( if (c == '+' || c == '-') && i == 0 { continue; } - if ('0'..='9').contains(&c) { + if c.is_ascii_digit() { valid_len = i + 1; continue; } @@ -917,7 +917,7 @@ pub fn get_valid_float_prefix_helper<'a>( break; } e_idx = i - } else if 
!('0'..='9').contains(&c) { + } else if !c.is_ascii_digit() { break; } else { saw_digit = true; diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index 8ca36790824..b464b1119c8 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -248,7 +248,7 @@ macro_rules! impl_evaluable_type { } #[inline] - fn borrow_scalar_value_ref<'a>(v: ScalarValueRef<'a>) -> Option<&'a Self> { + fn borrow_scalar_value_ref(v: ScalarValueRef<'_>) -> Option<&Self> { match v { ScalarValueRef::$ty(x) => x, other => panic!( diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index c74423107e4..ff66ddc42ee 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -467,24 +467,23 @@ impl<'a> ScalarValueRef<'a> { impl<'a> Ord for ScalarValueRef<'a> { fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other) - .expect("Cannot compare two ScalarValueRef in different type") - } -} - -impl<'a> PartialOrd for ScalarValueRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { match_template_evaltype! { TT, match (self, other) { // v1 and v2 are `Option`. However, in MySQL NULL values are considered lower // than any non-NULL value, so using `Option::PartialOrd` directly is fine. 
- (ScalarValueRef::TT(v1), ScalarValueRef::TT(v2)) => Some(v1.cmp(v2)), - _ => None, + (ScalarValueRef::TT(v1), ScalarValueRef::TT(v2)) => v1.cmp(v2), + _ => panic!("Cannot compare two ScalarValueRef in different type"), } } } } +impl<'a> PartialOrd for ScalarValueRef<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + impl<'a> PartialEq for ScalarValueRef<'a> { fn eq(&self, other: &ScalarValue) -> bool { self == &other.as_scalar_value_ref() diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index dde98003475..f91d204b3b0 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -668,7 +668,7 @@ impl Datum { Datum::F64(res) } } - (&Datum::Dec(ref l), &Datum::Dec(ref r)) => { + (Datum::Dec(l), Datum::Dec(r)) => { let dec: Result = (l + r).into(); return dec.map(Datum::Dec); } @@ -700,7 +700,7 @@ impl Datum { } (&Datum::U64(l), &Datum::U64(r)) => l.checked_sub(r).into(), (&Datum::F64(l), &Datum::F64(r)) => return Ok(Datum::F64(l - r)), - (&Datum::Dec(ref l), &Datum::Dec(ref r)) => { + (Datum::Dec(l), Datum::Dec(r)) => { let dec: Result = (l - r).into(); return dec.map(Datum::Dec); } @@ -724,7 +724,7 @@ impl Datum { } (&Datum::U64(l), &Datum::U64(r)) => l.checked_mul(r).into(), (&Datum::F64(l), &Datum::F64(r)) => return Ok(Datum::F64(l * r)), - (&Datum::Dec(ref l), &Datum::Dec(ref r)) => return Ok(Datum::Dec((l * r).unwrap())), + (Datum::Dec(l), Datum::Dec(r)) => return Ok(Datum::Dec((l * r).unwrap())), (l, r) => return Err(invalid_type!("{} can't multiply {}", l, r)), }; @@ -1179,7 +1179,7 @@ mod tests { | (&Datum::Null, &Datum::Null) | (&Datum::Time(_), &Datum::Time(_)) | (&Datum::Json(_), &Datum::Json(_)) => true, - (&Datum::Dec(ref d1), &Datum::Dec(ref d2)) => d1.prec_and_frac() == d2.prec_and_frac(), + (Datum::Dec(d1), Datum::Dec(d2)) => d1.prec_and_frac() == d2.prec_and_frac(), _ => false, } } diff 
--git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 143ec6c7760..8853a1d6a16 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -1872,7 +1872,7 @@ impl<'a> ConvertTo for JsonRef<'a> { fn first_non_digit(bs: &[u8], start_idx: usize) -> usize { bs.iter() .skip(start_idx) - .position(|c| !(b'0'..=b'9').contains(c)) + .position(|c| !c.is_ascii_digit()) .map_or_else(|| bs.len(), |s| s + start_idx) } diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 7279f788146..4b735977712 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -629,14 +629,14 @@ impl Eq for Duration {} impl PartialOrd for Duration { #[inline] fn partial_cmp(&self, rhs: &Duration) -> Option { - self.nanos.partial_cmp(&rhs.nanos) + Some(self.cmp(rhs)) } } impl Ord for Duration { #[inline] fn cmp(&self, rhs: &Duration) -> Ordering { - self.partial_cmp(rhs).unwrap() + self.nanos.partial_cmp(&rhs.nanos).unwrap() } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs index d9104385bc6..73e04885890 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs @@ -77,6 +77,8 @@ impl<'a> PartialEq for JsonRef<'a> { .map_or(false, |r| r == Ordering::Equal) } } + +#[allow(clippy::incorrect_partial_ord_impl_on_ord_type)] impl<'a> PartialOrd for JsonRef<'a> { // See `CompareBinary` in TiDB `types/json/binary_functions.go` fn partial_cmp(&self, right: &JsonRef<'_>) -> Option { @@ -197,7 +199,7 @@ impl PartialEq for Json { impl PartialOrd for Json { fn partial_cmp(&self, right: &Json) -> Option { - 
self.as_ref().partial_cmp(&right.as_ref()) + Some(self.cmp(right)) } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs index 867d8ec2c20..f76b29790f9 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs @@ -28,9 +28,9 @@ pub trait JsonEncoder: NumberEncoder { } // See `appendBinaryObject` in TiDB `types/json/binary.go` - fn write_json_obj_from_keys_values<'a>( + fn write_json_obj_from_keys_values( &mut self, - mut entries: Vec<(&[u8], JsonRef<'a>)>, + mut entries: Vec<(&[u8], JsonRef<'_>)>, ) -> Result<()> { entries.sort_by(|a, b| a.0.cmp(b.0)); // object: element-count size key-entry* value-entry* key* value* @@ -122,7 +122,7 @@ pub trait JsonEncoder: NumberEncoder { } // See `appendBinaryArray` in TiDB `types/json/binary.go` - fn write_json_ref_array<'a>(&mut self, data: &[JsonRef<'a>]) -> Result<()> { + fn write_json_ref_array(&mut self, data: &[JsonRef<'_>]) -> Result<()> { let element_count = data.len(); let value_entries_len = VALUE_ENTRY_LEN * element_count; let values_len = data.iter().fold(0, |acc, v| acc + v.encoded_len()); @@ -167,7 +167,7 @@ pub trait JsonEncoder: NumberEncoder { } // See `appendBinaryValElem` in TiDB `types/json/binary.go` - fn write_value_entry<'a>(&mut self, value_offset: &mut u32, v: &JsonRef<'a>) -> Result<()> { + fn write_value_entry(&mut self, value_offset: &mut u32, v: &JsonRef<'_>) -> Result<()> { let tp = v.get_type(); self.write_u8(tp as u8)?; match tp { diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs index b359158d06b..3cc78270d60 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs @@ -41,7 +41,7 @@ impl<'a> JsonRef<'a> { } } let mut res = 
self.to_owned(); - for (expr, value) in path_expr_list.iter().zip(values.into_iter()) { + for (expr, value) in path_expr_list.iter().zip(values) { let modifier = BinaryModifier::new(res.as_ref()); res = match mt { ModifyType::Insert => modifier.insert(expr, value)?, diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 4c6c2f676d7..44228f2d88e 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -1094,7 +1094,7 @@ impl Time { ) } - fn try_into_chrono_datetime(self, ctx: &mut EvalContext) -> Result> { + fn try_into_chrono_datetime(self, ctx: &EvalContext) -> Result> { chrono_datetime( &ctx.cfg.tz, self.year(), @@ -2670,9 +2670,9 @@ mod tests { #[test] fn test_no_zero_in_date() -> Result<()> { - let cases = vec!["2019-01-00", "2019-00-01"]; + let cases = ["2019-01-00", "2019-00-01"]; - for &case in cases.iter() { + for case in cases { // Enable NO_ZERO_IN_DATE only. If zero-date is encountered, a warning is // produced. let mut ctx = EvalContext::from(TimeEnv { @@ -2817,7 +2817,7 @@ mod tests { let actual = Time::from_duration(&mut ctx, duration, TimeType::DateTime)?; let today = actual - .try_into_chrono_datetime(&mut ctx)? + .try_into_chrono_datetime(&ctx)? 
.checked_sub_signed(chrono::Duration::nanoseconds(duration.to_nanos())) .unwrap(); @@ -2837,7 +2837,7 @@ mod tests { let mut ctx = EvalContext::default(); for i in 2..10 { let actual = Time::from_local_time(&mut ctx, TimeType::DateTime, i % MAX_FSP)?; - let c_datetime = actual.try_into_chrono_datetime(&mut ctx)?; + let c_datetime = actual.try_into_chrono_datetime(&ctx)?; let now0 = c_datetime.timestamp_millis() as u64; let now1 = Utc::now().timestamp_millis() as u64; diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index da117c96e2c..aa5eb3fc56f 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -298,7 +298,7 @@ impl<'a, T: PrimInt> LeBytes<'a, T> { fn new(slice: &'a [u8]) -> Self { Self { slice, - _marker: PhantomData::default(), + _marker: PhantomData, } } diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 37becbfb801..81ef4b072c6 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -528,7 +528,7 @@ pub fn generate_index_data_for_test( let mut expect_row = HashMap::default(); let mut v: Vec<_> = indice .iter() - .map(|&(ref cid, ref value)| { + .map(|(cid, value)| { expect_row.insert( *cid, datum::encode_key(&mut EvalContext::default(), &[value.clone()]).unwrap(), diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index 3a5c53a4d09..5ebf8a031d3 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -611,8 +611,8 @@ impl IndexScanExecutorImpl { } #[inline] - fn build_operations<'a, 'b>( - &'b self, + fn build_operations<'a>( + &self, mut key_payload: &'a [u8], index_value: &'a [u8], ) 
-> Result<(DecodeHandleOp<'a>, DecodePartitionIdOp<'a>, RestoreData<'a>)> { diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 7c410befb25..27e52dde288 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -137,31 +137,31 @@ impl BatchExecutorsRunner<()> { .map_err(|e| other_err!("BatchProjectionExecutor: {}", e))?; } ExecType::TypeJoin => { - other_err!("Join executor not implemented"); + return Err(other_err!("Join executor not implemented")); } ExecType::TypeKill => { - other_err!("Kill executor not implemented"); + return Err(other_err!("Kill executor not implemented")); } ExecType::TypeExchangeSender => { - other_err!("ExchangeSender executor not implemented"); + return Err(other_err!("ExchangeSender executor not implemented")); } ExecType::TypeExchangeReceiver => { - other_err!("ExchangeReceiver executor not implemented"); + return Err(other_err!("ExchangeReceiver executor not implemented")); } ExecType::TypePartitionTableScan => { - other_err!("PartitionTableScan executor not implemented"); + return Err(other_err!("PartitionTableScan executor not implemented")); } ExecType::TypeSort => { - other_err!("Sort executor not implemented"); + return Err(other_err!("Sort executor not implemented")); } ExecType::TypeWindow => { - other_err!("Window executor not implemented"); + return Err(other_err!("Window executor not implemented")); } ExecType::TypeExpand => { - other_err!("Expand executor not implemented"); + return Err(other_err!("Expand executor not implemented")); } ExecType::TypeExpand2 => { - other_err!("Expand2 executor not implemented"); + return Err(other_err!("Expand2 executor not implemented")); } } } diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index bd65547109d..ffcb22671da 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ 
b/components/tidb_query_executors/src/selection_executor.rs @@ -537,7 +537,7 @@ mod tests { }) .collect(); - for predicates in vec![ + for predicates in [ // Swap predicates should produce same results. vec![predicate[0](), predicate[1]()], vec![predicate[1](), predicate[0]()], @@ -572,7 +572,7 @@ mod tests { }) .collect(); - for predicates in vec![ + for predicates in [ // Swap predicates should produce same results. vec![predicate[0](), predicate[1](), predicate[2]()], vec![predicate[1](), predicate[2](), predicate[0]()], diff --git a/components/tidb_query_executors/src/util/aggr_executor.rs b/components/tidb_query_executors/src/util/aggr_executor.rs index 0535e8dbd83..a5d760dc80d 100644 --- a/components/tidb_query_executors/src/util/aggr_executor.rs +++ b/components/tidb_query_executors/src/util/aggr_executor.rs @@ -641,8 +641,8 @@ pub mod tests { )) as Box> }; - let test_paging_size = vec![2, 5, 7]; - let expect_call_num = vec![1, 3, 4]; + let test_paging_size = [2, 5, 7]; + let expect_call_num = [1, 3, 4]; let expect_row_num = vec![vec![4], vec![0, 0, 5], vec![0, 0, 0, 6]]; let executor_builders: Vec) -> _>> = vec![Box::new(exec_fast), Box::new(exec_slow)]; diff --git a/components/tidb_query_executors/src/util/mod.rs b/components/tidb_query_executors/src/util/mod.rs index ca05e49fcd3..db456a84883 100644 --- a/components/tidb_query_executors/src/util/mod.rs +++ b/components/tidb_query_executors/src/util/mod.rs @@ -28,13 +28,13 @@ pub fn ensure_columns_decoded( /// Evaluates expressions and outputs the result into the given Vec. Lifetime of /// the expressions are erased. 
-pub unsafe fn eval_exprs_decoded_no_lifetime<'a>( +pub unsafe fn eval_exprs_decoded_no_lifetime( ctx: &mut EvalContext, exprs: &[RpnExpression], schema: &[FieldType], input_physical_columns: &LazyBatchColumnVec, input_logical_rows: &[usize], - output: &mut Vec>, + output: &mut Vec>, ) -> Result<()> { unsafe fn erase_lifetime<'a, T: ?Sized>(v: &T) -> &'a T { &*(v as *const T) diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 76e90f79c5b..b6619f9d8cc 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -6528,7 +6528,7 @@ mod tests { "cast_decimal_as_duration", ); - let values = vec![ + let values = [ Decimal::from_bytes(b"9995959").unwrap().unwrap(), Decimal::from_bytes(b"-9995959").unwrap().unwrap(), ]; diff --git a/components/tidb_query_expr/src/impl_miscellaneous.rs b/components/tidb_query_expr/src/impl_miscellaneous.rs index 5d2daed7f9a..663571804ae 100644 --- a/components/tidb_query_expr/src/impl_miscellaneous.rs +++ b/components/tidb_query_expr/src/impl_miscellaneous.rs @@ -58,7 +58,7 @@ pub fn inet_aton(addr: BytesRef) -> Result> { } let (mut byte_result, mut result, mut dot_count): (u64, u64, usize) = (0, 0, 0); for c in addr.chars() { - if ('0'..='9').contains(&c) { + if c.is_ascii_digit() { let digit = c as u64 - '0' as u64; byte_result = byte_result * 10 + digit; if byte_result > 255 { @@ -501,8 +501,9 @@ mod tests { (Some(hex("00000000")), Some(b"0.0.0.0".to_vec())), (Some(hex("0A000509")), Some(b"10.0.5.9".to_vec())), ( + // the output format has changed, see: https://github.com/rust-lang/rust/pull/112606 Some(hex("00000000000000000000000001020304")), - Some(b"::1.2.3.4".to_vec()), + Some(b"::102:304".to_vec()), ), ( Some(hex("00000000000000000000FFFF01020304")), diff --git a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index f3b9b03c287..45754d0a101 100644 --- 
a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -63,13 +63,13 @@ pub fn oct_string(s: BytesRef, writer: BytesWriter) -> Result { if let Some(&c) = trimmed.next() { if c == b'-' { negative = true; - } else if (b'0'..=b'9').contains(&c) { + } else if c.is_ascii_digit() { r = Some(u64::from(c) - u64::from(b'0')); } else if c != b'+' { return Ok(writer.write(Some(b"0".to_vec()))); } - for c in trimmed.take_while(|&c| (b'0'..=b'9').contains(c)) { + for c in trimmed.take_while(|&c| c.is_ascii_digit()) { r = r .and_then(|r| r.checked_mul(10)) .and_then(|r| r.checked_add(u64::from(*c - b'0'))); @@ -879,7 +879,7 @@ impl TrimDirection { } #[inline] -fn trim<'a, 'b>(string: &'a [u8], pattern: &'b [u8], direction: TrimDirection) -> &'a [u8] { +fn trim<'a>(string: &'a [u8], pattern: &[u8], direction: TrimDirection) -> &'a [u8] { if pattern.is_empty() { return string; } diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index c2ef6722148..40c1f485e54 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -10,6 +10,8 @@ #![allow(elided_lifetimes_in_paths)] // Necessary until rpn_fn accepts functions annotated with lifetimes. 
#![allow(incomplete_features)] +#![allow(clippy::needless_raw_string_hashes)] +#![allow(clippy::needless_return_with_question_mark)] #![feature(proc_macro_hygiene)] #![feature(specialization)] #![feature(test)] diff --git a/components/tidb_query_expr/src/types/expr_eval.rs b/components/tidb_query_expr/src/types/expr_eval.rs index b892333b0ef..e3ab7d35297 100644 --- a/components/tidb_query_expr/src/types/expr_eval.rs +++ b/components/tidb_query_expr/src/types/expr_eval.rs @@ -1091,16 +1091,13 @@ mod tests { use tipb::{Expr, ScalarFuncSig}; #[allow(clippy::trivially_copy_pass_by_ref)] - #[rpn_fn(capture = [metadata], metadata_mapper = prepare_a::)] - fn fn_a_nonnull( - metadata: &i64, - v: &Int, - ) -> Result> { + #[rpn_fn(capture = [metadata], metadata_mapper = prepare_a)] + fn fn_a_nonnull(metadata: &i64, v: &Int) -> Result> { assert_eq!(*metadata, 42); Ok(Some(v + *metadata)) } - fn prepare_a(_expr: &mut Expr) -> Result { + fn prepare_a(_expr: &mut Expr) -> Result { Ok(42) } @@ -1136,7 +1133,7 @@ mod tests { // fn_b: CastIntAsReal // fn_c: CastIntAsString Ok(match expr.get_sig() { - ScalarFuncSig::CastIntAsInt => fn_a_nonnull_fn_meta::(), + ScalarFuncSig::CastIntAsInt => fn_a_nonnull_fn_meta(), ScalarFuncSig::CastIntAsReal => fn_b_fn_meta::(), ScalarFuncSig::CastIntAsString => fn_c_fn_meta::(), _ => unreachable!(), diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 576aa5cfa76..858edfffec2 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -605,7 +605,7 @@ mod tests { (b"a9".to_vec(), b"v9".to_vec()), ]; - for &(ref k, ref v) in &base_data { + for (k, v) in &base_data { engine.put(&data_key(k), v).unwrap(); } (r, base_data) diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 25f58352750..43e5f1bea05 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -9,6 +9,7 @@ #![feature(min_specialization)] #![feature(type_alias_impl_trait)] 
#![feature(associated_type_defaults)] +#![feature(impl_trait_in_assoc_type)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/tikv_util/src/logger/formatter.rs b/components/tikv_util/src/logger/formatter.rs index c53c5896519..b786d2aa681 100644 --- a/components/tikv_util/src/logger/formatter.rs +++ b/components/tikv_util/src/logger/formatter.rs @@ -11,9 +11,9 @@ where let mut start = 0; let bytes = file_name.as_bytes(); for (index, &b) in bytes.iter().enumerate() { - if (b'A'..=b'Z').contains(&b) - || (b'a'..=b'z').contains(&b) - || (b'0'..=b'9').contains(&b) + if b.is_ascii_uppercase() + || b.is_ascii_lowercase() + || b.is_ascii_digit() || b == b'.' || b == b'-' || b == b'_' diff --git a/components/tikv_util/src/lru.rs b/components/tikv_util/src/lru.rs index 76fad6e8a34..a2d0943df90 100644 --- a/components/tikv_util/src/lru.rs +++ b/components/tikv_util/src/lru.rs @@ -247,7 +247,7 @@ where HashMapEntry::Occupied(mut e) => { self.size_policy.on_remove(e.key(), &e.get().value); self.size_policy.on_insert(e.key(), &value); - let mut entry = e.get_mut(); + let entry = e.get_mut(); self.trace.promote(entry.record); entry.value = value; } diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 291254c5227..a2897809683 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -33,7 +33,7 @@ pub trait HeapSize { impl HeapSize for [u8] { fn heap_size(&self) -> usize { - self.len() * mem::size_of::() + mem::size_of_val(self) } } diff --git a/components/tikv_util/src/metrics/allocator_metrics.rs b/components/tikv_util/src/metrics/allocator_metrics.rs index 260aa88ac8e..af22e411767 100644 --- a/components/tikv_util/src/metrics/allocator_metrics.rs +++ b/components/tikv_util/src/metrics/allocator_metrics.rs @@ -64,7 +64,7 @@ impl Collector for AllocStatsCollector { .set(dealloc as _); }); let mut g = self.memory_stats.collect(); - g.extend(self.allocation.collect().into_iter()); + 
g.extend(self.allocation.collect()); g } } diff --git a/components/tikv_util/src/mpsc/future.rs b/components/tikv_util/src/mpsc/future.rs index 4492e33a933..354ef74adb0 100644 --- a/components/tikv_util/src/mpsc/future.rs +++ b/components/tikv_util/src/mpsc/future.rs @@ -302,6 +302,8 @@ mod tests { use super::*; + // the JoinHandler is useless here, so just ignore this warning. + #[allow(clippy::let_underscore_future)] fn spawn_and_wait( rx_builder: impl FnOnce() -> S, ) -> (Runtime, Arc) { diff --git a/components/tikv_util/src/sys/cpu_time.rs b/components/tikv_util/src/sys/cpu_time.rs index 6ec1621c629..61608d1518f 100644 --- a/components/tikv_util/src/sys/cpu_time.rs +++ b/components/tikv_util/src/sys/cpu_time.rs @@ -333,7 +333,7 @@ mod tests { for _ in 0..num * 10 { std::thread::spawn(move || { loop { - let _ = (0..10_000_000).into_iter().sum::(); + let _ = (0..10_000_000).sum::(); } }); } diff --git a/components/tikv_util/src/timer.rs b/components/tikv_util/src/timer.rs index bb555e11794..a7a2b421ab0 100644 --- a/components/tikv_util/src/timer.rs +++ b/components/tikv_util/src/timer.rs @@ -81,14 +81,14 @@ impl Eq for TimeoutTask {} impl PartialOrd for TimeoutTask { fn partial_cmp(&self, other: &TimeoutTask) -> Option { - self.next_tick.partial_cmp(&other.next_tick) + Some(self.cmp(other)) } } impl Ord for TimeoutTask { fn cmp(&self, other: &TimeoutTask) -> Ordering { // TimeoutTask.next_tick must have same type of instants. - self.partial_cmp(other).unwrap() + self.next_tick.partial_cmp(&other.next_tick).unwrap() } } diff --git a/components/txn_types/src/timestamp.rs b/components/txn_types/src/timestamp.rs index fb0cd900123..79727575d60 100644 --- a/components/txn_types/src/timestamp.rs +++ b/components/txn_types/src/timestamp.rs @@ -118,9 +118,10 @@ impl slog::Value for TimeStamp { const TS_SET_USE_VEC_LIMIT: usize = 8; /// A hybrid immutable set for timestamps. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Default, Clone, PartialEq)] pub enum TsSet { /// When the set is empty, avoid the useless cloning of Arc. + #[default] Empty, /// `Vec` is suitable when the set is small or the set is barely used, and /// it doesn't worth converting a `Vec` into a `HashSet`. @@ -130,13 +131,6 @@ pub enum TsSet { Set(Arc>), } -impl Default for TsSet { - #[inline] - fn default() -> TsSet { - TsSet::Empty - } -} - impl TsSet { /// Create a `TsSet` from the given vec of timestamps. It will select the /// proper internal collection type according to the size. diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 624ac81212d..5305e3ec69a 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -451,7 +451,7 @@ impl From for Mutation { /// `OldValue` is used by cdc to read the previous value associated with some /// key during the prewrite process. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Default, Clone, PartialEq)] pub enum OldValue { /// A real `OldValue`. Value { value: Value }, @@ -460,18 +460,13 @@ pub enum OldValue { /// `None` means we don't found a previous value. None, /// The user doesn't care about the previous value. + #[default] Unspecified, /// Not sure whether the old value exists or not. users can seek CF_WRITE to /// the give position to take a look. SeekWrite(Key), } -impl Default for OldValue { - fn default() -> Self { - OldValue::Unspecified - } -} - impl OldValue { pub fn value(value: Value) -> Self { OldValue::Value { value } @@ -590,8 +585,9 @@ impl WriteBatchFlags { /// The position info of the last actual write (PUT or DELETE) of a LOCK record. /// Note that if the last change is a DELETE, its LastChange can be either /// Exist(which points to it) or NotExist. 
-#[derive(Clone, Eq, PartialEq, Debug)] +#[derive(Clone, Default, Eq, PartialEq, Debug)] pub enum LastChange { + #[default] Unknown, /// The pointer may point to a PUT or a DELETE record. Exist { @@ -647,12 +643,6 @@ impl LastChange { } } -impl Default for LastChange { - fn default() -> Self { - LastChange::Unknown - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust-toolchain b/rust-toolchain index 4e5f9a4d82b..c1eb62e26cb 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2022-11-15 +nightly-2023-08-15 diff --git a/src/config/mod.rs b/src/config/mod.rs index 4f9a9a01b4a..6b3332fb015 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1482,7 +1482,7 @@ impl DbConfig { opts.set_paranoid_checks(b); } if for_engine == EngineType::RaftKv { - opts.set_info_log(RocksdbLogger::default()); + opts.set_info_log(RocksdbLogger); } opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { @@ -1858,7 +1858,7 @@ impl RaftDbConfig { opts.set_max_log_file_size(self.info_log_max_size.0); opts.set_log_file_time_to_roll(self.info_log_roll_time.as_secs()); opts.set_keep_log_file_num(self.info_log_keep_log_file_num); - opts.set_info_log(RaftDbLogger::default()); + opts.set_info_log(RaftDbLogger); opts.set_info_log_level(self.info_log_level.into()); opts.set_max_subcompactions(self.max_sub_compactions); opts.set_writable_file_max_buffer_size(self.writable_file_max_buffer_size.0 as i32); @@ -2015,7 +2015,7 @@ impl ConfigManager for DbConfigManger { self.cfg.update(change.clone())?; let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); - let cf_config = change.drain_filter(|(name, _)| name.ends_with("cf")); + let cf_config = change.extract_if(|(name, _)| name.ends_with("cf")); for (cf_name, cf_change) in cf_config { if let ConfigValue::Module(mut cf_change) = cf_change { // defaultcf -> default @@ -2040,7 +2040,7 @@ impl ConfigManager for DbConfigManger { } if let 
Some(rate_bytes_config) = change - .drain_filter(|(name, _)| name == "rate_bytes_per_sec") + .extract_if(|(name, _)| name == "rate_bytes_per_sec") .next() { let rate_bytes_per_sec: ReadableSize = rate_bytes_config.1.into(); @@ -2049,7 +2049,7 @@ impl ConfigManager for DbConfigManger { } if let Some(rate_bytes_config) = change - .drain_filter(|(name, _)| name == "rate_limiter_auto_tuned") + .extract_if(|(name, _)| name == "rate_limiter_auto_tuned") .next() { let rate_limiter_auto_tuned: bool = rate_bytes_config.1.into(); @@ -2058,7 +2058,7 @@ impl ConfigManager for DbConfigManger { } if let Some(size) = change - .drain_filter(|(name, _)| name == "write_buffer_limit") + .extract_if(|(name, _)| name == "write_buffer_limit") .next() { let size: ReadableSize = size.1.into(); @@ -2066,14 +2066,14 @@ impl ConfigManager for DbConfigManger { } if let Some(f) = change - .drain_filter(|(name, _)| name == "write_buffer_flush_oldest_first") + .extract_if(|(name, _)| name == "write_buffer_flush_oldest_first") .next() { self.db.set_flush_oldest_first(f.1.into())?; } if let Some(background_jobs_config) = change - .drain_filter(|(name, _)| name == "max_background_jobs") + .extract_if(|(name, _)| name == "max_background_jobs") .next() { let max_background_jobs: i32 = background_jobs_config.1.into(); @@ -2081,7 +2081,7 @@ impl ConfigManager for DbConfigManger { } if let Some(background_subcompactions_config) = change - .drain_filter(|(name, _)| name == "max_sub_compactions") + .extract_if(|(name, _)| name == "max_sub_compactions") .next() { let max_subcompactions: u32 = background_subcompactions_config.1.into(); @@ -2090,7 +2090,7 @@ impl ConfigManager for DbConfigManger { } if let Some(background_flushes_config) = change - .drain_filter(|(name, _)| name == "max_background_flushes") + .extract_if(|(name, _)| name == "max_background_flushes") .next() { let max_background_flushes: i32 = background_flushes_config.1.into(); diff --git a/src/coprocessor/metrics.rs 
b/src/coprocessor/metrics.rs index 02f45d35311..7d2d7e9e947 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -285,7 +285,7 @@ pub fn tls_collect_scan_details(cmd: ReqTag, stats: &Statistics) { m.borrow_mut() .local_scan_details .entry(cmd) - .or_insert_with(Default::default) + .or_default() .add(stats); }); } diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index fcd16f9b947..874917130e4 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -64,11 +64,13 @@ type HandlerStreamStepResult = Result<(Option, bool)>; #[async_trait] pub trait RequestHandler: Send { /// Processes current request and produces a response. + #[allow(clippy::diverging_sub_expression)] async fn handle_request(&mut self) -> Result> { panic!("unary request is not supported for this handler"); } /// Processes current request and produces streaming responses. + #[allow(clippy::diverging_sub_expression)] async fn handle_streaming_request(&mut self) -> HandlerStreamStepResult { panic!("streaming request is not supported for this handler"); } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 6d40ffe959c..1a670c917ca 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -66,9 +66,9 @@ const REQUEST_WRITE_CONCURRENCY: usize = 16; /// bytes. In detail, they are: /// - 2 bytes for the request type (Tag+Value). /// - 2 bytes for every string or bytes field (Tag+Length), they are: -/// . + the key field -/// . + the value field -/// . + the CF field (None for CF_DEFAULT) +/// . + the key field +/// . + the value field +/// . + the CF field (None for CF_DEFAULT) /// - 2 bytes for the embedded message field `PutRequest` (Tag+Length). /// - 2 bytes for the request itself (which would be embedded into a /// [`RaftCmdRequest`].) 
diff --git a/src/lib.rs b/src/lib.rs index b3e9ebaf8e8..aafb099c6cc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,13 +23,14 @@ #![feature(proc_macro_hygiene)] #![feature(min_specialization)] #![feature(box_patterns)] -#![feature(drain_filter)] +#![feature(extract_if)] #![feature(deadline_api)] #![feature(let_chains)] #![feature(read_buf)] #![feature(type_alias_impl_trait)] #![allow(incomplete_features)] #![feature(return_position_impl_trait_in_trait)] +#![feature(impl_trait_in_assoc_type)] #[macro_use(fail_point)] extern crate fail; diff --git a/src/server/debug2.rs b/src/server/debug2.rs index cf17aea81eb..1ee1d108edc 100644 --- a/src/server/debug2.rs +++ b/src/server/debug2.rs @@ -1096,7 +1096,7 @@ fn get_tablet_cache( "tablet load failed, region_state {:?}", region_state.get_state() ); - return Err(box_err!(e)); + Err(box_err!(e)) } } } diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 665824a1bac..fe5a252b8db 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -826,6 +826,7 @@ pub mod test_utils { use crate::storage::kv::RocksEngine as StorageRocksEngine; /// Do a global GC with the given safe point. + #[allow(clippy::needless_pass_by_ref_mut)] pub fn gc_by_compact(engine: &mut StorageRocksEngine, _: &[u8], safe_point: u64) { let engine = engine.get_rocksdb(); // Put a new key-value pair to ensure compaction can be triggered correctly. diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index be18f8216d5..d2dc6532200 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -546,7 +546,9 @@ impl GcMan ) -> GcManagerResult> { // Get the information of the next region to do GC. 
let (region, next_key) = self.get_next_gc_context(from_key); - let Some(region) = region else { return Ok(None) }; + let Some(region) = region else { + return Ok(None); + }; let hex_start = format!("{:?}", log_wrappers::Value::key(region.get_start_key())); let hex_end = format!("{:?}", log_wrappers::Value::key(region.get_end_key())); @@ -807,7 +809,7 @@ mod tests { // Following code asserts gc_tasks == expected_gc_tasks. assert_eq!(gc_tasks.len(), expected_gc_tasks.len()); - let all_passed = gc_tasks.into_iter().zip(expected_gc_tasks.into_iter()).all( + let all_passed = gc_tasks.into_iter().zip(expected_gc_tasks).all( |((region, safe_point), (expect_region, expect_safe_point))| { region == expect_region && safe_point == expect_safe_point.into() }, @@ -884,7 +886,7 @@ mod tests { #[test] fn test_auto_gc_rewinding() { - for regions in vec![ + for regions in [ // First region starts with empty and last region ends with empty. vec![ (b"".to_vec(), b"1".to_vec(), 1), diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index c608470ba87..de40975632f 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -254,7 +254,7 @@ fn get_keys_in_region(keys: &mut Peekable>, region: &Region) -> Ve let mut keys_in_region = Vec::new(); loop { - let Some(key) = keys.peek() else {break}; + let Some(key) = keys.peek() else { break }; let key = key.as_encoded().as_slice(); if key < region.get_start_key() { @@ -552,7 +552,7 @@ impl GcRunner { let mut keys = keys.into_iter().peekable(); for region in regions { let mut raw_modifies = MvccRaw::new(); - let mut snapshot = self.get_snapshot(self.store_id, ®ion)?; + let snapshot = self.get_snapshot(self.store_id, ®ion)?; let mut keys_in_region = get_keys_in_region(&mut keys, ®ion).into_iter(); let mut next_gc_key = keys_in_region.next(); @@ -563,7 +563,7 @@ impl GcRunner { &range_start_key, &range_end_key, &mut raw_modifies, - &mut snapshot, + &snapshot, &mut gc_info, ) { 
GC_KEY_FAILURES.inc(); @@ -615,7 +615,7 @@ impl GcRunner { range_start_key: &Key, range_end_key: &Key, raw_modifies: &mut MvccRaw, - kv_snapshot: &mut ::Snap, + kv_snapshot: &::Snap, gc_info: &mut GcInfo, ) -> Result<()> { let start_key = key.clone().append_ts(safe_point.prev()); @@ -669,10 +669,7 @@ impl GcRunner { } pub fn mut_stats(&mut self, key_mode: GcKeyMode) -> &mut Statistics { - let stats = self - .stats_map - .entry(key_mode) - .or_insert_with(Default::default); + let stats = self.stats_map.entry(key_mode).or_default(); stats } @@ -2269,7 +2266,6 @@ mod tests { fn generate_keys(start: u64, end: u64) -> Vec { (start..end) - .into_iter() .map(|i| { let key = format!("k{:02}", i); Key::from_raw(key.as_bytes()) diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 9583df80dd6..938dfaff8a6 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -361,20 +361,15 @@ impl DetectTable { } /// The role of the detector. -#[derive(Debug, PartialEq, Clone, Copy)] +#[derive(Debug, Default, PartialEq, Clone, Copy)] pub enum Role { /// The node is the leader of the detector. Leader, /// The node is a follower of the leader. 
+ #[default] Follower, } -impl Default for Role { - fn default() -> Role { - Role::Follower - } -} - impl From for Role { fn from(role: StateRole) -> Role { match role { diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 2074d469310..f5b36dffbac 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -306,6 +306,7 @@ struct WriteResFeed { unsafe impl Send for WriteResFeed {} impl WriteResFeed { + #[allow(clippy::arc_with_non_send_sync)] fn pair() -> (Self, WriteResSub) { let core = Arc::new(WriteResCore { ev: AtomicU8::new(0), @@ -581,7 +582,9 @@ where tx.notify(res); } rx.inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { return }; + let WriteEvent::Finished(res) = ev else { + return; + }; match res { Ok(()) => { ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index dacc90a91f0..81143e6c2be 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -291,7 +291,9 @@ impl tikv_kv::Engine for RaftKv2 { early_err: res.err(), }) .inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { return }; + let WriteEvent::Finished(res) = ev else { + return; + }; match res { Ok(()) => { ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index d9b17c5d35c..73a15983bd0 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -269,7 +269,9 @@ where /// Stops the Node. 
pub fn stop(&mut self) { let store_id = self.store.get_id(); - let Some((_, mut system)) = self.system.take() else { return }; + let Some((_, mut system)) = self.system.take() else { + return; + }; info!(self.logger, "stop raft store thread"; "store_id" => store_id); system.shutdown(); } diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index d0b715542d5..497d8240684 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -300,7 +300,6 @@ where let debugger = self.debugger.clone(); let res = self.pool.spawn(async move { - let req = req; debugger .compact( req.get_db(), diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index 8e77d65233e..413e36a6645 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -612,7 +612,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = vec![ + let expected = [ "2019/08/23 18:09:56.387 +08:00", "2019/08/23 18:09:56.387 +08:00", // for invalid line "2019/08/23 18:09:57.387 +08:00", @@ -639,7 +639,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = vec![ + let expected = [ "2019/08/23 18:09:56.387 +08:00", "2019/08/23 18:09:56.387 +08:00", // for invalid line "2019/08/23 18:09:57.387 +08:00", @@ -662,7 +662,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = vec!["2019/08/23 18:09:53.387 +08:00"] + let expected = ["2019/08/23 18:09:53.387 +08:00"] .iter() .map(|s| timestamp(s)) .collect::>(); @@ -671,7 +671,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# expected ); - for time in vec![0, i64::MAX].into_iter() { + for time in [0, i64::MAX].into_iter() { let log_iter = LogIterator::new( &log_file, timestamp("2019/08/23 18:09:53.387 +08:00"), @@ -680,7 +680,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = vec![ + let expected = [ 
"2019/08/23 18:09:58.387 +08:00", "2019/08/23 18:09:59.387 +08:00", "2019/08/23 18:10:06.387 +08:00", @@ -704,7 +704,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![regex::Regex::new(".*test-filter.*").unwrap()], ) .unwrap(); - let expected = vec![ + let expected = [ "2019/08/23 18:09:58.387 +08:00", "2019/08/23 18:10:06.387 +08:00", // for invalid line ] @@ -783,7 +783,7 @@ Some invalid logs 2: Welcome to TiKV - test-filter"# req.set_end_time(i64::MAX); req.set_levels(vec![LogLevel::Warn as _]); req.set_patterns(vec![".*test-filter.*".to_string()].into()); - let expected = vec![ + let expected = [ "2019/08/23 18:09:58.387 +08:00", "2019/08/23 18:11:58.387 +08:00", "2019/08/23 18:11:59.387 +08:00", // for invalid line @@ -796,9 +796,7 @@ Some invalid logs 2: Welcome to TiKV - test-filter"# s.collect::>() .await .into_iter() - .map(|mut resp| resp.take_messages().into_iter()) - .into_iter() - .flatten() + .flat_map(|mut resp| resp.take_messages().into_iter()) .map(|msg| msg.get_time()) .collect::>() }); diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 8a84eaf6293..12494e9e7c4 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -601,7 +601,7 @@ mod tests { ] ); // memory - for name in vec!["virtual", "swap"].into_iter() { + for name in ["virtual", "swap"].into_iter() { let item = collector .iter() .find(|x| x.get_tp() == "memory" && x.get_name() == name); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 5a4327ba46e..6f1cf0eaa1f 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -889,7 +889,6 @@ impl Tikv for Service { forward_duplex!(self.proxy, batch_commands, ctx, stream, sink); let (tx, rx) = unbounded(WakePolicy::TillReach(GRPC_MSG_NOTIFY_SIZE)); - let ctx = Arc::new(ctx); let peer = ctx.peer(); let storage = self.storage.clone(); let copr = self.copr.clone(); diff --git 
a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 32c99867a3f..1eba8cd81b7 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -387,9 +387,9 @@ mod tests { let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, - StorageError(box StorageErrorInner::Txn(TxnError( - box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::WriteConflict { .. })) - ))) + StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + box MvccErrorInner::WriteConflict { .. }, + ))))) )); // The tx should be dropped. rx.recv().unwrap_err(); @@ -422,9 +422,9 @@ mod tests { let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, - StorageError(box StorageErrorInner::Txn(TxnError( - box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::KeyIsLocked(_))) - ))) + StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + box MvccErrorInner::KeyIsLocked(_), + ))))) )); // Since the cancellation callback can fully execute only when it's successfully // removed from the lock waiting queues, it's impossible that `finish_request` diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index a81248fe9e2..68e0118610a 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -110,12 +110,7 @@ impl Eq for LockWaitEntry {} impl PartialOrd for LockWaitEntry { fn partial_cmp(&self, other: &Self) -> Option { - // Reverse it since the priority queue is a max heap and we want to pop the - // minimal. 
- other - .parameters - .start_ts - .partial_cmp(&self.parameters.start_ts) + Some(self.cmp(other)) } } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e9477b56b0f..d3b3e89a3f8 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -63,7 +63,7 @@ pub fn tls_collect_scan_details(cmd: CommandKind, stats: &Statistics) { m.borrow_mut() .local_scan_details .entry(cmd) - .or_insert_with(Default::default) + .or_default() .add(stats); }); } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index cb4057bfd7e..b8224df696b 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1946,7 +1946,7 @@ impl Storage { key_ranges.push(build_key_range(k.as_encoded(), k.as_encoded(), false)); (k, v) }) - .filter(|&(_, ref v)| !(v.is_ok() && v.as_ref().unwrap().is_none())) + .filter(|(_, v)| !(v.is_ok() && v.as_ref().unwrap().is_none())) .map(|(k, v)| match v { Ok(v) => { let (user_key, _) = F::decode_raw_key_owned(k, false).unwrap(); @@ -3892,9 +3892,9 @@ mod tests { let result = block_on(storage.get(Context::default(), Key::from_raw(b"x"), 100.into())); assert!(matches!( result, - Err(Error(box ErrorInner::Txn(txn::Error( - box txn::ErrorInner::Mvcc(mvcc::Error(box mvcc::ErrorInner::KeyIsLocked { .. })) - )))) + Err(Error(box ErrorInner::Txn(txn::Error(box txn::ErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::KeyIsLocked { .. 
}, + )))))) )); } @@ -5744,7 +5744,7 @@ mod tests { ]; // Write key-value pairs one by one - for &(ref key, ref value) in &test_data { + for (key, value) in &test_data { storage .raw_put( ctx.clone(), @@ -5803,7 +5803,7 @@ mod tests { let mut total_bytes: u64 = 0; let mut is_first = true; // Write key-value pairs one by one - for &(ref key, ref value) in &test_data { + for (key, value) in &test_data { storage .raw_put( ctx.clone(), @@ -6116,7 +6116,7 @@ mod tests { #[test] fn test_raw_batch_put() { - for for_cas in vec![false, true].into_iter() { + for for_cas in [false, true].into_iter() { test_kv_format_impl!(test_raw_batch_put_impl(for_cas)); } } @@ -6245,7 +6245,7 @@ mod tests { ]; // Write key-value pairs one by one - for &(ref key, ref value) in &test_data { + for (key, value) in &test_data { storage .raw_put( ctx.clone(), @@ -6260,7 +6260,7 @@ mod tests { } // Verify pairs in a batch - let keys = test_data.iter().map(|&(ref k, _)| k.clone()).collect(); + let keys = test_data.iter().map(|(k, _)| k.clone()).collect(); let results = test_data.into_iter().map(|(k, v)| Some((k, v))).collect(); expect_multi_values( results, @@ -6292,7 +6292,7 @@ mod tests { ]; // Write key-value pairs one by one - for &(ref key, ref value) in &test_data { + for (key, value) in &test_data { storage .raw_put( ctx.clone(), @@ -6310,7 +6310,7 @@ mod tests { let mut ids = vec![]; let cmds = test_data .iter() - .map(|&(ref k, _)| { + .map(|(k, _)| { let mut req = RawGetRequest::default(); req.set_context(ctx.clone()); req.set_key(k.clone()); @@ -6331,7 +6331,7 @@ mod tests { #[test] fn test_raw_batch_delete() { - for for_cas in vec![false, true].into_iter() { + for for_cas in [false, true].into_iter() { test_kv_format_impl!(test_raw_batch_delete_impl(for_cas)); } } @@ -6381,10 +6381,10 @@ mod tests { rx.recv().unwrap(); // Verify pairs exist - let keys = test_data.iter().map(|&(ref k, _)| k.clone()).collect(); + let keys = test_data.iter().map(|(k, _)| k.clone()).collect(); let results 
= test_data .iter() - .map(|&(ref k, ref v)| Some((k.clone(), v.clone()))) + .map(|(k, v)| Some((k.clone(), v.clone()))) .collect(); expect_multi_values( results, @@ -6512,7 +6512,7 @@ mod tests { // Scan pairs with key only let mut results: Vec> = test_data .iter() - .map(|&(ref k, _)| Some((k.clone(), vec![]))) + .map(|(k, _)| Some((k.clone(), vec![]))) .collect(); expect_multi_values( results.clone(), @@ -6909,7 +6909,7 @@ mod tests { rx.recv().unwrap(); // Verify pairs exist - let keys = test_data.iter().map(|&(ref k, _)| k.clone()).collect(); + let keys = test_data.iter().map(|(k, _)| k.clone()).collect(); let results = test_data.into_iter().map(|(k, v)| Some((k, v))).collect(); expect_multi_values( results, diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index cc4403229c1..474c789a31d 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -1287,7 +1287,7 @@ mod tests { let k = b"k"; // Write enough LOCK recrods - for start_ts in (1..30).into_iter().step_by(2) { + for start_ts in (1..30).step_by(2) { must_prewrite_lock(&mut engine, k, k, start_ts); must_commit(&mut engine, k, start_ts, start_ts + 1); } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 48158eda946..61a366c12ee 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -418,11 +418,10 @@ impl MvccReader { estimated_versions_to_last_change, } if estimated_versions_to_last_change >= SEEK_BOUND => { let key_with_ts = key.clone().append_ts(commit_ts); - let Some(value) = self - .snapshot - .get_cf(CF_WRITE, &key_with_ts)? else { - return Ok(None); - }; + let Some(value) = self.snapshot.get_cf(CF_WRITE, &key_with_ts)? 
+ else { + return Ok(None); + }; self.statistics.write.get += 1; let write = WriteRef::parse(&value)?.to_owned(); assert!( @@ -2421,7 +2420,7 @@ pub mod tests { engine.commit(k, 1, 2); // Write enough LOCK recrods - for start_ts in (6..30).into_iter().step_by(2) { + for start_ts in (6..30).step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2430,7 +2429,7 @@ pub mod tests { engine.commit(k, 45, 46); // Write enough LOCK recrods - for start_ts in (50..80).into_iter().step_by(2) { + for start_ts in (50..80).step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2485,7 +2484,7 @@ pub mod tests { let k = b"k"; // Write enough LOCK recrods - for start_ts in (6..30).into_iter().step_by(2) { + for start_ts in (6..30).step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2522,7 +2521,7 @@ pub mod tests { engine.put(k, 1, 2); // 10 locks were put - for start_ts in (6..30).into_iter().step_by(2) { + for start_ts in (6..30).step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2549,7 +2548,7 @@ pub mod tests { feature_gate.set_version("6.1.0").unwrap(); set_tls_feature_gate(feature_gate); engine.delete(k, 51, 52); - for start_ts in (56..80).into_iter().step_by(2) { + for start_ts in (56..80).step_by(2) { engine.lock(k, start_ts, start_ts + 1); } let feature_gate = FeatureGate::default(); @@ -2581,7 +2580,7 @@ pub mod tests { let k = b"k"; engine.put(k, 1, 2); - for start_ts in (6..30).into_iter().step_by(2) { + for start_ts in (6..30).step_by(2) { engine.lock(k, start_ts, start_ts + 1); } engine.rollback(k, 30); diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 3437a1e5432..2b0a8e13582 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -633,7 +633,7 @@ impl ScanPolicy for LatestEntryPolicy { fn scan_latest_handle_lock( current_user_key: Key, - cfg: &mut ScannerConfig, + cfg: &ScannerConfig, cursors: &mut Cursors, statistics: &mut Statistics, ) 
-> Result> { @@ -1636,7 +1636,7 @@ mod latest_kv_tests { must_prewrite_put(&mut engine, b"k4", b"v41", b"k4", 3); must_commit(&mut engine, b"k4", 3, 7); - for start_ts in (10..30).into_iter().step_by(2) { + for start_ts in (10..30).step_by(2) { must_prewrite_lock(&mut engine, b"k1", b"k1", start_ts); must_commit(&mut engine, b"k1", start_ts, start_ts + 1); must_prewrite_lock(&mut engine, b"k3", b"k1", start_ts); diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 8c4ad5da08b..aa635827961 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -290,7 +290,7 @@ mod tests { RawEncodeSnapshot::from_snapshot(raw_mvcc_snapshot); // get_cf - for &(ref key, ref value, _) in &test_data[6..12] { + for (key, value, _) in &test_data[6..12] { let res = encode_snapshot.get_cf(CF_DEFAULT, &ApiV2::encode_raw_key(key, None)); assert_eq!(res.unwrap(), Some(value.to_owned())); } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 64e22a13585..713155f9160 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -766,7 +766,6 @@ fn async_commit_timestamps( #[cfg(not(feature = "failpoints"))] let injected_fallback = false; - let max_commit_ts = max_commit_ts; if (!max_commit_ts.is_zero() && min_commit_ts > max_commit_ts) || injected_fallback { warn!("commit_ts is too large, fallback to normal 2PC"; "key" => log_wrappers::Value::key(key.as_encoded()), @@ -1875,7 +1874,6 @@ pub mod tests { // At most 12 ops per-case. 
let ops_count = rg.gen::() % 12; let ops = (0..ops_count) - .into_iter() .enumerate() .map(|(i, _)| { if i == 0 { diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 9a54895e7e2..61dbdac6565 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -88,8 +88,8 @@ mod tests { fn test_atomic_process_write_impl() { let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); - let raw_keys = vec![b"ra", b"rz"]; - let raw_values = vec![b"valuea", b"valuez"]; + let raw_keys = [b"ra", b"rz"]; + let raw_values = [b"valuea", b"valuez"]; let ts_provider = super::super::test_util::gen_ts_provider(F::TAG); let mut modifies = vec![]; diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 10446db6292..2f39b29bc64 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -1853,9 +1853,7 @@ mod tests { .unwrap_err(); assert!(matches!( res, - Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::AlreadyExist { .. } - ))) + Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::AlreadyExist { .. }))) )); assert_eq!(cm.max_ts().into_inner(), 15); @@ -1878,9 +1876,7 @@ mod tests { .unwrap_err(); assert!(matches!( res, - Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::WriteConflict { .. } - ))) + Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::WriteConflict { .. }))) )); } @@ -2286,9 +2282,9 @@ mod tests { .unwrap_err(); assert!(matches!( err, - Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::PessimisticLockNotFound { .. } - ))) + Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::PessimisticLockNotFound { + .. 
+ }))) )); must_unlocked(&mut engine, b"k2"); // However conflict still won't be checked if there's a non-retry request @@ -2469,9 +2465,9 @@ mod tests { let err = prewrite_command(&mut engine, cm.clone(), &mut stat, cmd).unwrap_err(); assert!(matches!( err, - Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::PessimisticLockNotFound { .. } - ))) + Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::PessimisticLockNotFound { + .. + }))) )); // Passing keys in different order gets the same result: let cmd = PrewritePessimistic::with_defaults( @@ -2492,9 +2488,9 @@ mod tests { let err = prewrite_command(&mut engine, cm, &mut stat, cmd).unwrap_err(); assert!(matches!( err, - Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::PessimisticLockNotFound { .. } - ))) + Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::PessimisticLockNotFound { + .. + }))) )); // If the two keys are sent in different requests, it would be the client's duty diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index a662d9bab79..549d1d22636 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -224,7 +224,7 @@ impl Latches { keep_latches_for_next_cmd: Option<(u64, &Lock)>, ) -> Vec { // Used to - let dummy_vec = vec![]; + let dummy_vec = []; let (keep_latches_for_cid, mut keep_latches_it) = match keep_latches_for_next_cmd { Some((cid, lock)) => (Some(cid), lock.required_hashes.iter().peekable()), None => (None, dummy_vec.iter().peekable()), @@ -282,9 +282,9 @@ mod tests { fn test_wakeup() { let latches = Latches::new(256); - let keys_a = vec!["k1", "k3", "k5"]; + let keys_a = ["k1", "k3", "k5"]; let mut lock_a = Lock::new(keys_a.iter()); - let keys_b = vec!["k4", "k5", "k6"]; + let keys_b = ["k4", "k5", "k6"]; let mut lock_b = Lock::new(keys_b.iter()); let cid_a: u64 = 1; let cid_b: u64 = 2; @@ -310,9 +310,9 @@ mod tests { fn test_wakeup_by_multi_cmds() { let latches = Latches::new(256); - let keys_a = vec!["k1", "k2", "k3"]; - let 
keys_b = vec!["k4", "k5", "k6"]; - let keys_c = vec!["k3", "k4"]; + let keys_a = ["k1", "k2", "k3"]; + let keys_b = ["k4", "k5", "k6"]; + let keys_c = ["k3", "k4"]; let mut lock_a = Lock::new(keys_a.iter()); let mut lock_b = Lock::new(keys_b.iter()); let mut lock_c = Lock::new(keys_c.iter()); @@ -353,10 +353,10 @@ mod tests { fn test_wakeup_by_small_latch_slot() { let latches = Latches::new(5); - let keys_a = vec!["k1", "k2", "k3"]; - let keys_b = vec!["k6", "k7", "k8"]; - let keys_c = vec!["k3", "k4"]; - let keys_d = vec!["k7", "k10"]; + let keys_a = ["k1", "k2", "k3"]; + let keys_b = ["k6", "k7", "k8"]; + let keys_c = ["k3", "k4"]; + let keys_d = ["k7", "k10"]; let mut lock_a = Lock::new(keys_a.iter()); let mut lock_b = Lock::new(keys_b.iter()); let mut lock_c = Lock::new(keys_c.iter()); diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 19736304373..2ca3ef145c8 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -267,7 +267,7 @@ pub fn tls_collect_scan_details(cmd: &'static str, stats: &Statistics) { m.borrow_mut() .local_scan_details .entry(cmd) - .or_insert_with(Default::default) + .or_default() .add(stats); }); } diff --git a/tests/benches/coprocessor_executors/util/mod.rs b/tests/benches/coprocessor_executors/util/mod.rs index 0a5708c74ce..3698860b4ea 100644 --- a/tests/benches/coprocessor_executors/util/mod.rs +++ b/tests/benches/coprocessor_executors/util/mod.rs @@ -147,7 +147,7 @@ where I: 'static, { fn partial_cmp(&self, other: &Self) -> Option { - self.get_name().partial_cmp(other.get_name()) + Some(self.cmp(other)) } } diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index 92dacfe6dc9..99f2c9ee1f4 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -61,7 +61,7 @@ where .unwrap(); } let write_data = WriteData::from_modifies(txn.into_modifies()); - let _ = tikv_kv::write(engine, &ctx, write_data, None); + let _ = 
futures::executor::block_on(tikv_kv::write(engine, &ctx, write_data, None)); let keys: Vec = kvs.iter().map(|(k, _)| Key::from_raw(k)).collect(); let snapshot = engine.snapshot(Default::default()).unwrap(); (snapshot, keys) diff --git a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs b/tests/benches/misc/coprocessor/codec/chunk/chunk.rs index 4c033f2a80d..241284a7228 100644 --- a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs +++ b/tests/benches/misc/coprocessor/codec/chunk/chunk.rs @@ -79,7 +79,7 @@ impl ChunkBuilder { pub fn build(self, tps: &[FieldType]) -> Chunk { let mut fields = Vec::with_capacity(tps.len()); let mut arrays: Vec> = Vec::with_capacity(tps.len()); - for (field_type, column) in tps.iter().zip(self.columns.into_iter()) { + for (field_type, column) in tps.iter().zip(self.columns) { match field_type.as_accessor().tp() { FieldTypeTp::Tiny | FieldTypeTp::Short diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index d567edd5add..a545d9935e6 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -171,6 +171,7 @@ fn bench_async_snapshots_noop(b: &mut test::Bencher) { } #[bench] +#[allow(clippy::let_underscore_future)] fn bench_async_snapshot(b: &mut test::Bencher) { let leader = new_peer(2, 3); let mut region = Region::default(); @@ -205,6 +206,7 @@ fn bench_async_snapshot(b: &mut test::Bencher) { } #[bench] +#[allow(clippy::let_underscore_future)] fn bench_async_write(b: &mut test::Bencher) { let leader = new_peer(2, 3); let mut region = Region::default(); diff --git a/tests/benches/raftstore/mod.rs b/tests/benches/raftstore/mod.rs index 05c602824c2..e164d59f82a 100644 --- a/tests/benches/raftstore/mod.rs +++ b/tests/benches/raftstore/mod.rs @@ -12,7 +12,7 @@ const DEFAULT_DATA_SIZE: usize = 100_000; fn enc_write_kvs(db: &RocksEngine, kvs: &[(Vec, Vec)]) { let mut wb = db.write_batch(); - for &(ref k, ref v) in kvs { + for (k, v) in kvs { wb.put(&keys::data_key(k), v).unwrap(); } 
wb.write().unwrap(); diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index a9dbd36a81a..f40f40e6af1 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -1,5 +1,8 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +#![allow(clippy::arc_with_non_send_sync)] +#![allow(clippy::unnecessary_mut_passed)] +#[allow(clippy::let_underscore_future)] mod test_async_fetch; mod test_async_io; mod test_backup; diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index 217269bb5b8..55c06d87b07 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -35,7 +35,7 @@ fn get_fp(usage: DiskUsage, store_id: u64) -> String { // check the region new leader is elected. fn assert_region_leader_changed( - cluster: &mut Cluster, + cluster: &Cluster, region_id: u64, original_leader: u64, ) { @@ -91,7 +91,7 @@ fn test_disk_full_leader_behaviors(usage: DiskUsage) { let new_last_index = cluster.raft_local_state(1, 1).last_index; assert_eq!(old_last_index, new_last_index); - assert_region_leader_changed(&mut cluster, 1, 1); + assert_region_leader_changed(&cluster, 1, 1); fail::remove(get_fp(usage, 1)); cluster.must_transfer_leader(1, new_peer(1, 1)); fail::cfg(get_fp(usage, 1), "return").unwrap(); @@ -199,7 +199,7 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) { DiskFullOpt::NotAllowedOnFull, ); assert!(res.get_region_error().has_disk_full()); - assert_region_leader_changed(&mut cluster, 1, 1); + assert_region_leader_changed(&cluster, 1, 1); fail::remove(get_fp(usage, 1)); cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -393,7 +393,7 @@ fn test_disk_full_followers_with_hibernate_regions() { // check the region new leader is elected. 
fn assert_region_merged( - cluster: &mut Cluster, + cluster: &Cluster, left_region_key: &[u8], right_region_key: &[u8], ) { diff --git a/tests/failpoints/cases/test_engine.rs b/tests/failpoints/cases/test_engine.rs index 073f7276419..2dd5b6ac04b 100644 --- a/tests/failpoints/cases/test_engine.rs +++ b/tests/failpoints/cases/test_engine.rs @@ -57,6 +57,7 @@ fn test_write_buffer_manager() { } } +#[rustfmt::skip] // The test mocks the senario before https://github.com/tikv/rocksdb/pull/347: // note: before rocksdb/pull/347, lock is called before on_memtable_sealed. // Case: diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 616a4e5e196..74561396593 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -93,6 +93,7 @@ fn test_break_leadership_on_restart() { // received, and become `GroupState::Ordered` after the proposal is received. // But they should keep wakeful for a while. #[test] +#[allow(clippy::let_underscore_future)] fn test_store_disconnect_with_hibernate() { let mut cluster = new_server_cluster(0, 3); let base_tick_ms = 50; diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 0115d6d7ba5..201aafce6fb 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -43,6 +43,7 @@ macro_rules! request { } #[test] +#[allow(clippy::let_underscore_future)] fn test_pd_client_deadlock() { let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); let pd_client_reconnect_fp = "pd_client_reconnect"; diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index ac427c29e69..583dad2ff34 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -43,6 +43,7 @@ macro_rules! 
request { } #[test] +#[allow(clippy::let_underscore_future)] fn test_pd_client_deadlock() { let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); let client = Arc::new(client); diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index a795422c120..5ab7edb503f 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -208,7 +208,7 @@ fn test_leader_transfer() { #[test] fn test_region_merge() { let mut suite = TestSuite::new(3, ApiVersion::V2); - let keys = vec![b"rk0", b"rk1", b"rk2", b"rk3", b"rk4", b"rk5"]; + let keys = [b"rk0", b"rk1", b"rk2", b"rk3", b"rk4", b"rk5"]; suite.must_raw_put(keys[1], b"v1"); suite.must_raw_put(keys[3], b"v3"); diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs index 7351044b297..dc6906b668a 100644 --- a/tests/failpoints/cases/test_read_execution_tracker.rs +++ b/tests/failpoints/cases/test_read_execution_tracker.rs @@ -4,11 +4,16 @@ use kvproto::kvrpcpb::*; use test_coprocessor::{init_with_data, DagSelect, ProductTable}; use test_raftstore::{kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite}; use test_raftstore_macro::test_case; +use tikv_util::config::ReadableDuration; -#[test_case(test_raftstore::must_new_cluster_and_kv_client)] -#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore::must_new_cluster_with_cfg_and_kv_client_mul)] +#[test_case(test_raftstore_v2::must_new_cluster_with_cfg_and_kv_client_mul)] fn test_read_execution_tracking() { - let (_cluster, client, ctx) = new_cluster(); + let (_cluster, client, ctx) = new_cluster(1, |c| { + // set a small renew duration to avoid trigger pre-renew that can affact the + // metrics. 
+ c.cfg.tikv.raft_store.renew_leader_lease_advance_duration = ReadableDuration::millis(1); + }); let (k1, v1) = (b"k1".to_vec(), b"v1".to_vec()); let (k2, v2) = (b"k2".to_vec(), b"v2".to_vec()); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 65c50793d7a..10a65271462 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1426,8 +1426,7 @@ impl Filter for TeeFilter { // 2. the splitted region set has_dirty_data be true in `apply_snapshot` // 3. the splitted region schedule tablet trim task in `on_applied_snapshot` // with tablet index 5 -// 4. the splitted region received a snapshot sent from its -// leader +// 4. the splitted region received a snapshot sent from its leader // 5. after finishing applying this snapshot, the tablet index in storage // changed to 6 // 6. tablet trim complete and callbacked to raftstore diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 57047bef9d4..4668c24ad66 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -1620,9 +1620,7 @@ fn test_before_propose_deadline() { assert!( matches!( res, - Err(StorageError(box StorageErrorInner::Kv(KvError( - box KvErrorInner::Request(_), - )))) + Err(StorageError(box StorageErrorInner::Kv(KvError(box KvErrorInner::Request(_))))) ), "actual: {:?}", res diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 14f4161c7ae..4154a764d99 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -751,7 +751,7 @@ fn test_proposal_concurrent_with_conf_change_and_transfer_leader() { let handle = std::thread::spawn(move || { let mut mutations = vec![]; - for key in vec![b"key3".to_vec(), b"key4".to_vec()] { + for key in [b"key3".to_vec(), b"key4".to_vec()] { let mut mutation = 
kvproto::kvrpcpb::Mutation::default(); mutation.set_op(Op::Put); mutation.set_key(key); diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index 75eb62bab99..02fb8c046c8 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -361,8 +361,8 @@ fn test_read_lock_after_become_follower() { /// 1. Inserted 5 entries and make all stores commit and apply them. /// 2. Prevent the store 3 from append following logs. /// 3. Insert another 20 entries. -/// 4. Wait for some time so that part of the entry cache are compacted -/// on the leader(store 1). +/// 4. Wait for some time so that part of the entry cache are compacted on the +/// leader(store 1). macro_rules! run_cluster_for_test_warmup_entry_cache { ($cluster:expr) => { // Let the leader compact the entry cache. diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 4cfd4be07be..bd5461e6134 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -492,6 +492,7 @@ fn test_backup_raw_meta() { } #[test] +#[allow(clippy::permissions_set_readonly_false)] fn test_invalid_external_storage() { let mut suite = TestSuite::new(1, 144 * 1024 * 1024, ApiVersion::V1); // Put some data. 
diff --git a/tests/integrations/import/test_apply_log.rs b/tests/integrations/import/test_apply_log.rs index 3d8cf85b02c..f821ffea2e7 100644 --- a/tests/integrations/import/test_apply_log.rs +++ b/tests/integrations/import/test_apply_log.rs @@ -67,6 +67,6 @@ fn test_apply_twice() { &tikv, &ctx, CF_DEFAULT, - default_fst.into_iter().chain(default_snd.into_iter()), + default_fst.into_iter().chain(default_snd), ); } diff --git a/tests/integrations/mod.rs b/tests/integrations/mod.rs index 2b68c0a8ba9..86ceb5369e7 100644 --- a/tests/integrations/mod.rs +++ b/tests/integrations/mod.rs @@ -4,6 +4,8 @@ #![feature(box_patterns)] #![feature(custom_test_frameworks)] #![test_runner(test_util::run_tests)] +#![allow(clippy::needless_pass_by_ref_mut)] +#![allow(clippy::extra_unused_type_parameters)] extern crate test; diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 056641e1e3f..30ea12a424b 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -287,8 +287,8 @@ fn test_flush_before_stop2() { // 1. lock `k` with index 6 // 2. on_applied_res => lockcf's last_modified = 6 // 3. flush lock cf => lockcf's flushed_index = 6 -// 4. batch {unlock `k`, write `k`} with index 7 -// (last_modified is updated in store but RocksDB is modified in apply. So, +// 4. batch {unlock `k`, write `k`} with index 7 (last_modified is updated in +// store but RocksDB is modified in apply. So, // before on_apply_res, the last_modified is not updated.) 
// // flush-before-close: diff --git a/tests/integrations/raftstore/test_compact_lock_cf.rs b/tests/integrations/raftstore/test_compact_lock_cf.rs index fbc7629c73f..56cb65cce87 100644 --- a/tests/integrations/raftstore/test_compact_lock_cf.rs +++ b/tests/integrations/raftstore/test_compact_lock_cf.rs @@ -5,13 +5,13 @@ use engine_traits::{MiscExt, CF_LOCK}; use test_raftstore::*; use tikv_util::config::*; -fn flush(cluster: &mut Cluster) { +fn flush(cluster: &Cluster) { for engines in cluster.engines.values() { engines.kv.flush_cf(CF_LOCK, true).unwrap(); } } -fn flush_then_check(cluster: &mut Cluster, interval: u64, written: bool) { +fn flush_then_check(cluster: &Cluster, interval: u64, written: bool) { flush(cluster); // Wait for compaction. sleep_ms(interval * 2); diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index d61d6a59182..13e718b269d 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -420,6 +420,7 @@ fn test_txn_query_stats_tmpl() { fail::remove("mock_collect_tick_interval"); } +#[allow(clippy::extra_unused_type_parameters)] fn raw_put( _cluster: &Cluster, client: &TikvClient, From 058336850ce52cd0eb2691931b92f10318529d09 Mon Sep 17 00:00:00 2001 From: qupeng Date: Fri, 15 Sep 2023 14:55:39 +0800 Subject: [PATCH 049/220] stablize case test_store_disconnect_with_hibernate (#15596) close tikv/tikv#15607 None Signed-off-by: qupeng Co-authored-by: tonyxuqqi --- components/raftstore/src/store/fsm/peer.rs | 5 +++++ tests/failpoints/cases/test_hibernate.rs | 10 +++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 36c4c7e8e5f..371e8cd8eb5 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2859,6 +2859,11 @@ where } fn reset_raft_tick(&mut self, state: GroupState) { + debug!( + "reset 
raft tick to {:?}", state; + "region_id"=> self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); self.fsm.reset_hibernate_state(state); self.fsm.missing_ticks = 0; self.fsm.peer.should_wake_up = false; diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 74561396593..d8f73f312b6 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -102,10 +102,10 @@ fn test_store_disconnect_with_hibernate() { cluster.cfg.raft_store.raft_election_timeout_ticks = 10; cluster.cfg.raft_store.unreachable_backoff = ReadableDuration::millis(500); cluster.cfg.server.raft_client_max_backoff = ReadableDuration::millis(200); - // So the random election timeout will always be 10, which makes the case more - // stable. + // Use a small range but still random election timeouts, which makes the case + // more stable. cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; - cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; + cluster.cfg.raft_store.raft_max_election_timeout_ticks = 13; configure_for_hibernate(&mut cluster.cfg); cluster.pd_client.disable_default_operator(); let r = cluster.run_conf_change(); @@ -117,7 +117,7 @@ fn test_store_disconnect_with_hibernate() { must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); // Wait until all peers of region 1 hibernate. - thread::sleep(Duration::from_millis(base_tick_ms * 30)); + thread::sleep(Duration::from_millis(base_tick_ms * 40)); // Stop the region leader. fail::cfg("receive_raft_message_from_outside", "pause").unwrap(); @@ -129,7 +129,7 @@ fn test_store_disconnect_with_hibernate() { fail::remove("receive_raft_message_from_outside"); // Wait for a while. Peers of region 1 shouldn't hibernate. 
- thread::sleep(Duration::from_millis(base_tick_ms * 30)); + thread::sleep(Duration::from_millis(base_tick_ms * 40)); must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); must_get_equal(&cluster.get_engine(3), b"k2", b"v2"); } From 820ed9395b97853145fea4a21d6d906cbcd4d2fb Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Sat, 16 Sep 2023 14:42:09 +0800 Subject: [PATCH 050/220] tikv-ctl v2: get_all_regions_in_store excludes `tombstone` (#15522) ref tikv/tikv#14654 get_all_regions_in_store should exclude tombstone Signed-off-by: SpadeA-Tang Co-authored-by: tonyxuqqi --- cmd/tikv-ctl/src/executor.rs | 15 ++++-- src/server/debug2.rs | 94 +++++++++++++++++++++++++++--------- 2 files changed, 80 insertions(+), 29 deletions(-) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index a145118acea..a20d6ce2602 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -1332,11 +1332,16 @@ impl DebugExecutor for DebuggerImplV2 { } fn get_region_size(&self, region: u64, cfs: Vec<&str>) -> Vec<(String, usize)> { - self.region_size(region, cfs) - .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) - .into_iter() - .map(|(cf, size)| (cf.to_owned(), size)) - .collect() + match self.region_size(region, cfs) { + Ok(v) => v + .into_iter() + .map(|(cf, size)| (cf.to_owned(), size)) + .collect(), + Err(e) => { + println!("Debugger::region_size: {}", e); + vec![] + } + } } fn get_region_info(&self, region: u64) -> RegionInfo { diff --git a/src/server/debug2.rs b/src/server/debug2.rs index 1ee1d108edc..7060b20bdb2 100644 --- a/src/server/debug2.rs +++ b/src/server/debug2.rs @@ -688,19 +688,19 @@ impl Debugger for DebuggerImplV2 { fn region_size>(&self, region_id: u64, cfs: Vec) -> Result> { match self.raft_engine.get_region_state(region_id, u64::MAX) { Ok(Some(region_state)) => { - if region_state.get_state() != PeerState::Normal { - return Err(Error::NotFound(format!( - "region {:?} has 
been deleted", - region_id - ))); - } let region = region_state.get_region(); + let state = region_state.get_state(); let start_key = &keys::data_key(region.get_start_key()); let end_key = &keys::data_end_key(region.get_end_key()); let mut sizes = vec![]; let mut tablet_cache = get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; - let tablet = tablet_cache.latest().unwrap(); + let Some(tablet) = tablet_cache.latest() else { + return Err(Error::NotFound(format!( + "tablet not found, region_id={:?}, peer_state={:?}", + region_id, state + ))); + }; for cf in cfs { let mut size = 0; box_try!(tablet.scan(cf.as_ref(), start_key, end_key, false, |k, v| { @@ -731,7 +731,7 @@ impl Debugger for DebuggerImplV2 { )); } - let mut region_states = get_all_region_states_with_normal_state(&self.raft_engine); + let mut region_states = get_all_active_region_states(&self.raft_engine); region_states.sort_by(|r1, r2| { r1.get_region() @@ -786,12 +786,21 @@ impl Debugger for DebuggerImplV2 { fn get_all_regions_in_store(&self) -> Result> { let mut region_ids = vec![]; + let raft_engine = &self.raft_engine; self.raft_engine .for_each_raft_group::(&mut |region_id| { + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + if region_state.state == PeerState::Tombstone { + return Ok(()); + } region_ids.push(region_id); Ok(()) }) .unwrap(); + region_ids.sort_unstable(); Ok(region_ids) } @@ -844,21 +853,29 @@ impl Debugger for DebuggerImplV2 { Err(e) => return Err(Error::EngineTrait(e)), }; - if region_state.state != PeerState::Normal { - return Err(Error::NotFound(format!("none region {:?}", region_id))); + let state = region_state.get_state(); + if state == PeerState::Tombstone { + return Err(Error::NotFound(format!( + "region {:?} is tombstone", + region_id + ))); } - let region = region_state.get_region(); - let start = keys::enc_start_key(region); - let end = keys::enc_end_key(region); - - let mut tablet_cache = - 
get_tablet_cache(&self.tablet_reg, region.id, Some(region_state.clone())).unwrap(); - let tablet = tablet_cache.latest().unwrap(); + let region = region_state.get_region().clone(); + let start = keys::enc_start_key(®ion); + let end = keys::enc_end_key(®ion); + + let mut tablet_cache = get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; + let Some(tablet) = tablet_cache.latest() else { + return Err(Error::NotFound(format!( + "tablet not found, region_id={:?}, peer_state={:?}", + region_id, state + ))); + }; let mut res = dump_write_cf_properties(tablet, &start, &end)?; let mut res1 = dump_default_cf_properties(tablet, &start, &end)?; res.append(&mut res1); - let middle_key = match box_try!(get_region_approximate_middle(tablet, region)) { + let middle_key = match box_try!(get_region_approximate_middle(tablet, ®ion)) { Some(data_key) => keys::origin_key(&data_key).to_vec(), None => Vec::new(), }; @@ -1102,9 +1119,7 @@ fn get_tablet_cache( } } -fn get_all_region_states_with_normal_state( - raft_engine: &ER, -) -> Vec { +fn get_all_active_region_states(raft_engine: &ER) -> Vec { let mut region_states = vec![]; raft_engine .for_each_raft_group::(&mut |region_id| { @@ -1112,7 +1127,7 @@ fn get_all_region_states_with_normal_state( .get_region_state(region_id, u64::MAX) .unwrap() .unwrap(); - if region_state.state == PeerState::Normal { + if region_state.state != PeerState::Tombstone { region_states.push(region_state); } Ok(()) @@ -1133,7 +1148,7 @@ fn deivde_regions_for_concurrency( registry: &TabletRegistry, threads: u64, ) -> Result>> { - let region_states = get_all_region_states_with_normal_state(raft_engine); + let region_states = get_all_active_region_states(raft_engine); if threads == 1 { return Ok(vec![ @@ -1452,6 +1467,7 @@ mod tests { let mut wb = raft_engine.log_batch(10); wb.put_region_state(region_id, 10, &state).unwrap(); raft_engine.consume(&mut wb, true).unwrap(); + debugger.tablet_reg.remove(region_id); debugger.region_size(region_id, 
cfs.clone()).unwrap_err(); } @@ -1930,9 +1946,9 @@ mod tests { assert_eq!(region_info_2, region_info_2_before); } - #[test] // It tests that the latest apply state cannot be read as it is invisible // on persisted_applied + #[test] fn test_drop_unapplied_raftlog_2() { let dir = test_util::temp_dir("test-debugger", false); let debugger = new_debugger(dir.path()); @@ -1968,4 +1984,34 @@ mod tests { 80 ); } + + #[test] + fn test_get_all_regions_in_store() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + + init_region_state(raft_engine, 1, &[100, 101], 1); + init_region_state(raft_engine, 3, &[100, 101], 1); + init_region_state(raft_engine, 4, &[100, 101], 1); + + let mut lb = raft_engine.log_batch(3); + + let mut put_tombsotne_region = |region_id: u64| { + let mut region = metapb::Region::default(); + region.set_id(region_id); + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Tombstone); + region_state.set_region(region.clone()); + lb.put_region_state(region_id, INITIAL_APPLY_INDEX, ®ion_state) + .unwrap(); + raft_engine.consume(&mut lb, true).unwrap(); + }; + + put_tombsotne_region(2); + put_tombsotne_region(5); + + let regions = debugger.get_all_regions_in_store().unwrap(); + assert_eq!(regions, vec![1, 3, 4]); + } } From 086965358f0109340b84261695fbeaccce3a62e2 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:06:11 +0800 Subject: [PATCH 051/220] raftstore-v2: report async snapshot metrics to prometheus (#15562) ref tikv/tikv#15401 report async snapshot metrics to prometheus Signed-off-by: SpadeA-Tang --- .../raftstore-v2/src/operation/query/local.rs | 8 ++++- src/server/raftkv/mod.rs | 2 +- src/server/raftkv2/mod.rs | 32 ++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/components/raftstore-v2/src/operation/query/local.rs 
b/components/raftstore-v2/src/operation/query/local.rs index 5f6d589eca6..fcc93636640 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -28,6 +28,7 @@ use raftstore::{ use slog::{debug, Logger}; use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; use time::Timespec; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::WriteBatchFlags; use crate::{ @@ -335,7 +336,12 @@ where async move { let (mut fut, mut reader) = match res { - Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Ok(snap)) => { + GLOBAL_TRACKERS.with_tracker(get_tls_tracker_token(), |t| { + t.metrics.local_read = true; + }); + return Ok(snap); + } Either::Left(Err(e)) => return Err(e), Either::Right((fut, reader)) => (fut, reader), }; diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index f5b36dffbac..58287c2bb83 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -686,7 +686,7 @@ where tracker.metrics.read_index_propose_wait_nanos as f64 / 1_000_000_000.0, ); - // snapshot may be hanlded by lease read in raftstore + // snapshot may be handled by lease read in raftstore if tracker.metrics.read_index_confirm_wait_nanos > 0 { ASYNC_REQUESTS_DURATIONS_VEC .snapshot_read_index_confirm diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 81143e6c2be..9785e821312 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -28,6 +28,7 @@ use raftstore_v2::{ }; use tikv_kv::{Modify, WriteEvent}; use tikv_util::time::Instant; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ @@ -172,7 +173,7 @@ impl tikv_kv::Engine for RaftKv2 { .set_key_ranges(mem::take(&mut ctx.key_ranges).into()); } ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut 
header = new_request_header(ctx.pb_ctx); let mut flags = 0; @@ -200,9 +201,32 @@ impl tikv_kv::Engine for RaftKv2 { let res = f.await; match res { Ok(snap) => { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot - .observe(begin_instant.saturating_elapsed_secs()); + let elapse = begin_instant.saturating_elapsed_secs(); + let tracker = get_tls_tracker_token(); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + if tracker.metrics.read_index_propose_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_propose_wait + .observe( + tracker.metrics.read_index_propose_wait_nanos as f64 + / 1_000_000_000.0, + ); + // snapshot may be handled by lease read in raftstore + if tracker.metrics.read_index_confirm_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_confirm + .observe( + tracker.metrics.read_index_confirm_wait_nanos as f64 + / 1_000_000_000.0, + ); + } + } else if tracker.metrics.local_read { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_local_read + .observe(elapse); + } + }); + ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); Ok(snap) } From 4a5fb7321ca2ee2bab0b31f6556c8fb196a590f4 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 18 Sep 2023 18:24:11 +0800 Subject: [PATCH 052/220] test: make test test_destroy_missing more stable (#15616) close tikv/tikv#15615 Signed-off-by: glorv Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- components/raftstore-v2/src/worker/tablet.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs index 0b0429eb8d1..ef9739226e7 100644 --- a/components/raftstore-v2/src/worker/tablet.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -602,6 +602,13 @@ impl Runner { } } +#[cfg(test)] +impl Runner { + pub fn get_running_task_count(&self) -> usize { + self.low_pri_pool.get_running_task_count() + } +} + impl Runnable for 
Runner where EK: KvEngine, @@ -822,6 +829,14 @@ mod tests { runner.run(Task::destroy(r_1, 100)); assert!(path.exists()); registry.remove(r_1); + // waiting for async `pause_background_work` to be finished, + // this task can block tablet's destroy. + for _i in 0..100 { + if runner.get_running_task_count() == 0 { + break; + } + std::thread::sleep(Duration::from_millis(5)); + } runner.on_timeout(); assert!(!path.exists()); assert!(runner.pending_destroy_tasks.is_empty()); From 2db4b895a1e82d32830493eb10cea30925f65c7e Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 20 Sep 2023 10:40:42 +0800 Subject: [PATCH 053/220] raftstore-v2: fix rollback merge and commit merge can happen simultaneously (#15625) ref tikv/tikv#15242 fix rollback merge and commit merge can happen simultaneously Signed-off-by: SpadeA-Tang --- .../operation/command/admin/merge/commit.rs | 10 +- .../operation/command/admin/merge/rollback.rs | 12 +- components/raftstore-v2/src/raft/peer.rs | 10 ++ tests/failpoints/cases/test_merge.rs | 147 ++++++++++++++++++ 4 files changed, 177 insertions(+), 2 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index 5bd92e3ea1c..5208dcc96a8 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -319,7 +319,7 @@ impl Peer { region ); assert!(!self.storage().has_dirty_data()); - if self.is_leader() { + if self.is_leader() && !self.leader_transferring() { let index = commit_of_merge(req.get_admin_request().get_commit_merge()); if self.proposal_control().is_merging() { // `on_admin_command` may delay our request indefinitely. 
It's better to check @@ -341,12 +341,19 @@ impl Peer { "res" => ?res, ); } else { + fail::fail_point!("on_propose_commit_merge_success"); return; } } let _ = store_ctx .router .force_send(source_id, PeerMsg::RejectCommitMerge { index }); + } else if self.leader_transferring() { + info!( + self.logger, + "not to propose commit merge when transferring leader"; + "transferee" => self.leader_transferee(), + ); } } else { info!( @@ -362,6 +369,7 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) -> Result { + (|| fail::fail_point!("propose_commit_merge_1", store_ctx.store_id == 1, |_| {}))(); let mut proposal_ctx = ProposalContext::empty(); proposal_ctx.insert(ProposalContext::COMMIT_MERGE); let data = req.write_to_bytes().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs index cb45fdcf1cf..d931a295f4d 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs @@ -38,6 +38,7 @@ impl Peer { store_ctx: &mut StoreContext, index: u64, ) { + fail::fail_point!("on_reject_commit_merge_1", store_ctx.store_id == 1, |_| {}); let self_index = self.merge_context().and_then(|c| c.prepare_merge_index()); if self_index != Some(index) { info!( @@ -75,7 +76,7 @@ impl Apply { pub fn apply_rollback_merge( &mut self, req: &AdminRequest, - _index: u64, + index: u64, ) -> Result<(AdminResponse, AdminCmdResult)> { fail::fail_point!("apply_rollback_merge"); PEER_ADMIN_CMD_COUNTER.rollback_merge.all.inc(); @@ -95,6 +96,15 @@ impl Apply { "state" => ?merge_state, ); } + + let prepare_merge_commit = rollback.commit; + info!( + self.logger, + "execute RollbackMerge"; + "commit" => prepare_merge_commit, + "index" => index, + ); + let mut region = self.region().clone(); let version = region.get_region_epoch().get_version(); // Update version to avoid duplicated rollback 
requests. diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index c3a80e3756c..87d41de776c 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -862,6 +862,16 @@ impl Peer { ) } + #[inline] + pub fn leader_transferee(&self) -> u64 { + self.leader_transferee + } + + #[inline] + pub fn leader_transferring(&self) -> bool { + self.leader_transferee != raft::INVALID_ID + } + #[inline] pub fn long_uncommitted_threshold(&self) -> Duration { Duration::from_secs(self.long_uncommitted_threshold) diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index eb6b8a235e1..08b7474bb8e 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -3,6 +3,7 @@ use std::{ sync::{ atomic::{AtomicBool, Ordering}, + mpsc::{channel, Sender}, *, }, thread, @@ -19,6 +20,7 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::store::*; +use raftstore_v2::router::PeerMsg; use test_raftstore::*; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; @@ -1706,3 +1708,148 @@ fn test_destroy_source_peer_while_merging() { must_get_equal(&cluster.get_engine(i), b"k5", b"v5"); } } + +struct MsgTimeoutFilter { + tx: Sender, +} + +impl Filter for MsgTimeoutFilter { + fn before(&self, msgs: &mut Vec) -> raftstore::Result<()> { + let mut res = Vec::with_capacity(msgs.len()); + for m in msgs.drain(..) { + if m.get_message().msg_type == MessageType::MsgTimeoutNow { + self.tx.send(m).unwrap(); + } else { + res.push(m); + } + } + + *msgs = res; + check_messages(msgs) + } +} + +// Concurrent execution between transfer leader and merge can cause rollback and +// commit merge at the same time before this fix which corrupt the region. 
+// It can happen as this: +// Assume at the begin, leader of source and target are both on node-1 +// 1. node-1 transfer leader to node-2: execute up to sending MsgTimeoutNow +// (leader_transferre has been set), but before becoming follower. +// 2. node-1 source region propose, and apply PrepareMerge +// 3. node-1 target region propose CommitMerge but fail (due to +// leader_transferre being set) +// 4. node-1 source region successfully proposed rollback merge +// 5. node-2 target region became leader and apply the first no-op entry +// 6. node-2 target region successfully proposed commit merge +// Now, rollback at source region and commit at target region are both proposed +// and will be executed which will cause region corrupt +#[test] +fn test_concurrent_between_transfer_leader_and_merge() { + use test_raftstore_v2::*; + let mut cluster = new_node_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.run(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + for i in 0..3 { + must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); + } + + let pd_client = Arc::clone(&cluster.pd_client); + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + + let right = pd_client.get_region(b"k1").unwrap(); + let left = pd_client.get_region(b"k3").unwrap(); + cluster.must_transfer_leader( + left.get_id(), + left.get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + + cluster.must_transfer_leader( + right.get_id(), + right + .get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + + // Source region: 1, Target Region: 1000 + // Let target region in leader_transfering status by interceptting MsgTimeoutNow + // msg by using Filter. So we make node-1-1000 be in leader_transferring status + // for some time. 
+ let (tx, rx_msg) = channel(); + let filter = MsgTimeoutFilter { tx }; + cluster.add_send_filter_on_node(1, Box::new(filter)); + + pd_client.transfer_leader( + right.get_id(), + right + .get_peers() + .iter() + .find(|p| p.store_id == 2) + .cloned() + .unwrap(), + vec![], + ); + + let msg = rx_msg.recv().unwrap(); + + // Now, node-1-1000 is in leader_transferring status. After it reject proposing + // commit merge, make node-1-1 block before proposing rollback merge until + // node-2-1000 propose commit merge. + + fail::cfg("on_reject_commit_merge_1", "pause").unwrap(); + + let router = cluster.get_router(2).unwrap(); + let (tx, rx) = channel(); + let _ = fail::cfg_callback("propose_commit_merge_1", move || { + tx.send(()).unwrap(); + }); + + let (tx2, rx2) = channel(); + let _ = fail::cfg_callback("on_propose_commit_merge_success", move || { + tx2.send(()).unwrap(); + }); + + cluster.merge_region(left.get_id(), right.get_id(), Callback::None); + + // Actually, store 1 should not reach the line of propose_commit_merge_1 + let _ = rx.recv_timeout(Duration::from_secs(2)); + router + .force_send(msg.get_region_id(), PeerMsg::RaftMessage(Box::new(msg))) + .unwrap(); + + // Wait region 1 of node 2 to become leader + rx2.recv().unwrap(); + fail::remove("on_reject_commit_merge_1"); + + let timer = Instant::now(); + loop { + if right.get_region_epoch().get_version() + == cluster.get_region_epoch(right.get_id()).get_version() + { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("region {:?} is still not merged.", right); + } + } else { + break; + } + sleep_ms(10); + } + + let region = pd_client.get_region(b"k1").unwrap(); + assert_eq!(region.get_id(), right.get_id()); + assert_eq!(region.get_start_key(), right.get_start_key()); + assert_eq!(region.get_end_key(), left.get_end_key()); + + cluster.must_put(b"k4", b"v4"); +} From ec4a9002f153f86c609e902ba685eee7a1224e6c Mon Sep 17 00:00:00 2001 From: lucasliang Date: Wed, 20 Sep 2023 11:51:13 +0800 Subject: 
[PATCH 054/220] raftstore: upgrade tokio timer to fix insecure issues. (#15622) ref tikv/tikv#15621 Signed-off-by: lucasliang --- Cargo.lock | 42 ++++++++++++--------------------- Cargo.toml | 1 + components/tikv_util/Cargo.toml | 2 +- 3 files changed, 17 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb5e711d34d..34f9c381958 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1290,7 +1290,7 @@ dependencies = [ "crossbeam-deque", "crossbeam-epoch", "crossbeam-queue", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1300,7 +1300,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1311,7 +1311,7 @@ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1322,7 +1322,7 @@ checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ "autocfg", "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1335,7 +1335,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1346,21 +1346,10 @@ checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "scopeguard", ] -[[package]] -name = "crossbeam-utils" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", 
- "lazy_static", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -1989,7 +1978,7 @@ dependencies = [ "bcc", "collections", "crc32fast", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "fs2", "lazy_static", "libc 0.2.146", @@ -4839,7 +4828,7 @@ checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "num_cpus", ] @@ -7136,11 +7125,10 @@ dependencies = [ [[package]] name = "tokio-executor" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb2d1b8f4548dbf5e1f7818512e9c406860678f29c300cdf0ebac72d1a3a1671" +version = "0.1.9" +source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#4394380fa3c1f7f2c702a4ccc5ff01384746fdfd" dependencies = [ - "crossbeam-utils 0.7.2", + "crossbeam-utils", "futures 0.1.31", ] @@ -7201,9 +7189,9 @@ dependencies = [ [[package]] name = "tokio-timer" version = "0.2.13" -source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#e8ac149d93f4a9bf49ea569d8d313ee40c5eb448" +source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#4394380fa3c1f7f2c702a4ccc5ff01384746fdfd" dependencies = [ - "crossbeam-utils 0.7.2", + "crossbeam-utils", "futures 0.1.31", "slab", "tokio-executor", @@ -7377,7 +7365,7 @@ name = "tracker" version = "0.0.1" dependencies = [ "collections", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "kvproto", "lazy_static", "parking_lot 0.12.1", @@ -7907,7 +7895,7 @@ source = "git+https://github.com/tikv/yatp.git?branch=master#5572a78702572087cab dependencies = [ "crossbeam-deque", "crossbeam-skiplist", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "dashmap", "fail", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 5bc49b17e42..c4c70e999be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -368,6 +368,7 @@ tipb = { git = "https://github.com/pingcap/tipb.git" } kvproto = { git = 
"https://github.com/pingcap/kvproto.git" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-executor = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 6de354fa259..b502a701136 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -57,7 +57,7 @@ thiserror = "1.0" tikv_alloc = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-executor = "0.1" +tokio-executor = { workspace = true } tokio-timer = { workspace = true } tracker = { workspace = true } url = "2" From 76df17e2c67e139a79653293b566d604a94a0352 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 20 Sep 2023 14:22:43 +0800 Subject: [PATCH 055/220] log backup: fix the race of on events and do flush (#15618) close tikv/tikv#15602 Signed-off-by: 3pointer --- components/backup-stream/src/router.rs | 132 ++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 16 deletions(-) diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 6ce8486109f..b2fd9acc743 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -708,22 +708,25 @@ impl TempFileKey { /// The full name of the file owns the key. 
fn temp_file_name(&self) -> String { + let timestamp = (|| { + fail::fail_point!("temp_file_name_timestamp", |t| t.map_or_else( + || TimeStamp::physical_now(), + |v| + // reduce the precision of timestamp + v.parse::().ok().map_or(0, |u| TimeStamp::physical_now() / u) + )); + TimeStamp::physical_now() + })(); + let uuid = uuid::Uuid::new_v4(); if self.is_meta { format!( - "meta_{:08}_{}_{:?}_{}.temp.log", - self.region_id, - self.cf, - self.cmd_type, - TimeStamp::physical_now(), + "meta_{:08}_{}_{:?}_{:?}_{}.temp.log", + self.region_id, self.cf, self.cmd_type, uuid, timestamp, ) } else { format!( - "{:08}_{:08}_{}_{:?}_{}.temp.log", - self.table_id, - self.region_id, - self.cf, - self.cmd_type, - TimeStamp::physical_now(), + "{:08}_{:08}_{}_{:?}_{:?}_{}.temp.log", + self.table_id, self.region_id, self.cf, self.cmd_type, uuid, timestamp, ) } } @@ -864,6 +867,7 @@ impl StreamTaskInfo { } async fn on_events_of_key(&self, key: TempFileKey, events: ApplyEvents) -> Result<()> { + fail::fail_point!("before_generate_temp_file"); if let Some(f) = self.files.read().await.get(&key) { self.total_size .fetch_add(f.lock().await.on_events(events).await?, Ordering::SeqCst); @@ -886,6 +890,7 @@ impl StreamTaskInfo { let f = w.get(&key).unwrap(); self.total_size .fetch_add(f.lock().await.on_events(events).await?, Ordering::SeqCst); + fail::fail_point!("after_write_to_file"); Ok(()) } @@ -970,7 +975,9 @@ impl StreamTaskInfo { pub async fn move_to_flushing_files(&self) -> Result<&Self> { // if flushing_files is not empty, which represents this flush is a retry // operation. - if !self.flushing_files.read().await.is_empty() { + if !self.flushing_files.read().await.is_empty() + || !self.flushing_meta_files.read().await.is_empty() + { return Ok(self); } @@ -1032,7 +1039,12 @@ impl StreamTaskInfo { // and push it into merged_file_info(DataFileGroup). 
file_info_clone.set_range_offset(stat_length); data_files_open.push({ - let file = shared_pool.open_raw_for_read(data_file.inner.path())?; + let file = shared_pool + .open_raw_for_read(data_file.inner.path()) + .context(format_args!( + "failed to open read file {:?}", + data_file.inner.path() + ))?; let compress_length = file.len().await?; stat_length += compress_length; file_info_clone.set_range_length(compress_length); @@ -1097,7 +1109,6 @@ impl StreamTaskInfo { .await?; self.merge_log(metadata, storage.clone(), &self.flushing_meta_files, true) .await?; - Ok(()) } @@ -1157,7 +1168,8 @@ impl StreamTaskInfo { UnpinReader(Box::new(Cursor::new(meta_buff))), buflen as _, ) - .await?; + .await + .context(format_args!("flush meta {:?}", meta_path))?; } Ok(()) } @@ -1191,13 +1203,14 @@ impl StreamTaskInfo { .await? .generate_metadata(store_id) .await?; + + fail::fail_point!("after_moving_to_flushing_files"); crate::metrics::FLUSH_DURATION .with_label_values(&["generate_metadata"]) .observe(sw.lap().as_secs_f64()); // flush log file to storage. self.flush_log(&mut metadata_info).await?; - // the field `min_resolved_ts` of metadata will be updated // only after flush is done. metadata_info.min_resolved_ts = metadata_info @@ -2413,4 +2426,91 @@ mod tests { let r = cfg_manager.dispatch(changed); assert!(r.is_err()); } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn test_flush_on_events_race() -> Result<()> { + let (tx, _rx) = dummy_scheduler(); + let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + let router = Arc::new(RouterInner::new( + tx, + Config { + prefix: tmp.clone(), + // disable auto flush. 
+ temp_file_size_limit: 1000, + temp_file_memory_quota: 2, + max_flush_interval: Duration::from_secs(300), + }, + )); + + let (task, _path) = task("race".to_owned()).await?; + must_register_table(router.as_ref(), task, 1).await; + router + .must_mut_task_info("race", |i| { + i.storage = Arc::new(NoopStorage::default()); + }) + .await; + let mut b = KvEventsBuilder::new(42, 0); + b.put_table(CF_DEFAULT, 1, b"k1", b"v1"); + let events_before_flush = b.finish(); + + b.put_table(CF_DEFAULT, 1, b"k1", b"v1"); + let events_after_flush = b.finish(); + + // make timestamp precision to 1 seconds. + fail::cfg("temp_file_name_timestamp", "return(1000)").unwrap(); + + let (trigger_tx, trigger_rx) = std::sync::mpsc::sync_channel(0); + let trigger_rx = std::sync::Mutex::new(trigger_rx); + + let (fp_tx, fp_rx) = std::sync::mpsc::sync_channel(0); + let fp_rx = std::sync::Mutex::new(fp_rx); + + let t = router.get_task_info("race").await.unwrap(); + let _ = router.on_events(events_before_flush).await; + + // make generate temp files ***happen after*** moving files to flushing_files + // and read flush file ***happen between*** genenrate file name and + // write kv to file. T1 is write thread. T2 is flush thread + // The order likes + // [T1] generate file name -> [T2] moving files to flushing_files -> [T1] write + // kv to file -> [T2] read flush file. + fail::cfg_callback("after_write_to_file", move || { + fp_tx.send(()).unwrap(); + }) + .unwrap(); + + fail::cfg_callback("before_generate_temp_file", move || { + trigger_rx.lock().unwrap().recv().unwrap(); + }) + .unwrap(); + + fail::cfg_callback("after_moving_to_flushing_files", move || { + trigger_tx.send(()).unwrap(); + fp_rx.lock().unwrap().recv().unwrap(); + }) + .unwrap(); + + // set flush status to true, because we disabled the auto flush. 
+ t.set_flushing_status(true); + let router_clone = router.clone(); + let _ = tokio::join!( + // do flush in another thread + tokio::spawn(async move { + router_clone.do_flush("race", 42, TimeStamp::max()).await; + }), + router.on_events(events_after_flush) + ); + fail::remove("after_write_to_file"); + fail::remove("before_generate_temp_file"); + fail::remove("after_moving_to_flushing_files"); + fail::remove("temp_file_name_timestamp"); + + // set flush status to true, because we disabled the auto flush. + t.set_flushing_status(true); + let res = router.do_flush("race", 42, TimeStamp::max()).await; + // this time flush should success. + assert!(res.is_some()); + assert_eq!(t.files.read().await.len(), 0,); + Ok(()) + } } From 641f9b8dab1d8770ef5fded564490f8dbc094b74 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Wed, 20 Sep 2023 14:42:13 +0800 Subject: [PATCH 056/220] metrics: make disk usage clearer in the grafana (#15583) close tikv/tikv#15582 add metrics for detail disk usage. Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/pd_client/src/metrics.rs | 24 +++++++++++-- .../src/operation/command/write/ingest.rs | 3 ++ .../raftstore-v2/src/worker/pd/store.rs | 30 ++++++++-------- components/raftstore/src/store/fsm/store.rs | 4 ++- components/raftstore/src/store/worker/pd.rs | 36 +++++++++---------- components/sst_importer/src/sst_importer.rs | 19 +++++++++- metrics/grafana/tikv_details.json | 29 ++++++++++++++- 7 files changed, 106 insertions(+), 39 deletions(-) diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index 4e185658f15..7e7121170d6 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -48,6 +48,20 @@ make_static_metric! 
{ try_connect, } + pub label_enum StoreSizeEventType { + capacity, + available, + used, + snap_size, + raft_size, + kv_size, + import_size, + } + + pub struct StoreSizeEventIntrVec: IntGauge { + "type" => StoreSizeEventType, + } + pub struct PDRequestEventHistogramVec: Histogram { "type" => PDRequestEventType, } @@ -101,8 +115,14 @@ lazy_static! { &["type"] ) .unwrap(); - pub static ref STORE_SIZE_GAUGE_VEC: IntGaugeVec = - register_int_gauge_vec!("tikv_store_size_bytes", "Size of storage.", &["type"]).unwrap(); + pub static ref STORE_SIZE_EVENT_INT_VEC: StoreSizeEventIntrVec = + register_static_int_gauge_vec!( + StoreSizeEventIntrVec, + "tikv_store_size_bytes", + "Size of storage.", + &["type"] + ) + .unwrap(); pub static ref REGION_READ_KEYS_HISTOGRAM: Histogram = register_histogram!( "tikv_region_read_keys", "Histogram of keys written for regions", diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 92f5923d167..e963434fe83 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -4,6 +4,7 @@ use collections::HashMap; use crossbeam::channel::TrySendError; use engine_traits::{data_cf_offset, KvEngine, RaftEngine, DATA_CFS_LEN}; use kvproto::import_sstpb::SstMeta; +use pd_client::metrics::STORE_SIZE_EVENT_INT_VEC; use raftstore::{ store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, Result, @@ -39,6 +40,8 @@ impl Store { &mut self, ctx: &mut StoreContext, ) -> Result<()> { + let import_size = box_try!(ctx.sst_importer.get_total_size()); + STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(ctx.sst_importer.list_ssts()); if ssts.is_empty() { return Ok(()); diff --git a/components/raftstore-v2/src/worker/pd/store.rs b/components/raftstore-v2/src/worker/pd/store.rs index a5aad42d85c..b3fd3245be6 100644 --- 
a/components/raftstore-v2/src/worker/pd/store.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -9,7 +9,7 @@ use kvproto::pdpb; use pd_client::{ metrics::{ REGION_READ_BYTES_HISTOGRAM, REGION_READ_KEYS_HISTOGRAM, REGION_WRITTEN_BYTES_HISTOGRAM, - REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_GAUGE_VEC, + REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_EVENT_INT_VEC, }, PdClient, }; @@ -263,15 +263,9 @@ where self.store_stat.region_bytes_read.flush(); self.store_stat.region_keys_read.flush(); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["capacity"]) - .set(capacity as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["available"]) - .set(available as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["used"]) - .set(used_size as i64); + STORE_SIZE_EVENT_INT_VEC.capacity.set(capacity as i64); + STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); + STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); // Update slowness statistics self.update_slowness_in_store_stats(&mut stats, last_query_sum); @@ -473,12 +467,16 @@ where true }); let snap_size = self.snap_mgr.total_snap_size().unwrap(); - let used_size = snap_size - + kv_size - + self - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + let raft_size = self + .raft_engine + .get_engine_size() + .expect("engine used size"); + + STORE_SIZE_EVENT_INT_VEC.kv_size.set(kv_size as i64); + STORE_SIZE_EVENT_INT_VEC.raft_size.set(raft_size as i64); + STORE_SIZE_EVENT_INT_VEC.snap_size.set(snap_size as i64); + + let used_size = snap_size + kv_size + raft_size; let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. 
diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 53559bbe1b8..2434dfdd8e6 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -43,7 +43,7 @@ use kvproto::{ raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; -use pd_client::{Feature, FeatureGate, PdClient}; +use pd_client::{metrics::STORE_SIZE_EVENT_INT_VEC, Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; use resource_control::{channel::unbounded, ResourceGroupManager}; @@ -2791,6 +2791,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER fn on_cleanup_import_sst(&mut self) -> Result<()> { let mut delete_ssts = Vec::new(); let mut validate_ssts = Vec::new(); + let import_size = box_try!(self.ctx.importer.get_total_size()); + STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(self.ctx.importer.list_ssts()); if ssts.is_empty() { diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index cb067ca840b..6aa192bd28e 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1359,15 +1359,9 @@ where self.store_stat.region_bytes_read.flush(); self.store_stat.region_keys_read.flush(); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["capacity"]) - .set(capacity as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["available"]) - .set(available as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["used"]) - .set(used_size as i64); + STORE_SIZE_EVENT_INT_VEC.capacity.set(capacity as i64); + STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); + STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); @@ -2590,15 +2584,21 @@ fn 
collect_engine_size( } else { store_info.capacity }; - let used_size = snap_mgr_size - + store_info - .kv_engine - .get_engine_used_size() - .expect("kv engine used size") - + store_info - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + let raft_size = store_info + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + + let kv_size = store_info + .kv_engine + .get_engine_used_size() + .expect("kv engine used size"); + + STORE_SIZE_EVENT_INT_VEC.raft_size.set(raft_size as i64); + STORE_SIZE_EVENT_INT_VEC.snap_size.set(snap_mgr_size as i64); + STORE_SIZE_EVENT_INT_VEC.kv_size.set(kv_size as i64); + + let used_size = snap_mgr_size + kv_size + raft_size; let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 502a81ff6a6..910cfa602dd 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,7 +4,7 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{self, BufReader, Read}, + io::{self, BufReader, ErrorKind, Read}, ops::Bound, path::{Path, PathBuf}, sync::{ @@ -293,6 +293,23 @@ impl SstImporter { path.save } + pub fn get_total_size(&self) -> Result { + let mut total_size = 0; + for entry in file_system::read_dir(self.dir.get_root_dir())? 
{ + match entry.and_then(|e| e.metadata().map(|m| (e, m))) { + Ok((_, m)) => { + if !m.is_file() { + continue; + } + total_size += m.len(); + } + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + } + Ok(total_size) + } + pub fn create(&self, meta: &SstMeta) -> Result { match self.dir.create(meta, self.key_manager.clone()) { Ok(f) => { diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index c31ee12b27b..57c88782031 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -422,9 +422,36 @@ "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"used\"}) by (instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}}-used", "refId": "A", "step": 10 + }, + { + "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"kv_size\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-kv_size", + "refId": "B", + "step": 10, + "hide": true + }, + { + "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"raft_size\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-raft_size", + "refId": "C", + "step": 10, + "hide": true + }, + { + "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"import_size\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-import_size", + "refId": "D", + "step": 10, + "hide": true } ], "thresholds": [], From 10f51d8478e488dcef026b4d2e7fdeea80f478eb Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 20 Sep 2023 14:55:43 +0800 Subject: [PATCH 
057/220] resolved_ts: remove hash set to save memory (#15554) close tikv/tikv#15553 The Resolver uses a hash set to keep track of locks associated with the same timestamp. When the length of the hash set reaches zero, it indicates that the transaction has been fully committed. To save memory, we can replace the hash set with an integer. Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../backup-stream/src/subscription_track.rs | 26 ++- components/cdc/src/initializer.rs | 14 +- components/resolved_ts/src/endpoint.rs | 38 ++-- components/resolved_ts/src/resolver.rs | 210 ++++++++++++------ 4 files changed, 181 insertions(+), 107 deletions(-) diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 0803ba1b99a..2dae8ce745d 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -8,7 +8,7 @@ use dashmap::{ }; use kvproto::metapb::Region; use raftstore::coprocessor::*; -use resolved_ts::{Resolver, TsSource}; +use resolved_ts::{Resolver, TsSource, TxnLocks}; use tikv_util::{info, memory::MemoryQuota, warn}; use txn_types::TimeStamp; @@ -99,7 +99,7 @@ impl ActiveSubscription { pub enum CheckpointType { MinTs, StartTsOfInitialScan, - StartTsOfTxn(Option>), + StartTsOfTxn(Option<(TimeStamp, TxnLocks)>), } impl std::fmt::Debug for CheckpointType { @@ -109,10 +109,7 @@ impl std::fmt::Debug for CheckpointType { Self::StartTsOfInitialScan => write!(f, "StartTsOfInitialScan"), Self::StartTsOfTxn(arg0) => f .debug_tuple("StartTsOfTxn") - .field(&format_args!( - "{}", - utils::redact(&arg0.as_ref().map(|x| x.as_ref()).unwrap_or(&[])) - )) + .field(&format_args!("{:?}", arg0)) .finish(), } } @@ -466,9 +463,11 @@ impl std::fmt::Debug for FutureLock { impl TwoPhaseResolver { /// try to get one of the key of the oldest lock in the resolver. 
- pub fn sample_far_lock(&self) -> Option> { - let (_, keys) = self.resolver.locks().first_key_value()?; - keys.iter().next().cloned() + pub fn sample_far_lock(&self) -> Option<(TimeStamp, TxnLocks)> { + self.resolver + .locks() + .first_key_value() + .map(|(ts, txn_locks)| (*ts, txn_locks.clone())) } pub fn in_phase_one(&self) -> bool { @@ -572,6 +571,7 @@ mod test { use kvproto::metapb::{Region, RegionEpoch}; use raftstore::coprocessor::ObserveHandle; + use resolved_ts::TxnLocks; use txn_types::TimeStamp; use super::{SubscriptionTracer, TwoPhaseResolver}; @@ -674,7 +674,13 @@ mod test { ( region(4, 8, 1), 128.into(), - StartTsOfTxn(Some(Arc::from(b"Alpi".as_slice()))) + StartTsOfTxn(Some(( + TimeStamp::new(128), + TxnLocks { + lock_count: 1, + sample_lock: Some(Arc::from(b"Alpi".as_slice())), + } + ))) ), ] ); diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 31cda4b9e72..504eab621ff 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -575,7 +575,6 @@ mod tests { time::Duration, }; - use collections::HashSet; use engine_rocks::RocksEngine; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; @@ -584,6 +583,7 @@ mod tests { errorpb::Error as ErrorHeader, }; use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; + use resolved_ts::TxnLocks; use test_raftstore::MockRaftStoreRouter; use tikv::storage::{ kv::Engine, @@ -681,7 +681,7 @@ mod tests { fn test_initializer_build_resolver() { let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); - let mut expected_locks = BTreeMap::>>::new(); + let mut expected_locks = BTreeMap::::new(); // Only observe ["", "b\0x90"] let observed_range = ObservedRange::new( @@ -704,10 +704,12 @@ mod tests { total_bytes += v.len(); let ts = TimeStamp::new(i as _); must_prewrite_put(&mut engine, k, v, k, ts); - expected_locks - .entry(ts) - .or_default() - 
.insert(k.to_vec().into()); + let txn_locks = expected_locks.entry(ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(k.to_vec().into()); + txn_locks + }); + txn_locks.lock_count += 1; } let region = Region::default(); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 600da207ec4..406d931ed7f 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -42,7 +42,7 @@ use crate::{ metrics::*, resolver::{LastAttempt, Resolver}, scanner::{ScanEntries, ScanTask, ScannerPool}, - Error, Result, TsSource, ON_DROP_WARN_HEAP_SIZE, + Error, Result, TsSource, TxnLocks, ON_DROP_WARN_HEAP_SIZE, }; /// grace period for identifying identifying slow resolved-ts and safe-ts. @@ -388,11 +388,11 @@ where E: KvEngine, S: StoreRegionMeta, { - fn is_leader(&self, store_id: Option, leader_store_id: Option) -> bool { - store_id.is_some() && store_id == leader_store_id - } - fn collect_stats(&mut self) -> Stats { + fn is_leader(store_id: Option, leader_store_id: Option) -> bool { + store_id.is_some() && store_id == leader_store_id + } + let store_id = self.get_or_init_store_id(); let mut stats = Stats::default(); self.region_read_progress.with(|registry| { @@ -407,10 +407,10 @@ where continue; } - if self.is_leader(store_id, leader_store_id) { + if is_leader(store_id, leader_store_id) { // leader resolved-ts if resolved_ts < stats.min_leader_resolved_ts.resolved_ts { - let resolver = self.regions.get(region_id).map(|x| &x.resolver); + let resolver = self.regions.get_mut(region_id).map(|x| &mut x.resolver); stats .min_leader_resolved_ts .set(*region_id, resolver, &core, &leader_info); @@ -1186,7 +1186,7 @@ struct LeaderStats { last_resolve_attempt: Option, applied_index: u64, // min lock in LOCK CF - min_lock: Option<(TimeStamp, Key)>, + min_lock: Option<(TimeStamp, TxnLocks)>, lock_num: Option, txn_num: Option, } @@ -1211,7 +1211,7 @@ impl LeaderStats { fn 
set( &mut self, region_id: u64, - resolver: Option<&Resolver>, + mut resolver: Option<&mut Resolver>, region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, leader_info: &LeaderInfo, ) { @@ -1222,21 +1222,13 @@ impl LeaderStats { duration_to_last_update_ms: region_read_progress .last_instant_of_update_ts() .map(|i| i.saturating_elapsed().as_millis() as u64), - last_resolve_attempt: resolver.and_then(|r| r.last_attempt.clone()), - min_lock: resolver.and_then(|r| { - r.oldest_transaction().map(|(ts, keys)| { - ( - *ts, - keys.iter() - .next() - .map(|k| Key::from_encoded_slice(k.as_ref())) - .unwrap_or_else(|| Key::from_encoded_slice("no_keys_found".as_ref())), - ) - }) - }), + last_resolve_attempt: resolver.as_mut().and_then(|r| r.take_last_attempt()), + min_lock: resolver + .as_ref() + .and_then(|r| r.oldest_transaction().map(|(t, tk)| (*t, tk.clone()))), applied_index: region_read_progress.applied_index(), - lock_num: resolver.map(|r| r.num_locks()), - txn_num: resolver.map(|r| r.num_transactions()), + lock_num: resolver.as_ref().map(|r| r.num_locks()), + txn_num: resolver.as_ref().map(|r| r.num_transactions()), }; } } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 85e7acff4a4..239ef566605 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -2,7 +2,7 @@ use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration}; -use collections::{HashMap, HashSet}; +use collections::{HashMap, HashMapEntry}; use raftstore::store::RegionReadProgress; use tikv_util::{ memory::{HeapSize, MemoryQuota, MemoryQuotaExceeded}, @@ -12,13 +12,12 @@ use txn_types::{Key, TimeStamp}; use crate::metrics::*; -const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; pub const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB #[derive(Clone)] pub enum TsSource { // A lock in LOCK CF - Lock(Arc<[u8]>), + Lock(TxnLocks), // A memory lock in concurrency manager MemoryLock(Key), PdTso, @@ 
-41,13 +40,38 @@ impl TsSource { pub fn key(&self) -> Option { match self { - TsSource::Lock(k) => Some(Key::from_encoded_slice(k)), + TsSource::Lock(locks) => locks + .sample_lock + .as_ref() + .map(|k| Key::from_encoded_slice(k)), TsSource::MemoryLock(k) => Some(k.clone()), _ => None, } } } +#[derive(Default, Clone, PartialEq, Eq)] +pub struct TxnLocks { + pub lock_count: usize, + // A sample key in a transaction. + pub sample_lock: Option>, +} + +impl std::fmt::Debug for TxnLocks { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TxnLocks") + .field("lock_count", &self.lock_count) + .field( + "sample_lock", + &self + .sample_lock + .as_ref() + .map(|k| log_wrappers::Value::key(k)), + ) + .finish() + } +} + // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. pub struct Resolver { @@ -55,7 +79,7 @@ pub struct Resolver { // key -> start_ts locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. - pub(crate) lock_ts_heap: BTreeMap>>, + lock_ts_heap: BTreeMap, // The last shrink time. last_aggressive_shrink_time: Instant, // The timestamps that guarantees no more commit will happen before. @@ -71,7 +95,7 @@ pub struct Resolver { // The memory quota for the `Resolver` and its lock keys and timestamps. memory_quota: Arc, // The last attempt of resolve(), used for diagnosis. - pub(crate) last_attempt: Option, + last_attempt: Option, } #[derive(Clone)] @@ -107,13 +131,14 @@ impl std::fmt::Debug for Resolver { let mut dt = f.debug_tuple("Resolver"); dt.field(&format_args!("region={}", self.region_id)); - if let Some((ts, keys)) = far_lock { + if let Some((ts, txn_locks)) = far_lock { + dt.field(&format_args!( + "oldest_lock_count={:?}", + txn_locks.lock_count + )); dt.field(&format_args!( - "oldest_lock={:?}", - keys.iter() - // We must use Display format here or the redact won't take effect. 
- .map(|k| format!("{}", log_wrappers::Value::key(k))) - .collect::>() + "oldest_lock_sample={:?}", + txn_locks.sample_lock )); dt.field(&format_args!("oldest_lock_ts={:?}", ts)); } @@ -180,7 +205,7 @@ impl Resolver { self.stopped } - pub fn locks(&self) -> &BTreeMap>> { + pub fn locks(&self) -> &BTreeMap { &self.lock_ts_heap } @@ -219,13 +244,13 @@ impl Resolver { } self.locks_by_key.len() * (key_bytes / key_count + std::mem::size_of::()) + self.lock_ts_heap.len() - * (std::mem::size_of::() + std::mem::size_of::>>()) + * (std::mem::size_of::() + std::mem::size_of::()) } fn lock_heap_size(&self, key: &[u8]) -> usize { // A resolver has // * locks_by_key: HashMap, TimeStamp> - // * lock_ts_heap: BTreeMap>> + // * lock_ts_heap: BTreeMap // // We only count memory used by locks_by_key. Because the majority of // memory is consumed by keys, locks_by_key and lock_ts_heap shares @@ -235,7 +260,7 @@ impl Resolver { key.heap_size() + std::mem::size_of::() } - fn shrink_ratio(&mut self, ratio: usize, timestamp: Option) { + fn shrink_ratio(&mut self, ratio: usize) { // HashMap load factor is 87% approximately, leave some margin to avoid // frequent rehash. // @@ -246,10 +271,6 @@ impl Resolver { { self.locks_by_key.shrink_to_fit(); } - if let Some(ts) = timestamp && let Some(lock_set) = self.lock_ts_heap.get_mut(&ts) - && lock_set.capacity() > lock_set.len() * cmp::max(MIN_SHRINK_RATIO, ratio) { - lock_set.shrink_to_fit(); - } } pub fn track_lock( @@ -273,8 +294,23 @@ impl Resolver { ); self.memory_quota.alloc(bytes)?; let key: Arc<[u8]> = key.into_boxed_slice().into(); - self.locks_by_key.insert(key.clone(), start_ts); - self.lock_ts_heap.entry(start_ts).or_default().insert(key); + match self.locks_by_key.entry(key) { + HashMapEntry::Occupied(_) => { + // Free memory quota because it's already in the map. + self.memory_quota.free(bytes); + } + HashMapEntry::Vacant(entry) => { + // Add lock count for the start ts. 
+ let txn_locks = self.lock_ts_heap.entry(start_ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(entry.key().clone()); + txn_locks + }); + txn_locks.lock_count += 1; + + entry.insert(start_ts); + } + } Ok(()) } @@ -301,22 +337,17 @@ impl Resolver { "memory_in_use" => self.memory_quota.in_use(), ); - let mut shrink_ts = None; - if let Some(locked_keys) = self.lock_ts_heap.get_mut(&start_ts) { - // Only shrink large set, because committing a small transaction is - // fast and shrink adds unnecessary overhead. - const SHRINK_SET_CAPACITY: usize = 256; - if locked_keys.capacity() > SHRINK_SET_CAPACITY { - shrink_ts = Some(start_ts); + if let Some(txn_locks) = self.lock_ts_heap.get_mut(&start_ts) { + if txn_locks.lock_count > 0 { + txn_locks.lock_count -= 1; } - locked_keys.remove(key); - if locked_keys.is_empty() { + if txn_locks.lock_count == 0 { self.lock_ts_heap.remove(&start_ts); } - } + }; // Use a large ratio to amortize the cost of rehash. let shrink_ratio = 8; - self.shrink_ratio(shrink_ratio, shrink_ts); + self.shrink_ratio(shrink_ratio); } /// Try to advance resolved ts. @@ -333,7 +364,7 @@ impl Resolver { const AGGRESSIVE_SHRINK_RATIO: usize = 2; const AGGRESSIVE_SHRINK_INTERVAL: Duration = Duration::from_secs(10); if self.last_aggressive_shrink_time.saturating_elapsed() > AGGRESSIVE_SHRINK_INTERVAL { - self.shrink_ratio(AGGRESSIVE_SHRINK_RATIO, None); + self.shrink_ratio(AGGRESSIVE_SHRINK_RATIO); self.last_aggressive_shrink_time = Instant::now_coarse(); } @@ -344,17 +375,17 @@ impl Resolver { } // Find the min start ts. - let min_lock = self - .oldest_transaction() - .and_then(|(ts, locks)| locks.iter().next().map(|lock| (*ts, lock))); + let min_lock = self.oldest_transaction(); let has_lock = min_lock.is_some(); - let min_start_ts = min_lock.map(|(ts, _)| ts).unwrap_or(min_ts); + let min_start_ts = min_lock.as_ref().map(|(ts, _)| **ts).unwrap_or(min_ts); // No more commit happens before the ts. 
let new_resolved_ts = cmp::min(min_start_ts, min_ts); // reason is the min source of the new resolved ts. let reason = match (min_lock, min_ts) { - (Some(lock), min_ts) if lock.0 < min_ts => TsSource::Lock(lock.1.clone()), + (Some((lock_ts, txn_locks)), min_ts) if *lock_ts < min_ts => { + TsSource::Lock(txn_locks.clone()) + } (Some(_), _) => source, (None, _) => source, }; @@ -400,21 +431,16 @@ impl Resolver { pub(crate) fn log_locks(&self, min_start_ts: u64) { // log lock with the minimum start_ts >= min_start_ts - if let Some((start_ts, keys)) = self + if let Some((start_ts, txn_locks)) = self .lock_ts_heap .range(TimeStamp::new(min_start_ts)..) .next() { - let keys_for_log = keys - .iter() - .map(|key| log_wrappers::Value::key(key)) - .take(MAX_NUMBER_OF_LOCKS_IN_LOG) - .collect::>(); info!( "locks with the minimum start_ts in resolver"; "region_id" => self.region_id, "start_ts" => start_ts, - "sampled_keys" => ?keys_for_log, + "txn_locks" => ?txn_locks, ); } } @@ -431,9 +457,13 @@ impl Resolver { self.read_progress.as_ref() } - pub(crate) fn oldest_transaction(&self) -> Option<(&TimeStamp, &HashSet>)> { + pub(crate) fn oldest_transaction(&self) -> Option<(&TimeStamp, &TxnLocks)> { self.lock_ts_heap.iter().next() } + + pub(crate) fn take_last_attempt(&mut self) -> Option { + self.last_attempt.take() + } } #[cfg(test)] @@ -608,32 +638,76 @@ mod tests { } #[test] - fn test_untrack_lock_set_shrink_ratio() { + fn test_idempotent_track_and_untrack_lock() { let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); let mut resolver = Resolver::new(1, memory_quota); let mut key = vec![0; 16]; - let ts = TimeStamp::new(1); - for i in 0..1000usize { - key[0..8].copy_from_slice(&i.to_be_bytes()); - let _ = resolver.track_lock(ts, key.clone(), None); + + // track_lock + let mut ts = TimeStamp::default(); + for c in 0..10 { + ts.incr(); + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + 
let _ = resolver.track_lock(ts, key.clone(), None); + } + let in_use1 = resolver.memory_quota.in_use(); + let key_count1 = resolver.locks_by_key.len(); + let txn_count1 = resolver.lock_ts_heap.len(); + let txn_lock_count1 = resolver.lock_ts_heap[&ts].lock_count; + assert!(in_use1 > 0); + assert_eq!(key_count1, (c + 1) * 100); + assert_eq!(txn_count1, c + 1); + + // Put same keys again, resolver internal state must be idempotent. + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + let in_use2 = resolver.memory_quota.in_use(); + let key_count2 = resolver.locks_by_key.len(); + let txn_count2 = resolver.lock_ts_heap.len(); + let txn_lock_count2 = resolver.lock_ts_heap[&ts].lock_count; + assert_eq!(in_use1, in_use2); + assert_eq!(key_count1, key_count2); + assert_eq!(txn_count1, txn_count2); + assert_eq!(txn_lock_count1, txn_lock_count2); } - assert!( - resolver.lock_ts_heap[&ts].capacity() >= 1000, - "{}", - resolver.lock_ts_heap[&ts].capacity() - ); + assert_eq!(resolver.resolve(ts, None, TsSource::PdTso), 1.into()); - for i in 0..990usize { - key[0..8].copy_from_slice(&i.to_be_bytes()); - resolver.untrack_lock(&key, None); + // untrack_lock + let mut ts = TimeStamp::default(); + for _ in 0..10 { + ts.incr(); + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + let in_use1 = resolver.memory_quota.in_use(); + let key_count1 = resolver.locks_by_key.len(); + let txn_count1 = resolver.lock_ts_heap.len(); + + // Unlock same keys again, resolver internal state must be idempotent. 
+ for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + let in_use2 = resolver.memory_quota.in_use(); + let key_count2 = resolver.locks_by_key.len(); + let txn_count2 = resolver.lock_ts_heap.len(); + assert_eq!(in_use1, in_use2); + assert_eq!(key_count1, key_count2); + assert_eq!(txn_count1, txn_count2); + + assert_eq!(resolver.resolve(ts, None, TsSource::PdTso), ts); } - // shrink_to_fit may reserve some space in accordance with the resize - // policy, but it is expected to be less than 100. - assert!( - resolver.lock_ts_heap[&ts].capacity() < 500, - "{}, {}", - resolver.lock_ts_heap[&ts].capacity(), - resolver.lock_ts_heap[&ts].len(), - ); + + assert_eq!(resolver.memory_quota.in_use(), 0); + assert_eq!(resolver.locks_by_key.len(), 0); + assert_eq!(resolver.lock_ts_heap.len(), 0); } } From 6971a4635b6b3a27b5be3db0fc4c8200d995d605 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Wed, 20 Sep 2023 13:09:12 -0700 Subject: [PATCH 058/220] upgrade flatbuffers from 2.1.2 to 23.5.26 to address security issue (#15628) ref tikv/tikv#15621 The security issue is https://github.com/google/flatbuffers/issues/6627. Upgrade flatbuffers from 2.1.2 to 23.5.26 to address it. 
Signed-off-by: tonyxuqqi Signed-off-by: Qi Xu Co-authored-by: Qi Xu --- Cargo.lock | 539 ++++++++++++++---- components/backup-stream/Cargo.toml | 2 +- components/backup-stream/src/router.rs | 1 + .../src/codec/mysql/time/mod.rs | 3 + .../src/codec/mysql/time/tz.rs | 4 + tests/Cargo.toml | 2 +- 6 files changed, 453 insertions(+), 98 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 34f9c381958..4f35ae6b935 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" dependencies = [ - "getrandom 0.2.3", + "getrandom 0.2.10", "once_cell", "version_check 0.9.4", ] @@ -59,6 +59,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ "cfg-if 1.0.0", + "const-random", + "getrandom 0.2.10", "once_cell", "version_check 0.9.4", ] @@ -78,6 +80,21 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4f263788a35611fba42eb41ff811c5d0360c58b97402570312a350736e2542e" +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc 0.2.146", +] + [[package]] name = "ansi_term" version = "0.11.0" @@ -133,28 +150,215 @@ dependencies = [ [[package]] name = "arrow" -version = "13.0.0" +version = "46.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6bee230122beb516ead31935a61f683715f987c6f003eff44ad6986624105a" +checksum = 
"04a8801ebb147ad240b2d978d3ab9f73c9ccd4557ba6a03e7800496770ed10e0" dependencies = [ - "bitflags", + "ahash 0.8.3", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "895263144bd4a69751cbe6a34a53f26626e19770b313a9fa792c415cd0e78f11" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "num 0.4.1", +] + +[[package]] +name = "arrow-array" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "226fdc6c3a4ae154a74c24091d36a90b514f0ed7112f5b8322c1d8f354d8e20d" +dependencies = [ + "ahash 0.8.3", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "hashbrown 0.14.0", + "num 0.4.1", +] + +[[package]] +name = "arrow-buffer" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc4843af4dd679c2f35b69c572874da8fde33be53eb549a5fb128e7a4b763510" +dependencies = [ + "bytes", + "half 2.3.1", + "num 0.4.1", +] + +[[package]] +name = "arrow-cast" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35e8b9990733a9b635f656efda3c9b8308c7a19695c9ec2c7046dd154f9b144b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "chrono", + "half 2.3.1", + "lexical-core", + "num 0.4.1", +] + +[[package]] +name = "arrow-csv" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646fbb4e11dd0afb8083e883f53117713b8caadb4413b3c9e63e3f535da3683c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", "chrono", "csv", - "flatbuffers", - 
"half", - "hex 0.4.2", - "indexmap", + "csv-core", "lazy_static", "lexical-core", - "multiversion", - "num 0.4.0", - "rand 0.8.5", "regex", +] + +[[package]] +name = "arrow-data" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da900f31ff01a0a84da0572209be72b2b6f980f3ea58803635de47913191c188" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half 2.3.1", + "num 0.4.1", +] + +[[package]] +name = "arrow-ipc" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2707a8d7ee2d345d045283ece3ae43416175873483e5d96319c929da542a0b1f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d1b91a63c356d14eedc778b76d66a88f35ac8498426bb0799a769a49a74a8b4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.3.1", + "indexmap 2.0.0", + "lexical-core", + "num 0.4.1", "serde", - "serde_derive", "serde_json", ] +[[package]] +name = "arrow-ord" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "584325c91293abbca7aaaabf8da9fe303245d641f5f4a18a6058dc68009c7ebf" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half 2.3.1", + "num 0.4.1", +] + +[[package]] +name = "arrow-row" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e32afc1329f7b372463b21c6ca502b07cf237e1ed420d87706c1770bb0ebd38" +dependencies = [ + "ahash 0.8.3", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half 2.3.1", + "hashbrown 0.14.0", +] + +[[package]] +name = "arrow-schema" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "b104f5daa730f00fde22adc03a12aa5a2ae9ccbbf99cbd53d284119ddc90e03d" + +[[package]] +name = "arrow-select" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b3ca55356d1eae07cf48808d8c462cea674393ae6ad1e0b120f40b422eb2b4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num 0.4.1", +] + +[[package]] +name = "arrow-string" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1433ce02590cae68da0a18ed3a3ed868ffac2c6f24c533ddd2067f7ee04b4a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "num 0.4.1", + "regex", + "regex-syntax 0.7.5", +] + [[package]] name = "async-channel" version = "1.6.1" @@ -407,7 +611,7 @@ dependencies = [ "bytes", "dyn-clone", "futures 0.3.15", - "getrandom 0.2.3", + "getrandom 0.2.10", "http-types", "log", "paste", @@ -591,7 +795,7 @@ dependencies = [ "futures-io", "grpcio", "hex 0.4.2", - "indexmap", + "indexmap 1.9.3", "kvproto", "lazy_static", "log_wrappers", @@ -807,9 +1011,9 @@ checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" [[package]] name = "bytes" -version = "1.0.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -908,11 +1112,12 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.73" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "jobserver", + "libc 0.2.146", ] [[package]] @@ -984,14 +1189,17 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 
[[package]] name = "chrono" -version = "0.4.11" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80094f509cf8b5ae86a4966a39b3ff66cd7e2a3e594accec3743ff3fabeab5b2" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" dependencies = [ - "num-integer", + "android-tzdata", + "iana-time-zone", + "js-sys", "num-traits", "serde", - "time 0.1.42", + "wasm-bindgen", + "windows-targets", ] [[package]] @@ -1039,7 +1247,7 @@ dependencies = [ "atty", "bitflags", "clap_derive", - "indexmap", + "indexmap 1.9.3", "lazy_static", "os_str_bytes", "strsim 0.10.0", @@ -1138,6 +1346,28 @@ dependencies = [ "cache-padded", ] +[[package]] +name = "const-random" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +dependencies = [ + "getrandom 0.2.10", + "once_cell", + "proc-macro-hack", + "tiny-keccak", +] + [[package]] name = "const_format" version = "0.2.30" @@ -1179,9 +1409,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpu-time" @@ -1360,6 +1590,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-common" version = "0.1.6" @@ -1781,6 
+2017,12 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.2.8" @@ -2063,13 +2305,12 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "2.1.2" +version = "23.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b428b715fdbdd1c364b84573b5fdc0f84f8e423661b9f398735278bc7f2b6a" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" dependencies = [ "bitflags", - "smallvec", - "thiserror", + "rustc_version 0.4.0", ] [[package]] @@ -2403,14 +2644,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.3" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if 1.0.0", "js-sys", "libc 0.2.146", - "wasi 0.10.2+wasi-snapshot-preview1", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2514,7 +2755,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap", + "indexmap 1.9.3", "slab", "tokio", "tokio-util", @@ -2527,11 +2768,22 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if 1.0.0", + "crunchy", + "num-traits", +] + [[package]] name = "hashbrown" -version = "0.9.1" +version = "0.12.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hashbrown" @@ -2751,6 +3003,29 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -2776,12 +3051,22 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" [[package]] name = "indexmap" -version = "1.6.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", - "hashbrown 0.9.1", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +dependencies = [ + "equivalent", + "hashbrown 0.14.0", ] [[package]] @@ -2798,7 +3083,7 @@ checksum = "16d4bde3a7105e59c66a4104cfe9606453af1c7a0eac78cb7d5bc263eb762a70" dependencies = [ "ahash 0.7.4", "atty", - "indexmap", + "indexmap 1.9.3", "itoa 1.0.1", "lazy_static", "log", @@ -2949,7 +3234,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "2d63b6407b66fc81fc539dccf3ddecb669f393c5101b6a2be3976c95099a06e8" dependencies = [ - "indexmap", + "indexmap 1.9.3", ] [[package]] @@ -3085,6 +3370,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "libm" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" + [[package]] name = "libmimalloc-sys" version = "0.1.21" @@ -3438,26 +3729,6 @@ dependencies = [ "serde", ] -[[package]] -name = "multiversion" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" -dependencies = [ - "multiversion-macros", -] - -[[package]] -name = "multiversion-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.103", -] - [[package]] name = "mur3" version = "0.1.0" @@ -3604,15 +3875,15 @@ dependencies = [ [[package]] name = "num" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ "num-bigint", - "num-complex 0.4.1", + "num-complex 0.4.4", "num-integer", "num-iter", - "num-rational 0.4.0", + "num-rational 0.4.1", "num-traits", ] @@ -3638,9 +3909,9 @@ dependencies = [ [[package]] name = "num-complex" -version = "0.4.1" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fbc387afefefd5e9e39493299f3069e14a140dd34dc19b4c1c1a8fddb6a790" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" dependencies = [ "num-traits", ] @@ -3668,9 +3939,9 @@ dependencies = [ [[package]] name = 
"num-integer" -version = "0.1.44" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -3678,9 +3949,9 @@ dependencies = [ [[package]] name = "num-iter" -version = "0.1.42" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" dependencies = [ "autocfg", "num-integer", @@ -3700,9 +3971,9 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", "num-bigint", @@ -3712,11 +3983,12 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -3746,7 +4018,7 @@ checksum = "80e47cfc4c0a1a519d9a025ebfbac3a2439d1b5cdf397d72dcb79b11d9920dab" dependencies = [ "base64 0.13.0", "chrono", - "getrandom 0.2.3", + "getrandom 0.2.10", "http", "rand 0.8.5", "serde", @@ -4037,7 +4309,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" dependencies = [ "fixedbitset", - "indexmap", + "indexmap 1.9.3", ] [[package]] @@ -4769,7 +5041,7 @@ version = "0.6.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" dependencies = [ - "getrandom 0.2.3", + "getrandom 0.2.10", ] [[package]] @@ -4862,19 +5134,19 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ - "getrandom 0.2.3", + "getrandom 0.2.10", "redox_syscall 0.2.11", ] [[package]] name = "regex" -version = "1.5.6" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" +checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.6.29", ] [[package]] @@ -4888,9 +5160,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.26" +version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "remove_dir_all" @@ -5433,7 +5711,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622" dependencies = [ - "half", + "half 1.8.2", "serde", ] @@ -5463,7 +5741,7 @@ version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" dependencies = [ - "indexmap", + "indexmap 1.9.3", "itoa 0.4.4", "ryu", "serde", @@ -7073,6 +7351,15 @@ dependencies = [ "time-core", ] 
+[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.0" @@ -7274,7 +7561,7 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", - "indexmap", + "indexmap 1.9.3", "pin-project", "pin-project-lite", "rand 0.8.5", @@ -7512,7 +7799,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" dependencies = [ - "getrandom 0.2.3", + "getrandom 0.2.10", "serde", ] @@ -7522,7 +7809,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" dependencies = [ - "getrandom 0.2.3", + "getrandom 0.2.10", ] [[package]] @@ -7598,12 +7885,6 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b89c3ce4ce14bdc6fb6beaf9ec7928ca331de5df7e5ea278375642a2f478570d" -[[package]] -name = "wasi" -version = "0.10.2+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -7742,6 +8023,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.32.0" @@ -7761,21 +8051,42 @@ 
version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", + "windows_aarch64_gnullvm 0.42.0", "windows_aarch64_msvc 0.42.0", "windows_i686_gnu 0.42.0", "windows_i686_msvc 0.42.0", "windows_x86_64_gnu 0.42.0", - "windows_x86_64_gnullvm", + "windows_x86_64_gnullvm 0.42.0", "windows_x86_64_msvc 0.42.0", ] +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_msvc" version = "0.32.0" @@ -7788,6 +8099,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_i686_gnu" version = "0.32.0" @@ -7800,6 +8117,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_msvc" version = "0.32.0" @@ -7812,6 +8135,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_x86_64_gnu" version = "0.32.0" @@ -7824,12 +8153,24 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_msvc" version = "0.32.0" @@ -7842,6 +8183,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "winreg" version = "0.7.0" diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 8c1edc89a48..4f53c39b9db 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -51,7 +51,7 @@ futures-io = "0.3" grpcio = { workspace = true } hex = "0.4" # Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95 -indexmap = "=1.6.2" +indexmap = "=1.9.3" kvproto = { workspace = true } lazy_static = "1.4" log_wrappers = { workspace = true } diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index b2fd9acc743..ae4b98b1687 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -731,6 +731,7 @@ impl TempFileKey { } } + #[allow(deprecated)] fn format_date_time(ts: u64, t: FormatType) -> impl Display { use chrono::prelude::*; let millis = TimeStamp::physical(ts.into()); diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 44228f2d88e..621d4384bcc 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -1342,6 +1342,7 @@ impl Time { Ok((((ymd << 17) | hms) << 24) | u64::from(self.micro())) } + #[allow(deprecated)] pub fn from_duration( ctx: &mut EvalContext, duration: Duration, @@ -1415,6 +1416,7 @@ impl Time { .ok_or_else(|| Error::incorrect_datetime_value(self)) } + #[allow(deprecated)] pub fn normalized(self, ctx: &mut EvalContext) -> Result { if self.get_time_type() == TimeType::Timestamp { return Ok(self); @@ -1500,6 +1502,7 @@ impl Time { + self.day()) as i32 } + #[allow(deprecated)] pub fn weekday(self) -> Weekday { let date = if self.month() == 0 { NaiveDate::from_ymd(self.year() as i32 - 1, 12, 1) diff --git 
a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs index 25b35a90fc0..9dfc3ebf288 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs @@ -120,6 +120,7 @@ impl TimeZone for Tz { } } + #[allow(deprecated)] fn from_local_date(&self, local: &NaiveDate) -> LocalResult> { match *self { Tz::Local(ref offset) => offset @@ -134,6 +135,7 @@ impl TimeZone for Tz { } } + #[allow(deprecated)] fn from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult> { match *self { Tz::Local(ref offset) => offset @@ -148,6 +150,7 @@ impl TimeZone for Tz { } } + #[allow(deprecated)] fn from_utc_date(&self, utc: &NaiveDate) -> Date { match *self { Tz::Local(ref offset) => { @@ -165,6 +168,7 @@ impl TimeZone for Tz { } } + #[allow(deprecated)] fn from_utc_datetime(&self, utc: &NaiveDateTime) -> DateTime { match *self { Tz::Local(ref offset) => { diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 158e56abcb1..0081d5e95bc 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -120,7 +120,7 @@ uuid = { version = "0.8.1", features = ["serde", "v4"] } procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } [dev-dependencies] -arrow = "13.0" +arrow = "46.0" byteorder = "1.2" # See https://bheisler.github.io/criterion.rs/book/user_guide/known_limitations.html for the usage # of `real_blackbox` feature. 
From 533b205efd231f13ca716e40a0cc33fa59ee6809 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 21 Sep 2023 14:37:43 +0800 Subject: [PATCH 059/220] raft-engine: update raft-engine to newest version (#15559) close tikv/tikv#15462 Signed-off-by: glorv Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 23 +++++++++++++++++------ components/raft_log_engine/Cargo.toml | 3 +++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f35ae6b935..f4adccf26fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3927,6 +3927,17 @@ dependencies = [ "syn 1.0.103", ] +[[package]] +name = "num-derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "num-format" version = "0.4.0" @@ -4756,8 +4767,8 @@ dependencies = [ [[package]] name = "raft-engine" -version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#de3ad04a2db9cdf795b1c82d7413b9b53bac92a8" +version = "0.4.1" +source = "git+https://github.com/tikv/raft-engine.git#fa56f891fdf0b1cb5b7849b7bee3c5dadbb96103" dependencies = [ "byteorder", "crc32fast", @@ -4773,7 +4784,7 @@ dependencies = [ "lz4-sys", "memmap2 0.7.0", "nix 0.26.2", - "num-derive", + "num-derive 0.4.0", "num-traits", "parking_lot 0.12.1", "prometheus", @@ -4790,8 +4801,8 @@ dependencies = [ [[package]] name = "raft-engine-ctl" -version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#de3ad04a2db9cdf795b1c82d7413b9b53bac92a8" +version = "0.4.1" +source = "git+https://github.com/tikv/raft-engine.git#fa56f891fdf0b1cb5b7849b7bee3c5dadbb96103" dependencies = [ "clap 3.1.6", "env_logger 0.10.0", @@ -6873,7 +6884,7 @@ dependencies = [ "match-template", "nom 7.1.0", "num 0.3.0", - "num-derive", + "num-derive 0.3.0", "num-traits", 
"ordered-float", "protobuf", diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index e643089a872..0e640991eea 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" publish = false edition = "2021" +[features] +failpoints = ["raft-engine/failpoints"] + [dependencies] encryption = { workspace = true } engine_traits = { workspace = true } From 241b8f53d3b35ba6b0ff5d905527f93528af192a Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:22:45 +0800 Subject: [PATCH 060/220] raftstore-v2: support online change lock write buffer limit (#15632) ref tikv/tikv#14320 support online change lock write buffer limit Signed-off-by: SpadeA-Tang --- Cargo.lock | 6 ++-- components/engine_rocks/src/cf_options.rs | 17 ++++++++++ src/config/configurable.rs | 17 ++++++++++ src/config/mod.rs | 38 +++++++++++++++++++++++ 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4adccf26fc..f05b651b1ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3388,7 +3388,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#fc38a5b427e6c9b351f835c641e2ee95b8ff8306" +source = "git+https://github.com/tikv/rust-rocksdb.git#f04f4dd8eacc30e67c24bc2529a6d9c6edb85f8f" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3407,7 +3407,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#fc38a5b427e6c9b351f835c641e2ee95b8ff8306" +source = "git+https://github.com/tikv/rust-rocksdb.git#f04f4dd8eacc30e67c24bc2529a6d9c6edb85f8f" dependencies = [ "bzip2-sys", "cc", @@ -5379,7 +5379,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#fc38a5b427e6c9b351f835c641e2ee95b8ff8306" +source = 
"git+https://github.com/tikv/rust-rocksdb.git#f04f4dd8eacc30e67c24bc2529a6d9c6edb85f8f" dependencies = [ "libc 0.2.146", "librocksdb_sys", diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index 1162c67f210..6a2372fb31f 100644 --- a/components/engine_rocks/src/cf_options.rs +++ b/components/engine_rocks/src/cf_options.rs @@ -40,6 +40,23 @@ impl RocksCfOptions { pub fn into_raw(self) -> RawCfOptions { self.0 } + + pub fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + pub fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } impl Deref for RocksCfOptions { diff --git a/src/config/configurable.rs b/src/config/configurable.rs index 6fe9409c1c0..c92b01cf465 100644 --- a/src/config/configurable.rs +++ b/src/config/configurable.rs @@ -15,6 +15,7 @@ pub trait ConfigurableDb { fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes; fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes; fn set_flush_size(&self, f: usize) -> ConfigRes; + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes; fn set_flush_oldest_first(&self, f: bool) -> ConfigRes; fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes; fn set_high_priority_background_threads(&self, n: i32, allow_reduce: bool) -> ConfigRes; @@ -57,6 +58,11 @@ impl ConfigurableDb for RocksEngine { opt.set_flush_size(f).map_err(Box::from) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + let mut cf_option = self.get_options_cf(cf)?; + cf_option.set_flush_size(f).map_err(Box::from) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { let mut opt = self.get_db_options(); 
opt.set_flush_oldest_first(f).map_err(Box::from) @@ -171,6 +177,17 @@ impl ConfigurableDb for TabletRegistry { }) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_cf_flush_size(cf, f)?; + Ok(false) + } else { + Ok(true) + } + }) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { loop_registry(self, |cache| { if let Some(latest) = cache.latest() { diff --git a/src/config/mod.rs b/src/config/mod.rs index 6b3332fb015..9b8ecad50f9 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2031,6 +2031,15 @@ impl ConfigManager for DbConfigManger { cf_change.insert(name, value); } } + if let Some(f) = cf_change.remove("write_buffer_limit") { + if cf_name != CF_LOCK { + return Err( + "cf write buffer manager is only supportted for lock cf now".into() + ); + } + let size: ReadableSize = f.into(); + self.db.set_cf_flush_size(cf_name, size.0 as usize)?; + } if !cf_change.is_empty() { let cf_change = config_value_to_string(cf_change.into_iter().collect()); let cf_change_slice = config_to_slice(&cf_change); @@ -5167,6 +5176,7 @@ mod tests { cfg.rocksdb.defaultcf.block_cache_size = Some(ReadableSize::mb(8)); cfg.rocksdb.rate_bytes_per_sec = ReadableSize::mb(64); cfg.rocksdb.rate_limiter_auto_tuned = false; + cfg.rocksdb.lockcf.write_buffer_limit = Some(ReadableSize::mb(1)); cfg.validate().unwrap(); let (storage, cfg_controller, ..) 
= new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -5209,6 +5219,34 @@ mod tests { let flush_size = db.get_db_options().get_flush_size().unwrap(); assert_eq!(flush_size, ReadableSize::mb(10).0); + cfg_controller + .update_config("rocksdb.lockcf.write-buffer-limit", "22MB") + .unwrap(); + let cf_opt = db.get_options_cf("lock").unwrap(); + let flush_size = cf_opt.get_flush_size().unwrap(); + assert_eq!(flush_size, ReadableSize::mb(22).0); + + cfg_controller + .update_config("rocksdb.lockcf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("lock").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + + cfg_controller + .update_config("rocksdb.writecf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("write").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + + cfg_controller + .update_config("rocksdb.defaultcf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("default").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + // update some configs on default cf let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), false); From 9b76ac97e1de01c1b0e70af406720b2c368d9624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 21 Sep 2023 21:39:15 +0800 Subject: [PATCH 061/220] log-bakcup: make initial scan asynchronous (#15541) ref tikv/tikv#15410 This PR also removed some fields in `Endpoint`, now they should be in the `InitialDataLoader`. The latter will communicate with the former by messages. 
Signed-off-by: hillium Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup-stream/src/endpoint.rs | 156 ++++++++----- components/backup-stream/src/event_loader.rs | 215 ++++++++---------- .../backup-stream/src/subscription_manager.rs | 213 +++++++++-------- .../backup-stream/src/subscription_track.rs | 2 + components/backup-stream/src/utils.rs | 64 +----- .../backup-stream/tests/integration/mod.rs | 22 ++ components/backup-stream/tests/suite.rs | 12 +- components/raftstore/src/router.rs | 32 ++- src/config/mod.rs | 5 + 9 files changed, 357 insertions(+), 364 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index b11259d5be6..834a40f8bdd 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1,16 +1,24 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{any::Any, collections::HashSet, fmt, marker::PhantomData, sync::Arc, time::Duration}; +use std::{ + any::Any, + collections::HashSet, + fmt, + marker::PhantomData, + sync::{Arc, Mutex}, + time::Duration, +}; use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use error_code::ErrorCodeExt; -use futures::{stream::AbortHandle, FutureExt}; +use futures::{stream::AbortHandle, FutureExt, TryFutureExt}; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, }; use pd_client::PdClient; +use raft::StateRole; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, router::CdcHandle, @@ -30,7 +38,7 @@ use tikv_util::{ use tokio::{ io::Result as TokioResult, runtime::{Handle, Runtime}, - sync::oneshot, + sync::{oneshot, Semaphore}, }; use tokio_stream::StreamExt; use txn_types::TimeStamp; @@ -60,7 +68,7 @@ const SLOW_EVENT_THRESHOLD: f64 = 120.0; /// task has fatal error. 
const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; -pub struct Endpoint { +pub struct Endpoint { // Note: those fields are more like a shared context between components. // For now, we copied them everywhere, maybe we'd better extract them into a // context type. @@ -69,7 +77,6 @@ pub struct Endpoint { pub(crate) store_id: u64, pub(crate) regions: R, pub(crate) engine: PhantomData, - pub(crate) router: RT, pub(crate) pd_client: Arc, pub(crate) subs: SubscriptionTracer, pub(crate) concurrency_manager: ConcurrencyManager, @@ -78,8 +85,6 @@ pub struct Endpoint { pub range_router: Router, observer: BackupStreamObserver, pool: Runtime, - initial_scan_memory_quota: PendingMemoryQuota, - initial_scan_throughput_quota: Limiter, region_operator: RegionSubscriptionManager, failover_time: Option, // We holds the config before, even it is useless for now, @@ -92,17 +97,17 @@ pub struct Endpoint { /// This is used for simulating an asynchronous background worker. /// Each time we spawn a task, once time goes by, we abort that task. 
pub abort_last_storage_save: Option, + pub initial_scan_semaphore: Arc, } -impl Endpoint +impl Endpoint where R: RegionInfoProvider + 'static + Clone, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, S: MetaStore + 'static, { - pub fn new( + pub fn new + 'static>( store_id: u64, store: S, config: BackupStreamConfig, @@ -145,17 +150,21 @@ where info!("the endpoint of stream backup started"; "path" => %config.temp_path); let subs = SubscriptionTracer::default(); + let initial_scan_semaphore = Arc::new(Semaphore::new(config.initial_scan_concurrency)); let (region_operator, op_loop) = RegionSubscriptionManager::start( InitialDataLoader::new( - router.clone(), - accessor.clone(), range_router.clone(), subs.clone(), scheduler.clone(), - initial_scan_memory_quota.clone(), - pool.handle().clone(), - initial_scan_throughput_quota.clone(), + initial_scan_memory_quota, + initial_scan_throughput_quota, + // NOTE: in fact we can get rid of the `Arc`. Just need to warp the router when the + // scanner pool is created. But at that time the handle has been sealed in the + // `InitialScan` trait -- we cannot do that. 
+ Arc::new(Mutex::new(router)), + Arc::clone(&initial_scan_semaphore), ), + accessor.clone(), observer.clone(), meta_client.clone(), pd_client.clone(), @@ -166,6 +175,7 @@ where let mut checkpoint_mgr = CheckpointManager::default(); pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); let ep = Endpoint { + initial_scan_semaphore, meta_client, range_router, scheduler, @@ -174,12 +184,9 @@ where store_id, regions: accessor, engine: PhantomData, - router, pd_client, subs, concurrency_manager, - initial_scan_memory_quota, - initial_scan_throughput_quota, region_operator, failover_time: None, config, @@ -191,12 +198,11 @@ where } } -impl Endpoint +impl Endpoint where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, { fn get_meta_client(&self) -> MetadataClient { @@ -494,20 +500,6 @@ where }); } - /// Make an initial data loader using the resource of the endpoint. - pub fn make_initial_loader(&self) -> InitialDataLoader { - InitialDataLoader::new( - self.router.clone(), - self.regions.clone(), - self.range_router.clone(), - self.subs.clone(), - self.scheduler.clone(), - self.initial_scan_memory_quota.clone(), - self.pool.handle().clone(), - self.initial_scan_throughput_quota.clone(), - ) - } - pub fn handle_watch_task(&self, op: TaskOp) { match op { TaskOp::AddTask(task) => { @@ -525,13 +517,12 @@ where } } - async fn observe_and_scan_region( + async fn observe_regions_in_range( &self, - init: InitialDataLoader, task: &StreamTask, start_key: Vec, end_key: Vec, - ) -> Result<()> { + ) { let start = Instant::now_coarse(); let success = self .observer @@ -549,7 +540,9 @@ where // directly and this would be fast. If this gets slow, maybe make it async // again. (Will that bring race conditions? say `Start` handled after // `ResfreshResolver` of some region.) 
- let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); + let range_init_result = self + .initialize_range(start_key.clone(), end_key.clone()) + .await; match range_init_result { Ok(()) => { info!("backup stream success to initialize"; @@ -561,6 +554,45 @@ where e.report("backup stream initialize failed"); } } + } + + /// initialize a range: it simply scan the regions with leader role and send + /// them to [`initialize_region`]. + pub async fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { + // Generally we will be very very fast to consume. + // Directly clone the initial data loader to the background thread looks a + // little heavier than creating a new channel. TODO: Perhaps we need a + // handle to the `InitialDataLoader`. Making it a `Runnable` worker might be a + // good idea. + let (tx, mut rx) = tokio::sync::mpsc::channel(1); + self.regions + .seek_region( + &start_key, + Box::new(move |i| { + // Ignore the error, this can only happen while the server is shutting down, the + // future has been canceled. + let _ = i + .filter(|r| r.role == StateRole::Leader) + .take_while(|r| r.region.start_key < end_key) + .try_for_each(|r| { + tx.blocking_send(ObserveOp::Start { + region: r.region.clone(), + }) + }); + }), + ) + .map_err(|err| { + Error::Other(box_err!( + "failed to seek region for start key {}: {}", + utils::redact(&start_key), + err + )) + })?; + // Don't reschedule this command: or once the endpoint's mailbox gets + // full, the system might deadlock. + while let Some(cmd) = rx.recv().await { + self.region_operator.request(cmd).await; + } Ok(()) } @@ -578,7 +610,6 @@ where /// Load the task into memory: this would make the endpint start to observe. 
fn load_task(&self, task: StreamTask) { let cli = self.meta_client.clone(); - let init = self.make_initial_loader(); let range_router = self.range_router.clone(); info!( @@ -621,10 +652,8 @@ where .await?; for (start_key, end_key) in ranges { - let init = init.clone(); - - self.observe_and_scan_region(init, &task, start_key, end_key) - .await? + self.observe_regions_in_range(&task, start_key, end_key) + .await } info!( "finish register backup stream ranges"; @@ -859,11 +888,16 @@ where } fn on_update_change_config(&mut self, cfg: BackupStreamConfig) { + let concurrency_diff = + cfg.initial_scan_concurrency as isize - self.config.initial_scan_concurrency as isize; info!( "update log backup config"; "config" => ?cfg, + "concurrency_diff" => concurrency_diff, ); self.range_router.udpate_config(&cfg); + self.update_semaphore_capacity(&self.initial_scan_semaphore, concurrency_diff); + self.config = cfg; } @@ -873,6 +907,24 @@ where self.pool.block_on(self.region_operator.request(op)); } + fn update_semaphore_capacity(&self, sema: &Arc, diff: isize) { + use std::cmp::Ordering::*; + match diff.cmp(&0) { + Less => { + self.pool.spawn( + Arc::clone(sema) + .acquire_many_owned(-diff as _) + // It is OK to trivially ignore the Error case (semaphore has been closed, we are shutting down the server.) 
+ .map_ok(|p| p.forget()), + ); + } + Equal => {} + Greater => { + sema.add_permits(diff as _); + } + } + } + pub fn run_task(&mut self, task: Task) { debug!("run backup stream task"; "task" => ?task, "store_id" => %self.store_id); let now = Instant::now_coarse(); @@ -1279,12 +1331,11 @@ impl Task { } } -impl Runnable for Endpoint +impl Runnable for Endpoint where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, { type Task = Task; @@ -1297,10 +1348,7 @@ where #[cfg(test)] mod test { use engine_rocks::RocksEngine; - use raftstore::{ - coprocessor::region_info_accessor::MockRegionInfoProvider, router::CdcRaftRouter, - }; - use test_raftstore::MockRaftStoreRouter; + use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; use tikv_util::worker::dummy_scheduler; use crate::{ @@ -1315,13 +1363,9 @@ mod test { cli.insert_task_with_range(&task, &[]).await.unwrap(); fail::cfg("failed_to_get_tasks", "1*return").unwrap(); - Endpoint::< - _, - MockRegionInfoProvider, - RocksEngine, - CdcRaftRouter, - MockPdClient, - >::start_and_watch_tasks(cli, sched) + Endpoint::<_, MockRegionInfoProvider, RocksEngine, MockPdClient>::start_and_watch_tasks( + cli, sched, + ) .await .unwrap(); fail::remove("failed_to_get_tasks"); diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 1b663c0e982..bfb88d5cd5f 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -3,10 +3,9 @@ use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; -use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::CmdType}; use raftstore::{ - coprocessor::{ObserveHandle, RegionInfoProvider}, + coprocessor::ObserveHandle, router::CdcHandle, store::{fsm::ChangeObserver, Callback}, }; @@ -21,22 +20,16 @@ use tikv_util::{ 
time::{Instant, Limiter}, worker::Scheduler, }; -use tokio::{ - runtime::Handle, - sync::{OwnedSemaphorePermit, Semaphore}, -}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use txn_types::{Key, Lock, TimeStamp}; use crate::{ annotate, debug, - endpoint::ObserveOp, errors::{ContextualResultExt, Error, Result}, metrics, router::{ApplyEvent, ApplyEvents, Router}, subscription_track::{Ref, RefMut, SubscriptionTracer, TwoPhaseResolver}, - try_send, - utils::{self, RegionPager}, - Task, + utils, Task, }; const MAX_GET_SNAPSHOT_RETRY: usize = 5; @@ -60,10 +53,12 @@ impl PendingMemoryQuota { Self(Arc::new(Semaphore::new(quota))) } - pub fn pending(&self, size: usize) -> PendingMemory { + pub async fn pending(&self, size: usize) -> PendingMemory { PendingMemory( - Handle::current() - .block_on(self.0.clone().acquire_many_owned(size as _)) + self.0 + .clone() + .acquire_many_owned(size as _) + .await .expect("BUG: the semaphore is closed unexpectedly."), ) } @@ -175,121 +170,64 @@ impl EventLoader { } /// The context for loading incremental data between range. -/// Like [`cdc::Initializer`], but supports initialize over range. +/// Like [`cdc::Initializer`]. /// Note: maybe we can merge those two structures? -/// Note': maybe extract more fields to trait so it would be easier to test. #[derive(Clone)] -pub struct InitialDataLoader { +pub struct InitialDataLoader { // Note: maybe we can make it an abstract thing like `EventSink` with // method `async (KvEvent) -> Result<()>`? pub(crate) sink: Router, pub(crate) tracing: SubscriptionTracer, pub(crate) scheduler: Scheduler, - // Note: this is only for `init_range`, maybe make it an argument? - pub(crate) regions: R, - // Note: Maybe move those fields about initial scanning into some trait? - pub(crate) router: RT, + pub(crate) quota: PendingMemoryQuota, pub(crate) limit: Limiter, + // If there are too many concurrent initial scanning, the limit of disk speed or pending memory + // quota will probably be triggered. 
Then the whole scanning will be pretty slow. And when + // we are holding a iterator for a long time, the memtable may not be able to be flushed. + // Using this to restrict the possibility of that. + concurrency_limit: Arc, + + cdc_handle: H, - pub(crate) handle: Handle, _engine: PhantomData, } -impl InitialDataLoader +impl InitialDataLoader where E: KvEngine, - R: RegionInfoProvider + Clone + 'static, - RT: CdcHandle, + H: CdcHandle + Sync, { pub fn new( - router: RT, - regions: R, sink: Router, tracing: SubscriptionTracer, sched: Scheduler, quota: PendingMemoryQuota, - handle: Handle, limiter: Limiter, + cdc_handle: H, + concurrency_limit: Arc, ) -> Self { Self { - router, - regions, sink, tracing, scheduler: sched, _engine: PhantomData, quota, - handle, + cdc_handle, + concurrency_limit, limit: limiter, } } - pub fn observe_over_with_retry( + pub async fn capture_change( &self, region: &Region, - mut cmd: impl FnMut() -> ChangeObserver, + cmd: ChangeObserver, ) -> Result { - let mut last_err = None; - for _ in 0..MAX_GET_SNAPSHOT_RETRY { - let c = cmd(); - let r = self.observe_over(region, c); - match r { - Ok(s) => { - return Ok(s); - } - Err(e) => { - let can_retry = match e.without_context() { - Error::RaftRequest(pbe) => { - !(pbe.has_epoch_not_match() - || pbe.has_not_leader() - || pbe.get_message().contains("stale observe id") - || pbe.has_region_not_found()) - } - Error::RaftStore(raftstore::Error::RegionNotFound(_)) - | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, - _ => true, - }; - e.report(format_args!( - "during getting initial snapshot for region {:?}; can retry = {}", - region, can_retry - )); - last_err = match last_err { - None => Some(e), - Some(err) => Some(Error::Contextual { - context: format!("and error {}", err), - inner_error: Box::new(e), - }), - }; - - if !can_retry { - break; - } - std::thread::sleep(Duration::from_secs(1)); - continue; - } - } - } - Err(last_err.expect("BUG: max retry time exceed but no error")) - } - - 
/// Start observe over some region. - /// This will register the region to the raftstore as observing, - /// and return the current snapshot of that region. - fn observe_over(&self, region: &Region, cmd: ChangeObserver) -> Result { - // There are 2 ways for getting the initial snapshot of a region: - // - the BR method: use the interface in the RaftKv interface, read the - // key-values directly. - // - the CDC method: use the raftstore message `SignificantMsg::CaptureChange` - // to register the region to CDC observer and get a snapshot at the same time. - // Registering the observer to the raftstore is necessary because we should only - // listen events from leader. In CDC, the change observer is - // per-delegate(i.e. per-region), we can create the command per-region here too. - let (callback, fut) = tikv_util::future::paired_future_callback::>(); - self.router + self.cdc_handle .capture_change( region.get_id(), region.get_region_epoch().clone(), @@ -315,7 +253,8 @@ where region.get_id() ))?; - let snap = block_on(fut) + let snap = fut + .await .map_err(|err| { annotate!( err, @@ -332,6 +271,54 @@ where Ok(snap) } + pub async fn observe_over_with_retry( + &self, + region: &Region, + mut cmd: impl FnMut() -> ChangeObserver, + ) -> Result { + let mut last_err = None; + for _ in 0..MAX_GET_SNAPSHOT_RETRY { + let c = cmd(); + let r = self.capture_change(region, c).await; + match r { + Ok(s) => { + return Ok(s); + } + Err(e) => { + let can_retry = match e.without_context() { + Error::RaftRequest(pbe) => { + !(pbe.has_epoch_not_match() + || pbe.has_not_leader() + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) + } + Error::RaftStore(raftstore::Error::RegionNotFound(_)) + | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, + _ => true, + }; + e.report(format_args!( + "during getting initial snapshot for region {:?}; can retry = {}", + region, can_retry + )); + last_err = match last_err { + None => Some(e), + Some(err) => 
Some(Error::Contextual { + context: format!("and error {}", err), + inner_error: Box::new(e), + }), + }; + + if !can_retry { + break; + } + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + } + } + Err(last_err.expect("BUG: max retry time exceed but no error")) + } + fn with_resolver( &self, region: &Region, @@ -381,7 +368,7 @@ where f(v.value_mut().resolver()) } - fn scan_and_async_send( + async fn scan_and_async_send( &self, region: &Region, handle: &ObserveHandle, @@ -419,8 +406,8 @@ where let sink = self.sink.clone(); let event_size = events.size(); let sched = self.scheduler.clone(); - let permit = self.quota.pending(event_size); - self.limit.blocking_consume(disk_read as _); + let permit = self.quota.pending(event_size).await; + self.limit.consume(disk_read as _).await; debug!("sending events to router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); metrics::INCREMENTAL_SCAN_DISK_READ.inc_by(disk_read as f64); @@ -434,7 +421,7 @@ where } } - pub fn do_initial_scan( + pub async fn do_initial_scan( &self, region: &Region, // We are using this handle for checking whether the initial scan is stale. @@ -442,18 +429,25 @@ where start_ts: TimeStamp, snap: impl Snapshot, ) -> Result { - let _guard = self.handle.enter(); let tr = self.tracing.clone(); let region_id = region.get_id(); let mut join_handles = Vec::with_capacity(8); + let permit = self + .concurrency_limit + .acquire() + .await + .expect("BUG: semaphore closed"); // It is ok to sink more data than needed. So scan to +inf TS for convenance. 
let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; - let stats = self.scan_and_async_send(region, &handle, event_loader, &mut join_handles)?; + let stats = self + .scan_and_async_send(region, &handle, event_loader, &mut join_handles) + .await?; + drop(permit); - Handle::current() - .block_on(futures::future::try_join_all(join_handles)) + futures::future::try_join_all(join_handles) + .await .map_err(|err| annotate!(err, "tokio runtime failed to join consuming threads"))?; Self::with_resolver_by(&tr, region, &handle, |r| { @@ -467,31 +461,6 @@ where Ok(stats) } - - /// initialize a range: it simply scan the regions with leader role and send - /// them to [`initialize_region`]. - pub fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { - let mut pager = RegionPager::scan_from(self.regions.clone(), start_key, end_key); - loop { - let regions = pager.next_page(8)?; - debug!("scanning for entries in region."; "regions" => ?regions); - if regions.is_empty() { - break; - } - for r in regions { - // Note: Even we did the initial scanning, and blocking resolved ts from - // advancing, if the next_backup_ts was updated in some extreme condition, there - // is still little chance to lost data: For example, if a region cannot elect - // the leader for long time. (say, net work partition) At that time, we have - // nowhere to record the lock status of this region. - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::Start { region: r.region }) - ); - } - } - Ok(()) - } } #[cfg(test)] diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index e418d59029d..7aeecb775cc 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -1,15 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::{sync::Arc, time::Duration}; -use crossbeam::channel::{Receiver as SyncReceiver, Sender as SyncSender}; -use crossbeam_channel::SendError; use engine_traits::KvEngine; use error_code::ErrorCodeExt; use futures::FutureExt; @@ -22,10 +14,11 @@ use raftstore::{ store::fsm::ChangeObserver, }; use tikv::storage::Statistics; -use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tikv_util::{ + box_err, debug, info, sys::thread::ThreadBuildWrapper, time::Instant, warn, worker::Scheduler, +}; +use tokio::sync::mpsc::{channel, error::SendError, Receiver, Sender}; use txn_types::TimeStamp; -use yatp::task::callback::Handle as YatpHandle; use crate::{ annotate, @@ -43,7 +36,7 @@ use crate::{ Task, }; -type ScanPool = yatp::ThreadPool; +type ScanPool = tokio::runtime::Runtime; const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; @@ -128,8 +121,9 @@ fn should_retry(err: &Error) -> bool { } /// the abstraction over a "DB" which provides the initial scanning. -trait InitialScan: Clone { - fn do_initial_scan( +#[async_trait::async_trait] +trait InitialScan: Clone + Sync + Send + 'static { + async fn do_initial_scan( &self, region: &Region, start_ts: TimeStamp, @@ -139,13 +133,13 @@ trait InitialScan: Clone { fn handle_fatal_error(&self, region: &Region, err: Error); } -impl InitialScan for InitialDataLoader +#[async_trait::async_trait] +impl InitialScan for InitialDataLoader where E: KvEngine, - R: RegionInfoProvider + Clone + 'static, - RT: CdcHandle, + RT: CdcHandle + Sync + 'static, { - fn do_initial_scan( + async fn do_initial_scan( &self, region: &Region, start_ts: TimeStamp, @@ -155,12 +149,14 @@ where let h = handle.clone(); // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep // retrying here? 
- let snap = self.observe_over_with_retry(region, move || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; + let snap = self + .observe_over_with_retry(region, move || { + ChangeObserver::from_pitr(region_id, handle.clone()) + }) + .await?; #[cfg(feature = "failpoints")] fail::fail_point!("scan_after_get_snapshot"); - let stat = self.do_initial_scan(region, h, start_ts, snap)?; + let stat = self.do_initial_scan(region, h, start_ts, snap).await?; Ok(stat) } @@ -180,7 +176,7 @@ where impl ScanCmd { /// execute the initial scanning via the specificated [`InitialDataLoader`]. - fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { + async fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { let Self { region, handle, @@ -188,7 +184,9 @@ impl ScanCmd { .. } = self; let begin = Instant::now_coarse(); - let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; + let stat = initial_scan + .do_initial_scan(region, *last_checkpoint, handle.clone()) + .await?; info!("initial scanning finished!"; "takes" => ?begin.saturating_elapsed(), "from_ts" => %last_checkpoint, utils::slog_region(region)); utils::record_cf_stat("lock", &stat.lock); utils::record_cf_stat("write", &stat.write); @@ -197,17 +195,12 @@ impl ScanCmd { } /// execute the command, when meeting error, retrying. - fn exec_by_with_retry(self, init: impl InitialScan, cancel: &AtomicBool) { + async fn exec_by_with_retry(self, init: impl InitialScan) { let mut retry_time = INITIAL_SCAN_FAILURE_MAX_RETRY_TIME; loop { - if cancel.load(Ordering::SeqCst) { - return; - } - match self.exec_by(init.clone()) { + match self.exec_by(init.clone()).await { Err(err) if should_retry(&err) && retry_time > 0 => { - // NOTE: blocking this thread may stick the process. - // Maybe spawn a task to tokio and reschedule the task then? 
- std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(500)).await; warn!("meet retryable error"; "err" => %err, "retry_time" => retry_time); retry_time -= 1; continue; @@ -223,82 +216,62 @@ impl ScanCmd { } } -fn scan_executor_loop( - init: impl InitialScan, - cmds: SyncReceiver, - canceled: Arc, -) { - while let Ok(cmd) = cmds.recv() { - fail::fail_point!("execute_scan_command"); +async fn scan_executor_loop(init: impl InitialScan, mut cmds: Receiver) { + while let Some(cmd) = cmds.recv().await { debug!("handling initial scan request"; "region_id" => %cmd.region.get_id()); metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["queuing"]) .dec(); - if canceled.load(Ordering::Acquire) { - return; + #[cfg(feature = "failpoints")] + { + let sleep = (|| { + fail::fail_point!("execute_scan_command_sleep_100", |_| { 100 }); + 0 + })(); + tokio::time::sleep(std::time::Duration::from_secs(sleep)).await; } - metrics::PENDING_INITIAL_SCAN_LEN - .with_label_values(&["executing"]) - .inc(); - cmd.exec_by_with_retry(init.clone(), &canceled); - metrics::PENDING_INITIAL_SCAN_LEN - .with_label_values(&["executing"]) - .dec(); + let init = init.clone(); + tokio::task::spawn(async move { + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .inc(); + cmd.exec_by_with_retry(init).await; + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .dec(); + }); } } /// spawn the executors in the scan pool. -/// we make workers thread instead of spawn scan task directly into the pool -/// because the [`InitialDataLoader`] isn't `Sync` hence we must use it very -/// carefully or rustc (along with tokio) would complain that we made a `!Send` -/// future. so we have moved the data loader to the synchronous context so its -/// reference won't be shared between threads any more. 
-fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> ScanPoolHandle { - let (tx, rx) = crossbeam::channel::bounded(MESSAGE_BUFFER_SIZE); +fn spawn_executors( + init: impl InitialScan + Send + Sync + 'static, + number: usize, +) -> ScanPoolHandle { + let (tx, rx) = tokio::sync::mpsc::channel(MESSAGE_BUFFER_SIZE); let pool = create_scan_pool(number); - let stopped = Arc::new(AtomicBool::new(false)); - for _ in 0..number { - let init = init.clone(); - let rx = rx.clone(); - let stopped = stopped.clone(); - pool.spawn(move |_: &mut YatpHandle<'_>| { - let _io_guard = file_system::WithIoType::new(file_system::IoType::Replication); - scan_executor_loop(init, rx, stopped); - }) - } - ScanPoolHandle { - tx, - _pool: pool, - stopped, - } + pool.spawn(async move { + scan_executor_loop(init, rx).await; + }); + ScanPoolHandle { tx, _pool: pool } } struct ScanPoolHandle { - tx: SyncSender, - stopped: Arc, + // Theoretically, we can get rid of the sender, and spawn a new task via initial loader in each + // thread. But that will make `SubscribeManager` holds a reference to the implementation of + // `InitialScan`, which will get the type information a mass. + tx: Sender, - // in fact, we won't use the pool any more. - // but we should hold the reference to the pool so it won't try to join the threads running. 
_pool: ScanPool, } -impl Drop for ScanPoolHandle { - fn drop(&mut self) { - self.stopped.store(true, Ordering::Release); - } -} - impl ScanPoolHandle { - fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { - if self.stopped.load(Ordering::Acquire) { - warn!("scan pool is stopped, ignore the scan command"; "region" => %cmd.region.get_id()); - return Ok(()); - } + async fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["queuing"]) .inc(); - self.tx.send(cmd) + self.tx.send(cmd).await } } @@ -348,11 +321,20 @@ where } } -/// Create a yatp pool for doing initial scanning. +/// Create a pool for doing initial scanning. fn create_scan_pool(num_threads: usize) -> ScanPool { - yatp::Builder::new("log-backup-scan") - .max_thread_count(num_threads) - .build_callback_pool() + tokio::runtime::Builder::new_multi_thread() + .with_sys_and_custom_hooks( + move || { + file_system::set_io_type(file_system::IoType::Replication); + }, + || {}, + ) + .thread_name("log-backup-scan") + .enable_time() + .worker_threads(num_threads) + .build() + .unwrap() } impl RegionSubscriptionManager @@ -367,22 +349,24 @@ where /// /// a two-tuple, the first is the handle to the manager, the second is the /// operator loop future. 
- pub fn start( - initial_loader: InitialDataLoader, + pub fn start( + initial_loader: InitialDataLoader, + regions: R, observer: BackupStreamObserver, meta_cli: MetadataClient, pd_client: Arc, scan_pool_size: usize, - resolver: BackupStreamResolver, + resolver: BackupStreamResolver, ) -> (Self, future![()]) where E: KvEngine, - RT: CdcHandle + 'static, + HInit: CdcHandle + Sync + 'static, + HChkLd: CdcHandle + 'static, { let (tx, rx) = channel(MESSAGE_BUFFER_SIZE); let scan_pool_handle = spawn_executors(initial_loader.clone(), scan_pool_size); let op = Self { - regions: initial_loader.regions.clone(), + regions, meta_cli, pd_client, range_router: initial_loader.sink.clone(), @@ -522,7 +506,8 @@ where region, self.get_last_checkpoint_of(&for_task, region).await?, handle.clone(), - ); + ) + .await; Result::Ok(()) } .await; @@ -567,7 +552,8 @@ where Err(Error::Other(box_err!("Nature is boring"))) }); let tso = self.get_last_checkpoint_of(&for_task, region).await?; - self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()); + self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()) + .await; } } Ok(()) @@ -702,13 +688,13 @@ where Ok(cp.ts) } - fn spawn_scan(&self, cmd: ScanCmd) { + async fn spawn_scan(&self, cmd: ScanCmd) { // we should not spawn initial scanning tasks to the tokio blocking pool // because it is also used for converting sync File I/O to async. (for now!) // In that condition, if we blocking for some resources(for example, the // `MemoryQuota`) at the block threads, we may meet some ghosty // deadlock. 
- let s = self.scan_pool_handle.request(cmd); + let s = self.scan_pool_handle.request(cmd).await; if let Err(err) = s { let region_id = err.0.region.get_id(); annotate!(err, "BUG: scan_pool closed") @@ -716,7 +702,7 @@ where } } - fn observe_over_with_initial_data_from_checkpoint( + async fn observe_over_with_initial_data_from_checkpoint( &self, region: &Region, last_checkpoint: TimeStamp, @@ -730,6 +716,7 @@ where last_checkpoint, _work: self.scans.clone().work(), }) + .await } fn find_task_by_region(&self, r: &Region) -> Option { @@ -748,8 +735,9 @@ mod test { #[derive(Clone, Copy)] struct NoopInitialScan; + #[async_trait::async_trait] impl InitialScan for NoopInitialScan { - fn do_initial_scan( + async fn do_initial_scan( &self, _region: &Region, _start_ts: txn_types::TimeStamp, @@ -787,17 +775,20 @@ mod test { let pool = spawn_executors(NoopInitialScan, 1); let wg = CallbackWaitGroup::new(); - fail::cfg("execute_scan_command", "sleep(100)").unwrap(); + fail::cfg("execute_scan_command_sleep_100", "return").unwrap(); for _ in 0..100 { let wg = wg.clone(); - pool.request(ScanCmd { - region: Default::default(), - handle: Default::default(), - last_checkpoint: Default::default(), - // Note: Maybe make here a Box or some other trait? - _work: wg.work(), - }) - .unwrap() + assert!( + pool._pool + .block_on(pool.request(ScanCmd { + region: Default::default(), + handle: Default::default(), + last_checkpoint: Default::default(), + // Note: Maybe make here a Box or some other trait? 
+ _work: wg.work(), + })) + .is_ok() + ) } should_finish_in(move || drop(pool), Duration::from_secs(5)); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 2dae8ce745d..5a6b2e0753b 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -82,6 +82,7 @@ impl ActiveSubscription { self.handle.stop_observing(); } + #[cfg(test)] pub fn is_observing(&self) -> bool { self.handle.is_observing() } @@ -319,6 +320,7 @@ impl SubscriptionTracer { } /// check whether the region_id should be observed by this observer. + #[cfg(test)] pub fn is_observing(&self, region_id: u64) -> bool { let sub = self.0.get_mut(®ion_id); match sub { diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 52b6f0e9391..5e798a8428c 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -18,14 +18,12 @@ use std::{ use async_compression::{tokio::write::ZstdEncoder, Level}; use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; +use futures::{ready, task::Poll, FutureExt}; use kvproto::{ brpb::CompressionType, metapb::Region, raft_cmdpb::{CmdType, Request}, }; -use raft::StateRole; -use raftstore::{coprocessor::RegionInfoProvider, RegionInfo}; use tikv::storage::CfStatistics; use tikv_util::{ box_err, @@ -33,7 +31,6 @@ use tikv_util::{ self_thread_inspector, IoStat, ThreadInspector, ThreadInspectorImpl as OsInspector, }, time::Instant, - warn, worker::Scheduler, Either, }; @@ -79,65 +76,6 @@ pub fn redact(key: &impl AsRef<[u8]>) -> log_wrappers::Value<'_> { log_wrappers::Value::key(key.as_ref()) } -/// RegionPager seeks regions with leader role in the range. -pub struct RegionPager

{ - regions: P, - start_key: Vec, - end_key: Vec, - reach_last_region: bool, -} - -impl RegionPager

{ - pub fn scan_from(regions: P, start_key: Vec, end_key: Vec) -> Self { - Self { - regions, - start_key, - end_key, - reach_last_region: false, - } - } - - pub fn next_page(&mut self, size: usize) -> Result> { - if self.start_key >= self.end_key || self.reach_last_region { - return Ok(vec![]); - } - - let (mut tx, rx) = mpsc::channel(size); - let end_key = self.end_key.clone(); - self.regions - .seek_region( - &self.start_key, - Box::new(move |i| { - let r = i - .filter(|r| r.role == StateRole::Leader) - .take(size) - .take_while(|r| r.region.start_key < end_key) - .try_for_each(|r| tx.try_send(r.clone())); - if let Err(_err) = r { - warn!("failed to scan region and send to initlizer") - } - }), - ) - .map_err(|err| { - Error::Other(box_err!( - "failed to seek region for start key {}: {}", - redact(&self.start_key), - err - )) - })?; - let collected_regions = block_on(rx.collect::>()); - self.start_key = collected_regions - .last() - .map(|region| region.region.end_key.to_owned()) - // no leader region found. - .unwrap_or_default(); - if self.start_key.is_empty() { - self.reach_last_region = true; - } - Ok(collected_regions) - } -} - /// StopWatch is a utility for record time cost in multi-stage tasks. /// NOTE: Maybe it should be generic over somewhat Clock type? 
pub struct StopWatch(Instant); diff --git a/components/backup-stream/tests/integration/mod.rs b/components/backup-stream/tests/integration/mod.rs index a209572c6d8..79a756f684d 100644 --- a/components/backup-stream/tests/integration/mod.rs +++ b/components/backup-stream/tests/integration/mod.rs @@ -16,6 +16,7 @@ mod all { use futures::{Stream, StreamExt}; use pd_client::PdClient; use test_raftstore::IsolationFilterFactory; + use tikv::config::BackupStreamConfig; use tikv_util::{box_err, defer, info, HandyRwLock}; use tokio::time::timeout; use txn_types::{Key, TimeStamp}; @@ -430,4 +431,25 @@ mod all { round1.iter().map(|k| k.as_slice()), )) } + + #[test] + fn update_config() { + let suite = SuiteBuilder::new_named("network_partition") + .nodes(1) + .build(); + let mut basic_config = BackupStreamConfig::default(); + basic_config.initial_scan_concurrency = 4; + suite.run(|| Task::ChangeConfig(basic_config.clone())); + suite.wait_with(|e| { + assert_eq!(e.initial_scan_semaphore.available_permits(), 4,); + true + }); + + basic_config.initial_scan_concurrency = 16; + suite.run(|| Task::ChangeConfig(basic_config.clone())); + suite.wait_with(|e| { + assert_eq!(e.initial_scan_semaphore.available_permits(), 16,); + true + }); + } } diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index e1df628d76b..41a57f5858b 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -31,14 +31,11 @@ use kvproto::{ }; use pd_client::PdClient; use protobuf::parse_from_bytes; -use raftstore::{ - router::{CdcRaftRouter, ServerRaftStoreRouter}, - RegionInfoAccessor, -}; +use raftstore::{router::CdcRaftRouter, RegionInfoAccessor}; use resolved_ts::LeadershipResolver; use tempdir::TempDir; use test_pd_client::TestPdClient; -use test_raftstore::{new_server_cluster, Cluster, ServerCluster, SimulateTransport}; +use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use test_util::retry; use 
tikv::config::BackupStreamConfig; use tikv_util::{ @@ -57,11 +54,6 @@ pub type TestEndpoint = Endpoint< ErrorStore, RegionInfoAccessor, engine_test::kv::KvTestEngine, - CdcRaftRouter< - SimulateTransport< - ServerRaftStoreRouter, - >, - >, TestPdClient, >; diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 09f389a2230..77d3a35e306 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,6 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::borrow::Cow; +use std::{ + borrow::Cow, + sync::{Arc, Mutex}, +}; // #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; @@ -406,6 +409,33 @@ where ) -> RaftStoreResult<()>; } +impl> CdcHandle for Arc> { + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback<::Snapshot>, + ) -> RaftStoreResult<()> { + Mutex::lock(self).unwrap().capture_change( + region_id, + region_epoch, + change_observer, + callback, + ) + } + + fn check_leadership( + &self, + region_id: u64, + callback: Callback<::Snapshot>, + ) -> RaftStoreResult<()> { + Mutex::lock(self) + .unwrap() + .check_leadership(region_id, callback) + } +} + /// A wrapper of SignificantRouter that is specialized for implementing /// CdcHandle. 
#[derive(Clone)] diff --git a/src/config/mod.rs b/src/config/mod.rs index 9b8ecad50f9..8a2fa291ff1 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2833,6 +2833,7 @@ pub struct BackupStreamConfig { pub initial_scan_pending_memory_quota: ReadableSize, #[online_config(skip)] pub initial_scan_rate_limit: ReadableSize, + pub initial_scan_concurrency: usize, } impl BackupStreamConfig { @@ -2860,6 +2861,9 @@ impl BackupStreamConfig { ) .into()); } + if self.initial_scan_concurrency == 0 { + return Err("the `initial_scan_concurrency` shouldn't be zero".into()); + } Ok(()) } } @@ -2887,6 +2891,7 @@ impl Default for BackupStreamConfig { file_size_limit, initial_scan_pending_memory_quota: ReadableSize(quota_size as _), initial_scan_rate_limit: ReadableSize::mb(60), + initial_scan_concurrency: 6, temp_file_memory_quota: cache_size, } } From 6ff85fcc7a6384da445ef166b745ab998cc20b8d Mon Sep 17 00:00:00 2001 From: ShuNing Date: Fri, 22 Sep 2023 11:28:45 +0800 Subject: [PATCH 062/220] tests: fix unstable test_query_stats test (#15657) close tikv/tikv#15656 tests: fix unstable test_query_stats test Signed-off-by: nolouch --- tests/Cargo.toml | 2 +- tests/integrations/raftstore/test_stats.rs | 78 +++++++++++----------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 0081d5e95bc..f3928e97eb8 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -41,7 +41,7 @@ path = "benches/deadlock_detector/mod.rs" [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] -failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] +failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints", "raft_log_engine/failpoints"] cloud-aws = ["external_storage_export/cloud-aws"] cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] diff --git 
a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 13e718b269d..073382ced17 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -262,19 +262,10 @@ fn test_raw_query_stats_tmpl() { req.set_raw_get(get_req); req }); - batch_commands(&ctx, &client, get_command, &start_key); - assert!(check_split_key( - cluster, - F::encode_raw_key_owned(start_key.clone(), None).into_encoded(), - None - )); - if check_query_num_read( - cluster, - store_id, - region_id, - QueryKind::Get, - (i + 1) * 1000, - ) { + if i == 0 { + batch_commands(&ctx, &client, get_command, &start_key); + } + if check_query_num_read(cluster, store_id, region_id, QueryKind::Get, 1000) { flag = true; break; } @@ -284,14 +275,16 @@ fn test_raw_query_stats_tmpl() { fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); - test_query_num::(raw_get, true); - test_query_num::(raw_batch_get, true); - test_query_num::(raw_scan, true); - test_query_num::(raw_batch_scan, true); + test_query_num::(raw_get, true, true); + test_query_num::(raw_batch_get, true, true); + test_query_num::(raw_scan, true, true); + test_query_num::(raw_batch_scan, true, true); if F::IS_TTL_ENABLED { - test_query_num::(raw_get_key_ttl, true); + test_query_num::(raw_get_key_ttl, true, true); } - test_query_num::(raw_batch_get_command, true); + // requests may failed caused by `EpochNotMatch` after split when auto split is + // enabled, disable it. 
+ test_query_num::(raw_batch_get_command, true, false); test_raw_delete_query::(); fail::remove("mock_tick_interval"); fail::remove("mock_hotspot_threshold"); @@ -385,19 +378,10 @@ fn test_txn_query_stats_tmpl() { req.set_get(get_req); req }); - batch_commands(&ctx, &client, get_command, &start_key); - assert!(check_split_key( - cluster, - Key::from_raw(&start_key).as_encoded().to_vec(), - None - )); - if check_query_num_read( - cluster, - store_id, - region_id, - QueryKind::Get, - (i + 1) * 1000, - ) { + if i == 0 { + batch_commands(&ctx, &client, get_command, &start_key); + } + if check_query_num_read(cluster, store_id, region_id, QueryKind::Get, 1000) { flag = true; break; } @@ -407,11 +391,13 @@ fn test_txn_query_stats_tmpl() { fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); - test_query_num::(get, false); - test_query_num::(batch_get, false); - test_query_num::(scan, false); - test_query_num::(scan_lock, false); - test_query_num::(batch_get_command, false); + test_query_num::(get, false, true); + test_query_num::(batch_get, false, true); + test_query_num::(scan, false, true); + test_query_num::(scan_lock, false, true); + // requests may failed caused by `EpochNotMatch` after split when auto split is + // enabled, disable it. 
+ test_query_num::(batch_get_command, false, false); test_txn_delete_query::(); test_pessimistic_lock(); test_rollback(); @@ -573,15 +559,20 @@ pub fn test_rollback() { )); } -fn test_query_num(query: Box, is_raw_kv: bool) { +fn test_query_num(query: Box, is_raw_kv: bool, auto_split: bool) { let (mut cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); - cluster.cfg.split.qps_threshold = Some(0); + if auto_split { + cluster.cfg.split.qps_threshold = Some(0); + } else { + cluster.cfg.split.qps_threshold = Some(1000000); + } cluster.cfg.split.split_balance_score = 2.0; cluster.cfg.split.split_contained_score = 2.0; cluster.cfg.split.detect_times = 1; cluster.cfg.split.sample_threshold = 0; cluster.cfg.storage.set_api_version(F::TAG); + cluster.cfg.server.enable_request_batch = false; }); ctx.set_api_version(F::CLIENT_TAG); @@ -763,4 +754,13 @@ fn batch_commands( } }); rx.recv_timeout(Duration::from_secs(10)).unwrap(); + sleep_ms(100); + // triage metrics flush + for _ in 0..10 { + let mut req = ScanRequest::default(); + req.set_context(ctx.to_owned()); + req.start_key = start_key.to_owned(); + req.end_key = vec![]; + client.kv_scan(&req).unwrap(); + } } From 15d2c7dcd1780d11ee118e0b9b68ca06bf2bf388 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 22 Sep 2023 13:43:44 +0800 Subject: [PATCH 063/220] raftstore-v2: fix incorrect GC peer requests to source peer after merge (#15643) close tikv/tikv#15623 After merge, target region sends GC peer requests to removed source peers, however the region_id in requests is set to target region id incorrectly. As results, source region removed peers may be left forever. This commit fixes above issue by putting source removed_records to merged_records, so that region id can be set correctly. 
Signed-off-by: Neil Shen Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 2 +- components/raftstore-v2/src/batch/store.rs | 3 +- .../operation/command/admin/conf_change.rs | 15 +++- .../operation/command/admin/merge/commit.rs | 4 +- components/raftstore-v2/src/operation/life.rs | 65 ++++++++++++--- .../raftstore-v2/src/operation/ready/mod.rs | 11 ++- components/raftstore/src/store/config.rs | 7 ++ tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + tests/integrations/raftstore/test_life.rs | 4 +- tests/integrations/raftstore/test_merge.rs | 81 +++++++++++++++++++ 11 files changed, 168 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f05b651b1ad..0ba7b9d3499 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3253,7 +3253,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#ecdbf1f8c130089392a9bb5f86f7577deddfbed5" +source = "git+https://github.com/pingcap/kvproto.git#090f247be15c00a6000a4d23669ac3e95ea9fcd5" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 5f036c61020..73b65bc0904 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -1,7 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - cmp, ops::{Deref, DerefMut}, path::Path, sync::{ @@ -140,7 +139,7 @@ impl StoreContext { self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::GcPeer as usize].wait_duration = - 60 * cmp::min(Duration::from_secs(1), self.cfg.raft_base_tick_interval.0); + self.cfg.gc_peer_check_interval.0; } // Return None means it has passed unsafe vote period. 
diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index c7b8481aa7c..77ef6c823c1 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -609,10 +609,17 @@ impl Apply { ); removed_records.retain(|p| !updates.contains(&p.get_id())); merged_records.retain_mut(|r| { - let mut sources: Vec<_> = r.take_source_peers().into(); - sources.retain(|p| !updates.contains(&p.get_id())); - r.set_source_peers(sources.into()); - !r.get_source_peers().is_empty() + // Clean up source peers if they acknowledge GcPeerRequest. + let mut source_peers: Vec<_> = r.take_source_peers().into(); + source_peers.retain(|p| !updates.contains(&p.get_id())); + r.set_source_peers(source_peers.into()); + // Clean up source removed records (peers) if they acknowledge GcPeerRequest. + let mut source_removed_records: Vec<_> = r.take_source_removed_records().into(); + source_removed_records.retain(|p| !updates.contains(&p.get_id())); + r.set_source_removed_records(source_removed_records.into()); + // Clean up merged records if all source peers and source removed records are + // empty. 
+ !r.get_source_peers().is_empty() || !r.get_source_removed_records().is_empty() }); self.region_state_mut() .set_removed_records(removed_records.into()); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index 5208dcc96a8..8e55f89a7d2 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -540,9 +540,6 @@ impl Apply { state.set_state(PeerState::Normal); assert!(!state.has_merge_state()); state.set_tablet_index(index); - let mut removed_records: Vec<_> = state.take_removed_records().into(); - removed_records.append(&mut source_state.get_removed_records().into()); - state.set_removed_records(removed_records.into()); let mut merged_records: Vec<_> = state.take_merged_records().into(); merged_records.append(&mut source_state.get_merged_records().into()); state.set_merged_records(merged_records.into()); @@ -550,6 +547,7 @@ impl Apply { merged_record.set_source_region_id(source_region.get_id()); merged_record.set_source_epoch(source_region.get_region_epoch().clone()); merged_record.set_source_peers(source_region.get_peers().into()); + merged_record.set_source_removed_records(source_state.get_removed_records().into()); merged_record.set_target_region_id(region.get_id()); merged_record.set_target_epoch(region.get_region_epoch().clone()); merged_record.set_target_peers(region.get_peers().into()); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 395774e17f1..6b778ad6c4a 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -424,7 +424,13 @@ impl Store { }; if destroyed { if msg.get_is_tombstone() { + let msg_region_epoch = msg.get_region_epoch().clone(); if let Some(msg) = build_peer_destroyed_report(&mut msg) { + info!(self.logger(), "peer reports 
destroyed"; + "from_peer" => ?msg.get_from_peer(), + "from_region_epoch" => ?msg_region_epoch, + "region_id" => ?msg.get_region_id(), + "to_peer_id" => ?msg.get_to_peer().get_id()); let _ = ctx.trans.send(msg); } return false; @@ -581,7 +587,11 @@ impl Peer { .iter() .find(|p| p.id == msg.get_from_peer().get_id()) { - let tombstone_msg = self.tombstone_message_for_same_region(peer.clone()); + let tombstone_msg = self.tombstone_message( + self.region_id(), + self.region().get_region_epoch().clone(), + peer.clone(), + ); self.add_message(tombstone_msg); true } else { @@ -589,13 +599,24 @@ impl Peer { } } - fn tombstone_message_for_same_region(&self, peer: metapb::Peer) -> RaftMessage { - let region_id = self.region_id(); + fn tombstone_message( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + peer: metapb::Peer, + ) -> RaftMessage { let mut tombstone_message = RaftMessage::default(); + if self.region_id() != region_id { + // After merge, target region needs to GC peers of source region. 
+ let extra_msg = tombstone_message.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgGcPeerRequest); + let check_peer = extra_msg.mut_check_gc_peer(); + check_peer.set_from_region_id(self.region_id()); + } tombstone_message.set_region_id(region_id); tombstone_message.set_from_peer(self.peer().clone()); tombstone_message.set_to_peer(peer); - tombstone_message.set_region_epoch(self.region().get_region_epoch().clone()); + tombstone_message.set_region_epoch(region_epoch); tombstone_message.set_is_tombstone(true); tombstone_message } @@ -604,6 +625,10 @@ impl Peer { match msg.get_to_peer().get_id().cmp(&self.peer_id()) { cmp::Ordering::Less => { if let Some(msg) = build_peer_destroyed_report(msg) { + info!(self.logger, "peer reports destroyed"; + "from_peer" => ?msg.get_from_peer(), + "from_region_epoch" => ?msg.get_region_epoch(), + "to_peer_id" => ?msg.get_to_peer().get_id()); self.add_message(msg); } } @@ -675,6 +700,7 @@ impl Peer { && state.get_merged_records().iter().all(|p| { p.get_source_peers() .iter() + .chain(p.get_source_removed_records()) .all(|p| p.get_id() != gc_peer_id) }) { @@ -699,18 +725,33 @@ impl Peer { } let mut need_gc_ids = Vec::with_capacity(5); let gc_context = self.gc_peer_context(); + let mut tombstone_removed_records = + |region_id, region_epoch: &metapb::RegionEpoch, peer: &metapb::Peer| { + need_gc_ids.push(peer.get_id()); + if gc_context.confirmed_ids.contains(&peer.get_id()) { + return; + } + + let msg = self.tombstone_message(region_id, region_epoch.clone(), peer.clone()); + // For leader, it's OK to send gc message immediately. + let _ = ctx.trans.send(msg); + }; for peer in state.get_removed_records() { - need_gc_ids.push(peer.get_id()); - if gc_context.confirmed_ids.contains(&peer.get_id()) { - continue; + tombstone_removed_records(self.region_id(), self.region().get_region_epoch(), peer); + } + // For merge, we need to + // 1. ask source removed peers to destroy. 
+ for record in state.get_merged_records() { + for peer in record.get_source_removed_records() { + tombstone_removed_records( + record.get_source_region_id(), + record.get_source_epoch(), + peer, + ); } - - let msg = self.tombstone_message_for_same_region(peer.clone()); - // For leader, it's OK to send gc message immediately. - let _ = ctx.trans.send(msg); } + // 2. ask target to check whether source should be deleted. for record in state.get_merged_records() { - // For merge, we ask target to check whether source should be deleted. for (source, target) in record .get_source_peers() .iter() diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index ba7170ac8c8..17845b5d0b8 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -418,9 +418,10 @@ impl Peer { return; } + let msg_type = msg.get_message().get_msg_type(); // This can be a message that sent when it's still a follower. Nevertheleast, // it's meaningless to continue to handle the request as callbacks are cleared. - if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.is_leader() && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.peer_id()) @@ -429,14 +430,18 @@ impl Peer { return; } - if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.is_leader() && self.on_step_read_index(ctx, msg.mut_message()) { // Read index has respond in `on_step_read_index`, // No need to step again. 
} else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { - error!(self.logger, "raft step error"; "err" => ?e); + error!(self.logger, "raft step error"; + "from_peer" => ?msg.get_from_peer(), + "region_epoch" => ?msg.get_region_epoch(), + "message_type" => ?msg_type, + "err" => ?e); } else { let committed_index = self.raft_group().raft.raft_log.committed; self.report_commit_log_duration(ctx, pre_committed_index, committed_index); diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index f96ed2b7a45..95c4aed9349 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -169,6 +169,9 @@ pub struct Config { /// and try to alert monitoring systems, if there is any. pub abnormal_leader_missing_duration: ReadableDuration, pub peer_stale_state_check_interval: ReadableDuration, + /// Interval to check GC peers. + #[doc(hidden)] + pub gc_peer_check_interval: ReadableDuration, #[online_config(hidden)] pub leader_transfer_max_log_lag: u64, @@ -510,6 +513,7 @@ impl Default for Config { renew_leader_lease_advance_duration: ReadableDuration::secs(0), allow_unsafe_vote_after_start: false, report_region_buckets_tick_interval: ReadableDuration::secs(10), + gc_peer_check_interval: ReadableDuration::secs(60), max_snapshot_file_raw_size: ReadableSize::mb(100), unreachable_backoff: ReadableDuration::secs(10), // TODO: make its value reasonable @@ -1060,6 +1064,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["leader_transfer_max_log_lag"]) .set(self.leader_transfer_max_log_lag as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["gc_peer_check_interval"]) + .set(self.gc_peer_check_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["snap_apply_batch_size"]) diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index c6f787df9a7..1ac6e3840f1 100644 --- a/tests/integrations/config/mod.rs +++ 
b/tests/integrations/config/mod.rs @@ -212,6 +212,7 @@ fn test_serde_custom_tikv_config() { max_leader_missing_duration: ReadableDuration::hours(12), abnormal_leader_missing_duration: ReadableDuration::hours(6), peer_stale_state_check_interval: ReadableDuration::hours(2), + gc_peer_check_interval: ReadableDuration::days(1), leader_transfer_max_log_lag: 123, snap_apply_batch_size: ReadableSize::mb(12), snap_apply_copy_symlink: true, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index ece8cabae49..fe1fa066ae8 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -183,6 +183,7 @@ max-peer-down-duration = "12m" max-leader-missing-duration = "12h" abnormal-leader-missing-duration = "6h" peer-stale-state-check-interval = "2h" +gc-peer-check-interval = "1d" leader-transfer-max-log-lag = 123 snap-apply-batch-size = "12MB" snap-apply-copy-symlink = true diff --git a/tests/integrations/raftstore/test_life.rs b/tests/integrations/raftstore/test_life.rs index e940ca30a7c..f3b5704a586 100644 --- a/tests/integrations/raftstore/test_life.rs +++ b/tests/integrations/raftstore/test_life.rs @@ -11,7 +11,7 @@ use test_raftstore::{ new_learner_peer, new_peer, sleep_ms, Filter, FilterFactory, Simulator as S1, }; use test_raftstore_v2::Simulator as S2; -use tikv_util::{time::Instant, HandyRwLock}; +use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; struct ForwardFactory { node_id: u64, @@ -64,6 +64,7 @@ fn test_gc_peer_tiflash_engine() { let mut cluster_v1 = test_raftstore::new_node_cluster(1, 2); let mut cluster_v2 = test_raftstore_v2::new_node_cluster(1, 2); cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + cluster_v2.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); cluster_v1.pd_client.disable_default_operator(); cluster_v2.pd_client.disable_default_operator(); let r11 = cluster_v1.run_conf_change(); @@ -144,6 +145,7 @@ 
fn test_gc_peer_tiflash_engine() { fn test_gc_removed_peer() { let mut cluster = test_raftstore::new_node_cluster(1, 2); cluster.cfg.raft_store.enable_v2_compatible_learner = true; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); cluster.pd_client.disable_default_operator(); let region_id = cluster.run_conf_change(); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index ceb888a2b22..0b17ff72ae7 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1731,3 +1731,84 @@ fn test_prepare_merge_with_5_nodes_snapshot() { // Now leader should replicate more logs and figure out a safe index. pd_client.must_merge(left.get_id(), right.get_id()); } + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_peer_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + let left_peer_on_store3 = find_peer(&left, 3).unwrap().clone(); + pd_client.must_remove_peer(left.get_id(), left_peer_on_store3); + must_get_none(&cluster.get_engine(3), b"k1"); + + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + 
cluster.add_send_filter(IsolationFilterFactory::new(3)); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3.clone()); + + // So cluster becomes + // left region: 1(leader) 2 | + // right region: 1(leader) 2 | 3 (removed but not yet destroyed) + // | means isolation. + + // Merge right to left. + pd_client.must_merge(right.get_id(), left.get_id()); + let region_state = cluster.region_local_state(left.get_id(), 1); + assert!( + !region_state.get_merged_records()[0] + .get_source_removed_records() + .is_empty(), + "{:?}", + region_state + ); + assert!( + !region_state + .get_removed_records() + .iter() + .any(|p| p.get_id() == right_peer_on_store3.get_id()), + "{:?}", + region_state + ); + + // Cluster filters and wait for gc peer ticks. + cluster.clear_send_filters(); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Right region replica on store 3 must be removed. + cluster.must_region_not_exist(right.get_id(), 3); + + let start = Instant::now(); + loop { + sleep_ms(cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + let region_state = cluster.region_local_state(left.get_id(), 1); + if (region_state.get_merged_records().is_empty() + || region_state.get_merged_records()[0] + .get_source_removed_records() + .is_empty()) + && region_state.get_removed_records().is_empty() + { + break; + } + if start.elapsed() > Duration::from_secs(5) { + panic!( + "source removed records and removed records must be empty, {:?}", + region_state + ); + } + } +} From bbfedd409b5965c04b9edcb34f0a0907c75d6dd2 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:36:13 +0800 Subject: [PATCH 064/220] upgrade lz4-sys to 1.9.4 to tackle security issue (#15652) ref tikv/tikv#15621 upgrade lz4-sys to 1.9.4 to tackle security issue Signed-off-by: SpadeA-Tang --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 
0ba7b9d3499..e9f937e3266 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3483,9 +3483,9 @@ dependencies = [ [[package]] name = "lz4-sys" -version = "1.9.2" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" dependencies = [ "cc", "libc 0.2.146", From 384aaeb381ffc8f9ac881432a00e437933777c55 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Mon, 25 Sep 2023 13:42:15 +0800 Subject: [PATCH 065/220] copr: fix cannot get the request source for resource control (#15606) close tikv/tikv#15663 copr: fix cannot get the request source for analyze with resource control Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../resource_control/src/resource_group.rs | 4 ++ src/server/service/kv.rs | 8 ++-- tests/integrations/raftstore/test_stats.rs | 40 +++++++++++++++++-- .../resource_metering/test_cpu.rs | 6 ++- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 0e40255b354..09e90e9dd01 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -239,6 +239,10 @@ impl ResourceGroupManager { rg: &str, request_source: &str, ) -> Option> { + fail_point!("only_check_source_task_name", |name| { + assert_eq!(name.clone().unwrap(), request_source.to_string()); + None + }); if let Some(group) = self.resource_groups.get(rg) { if !group.fallback_default { return group.get_resource_limiter(request_source); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 6f1cf0eaa1f..4a961eedf19 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1190,7 +1190,7 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, 
tx.clone(), begin_instant, GrpcTypeKind::raw_get, source); } }, - Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + Some(batch_commands_request::request::Cmd::Coprocessor(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); @@ -1199,7 +1199,7 @@ fn handle_batch_commands_request( .with_label_values(&[resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); - let source = req.mut_context().take_request_source(); + let source = req.get_context().get_request_source().to_owned(); let resp = future_copr(copr, Some(peer.to_string()), req) .map_ok(|resp| { resp.map(oneof!(batch_commands_response::response::Cmd::Coprocessor)) @@ -1224,7 +1224,7 @@ fn handle_batch_commands_request( String::default(), ); } - $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + $(Some(batch_commands_request::request::Cmd::$cmd(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); @@ -1233,7 +1233,7 @@ fn handle_batch_commands_request( .with_label_values(&[resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); - let source = req.mut_context().take_request_source(); + let source = req.get_context().get_request_source().to_owned(); let resp = $future_fn($($arg,)* req) .map_ok(oneof!(batch_commands_response::response::Cmd::$cmd)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.$metric_name.inc()); diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 073382ced17..7701fe167c8 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -12,6 +12,7 @@ use futures::{executor::block_on, SinkExt, StreamExt}; use grpcio::*; use 
kvproto::{kvrpcpb::*, pdpb::QueryKind, tikvpb::*, tikvpb_grpc::TikvClient}; use pd_client::PdClient; +use test_coprocessor::{DagSelect, ProductTable}; use test_raftstore::*; use tikv_util::{config::*, store::QueryStats}; use txn_types::Key; @@ -388,9 +389,34 @@ fn test_txn_query_stats_tmpl() { } assert!(flag); }); + let batch_coprocessor: Box = + Box::new(|ctx, cluster, client, store_id, region_id, start_key| { + let mut flag = false; + for i in 0..3 { + let coprocessor: Box = Box::new(|ctx, _start_key| { + let mut req = BatchCommandsRequestRequest::new(); + let table = ProductTable::new(); + let mut cop_req = DagSelect::from(&table).build(); + cop_req.set_context(ctx.clone()); + req.set_coprocessor(cop_req); + req + }); + if i == 0 { + batch_commands(&ctx, &client, coprocessor, &start_key); + } + // here cannot read any data, so expect is 0. may need fix. here mainly used to + // verify the request source is as expect. + if check_query_num_read(cluster, store_id, region_id, QueryKind::Coprocessor, 0) { + flag = true; + break; + } + } + assert!(flag); + }); fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); + fail::cfg("only_check_source_task_name", "return(test_stats)").unwrap(); test_query_num::(get, false, true); test_query_num::(batch_get, false, true); test_query_num::(scan, false, true); @@ -398,12 +424,14 @@ fn test_txn_query_stats_tmpl() { // requests may failed caused by `EpochNotMatch` after split when auto split is // enabled, disable it. 
test_query_num::(batch_get_command, false, false); + test_query_num::(batch_coprocessor, false, false); test_txn_delete_query::(); test_pessimistic_lock(); test_rollback(); fail::remove("mock_tick_interval"); fail::remove("mock_hotspot_threshold"); fail::remove("mock_collect_tick_interval"); + fail::remove("only_check_source_task_name"); } #[allow(clippy::extra_unused_type_parameters)] @@ -488,10 +516,11 @@ fn put( } fn test_pessimistic_lock() { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); let key = b"key2".to_vec(); let store_id = 1; put(&cluster, &client, &ctx, store_id, key.clone()); @@ -528,9 +557,10 @@ fn test_pessimistic_lock() { } pub fn test_rollback() { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); let key = b"key2".to_vec(); let store_id = 1; put(&cluster, &client, &ctx, store_id, key.clone()); @@ -575,6 +605,7 @@ fn test_query_num(query: Box, is_raw_kv: bool, auto_split: b cluster.cfg.server.enable_request_batch = false; }); ctx.set_api_version(F::CLIENT_TAG); + ctx.set_request_source("test_stats".to_owned()); let mut k = b"key".to_vec(); // When a peer becomes leader, it can't read before committing to current term. 
@@ -602,6 +633,7 @@ fn test_raw_delete_query() { cluster.cfg.storage.set_api_version(F::TAG); }); ctx.set_api_version(F::CLIENT_TAG); + ctx.set_request_source("test_stats".to_owned()); raw_put::(&cluster, &client, &ctx, store_id, k.clone()); // Raw Delete @@ -627,10 +659,10 @@ fn test_txn_delete_query() { let store_id = 1; { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); - + ctx.set_request_source("test_stats".to_owned()); put(&cluster, &client, &ctx, store_id, k.clone()); // DeleteRange let mut delete_req = DeleteRangeRequest::default(); diff --git a/tests/integrations/resource_metering/test_cpu.rs b/tests/integrations/resource_metering/test_cpu.rs index c15bf445ed3..12d6fa4fbe0 100644 --- a/tests/integrations/resource_metering/test_cpu.rs +++ b/tests/integrations/resource_metering/test_cpu.rs @@ -12,6 +12,7 @@ use std::{ use concurrency_manager::ConcurrencyManager; use futures::{executor::block_on, StreamExt}; use kvproto::kvrpcpb::Context; +use resource_control::ResourceGroupManager; use test_coprocessor::{DagSelect, Insert, ProductTable, Store}; use tidb_query_datatype::codec::Datum; use tikv::{ @@ -95,7 +96,10 @@ pub fn test_reschedule_coprocessor() { let mut req = DagSelect::from(&table).build(); let mut ctx = Context::default(); ctx.set_resource_group_tag(tag.as_bytes().to_vec()); + ctx.set_request_source("test".to_owned()); req.set_context(ctx); + fail::cfg("only_check_source_task_name", "return(test)").unwrap(); + defer!(fail::remove("only_check_source_task_name")); assert!( !block_on(endpoint.parse_and_handle_unary_request(req, None)) .consume() @@ -229,7 +233,7 @@ fn setup_test_suite() -> (TestSuite, Store, Endpoint) cm, test_suite.get_tag_factory(), Arc::new(QuotaLimiter::default()), - None, + 
Some(Arc::new(ResourceGroupManager::default())), ); (test_suite, store, endpoint) } From e01c97891e6520f48e93a507d21c1f2ae0915dbf Mon Sep 17 00:00:00 2001 From: qupeng Date: Mon, 25 Sep 2023 16:42:16 +0800 Subject: [PATCH 066/220] resolved-ts: speed up advancing when stores get partitioned (#15567) close tikv/tikv#15679 Signed-off-by: qupeng --- components/resolved_ts/src/advance.rs | 100 ++++++++++-------- .../resolved_ts/tests/integrations/mod.rs | 31 +++++- 2 files changed, 86 insertions(+), 45 deletions(-) diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 59478f5affb..dd6e9c2002c 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -167,10 +167,7 @@ pub struct LeadershipResolver { // store_id -> check leader request, record the request to each stores. store_req_map: HashMap, - // region_id -> region, cache the information of regions. - region_map: HashMap>, - // region_id -> peers id, record the responses. 
- resp_map: HashMap>, + progresses: HashMap, checking_regions: HashSet, valid_regions: HashSet, @@ -196,8 +193,7 @@ impl LeadershipResolver { region_read_progress, store_req_map: HashMap::default(), - region_map: HashMap::default(), - resp_map: HashMap::default(), + progresses: HashMap::default(), valid_regions: HashSet::default(), checking_regions: HashSet::default(), last_gc_time: Instant::now_coarse(), @@ -209,8 +205,7 @@ impl LeadershipResolver { let now = Instant::now_coarse(); if now - self.last_gc_time > self.gc_interval { self.store_req_map = HashMap::default(); - self.region_map = HashMap::default(); - self.resp_map = HashMap::default(); + self.progresses = HashMap::default(); self.valid_regions = HashSet::default(); self.checking_regions = HashSet::default(); self.last_gc_time = now; @@ -222,10 +217,7 @@ impl LeadershipResolver { v.regions.clear(); v.ts = 0; } - for v in self.region_map.values_mut() { - v.clear(); - } - for v in self.resp_map.values_mut() { + for v in self.progresses.values_mut() { v.clear(); } self.checking_regions.clear(); @@ -252,8 +244,7 @@ impl LeadershipResolver { let store_id = self.store_id; let valid_regions = &mut self.valid_regions; - let region_map = &mut self.region_map; - let resp_map = &mut self.resp_map; + let progresses = &mut self.progresses; let store_req_map = &mut self.store_req_map; let checking_regions = &mut self.checking_regions; for region_id in ®ions { @@ -275,13 +266,13 @@ impl LeadershipResolver { } let leader_info = core.get_leader_info(); + let prog = progresses + .entry(*region_id) + .or_insert_with(|| RegionProgress::new(peer_list.len())); let mut unvotes = 0; for peer in peer_list { if peer.store_id == store_id && peer.id == leader_id { - resp_map - .entry(*region_id) - .or_insert_with(|| Vec::with_capacity(peer_list.len())) - .push(store_id); + prog.resps.push(store_id); } else { // It's still necessary to check leader on learners even if they don't vote // because performing stale read on learners 
require it. @@ -299,15 +290,14 @@ impl LeadershipResolver { } } } + // Check `region_has_quorum` here because `store_map` can be empty, // in which case `region_has_quorum` won't be called any more. - if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { + if unvotes == 0 && region_has_quorum(peer_list, &prog.resps) { + prog.resolved = true; valid_regions.insert(*region_id); } else { - region_map - .entry(*region_id) - .or_insert_with(|| Vec::with_capacity(peer_list.len())) - .extend_from_slice(peer_list); + prog.peers.extend_from_slice(peer_list); } } }); @@ -321,7 +311,6 @@ impl LeadershipResolver { .values() .find(|req| !req.regions.is_empty()) .map_or(0, |req| req.regions[0].compute_size()); - let store_count = store_req_map.len(); let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); for (store_id, req) in store_req_map { if req.regions.is_empty() { @@ -387,6 +376,7 @@ impl LeadershipResolver { .with_label_values(&["all"]) .observe(start.saturating_elapsed_secs()); }); + let rpc_count = check_leader_rpcs.len(); for _ in 0..rpc_count { // Use `select_all` to avoid the process getting blocked when some @@ -396,10 +386,16 @@ impl LeadershipResolver { match res { Ok((to_store, resp)) => { for region_id in resp.regions { - resp_map - .entry(region_id) - .or_insert_with(|| Vec::with_capacity(store_count)) - .push(to_store); + if let Some(prog) = progresses.get_mut(®ion_id) { + if prog.resolved { + continue; + } + prog.resps.push(to_store); + if region_has_quorum(&prog.peers, &prog.resps) { + prog.resolved = true; + valid_regions.insert(region_id); + } + } } } Err((to_store, reconnect, err)) => { @@ -409,24 +405,19 @@ impl LeadershipResolver { } } } - } - for (region_id, prs) in region_map { - if prs.is_empty() { - // The peer had the leadership before, but now it's no longer - // the case. Skip checking the region. 
- continue; - } - if let Some(resp) = resp_map.get(region_id) { - if resp.is_empty() { - // No response, maybe the peer lost leadership. - continue; - } - if region_has_quorum(prs, resp) { - valid_regions.insert(*region_id); - } + if valid_regions.len() >= progresses.len() { + break; } } - self.valid_regions.drain().collect() + let res: Vec = self.valid_regions.drain().collect(); + if res.len() != checking_regions.len() { + warn!( + "check leader returns valid regions different from checking regions"; + "valid_regions" => res.len(), + "checking_regions" => checking_regions.len(), + ); + } + res } } @@ -552,6 +543,27 @@ async fn get_tikv_client( Ok(cli) } +struct RegionProgress { + resolved: bool, + peers: Vec, + resps: Vec, +} + +impl RegionProgress { + fn new(len: usize) -> Self { + RegionProgress { + resolved: false, + peers: Vec::with_capacity(len), + resps: Vec::with_capacity(len), + } + } + fn clear(&mut self) { + self.resolved = false; + self.peers.clear(); + self.resps.clear(); + } +} + #[cfg(test)] mod tests { use std::{ diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index 634aa66c601..881d0b299f1 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -9,9 +9,10 @@ use kvproto::{kvrpcpb::*, metapb::RegionEpoch}; use pd_client::PdClient; use resolved_ts::Task; use tempfile::Builder; -use test_raftstore::sleep_ms; +use test_raftstore::{sleep_ms, IsolationFilterFactory}; use test_sst_importer::*; pub use testsuite::*; +use tikv_util::store::new_peer; #[test] fn test_resolved_ts_basic() { @@ -231,3 +232,31 @@ fn test_scan_log_memory_quota_exceeded() { suite.stop(); } + +// This case checks resolved ts can still be advanced quickly even if some TiKV +// stores are partitioned. 
+#[test] +fn test_store_partitioned() { + let mut suite = TestSuite::new(3); + let r = suite.cluster.get_region(&[]); + suite.cluster.must_transfer_leader(r.id, new_peer(1, 1)); + suite.must_get_rts_ge(r.id, block_on(suite.cluster.pd_client.get_tso()).unwrap()); + + suite + .cluster + .add_send_filter(IsolationFilterFactory::new(3)); + let tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + for _ in 0..50 { + let rts = suite.region_resolved_ts(r.id).unwrap(); + if rts > tso { + if rts.physical() - tso.physical() < 3000 { + break; + } else { + panic!("resolved ts doesn't advance in time") + } + } + sleep_ms(100); + } + + suite.stop(); +} From b95f5cd0353506d728d0a50b7a898b503de072e1 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 25 Sep 2023 17:07:47 +0800 Subject: [PATCH 067/220] build: add missing failpoint feature for raft-engine (#15676) ref tikv/tikv#15462 Signed-off-by: glorv Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- tests/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c4c70e999be..81be4d36906 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ snmalloc = ["tikv_alloc/snmalloc"] portable = ["engine_rocks/portable"] sse = ["engine_rocks/sse"] mem-profiling = ["tikv_alloc/mem-profiling"] -failpoints = ["fail/failpoints", "raftstore/failpoints", "tikv_util/failpoints", "engine_rocks/failpoints"] +failpoints = ["fail/failpoints", "raftstore/failpoints", "tikv_util/failpoints", "engine_rocks/failpoints", "raft_log_engine/failpoints"] cloud-aws = ["encryption_export/cloud-aws", "sst_importer/cloud-aws"] cloud-gcp = ["encryption_export/cloud-gcp", "sst_importer/cloud-gcp"] cloud-azure = ["encryption_export/cloud-azure", "sst_importer/cloud-azure"] diff --git a/tests/Cargo.toml b/tests/Cargo.toml index f3928e97eb8..0081d5e95bc 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -41,7 +41,7 @@ path = 
"benches/deadlock_detector/mod.rs" [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] -failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints", "raft_log_engine/failpoints"] +failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] cloud-aws = ["external_storage_export/cloud-aws"] cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] From 8fb721ef18a9e1ba354e5a91d780ed6647641ab9 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 26 Sep 2023 07:01:45 +0800 Subject: [PATCH 068/220] raftstore-v2: adjust lockcf default write buffer size and limit (#15678) close tikv/tikv#15630 Signed-off-by: glorv Co-authored-by: tonyxuqqi --- src/config/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 8a2fa291ff1..63e36a543dc 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -110,7 +110,7 @@ const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.2; // Too large will increase Raft Engine memory usage. 
const WRITE_BUFFER_MEMORY_LIMIT_MAX: u64 = ReadableSize::gb(8).0; -const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(32); +const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(128); /// Configs that actually took effect in the last run pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; @@ -1403,7 +1403,7 @@ impl DbConfig { self.writecf.max_compactions.get_or_insert(1); self.lockcf .write_buffer_size - .get_or_insert(ReadableSize::mb(4)); + .get_or_insert(ReadableSize::mb(32)); self.lockcf .write_buffer_limit .get_or_insert(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT); From 312e0fb7f9f77e6002d0a336a58e84f3c4c12216 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 26 Sep 2023 13:22:46 +0800 Subject: [PATCH 069/220] *: Revert "*: update rust-toolchain (#15584)" (#15683) close tikv/tikv#15653 Signed-off-by: glorv Signed-off-by: tonyxuqqi Co-authored-by: tonyxuqqi --- Cargo.lock | 627 ++---------------- cmd/tikv-ctl/src/fork_readonly_tikv.rs | 1 - cmd/tikv-ctl/src/main.rs | 2 +- components/backup-stream/Cargo.toml | 2 +- components/backup-stream/src/errors.rs | 4 +- .../backup-stream/src/metadata/client.rs | 5 +- components/backup-stream/src/router.rs | 5 +- .../backup-stream/src/subscription_track.rs | 2 +- components/backup-stream/src/utils.rs | 4 +- components/backup/src/endpoint.rs | 6 +- components/batch-system/src/fsm.rs | 8 +- components/case_macros/src/lib.rs | 10 +- components/cdc/src/delegate.rs | 2 +- components/cdc/src/endpoint.rs | 6 +- .../concurrency_manager/src/lock_table.rs | 4 +- components/coprocessor_plugin_api/src/util.rs | 4 - components/encryption/src/config.rs | 9 +- components/engine_rocks/src/logger.rs | 2 + components/engine_rocks/src/properties.rs | 15 +- .../engine_tirocks/src/properties/mvcc.rs | 2 +- .../engine_tirocks/src/properties/range.rs | 10 +- components/engine_traits/src/flush.rs | 2 +- components/engine_traits/src/lib.rs | 4 +- components/engine_traits/src/tablet.rs | 2 +- 
.../online_config_derive/src/lib.rs | 14 +- components/raftstore-v2/src/batch/store.rs | 6 +- components/raftstore-v2/src/lib.rs | 1 - .../operation/command/admin/merge/prepare.rs | 4 +- .../src/operation/command/admin/split.rs | 4 +- .../command/admin/transfer_leader.rs | 20 +- components/raftstore-v2/src/operation/life.rs | 8 +- .../raftstore-v2/src/operation/query/local.rs | 4 +- .../src/operation/ready/apply_trace.rs | 2 +- .../src/operation/ready/snapshot.rs | 14 +- .../raftstore-v2/src/operation/txn_ext.rs | 4 +- .../src/operation/unsafe_recovery/demote.rs | 5 +- .../src/worker/cleanup/compact.rs | 16 +- .../raftstore-v2/src/worker/pd/region.rs | 15 +- .../raftstore-v2/src/worker/pd/split.rs | 6 +- components/raftstore-v2/src/worker/tablet.rs | 13 +- .../tests/integrations/cluster.rs | 4 +- .../raftstore/src/coprocessor/dispatcher.rs | 5 +- components/raftstore/src/errors.rs | 2 +- components/raftstore/src/lib.rs | 4 +- .../raftstore/src/store/async_io/write.rs | 6 +- .../raftstore/src/store/entry_storage.rs | 8 +- components/raftstore/src/store/fsm/apply.rs | 18 +- components/raftstore/src/store/fsm/peer.rs | 8 +- components/raftstore/src/store/msg.rs | 24 +- components/raftstore/src/store/peer.rs | 48 +- .../raftstore/src/store/peer_storage.rs | 2 +- .../raftstore/src/store/region_snapshot.rs | 6 +- .../raftstore/src/store/simple_write.rs | 24 +- components/raftstore/src/store/snap.rs | 4 +- components/raftstore/src/store/snap/io.rs | 4 +- components/raftstore/src/store/txn_ext.rs | 2 +- components/raftstore/src/store/util.rs | 3 +- components/raftstore/src/store/worker/pd.rs | 20 +- components/raftstore/src/store/worker/read.rs | 3 +- .../raftstore/src/store/worker/region.rs | 4 +- .../raftstore/src/store/worker/split_check.rs | 8 +- .../src/store/worker/split_controller.rs | 11 +- components/resolved_ts/src/cmd.rs | 6 +- components/resolved_ts/src/endpoint.rs | 20 +- components/resolved_ts/src/scanner.rs | 3 +- .../resource_control/src/resource_group.rs | 
6 +- components/resource_metering/src/lib.rs | 2 +- components/resource_metering/src/model.rs | 2 +- .../src/recorder/sub_recorder/cpu.rs | 4 +- .../resource_metering/tests/recorder_test.rs | 12 +- components/server/src/common.rs | 4 +- components/snap_recovery/src/leader_keeper.rs | 4 +- components/sst_importer/src/import_mode2.rs | 2 +- components/sst_importer/src/sst_importer.rs | 17 +- components/sst_importer/src/util.rs | 3 +- components/test_coprocessor/src/store.rs | 2 +- .../example_plugin/src/lib.rs | 2 +- components/test_pd/src/server.rs | 8 +- components/test_pd_client/src/pd.rs | 2 +- components/test_raftstore-v2/src/cluster.rs | 3 +- components/test_raftstore-v2/src/lib.rs | 2 - components/test_raftstore-v2/src/node.rs | 2 +- components/test_raftstore-v2/src/server.rs | 14 +- components/test_raftstore/src/lib.rs | 2 - components/test_raftstore/src/node.rs | 2 +- components/test_raftstore/src/server.rs | 8 +- .../tidb_query_codegen/src/rpn_function.rs | 35 +- .../src/codec/collation/mod.rs | 2 +- .../tidb_query_datatype/src/codec/convert.rs | 12 +- .../src/codec/data_type/mod.rs | 2 +- .../src/codec/data_type/scalar.rs | 17 +- .../tidb_query_datatype/src/codec/datum.rs | 8 +- .../src/codec/mysql/decimal.rs | 2 +- .../src/codec/mysql/duration.rs | 4 +- .../src/codec/mysql/json/comparison.rs | 4 +- .../src/codec/mysql/json/jcodec.rs | 8 +- .../src/codec/mysql/json/json_modify.rs | 2 +- .../src/codec/mysql/time/mod.rs | 13 +- .../src/codec/mysql/time/tz.rs | 4 - .../src/codec/row/v2/row_slice.rs | 2 +- .../tidb_query_datatype/src/codec/table.rs | 2 +- .../src/index_scan_executor.rs | 4 +- components/tidb_query_executors/src/runner.rs | 18 +- .../src/selection_executor.rs | 4 +- .../src/util/aggr_executor.rs | 4 +- .../tidb_query_executors/src/util/mod.rs | 4 +- components/tidb_query_expr/src/impl_cast.rs | 2 +- .../tidb_query_expr/src/impl_miscellaneous.rs | 5 +- components/tidb_query_expr/src/impl_string.rs | 6 +- components/tidb_query_expr/src/lib.rs | 2 
- .../tidb_query_expr/src/types/expr_eval.rs | 11 +- components/tikv_kv/src/cursor.rs | 2 +- components/tikv_kv/src/lib.rs | 1 - components/tikv_util/src/logger/formatter.rs | 6 +- components/tikv_util/src/lru.rs | 2 +- components/tikv_util/src/memory.rs | 2 +- .../src/metrics/allocator_metrics.rs | 2 +- components/tikv_util/src/mpsc/future.rs | 2 - components/tikv_util/src/sys/cpu_time.rs | 2 +- components/tikv_util/src/timer.rs | 4 +- components/txn_types/src/timestamp.rs | 10 +- components/txn_types/src/types.rs | 18 +- rust-toolchain | 2 +- src/config/mod.rs | 20 +- src/coprocessor/metrics.rs | 2 +- src/coprocessor/mod.rs | 2 - src/import/sst_service.rs | 6 +- src/lib.rs | 3 +- src/server/debug2.rs | 2 +- src/server/gc_worker/compaction_filter.rs | 1 - src/server/gc_worker/gc_manager.rs | 8 +- src/server/gc_worker/gc_worker.rs | 14 +- src/server/lock_manager/deadlock.rs | 9 +- src/server/raftkv/mod.rs | 5 +- src/server/raftkv2/mod.rs | 4 +- src/server/raftkv2/node.rs | 4 +- src/server/service/debug.rs | 1 + src/server/service/diagnostics/log.rs | 18 +- src/server/service/diagnostics/sys.rs | 2 +- src/server/service/kv.rs | 1 + src/storage/lock_manager/lock_wait_context.rs | 12 +- .../lock_manager/lock_waiting_queue.rs | 7 +- src/storage/metrics.rs | 2 +- src/storage/mod.rs | 32 +- src/storage/mvcc/reader/point_getter.rs | 2 +- src/storage/mvcc/reader/reader.rs | 21 +- src/storage/mvcc/reader/scanner/forward.rs | 4 +- src/storage/raw/raw_mvcc.rs | 2 +- src/storage/txn/actions/prewrite.rs | 2 + src/storage/txn/commands/atomic_store.rs | 4 +- src/storage/txn/commands/prewrite.rs | 26 +- src/storage/txn/latch.rs | 20 +- src/storage/txn/sched_pool.rs | 2 +- tests/Cargo.toml | 1 - .../benches/coprocessor_executors/util/mod.rs | 2 +- tests/benches/hierarchy/mvcc/mod.rs | 2 +- .../misc/coprocessor/codec/chunk/chunk.rs | 176 ----- .../misc/coprocessor/codec/chunk/mod.rs | 140 ---- tests/benches/misc/coprocessor/codec/mod.rs | 1 - tests/benches/misc/raftkv/mod.rs | 2 - 
tests/benches/raftstore/mod.rs | 2 +- tests/failpoints/cases/mod.rs | 3 - tests/failpoints/cases/test_disk_full.rs | 8 +- tests/failpoints/cases/test_engine.rs | 1 - tests/failpoints/cases/test_hibernate.rs | 1 - tests/failpoints/cases/test_merge.rs | 13 +- tests/failpoints/cases/test_pd_client.rs | 1 - .../failpoints/cases/test_pd_client_legacy.rs | 1 - tests/failpoints/cases/test_rawkv.rs | 2 +- .../cases/test_read_execution_tracker.rs | 11 +- tests/failpoints/cases/test_split_region.rs | 3 +- tests/failpoints/cases/test_storage.rs | 4 +- tests/failpoints/cases/test_transaction.rs | 2 +- .../failpoints/cases/test_transfer_leader.rs | 4 +- tests/integrations/backup/mod.rs | 1 - tests/integrations/import/test_apply_log.rs | 2 +- tests/integrations/mod.rs | 2 - .../integrations/raftstore/test_bootstrap.rs | 4 +- .../raftstore/test_compact_lock_cf.rs | 4 +- tests/integrations/raftstore/test_stats.rs | 1 - 180 files changed, 600 insertions(+), 1486 deletions(-) delete mode 100644 tests/benches/misc/coprocessor/codec/chunk/chunk.rs delete mode 100644 tests/benches/misc/coprocessor/codec/chunk/mod.rs diff --git a/Cargo.lock b/Cargo.lock index e9f937e3266..124a87f069e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" dependencies = [ - "getrandom 0.2.10", + "getrandom 0.2.3", "once_cell", "version_check 0.9.4", ] @@ -59,8 +59,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ "cfg-if 1.0.0", - "const-random", - "getrandom 0.2.10", "once_cell", "version_check 0.9.4", ] @@ -80,21 +78,6 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4f263788a35611fba42eb41ff811c5d0360c58b97402570312a350736e2542e" -[[package]] -name = "android-tzdata" -version 
= "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc 0.2.146", -] - [[package]] name = "ansi_term" version = "0.11.0" @@ -148,217 +131,6 @@ dependencies = [ "nodrop", ] -[[package]] -name = "arrow" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04a8801ebb147ad240b2d978d3ab9f73c9ccd4557ba6a03e7800496770ed10e0" -dependencies = [ - "ahash 0.8.3", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] - -[[package]] -name = "arrow-arith" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "895263144bd4a69751cbe6a34a53f26626e19770b313a9fa792c415cd0e78f11" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "half 2.3.1", - "num 0.4.1", -] - -[[package]] -name = "arrow-array" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "226fdc6c3a4ae154a74c24091d36a90b514f0ed7112f5b8322c1d8f354d8e20d" -dependencies = [ - "ahash 0.8.3", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "half 2.3.1", - "hashbrown 0.14.0", - "num 0.4.1", -] - -[[package]] -name = "arrow-buffer" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4843af4dd679c2f35b69c572874da8fde33be53eb549a5fb128e7a4b763510" -dependencies = [ - "bytes", - "half 2.3.1", - "num 0.4.1", -] - -[[package]] -name = "arrow-cast" -version = "46.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e8b9990733a9b635f656efda3c9b8308c7a19695c9ec2c7046dd154f9b144b" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "chrono", - "half 2.3.1", - "lexical-core", - "num 0.4.1", -] - -[[package]] -name = "arrow-csv" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "646fbb4e11dd0afb8083e883f53117713b8caadb4413b3c9e63e3f535da3683c" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "lazy_static", - "lexical-core", - "regex", -] - -[[package]] -name = "arrow-data" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da900f31ff01a0a84da0572209be72b2b6f980f3ea58803635de47913191c188" -dependencies = [ - "arrow-buffer", - "arrow-schema", - "half 2.3.1", - "num 0.4.1", -] - -[[package]] -name = "arrow-ipc" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2707a8d7ee2d345d045283ece3ae43416175873483e5d96319c929da542a0b1f" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "flatbuffers", -] - -[[package]] -name = "arrow-json" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d1b91a63c356d14eedc778b76d66a88f35ac8498426bb0799a769a49a74a8b4" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "half 2.3.1", - "indexmap 2.0.0", - "lexical-core", - "num 0.4.1", - "serde", - "serde_json", -] - -[[package]] -name = "arrow-ord" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "584325c91293abbca7aaaabf8da9fe303245d641f5f4a18a6058dc68009c7ebf" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - 
"arrow-schema", - "arrow-select", - "half 2.3.1", - "num 0.4.1", -] - -[[package]] -name = "arrow-row" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e32afc1329f7b372463b21c6ca502b07cf237e1ed420d87706c1770bb0ebd38" -dependencies = [ - "ahash 0.8.3", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half 2.3.1", - "hashbrown 0.14.0", -] - -[[package]] -name = "arrow-schema" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b104f5daa730f00fde22adc03a12aa5a2ae9ccbbf99cbd53d284119ddc90e03d" - -[[package]] -name = "arrow-select" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b3ca55356d1eae07cf48808d8c462cea674393ae6ad1e0b120f40b422eb2b4" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "num 0.4.1", -] - -[[package]] -name = "arrow-string" -version = "46.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af1433ce02590cae68da0a18ed3a3ed868ffac2c6f24c533ddd2067f7ee04b4a" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "num 0.4.1", - "regex", - "regex-syntax 0.7.5", -] - [[package]] name = "async-channel" version = "1.6.1" @@ -611,7 +383,7 @@ dependencies = [ "bytes", "dyn-clone", "futures 0.3.15", - "getrandom 0.2.10", + "getrandom 0.2.3", "http-types", "log", "paste", @@ -795,7 +567,7 @@ dependencies = [ "futures-io", "grpcio", "hex 0.4.2", - "indexmap 1.9.3", + "indexmap", "kvproto", "lazy_static", "log_wrappers", @@ -1011,9 +783,9 @@ checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" [[package]] name = "bytes" -version = "1.5.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = 
"b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" dependencies = [ "serde", ] @@ -1112,12 +884,11 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.83" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" dependencies = [ "jobserver", - "libc 0.2.146", ] [[package]] @@ -1189,17 +960,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "80094f509cf8b5ae86a4966a39b3ff66cd7e2a3e594accec3743ff3fabeab5b2" dependencies = [ - "android-tzdata", - "iana-time-zone", - "js-sys", + "num-integer", "num-traits", "serde", - "wasm-bindgen", - "windows-targets", + "time 0.1.42", ] [[package]] @@ -1247,7 +1015,7 @@ dependencies = [ "atty", "bitflags", "clap_derive", - "indexmap 1.9.3", + "indexmap", "lazy_static", "os_str_bytes", "strsim 0.10.0", @@ -1346,28 +1114,6 @@ dependencies = [ "cache-padded", ] -[[package]] -name = "const-random" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" -dependencies = [ - "const-random-macro", - "proc-macro-hack", -] - -[[package]] -name = "const-random-macro" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" -dependencies = [ - "getrandom 0.2.10", - "once_cell", - "proc-macro-hack", - "tiny-keccak", -] - [[package]] name = "const_format" version = "0.2.30" @@ -1409,9 +1155,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" 
+version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" [[package]] name = "cpu-time" @@ -1590,12 +1336,6 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - [[package]] name = "crypto-common" version = "0.1.6" @@ -2017,12 +1757,6 @@ dependencies = [ "termcolor", ] -[[package]] -name = "equivalent" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" - [[package]] name = "errno" version = "0.2.8" @@ -2303,16 +2037,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" -[[package]] -name = "flatbuffers" -version = "23.5.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" -dependencies = [ - "bitflags", - "rustc_version 0.4.0", -] - [[package]] name = "flate2" version = "1.0.11" @@ -2644,14 +2368,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "js-sys", "libc 0.2.146", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi 0.10.2+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2755,7 +2479,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap 1.9.3", + "indexmap", "slab", "tokio", 
"tokio-util", @@ -2768,22 +2492,11 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "half" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" -dependencies = [ - "cfg-if 1.0.0", - "crunchy", - "num-traits", -] - [[package]] name = "hashbrown" -version = "0.12.3" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "hashbrown" @@ -3003,29 +2716,6 @@ dependencies = [ "tokio-native-tls", ] -[[package]] -name = "iana-time-zone" -version = "0.1.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "wasm-bindgen", - "windows", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - [[package]] name = "ident_case" version = "1.0.1" @@ -3051,22 +2741,12 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" [[package]] name = "indexmap" -version = "1.9.3" +version = "1.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3" dependencies = [ "autocfg", - "hashbrown 0.12.3", -] - -[[package]] -name = "indexmap" -version = "2.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" -dependencies = [ - "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.9.1", ] [[package]] @@ -3083,7 +2763,7 @@ checksum = "16d4bde3a7105e59c66a4104cfe9606453af1c7a0eac78cb7d5bc263eb762a70" dependencies = [ "ahash 0.7.4", "atty", - "indexmap 1.9.3", + "indexmap", "itoa 1.0.1", "lazy_static", "log", @@ -3234,7 +2914,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d63b6407b66fc81fc539dccf3ddecb669f393c5101b6a2be3976c95099a06e8" dependencies = [ - "indexmap 1.9.3", + "indexmap", ] [[package]] @@ -3274,70 +2954,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" -[[package]] -name = "lexical-core" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92912c4af2e7d9075be3e5e3122c4d7263855fa6cce34fbece4dd08e5884624d" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-parse-float" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f518eed87c3be6debe6d26b855c97358d8a11bf05acec137e5f53080f5ad2dd8" -dependencies = [ - "lexical-parse-integer", - "lexical-util", - "static_assertions", -] - -[[package]] -name = "lexical-parse-integer" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc852ec67c6538bbb2b9911116a385b24510e879a69ab516e6a151b15a79168" -dependencies = [ - "lexical-util", - "static_assertions", -] - -[[package]] -name = "lexical-util" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c72a9d52c5c4e62fa2cdc2cb6c694a39ae1382d9c2a17a466f18e272a0930eb1" -dependencies = 
[ - "static_assertions", -] - -[[package]] -name = "lexical-write-float" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a89ec1d062e481210c309b672f73a0567b7855f21e7d2fae636df44d12e97f9" -dependencies = [ - "lexical-util", - "lexical-write-integer", - "static_assertions", -] - -[[package]] -name = "lexical-write-integer" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "094060bd2a7c2ff3a16d5304a6ae82727cb3cc9d1c70f813cc73f744c319337e" -dependencies = [ - "lexical-util", - "static_assertions", -] - [[package]] name = "libc" version = "0.1.12" @@ -3370,12 +2986,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "libm" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" - [[package]] name = "libmimalloc-sys" version = "0.1.21" @@ -3866,35 +3476,10 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab3e176191bc4faad357e3122c4747aa098ac880e88b168f106386128736cf4a" dependencies = [ - "num-complex 0.3.0", - "num-integer", - "num-iter", - "num-rational 0.3.0", - "num-traits", -] - -[[package]] -name = "num" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" -dependencies = [ - "num-bigint", - "num-complex 0.4.4", + "num-complex", "num-integer", "num-iter", - "num-rational 0.4.1", - "num-traits", -] - -[[package]] -name = "num-bigint" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" -dependencies = [ - "autocfg", - "num-integer", + "num-rational", "num-traits", ] @@ -3907,15 +3492,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-complex" -version = "0.4.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" -dependencies = [ - "num-traits", -] - [[package]] name = "num-derive" version = "0.3.0" @@ -3950,9 +3526,9 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" dependencies = [ "autocfg", "num-traits", @@ -3960,9 +3536,9 @@ dependencies = [ [[package]] name = "num-iter" -version = "0.1.43" +version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" dependencies = [ "autocfg", "num-integer", @@ -3980,26 +3556,13 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-rational" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" -dependencies = [ - "autocfg", - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -4029,7 +3592,7 @@ checksum = "80e47cfc4c0a1a519d9a025ebfbac3a2439d1b5cdf397d72dcb79b11d9920dab" dependencies = [ "base64 0.13.0", "chrono", - "getrandom 0.2.10", + "getrandom 0.2.3", "http", "rand 0.8.5", "serde", @@ -4320,7 +3883,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" dependencies = [ "fixedbitset", - "indexmap 1.9.3", + "indexmap", ] [[package]] @@ -5052,7 +4615,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" dependencies = [ - "getrandom 0.2.10", + "getrandom 0.2.3", ] [[package]] @@ -5145,19 +4708,19 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ - "getrandom 0.2.10", + "getrandom 0.2.3", "redox_syscall 0.2.11", ] [[package]] name = "regex" -version = "1.7.3" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +checksum = "d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] @@ -5171,15 +4734,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.29" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" [[package]] name = "remove_dir_all" @@ -5722,7 +5279,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622" dependencies = [ - "half 1.8.2", + "half", "serde", ] @@ -5752,7 +5309,7 @@ version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" dependencies = [ - "indexmap 1.9.3", + "indexmap", "itoa 0.4.4", "ryu", "serde", @@ -6684,7 +6241,6 @@ name = "tests" version = "0.0.1" dependencies = [ "api_version", - "arrow", "async-trait", "batch-system", "byteorder", @@ -6883,7 +6439,7 @@ dependencies = [ "log_wrappers", "match-template", "nom 7.1.0", - "num 0.3.0", + "num", "num-derive 0.3.0", "num-traits", "ordered-float", @@ -6945,7 +6501,7 @@ dependencies = [ "hex 0.4.2", "log_wrappers", "match-template", - "num 0.3.0", + "num", "num-traits", "openssl", "panic_hook", @@ -7362,15 +6918,6 @@ dependencies = [ "time-core", ] -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - [[package]] name = "tinytemplate" version = "1.2.0" @@ -7572,7 +7119,7 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", - "indexmap 1.9.3", + "indexmap", "pin-project", "pin-project-lite", "rand 0.8.5", @@ -7810,7 +7357,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" dependencies = [ - "getrandom 0.2.10", + "getrandom 0.2.3", "serde", ] @@ -7820,7 +7367,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" dependencies = [ - "getrandom 0.2.10", + "getrandom 0.2.3", ] [[package]] @@ -7896,6 +7443,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b89c3ce4ce14bdc6fb6beaf9ec7928ca331de5df7e5ea278375642a2f478570d" +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -8034,15 +7587,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" -dependencies = [ - "windows-targets", -] - [[package]] name = "windows-sys" version = "0.32.0" @@ -8062,42 +7606,21 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm 0.42.0", + "windows_aarch64_gnullvm", "windows_aarch64_msvc 0.42.0", "windows_i686_gnu 0.42.0", "windows_i686_msvc 0.42.0", "windows_x86_64_gnu 0.42.0", - "windows_x86_64_gnullvm 0.42.0", + "windows_x86_64_gnullvm", "windows_x86_64_msvc 0.42.0", ] -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", -] - [[package]] name = "windows_aarch64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_msvc" version = "0.32.0" @@ -8110,12 +7633,6 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_i686_gnu" version = "0.32.0" @@ -8128,12 +7645,6 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_msvc" version = "0.32.0" @@ -8146,12 +7657,6 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_x86_64_gnu" version = "0.32.0" @@ -8164,24 +7669,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnullvm" version = "0.42.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_msvc" version = "0.32.0" @@ -8194,12 +7687,6 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "winreg" version = "0.7.0" diff --git a/cmd/tikv-ctl/src/fork_readonly_tikv.rs b/cmd/tikv-ctl/src/fork_readonly_tikv.rs index d1a917f5624..ef3ae7f8023 100644 --- a/cmd/tikv-ctl/src/fork_readonly_tikv.rs +++ b/cmd/tikv-ctl/src/fork_readonly_tikv.rs @@ -265,7 +265,6 @@ where .map_err(|e| format!("copy({}, {}): {}", src.display(), dst.display(), e)) } -#[allow(clippy::permissions_set_readonly_false)] fn add_write_permission>(path: P) -> Result<(), String> { let path = path.as_ref(); let mut pmt = std::fs::metadata(path) diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index c1ab11cc507..6baa1fe6c39 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
+#![feature(once_cell)] #![feature(let_chains)] -#![feature(lazy_cell)] #[macro_use] extern crate log; diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 4f53c39b9db..8c1edc89a48 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -51,7 +51,7 @@ futures-io = "0.3" grpcio = { workspace = true } hex = "0.4" # Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95 -indexmap = "=1.9.3" +indexmap = "=1.6.2" kvproto = { workspace = true } lazy_static = "1.4" log_wrappers = { workspace = true } diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index cc720d5aecc..c3cc91da9ff 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -158,7 +158,7 @@ where /// Like `errors.Annotate` in Go. /// Wrap an unknown error with [`Error::Other`]. -#[macro_export] +#[macro_export(crate)] macro_rules! annotate { ($inner: expr, $message: expr) => { { @@ -242,7 +242,6 @@ mod test { #[bench] // 2,685 ns/iter (+/- 194) - #[allow(clippy::unnecessary_literal_unwrap)] fn contextual_add_format_strings_directly(b: &mut test::Bencher) { b.iter(|| { let err = Error::Io(io::Error::new( @@ -306,7 +305,6 @@ mod test { #[bench] // 773 ns/iter (+/- 8) - #[allow(clippy::unnecessary_literal_unwrap)] fn baseline(b: &mut test::Bencher) { b.iter(|| { let err = Error::Io(io::Error::new( diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index df8f0f025b1..1fdc1b3b1e8 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -663,10 +663,11 @@ impl MetadataClient { let cp = match r.len() { 0 => { let global_cp = self.global_checkpoint_of(task).await?; - match global_cp { + let cp = match global_cp { None => self.get_task_start_ts_checkpoint(task).await?, Some(cp) => cp, - } + }; + cp } _ => 
Checkpoint::from_kv(&r[0])?, }; diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index ae4b98b1687..1786d513dc8 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -731,7 +731,6 @@ impl TempFileKey { } } - #[allow(deprecated)] fn format_date_time(ts: u64, t: FormatType) -> impl Display { use chrono::prelude::*; let millis = TimeStamp::physical(ts.into()); @@ -956,9 +955,7 @@ impl StreamTaskInfo { .last_flush_time .swap(Box::into_raw(Box::new(Instant::now())), Ordering::SeqCst); // manual gc last instant - unsafe { - let _ = Box::from_raw(ptr); - } + unsafe { Box::from_raw(ptr) }; } pub fn should_flush(&self, flush_interval: &Duration) -> bool { diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 5a6b2e0753b..c70ad9c8038 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -307,7 +307,7 @@ impl SubscriptionTracer { } }; - let subscription = sub.value_mut(); + let mut subscription = sub.value_mut(); let old_epoch = subscription.meta.get_region_epoch(); let new_epoch = new_region.get_region_epoch(); diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 5e798a8428c..974b1762cf2 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -280,7 +280,7 @@ pub fn request_to_triple(mut req: Request) -> Either<(Vec, Vec, CfName), /// `try_send!(s: Scheduler, task: T)` tries to send a task to the scheduler, /// once meet an error, would report it, with the current file and line (so it /// is made as a macro). returns whether it success. -#[macro_export] +#[macro_export(crate)] macro_rules! try_send { ($s:expr, $task:expr) => { match $s.schedule($task) { @@ -304,7 +304,7 @@ macro_rules! try_send { /// `backup_stream_debug`. 
because once we enable debug log for all crates, it /// would soon get too verbose to read. using this macro now we can enable debug /// log level for the crate only (even compile time...). -#[macro_export] +#[macro_export(crate)] macro_rules! debug { ($($t: tt)+) => { if cfg!(feature = "backup-stream-debug") { diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index d6330f49966..a4efc162092 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -2493,8 +2493,8 @@ pub mod tests { fn test_backup_file_name() { let region = metapb::Region::default(); let store_id = 1; - let test_cases = ["s3", "local", "gcs", "azure", "hdfs"]; - let test_target = [ + let test_cases = vec!["s3", "local", "gcs", "azure", "hdfs"]; + let test_target = vec![ "1/0_0_000", "1/0_0_000", "1_0_0_000", @@ -2513,7 +2513,7 @@ pub mod tests { assert_eq!(target.to_string(), prefix_arr.join(delimiter)); } - let test_target = ["1/0_0", "1/0_0", "1_0_0", "1_0_0", "1_0_0"]; + let test_target = vec!["1/0_0", "1/0_0", "1_0_0", "1_0_0", "1_0_0"]; for (storage_name, target) in test_cases.iter().zip(test_target.iter()) { let key = None; let filename = backup_file_name(store_id, ®ion, key, storage_name); diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 16113dde8e2..3fa5ad15a64 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -149,9 +149,7 @@ impl FsmState { Ok(_) => return, Err(Self::NOTIFYSTATE_DROP) => { let ptr = self.data.swap(ptr::null_mut(), Ordering::AcqRel); - unsafe { - let _ = Box::from_raw(ptr); - } + unsafe { Box::from_raw(ptr) }; return; } Err(s) => s, @@ -181,9 +179,7 @@ impl Drop for FsmState { fn drop(&mut self) { let ptr = self.data.swap(ptr::null_mut(), Ordering::SeqCst); if !ptr.is_null() { - unsafe { - let _ = Box::from_raw(ptr); - } + unsafe { Box::from_raw(ptr) }; } self.state_cnt.fetch_sub(1, Ordering::Relaxed); } diff --git 
a/components/case_macros/src/lib.rs b/components/case_macros/src/lib.rs index b779373a59d..057b68065d2 100644 --- a/components/case_macros/src/lib.rs +++ b/components/case_macros/src/lib.rs @@ -5,12 +5,12 @@ use proc_macro::{Group, Literal, TokenStream, TokenTree}; macro_rules! transform_idents_in_stream_to_string { - ($stream:ident, $transform:ident) => { + ($stream:ident, $transform:expr) => { $stream .into_iter() .map(|token_tree| match token_tree { TokenTree::Ident(ref ident) => { - Literal::string(&$transform(&ident.to_string())).into() + Literal::string(&$transform(ident.to_string())).into() } // find all idents in `TokenGroup` apply and reconstruct the group TokenTree::Group(ref group) => TokenTree::Group(Group::new( @@ -20,7 +20,7 @@ macro_rules! transform_idents_in_stream_to_string { .into_iter() .map(|group_token_tree| { if let TokenTree::Ident(ref ident) = group_token_tree { - Literal::string(&$transform(&ident.to_string())).into() + Literal::string(&$transform(ident.to_string())).into() } else { group_token_tree } @@ -53,7 +53,7 @@ fn to_snake(s: &str) -> String { /// e.g. `HelloWorld` -> `hello-world` #[proc_macro] pub fn kebab_case(stream: TokenStream) -> TokenStream { - transform_idents_in_stream_to_string!(stream, to_kebab) + transform_idents_in_stream_to_string!(stream, |s: String| to_kebab(&s)) } /// Expands idents in the input stream as snake-case string literal @@ -61,5 +61,5 @@ pub fn kebab_case(stream: TokenStream) -> TokenStream { /// e.g. 
`HelloWorld` -> `hello_world` #[proc_macro] pub fn snake_case(stream: TokenStream) -> TokenStream { - transform_idents_in_stream_to_string!(stream, to_snake) + transform_idents_in_stream_to_string!(stream, |s: String| to_snake(&s)) } diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 18528fd08e9..c82c4cb6f13 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -1437,7 +1437,7 @@ mod tests { #[test] fn test_observed_range() { - for case in [ + for case in vec![ (b"".as_slice(), b"".as_slice(), false), (b"a", b"", false), (b"", b"b", false), diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 9d5601eba84..a5f00a08028 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1015,10 +1015,10 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint( - &self, + pub fn find_first<'m, T>( + &'m self, start_key: Option<&Key>, end_key: Option<&Key>, mut pred: impl FnMut(Arc) -> Option, diff --git a/components/coprocessor_plugin_api/src/util.rs b/components/coprocessor_plugin_api/src/util.rs index 06e8847402f..31d75610d75 100644 --- a/components/coprocessor_plugin_api/src/util.rs +++ b/components/coprocessor_plugin_api/src/util.rs @@ -19,14 +19,10 @@ pub type PluginConstructorSignature = /// Type signature of the exported function with symbol /// [`PLUGIN_GET_BUILD_INFO_SYMBOL`]. -// emit this warn because to fix it need to change the data type which is a breaking change. -#[allow(improper_ctypes_definitions)] pub type PluginGetBuildInfoSignature = extern "C" fn() -> BuildInfo; /// Type signature of the exported function with symbol /// [`PLUGIN_GET_PLUGIN_INFO_SYMBOL`]. -// emit this warn because to fix it need to change the data type which is a breaking change. 
-#[allow(improper_ctypes_definitions)] pub type PluginGetPluginInfoSignature = extern "C" fn() -> PluginInfo; /// Automatically collected build information about the plugin that is exposed diff --git a/components/encryption/src/config.rs b/components/encryption/src/config.rs index 4455e4ce7cc..23e049e0df4 100644 --- a/components/encryption/src/config.rs +++ b/components/encryption/src/config.rs @@ -134,12 +134,11 @@ impl KmsConfig { } } -#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "kebab-case", tag = "type")] pub enum MasterKeyConfig { // Store encryption metadata as plaintext. Data still get encrypted. Not allowed to use if // encryption is enabled. (i.e. when encryption_config.method != Plaintext). - #[default] Plaintext, // Pass master key from a file, with key encoded as a readable hex string. The file should end @@ -157,6 +156,12 @@ pub enum MasterKeyConfig { }, } +impl Default for MasterKeyConfig { + fn default() -> Self { + MasterKeyConfig::Plaintext + } +} + mod encryption_method_serde { use std::fmt; diff --git a/components/engine_rocks/src/logger.rs b/components/engine_rocks/src/logger.rs index 185411dcacf..85f4de713ac 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -3,6 +3,7 @@ use rocksdb::{DBInfoLogLevel as InfoLogLevel, Logger}; use tikv_util::{crit, debug, error, info, warn}; // TODO(yiwu): abstract the Logger interface. 
+#[derive(Default)] pub struct RocksdbLogger; impl Logger for RocksdbLogger { @@ -43,6 +44,7 @@ impl Logger for TabletLogger { } } +#[derive(Default)] pub struct RaftDbLogger; impl Logger for RaftDbLogger { diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 700d7621dc6..87ccab9e5ab 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -144,7 +144,10 @@ pub struct RangeProperties { impl RangeProperties { pub fn get(&self, key: &[u8]) -> &RangeOffsets { - let idx = self.offsets.binary_search_by_key(&key, |(k, _)| k).unwrap(); + let idx = self + .offsets + .binary_search_by_key(&key, |&(ref k, _)| k) + .unwrap(); &self.offsets[idx].1 } @@ -202,11 +205,11 @@ impl RangeProperties { if start == end { return (0, 0); } - let start_offset = match self.offsets.binary_search_by_key(&start, |(k, _)| k) { + let start_offset = match self.offsets.binary_search_by_key(&start, |&(ref k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; - let end_offset = match self.offsets.binary_search_by_key(&end, |(k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end, |&(ref k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; @@ -224,7 +227,7 @@ impl RangeProperties { ) -> Vec<(Vec, RangeOffsets)> { let start_offset = match self .offsets - .binary_search_by_key(&start_key, |(ref k, _)| k) + .binary_search_by_key(&start_key, |&(ref k, _)| k) { Ok(idx) => { if idx == self.offsets.len() - 1 { @@ -236,7 +239,7 @@ impl RangeProperties { Err(next_idx) => next_idx, }; - let end_offset = match self.offsets.binary_search_by_key(&end_key, |(ref k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end_key, |&(ref k, _)| k) { Ok(idx) => { if idx == 0 { return vec![]; @@ -866,7 +869,7 @@ mod tests { let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); b.iter(|| { - for (k, v) in &entries { + for 
&(ref k, ref v) in &entries { collector.add(k, v, DBEntryType::Put, 0, 0); } }); diff --git a/components/engine_tirocks/src/properties/mvcc.rs b/components/engine_tirocks/src/properties/mvcc.rs index 66c96284ea3..1ca170f33d5 100644 --- a/components/engine_tirocks/src/properties/mvcc.rs +++ b/components/engine_tirocks/src/properties/mvcc.rs @@ -356,7 +356,7 @@ mod tests { let mut collector = MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Txn); b.iter(|| { - for (k, v) in &entries { + for &(ref k, ref v) in &entries { collector.add(k, v, EntryType::kEntryPut, 0, 0).unwrap(); } }); diff --git a/components/engine_tirocks/src/properties/range.rs b/components/engine_tirocks/src/properties/range.rs index e8a3411b02f..59b9e68a6bb 100644 --- a/components/engine_tirocks/src/properties/range.rs +++ b/components/engine_tirocks/src/properties/range.rs @@ -53,7 +53,7 @@ impl RangeProperties { pub fn get(&self, key: &[u8]) -> &RangeOffsets { let idx = self .offsets - .binary_search_by_key(&key, |(k, _)| k) + .binary_search_by_key(&key, |&(ref k, _)| k) .unwrap(); &self.offsets[idx].1 } @@ -112,11 +112,11 @@ impl RangeProperties { if start == end { return (0, 0); } - let start_offset = match self.offsets.binary_search_by_key(&start, |(k, _)| k) { + let start_offset = match self.offsets.binary_search_by_key(&start, |&(ref k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; - let end_offset = match self.offsets.binary_search_by_key(&end, |(k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end, |&(ref k, _)| k) { Ok(idx) => Some(idx), Err(next_idx) => next_idx.checked_sub(1), }; @@ -134,7 +134,7 @@ impl RangeProperties { ) -> Vec<(Vec, RangeOffsets)> { let start_offset = match self .offsets - .binary_search_by_key(&start_key, |(k, _)| k) + .binary_search_by_key(&start_key, |&(ref k, _)| k) { Ok(idx) => { if idx == self.offsets.len() - 1 { @@ -146,7 +146,7 @@ impl RangeProperties { Err(next_idx) => 
next_idx, }; - let end_offset = match self.offsets.binary_search_by_key(&end_key, |(k, _)| k) { + let end_offset = match self.offsets.binary_search_by_key(&end_key, |&(ref k, _)| k) { Ok(idx) => { if idx == 0 { return vec![]; diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 6449399cef8..8590236e126 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -119,7 +119,7 @@ impl SstApplyState { for sst in ssts { let cf_index = data_cf_offset(sst.get_cf_name()); if let Some(metas) = sst_list.get_mut(cf_index) { - let _ = metas.extract_if(|entry| entry.sst.get_uuid() == sst.get_uuid()); + metas.drain_filter(|entry| entry.sst.get_uuid() == sst.get_uuid()); } } } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 0f89776e7fd..e09b1b52733 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -253,8 +253,8 @@ #![feature(assert_matches)] #![feature(linked_list_cursors)] #![feature(let_chains)] -#![feature(str_split_remainder)] -#![feature(extract_if)] +#![feature(str_split_as_str)] +#![feature(drain_filter)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 64e6dcbd4b4..c88f1548513 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -241,7 +241,7 @@ impl TabletRegistry { let mut parts = name.rsplit('_'); let suffix = parts.next()?.parse().ok()?; let id = parts.next()?.parse().ok()?; - let prefix = parts.remainder().unwrap_or(""); + let prefix = parts.as_str(); Some((prefix, id, suffix)) } diff --git a/components/online_config/online_config_derive/src/lib.rs b/components/online_config/online_config_derive/src/lib.rs index e48a540c6b8..bb37aad5924 100644 --- a/components/online_config/online_config_derive/src/lib.rs +++ 
b/components/online_config/online_config_derive/src/lib.rs @@ -330,11 +330,15 @@ fn is_option_type(ty: &Type) -> bool { // TODO store (with lazy static) the vec of string // TODO maybe optimization, reverse the order of segments fn extract_option_segment(path: &Path) -> Option<&PathSegment> { - let idents_of_path = path.segments.iter().fold(String::new(), |mut acc, v| { - acc.push_str(&v.ident.to_string()); - acc.push('|'); - acc - }); + let idents_of_path = path + .segments + .iter() + .into_iter() + .fold(String::new(), |mut acc, v| { + acc.push_str(&v.ident.to_string()); + acc.push('|'); + acc + }); vec!["Option|", "std|option|Option|", "core|option|Option|"] .into_iter() .find(|s| idents_of_path == *s) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 73b65bc0904..cd5ae8f42f7 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -488,11 +488,7 @@ impl StorePollerBuilder { self.remove_dir(&path)?; continue; } - let Some((prefix, region_id, tablet_index)) = - self.tablet_registry.parse_tablet_name(&path) - else { - continue; - }; + let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; if prefix == MERGE_SOURCE_PREFIX { continue; } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 697d0525169..5b5e132b9ce 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -27,7 +27,6 @@ #![feature(box_into_inner)] #![feature(assert_matches)] #![feature(option_get_or_insert_default)] -#![allow(clippy::needless_pass_by_ref_mut)] mod batch; mod bootstrap; diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index 76b71a8906c..d3d1896287c 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -343,9 +343,7 @@ impl Peer { entry.get_data(), entry.get_index(), entry.get_term(), - ) else { - continue; - }; + ) else { continue }; let cmd_type = cmd.get_admin_request().get_cmd_type(); match cmd_type { AdminCmdType::TransferLeader diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 2fe2b4b5735..0f9cae7218d 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -1098,9 +1098,7 @@ mod test { } } - let AdminCmdResult::SplitRegion(SplitResult { tablet, .. }) = apply_res else { - panic!() - }; + let AdminCmdResult::SplitRegion(SplitResult { tablet, .. }) = apply_res else { panic!() }; // update cache let mut cache = apply.tablet_registry().get(parent_id).unwrap(); cache.set(*tablet.downcast().unwrap()); diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index f60b9828bbb..4cdeba3bc41 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -50,21 +50,21 @@ impl Peer { /// to target follower first to ensures it's ready to become leader. /// After that the real transfer leader process begin. /// - /// 1. pre_transfer_leader on leader: Leader will send a MsgTransferLeader - /// to follower. - /// 2. execute_transfer_leader on follower If follower passes all necessary - /// checks, it will reply an ACK with type MsgTransferLeader and its - /// promised applied index. - /// 3. ready_to_transfer_leader on leader: Leader checks if it's appropriate - /// to transfer leadership. If it does, it calls raft transfer_leader API - /// to do the remaining work. + /// 1. 
pre_transfer_leader on leader: + /// Leader will send a MsgTransferLeader to follower. + /// 2. execute_transfer_leader on follower + /// If follower passes all necessary checks, it will reply an + /// ACK with type MsgTransferLeader and its promised applied index. + /// 3. ready_to_transfer_leader on leader: + /// Leader checks if it's appropriate to transfer leadership. If it + /// does, it calls raft transfer_leader API to do the remaining work. /// /// Additional steps when there are remaining pessimistic /// locks to propose (detected in function on_transfer_leader_msg). /// 1. Leader firstly proposes pessimistic locks and then proposes a /// TransferLeader command. - /// 2. The follower applies the TransferLeader command and replies an ACK - /// with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// 2. The follower applies the TransferLeader command and replies an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. /// /// See also: tikv/rfcs#37. pub fn propose_transfer_leader( diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 6b778ad6c4a..4d1a59de0a6 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -662,12 +662,8 @@ impl Peer { let check_peer_id = check.get_check_peer().get_id(); let records = self.storage().region_state().get_merged_records(); let Some(record) = records.iter().find(|r| { - r.get_source_peers() - .iter() - .any(|p| p.get_id() == check_peer_id) - }) else { - return; - }; + r.get_source_peers().iter().any(|p| p.get_id() == check_peer_id) + }) else { return }; let source_index = record.get_source_index(); forward_destroy_to_source_peer(msg, |m| { let source_checkpoint = super::merge_source_path( diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index fcc93636640..ea802650f3d 100644 --- 
a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -351,9 +351,7 @@ where match fut.await? { Some(query_res) => { if query_res.read().is_none() { - let QueryResult::Response(res) = query_res else { - unreachable!() - }; + let QueryResult::Response(res) = query_res else { unreachable!() }; // Get an error explicitly in header, // or leader reports KeyIsLocked error via read index. assert!( diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 2b6c9c666e6..af0257e763f 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -774,7 +774,7 @@ impl Peer { flushed = true; let flush_state = self.flush_state().clone(); - let apply_trace = self.storage_mut().apply_trace_mut(); + let mut apply_trace = self.storage_mut().apply_trace_mut(); let flushed_indexes = flush_state.as_ref().flushed_index(); for i in 0..flushed_indexes.len() { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 15caf5f0c84..9e0ed449cef 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -570,9 +570,10 @@ impl Storage { pub fn cancel_generating_snap_due_to_compacted(&self, compact_to: u64) { let mut states = self.snap_states.borrow_mut(); states.retain(|id, state| { - let SnapState::Generating { ref index, .. } = *state else { - return true; - }; + let SnapState::Generating { + ref index, + .. 
+ } = *state else { return true; }; let snap_index = index.load(Ordering::SeqCst); if snap_index == 0 || compact_to <= snap_index + 1 { return true; @@ -599,9 +600,10 @@ impl Storage { } let (mut snapshot, to_peer_id) = *res.unwrap(); if let Some(state) = self.snap_states.borrow_mut().get_mut(&to_peer_id) { - let SnapState::Generating { ref index, .. } = *state else { - return false; - }; + let SnapState::Generating { + ref index, + .. + } = *state else { return false }; if snapshot.get_metadata().get_index() < index.load(Ordering::SeqCst) { warn!( self.logger(), diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 6c3a9269a7f..272b2526b39 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -266,9 +266,7 @@ impl Peer { self.logger, "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else { - unreachable!() - }; + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; self.on_simple_write(ctx, write.header, write.data, write.ch); true } diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs index e7b3c8e62b8..37962a45452 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs @@ -100,10 +100,7 @@ impl Peer { failed_voters, target_index, demote_after_exit, - }) = self.unsafe_recovery_state() - else { - return; - }; + }) = self.unsafe_recovery_state() else { return }; if self.raft_group().raft.raft_log.applied < *target_index { return; diff --git a/components/raftstore-v2/src/worker/cleanup/compact.rs b/components/raftstore-v2/src/worker/cleanup/compact.rs index feb519a04ad..7acdb943b91 100644 --- 
a/components/raftstore-v2/src/worker/cleanup/compact.rs +++ b/components/raftstore-v2/src/worker/cleanup/compact.rs @@ -97,12 +97,8 @@ where ) { Ok(mut region_ids) => { for region_id in region_ids.drain(..) { - let Some(mut tablet_cache) = self.tablet_registry.get(region_id) else { - continue; - }; - let Some(tablet) = tablet_cache.latest() else { - continue; - }; + let Some(mut tablet_cache) = self.tablet_registry.get(region_id) else {continue}; + let Some(tablet) = tablet_cache.latest() else {continue}; for cf in &cf_names { if let Err(e) = tablet.compact_range_cf(cf, None, None, false, 1 /* threads */) @@ -147,12 +143,8 @@ fn collect_regions_to_compact( ); let mut regions_to_compact = vec![]; for id in region_ids { - let Some(mut tablet_cache) = reg.get(id) else { - continue; - }; - let Some(tablet) = tablet_cache.latest() else { - continue; - }; + let Some(mut tablet_cache) = reg.get(id) else {continue}; + let Some(tablet) = tablet_cache.latest() else {continue}; if tablet.auto_compactions_is_disabled().expect("cf") { info!( logger, diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index 999eccb4962..763e12fff07 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -113,7 +113,10 @@ where let approximate_keys = task.approximate_keys.unwrap_or_default(); let region_id = task.region.get_id(); - let peer_stat = self.region_peers.entry(region_id).or_default(); + let peer_stat = self + .region_peers + .entry(region_id) + .or_insert_with(PeerStat::default); peer_stat.approximate_size = approximate_size; peer_stat.approximate_keys = approximate_keys; @@ -370,7 +373,10 @@ where pub fn handle_update_read_stats(&mut self, mut stats: ReadStats) { for (region_id, region_info) in stats.region_infos.iter_mut() { - let peer_stat = self.region_peers.entry(*region_id).or_default(); + let peer_stat = self + .region_peers + .entry(*region_id) + 
.or_insert_with(PeerStat::default); peer_stat.read_bytes += region_info.flow.read_bytes as u64; peer_stat.read_keys += region_info.flow.read_keys as u64; self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; @@ -392,7 +398,10 @@ where pub fn handle_update_write_stats(&mut self, mut stats: WriteStats) { for (region_id, region_info) in stats.region_infos.iter_mut() { - let peer_stat = self.region_peers.entry(*region_id).or_default(); + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); peer_stat.query_stats.add_query_stats(®ion_info.0); self.store_stat .engine_total_query_num diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index 7bafb6c442a..7fec5a31bb6 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -142,10 +142,8 @@ where let f = async move { for split_info in split_infos { - let Ok(Some(region)) = pd_client.get_region_by_id(split_info.region_id).await - else { - continue; - }; + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; // Try to split the region with the given split key. 
if let Some(split_key) = split_info.split_key { Self::ask_batch_split_imp( diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs index ef9739226e7..206e87b3a8e 100644 --- a/components/raftstore-v2/src/worker/tablet.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -467,8 +467,7 @@ impl Runner { let Some(Some(tablet)) = self .tablet_registry .get(region_id) - .map(|mut cache| cache.latest().cloned()) - else { + .map(|mut cache| cache.latest().cloned()) else { warn!( self.logger, "flush memtable failed to acquire tablet"; @@ -556,15 +555,7 @@ impl Runner { } fn delete_range(&self, delete_range: Task) { - let Task::DeleteRange { - region_id, - tablet, - cf, - start_key, - end_key, - cb, - } = delete_range - else { + let Task::DeleteRange { region_id, tablet, cf, start_key, end_key, cb } = delete_range else { slog_panic!(self.logger, "unexpected task"; "task" => format!("{}", delete_range)) }; diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index a949725090d..5b3cc5feb93 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -137,9 +137,7 @@ impl TestRouter { match res { Ok(_) => return block_on(sub.result()).is_some(), Err(TrySendError::Disconnected(m)) => { - let PeerMsg::WaitFlush(ch) = m else { - unreachable!() - }; + let PeerMsg::WaitFlush(ch) = m else { unreachable!() }; match self .store_router() .send_control(StoreMsg::WaitFlush { region_id, ch }) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 756b7dc399e..d082013cd2c 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -471,7 +471,10 @@ impl CoprocessorHost { BoxSplitCheckObserver::new(KeysCheckObserver::new(ch)), ); registry.register_split_check_observer(100, 
BoxSplitCheckObserver::new(HalfCheckObserver)); - registry.register_split_check_observer(400, BoxSplitCheckObserver::new(TableCheckObserver)); + registry.register_split_check_observer( + 400, + BoxSplitCheckObserver::new(TableCheckObserver::default()), + ); registry.register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); CoprocessorHost { registry, cfg } } diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 6cf83a6cf84..d1597a77121 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -223,7 +223,7 @@ impl From for errorpb::Error { .mut_proposal_in_merging_mode() .set_region_id(region_id); } - Error::Transport(DiscardReason::Full) => { + Error::Transport(reason) if reason == DiscardReason::Full => { let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(RAFTSTORE_IS_BUSY.to_owned()); errorpb.set_server_is_busy(server_is_busy_err); diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 197eaefeac7..1db5f79d226 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -5,13 +5,11 @@ #![feature(div_duration)] #![feature(min_specialization)] #![feature(box_patterns)] -#![feature(hash_extract_if)] +#![feature(hash_drain_filter)] #![feature(let_chains)] #![feature(assert_matches)] #![feature(type_alias_impl_trait)] -#![feature(impl_trait_in_assoc_type)] #![recursion_limit = "256"] -#![allow(clippy::needless_pass_by_ref_mut)] #[cfg(test)] extern crate test; diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 12617bc28a2..eedd5052bbb 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -419,11 +419,7 @@ where } self.state_size = 0; if let ExtraBatchWrite::V2(_) = self.extra_batch_write { - let ExtraBatchWrite::V2(lb) = - mem::replace(&mut 
self.extra_batch_write, ExtraBatchWrite::None) - else { - unreachable!() - }; + let ExtraBatchWrite::V2(lb) = mem::replace(&mut self.extra_batch_write, ExtraBatchWrite::None) else { unreachable!() }; wb.merge(lb).unwrap(); } } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 95f099f77a7..c91c68538dd 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1338,14 +1338,14 @@ pub mod tests { // Test the initial data structure size. let (tx, rx) = mpsc::sync_channel(8); let mut cache = EntryCache::new_with_cb(move |c: i64| tx.send(c).unwrap()); - assert_eq!(rx.try_recv().unwrap(), 0); + assert_eq!(rx.try_recv().unwrap(), 896); cache.append( 0, 0, &[new_padded_entry(101, 1, 1), new_padded_entry(102, 1, 2)], ); - assert_eq!(rx.try_recv().unwrap(), 419); + assert_eq!(rx.try_recv().unwrap(), 3); cache.prepend(vec![new_padded_entry(100, 1, 1)]); assert_eq!(rx.try_recv().unwrap(), 1); @@ -1371,7 +1371,7 @@ pub mod tests { // Test trace a dangle entry. let cached_entries = CachedEntries::new(vec![new_padded_entry(100, 1, 1)]); cache.trace_cached_entries(cached_entries); - assert_eq!(rx.try_recv().unwrap(), 97); + assert_eq!(rx.try_recv().unwrap(), 1); // Test trace an entry which is still in cache. let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); @@ -1398,7 +1398,7 @@ pub mod tests { assert_eq!(rx.try_recv().unwrap(), -7); drop(cache); - assert_eq!(rx.try_recv().unwrap(), -512); + assert_eq!(rx.try_recv().unwrap(), -896); } #[test] diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 406c8d79d18..c170e5a35f9 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1262,9 +1262,9 @@ where apply_ctx.host.on_empty_cmd(&self.region, index, term); // 1. When a peer become leader, it will send an empty entry. - // 2. 
When a leader tries to read index during transferring leader, it will also - // propose an empty entry. But that entry will not contain any associated - // callback. So no need to clear callback. + // 2. When a leader tries to read index during transferring leader, + // it will also propose an empty entry. But that entry will not contain + // any associated callback. So no need to clear callback. while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { if let Some(cb) = cmd.cb.take() { apply_ctx @@ -4787,12 +4787,12 @@ where // command may not read the writes of previous commands and break ACID. If // it's still leader, there are two possibility that mailbox is closed: // 1. The process is shutting down. - // 2. The leader is destroyed. A leader won't propose to destroy itself, so it - // should either destroyed by older leaders or newer leaders. Leader won't - // respond to read until it has applied to current term, so no command will - // be proposed until command from older leaders have applied, which will then - // stop it from accepting proposals. If the command is proposed by new - // leader, then it won't be able to propose new proposals. + // 2. The leader is destroyed. A leader won't propose to destroy itself, so + // it should either destroyed by older leaders or newer leaders. Leader + // won't respond to read until it has applied to current term, so no + // command will be proposed until command from older leaders have applied, + // which will then stop it from accepting proposals. If the command is + // proposed by new leader, then it won't be able to propose new proposals. // So only shutdown needs to be checked here. if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { for p in apply.cbs.drain(..) 
{ diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 371e8cd8eb5..30ba0c3059d 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1015,10 +1015,10 @@ where // in snapshot recovery after we stopped all conf changes from PD. // if the follower slow than leader and has the pending conf change. // that's means - // 1. if the follower didn't finished the conf change => it cannot be chosen to - // be leader during recovery. - // 2. if the follower has been chosen to be leader => it already apply the - // pending conf change already. + // 1. if the follower didn't finished the conf change + // => it cannot be chosen to be leader during recovery. + // 2. if the follower has been chosen to be leader + // => it already apply the pending conf change already. return; } debug!( diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a858b5afddd..64c5be6d7e1 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -169,25 +169,19 @@ where } pub fn has_proposed_cb(&self) -> bool { - let Callback::Write { proposed_cb, .. } = self else { - return false; - }; + let Callback::Write { proposed_cb, .. } = self else { return false; }; proposed_cb.is_some() } pub fn invoke_proposed(&mut self) { - let Callback::Write { proposed_cb, .. } = self else { - return; - }; + let Callback::Write { proposed_cb, .. } = self else { return; }; if let Some(cb) = proposed_cb.take() { cb(); } } pub fn invoke_committed(&mut self) { - let Callback::Write { committed_cb, .. } = self else { - return; - }; + let Callback::Write { committed_cb, .. } = self else { return; }; if let Some(cb) = committed_cb.take() { cb(); } @@ -201,16 +195,12 @@ where } pub fn take_proposed_cb(&mut self) -> Option { - let Callback::Write { proposed_cb, .. } = self else { - return None; - }; + let Callback::Write { proposed_cb, .. 
} = self else { return None; }; proposed_cb.take() } pub fn take_committed_cb(&mut self) -> Option { - let Callback::Write { committed_cb, .. } = self else { - return None; - }; + let Callback::Write { committed_cb, .. } = self else { return None; }; committed_cb.take() } } @@ -268,9 +258,7 @@ impl ReadCallback for Callback { } fn read_tracker(&self) -> Option { - let Callback::Read { tracker, .. } = self else { - return None; - }; + let Callback::Read { tracker, .. } = self else { return None; }; Some(*tracker) } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index aafd2f9695b..8ef857bfa12 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2314,14 +2314,14 @@ where CheckApplyingSnapStatus::Applying => { // If this peer is applying snapshot, we should not get a new ready. // There are two reasons in my opinion: - // 1. If we handle a new ready and persist the data(e.g. entries), we can not - // tell raft-rs that this ready has been persisted because the ready need - // to be persisted one by one from raft-rs's view. - // 2. When this peer is applying snapshot, the response msg should not be sent - // to leader, thus the leader will not send new entries to this peer. - // Although it's possible a new leader may send a AppendEntries msg to this - // peer, this possibility is very low. In most cases, there is no msg need - // to be handled. + // 1. If we handle a new ready and persist the data(e.g. entries), + // we can not tell raft-rs that this ready has been persisted because + // the ready need to be persisted one by one from raft-rs's view. + // 2. When this peer is applying snapshot, the response msg should not + // be sent to leader, thus the leader will not send new entries to + // this peer. Although it's possible a new leader may send a AppendEntries + // msg to this peer, this possibility is very low. In most cases, there + // is no msg need to be handled. 
// So we choose to not get a new ready which makes the logic more clear. debug!( "still applying snapshot, skip further handling"; @@ -4467,25 +4467,27 @@ where /// to target follower first to ensures it's ready to become leader. /// After that the real transfer leader process begin. /// - /// 1. pre_transfer_leader on leader: Leader will send a MsgTransferLeader - /// to follower. - /// 2. pre_ack_transfer_leader_msg on follower: If follower passes all - /// necessary checks, it will try to warmup the entry cache. - /// 3. ack_transfer_leader_msg on follower: When the entry cache has been - /// warmed up or the operator is timeout, the follower reply an ACK with - /// type MsgTransferLeader and its promised persistent index. + /// 1. pre_transfer_leader on leader: + /// Leader will send a MsgTransferLeader to follower. + /// 2. pre_ack_transfer_leader_msg on follower: + /// If follower passes all necessary checks, it will try to warmup + /// the entry cache. + /// 3. ack_transfer_leader_msg on follower: + /// When the entry cache has been warmed up or the operator is timeout, + /// the follower reply an ACK with type MsgTransferLeader and + /// its promised persistent index. /// /// Additional steps when there are remaining pessimistic /// locks to propose (detected in function on_transfer_leader_msg). /// 1. Leader firstly proposes pessimistic locks and then proposes a /// TransferLeader command. - /// 2. ack_transfer_leader_msg on follower again: The follower applies - /// the TransferLeader command and replies an ACK with special context - /// TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// 2. ack_transfer_leader_msg on follower again: + /// The follower applies the TransferLeader command and replies an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. /// - /// 4. ready_to_transfer_leader on leader: Leader checks if it's appropriate - /// to transfer leadership. If it does, it calls raft transfer_leader API - /// to do the remaining work. + /// 4. 
ready_to_transfer_leader on leader: + /// Leader checks if it's appropriate to transfer leadership. If it + /// does, it calls raft transfer_leader API to do the remaining work. /// /// See also: tikv/rfcs#37. fn propose_transfer_leader( @@ -5818,7 +5820,7 @@ mod tests { admin_req.clear_transfer_leader(); req.clear_admin_request(); - for (op, policy) in [ + for (op, policy) in vec![ (CmdType::Get, RequestPolicy::ReadLocal), (CmdType::Snap, RequestPolicy::ReadLocal), (CmdType::Put, RequestPolicy::ProposeNormal), @@ -5971,7 +5973,7 @@ mod tests { // (1, 4) and (1, 5) is not committed let entries = vec![(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (2, 6), (2, 7)]; - let committed = [(1, 1), (1, 2), (1, 3), (2, 6), (2, 7)]; + let committed = vec![(1, 1), (1, 2), (1, 3), (2, 6), (2, 7)]; for (index, term) in entries.clone() { if term != 1 { continue; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 1556338e9c0..a888929ca98 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -96,7 +96,7 @@ impl PartialEq for SnapState { (&SnapState::Relax, &SnapState::Relax) | (&SnapState::ApplyAborted, &SnapState::ApplyAborted) | (&SnapState::Generating { .. }, &SnapState::Generating { .. 
}) => true, - (SnapState::Applying(b1), SnapState::Applying(b2)) => { + (&SnapState::Applying(ref b1), &SnapState::Applying(ref b2)) => { b1.load(Ordering::Relaxed) == b2.load(Ordering::Relaxed) } _ => false, diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 40168707f6a..bc22dfbf586 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -438,7 +438,7 @@ mod tests { (b"a9".to_vec(), b"v9".to_vec()), ]; - for (k, v) in &base_data { + for &(ref k, ref v) in &base_data { engines.kv.put(&data_key(k), v).unwrap(); } let store = new_peer_storage(engines, &r); @@ -482,11 +482,11 @@ mod tests { let mut data = vec![]; { let db = &engines.kv; - for (k, level) in &levels { + for &(ref k, level) in &levels { db.put(&data_key(k), k).unwrap(); db.flush_cfs(&[], true).unwrap(); data.push((k.to_vec(), k.to_vec())); - db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(*level)) + db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(level)) .unwrap(); } } diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index 1d8341c1c0b..a303a586935 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -579,17 +579,13 @@ mod tests { SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let write = decoder.next().unwrap(); - let SimpleWrite::Put(put) = write else { - panic!("should be put") - }; + let SimpleWrite::Put(put) = write else { panic!("should be put") }; assert_eq!(put.cf, CF_DEFAULT); assert_eq!(put.key, b"key"); assert_eq!(put.value, b""); let write = decoder.next().unwrap(); - let SimpleWrite::Delete(delete) = write else { - panic!("should be delete") - }; + let SimpleWrite::Delete(delete) = write else { panic!("should be delete") }; 
assert_eq!(delete.cf, CF_WRITE); assert_eq!(delete.key, &delete_key); assert_matches!(decoder.next(), None); @@ -597,18 +593,14 @@ mod tests { let (bytes, _) = req_encoder2.encode(); decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); - let SimpleWrite::DeleteRange(dr) = write else { - panic!("should be delete range") - }; + let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; assert_eq!(dr.cf, CF_LOCK); assert_eq!(dr.start_key, b"key"); assert_eq!(dr.end_key, b"key"); assert!(dr.notify_only); let write = decoder.next().unwrap(); - let SimpleWrite::DeleteRange(dr) = write else { - panic!("should be delete range") - }; + let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; assert_eq!(dr.cf, "cf"); assert_eq!(dr.start_key, b"key"); assert_eq!(dr.end_key, b"key"); @@ -634,9 +626,7 @@ mod tests { let mut decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); - let SimpleWrite::Ingest(ssts) = write else { - panic!("should be ingest") - }; + let SimpleWrite::Ingest(ssts) = write else { panic!("should be ingest") }; assert_eq!(exp, ssts); assert_matches!(decoder.next(), None); } @@ -725,9 +715,7 @@ mod tests { SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let req = decoder.next().unwrap(); - let SimpleWrite::Put(put) = req else { - panic!("should be put") - }; + let SimpleWrite::Put(put) = req else { panic!("should be put") }; assert_eq!(put.cf, CF_DEFAULT); assert_eq!(put.key, b"key"); assert_eq!(put.value, b""); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index dcb98dd9cb2..6fe21fe9750 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1323,7 +1323,7 @@ impl Write for Snapshot { } 
assert!(cf_file.size[self.cf_file_index] != 0); - let file_for_recving = cf_file + let mut file_for_recving = cf_file .file_for_recving .get_mut(self.cf_file_index) .unwrap(); @@ -2162,7 +2162,7 @@ impl TabletSnapManager { .stats .lock() .unwrap() - .extract_if(|_, (_, stat)| stat.get_region_id() > 0) + .drain_filter(|_, (_, stat)| stat.get_region_id() > 0) .map(|(_, (_, stat))| stat) .filter(|stat| stat.get_total_duration_sec() > 1) .collect(); diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 8fcaf826c6a..3cdee1e40f1 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -327,7 +327,7 @@ mod tests { for db_creater in db_creaters { let (_enc_dir, enc_opts) = gen_db_options_with_encryption("test_cf_build_and_apply_plain_files_enc"); - for db_opt in [None, Some(enc_opts)] { + for db_opt in vec![None, Some(enc_opts)] { let dir = Builder::new().prefix("test-snap-cf-db").tempdir().unwrap(); let db: KvTestEngine = db_creater(dir.path(), db_opt.clone(), None).unwrap(); // Collect keys via the key_callback into a collection. 
@@ -408,7 +408,7 @@ mod tests { for db_creater in db_creaters { let (_enc_dir, enc_opts) = gen_db_options_with_encryption("test_cf_build_and_apply_sst_files_enc"); - for db_opt in [None, Some(enc_opts)] { + for db_opt in vec![None, Some(enc_opts)] { let dir = Builder::new().prefix("test-snap-cf-db").tempdir().unwrap(); let db = db_creater(dir.path(), db_opt.clone(), None).unwrap(); let snap_cf_dir = Builder::new().prefix("test-snap-cf").tempdir().unwrap(); diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 9c73be2b9eb..0091fd4e7bb 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -244,7 +244,7 @@ impl PeerPessimisticLocks { // Locks that are marked deleted still need to be moved to the new regions, // and the deleted mark should also be cleared. // Refer to the comment in `PeerPessimisticLocks` for details. - let removed_locks = self.map.extract_if(|key, _| { + let removed_locks = self.map.drain_filter(|key, _| { let key = &**key.as_encoded(); let (start_key, end_key) = (derived.get_start_key(), derived.get_end_key()); key < start_key || (!end_key.is_empty() && key >= end_key) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index ed2c70822c9..3f34fe691ee 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -124,7 +124,8 @@ pub fn is_vote_msg(msg: &eraftpb::Message) -> bool { /// peer or not. // There could be two cases: // 1. Target peer already exists but has not established communication with leader yet -// 2. Target peer is added newly due to member change or region split, but it's not created yet +// 2. 
Target peer is added newly due to member change or region split, but it's not +// created yet // For both cases the region start key and end key are attached in RequestVote and // Heartbeat message for the store of that peer to check whether to create a new peer // when receiving these messages, or just to wait for a pending region split to perform diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 6aa192bd28e..606576b22e4 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1704,7 +1704,10 @@ where fn handle_read_stats(&mut self, mut read_stats: ReadStats) { for (region_id, region_info) in read_stats.region_infos.iter_mut() { - let peer_stat = self.region_peers.entry(*region_id).or_default(); + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); peer_stat.read_bytes += region_info.flow.read_bytes as u64; peer_stat.read_keys += region_info.flow.read_keys as u64; self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; @@ -1726,7 +1729,10 @@ where fn handle_write_stats(&mut self, mut write_stats: WriteStats) { for (region_id, region_info) in write_stats.region_infos.iter_mut() { - let peer_stat = self.region_peers.entry(*region_id).or_default(); + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); peer_stat.query_stats.add_query_stats(®ion_info.0); self.store_stat .engine_total_query_num @@ -2084,10 +2090,7 @@ where let f = async move { for split_info in split_infos { let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await - else { - continue; - }; + pd_client.get_region_by_id(split_info.region_id).await else { continue }; // Try to split the region with the given split key. 
if let Some(split_key) = split_info.split_key { Self::handle_ask_batch_split( @@ -2152,7 +2155,10 @@ where cpu_usage, ) = { let region_id = hb_task.region.get_id(); - let peer_stat = self.region_peers.entry(region_id).or_default(); + let peer_stat = self + .region_peers + .entry(region_id) + .or_insert_with(PeerStat::default); peer_stat.approximate_size = approximate_size; peer_stat.approximate_keys = approximate_keys; diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 5a6e641f5dc..5d6ede9c193 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -2155,12 +2155,11 @@ mod tests { let (notify_tx, notify_rx) = channel(); let (wait_spawn_tx, wait_spawn_rx) = channel(); let runtime = tokio::runtime::Runtime::new().unwrap(); - let handler = runtime.spawn(async move { + let _ = runtime.spawn(async move { wait_spawn_tx.send(()).unwrap(); notify.notified().await; notify_tx.send(()).unwrap(); }); - drop(handler); wait_spawn_rx.recv().unwrap(); thread::sleep(std::time::Duration::from_millis(500)); // Prevent lost notify. must_not_redirect(&mut reader, &rx, task); diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 7a675646f5c..068904b2a67 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -179,7 +179,7 @@ impl PendingDeleteRanges { ) -> Vec<(u64, Vec, Vec, u64)> { let ranges = self.find_overlap_ranges(start_key, end_key); - for (_, s_key, ..) in &ranges { + for &(_, ref s_key, ..) 
in &ranges { self.ranges.remove(s_key).unwrap(); } ranges @@ -1293,7 +1293,7 @@ pub(crate) mod tests { } }; - #[cfg(feature = "failpoints")] + #[allow(dead_code)] let must_not_finish = |ids: &[u64]| { for id in ids { let region_key = keys::region_state_key(*id); diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 468c06febd4..4ff853f70a0 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -64,14 +64,14 @@ impl KeyEntry { impl PartialOrd for KeyEntry { fn partial_cmp(&self, rhs: &KeyEntry) -> Option { - Some(self.cmp(rhs)) + // BinaryHeap is max heap, so we have to reverse order to get a min heap. + Some(self.key.cmp(&rhs.key).reverse()) } } impl Ord for KeyEntry { fn cmp(&self, rhs: &KeyEntry) -> Ordering { - // BinaryHeap is max heap, so we have to reverse order to get a min heap. - self.key.cmp(&rhs.key).reverse() + self.partial_cmp(rhs).unwrap() } } @@ -287,7 +287,7 @@ impl Runner { region: &Region, bucket_ranges: &Vec, ) { - for (bucket, bucket_range) in &mut buckets.iter_mut().zip(bucket_ranges) { + for (mut bucket, bucket_range) in &mut buckets.iter_mut().zip(bucket_ranges) { let mut bucket_region = region.clone(); bucket_region.set_start_key(bucket_range.0.clone()); bucket_region.set_end_key(bucket_range.1.clone()); diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 9cf534c62b0..4bbcc773763 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -178,7 +178,7 @@ impl Samples { // evaluate the samples according to the given key range, it will update the // sample's left, right and contained counter. 
fn evaluate(&mut self, key_range: &KeyRange) { - for sample in self.0.iter_mut() { + for mut sample in self.0.iter_mut() { let order_start = if key_range.start_key.is_empty() { Ordering::Greater } else { @@ -496,7 +496,10 @@ pub struct WriteStats { impl WriteStats { pub fn add_query_num(&mut self, region_id: u64, kind: QueryKind) { - let query_stats = self.region_infos.entry(region_id).or_default(); + let query_stats = self + .region_infos + .entry(region_id) + .or_insert_with(QueryStats::default); query_stats.add_query_num(kind, 1); } @@ -985,8 +988,8 @@ mod tests { #[test] fn test_prefix_sum() { - let v = [1, 2, 3, 4, 5, 6, 7, 8, 9]; - let expect = [1, 3, 6, 10, 15, 21, 28, 36, 45]; + let v = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + let expect = vec![1, 3, 6, 10, 15, 21, 28, 36, 45]; let pre = prefix_sum(v.iter(), |x| *x); for i in 0..v.len() { assert_eq!(expect[i], pre[i]); diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 328f725edaa..47d14304112 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -213,13 +213,13 @@ fn group_row_changes(requests: Vec) -> (HashMap, bool) CF_WRITE => { if let Ok(ts) = key.decode_ts() { let key = key.truncate_ts().unwrap(); - let row = changes.entry(key).or_default(); + let mut row = changes.entry(key).or_default(); assert!(row.write.is_none()); row.write = Some(KeyOp::Put(Some(ts), value)); } } CF_LOCK => { - let row = changes.entry(key).or_default(); + let mut row = changes.entry(key).or_default(); assert!(row.lock.is_none()); row.lock = Some(KeyOp::Put(None, value)); } @@ -239,7 +239,7 @@ fn group_row_changes(requests: Vec) -> (HashMap, bool) match delete.cf.as_str() { CF_LOCK => { let key = Key::from_encoded(delete.take_key()); - let row = changes.entry(key).or_default(); + let mut row = changes.entry(key).or_default(); row.lock = Some(KeyOp::Delete); } "" | CF_WRITE | CF_DEFAULT => {} diff --git a/components/resolved_ts/src/endpoint.rs 
b/components/resolved_ts/src/endpoint.rs index 406d931ed7f..9de21b27d9e 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -65,8 +65,7 @@ impl Drop for ResolverStatus { locks, memory_quota, .. - } = self - else { + } = self else { return; }; if locks.is_empty() { @@ -97,8 +96,7 @@ impl ResolverStatus { locks, memory_quota, .. - } = self - else { + } = self else { panic!("region {:?} resolver has ready", region_id) }; // Check if adding a new lock or unlock will exceed the memory @@ -112,7 +110,10 @@ impl ResolverStatus { } fn update_tracked_index(&mut self, index: u64, region_id: u64) { - let ResolverStatus::Pending { tracked_index, .. } = self else { + let ResolverStatus::Pending { + tracked_index, + .. + } = self else { panic!("region {:?} resolver has ready", region_id) }; assert!( @@ -134,8 +135,7 @@ impl ResolverStatus { memory_quota, tracked_index, .. - } = self - else { + } = self else { panic!("region {:?} resolver has ready", region_id) }; // Must take locks, otherwise it may double free memory quota on drop. @@ -687,7 +687,7 @@ where scanner_pool, scan_concurrency_semaphore, regions: HashMap::default(), - _phantom: PhantomData, + _phantom: PhantomData::default(), }; ep.handle_advance_resolved_ts(leader_resolver); ep @@ -870,6 +870,7 @@ where // Tracking or untracking locks with incoming commands that corresponding // observe id is valid. 
+ #[allow(clippy::drop_ref)] fn handle_change_log(&mut self, cmd_batch: Vec) { let size = cmd_batch.iter().map(|b| b.size()).sum::(); RTS_CHANNEL_PENDING_CMD_BYTES.sub(size as i64); @@ -883,6 +884,7 @@ where if observe_region.handle.id == observe_id { let logs = ChangeLog::encode_change_log(region_id, batch); if let Err(e) = observe_region.track_change_log(&logs) { + drop(observe_region); let backoff = match e { Error::MemoryQuotaExceeded(_) => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), Error::Other(_) => None, @@ -928,7 +930,7 @@ where } fn handle_advance_resolved_ts(&self, leader_resolver: LeadershipResolver) { - let regions = self.regions.keys().copied().collect(); + let regions = self.regions.keys().into_iter().copied().collect(); self.advance_worker.advance_ts_for_regions( regions, leader_resolver, diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index ad052338fa2..6c8c90dc38f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -100,7 +100,7 @@ impl, E: KvEngine> ScannerPool { Self { workers, cdc_handle, - _phantom: PhantomData, + _phantom: PhantomData::default(), } } @@ -168,7 +168,6 @@ impl, E: KvEngine> ScannerPool { self.workers.spawn(fut); } - #[allow(clippy::needless_pass_by_ref_mut)] async fn get_snapshot( task: &mut ScanTask, cdc_handle: T, diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 09e90e9dd01..a4b30e3d4ad 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -240,7 +240,7 @@ impl ResourceGroupManager { request_source: &str, ) -> Option> { fail_point!("only_check_source_task_name", |name| { - assert_eq!(name.clone().unwrap(), request_source.to_string()); + assert_eq!(&name.unwrap(), request_source); None }); if let Some(group) = self.resource_groups.get(rg) { @@ -311,8 +311,8 @@ pub struct ResourceController { // 1. 
the priority factor is calculate based on read/write RU settings. // 2. for read request, we increase a constant virtual time delta at each `get_priority` call // because the cost can't be calculated at start, so we only increase a constant delta and - // increase the real cost after task is executed; but don't increase it at write because the - // cost is known so we just pre-consume it. + // increase the real cost after task is executed; but don't increase it at write because + // the cost is known so we just pre-consume it. is_read: bool, // Track the maximum ru quota used to calculate the factor of each resource group. // factor = max_ru_quota / group_ru_quota * 10.0 diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index 7b437ea4303..ba8e2174e19 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -2,7 +2,7 @@ // TODO(mornyx): crate doc. -#![feature(hash_extract_if)] +#![feature(hash_drain_filter)] #![feature(core_intrinsics)] use std::{ diff --git a/components/resource_metering/src/model.rs b/components/resource_metering/src/model.rs index 03cd500eb2e..6f7118ef9e1 100644 --- a/components/resource_metering/src/model.rs +++ b/components/resource_metering/src/model.rs @@ -87,7 +87,7 @@ impl RawRecords { pdqselect::select_by(&mut buf, k, |a, b| b.cmp(a)); let kth = buf[k]; // Evict records with cpu time less or equal than `kth` - let evicted_records = self.records.extract_if(|_, r| r.cpu_time <= kth); + let evicted_records = self.records.drain_filter(|_, r| r.cpu_time <= kth); // Record evicted into others for (_, record) in evicted_records { others.merge(&record); diff --git a/components/resource_metering/src/recorder/sub_recorder/cpu.rs b/components/resource_metering/src/recorder/sub_recorder/cpu.rs index 08675bb6153..8c4053a80ab 100644 --- a/components/resource_metering/src/recorder/sub_recorder/cpu.rs +++ 
b/components/resource_metering/src/recorder/sub_recorder/cpu.rs @@ -9,7 +9,7 @@ use crate::{ localstorage::{LocalStorage, SharedTagInfos}, SubRecorder, }, - RawRecords, + RawRecord, RawRecords, }; /// An implementation of [SubRecorder] for collecting cpu statistics. @@ -37,7 +37,7 @@ impl SubRecorder for CpuRecorder { if *last_stat != cur_stat { let delta_ms = (cur_stat.total_cpu_time() - last_stat.total_cpu_time()) * 1_000.; - let record = records.entry(cur_tag).or_default(); + let record = records.entry(cur_tag).or_insert_with(RawRecord::default); record.cpu_time += delta_ms as u32; } thread_stat.stat = cur_stat; diff --git a/components/resource_metering/tests/recorder_test.rs b/components/resource_metering/tests/recorder_test.rs index 6e164b8e5e8..daa371e7477 100644 --- a/components/resource_metering/tests/recorder_test.rs +++ b/components/resource_metering/tests/recorder_test.rs @@ -55,7 +55,7 @@ mod tests { if let Some(tag) = self.current_ctx { self.records .entry(tag.as_bytes().to_vec()) - .or_default() + .or_insert_with(RawRecord::default) .cpu_time += ms; } self.ops.push(op); @@ -140,7 +140,7 @@ mod tests { if let Ok(mut r) = self.records.lock() { for (tag, record) in records.records.iter() { r.entry(tag.extra_attachment.to_vec()) - .or_default() + .or_insert_with(RawRecord::default) .merge(record); } } @@ -156,10 +156,10 @@ mod tests { let mut records = self.records.lock().unwrap(); for k in expected.keys() { - records.entry(k.clone()).or_default(); + records.entry(k.clone()).or_insert_with(RawRecord::default); } for k in records.keys() { - expected.entry(k.clone()).or_default(); + expected.entry(k.clone()).or_insert_with(RawRecord::default); } for (k, expected_value) in expected { let value = records.get(&k).unwrap(); @@ -324,10 +324,10 @@ mod tests { fn merge( maps: impl IntoIterator, RawRecord>>, ) -> HashMap, RawRecord> { - let mut map: HashMap, RawRecord> = HashMap::default(); + let mut map = HashMap::default(); for m in maps { for (k, v) in m { - 
map.entry(k).or_default().merge(&v); + map.entry(k).or_insert_with(RawRecord::default).merge(&v); } } map diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 43b0314cbbe..c8cf879d905 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -558,9 +558,7 @@ impl EnginesResourceInfo { }); for (_, cache) in cached_latest_tablets.iter_mut() { - let Some(tablet) = cache.latest() else { - continue; - }; + let Some(tablet) = cache.latest() else { continue }; for cf in DATA_CFS { fetch_engine_cf(tablet, cf); } diff --git a/components/snap_recovery/src/leader_keeper.rs b/components/snap_recovery/src/leader_keeper.rs index 48344fe5012..417d5becca3 100644 --- a/components/snap_recovery/src/leader_keeper.rs +++ b/components/snap_recovery/src/leader_keeper.rs @@ -206,7 +206,7 @@ mod test { #[test] fn test_basic() { - let leaders = [1, 2, 3]; + let leaders = vec![1, 2, 3]; let mut store = MockStore::default(); store.regions = leaders.iter().copied().collect(); let mut lk = LeaderKeeper::::new(store, leaders); @@ -217,7 +217,7 @@ mod test { #[test] fn test_failure() { - let leaders = [1, 2, 3]; + let leaders = vec![1, 2, 3]; let mut store = MockStore::default(); store.regions = leaders.iter().copied().collect(); let mut lk = LeaderKeeper::::new(store, vec![1, 2, 3, 4]); diff --git a/components/sst_importer/src/import_mode2.rs b/components/sst_importer/src/import_mode2.rs index 4db29c47a6f..70b7d7fac5e 100644 --- a/components/sst_importer/src/import_mode2.rs +++ b/components/sst_importer/src/import_mode2.rs @@ -139,7 +139,7 @@ impl ImportModeSwitcherV2 { pub fn ranges_in_import(&self) -> HashSet { let inner = self.inner.lock().unwrap(); - HashSet::from_iter(inner.import_mode_ranges.keys().cloned()) + HashSet::from_iter(inner.import_mode_ranges.keys().into_iter().cloned()) } } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 910cfa602dd..5530862e6a3 100644 --- 
a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -384,8 +384,8 @@ impl SstImporter { // This method is blocking. It performs the following transformations before // writing to disk: // - // 1. only KV pairs in the *inclusive* range (`[start, end]`) are used. (set - // the range to `["", ""]` to import everything). + // 1. only KV pairs in the *inclusive* range (`[start, end]`) are used. + // (set the range to `["", ""]` to import everything). // 2. keys are rewritten according to the given rewrite rule. // // Both the range and rewrite keys are specified using origin keys. However, @@ -1558,7 +1558,7 @@ mod tests { let env = get_env(key_manager.clone(), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), &[CF_DEFAULT], env); - let cases = [(0, 10), (5, 15), (10, 20), (0, 100)]; + let cases = vec![(0, 10), (5, 15), (10, 20), (0, 100)]; let mut ingested = Vec::new(); @@ -2072,10 +2072,13 @@ mod tests { false, ) .unwrap(); - let ext_storage = importer.wrap_kms( - importer.external_storage_or_cache(&backend, "").unwrap(), - false, - ); + let ext_storage = { + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); + inner + }; // test do_read_kv_file() let output = block_on_external_io(importer.do_read_kv_file( diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index 654971b0d41..ff7526172d5 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -97,8 +97,7 @@ pub fn copy_sst_for_ingestion, Q: AsRef>( let mut pmts = file_system::metadata(clone)?.permissions(); if pmts.readonly() { - use std::os::unix::fs::PermissionsExt; - pmts.set_mode(0o644); + pmts.set_readonly(false); file_system::set_permissions(clone, pmts)?; } diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index 6763ea7bb1a..96f405d8f39 100644 
--- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -203,7 +203,7 @@ impl Store { } pub fn put(&mut self, ctx: Context, mut kv: Vec<(Vec, Vec)>) { - self.handles.extend(kv.iter().map(|(k, _)| k.clone())); + self.handles.extend(kv.iter().map(|&(ref k, _)| k.clone())); let pk = kv[0].0.clone(); let kv = kv .drain(..) diff --git a/components/test_coprocessor_plugin/example_plugin/src/lib.rs b/components/test_coprocessor_plugin/example_plugin/src/lib.rs index d383797c069..afcaa4962b9 100644 --- a/components/test_coprocessor_plugin/example_plugin/src/lib.rs +++ b/components/test_coprocessor_plugin/example_plugin/src/lib.rs @@ -18,4 +18,4 @@ impl CoprocessorPlugin for ExamplePlugin { } } -declare_plugin!(ExamplePlugin); +declare_plugin!(ExamplePlugin::default()); diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 02833e030eb..90a420fbba0 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -128,8 +128,12 @@ impl Server { } #[allow(unused_mut)] -fn hijack_unary(mock: &PdMock, ctx: RpcContext<'_>, sink: UnarySink, f: F) -where +fn hijack_unary( + mock: &mut PdMock, + ctx: RpcContext<'_>, + sink: UnarySink, + f: F, +) where R: Send + 'static, F: Fn(&dyn PdMocker) -> Option>, { diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 58df5998758..c81230f6a16 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -1438,7 +1438,7 @@ impl TestPdClient { pub fn switch_replication_mode(&self, state: DrAutoSyncState, available_stores: Vec) { let mut cluster = self.cluster.wl(); let status = cluster.replication_status.as_mut().unwrap(); - let dr = status.mut_dr_auto_sync(); + let mut dr = status.mut_dr_auto_sync(); dr.state_id += 1; dr.set_state(state); dr.available_stores = available_stores; diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs 
index 346813e7d1f..8ede3290167 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -220,7 +220,7 @@ pub trait Simulator { None => { error!("call_query_on_node receives none response"; "request" => ?request); // Do not unwrap here, sometimes raftstore v2 may return none. - Err(box_err!("receives none response {:?}", request)) + return Err(box_err!("receives none response {:?}", request)); } } } @@ -1612,7 +1612,6 @@ impl, EK: KvEngine> Cluster { ) } - #[allow(clippy::let_underscore_future)] pub fn merge_region(&mut self, source: u64, target: u64, _cb: Callback) { // FIXME: callback is ignored. let mut req = self.new_prepare_merge(source, target); diff --git a/components/test_raftstore-v2/src/lib.rs b/components/test_raftstore-v2/src/lib.rs index 45642df1e7f..685affe45d0 100644 --- a/components/test_raftstore-v2/src/lib.rs +++ b/components/test_raftstore-v2/src/lib.rs @@ -3,8 +3,6 @@ #![feature(type_alias_impl_trait)] #![feature(return_position_impl_trait_in_trait)] #![feature(let_chains)] -#![allow(clippy::needless_pass_by_ref_mut)] -#![allow(clippy::arc_with_non_send_sync)] mod cluster; mod node; diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 70b6ccb1407..d63ca0aa2f2 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -258,7 +258,7 @@ impl Simulator for NodeCluster { ) } else { let trans = self.trans.core.lock().unwrap(); - let (snap_mgr, _) = &trans.snap_paths[&node_id]; + let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; (snap_mgr.clone(), None) }; self.snap_mgrs.insert(node_id, snap_mgr.clone()); diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index a7d64591fe1..7b5d501a59f 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -1006,18 +1006,7 @@ pub fn must_new_cluster_and_kv_client_mul( 
TikvClient, Context, ) { - must_new_cluster_with_cfg_and_kv_client_mul(count, |_| {}) -} - -pub fn must_new_cluster_with_cfg_and_kv_client_mul( - count: usize, - configure: impl FnMut(&mut Cluster, RocksEngine>), -) -> ( - Cluster, RocksEngine>, - TikvClient, - Context, -) { - let (cluster, leader, ctx) = must_new_and_configure_cluster_mul(count, configure); + let (cluster, leader, ctx) = must_new_cluster_mul(count); let env = Arc::new(Environment::new(1)); let channel = @@ -1026,7 +1015,6 @@ pub fn must_new_cluster_with_cfg_and_kv_client_mul( (cluster, client, ctx) } - pub fn must_new_cluster_mul( count: usize, ) -> ( diff --git a/components/test_raftstore/src/lib.rs b/components/test_raftstore/src/lib.rs index 6f48c17190a..04dfbd24de1 100644 --- a/components/test_raftstore/src/lib.rs +++ b/components/test_raftstore/src/lib.rs @@ -1,8 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. #![feature(let_chains)] -#![allow(clippy::needless_pass_by_ref_mut)] -#![allow(clippy::arc_with_non_send_sync)] #[macro_use] extern crate lazy_static; diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 8a9969c1913..f429f27ff8b 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -281,7 +281,7 @@ impl Simulator for NodeCluster { (snap_mgr, Some(tmp)) } else { let trans = self.trans.core.lock().unwrap(); - let (snap_mgr, _) = &trans.snap_paths[&node_id]; + let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; (snap_mgr.clone(), None) }; diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 0df44b4e784..8d26bae968d 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -918,14 +918,8 @@ pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, pub fn must_new_cluster_and_kv_client_mul( count: usize, ) -> (Cluster, TikvClient, Context) { - 
must_new_cluster_with_cfg_and_kv_client_mul(count, |_| {}) -} + let (cluster, leader, ctx) = must_new_cluster_mul(count); -pub fn must_new_cluster_with_cfg_and_kv_client_mul( - count: usize, - configure: impl FnMut(&mut Cluster), -) -> (Cluster, TikvClient, Context) { - let (cluster, leader, ctx) = must_new_and_configure_cluster_mul(count, configure); let env = Arc::new(Environment::new(1)); let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); diff --git a/components/tidb_query_codegen/src/rpn_function.rs b/components/tidb_query_codegen/src/rpn_function.rs index ea3017d5d02..33976939c83 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -1739,24 +1739,27 @@ mod tests_normal { /// Compare TokenStream with all white chars trimmed. fn assert_token_stream_equal(l: TokenStream, r: TokenStream) { - let result = l.clone().into_iter().eq_by(r.clone(), |x, y| match x { - TokenTree::Ident(x) => matches!(y, TokenTree::Ident(y) if x == y), - TokenTree::Literal(x) => { - matches!(y, TokenTree::Literal(y) if x.to_string() == y.to_string()) - } - TokenTree::Punct(x) => { - matches!(y, TokenTree::Punct(y) if x.to_string() == y.to_string()) - } - TokenTree::Group(x) => { - if let TokenTree::Group(y) = y { - assert_token_stream_equal(x.stream(), y.stream()); + let result = l + .clone() + .into_iter() + .eq_by(r.clone().into_iter(), |x, y| match x { + TokenTree::Ident(x) => matches!(y, TokenTree::Ident(y) if x == y), + TokenTree::Literal(x) => { + matches!(y, TokenTree::Literal(y) if x.to_string() == y.to_string()) + } + TokenTree::Punct(x) => { + matches!(y, TokenTree::Punct(y) if x.to_string() == y.to_string()) + } + TokenTree::Group(x) => { + if let TokenTree::Group(y) = y { + assert_token_stream_equal(x.stream(), y.stream()); - true - } else { - false + true + } else { + false + } } - } - }); + }); assert!(result, "expect: {:#?}, actual: {:#?}", &l, &r); } diff 
--git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index 738e0020de7..22127e62f49 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -251,7 +251,7 @@ where { #[inline] fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + C::sort_compare(self.inner.as_ref(), other.inner.as_ref()).ok() } } diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index d2bbee78078..418841547ca 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -574,13 +574,13 @@ pub fn bytes_to_int_without_context(bytes: &[u8]) -> Result { if let Some(&c) = trimed.next() { if c == b'-' { negative = true; - } else if c.is_ascii_digit() { + } else if (b'0'..=b'9').contains(&c) { r = Some(i64::from(c) - i64::from(b'0')); } else if c != b'+' { return Ok(0); } - for c in trimed.take_while(|&c| c.is_ascii_digit()) { + for c in trimed.take_while(|&c| (b'0'..=b'9').contains(c)) { let cur = i64::from(*c - b'0'); r = r.and_then(|r| r.checked_mul(10)).and_then(|r| { if negative { @@ -605,13 +605,13 @@ pub fn bytes_to_uint_without_context(bytes: &[u8]) -> Result { let mut trimed = bytes.iter().skip_while(|&&b| b == b' ' || b == b'\t'); let mut r = Some(0u64); if let Some(&c) = trimed.next() { - if c.is_ascii_digit() { + if (b'0'..=b'9').contains(&c) { r = Some(u64::from(c) - u64::from(b'0')); } else if c != b'+' { return Ok(0); } - for c in trimed.take_while(|&c| c.is_ascii_digit()) { + for c in trimed.take_while(|&c| (b'0'..=b'9').contains(c)) { r = r .and_then(|r| r.checked_mul(10)) .and_then(|r| r.checked_add(u64::from(*c - b'0'))); @@ -856,7 +856,7 @@ pub fn get_valid_int_prefix_helper<'a>( if (c == '+' || c == '-') && i == 0 { continue; } - if c.is_ascii_digit() { + if ('0'..='9').contains(&c) 
{ valid_len = i + 1; continue; } @@ -917,7 +917,7 @@ pub fn get_valid_float_prefix_helper<'a>( break; } e_idx = i - } else if !c.is_ascii_digit() { + } else if !('0'..='9').contains(&c) { break; } else { saw_digit = true; diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index b464b1119c8..8ca36790824 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -248,7 +248,7 @@ macro_rules! impl_evaluable_type { } #[inline] - fn borrow_scalar_value_ref(v: ScalarValueRef<'_>) -> Option<&Self> { + fn borrow_scalar_value_ref<'a>(v: ScalarValueRef<'a>) -> Option<&'a Self> { match v { ScalarValueRef::$ty(x) => x, other => panic!( diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index ff66ddc42ee..c74423107e4 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -467,23 +467,24 @@ impl<'a> ScalarValueRef<'a> { impl<'a> Ord for ScalarValueRef<'a> { fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other) + .expect("Cannot compare two ScalarValueRef in different type") + } +} + +impl<'a> PartialOrd for ScalarValueRef<'a> { + fn partial_cmp(&self, other: &Self) -> Option { match_template_evaltype! { TT, match (self, other) { // v1 and v2 are `Option`. However, in MySQL NULL values are considered lower // than any non-NULL value, so using `Option::PartialOrd` directly is fine. 
- (ScalarValueRef::TT(v1), ScalarValueRef::TT(v2)) => v1.cmp(v2), - _ => panic!("Cannot compare two ScalarValueRef in different type"), + (ScalarValueRef::TT(v1), ScalarValueRef::TT(v2)) => Some(v1.cmp(v2)), + _ => None, } } } } -impl<'a> PartialOrd for ScalarValueRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - impl<'a> PartialEq for ScalarValueRef<'a> { fn eq(&self, other: &ScalarValue) -> bool { self == &other.as_scalar_value_ref() diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index f91d204b3b0..dde98003475 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -668,7 +668,7 @@ impl Datum { Datum::F64(res) } } - (Datum::Dec(l), Datum::Dec(r)) => { + (&Datum::Dec(ref l), &Datum::Dec(ref r)) => { let dec: Result = (l + r).into(); return dec.map(Datum::Dec); } @@ -700,7 +700,7 @@ impl Datum { } (&Datum::U64(l), &Datum::U64(r)) => l.checked_sub(r).into(), (&Datum::F64(l), &Datum::F64(r)) => return Ok(Datum::F64(l - r)), - (Datum::Dec(l), Datum::Dec(r)) => { + (&Datum::Dec(ref l), &Datum::Dec(ref r)) => { let dec: Result = (l - r).into(); return dec.map(Datum::Dec); } @@ -724,7 +724,7 @@ impl Datum { } (&Datum::U64(l), &Datum::U64(r)) => l.checked_mul(r).into(), (&Datum::F64(l), &Datum::F64(r)) => return Ok(Datum::F64(l * r)), - (Datum::Dec(l), Datum::Dec(r)) => return Ok(Datum::Dec((l * r).unwrap())), + (&Datum::Dec(ref l), &Datum::Dec(ref r)) => return Ok(Datum::Dec((l * r).unwrap())), (l, r) => return Err(invalid_type!("{} can't multiply {}", l, r)), }; @@ -1179,7 +1179,7 @@ mod tests { | (&Datum::Null, &Datum::Null) | (&Datum::Time(_), &Datum::Time(_)) | (&Datum::Json(_), &Datum::Json(_)) => true, - (Datum::Dec(d1), Datum::Dec(d2)) => d1.prec_and_frac() == d2.prec_and_frac(), + (&Datum::Dec(ref d1), &Datum::Dec(ref d2)) => d1.prec_and_frac() == d2.prec_and_frac(), _ => false, } } diff 
--git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 8853a1d6a16..143ec6c7760 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -1872,7 +1872,7 @@ impl<'a> ConvertTo for JsonRef<'a> { fn first_non_digit(bs: &[u8], start_idx: usize) -> usize { bs.iter() .skip(start_idx) - .position(|c| !c.is_ascii_digit()) + .position(|c| !(b'0'..=b'9').contains(c)) .map_or_else(|| bs.len(), |s| s + start_idx) } diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 4b735977712..7279f788146 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -629,14 +629,14 @@ impl Eq for Duration {} impl PartialOrd for Duration { #[inline] fn partial_cmp(&self, rhs: &Duration) -> Option { - Some(self.cmp(rhs)) + self.nanos.partial_cmp(&rhs.nanos) } } impl Ord for Duration { #[inline] fn cmp(&self, rhs: &Duration) -> Ordering { - self.nanos.partial_cmp(&rhs.nanos).unwrap() + self.partial_cmp(rhs).unwrap() } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs index 73e04885890..d9104385bc6 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs @@ -77,8 +77,6 @@ impl<'a> PartialEq for JsonRef<'a> { .map_or(false, |r| r == Ordering::Equal) } } - -#[allow(clippy::incorrect_partial_ord_impl_on_ord_type)] impl<'a> PartialOrd for JsonRef<'a> { // See `CompareBinary` in TiDB `types/json/binary_functions.go` fn partial_cmp(&self, right: &JsonRef<'_>) -> Option { @@ -199,7 +197,7 @@ impl PartialEq for Json { impl PartialOrd for Json { fn partial_cmp(&self, right: &Json) -> Option { - 
Some(self.cmp(right)) + self.as_ref().partial_cmp(&right.as_ref()) } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs index f76b29790f9..867d8ec2c20 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs @@ -28,9 +28,9 @@ pub trait JsonEncoder: NumberEncoder { } // See `appendBinaryObject` in TiDB `types/json/binary.go` - fn write_json_obj_from_keys_values( + fn write_json_obj_from_keys_values<'a>( &mut self, - mut entries: Vec<(&[u8], JsonRef<'_>)>, + mut entries: Vec<(&[u8], JsonRef<'a>)>, ) -> Result<()> { entries.sort_by(|a, b| a.0.cmp(b.0)); // object: element-count size key-entry* value-entry* key* value* @@ -122,7 +122,7 @@ pub trait JsonEncoder: NumberEncoder { } // See `appendBinaryArray` in TiDB `types/json/binary.go` - fn write_json_ref_array(&mut self, data: &[JsonRef<'_>]) -> Result<()> { + fn write_json_ref_array<'a>(&mut self, data: &[JsonRef<'a>]) -> Result<()> { let element_count = data.len(); let value_entries_len = VALUE_ENTRY_LEN * element_count; let values_len = data.iter().fold(0, |acc, v| acc + v.encoded_len()); @@ -167,7 +167,7 @@ pub trait JsonEncoder: NumberEncoder { } // See `appendBinaryValElem` in TiDB `types/json/binary.go` - fn write_value_entry(&mut self, value_offset: &mut u32, v: &JsonRef<'_>) -> Result<()> { + fn write_value_entry<'a>(&mut self, value_offset: &mut u32, v: &JsonRef<'a>) -> Result<()> { let tp = v.get_type(); self.write_u8(tp as u8)?; match tp { diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs index 3cc78270d60..b359158d06b 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs @@ -41,7 +41,7 @@ impl<'a> JsonRef<'a> { } } let mut res = 
self.to_owned(); - for (expr, value) in path_expr_list.iter().zip(values) { + for (expr, value) in path_expr_list.iter().zip(values.into_iter()) { let modifier = BinaryModifier::new(res.as_ref()); res = match mt { ModifyType::Insert => modifier.insert(expr, value)?, diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 621d4384bcc..4c6c2f676d7 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -1094,7 +1094,7 @@ impl Time { ) } - fn try_into_chrono_datetime(self, ctx: &EvalContext) -> Result> { + fn try_into_chrono_datetime(self, ctx: &mut EvalContext) -> Result> { chrono_datetime( &ctx.cfg.tz, self.year(), @@ -1342,7 +1342,6 @@ impl Time { Ok((((ymd << 17) | hms) << 24) | u64::from(self.micro())) } - #[allow(deprecated)] pub fn from_duration( ctx: &mut EvalContext, duration: Duration, @@ -1416,7 +1415,6 @@ impl Time { .ok_or_else(|| Error::incorrect_datetime_value(self)) } - #[allow(deprecated)] pub fn normalized(self, ctx: &mut EvalContext) -> Result { if self.get_time_type() == TimeType::Timestamp { return Ok(self); @@ -1502,7 +1500,6 @@ impl Time { + self.day()) as i32 } - #[allow(deprecated)] pub fn weekday(self) -> Weekday { let date = if self.month() == 0 { NaiveDate::from_ymd(self.year() as i32 - 1, 12, 1) @@ -2673,9 +2670,9 @@ mod tests { #[test] fn test_no_zero_in_date() -> Result<()> { - let cases = ["2019-01-00", "2019-00-01"]; + let cases = vec!["2019-01-00", "2019-00-01"]; - for case in cases { + for &case in cases.iter() { // Enable NO_ZERO_IN_DATE only. If zero-date is encountered, a warning is // produced. let mut ctx = EvalContext::from(TimeEnv { @@ -2820,7 +2817,7 @@ mod tests { let actual = Time::from_duration(&mut ctx, duration, TimeType::DateTime)?; let today = actual - .try_into_chrono_datetime(&ctx)? + .try_into_chrono_datetime(&mut ctx)? 
.checked_sub_signed(chrono::Duration::nanoseconds(duration.to_nanos())) .unwrap(); @@ -2840,7 +2837,7 @@ mod tests { let mut ctx = EvalContext::default(); for i in 2..10 { let actual = Time::from_local_time(&mut ctx, TimeType::DateTime, i % MAX_FSP)?; - let c_datetime = actual.try_into_chrono_datetime(&ctx)?; + let c_datetime = actual.try_into_chrono_datetime(&mut ctx)?; let now0 = c_datetime.timestamp_millis() as u64; let now1 = Utc::now().timestamp_millis() as u64; diff --git a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs index 9dfc3ebf288..25b35a90fc0 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs @@ -120,7 +120,6 @@ impl TimeZone for Tz { } } - #[allow(deprecated)] fn from_local_date(&self, local: &NaiveDate) -> LocalResult> { match *self { Tz::Local(ref offset) => offset @@ -135,7 +134,6 @@ impl TimeZone for Tz { } } - #[allow(deprecated)] fn from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult> { match *self { Tz::Local(ref offset) => offset @@ -150,7 +148,6 @@ impl TimeZone for Tz { } } - #[allow(deprecated)] fn from_utc_date(&self, utc: &NaiveDate) -> Date { match *self { Tz::Local(ref offset) => { @@ -168,7 +165,6 @@ impl TimeZone for Tz { } } - #[allow(deprecated)] fn from_utc_datetime(&self, utc: &NaiveDateTime) -> DateTime { match *self { Tz::Local(ref offset) => { diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index aa5eb3fc56f..da117c96e2c 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -298,7 +298,7 @@ impl<'a, T: PrimInt> LeBytes<'a, T> { fn new(slice: &'a [u8]) -> Self { Self { slice, - _marker: PhantomData, + _marker: PhantomData::default(), } } diff --git 
a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 81ef4b072c6..37becbfb801 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -528,7 +528,7 @@ pub fn generate_index_data_for_test( let mut expect_row = HashMap::default(); let mut v: Vec<_> = indice .iter() - .map(|(cid, value)| { + .map(|&(ref cid, ref value)| { expect_row.insert( *cid, datum::encode_key(&mut EvalContext::default(), &[value.clone()]).unwrap(), diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index 5ebf8a031d3..3a5c53a4d09 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -611,8 +611,8 @@ impl IndexScanExecutorImpl { } #[inline] - fn build_operations<'a>( - &self, + fn build_operations<'a, 'b>( + &'b self, mut key_payload: &'a [u8], index_value: &'a [u8], ) -> Result<(DecodeHandleOp<'a>, DecodePartitionIdOp<'a>, RestoreData<'a>)> { diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 27e52dde288..7c410befb25 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -137,31 +137,31 @@ impl BatchExecutorsRunner<()> { .map_err(|e| other_err!("BatchProjectionExecutor: {}", e))?; } ExecType::TypeJoin => { - return Err(other_err!("Join executor not implemented")); + other_err!("Join executor not implemented"); } ExecType::TypeKill => { - return Err(other_err!("Kill executor not implemented")); + other_err!("Kill executor not implemented"); } ExecType::TypeExchangeSender => { - return Err(other_err!("ExchangeSender executor not implemented")); + other_err!("ExchangeSender executor not implemented"); } ExecType::TypeExchangeReceiver => { - return Err(other_err!("ExchangeReceiver executor not 
implemented")); + other_err!("ExchangeReceiver executor not implemented"); } ExecType::TypePartitionTableScan => { - return Err(other_err!("PartitionTableScan executor not implemented")); + other_err!("PartitionTableScan executor not implemented"); } ExecType::TypeSort => { - return Err(other_err!("Sort executor not implemented")); + other_err!("Sort executor not implemented"); } ExecType::TypeWindow => { - return Err(other_err!("Window executor not implemented")); + other_err!("Window executor not implemented"); } ExecType::TypeExpand => { - return Err(other_err!("Expand executor not implemented")); + other_err!("Expand executor not implemented"); } ExecType::TypeExpand2 => { - return Err(other_err!("Expand2 executor not implemented")); + other_err!("Expand2 executor not implemented"); } } } diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index ffcb22671da..bd65547109d 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ b/components/tidb_query_executors/src/selection_executor.rs @@ -537,7 +537,7 @@ mod tests { }) .collect(); - for predicates in [ + for predicates in vec![ // Swap predicates should produce same results. vec![predicate[0](), predicate[1]()], vec![predicate[1](), predicate[0]()], @@ -572,7 +572,7 @@ mod tests { }) .collect(); - for predicates in [ + for predicates in vec![ // Swap predicates should produce same results. 
vec![predicate[0](), predicate[1](), predicate[2]()], vec![predicate[1](), predicate[2](), predicate[0]()], diff --git a/components/tidb_query_executors/src/util/aggr_executor.rs b/components/tidb_query_executors/src/util/aggr_executor.rs index a5d760dc80d..0535e8dbd83 100644 --- a/components/tidb_query_executors/src/util/aggr_executor.rs +++ b/components/tidb_query_executors/src/util/aggr_executor.rs @@ -641,8 +641,8 @@ pub mod tests { )) as Box> }; - let test_paging_size = [2, 5, 7]; - let expect_call_num = [1, 3, 4]; + let test_paging_size = vec![2, 5, 7]; + let expect_call_num = vec![1, 3, 4]; let expect_row_num = vec![vec![4], vec![0, 0, 5], vec![0, 0, 0, 6]]; let executor_builders: Vec) -> _>> = vec![Box::new(exec_fast), Box::new(exec_slow)]; diff --git a/components/tidb_query_executors/src/util/mod.rs b/components/tidb_query_executors/src/util/mod.rs index db456a84883..ca05e49fcd3 100644 --- a/components/tidb_query_executors/src/util/mod.rs +++ b/components/tidb_query_executors/src/util/mod.rs @@ -28,13 +28,13 @@ pub fn ensure_columns_decoded( /// Evaluates expressions and outputs the result into the given Vec. Lifetime of /// the expressions are erased. 
-pub unsafe fn eval_exprs_decoded_no_lifetime( +pub unsafe fn eval_exprs_decoded_no_lifetime<'a>( ctx: &mut EvalContext, exprs: &[RpnExpression], schema: &[FieldType], input_physical_columns: &LazyBatchColumnVec, input_logical_rows: &[usize], - output: &mut Vec>, + output: &mut Vec>, ) -> Result<()> { unsafe fn erase_lifetime<'a, T: ?Sized>(v: &T) -> &'a T { &*(v as *const T) diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index b6619f9d8cc..76e90f79c5b 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -6528,7 +6528,7 @@ mod tests { "cast_decimal_as_duration", ); - let values = [ + let values = vec![ Decimal::from_bytes(b"9995959").unwrap().unwrap(), Decimal::from_bytes(b"-9995959").unwrap().unwrap(), ]; diff --git a/components/tidb_query_expr/src/impl_miscellaneous.rs b/components/tidb_query_expr/src/impl_miscellaneous.rs index 663571804ae..5d2daed7f9a 100644 --- a/components/tidb_query_expr/src/impl_miscellaneous.rs +++ b/components/tidb_query_expr/src/impl_miscellaneous.rs @@ -58,7 +58,7 @@ pub fn inet_aton(addr: BytesRef) -> Result> { } let (mut byte_result, mut result, mut dot_count): (u64, u64, usize) = (0, 0, 0); for c in addr.chars() { - if c.is_ascii_digit() { + if ('0'..='9').contains(&c) { let digit = c as u64 - '0' as u64; byte_result = byte_result * 10 + digit; if byte_result > 255 { @@ -501,9 +501,8 @@ mod tests { (Some(hex("00000000")), Some(b"0.0.0.0".to_vec())), (Some(hex("0A000509")), Some(b"10.0.5.9".to_vec())), ( - // the output format has changed, see: https://github.com/rust-lang/rust/pull/112606 Some(hex("00000000000000000000000001020304")), - Some(b"::102:304".to_vec()), + Some(b"::1.2.3.4".to_vec()), ), ( Some(hex("00000000000000000000FFFF01020304")), diff --git a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index 45754d0a101..f3b9b03c287 100644 --- 
a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -63,13 +63,13 @@ pub fn oct_string(s: BytesRef, writer: BytesWriter) -> Result { if let Some(&c) = trimmed.next() { if c == b'-' { negative = true; - } else if c.is_ascii_digit() { + } else if (b'0'..=b'9').contains(&c) { r = Some(u64::from(c) - u64::from(b'0')); } else if c != b'+' { return Ok(writer.write(Some(b"0".to_vec()))); } - for c in trimmed.take_while(|&c| c.is_ascii_digit()) { + for c in trimmed.take_while(|&c| (b'0'..=b'9').contains(c)) { r = r .and_then(|r| r.checked_mul(10)) .and_then(|r| r.checked_add(u64::from(*c - b'0'))); @@ -879,7 +879,7 @@ impl TrimDirection { } #[inline] -fn trim<'a>(string: &'a [u8], pattern: &[u8], direction: TrimDirection) -> &'a [u8] { +fn trim<'a, 'b>(string: &'a [u8], pattern: &'b [u8], direction: TrimDirection) -> &'a [u8] { if pattern.is_empty() { return string; } diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 40c1f485e54..c2ef6722148 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -10,8 +10,6 @@ #![allow(elided_lifetimes_in_paths)] // Necessary until rpn_fn accepts functions annotated with lifetimes. 
#![allow(incomplete_features)] -#![allow(clippy::needless_raw_string_hashes)] -#![allow(clippy::needless_return_with_question_mark)] #![feature(proc_macro_hygiene)] #![feature(specialization)] #![feature(test)] diff --git a/components/tidb_query_expr/src/types/expr_eval.rs b/components/tidb_query_expr/src/types/expr_eval.rs index e3ab7d35297..b892333b0ef 100644 --- a/components/tidb_query_expr/src/types/expr_eval.rs +++ b/components/tidb_query_expr/src/types/expr_eval.rs @@ -1091,13 +1091,16 @@ mod tests { use tipb::{Expr, ScalarFuncSig}; #[allow(clippy::trivially_copy_pass_by_ref)] - #[rpn_fn(capture = [metadata], metadata_mapper = prepare_a)] - fn fn_a_nonnull(metadata: &i64, v: &Int) -> Result> { + #[rpn_fn(capture = [metadata], metadata_mapper = prepare_a::)] + fn fn_a_nonnull( + metadata: &i64, + v: &Int, + ) -> Result> { assert_eq!(*metadata, 42); Ok(Some(v + *metadata)) } - fn prepare_a(_expr: &mut Expr) -> Result { + fn prepare_a(_expr: &mut Expr) -> Result { Ok(42) } @@ -1133,7 +1136,7 @@ mod tests { // fn_b: CastIntAsReal // fn_c: CastIntAsString Ok(match expr.get_sig() { - ScalarFuncSig::CastIntAsInt => fn_a_nonnull_fn_meta(), + ScalarFuncSig::CastIntAsInt => fn_a_nonnull_fn_meta::(), ScalarFuncSig::CastIntAsReal => fn_b_fn_meta::(), ScalarFuncSig::CastIntAsString => fn_c_fn_meta::(), _ => unreachable!(), diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 858edfffec2..576aa5cfa76 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -605,7 +605,7 @@ mod tests { (b"a9".to_vec(), b"v9".to_vec()), ]; - for (k, v) in &base_data { + for &(ref k, ref v) in &base_data { engine.put(&data_key(k), v).unwrap(); } (r, base_data) diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 43e5f1bea05..25f58352750 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -9,7 +9,6 @@ #![feature(min_specialization)] #![feature(type_alias_impl_trait)] 
#![feature(associated_type_defaults)] -#![feature(impl_trait_in_assoc_type)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/tikv_util/src/logger/formatter.rs b/components/tikv_util/src/logger/formatter.rs index b786d2aa681..c53c5896519 100644 --- a/components/tikv_util/src/logger/formatter.rs +++ b/components/tikv_util/src/logger/formatter.rs @@ -11,9 +11,9 @@ where let mut start = 0; let bytes = file_name.as_bytes(); for (index, &b) in bytes.iter().enumerate() { - if b.is_ascii_uppercase() - || b.is_ascii_lowercase() - || b.is_ascii_digit() + if (b'A'..=b'Z').contains(&b) + || (b'a'..=b'z').contains(&b) + || (b'0'..=b'9').contains(&b) || b == b'.' || b == b'-' || b == b'_' diff --git a/components/tikv_util/src/lru.rs b/components/tikv_util/src/lru.rs index a2d0943df90..76fad6e8a34 100644 --- a/components/tikv_util/src/lru.rs +++ b/components/tikv_util/src/lru.rs @@ -247,7 +247,7 @@ where HashMapEntry::Occupied(mut e) => { self.size_policy.on_remove(e.key(), &e.get().value); self.size_policy.on_insert(e.key(), &value); - let entry = e.get_mut(); + let mut entry = e.get_mut(); self.trace.promote(entry.record); entry.value = value; } diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index a2897809683..291254c5227 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -33,7 +33,7 @@ pub trait HeapSize { impl HeapSize for [u8] { fn heap_size(&self) -> usize { - mem::size_of_val(self) + self.len() * mem::size_of::() } } diff --git a/components/tikv_util/src/metrics/allocator_metrics.rs b/components/tikv_util/src/metrics/allocator_metrics.rs index af22e411767..260aa88ac8e 100644 --- a/components/tikv_util/src/metrics/allocator_metrics.rs +++ b/components/tikv_util/src/metrics/allocator_metrics.rs @@ -64,7 +64,7 @@ impl Collector for AllocStatsCollector { .set(dealloc as _); }); let mut g = self.memory_stats.collect(); - g.extend(self.allocation.collect()); + 
g.extend(self.allocation.collect().into_iter()); g } } diff --git a/components/tikv_util/src/mpsc/future.rs b/components/tikv_util/src/mpsc/future.rs index 354ef74adb0..4492e33a933 100644 --- a/components/tikv_util/src/mpsc/future.rs +++ b/components/tikv_util/src/mpsc/future.rs @@ -302,8 +302,6 @@ mod tests { use super::*; - // the JoinHandler is useless here, so just ignore this warning. - #[allow(clippy::let_underscore_future)] fn spawn_and_wait( rx_builder: impl FnOnce() -> S, ) -> (Runtime, Arc) { diff --git a/components/tikv_util/src/sys/cpu_time.rs b/components/tikv_util/src/sys/cpu_time.rs index 61608d1518f..6ec1621c629 100644 --- a/components/tikv_util/src/sys/cpu_time.rs +++ b/components/tikv_util/src/sys/cpu_time.rs @@ -333,7 +333,7 @@ mod tests { for _ in 0..num * 10 { std::thread::spawn(move || { loop { - let _ = (0..10_000_000).sum::(); + let _ = (0..10_000_000).into_iter().sum::(); } }); } diff --git a/components/tikv_util/src/timer.rs b/components/tikv_util/src/timer.rs index a7a2b421ab0..bb555e11794 100644 --- a/components/tikv_util/src/timer.rs +++ b/components/tikv_util/src/timer.rs @@ -81,14 +81,14 @@ impl Eq for TimeoutTask {} impl PartialOrd for TimeoutTask { fn partial_cmp(&self, other: &TimeoutTask) -> Option { - Some(self.cmp(other)) + self.next_tick.partial_cmp(&other.next_tick) } } impl Ord for TimeoutTask { fn cmp(&self, other: &TimeoutTask) -> Ordering { // TimeoutTask.next_tick must have same type of instants. - self.next_tick.partial_cmp(&other.next_tick).unwrap() + self.partial_cmp(other).unwrap() } } diff --git a/components/txn_types/src/timestamp.rs b/components/txn_types/src/timestamp.rs index 79727575d60..fb0cd900123 100644 --- a/components/txn_types/src/timestamp.rs +++ b/components/txn_types/src/timestamp.rs @@ -118,10 +118,9 @@ impl slog::Value for TimeStamp { const TS_SET_USE_VEC_LIMIT: usize = 8; /// A hybrid immutable set for timestamps. 
-#[derive(Debug, Default, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq)] pub enum TsSet { /// When the set is empty, avoid the useless cloning of Arc. - #[default] Empty, /// `Vec` is suitable when the set is small or the set is barely used, and /// it doesn't worth converting a `Vec` into a `HashSet`. @@ -131,6 +130,13 @@ pub enum TsSet { Set(Arc>), } +impl Default for TsSet { + #[inline] + fn default() -> TsSet { + TsSet::Empty + } +} + impl TsSet { /// Create a `TsSet` from the given vec of timestamps. It will select the /// proper internal collection type according to the size. diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 5305e3ec69a..624ac81212d 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -451,7 +451,7 @@ impl From for Mutation { /// `OldValue` is used by cdc to read the previous value associated with some /// key during the prewrite process. -#[derive(Debug, Default, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq)] pub enum OldValue { /// A real `OldValue`. Value { value: Value }, @@ -460,13 +460,18 @@ pub enum OldValue { /// `None` means we don't found a previous value. None, /// The user doesn't care about the previous value. - #[default] Unspecified, /// Not sure whether the old value exists or not. users can seek CF_WRITE to /// the give position to take a look. SeekWrite(Key), } +impl Default for OldValue { + fn default() -> Self { + OldValue::Unspecified + } +} + impl OldValue { pub fn value(value: Value) -> Self { OldValue::Value { value } @@ -585,9 +590,8 @@ impl WriteBatchFlags { /// The position info of the last actual write (PUT or DELETE) of a LOCK record. /// Note that if the last change is a DELETE, its LastChange can be either /// Exist(which points to it) or NotExist. 
-#[derive(Clone, Default, Eq, PartialEq, Debug)] +#[derive(Clone, Eq, PartialEq, Debug)] pub enum LastChange { - #[default] Unknown, /// The pointer may point to a PUT or a DELETE record. Exist { @@ -643,6 +647,12 @@ impl LastChange { } } +impl Default for LastChange { + fn default() -> Self { + LastChange::Unknown + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust-toolchain b/rust-toolchain index c1eb62e26cb..4e5f9a4d82b 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2023-08-15 +nightly-2022-11-15 diff --git a/src/config/mod.rs b/src/config/mod.rs index 63e36a543dc..8318556483e 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1482,7 +1482,7 @@ impl DbConfig { opts.set_paranoid_checks(b); } if for_engine == EngineType::RaftKv { - opts.set_info_log(RocksdbLogger); + opts.set_info_log(RocksdbLogger::default()); } opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { @@ -1858,7 +1858,7 @@ impl RaftDbConfig { opts.set_max_log_file_size(self.info_log_max_size.0); opts.set_log_file_time_to_roll(self.info_log_roll_time.as_secs()); opts.set_keep_log_file_num(self.info_log_keep_log_file_num); - opts.set_info_log(RaftDbLogger); + opts.set_info_log(RaftDbLogger::default()); opts.set_info_log_level(self.info_log_level.into()); opts.set_max_subcompactions(self.max_sub_compactions); opts.set_writable_file_max_buffer_size(self.writable_file_max_buffer_size.0 as i32); @@ -2015,7 +2015,7 @@ impl ConfigManager for DbConfigManger { self.cfg.update(change.clone())?; let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); - let cf_config = change.extract_if(|(name, _)| name.ends_with("cf")); + let cf_config = change.drain_filter(|(name, _)| name.ends_with("cf")); for (cf_name, cf_change) in cf_config { if let ConfigValue::Module(mut cf_change) = cf_change { // defaultcf -> default @@ -2049,7 +2049,7 @@ impl ConfigManager for DbConfigManger { } if let 
Some(rate_bytes_config) = change - .extract_if(|(name, _)| name == "rate_bytes_per_sec") + .drain_filter(|(name, _)| name == "rate_bytes_per_sec") .next() { let rate_bytes_per_sec: ReadableSize = rate_bytes_config.1.into(); @@ -2058,7 +2058,7 @@ impl ConfigManager for DbConfigManger { } if let Some(rate_bytes_config) = change - .extract_if(|(name, _)| name == "rate_limiter_auto_tuned") + .drain_filter(|(name, _)| name == "rate_limiter_auto_tuned") .next() { let rate_limiter_auto_tuned: bool = rate_bytes_config.1.into(); @@ -2067,7 +2067,7 @@ impl ConfigManager for DbConfigManger { } if let Some(size) = change - .extract_if(|(name, _)| name == "write_buffer_limit") + .drain_filter(|(name, _)| name == "write_buffer_limit") .next() { let size: ReadableSize = size.1.into(); @@ -2075,14 +2075,14 @@ impl ConfigManager for DbConfigManger { } if let Some(f) = change - .extract_if(|(name, _)| name == "write_buffer_flush_oldest_first") + .drain_filter(|(name, _)| name == "write_buffer_flush_oldest_first") .next() { self.db.set_flush_oldest_first(f.1.into())?; } if let Some(background_jobs_config) = change - .extract_if(|(name, _)| name == "max_background_jobs") + .drain_filter(|(name, _)| name == "max_background_jobs") .next() { let max_background_jobs: i32 = background_jobs_config.1.into(); @@ -2090,7 +2090,7 @@ impl ConfigManager for DbConfigManger { } if let Some(background_subcompactions_config) = change - .extract_if(|(name, _)| name == "max_sub_compactions") + .drain_filter(|(name, _)| name == "max_sub_compactions") .next() { let max_subcompactions: u32 = background_subcompactions_config.1.into(); @@ -2099,7 +2099,7 @@ impl ConfigManager for DbConfigManger { } if let Some(background_flushes_config) = change - .extract_if(|(name, _)| name == "max_background_flushes") + .drain_filter(|(name, _)| name == "max_background_flushes") .next() { let max_background_flushes: i32 = background_flushes_config.1.into(); diff --git a/src/coprocessor/metrics.rs 
b/src/coprocessor/metrics.rs index 7d2d7e9e947..02f45d35311 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -285,7 +285,7 @@ pub fn tls_collect_scan_details(cmd: ReqTag, stats: &Statistics) { m.borrow_mut() .local_scan_details .entry(cmd) - .or_default() + .or_insert_with(Default::default) .add(stats); }); } diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 874917130e4..fcd16f9b947 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -64,13 +64,11 @@ type HandlerStreamStepResult = Result<(Option, bool)>; #[async_trait] pub trait RequestHandler: Send { /// Processes current request and produces a response. - #[allow(clippy::diverging_sub_expression)] async fn handle_request(&mut self) -> Result> { panic!("unary request is not supported for this handler"); } /// Processes current request and produces streaming responses. - #[allow(clippy::diverging_sub_expression)] async fn handle_streaming_request(&mut self) -> HandlerStreamStepResult { panic!("streaming request is not supported for this handler"); } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 1a670c917ca..6d40ffe959c 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -66,9 +66,9 @@ const REQUEST_WRITE_CONCURRENCY: usize = 16; /// bytes. In detail, they are: /// - 2 bytes for the request type (Tag+Value). /// - 2 bytes for every string or bytes field (Tag+Length), they are: -/// . + the key field -/// . + the value field -/// . + the CF field (None for CF_DEFAULT) +/// . + the key field +/// . + the value field +/// . + the CF field (None for CF_DEFAULT) /// - 2 bytes for the embedded message field `PutRequest` (Tag+Length). /// - 2 bytes for the request itself (which would be embedded into a /// [`RaftCmdRequest`].) 
diff --git a/src/lib.rs b/src/lib.rs index aafb099c6cc..b3e9ebaf8e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,14 +23,13 @@ #![feature(proc_macro_hygiene)] #![feature(min_specialization)] #![feature(box_patterns)] -#![feature(extract_if)] +#![feature(drain_filter)] #![feature(deadline_api)] #![feature(let_chains)] #![feature(read_buf)] #![feature(type_alias_impl_trait)] #![allow(incomplete_features)] #![feature(return_position_impl_trait_in_trait)] -#![feature(impl_trait_in_assoc_type)] #[macro_use(fail_point)] extern crate fail; diff --git a/src/server/debug2.rs b/src/server/debug2.rs index 7060b20bdb2..4230828dff1 100644 --- a/src/server/debug2.rs +++ b/src/server/debug2.rs @@ -1113,7 +1113,7 @@ fn get_tablet_cache( "tablet load failed, region_state {:?}", region_state.get_state() ); - Err(box_err!(e)) + return Err(box_err!(e)); } } } diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index fe5a252b8db..665824a1bac 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -826,7 +826,6 @@ pub mod test_utils { use crate::storage::kv::RocksEngine as StorageRocksEngine; /// Do a global GC with the given safe point. - #[allow(clippy::needless_pass_by_ref_mut)] pub fn gc_by_compact(engine: &mut StorageRocksEngine, _: &[u8], safe_point: u64) { let engine = engine.get_rocksdb(); // Put a new key-value pair to ensure compaction can be triggered correctly. diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index d2dc6532200..be18f8216d5 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -546,9 +546,7 @@ impl GcMan ) -> GcManagerResult> { // Get the information of the next region to do GC. 
let (region, next_key) = self.get_next_gc_context(from_key); - let Some(region) = region else { - return Ok(None); - }; + let Some(region) = region else { return Ok(None) }; let hex_start = format!("{:?}", log_wrappers::Value::key(region.get_start_key())); let hex_end = format!("{:?}", log_wrappers::Value::key(region.get_end_key())); @@ -809,7 +807,7 @@ mod tests { // Following code asserts gc_tasks == expected_gc_tasks. assert_eq!(gc_tasks.len(), expected_gc_tasks.len()); - let all_passed = gc_tasks.into_iter().zip(expected_gc_tasks).all( + let all_passed = gc_tasks.into_iter().zip(expected_gc_tasks.into_iter()).all( |((region, safe_point), (expect_region, expect_safe_point))| { region == expect_region && safe_point == expect_safe_point.into() }, @@ -886,7 +884,7 @@ mod tests { #[test] fn test_auto_gc_rewinding() { - for regions in [ + for regions in vec![ // First region starts with empty and last region ends with empty. vec![ (b"".to_vec(), b"1".to_vec(), 1), diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index de40975632f..c608470ba87 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -254,7 +254,7 @@ fn get_keys_in_region(keys: &mut Peekable>, region: &Region) -> Ve let mut keys_in_region = Vec::new(); loop { - let Some(key) = keys.peek() else { break }; + let Some(key) = keys.peek() else {break}; let key = key.as_encoded().as_slice(); if key < region.get_start_key() { @@ -552,7 +552,7 @@ impl GcRunner { let mut keys = keys.into_iter().peekable(); for region in regions { let mut raw_modifies = MvccRaw::new(); - let snapshot = self.get_snapshot(self.store_id, ®ion)?; + let mut snapshot = self.get_snapshot(self.store_id, ®ion)?; let mut keys_in_region = get_keys_in_region(&mut keys, ®ion).into_iter(); let mut next_gc_key = keys_in_region.next(); @@ -563,7 +563,7 @@ impl GcRunner { &range_start_key, &range_end_key, &mut raw_modifies, - &snapshot, + &mut snapshot, &mut gc_info, ) { 
GC_KEY_FAILURES.inc(); @@ -615,7 +615,7 @@ impl GcRunner { range_start_key: &Key, range_end_key: &Key, raw_modifies: &mut MvccRaw, - kv_snapshot: &::Snap, + kv_snapshot: &mut ::Snap, gc_info: &mut GcInfo, ) -> Result<()> { let start_key = key.clone().append_ts(safe_point.prev()); @@ -669,7 +669,10 @@ impl GcRunner { } pub fn mut_stats(&mut self, key_mode: GcKeyMode) -> &mut Statistics { - let stats = self.stats_map.entry(key_mode).or_default(); + let stats = self + .stats_map + .entry(key_mode) + .or_insert_with(Default::default); stats } @@ -2266,6 +2269,7 @@ mod tests { fn generate_keys(start: u64, end: u64) -> Vec { (start..end) + .into_iter() .map(|i| { let key = format!("k{:02}", i); Key::from_raw(key.as_bytes()) diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 938dfaff8a6..9583df80dd6 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -361,15 +361,20 @@ impl DetectTable { } /// The role of the detector. -#[derive(Debug, Default, PartialEq, Clone, Copy)] +#[derive(Debug, PartialEq, Clone, Copy)] pub enum Role { /// The node is the leader of the detector. Leader, /// The node is a follower of the leader. 
- #[default] Follower, } +impl Default for Role { + fn default() -> Role { + Role::Follower + } +} + impl From for Role { fn from(role: StateRole) -> Role { match role { diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 58287c2bb83..82563666f04 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -306,7 +306,6 @@ struct WriteResFeed { unsafe impl Send for WriteResFeed {} impl WriteResFeed { - #[allow(clippy::arc_with_non_send_sync)] fn pair() -> (Self, WriteResSub) { let core = Arc::new(WriteResCore { ev: AtomicU8::new(0), @@ -582,9 +581,7 @@ where tx.notify(res); } rx.inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { - return; - }; + let WriteEvent::Finished(res) = ev else { return }; match res { Ok(()) => { ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 9785e821312..5183ecd6567 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -315,9 +315,7 @@ impl tikv_kv::Engine for RaftKv2 { early_err: res.err(), }) .inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { - return; - }; + let WriteEvent::Finished(res) = ev else { return }; match res { Ok(()) => { ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index 73a15983bd0..d9b17c5d35c 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -269,9 +269,7 @@ where /// Stops the Node. 
pub fn stop(&mut self) { let store_id = self.store.get_id(); - let Some((_, mut system)) = self.system.take() else { - return; - }; + let Some((_, mut system)) = self.system.take() else { return }; info!(self.logger, "stop raft store thread"; "store_id" => store_id); system.shutdown(); } diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index 497d8240684..d0b715542d5 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -300,6 +300,7 @@ where let debugger = self.debugger.clone(); let res = self.pool.spawn(async move { + let req = req; debugger .compact( req.get_db(), diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index 413e36a6645..8e77d65233e 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -612,7 +612,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = [ + let expected = vec![ "2019/08/23 18:09:56.387 +08:00", "2019/08/23 18:09:56.387 +08:00", // for invalid line "2019/08/23 18:09:57.387 +08:00", @@ -639,7 +639,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = [ + let expected = vec![ "2019/08/23 18:09:56.387 +08:00", "2019/08/23 18:09:56.387 +08:00", // for invalid line "2019/08/23 18:09:57.387 +08:00", @@ -662,7 +662,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = ["2019/08/23 18:09:53.387 +08:00"] + let expected = vec!["2019/08/23 18:09:53.387 +08:00"] .iter() .map(|s| timestamp(s)) .collect::>(); @@ -671,7 +671,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# expected ); - for time in [0, i64::MAX].into_iter() { + for time in vec![0, i64::MAX].into_iter() { let log_iter = LogIterator::new( &log_file, timestamp("2019/08/23 18:09:53.387 +08:00"), @@ -680,7 +680,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![], ) .unwrap(); - let expected = [ + let expected = vec![ 
"2019/08/23 18:09:58.387 +08:00", "2019/08/23 18:09:59.387 +08:00", "2019/08/23 18:10:06.387 +08:00", @@ -704,7 +704,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# vec![regex::Regex::new(".*test-filter.*").unwrap()], ) .unwrap(); - let expected = [ + let expected = vec![ "2019/08/23 18:09:58.387 +08:00", "2019/08/23 18:10:06.387 +08:00", // for invalid line ] @@ -783,7 +783,7 @@ Some invalid logs 2: Welcome to TiKV - test-filter"# req.set_end_time(i64::MAX); req.set_levels(vec![LogLevel::Warn as _]); req.set_patterns(vec![".*test-filter.*".to_string()].into()); - let expected = [ + let expected = vec![ "2019/08/23 18:09:58.387 +08:00", "2019/08/23 18:11:58.387 +08:00", "2019/08/23 18:11:59.387 +08:00", // for invalid line @@ -796,7 +796,9 @@ Some invalid logs 2: Welcome to TiKV - test-filter"# s.collect::>() .await .into_iter() - .flat_map(|mut resp| resp.take_messages().into_iter()) + .map(|mut resp| resp.take_messages().into_iter()) + .into_iter() + .flatten() .map(|msg| msg.get_time()) .collect::>() }); diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 12494e9e7c4..8a84eaf6293 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -601,7 +601,7 @@ mod tests { ] ); // memory - for name in ["virtual", "swap"].into_iter() { + for name in vec!["virtual", "swap"].into_iter() { let item = collector .iter() .find(|x| x.get_tp() == "memory" && x.get_name() == name); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 4a961eedf19..77f92d33d95 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -889,6 +889,7 @@ impl Tikv for Service { forward_duplex!(self.proxy, batch_commands, ctx, stream, sink); let (tx, rx) = unbounded(WakePolicy::TillReach(GRPC_MSG_NOTIFY_SIZE)); + let ctx = Arc::new(ctx); let peer = ctx.peer(); let storage = self.storage.clone(); let copr = self.copr.clone(); diff --git 
a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 1eba8cd81b7..32c99867a3f 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -387,9 +387,9 @@ mod tests { let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, - StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( - box MvccErrorInner::WriteConflict { .. }, - ))))) + StorageError(box StorageErrorInner::Txn(TxnError( + box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::WriteConflict { .. })) + ))) )); // The tx should be dropped. rx.recv().unwrap_err(); @@ -422,9 +422,9 @@ mod tests { let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, - StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( - box MvccErrorInner::KeyIsLocked(_), - ))))) + StorageError(box StorageErrorInner::Txn(TxnError( + box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::KeyIsLocked(_))) + ))) )); // Since the cancellation callback can fully execute only when it's successfully // removed from the lock waiting queues, it's impossible that `finish_request` diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 68e0118610a..a81248fe9e2 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -110,7 +110,12 @@ impl Eq for LockWaitEntry {} impl PartialOrd for LockWaitEntry { fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + // Reverse it since the priority queue is a max heap and we want to pop the + // minimal. 
+ other + .parameters + .start_ts + .partial_cmp(&self.parameters.start_ts) } } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index d3b3e89a3f8..e9477b56b0f 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -63,7 +63,7 @@ pub fn tls_collect_scan_details(cmd: CommandKind, stats: &Statistics) { m.borrow_mut() .local_scan_details .entry(cmd) - .or_default() + .or_insert_with(Default::default) .add(stats); }); } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index b8224df696b..cb4057bfd7e 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1946,7 +1946,7 @@ impl Storage { key_ranges.push(build_key_range(k.as_encoded(), k.as_encoded(), false)); (k, v) }) - .filter(|(_, v)| !(v.is_ok() && v.as_ref().unwrap().is_none())) + .filter(|&(_, ref v)| !(v.is_ok() && v.as_ref().unwrap().is_none())) .map(|(k, v)| match v { Ok(v) => { let (user_key, _) = F::decode_raw_key_owned(k, false).unwrap(); @@ -3892,9 +3892,9 @@ mod tests { let result = block_on(storage.get(Context::default(), Key::from_raw(b"x"), 100.into())); assert!(matches!( result, - Err(Error(box ErrorInner::Txn(txn::Error(box txn::ErrorInner::Mvcc(mvcc::Error( - box mvcc::ErrorInner::KeyIsLocked { .. }, - )))))) + Err(Error(box ErrorInner::Txn(txn::Error( + box txn::ErrorInner::Mvcc(mvcc::Error(box mvcc::ErrorInner::KeyIsLocked { .. 
})) + )))) )); } @@ -5744,7 +5744,7 @@ mod tests { ]; // Write key-value pairs one by one - for (key, value) in &test_data { + for &(ref key, ref value) in &test_data { storage .raw_put( ctx.clone(), @@ -5803,7 +5803,7 @@ mod tests { let mut total_bytes: u64 = 0; let mut is_first = true; // Write key-value pairs one by one - for (key, value) in &test_data { + for &(ref key, ref value) in &test_data { storage .raw_put( ctx.clone(), @@ -6116,7 +6116,7 @@ mod tests { #[test] fn test_raw_batch_put() { - for for_cas in [false, true].into_iter() { + for for_cas in vec![false, true].into_iter() { test_kv_format_impl!(test_raw_batch_put_impl(for_cas)); } } @@ -6245,7 +6245,7 @@ mod tests { ]; // Write key-value pairs one by one - for (key, value) in &test_data { + for &(ref key, ref value) in &test_data { storage .raw_put( ctx.clone(), @@ -6260,7 +6260,7 @@ mod tests { } // Verify pairs in a batch - let keys = test_data.iter().map(|(k, _)| k.clone()).collect(); + let keys = test_data.iter().map(|&(ref k, _)| k.clone()).collect(); let results = test_data.into_iter().map(|(k, v)| Some((k, v))).collect(); expect_multi_values( results, @@ -6292,7 +6292,7 @@ mod tests { ]; // Write key-value pairs one by one - for (key, value) in &test_data { + for &(ref key, ref value) in &test_data { storage .raw_put( ctx.clone(), @@ -6310,7 +6310,7 @@ mod tests { let mut ids = vec![]; let cmds = test_data .iter() - .map(|(k, _)| { + .map(|&(ref k, _)| { let mut req = RawGetRequest::default(); req.set_context(ctx.clone()); req.set_key(k.clone()); @@ -6331,7 +6331,7 @@ mod tests { #[test] fn test_raw_batch_delete() { - for for_cas in [false, true].into_iter() { + for for_cas in vec![false, true].into_iter() { test_kv_format_impl!(test_raw_batch_delete_impl(for_cas)); } } @@ -6381,10 +6381,10 @@ mod tests { rx.recv().unwrap(); // Verify pairs exist - let keys = test_data.iter().map(|(k, _)| k.clone()).collect(); + let keys = test_data.iter().map(|&(ref k, _)| k.clone()).collect(); let results = 
test_data .iter() - .map(|(k, v)| Some((k.clone(), v.clone()))) + .map(|&(ref k, ref v)| Some((k.clone(), v.clone()))) .collect(); expect_multi_values( results, @@ -6512,7 +6512,7 @@ mod tests { // Scan pairs with key only let mut results: Vec> = test_data .iter() - .map(|(k, _)| Some((k.clone(), vec![]))) + .map(|&(ref k, _)| Some((k.clone(), vec![]))) .collect(); expect_multi_values( results.clone(), @@ -6909,7 +6909,7 @@ mod tests { rx.recv().unwrap(); // Verify pairs exist - let keys = test_data.iter().map(|(k, _)| k.clone()).collect(); + let keys = test_data.iter().map(|&(ref k, _)| k.clone()).collect(); let results = test_data.into_iter().map(|(k, v)| Some((k, v))).collect(); expect_multi_values( results, diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 474c789a31d..cc4403229c1 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -1287,7 +1287,7 @@ mod tests { let k = b"k"; // Write enough LOCK recrods - for start_ts in (1..30).step_by(2) { + for start_ts in (1..30).into_iter().step_by(2) { must_prewrite_lock(&mut engine, k, k, start_ts); must_commit(&mut engine, k, start_ts, start_ts + 1); } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 61a366c12ee..48158eda946 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -418,10 +418,11 @@ impl MvccReader { estimated_versions_to_last_change, } if estimated_versions_to_last_change >= SEEK_BOUND => { let key_with_ts = key.clone().append_ts(commit_ts); - let Some(value) = self.snapshot.get_cf(CF_WRITE, &key_with_ts)? - else { - return Ok(None); - }; + let Some(value) = self + .snapshot + .get_cf(CF_WRITE, &key_with_ts)? 
else { + return Ok(None); + }; self.statistics.write.get += 1; let write = WriteRef::parse(&value)?.to_owned(); assert!( @@ -2420,7 +2421,7 @@ pub mod tests { engine.commit(k, 1, 2); // Write enough LOCK recrods - for start_ts in (6..30).step_by(2) { + for start_ts in (6..30).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2429,7 +2430,7 @@ pub mod tests { engine.commit(k, 45, 46); // Write enough LOCK recrods - for start_ts in (50..80).step_by(2) { + for start_ts in (50..80).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2484,7 +2485,7 @@ pub mod tests { let k = b"k"; // Write enough LOCK recrods - for start_ts in (6..30).step_by(2) { + for start_ts in (6..30).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2521,7 +2522,7 @@ pub mod tests { engine.put(k, 1, 2); // 10 locks were put - for start_ts in (6..30).step_by(2) { + for start_ts in (6..30).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } @@ -2548,7 +2549,7 @@ pub mod tests { feature_gate.set_version("6.1.0").unwrap(); set_tls_feature_gate(feature_gate); engine.delete(k, 51, 52); - for start_ts in (56..80).step_by(2) { + for start_ts in (56..80).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } let feature_gate = FeatureGate::default(); @@ -2580,7 +2581,7 @@ pub mod tests { let k = b"k"; engine.put(k, 1, 2); - for start_ts in (6..30).step_by(2) { + for start_ts in (6..30).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } engine.rollback(k, 30); diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 2b0a8e13582..3437a1e5432 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -633,7 +633,7 @@ impl ScanPolicy for LatestEntryPolicy { fn scan_latest_handle_lock( current_user_key: Key, - cfg: &ScannerConfig, + cfg: &mut ScannerConfig, cursors: &mut Cursors, statistics: &mut Statistics, ) 
-> Result> { @@ -1636,7 +1636,7 @@ mod latest_kv_tests { must_prewrite_put(&mut engine, b"k4", b"v41", b"k4", 3); must_commit(&mut engine, b"k4", 3, 7); - for start_ts in (10..30).step_by(2) { + for start_ts in (10..30).into_iter().step_by(2) { must_prewrite_lock(&mut engine, b"k1", b"k1", start_ts); must_commit(&mut engine, b"k1", start_ts, start_ts + 1); must_prewrite_lock(&mut engine, b"k3", b"k1", start_ts); diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index aa635827961..8c4ad5da08b 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -290,7 +290,7 @@ mod tests { RawEncodeSnapshot::from_snapshot(raw_mvcc_snapshot); // get_cf - for (key, value, _) in &test_data[6..12] { + for &(ref key, ref value, _) in &test_data[6..12] { let res = encode_snapshot.get_cf(CF_DEFAULT, &ApiV2::encode_raw_key(key, None)); assert_eq!(res.unwrap(), Some(value.to_owned())); } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 713155f9160..64e22a13585 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -766,6 +766,7 @@ fn async_commit_timestamps( #[cfg(not(feature = "failpoints"))] let injected_fallback = false; + let max_commit_ts = max_commit_ts; if (!max_commit_ts.is_zero() && min_commit_ts > max_commit_ts) || injected_fallback { warn!("commit_ts is too large, fallback to normal 2PC"; "key" => log_wrappers::Value::key(key.as_encoded()), @@ -1874,6 +1875,7 @@ pub mod tests { // At most 12 ops per-case. 
let ops_count = rg.gen::() % 12; let ops = (0..ops_count) + .into_iter() .enumerate() .map(|(i, _)| { if i == 0 { diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 61dbdac6565..9a54895e7e2 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -88,8 +88,8 @@ mod tests { fn test_atomic_process_write_impl() { let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); - let raw_keys = [b"ra", b"rz"]; - let raw_values = [b"valuea", b"valuez"]; + let raw_keys = vec![b"ra", b"rz"]; + let raw_values = vec![b"valuea", b"valuez"]; let ts_provider = super::super::test_util::gen_ts_provider(F::TAG); let mut modifies = vec![]; diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 2f39b29bc64..10446db6292 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -1853,7 +1853,9 @@ mod tests { .unwrap_err(); assert!(matches!( res, - Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::AlreadyExist { .. }))) + Error(box ErrorInner::Mvcc(MvccError( + box MvccErrorInner::AlreadyExist { .. } + ))) )); assert_eq!(cm.max_ts().into_inner(), 15); @@ -1876,7 +1878,9 @@ mod tests { .unwrap_err(); assert!(matches!( res, - Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::WriteConflict { .. }))) + Error(box ErrorInner::Mvcc(MvccError( + box MvccErrorInner::WriteConflict { .. } + ))) )); } @@ -2282,9 +2286,9 @@ mod tests { .unwrap_err(); assert!(matches!( err, - Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::PessimisticLockNotFound { - .. - }))) + Error(box ErrorInner::Mvcc(MvccError( + box MvccErrorInner::PessimisticLockNotFound { .. 
} + ))) )); must_unlocked(&mut engine, b"k2"); // However conflict still won't be checked if there's a non-retry request @@ -2465,9 +2469,9 @@ mod tests { let err = prewrite_command(&mut engine, cm.clone(), &mut stat, cmd).unwrap_err(); assert!(matches!( err, - Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::PessimisticLockNotFound { - .. - }))) + Error(box ErrorInner::Mvcc(MvccError( + box MvccErrorInner::PessimisticLockNotFound { .. } + ))) )); // Passing keys in different order gets the same result: let cmd = PrewritePessimistic::with_defaults( @@ -2488,9 +2492,9 @@ mod tests { let err = prewrite_command(&mut engine, cm, &mut stat, cmd).unwrap_err(); assert!(matches!( err, - Error(box ErrorInner::Mvcc(MvccError(box MvccErrorInner::PessimisticLockNotFound { - .. - }))) + Error(box ErrorInner::Mvcc(MvccError( + box MvccErrorInner::PessimisticLockNotFound { .. } + ))) )); // If the two keys are sent in different requests, it would be the client's duty diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index 549d1d22636..a662d9bab79 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -224,7 +224,7 @@ impl Latches { keep_latches_for_next_cmd: Option<(u64, &Lock)>, ) -> Vec { // Used to - let dummy_vec = []; + let dummy_vec = vec![]; let (keep_latches_for_cid, mut keep_latches_it) = match keep_latches_for_next_cmd { Some((cid, lock)) => (Some(cid), lock.required_hashes.iter().peekable()), None => (None, dummy_vec.iter().peekable()), @@ -282,9 +282,9 @@ mod tests { fn test_wakeup() { let latches = Latches::new(256); - let keys_a = ["k1", "k3", "k5"]; + let keys_a = vec!["k1", "k3", "k5"]; let mut lock_a = Lock::new(keys_a.iter()); - let keys_b = ["k4", "k5", "k6"]; + let keys_b = vec!["k4", "k5", "k6"]; let mut lock_b = Lock::new(keys_b.iter()); let cid_a: u64 = 1; let cid_b: u64 = 2; @@ -310,9 +310,9 @@ mod tests { fn test_wakeup_by_multi_cmds() { let latches = Latches::new(256); - let keys_a = ["k1", "k2", "k3"]; - let 
keys_b = ["k4", "k5", "k6"]; - let keys_c = ["k3", "k4"]; + let keys_a = vec!["k1", "k2", "k3"]; + let keys_b = vec!["k4", "k5", "k6"]; + let keys_c = vec!["k3", "k4"]; let mut lock_a = Lock::new(keys_a.iter()); let mut lock_b = Lock::new(keys_b.iter()); let mut lock_c = Lock::new(keys_c.iter()); @@ -353,10 +353,10 @@ mod tests { fn test_wakeup_by_small_latch_slot() { let latches = Latches::new(5); - let keys_a = ["k1", "k2", "k3"]; - let keys_b = ["k6", "k7", "k8"]; - let keys_c = ["k3", "k4"]; - let keys_d = ["k7", "k10"]; + let keys_a = vec!["k1", "k2", "k3"]; + let keys_b = vec!["k6", "k7", "k8"]; + let keys_c = vec!["k3", "k4"]; + let keys_d = vec!["k7", "k10"]; let mut lock_a = Lock::new(keys_a.iter()); let mut lock_b = Lock::new(keys_b.iter()); let mut lock_c = Lock::new(keys_c.iter()); diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 2ca3ef145c8..19736304373 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -267,7 +267,7 @@ pub fn tls_collect_scan_details(cmd: &'static str, stats: &Statistics) { m.borrow_mut() .local_scan_details .entry(cmd) - .or_default() + .or_insert_with(Default::default) .add(stats); }); } diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 0081d5e95bc..aa0c2c29dec 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -120,7 +120,6 @@ uuid = { version = "0.8.1", features = ["serde", "v4"] } procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } [dev-dependencies] -arrow = "46.0" byteorder = "1.2" # See https://bheisler.github.io/criterion.rs/book/user_guide/known_limitations.html for the usage # of `real_blackbox` feature. 
diff --git a/tests/benches/coprocessor_executors/util/mod.rs b/tests/benches/coprocessor_executors/util/mod.rs index 3698860b4ea..0a5708c74ce 100644 --- a/tests/benches/coprocessor_executors/util/mod.rs +++ b/tests/benches/coprocessor_executors/util/mod.rs @@ -147,7 +147,7 @@ where I: 'static, { fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + self.get_name().partial_cmp(other.get_name()) } } diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index 99f2c9ee1f4..92dacfe6dc9 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -61,7 +61,7 @@ where .unwrap(); } let write_data = WriteData::from_modifies(txn.into_modifies()); - let _ = futures::executor::block_on(tikv_kv::write(engine, &ctx, write_data, None)); + let _ = tikv_kv::write(engine, &ctx, write_data, None); let keys: Vec = kvs.iter().map(|(k, _)| Key::from_raw(k)).collect(); let snapshot = engine.snapshot(Default::default()).unwrap(); (snapshot, keys) diff --git a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs b/tests/benches/misc/coprocessor/codec/chunk/chunk.rs deleted file mode 100644 index 241284a7228..00000000000 --- a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::sync::Arc; - -use arrow::{ - array, - datatypes::{self, DataType, Field}, - record_batch::RecordBatch, -}; -use tidb_query_datatype::{codec::Datum, prelude::*, FieldTypeFlag, FieldTypeTp}; -use tipb::FieldType; - -pub struct Chunk { - pub data: RecordBatch, -} - -impl Chunk { - pub fn get_datum(&self, col_id: usize, row_id: usize, field_type: &FieldType) -> Datum { - if self.data.column(col_id).is_null(row_id) { - return Datum::Null; - } - - match field_type.as_accessor().tp() { - FieldTypeTp::Tiny - | FieldTypeTp::Short - | FieldTypeTp::Int24 - | FieldTypeTp::Long - | FieldTypeTp::LongLong - | FieldTypeTp::Year => { - if field_type - .as_accessor() - .flag() - .contains(FieldTypeFlag::UNSIGNED) - { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::() - .unwrap(); - - Datum::U64(data.value(row_id)) - } else { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::() - .unwrap(); - - Datum::I64(data.value(row_id)) - } - } - FieldTypeTp::Float | FieldTypeTp::Double => { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::() - .unwrap(); - Datum::F64(data.value(row_id)) - } - _ => unreachable!(), - } - } -} - -pub struct ChunkBuilder { - columns: Vec, -} - -impl ChunkBuilder { - pub fn new(cols: usize, rows: usize) -> ChunkBuilder { - ChunkBuilder { - columns: vec![ColumnsBuilder::new(rows); cols], - } - } - - pub fn build(self, tps: &[FieldType]) -> Chunk { - let mut fields = Vec::with_capacity(tps.len()); - let mut arrays: Vec> = Vec::with_capacity(tps.len()); - for (field_type, column) in tps.iter().zip(self.columns) { - match field_type.as_accessor().tp() { - FieldTypeTp::Tiny - | FieldTypeTp::Short - | FieldTypeTp::Int24 - | FieldTypeTp::Long - | FieldTypeTp::LongLong - | FieldTypeTp::Year => { - if field_type - .as_accessor() - .flag() - .contains(FieldTypeFlag::UNSIGNED) - { - let (f, d) = column.into_u64_array(); - fields.push(f); - arrays.push(d); - } else { - let (f, d) = 
column.into_i64_array(); - fields.push(f); - arrays.push(d); - } - } - FieldTypeTp::Float | FieldTypeTp::Double => { - let (f, d) = column.into_f64_array(); - fields.push(f); - arrays.push(d); - } - _ => unreachable!(), - }; - } - let schema = datatypes::Schema::new(fields); - let batch = RecordBatch::try_new(Arc::new(schema), arrays).unwrap(); - Chunk { data: batch } - } - - pub fn append_datum(&mut self, col_id: usize, data: Datum) { - self.columns[col_id].append_datum(data) - } -} - -#[derive(Clone)] -pub struct ColumnsBuilder { - data: Vec, -} - -impl ColumnsBuilder { - fn new(rows: usize) -> ColumnsBuilder { - ColumnsBuilder { - data: Vec::with_capacity(rows), - } - } - - fn append_datum(&mut self, data: Datum) { - self.data.push(data) - } - - fn into_i64_array(self) -> (Field, Arc) { - let field = Field::new("", DataType::Int64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::I64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } - - fn into_u64_array(self) -> (Field, Arc) { - let field = Field::new("", DataType::UInt64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::U64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } - - fn into_f64_array(self) -> (Field, Arc) { - let field = Field::new("", DataType::Float64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::F64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } -} diff --git a/tests/benches/misc/coprocessor/codec/chunk/mod.rs b/tests/benches/misc/coprocessor/codec/chunk/mod.rs deleted file mode 100644 index 
f956e2cb14e..00000000000 --- a/tests/benches/misc/coprocessor/codec/chunk/mod.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - -mod chunk; - -use test::Bencher; -use tidb_query_datatype::{ - codec::{ - chunk::{Chunk, ChunkEncoder}, - datum::Datum, - mysql::*, - }, - FieldTypeTp, -}; -use tipb::FieldType; - -#[bench] -fn bench_encode_chunk(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![ - FieldTypeTp::LongLong.into(), - FieldTypeTp::LongLong.into(), - FieldTypeTp::VarChar.into(), - FieldTypeTp::VarChar.into(), - FieldTypeTp::NewDecimal.into(), - FieldTypeTp::Json.into(), - ]; - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - let s = format!("{}.123435", row_id); - let bs = Datum::Bytes(s.as_bytes().to_vec()); - let dec = Datum::Dec(s.parse().unwrap()); - let json = Datum::Json(Json::from_string(s).unwrap()); - chunk.append_datum(0, &Datum::Null).unwrap(); - chunk.append_datum(1, &Datum::I64(row_id as i64)).unwrap(); - chunk.append_datum(2, &bs).unwrap(); - chunk.append_datum(3, &bs).unwrap(); - chunk.append_datum(4, &dec).unwrap(); - chunk.append_datum(5, &json).unwrap(); - } - - b.iter(|| { - let mut buf = vec![]; - buf.write_chunk(&chunk).unwrap(); - }); -} - -#[bench] -fn bench_chunk_build_tidb(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::LongLong.into()]; - - b.iter(|| { - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - chunk.append_datum(0, &Datum::Null).unwrap(); - chunk.append_datum(1, &Datum::I64(row_id as i64)).unwrap(); - } - }); -} - -#[bench] -fn bench_chunk_build_official(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::LongLong.into()]; - - b.iter(|| { - let mut chunk = chunk::ChunkBuilder::new(fields.len(), rows); - for row_id in 0..rows { - chunk.append_datum(0, Datum::Null); - chunk.append_datum(1, Datum::I64(row_id 
as i64)); - } - chunk.build(&fields); - }); -} - -#[bench] -fn bench_chunk_iter_tidb(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::Double.into()]; - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - if row_id & 1 == 0 { - chunk.append_datum(0, &Datum::Null).unwrap(); - } else { - chunk.append_datum(0, &Datum::I64(row_id as i64)).unwrap(); - } - chunk.append_datum(1, &Datum::F64(row_id as f64)).unwrap(); - } - - b.iter(|| { - let mut col1 = 0; - let mut col2 = 0.0; - for row in chunk.iter() { - col1 += match row.get_datum(0, &fields[0]).unwrap() { - Datum::I64(v) => v, - Datum::Null => 0, - _ => unreachable!(), - }; - col2 += match row.get_datum(1, &fields[1]).unwrap() { - Datum::F64(v) => v, - _ => unreachable!(), - }; - } - assert_eq!(col1, 262_144); - assert!(!(523_776.0 - col2).is_normal()); - }); -} - -#[bench] -fn bench_chunk_iter_official(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::Double.into()]; - let mut chunk = chunk::ChunkBuilder::new(fields.len(), rows); - for row_id in 0..rows { - if row_id & 1 == 0 { - chunk.append_datum(0, Datum::Null); - } else { - chunk.append_datum(0, Datum::I64(row_id as i64)); - } - - chunk.append_datum(1, Datum::F64(row_id as f64)); - } - let chunk = chunk.build(&fields); - b.iter(|| { - let (mut col1, mut col2) = (0, 0.0); - for row_id in 0..chunk.data.num_rows() { - col1 += match chunk.get_datum(0, row_id, &fields[0]) { - Datum::I64(v) => v, - Datum::Null => 0, - _ => unreachable!(), - }; - col2 += match chunk.get_datum(1, row_id, &fields[1]) { - Datum::F64(v) => v, - _ => unreachable!(), - }; - } - assert_eq!(col1, 262_144); - assert!(!(523_776.0 - col2).is_normal()); - }); -} diff --git a/tests/benches/misc/coprocessor/codec/mod.rs b/tests/benches/misc/coprocessor/codec/mod.rs index 274ec362377..082f1c55894 100644 --- a/tests/benches/misc/coprocessor/codec/mod.rs +++ 
b/tests/benches/misc/coprocessor/codec/mod.rs @@ -1,6 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -mod chunk; mod mysql; use byteorder::{BigEndian, ByteOrder, LittleEndian}; diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index a545d9935e6..d567edd5add 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -171,7 +171,6 @@ fn bench_async_snapshots_noop(b: &mut test::Bencher) { } #[bench] -#[allow(clippy::let_underscore_future)] fn bench_async_snapshot(b: &mut test::Bencher) { let leader = new_peer(2, 3); let mut region = Region::default(); @@ -206,7 +205,6 @@ fn bench_async_snapshot(b: &mut test::Bencher) { } #[bench] -#[allow(clippy::let_underscore_future)] fn bench_async_write(b: &mut test::Bencher) { let leader = new_peer(2, 3); let mut region = Region::default(); diff --git a/tests/benches/raftstore/mod.rs b/tests/benches/raftstore/mod.rs index e164d59f82a..05c602824c2 100644 --- a/tests/benches/raftstore/mod.rs +++ b/tests/benches/raftstore/mod.rs @@ -12,7 +12,7 @@ const DEFAULT_DATA_SIZE: usize = 100_000; fn enc_write_kvs(db: &RocksEngine, kvs: &[(Vec, Vec)]) { let mut wb = db.write_batch(); - for (k, v) in kvs { + for &(ref k, ref v) in kvs { wb.put(&keys::data_key(k), v).unwrap(); } wb.write().unwrap(); diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index f40f40e6af1..a9dbd36a81a 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -1,8 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-#![allow(clippy::arc_with_non_send_sync)] -#![allow(clippy::unnecessary_mut_passed)] -#[allow(clippy::let_underscore_future)] mod test_async_fetch; mod test_async_io; mod test_backup; diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index 55c06d87b07..217269bb5b8 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -35,7 +35,7 @@ fn get_fp(usage: DiskUsage, store_id: u64) -> String { // check the region new leader is elected. fn assert_region_leader_changed( - cluster: &Cluster, + cluster: &mut Cluster, region_id: u64, original_leader: u64, ) { @@ -91,7 +91,7 @@ fn test_disk_full_leader_behaviors(usage: DiskUsage) { let new_last_index = cluster.raft_local_state(1, 1).last_index; assert_eq!(old_last_index, new_last_index); - assert_region_leader_changed(&cluster, 1, 1); + assert_region_leader_changed(&mut cluster, 1, 1); fail::remove(get_fp(usage, 1)); cluster.must_transfer_leader(1, new_peer(1, 1)); fail::cfg(get_fp(usage, 1), "return").unwrap(); @@ -199,7 +199,7 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) { DiskFullOpt::NotAllowedOnFull, ); assert!(res.get_region_error().has_disk_full()); - assert_region_leader_changed(&cluster, 1, 1); + assert_region_leader_changed(&mut cluster, 1, 1); fail::remove(get_fp(usage, 1)); cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -393,7 +393,7 @@ fn test_disk_full_followers_with_hibernate_regions() { // check the region new leader is elected. 
fn assert_region_merged( - cluster: &Cluster, + cluster: &mut Cluster, left_region_key: &[u8], right_region_key: &[u8], ) { diff --git a/tests/failpoints/cases/test_engine.rs b/tests/failpoints/cases/test_engine.rs index 2dd5b6ac04b..073f7276419 100644 --- a/tests/failpoints/cases/test_engine.rs +++ b/tests/failpoints/cases/test_engine.rs @@ -57,7 +57,6 @@ fn test_write_buffer_manager() { } } -#[rustfmt::skip] // The test mocks the senario before https://github.com/tikv/rocksdb/pull/347: // note: before rocksdb/pull/347, lock is called before on_memtable_sealed. // Case: diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index d8f73f312b6..d2eb9aa10dd 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -93,7 +93,6 @@ fn test_break_leadership_on_restart() { // received, and become `GroupState::Ordered` after the proposal is received. // But they should keep wakeful for a while. #[test] -#[allow(clippy::let_underscore_future)] fn test_store_disconnect_with_hibernate() { let mut cluster = new_server_cluster(0, 3); let base_tick_ms = 50; diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 08b7474bb8e..0c16819082b 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1710,7 +1710,8 @@ fn test_destroy_source_peer_while_merging() { } struct MsgTimeoutFilter { - tx: Sender, + // wrap with mutex to make tx Sync. + tx: Mutex>, } impl Filter for MsgTimeoutFilter { @@ -1718,7 +1719,7 @@ impl Filter for MsgTimeoutFilter { let mut res = Vec::with_capacity(msgs.len()); for m in msgs.drain(..) { if m.get_message().msg_type == MessageType::MsgTimeoutNow { - self.tx.send(m).unwrap(); + self.tx.lock().unwrap().send(m).unwrap(); } else { res.push(m); } @@ -1787,7 +1788,7 @@ fn test_concurrent_between_transfer_leader_and_merge() { // msg by using Filter. 
So we make node-1-1000 be in leader_transferring status // for some time. let (tx, rx_msg) = channel(); - let filter = MsgTimeoutFilter { tx }; + let filter = MsgTimeoutFilter { tx: Mutex::new(tx) }; cluster.add_send_filter_on_node(1, Box::new(filter)); pd_client.transfer_leader( @@ -1811,13 +1812,15 @@ fn test_concurrent_between_transfer_leader_and_merge() { let router = cluster.get_router(2).unwrap(); let (tx, rx) = channel(); + let tx = Mutex::new(tx); let _ = fail::cfg_callback("propose_commit_merge_1", move || { - tx.send(()).unwrap(); + tx.lock().unwrap().send(()).unwrap(); }); let (tx2, rx2) = channel(); + let tx2 = Mutex::new(tx2); let _ = fail::cfg_callback("on_propose_commit_merge_success", move || { - tx2.send(()).unwrap(); + tx2.lock().unwrap().send(()).unwrap(); }); cluster.merge_region(left.get_id(), right.get_id(), Callback::None); diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 201aafce6fb..0115d6d7ba5 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -43,7 +43,6 @@ macro_rules! request { } #[test] -#[allow(clippy::let_underscore_future)] fn test_pd_client_deadlock() { let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); let pd_client_reconnect_fp = "pd_client_reconnect"; diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index 583dad2ff34..ac427c29e69 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -43,7 +43,6 @@ macro_rules! 
request { } #[test] -#[allow(clippy::let_underscore_future)] fn test_pd_client_deadlock() { let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); let client = Arc::new(client); diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index 5ab7edb503f..a795422c120 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -208,7 +208,7 @@ fn test_leader_transfer() { #[test] fn test_region_merge() { let mut suite = TestSuite::new(3, ApiVersion::V2); - let keys = [b"rk0", b"rk1", b"rk2", b"rk3", b"rk4", b"rk5"]; + let keys = vec![b"rk0", b"rk1", b"rk2", b"rk3", b"rk4", b"rk5"]; suite.must_raw_put(keys[1], b"v1"); suite.must_raw_put(keys[3], b"v3"); diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs index dc6906b668a..7351044b297 100644 --- a/tests/failpoints/cases/test_read_execution_tracker.rs +++ b/tests/failpoints/cases/test_read_execution_tracker.rs @@ -4,16 +4,11 @@ use kvproto::kvrpcpb::*; use test_coprocessor::{init_with_data, DagSelect, ProductTable}; use test_raftstore::{kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite}; use test_raftstore_macro::test_case; -use tikv_util::config::ReadableDuration; -#[test_case(test_raftstore::must_new_cluster_with_cfg_and_kv_client_mul)] -#[test_case(test_raftstore_v2::must_new_cluster_with_cfg_and_kv_client_mul)] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_read_execution_tracking() { - let (_cluster, client, ctx) = new_cluster(1, |c| { - // set a small renew duration to avoid trigger pre-renew that can affact the - // metrics. 
- c.cfg.tikv.raft_store.renew_leader_lease_advance_duration = ReadableDuration::millis(1); - }); + let (_cluster, client, ctx) = new_cluster(); let (k1, v1) = (b"k1".to_vec(), b"v1".to_vec()); let (k2, v2) = (b"k2".to_vec(), b"v2".to_vec()); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 10a65271462..65c50793d7a 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1426,7 +1426,8 @@ impl Filter for TeeFilter { // 2. the splitted region set has_dirty_data be true in `apply_snapshot` // 3. the splitted region schedule tablet trim task in `on_applied_snapshot` // with tablet index 5 -// 4. the splitted region received a snapshot sent from its leader +// 4. the splitted region received a snapshot sent from its +// leader // 5. after finishing applying this snapshot, the tablet index in storage // changed to 6 // 6. tablet trim complete and callbacked to raftstore diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 4668c24ad66..57047bef9d4 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -1620,7 +1620,9 @@ fn test_before_propose_deadline() { assert!( matches!( res, - Err(StorageError(box StorageErrorInner::Kv(KvError(box KvErrorInner::Request(_))))) + Err(StorageError(box StorageErrorInner::Kv(KvError( + box KvErrorInner::Request(_), + )))) ), "actual: {:?}", res diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 4154a764d99..14f4161c7ae 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -751,7 +751,7 @@ fn test_proposal_concurrent_with_conf_change_and_transfer_leader() { let handle = std::thread::spawn(move || { let mut mutations = vec![]; - for key in [b"key3".to_vec(), b"key4".to_vec()] { + for key in vec![b"key3".to_vec(), b"key4".to_vec()] { 
let mut mutation = kvproto::kvrpcpb::Mutation::default(); mutation.set_op(Op::Put); mutation.set_key(key); diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index 02fb8c046c8..75eb62bab99 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -361,8 +361,8 @@ fn test_read_lock_after_become_follower() { /// 1. Inserted 5 entries and make all stores commit and apply them. /// 2. Prevent the store 3 from append following logs. /// 3. Insert another 20 entries. -/// 4. Wait for some time so that part of the entry cache are compacted on the -/// leader(store 1). +/// 4. Wait for some time so that part of the entry cache are compacted +/// on the leader(store 1). macro_rules! run_cluster_for_test_warmup_entry_cache { ($cluster:expr) => { // Let the leader compact the entry cache. diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index bd5461e6134..4cfd4be07be 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -492,7 +492,6 @@ fn test_backup_raw_meta() { } #[test] -#[allow(clippy::permissions_set_readonly_false)] fn test_invalid_external_storage() { let mut suite = TestSuite::new(1, 144 * 1024 * 1024, ApiVersion::V1); // Put some data. 
diff --git a/tests/integrations/import/test_apply_log.rs b/tests/integrations/import/test_apply_log.rs index f821ffea2e7..3d8cf85b02c 100644 --- a/tests/integrations/import/test_apply_log.rs +++ b/tests/integrations/import/test_apply_log.rs @@ -67,6 +67,6 @@ fn test_apply_twice() { &tikv, &ctx, CF_DEFAULT, - default_fst.into_iter().chain(default_snd), + default_fst.into_iter().chain(default_snd.into_iter()), ); } diff --git a/tests/integrations/mod.rs b/tests/integrations/mod.rs index 86ceb5369e7..2b68c0a8ba9 100644 --- a/tests/integrations/mod.rs +++ b/tests/integrations/mod.rs @@ -4,8 +4,6 @@ #![feature(box_patterns)] #![feature(custom_test_frameworks)] #![test_runner(test_util::run_tests)] -#![allow(clippy::needless_pass_by_ref_mut)] -#![allow(clippy::extra_unused_type_parameters)] extern crate test; diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 30ea12a424b..056641e1e3f 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -287,8 +287,8 @@ fn test_flush_before_stop2() { // 1. lock `k` with index 6 // 2. on_applied_res => lockcf's last_modified = 6 // 3. flush lock cf => lockcf's flushed_index = 6 -// 4. batch {unlock `k`, write `k`} with index 7 (last_modified is updated in -// store but RocksDB is modified in apply. So, +// 4. batch {unlock `k`, write `k`} with index 7 +// (last_modified is updated in store but RocksDB is modified in apply. So, // before on_apply_res, the last_modified is not updated.) 
// // flush-before-close: diff --git a/tests/integrations/raftstore/test_compact_lock_cf.rs b/tests/integrations/raftstore/test_compact_lock_cf.rs index 56cb65cce87..fbc7629c73f 100644 --- a/tests/integrations/raftstore/test_compact_lock_cf.rs +++ b/tests/integrations/raftstore/test_compact_lock_cf.rs @@ -5,13 +5,13 @@ use engine_traits::{MiscExt, CF_LOCK}; use test_raftstore::*; use tikv_util::config::*; -fn flush(cluster: &Cluster) { +fn flush(cluster: &mut Cluster) { for engines in cluster.engines.values() { engines.kv.flush_cf(CF_LOCK, true).unwrap(); } } -fn flush_then_check(cluster: &Cluster, interval: u64, written: bool) { +fn flush_then_check(cluster: &mut Cluster, interval: u64, written: bool) { flush(cluster); // Wait for compaction. sleep_ms(interval * 2); diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 7701fe167c8..60f10936f2d 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -434,7 +434,6 @@ fn test_txn_query_stats_tmpl() { fail::remove("only_check_source_task_name"); } -#[allow(clippy::extra_unused_type_parameters)] fn raw_put( _cluster: &Cluster, client: &TikvClient, From 74f82f651654dba267438782af8756ccb65e7fda Mon Sep 17 00:00:00 2001 From: SeaRise Date: Tue, 26 Sep 2023 16:03:16 +0800 Subject: [PATCH 070/220] expr: fix wrong result of 0 / decimal and 0 % decimal (#15675) close tikv/tikv#15631 Signed-off-by: SeaRise --- .../src/codec/mysql/decimal.rs | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 143ec6c7760..bc18d7192f9 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -590,17 +590,24 @@ fn do_div_mod_impl( rhs: &Decimal, mut frac_incr: u8, do_mod: bool, + result_frac_cnt: Option, ) 
-> Option> { let r_frac_cnt = word_cnt!(rhs.frac_cnt) * DIGITS_PER_WORD; let (r_idx, r_prec) = rhs.remove_leading_zeroes(rhs.int_cnt + r_frac_cnt); if r_prec == 0 { + // short-circuit everything: rhs == 0 return None; } let l_frac_cnt = word_cnt!(lhs.frac_cnt) * DIGITS_PER_WORD; let (l_idx, l_prec) = lhs.remove_leading_zeroes(lhs.int_cnt + l_frac_cnt); if l_prec == 0 { - return Some(Res::Ok(Decimal::zero())); + // short-circuit everything: lhs == 0 + if let Some(result_frac) = result_frac_cnt { + return Some(Res::Ok(Decimal::new(0, result_frac, false))); + } else { + return Some(Res::Ok(Decimal::zero())); + } } frac_incr = frac_incr.saturating_sub(l_frac_cnt - lhs.frac_cnt + r_frac_cnt - rhs.frac_cnt); @@ -784,8 +791,9 @@ fn do_div_mod_impl( Some(res) } +#[allow(dead_code)] fn do_div_mod(lhs: &Decimal, rhs: &Decimal, frac_incr: u8, do_mod: bool) -> Option> { - do_div_mod_impl(lhs, rhs, frac_incr, do_mod) + do_div_mod_impl(lhs, rhs, frac_incr, do_mod, None) } /// `do_mul` multiplies two decimals. 
@@ -1704,7 +1712,7 @@ impl Decimal { fn div(&self, rhs: &Decimal, frac_incr: u8) -> Option> { let result_frac_cnt = cmp::min(self.result_frac_cnt.saturating_add(frac_incr), MAX_FRACTION); - let mut res = do_div_mod(self, rhs, frac_incr, false); + let mut res = do_div_mod_impl(self, rhs, frac_incr, false, Some(result_frac_cnt)); if let Some(ref mut dec) = res { dec.result_frac_cnt = result_frac_cnt; } @@ -2362,7 +2370,7 @@ impl<'a, 'b> Rem<&'a Decimal> for &'b Decimal { type Output = Option>; fn rem(self, rhs: &'a Decimal) -> Self::Output { let result_frac_cnt = cmp::max(self.result_frac_cnt, rhs.result_frac_cnt); - let mut res = do_div_mod_impl(self, rhs, 0, true); + let mut res = do_div_mod_impl(self, rhs, 0, true, Some(result_frac_cnt)); if let Some(ref mut dec) = res { dec.result_frac_cnt = result_frac_cnt; } @@ -3545,17 +3553,28 @@ mod tests { assert_eq!(res, rem_exp.map(|s| s.to_owned())); } - let div_cases = vec![( - "-43791957044243810000000000000000000000000000000000000000000000000000000000000", - "-0.0000000000000000000000000000000000000000000000000012867433602814482", - Res::Overflow( - "34033171179267041433424155279291553259014210153022524070386565694757521640", + let div_cases = vec![ + ( + "-43791957044243810000000000000000000000000000000000000000000000000000000000000", + "-0.0000000000000000000000000000000000000000000000000012867433602814482", + Res::Overflow( + "34033171179267041433424155279291553259014210153022524070386565694757521640", + ), ), - )]; - for (lhs_str, rhs_str, rem_exp) in div_cases { + ("0", "0.5", Res::Ok("0.0000")), + ]; + for (lhs_str, rhs_str, div_exp) in div_cases { let lhs: Decimal = lhs_str.parse().unwrap(); let rhs: Decimal = rhs_str.parse().unwrap(); let res = (&lhs / &rhs).unwrap().map(|d| d.to_string()); + assert_eq!(res, div_exp.map(|s| s.to_owned())) + } + + let rem_cases = vec![("0", "0.5", Res::Ok("0.0"))]; + for (lhs_str, rhs_str, rem_exp) in rem_cases { + let lhs: Decimal = lhs_str.parse().unwrap(); + let rhs: Decimal 
= rhs_str.parse().unwrap(); + let res = (lhs % rhs).unwrap().map(|d| d.to_string()); assert_eq!(res, rem_exp.map(|s| s.to_owned())) } } From 977888de9b218abd56928ab51e0f78a5b13c9063 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 26 Sep 2023 16:30:16 +0800 Subject: [PATCH 071/220] raftstore-v2: fix "failed to get merge entries" panic (#15649) close tikv/tikv#15633 fix "failed to get merge entries" panic Signed-off-by: SpadeA-Tang --- components/raftstore-v2/src/fsm/peer.rs | 1 + .../operation/command/admin/compact_log.rs | 2 + .../operation/command/admin/merge/commit.rs | 10 +- .../operation/command/admin/merge/prepare.rs | 2 + .../operation/command/admin/merge/rollback.rs | 18 +- .../raftstore-v2/src/operation/query/mod.rs | 1 + .../src/operation/ready/apply_trace.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 11 +- components/raftstore/src/store/fsm/peer.rs | 1 + components/test_raftstore-v2/src/util.rs | 38 +++- components/test_raftstore/src/cluster.rs | 5 +- tests/failpoints/cases/test_merge.rs | 179 +++++++++++++++--- .../integrations/raftstore/test_bootstrap.rs | 6 +- 13 files changed, 224 insertions(+), 52 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index d51d8eedb2a..872b2c4e7e6 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -196,6 +196,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.schedule_tick(PeerTick::SplitRegionCheck); self.schedule_tick(PeerTick::PdHeartbeat); self.schedule_tick(PeerTick::CompactLog); + self.fsm.peer.on_check_merge(self.store_ctx); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index d054234b46f..1c4538ab51e 100644 --- 
a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -145,6 +145,8 @@ impl Peer { store_ctx: &mut StoreContext, force: bool, ) { + fail::fail_point!("maybe_propose_compact_log", |_| {}); + // As leader, we would not keep caches for the peers that didn't response // heartbeat in the last few seconds. That happens probably because // another TiKV is down. In this case if we do not clean up the cache, diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index 8e55f89a7d2..bec0265ffc3 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -172,6 +172,7 @@ impl Peer { &mut self, store_ctx: &mut StoreContext, ) { + fail::fail_point!("on_schedule_merge", |_| {}); fail::fail_point!( "ask_target_peer_to_commit_merge_2", self.region_id() == 2, @@ -198,7 +199,7 @@ impl Peer { Ok(ents) => ents, Err(e) => slog_panic!( self.logger, - "failed to get merge entires"; + "failed to get merge entries"; "err" => ?e, "low" => low, "commit" => state.get_commit() @@ -261,6 +262,7 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) { + fail::fail_point!("on_ask_commit_merge", |_| {}); let expected_epoch = req.get_header().get_region_epoch(); let merge = req.get_admin_request().get_commit_merge(); assert!(merge.has_source_state() && merge.get_source_state().has_merge_state()); @@ -736,6 +738,12 @@ impl Peer { store_ctx: &mut StoreContext, mut res: CommitMergeResult, ) { + fail::fail_point!( + "on_apply_res_commit_merge_2", + self.peer().store_id == 2, + |_| {} + ); + let region = res.region_state.get_region(); assert!( res.source.get_end_key() == region.get_end_key() diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs 
b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index d3d1896287c..6ff982eea8c 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -812,6 +812,8 @@ impl Peer { store_ctx: &mut StoreContext, res: PrepareMergeResult, ) { + fail::fail_point!("on_apply_res_prepare_merge"); + let region = res.region_state.get_region().clone(); { let mut meta = store_ctx.store_meta.lock().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs index d931a295f4d..adc49a928b3 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs @@ -4,9 +4,8 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse}, - raft_serverpb::PeerState, + raft_serverpb::{PeerState, RegionLocalState}, }; use raftstore::{ coprocessor::RegionChangeReason, @@ -28,7 +27,7 @@ use crate::{ #[derive(Debug)] pub struct RollbackMergeResult { commit: u64, - region: metapb::Region, + region_state: RegionLocalState, } impl Peer { @@ -118,7 +117,7 @@ impl Apply { AdminResponse::default(), AdminCmdResult::RollbackMerge(RollbackMergeResult { commit: rollback.get_commit(), - region, + region_state: self.region_state().clone(), }), )) } @@ -131,6 +130,7 @@ impl Peer { store_ctx: &mut StoreContext, res: RollbackMergeResult, ) { + let region = res.region_state.get_region(); assert_ne!(res.commit, 0); let current = self.merge_context().and_then(|c| c.prepare_merge_index()); if current != Some(res.commit) { @@ -143,21 +143,21 @@ impl Peer { } { let mut meta = store_ctx.store_meta.lock().unwrap(); - meta.set_region(&res.region, true, &self.logger); - let (reader, _) = 
meta.readers.get_mut(&res.region.get_id()).unwrap(); + meta.set_region(region, true, &self.logger); + let (reader, _) = meta.readers.get_mut(®ion.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, reader, - res.region.clone(), + region.clone(), RegionChangeReason::RollbackMerge, self.storage().region_state().get_tablet_index(), ); } - let region_state = self.storage().region_state().clone(); let region_id = self.region_id(); self.state_changes_mut() - .put_region_state(region_id, res.commit, ®ion_state) + .put_region_state(region_id, res.commit, &res.region_state) .unwrap(); + self.storage_mut().set_region_state(res.region_state); self.set_has_extra_write(); self.rollback_merge(store_ctx); diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 2f1b1cd0138..10f6e3279c3 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -471,6 +471,7 @@ impl Peer { // Only leaders need to update applied_term. 
 if progress_to_be_updated && self.is_leader() { if applied_term == self.term() { + fail::fail_point!("on_applied_current_term"); ctx.coprocessor_host .on_applied_current_term(StateRole::Leader, self.region()); } diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index af0257e763f..e839089837d 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -718,7 +718,7 @@ impl Peer { ); let region_id = self.region_id(); let flush_threshold: u64 = (|| { - fail_point!("flush_before_cluse_threshold", |t| { + fail_point!("flush_before_close_threshold", |t| { t.unwrap().parse::().unwrap() }); 50 diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 87d41de776c..4ff47c4b4bb 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -231,6 +231,14 @@ impl Peer { unsafe_recovery_state: None, }; + // If merge_context is not None, it means the PrepareMerge is applied before + restart. So we have to enter prepare merge again to prevent all proposals + except for RollbackMerge. + if let Some(ref state) = peer.merge_context { + peer.proposal_control + .enter_prepare_merge(state.prepare_merge_index().unwrap()); + } + // If this region has only one peer and I am the one, campaign directly. let region = peer.region(); if region.get_peers().len() == 1 @@ -265,9 +273,6 @@ impl Peer { } /// Set the region of a peer. - /// - /// This will update the region of the peer, caller must ensure the region - /// has been preserved in a durable device. 
pub fn set_region( &mut self, host: &CoprocessorHost, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 30ba0c3059d..513e9c0636a 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -4602,6 +4602,7 @@ where } fn on_ready_prepare_merge(&mut self, region: metapb::Region, state: MergeState) { + fail_point!("on_apply_res_prepare_merge"); { let mut meta = self.ctx.store_meta.lock().unwrap(); meta.set_region( diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 805394b1ea0..d83dff12e9a 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -1,6 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fmt::Write, path::Path, sync::Arc, thread, time::Duration}; +use std::{ + fmt::Write, + path::Path, + sync::Arc, + thread, + time::{Duration, Instant}, +}; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{RocksEngine, RocksStatistics}; @@ -18,7 +24,7 @@ use raftstore::{store::ReadResponse, Result}; use rand::{prelude::SliceRandom, RngCore}; use server::common::ConfiguredRaftEngine; use tempfile::TempDir; -use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, Config}; +use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, sleep_ms, Config}; use tikv::{ server::KvEngineFactoryBuilder, storage::{ @@ -27,7 +33,8 @@ use tikv::{ }, }; use tikv_util::{ - config::ReadableDuration, escape, future::block_on_timeout, worker::LazyWorker, HandyRwLock, + config::ReadableDuration, escape, future::block_on_timeout, time::InstantExt, + worker::LazyWorker, HandyRwLock, }; use txn_types::Key; @@ -447,3 +454,28 @@ pub fn wait_down_peers, EK: KvEngine>( peers, count, peer ); } + +pub fn wait_region_epoch_change, EK: KvEngine>( + cluster: &Cluster, + waited_region: 
&metapb::Region, + timeout: Duration, +) { + let timer = Instant::now(); + loop { + if waited_region.get_region_epoch().get_version() + == cluster + .get_region_epoch(waited_region.get_id()) + .get_version() + { + if timer.saturating_elapsed() > timeout { + panic!( + "region {:?}, region epoch is still not changed.", + waited_region + ); + } + } else { + break; + } + sleep_ms(10); + } +} diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 26fa2a47d5f..2a4082893e7 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -4,7 +4,10 @@ use std::{ collections::hash_map::Entry as MapEntry, error::Error as StdError, result, - sync::{mpsc, Arc, Mutex, RwLock}, + sync::{ + mpsc::{self}, + Arc, Mutex, RwLock, + }, thread, time::Duration, }; diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 0c16819082b..861e4a658ce 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -3,7 +3,7 @@ use std::{ sync::{ atomic::{AtomicBool, Ordering}, - mpsc::{channel, Sender}, + mpsc::{channel, sync_channel, Sender}, *, }, thread, @@ -22,14 +22,16 @@ use raft::eraftpb::MessageType; use raftstore::store::*; use raftstore_v2::router::PeerMsg; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is rollback as expected. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_rollback() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -53,8 +55,16 @@ fn test_node_merge_rollback() { let schedule_merge_fp = "on_schedule_merge"; fail::cfg(schedule_merge_fp, "return()").unwrap(); - // The call is finished when prepare_merge is applied. - cluster.must_try_merge(region.get_id(), target_region.get_id()); + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_prepare_merge", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + cluster.merge_region(region.get_id(), target_region.get_id(), Callback::None); + // PrepareMerge is applied. + rx.recv().unwrap(); // Add a peer to trigger rollback. pd_client.must_add_peer(right.get_id(), new_peer(3, 5)); @@ -74,12 +84,7 @@ fn test_node_merge_rollback() { region.mut_region_epoch().set_version(4); for i in 1..3 { must_get_equal(&cluster.get_engine(i), b"k11", b"v11"); - let state_key = keys::region_state_key(region.get_id()); - let state: RegionLocalState = cluster - .get_engine(i) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state = cluster.region_local_state(region.get_id(), i); assert_eq!(state.get_state(), PeerState::Normal); assert_eq!(*state.get_region(), region); } @@ -88,7 +93,10 @@ fn test_node_merge_rollback() { fail::cfg(schedule_merge_fp, "return()").unwrap(); let target_region = pd_client.get_region(b"k3").unwrap(); - cluster.must_try_merge(region.get_id(), target_region.get_id()); + cluster.merge_region(region.get_id(), target_region.get_id(), Callback::None); + // PrepareMerge is applied. + rx.recv().unwrap(); + let mut region = pd_client.get_region(b"k1").unwrap(); // Split to trigger rollback. 
@@ -103,12 +111,7 @@ fn test_node_merge_rollback() { region.mut_region_epoch().set_version(6); for i in 1..3 { must_get_equal(&cluster.get_engine(i), b"k12", b"v12"); - let state_key = keys::region_state_key(region.get_id()); - let state: RegionLocalState = cluster - .get_engine(i) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state = cluster.region_local_state(region.get_id(), i); assert_eq!(state.get_state(), PeerState::Normal); assert_eq!(*state.get_region(), region); } @@ -1835,19 +1838,7 @@ fn test_concurrent_between_transfer_leader_and_merge() { rx2.recv().unwrap(); fail::remove("on_reject_commit_merge_1"); - let timer = Instant::now(); - loop { - if right.get_region_epoch().get_version() - == cluster.get_region_epoch(right.get_id()).get_version() - { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("region {:?} is still not merged.", right); - } - } else { - break; - } - sleep_ms(10); - } + wait_region_epoch_change(&cluster, &right, Duration::from_secs(5)); let region = pd_client.get_region(b"k1").unwrap(); assert_eq!(region.get_id(), right.get_id()); @@ -1856,3 +1847,129 @@ fn test_concurrent_between_transfer_leader_and_merge() { cluster.must_put(b"k4", b"v4"); } + +struct MsgVoteFilter {} + +impl Filter for MsgVoteFilter { + fn before(&self, msgs: &mut Vec) -> raftstore::Result<()> { + msgs.retain(|m| { + let msg_type = m.get_message().msg_type; + msg_type != MessageType::MsgRequestPreVote && msg_type != MessageType::MsgRequestVote + }); + check_messages(msgs) + } +} + +// Before the fix of this PR (#15649), after prepare merge, raft cmd can still +// be proposed if restart is involved. If the proposed raft cmd is CompactLog, +// panic can occur during fetch entries: see issue https://github.com/tikv/tikv/issues/15633. +// Consider the case: +// 1. node-1 apply PrepareMerge (assume log index 30), so it's in is_merging +// status which reject all proposals except for Rollback Merge +// 2. 
node-1 advance persisted_apply to 30 +// 3. node-1 restart and became leader. Now, it's not in is_merging status, so +// proposals can be proposed +// 4. node-1 propose CompactLog, replicate it to other nodes, and commit +// 5. node-0 apply PrepareMerge +// 6. node-0 apply CompactLog +// 7. node-0 fetches raft log entries which is required by +// AdminCmdType::CommitMerge and panic (due to compacted) +#[test] +fn test_restart_may_lose_merging_state() { + use test_raftstore_v2::*; + let mut cluster = new_node_cluster(0, 2); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(10); + + cluster.run(); + fail::cfg("maybe_propose_compact_log", "return").unwrap(); + fail::cfg("on_ask_commit_merge", "return").unwrap(); + fail::cfg("flush_before_close_threshold", "return(0)").unwrap(); + + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_prepare_merge", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + let region = cluster.get_region(b""); + cluster.must_split(&region, b"k20"); + + let source = cluster.get_region(b"k05"); + let target = cluster.get_region(b"k25"); + + cluster.add_send_filter_on_node(2, Box::new(MsgVoteFilter {})); + + cluster.must_transfer_leader( + source.id, + source + .get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + cluster.must_transfer_leader( + target.id, + target + .get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + + for i in 0..20 { + let k = format!("k{:02}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + + cluster.merge_region(source.id, target.id, Callback::None); + + rx.recv().unwrap(); + let router = 
cluster.get_router(1).unwrap(); + let (tx, rx) = sync_channel(1); + let msg = PeerMsg::FlushBeforeClose { tx }; + router.force_send(source.id, msg).unwrap(); + rx.recv().unwrap(); + + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_commit_merge_2", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + cluster.stop_node(1); + // Need to avoid propose commit merge, before node 1 becomes leader. Otherwise, + // the commit merge will be rejected. + let (tx2, rx2) = channel(); + let tx2 = Mutex::new(tx2); + fail::cfg_callback("on_applied_current_term", move || { + tx2.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + fail::remove("maybe_propose_compact_log"); + cluster.run_node(1).unwrap(); + + // we have two regions. + rx2.recv().unwrap(); + rx2.recv().unwrap(); + fail::remove("on_ask_commit_merge"); + // wait node 2 to apply commit merge + rx.recv_timeout(Duration::from_secs(10)).unwrap(); + + wait_region_epoch_change(&cluster, &target, Duration::from_secs(5)); + + let region = cluster.get_region(b"k1"); + assert_eq!(region.get_id(), target.get_id()); + assert_eq!(region.get_start_key(), source.get_start_key()); + assert_eq!(region.get_end_key(), target.get_end_key()); + + cluster.must_put(b"k400", b"v400"); +} diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 056641e1e3f..bca389b26e6 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -216,7 +216,7 @@ fn test_flush_before_stop() { let region = cluster.get_region(b"k60"); cluster.must_split(®ion, b"k070"); - fail::cfg("flush_before_cluse_threshold", "return(10)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(10)").unwrap(); for i in 0..100 { let key = format!("k{:03}", i); @@ -260,7 +260,7 @@ fn test_flush_before_stop2() { let mut cluster = new_server_cluster(0, 3); cluster.run(); - 
fail::cfg("flush_before_cluse_threshold", "return(10)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(10)").unwrap(); fail::cfg("on_flush_completed", "return").unwrap(); for i in 0..20 { @@ -331,7 +331,7 @@ fn test_flush_index_exceed_last_modified() { ) .unwrap(); - fail::cfg("flush_before_cluse_threshold", "return(1)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(1)").unwrap(); let router = cluster.get_router(1).unwrap(); let (tx, rx) = sync_channel(1); let msg = PeerMsg::FlushBeforeClose { tx }; From 9307f7ccfdf11c1047f833f888cbd77487b1c707 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 26 Sep 2023 18:06:17 +0800 Subject: [PATCH 072/220] raftstore-v2: fix MergedRecords not being cleaned up (#15650) close tikv/tikv#15644 MergedRecords were not being properly cleaned up, causing unnecessary bloating of RegionLocalState and continuous sending of GcPeerRequest by raftstore. This commit addresses the issue by enhancing the handling of GcPeerRequests, ensuring that target region followers forward GcPeerRequests to the source peer. The source peer or store then reports GcPeerResponse accordingly. 
Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- .../operation/command/admin/conf_change.rs | 4 +- components/raftstore-v2/src/operation/life.rs | 22 ++- components/test_raftstore-v2/src/cluster.rs | 44 +++++ tests/integrations/raftstore/test_life.rs | 19 +- tests/integrations/raftstore/test_merge.rs | 170 +++++++++++++++--- 5 files changed, 214 insertions(+), 45 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 77ef6c823c1..55cee490e52 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -604,8 +604,8 @@ impl Apply { "update gc peer"; "index" => log_index, "updates" => ?updates, - "gc_peers" => ?removed_records, - "merged_peers" => ?merged_records + "removed_records" => ?removed_records, + "merged_records" => ?merged_records ); removed_records.retain(|p| !updates.contains(&p.get_id())); merged_records.retain_mut(|r| { diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 4d1a59de0a6..8591d5daf23 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -681,6 +681,10 @@ impl Peer { let _ = router.send_raft_message(m.into()); }, ); + } else { + // Source peer is already destroyed. Forward to store, and let + // it report GcPeer response. + let _ = ctx.router.send_raft_message(m.into()); } }); } @@ -748,15 +752,23 @@ impl Peer { } // 2. ask target to check whether source should be deleted. 
for record in state.get_merged_records() { - for (source, target) in record - .get_source_peers() - .iter() - .zip(record.get_target_peers()) - { + for source in record.get_source_peers() { need_gc_ids.push(source.get_id()); if gc_context.confirmed_ids.contains(&source.get_id()) { continue; } + let Some(target) = record + .get_target_peers() + .iter() + .find(|p| p.get_store_id() == source.get_store_id()) + else { + panic!( + "[region {}] {} target peer not found, {:?}", + self.region_id(), + self.peer_id(), + state + ); + }; let mut msg = RaftMessage::default(); msg.set_region_id(record.get_target_region_id()); diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 8ede3290167..9d61918bd1f 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -1689,6 +1689,50 @@ impl, EK: KvEngine> Cluster { } } + pub fn must_empty_region_removed_records(&mut self, region_id: u64) { + let timer = Instant::now(); + loop { + thread::sleep(Duration::from_millis(100)); + + let leader = match self.leader_of_region(region_id) { + None => continue, + Some(l) => l, + }; + let region_state = self.region_local_state(region_id, leader.get_store_id()); + if region_state.get_removed_records().is_empty() { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "merged records and removed records must be empty, {:?}", + region_state + ); + } + } + } + + pub fn must_empty_region_merged_records(&mut self, region_id: u64) { + let timer = Instant::now(); + loop { + thread::sleep(Duration::from_millis(100)); + + let leader = match self.leader_of_region(region_id) { + None => continue, + Some(l) => l, + }; + let region_state = self.region_local_state(region_id, leader.get_store_id()); + if region_state.get_merged_records().is_empty() { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "merged records and removed records must be 
empty, {:?}", + region_state + ); + } + } + } + pub fn get_snap_dir(&self, node_id: u64) -> String { self.sim.rl().get_snap_dir(node_id) } diff --git a/tests/integrations/raftstore/test_life.rs b/tests/integrations/raftstore/test_life.rs index f3b5704a586..809904c7f46 100644 --- a/tests/integrations/raftstore/test_life.rs +++ b/tests/integrations/raftstore/test_life.rs @@ -7,9 +7,7 @@ use std::{ use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}; use raftstore::errors::Result; -use test_raftstore::{ - new_learner_peer, new_peer, sleep_ms, Filter, FilterFactory, Simulator as S1, -}; +use test_raftstore::{new_learner_peer, new_peer, Filter, FilterFactory, Simulator as S1}; use test_raftstore_v2::Simulator as S2; use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; @@ -125,20 +123,7 @@ fn test_gc_peer_tiflash_engine() { .must_remove_peer(r21, new_learner_peer(2, 10)); // Make sure leader cleans up removed_records. - let start = Instant::now(); - loop { - sleep_ms(500); - if cluster_v2 - .region_local_state(r21, 1) - .get_removed_records() - .is_empty() - { - break; - } - if start.saturating_elapsed() > Duration::from_secs(5) { - panic!("timeout"); - } - } + cluster_v2.must_empty_region_removed_records(r21); } #[test] diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 0b17ff72ae7..080724b15a7 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -6,7 +6,7 @@ use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::{CF_LOCK, CF_WRITE}; use kvproto::{ raft_cmdpb::CmdType, - raft_serverpb::{PeerState, RaftMessage, RegionLocalState}, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, }; use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; @@ -1733,7 +1733,7 @@ fn test_prepare_merge_with_5_nodes_snapshot() { } #[test_case(test_raftstore_v2::new_node_cluster)] -fn 
test_gc_peer_after_merge() { +fn test_gc_source_removed_records_after_merge() { let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); @@ -1792,23 +1792,151 @@ fn test_gc_peer_after_merge() { // Right region replica on store 3 must be removed. cluster.must_region_not_exist(right.get_id(), 3); - let start = Instant::now(); - loop { - sleep_ms(cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); - let region_state = cluster.region_local_state(left.get_id(), 1); - if (region_state.get_merged_records().is_empty() - || region_state.get_merged_records()[0] - .get_source_removed_records() - .is_empty()) - && region_state.get_removed_records().is_empty() - { - break; - } - if start.elapsed() > Duration::from_secs(5) { - panic!( - "source removed records and removed records must be empty, {:?}", - region_state - ); - } - } + // Right region must clean up removed and merged records. + cluster.must_empty_region_merged_records(left.get_id()); + cluster.must_empty_region_removed_records(left.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_peers_forward_by_target_peer_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.raft_log_gc_threshold = 40; + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(40); + cluster.cfg.raft_store.merge_max_log_gap = 15; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = 
find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Use DropMessageFilter to drop messages to store 3 without reporting error. + cluster.add_recv_filter_on_node( + 3, + Box::new(DropMessageFilter::new(Arc::new(|m| { + // Do not drop MsgAvailabilityRequest and MsgAvailabilityResponse + // messages, otherwise merge is blocked. + matches!( + m.get_extra_msg().get_type(), + ExtraMessageType::MsgAvailabilityRequest + | ExtraMessageType::MsgAvailabilityResponse + ) + }))), + ); + + // So cluster becomes + // left region: 1(leader) 2 | 3 + // right region: 1(leader) 2 | 3 + // | means isolation. + + // Merge left to right and remove left peer on store 3. + pd_client.must_merge(left.get_id(), right.get_id()); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3); + let region_state = cluster.region_local_state(right.get_id(), 1); + assert!( + !region_state.get_merged_records().is_empty(), + "{:?}", + region_state + ); + + // So cluster becomes + // left region: merged + // right region: 1(leader) 2 | 3 (removed but not yet destroyed) + // | means isolation. + + let state1 = cluster.truncated_state(right.get_id(), 1); + (0..50).for_each(|i| cluster.must_put(b"k2", format!("v{}", i).as_bytes())); + // Wait to trigger compact raft log + cluster.wait_log_truncated(right.get_id(), 1, state1.get_index() + 1); + + // Cluster filters and wait for gc peer ticks. + cluster.clear_recv_filter_on_node(3); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Left region replica on store 3 must be removed. + cluster.must_region_not_exist(left.get_id(), 3); + // Right region must clean up removed and merged records. 
+ cluster.must_empty_region_merged_records(right.get_id()); + cluster.must_empty_region_removed_records(right.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_peers_forward_by_store_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Drop GcPeerResponse. + cluster.add_recv_filter_on_node( + 1, + Box::new(DropMessageFilter::new(Arc::new(|m| { + m.get_extra_msg().get_type() != ExtraMessageType::MsgGcPeerResponse + }))), + ); + + // So cluster becomes + // left region: 1(leader) 2 | 3 + // right region: 1(leader) 2 | 3 + // | means isolation. + + // Merge left to right and remove left peer on store 3. + pd_client.must_merge(left.get_id(), right.get_id()); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3); + // Right region replica on store 3 must be removed. 
+ cluster.must_region_not_exist(right.get_id(), 3); + let region_state = cluster.region_local_state(right.get_id(), 1); + assert!( + !region_state.get_merged_records().is_empty(), + "{:?}", + region_state + ); + assert!( + !region_state.get_removed_records().is_empty(), + "{:?}", + region_state + ); + + // So cluster becomes + // left region: merged + // right region: 1(leader) 2 | 3 (destroyed but not yet cleaned in removed + // records) + // | means isolation. + + // Cluster filters and wait for gc peer ticks. + cluster.clear_recv_filter_on_node(1); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Right region must clean up removed and merged records. + cluster.must_empty_region_merged_records(right.get_id()); + cluster.must_empty_region_removed_records(right.get_id()); } From df263d287dbdc8397030a3437ee97c918c43abb4 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 27 Sep 2023 15:34:47 +0800 Subject: [PATCH 073/220] raftstore-v2: check gc peer after commit merge (#15693) close tikv/tikv#15672 This commit addresses the issue of orphan peers remaining in TiKV due to the absence of GcPeer tick registration after commit merge. The lack of regular checks on removed_records and merged_records can lead to delays in detecting and resolving these issues. To improve this, we have implemented a solution that ensures TiKV registers the GcPeer tick after commit merge. This change enables regular checks on the removed_records and merged_records, preventing them from being overlooked for an extended period. 
Signed-off-by: Neil Shen --- .../operation/command/admin/merge/commit.rs | 1 + tests/integrations/raftstore/test_merge.rs | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index bec0265ffc3..e95a13600fb 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -829,6 +829,7 @@ impl Peer { "target_region" => ?self.region(), ); self.add_pending_tick(PeerTick::SplitRegionCheck); + self.maybe_schedule_gc_peer_tick(); } } diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 080724b15a7..8d93d2c5a5c 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1940,3 +1940,34 @@ fn test_gc_source_peers_forward_by_store_after_merge() { cluster.must_empty_region_merged_records(right.get_id()); cluster.must_empty_region_removed_records(right.get_id()); } + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_merged_record_in_time() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(100); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + + // Wait enough time to trigger gc peer, and if there is nothing to gc, + // leader 
skips registering gc peer tick. + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Merge left to right. + pd_client.must_merge(left.get_id(), right.get_id()); + + // Once merge complete, gc peer tick should be registered and merged record + // will be cleaned up in time. + cluster.must_empty_region_merged_records(right.get_id()); +} From 73bc4012f0ea5c49870639ccf353d1de5382025f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 27 Sep 2023 17:45:16 +0800 Subject: [PATCH 074/220] sst_importer: impl SuspendImport interface (#15612) close tikv/tikv#15611 Signed-off-by: hillium Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 2 +- components/error_code/src/sst_importer.rs | 5 +- components/raftstore/src/store/util.rs | 2 +- components/sst_importer/src/errors.rs | 16 +++ src/import/sst_service.rs | 99 ++++++++++++++++++- tests/integrations/import/test_sst_service.rs | 94 ++++++++++++++++++ 6 files changed, 212 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 124a87f069e..b3842f92752 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2933,7 +2933,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#090f247be15c00a6000a4d23669ac3e95ea9fcd5" +source = "git+https://github.com/pingcap/kvproto.git#87bebcc0d071a18cbbd94a4fc02de9c4988af815" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 001f4f146f6..117400e8aff 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -22,5 +22,8 @@ define_error_codes!( TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), INCOMPATIBLE_API_VERSION => ("IncompatibleApiVersion", "", ""), INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), - 
RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", "") + RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", ""), + SUSPENDED => ("Suspended", + "this request has been suspended.", + "Probably there are some export tools don't support exporting data inserted by `ingest`(say, snapshot backup). Check the user manual and stop them.") ); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 3f34fe691ee..519d486102c 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -306,7 +306,7 @@ pub fn compare_region_epoch( // tells TiDB with a epoch not match error contains the latest target Region // info, TiDB updates its region cache and sends requests to TiKV B, // and TiKV B has not applied commit merge yet, since the region epoch in - // request is higher than TiKV B, the request must be denied due to epoch + // request is higher than TiKV B, the request must be suspended due to epoch // not match, so it does not read on a stale snapshot, thus avoid the // KeyNotInRegion error. let current_epoch = region.get_region_epoch(); diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 7ff940fff12..acca7523427 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -2,6 +2,7 @@ use std::{ error::Error as StdError, io::Error as IoError, num::ParseIntError, path::PathBuf, result, + time::Duration, }; use encryption::Error as EncryptionError; @@ -31,6 +32,7 @@ pub fn error_inc(type_: &str, err: &Error) { Error::BadFormat(..) => "bad_format", Error::Encryption(..) => "encryption", Error::CodecError(..) => "codec", + Error::Suspended { .. 
} => "suspended", _ => return, }; IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); @@ -125,6 +127,9 @@ pub enum Error { #[error("resource is not enough {0}")] ResourceNotEnough(String), + + #[error("imports are suspended for {time_to_lease_expire:?}")] + Suspended { time_to_lease_expire: Duration }, } impl Error { @@ -160,6 +165,16 @@ impl From for import_sstpb::Error { err.set_store_error(import_err); err.set_message(format!("{}", e)); } + Error::Suspended { + time_to_lease_expire, + } => { + let mut store_err = errorpb::Error::default(); + let mut server_is_busy = errorpb::ServerIsBusy::default(); + server_is_busy.set_backoff_ms(time_to_lease_expire.as_millis() as _); + store_err.set_server_is_busy(server_is_busy); + err.set_store_error(store_err); + err.set_message(format!("{}", e)); + } _ => { err.set_message(format!("{}", e)); } @@ -197,6 +212,7 @@ impl ErrorCodeExt for Error { Error::IncompatibleApiVersion => error_code::sst_importer::INCOMPATIBLE_API_VERSION, Error::InvalidKeyMode { .. } => error_code::sst_importer::INVALID_KEY_MODE, Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, + Error::Suspended { .. 
} => error_code::sst_importer::SUSPENDED, } } } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 6d40ffe959c..68403e226f8 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,7 +5,10 @@ use std::{ convert::identity, future::Future, path::PathBuf, - sync::{Arc, Mutex}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, time::Duration, }; @@ -20,7 +23,8 @@ use kvproto::{ errorpb, import_sstpb::{ Error as ImportPbError, ImportSst, Range, RawWriteRequest_oneof_chunk as RawChunk, SstMeta, - SwitchMode, WriteRequest_oneof_chunk as Chunk, *, + SuspendImportRpcRequest, SuspendImportRpcResponse, SwitchMode, + WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, }; @@ -41,7 +45,7 @@ use tikv_util::{ HandyRwLock, }; use tokio::{runtime::Runtime, time::sleep}; -use txn_types::{Key, WriteRef, WriteType}; +use txn_types::{Key, TimeStamp, WriteRef, WriteType}; use super::{ make_rpc_error, @@ -49,6 +53,7 @@ use super::{ }; use crate::{ import::duplicate_detect::DuplicateDetector, + send_rpc_response, server::CONFIG_ROCKSDB_GAUGE, storage::{self, errors::extract_region_error_from_error}, }; @@ -80,6 +85,10 @@ const WIRE_EXTRA_BYTES: usize = 12; /// [`raft_writer::ThrottledTlsEngineWriter`]. There aren't too many items held /// in the writer. So we can run the GC less frequently. const WRITER_GC_INTERVAL: Duration = Duration::from_secs(300); +/// The max time of suspending requests. +/// This may save us from some client sending insane value to the server. +const SUSPEND_REQUEST_MAX_SECS: u64 = // 6h + 6 * 60 * 60; fn transfer_error(err: storage::Error) -> ImportPbError { let mut e = ImportPbError::default(); @@ -121,6 +130,9 @@ pub struct ImportSstService { // it's some iff multi-rocksdb is enabled store_meta: Option>>>, resource_manager: Option>, + + // When less than now, don't accept any requests. 
+ suspend_req_until: Arc, } struct RequestCollector { @@ -356,6 +368,7 @@ impl ImportSstService { writer, store_meta, resource_manager, + suspend_req_until: Arc::new(AtomicU64::new(0)), } } @@ -619,6 +632,47 @@ impl ImportSstService { Ok(range) } + + /// Check whether we should suspend the current request. + fn check_suspend(&self) -> Result<()> { + let now = TimeStamp::physical_now(); + let suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + if now < suspend_until { + Err(Error::Suspended { + time_to_lease_expire: Duration::from_millis(suspend_until - now), + }) + } else { + Ok(()) + } + } + + /// suspend requests for a period. + /// + /// # returns + /// + /// whether for now, the requests has already been suspended. + pub fn suspend_requests(&self, for_time: Duration) -> bool { + let now = TimeStamp::physical_now(); + let last_suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + let suspended = now < last_suspend_until; + let suspend_until = TimeStamp::physical_now() + for_time.as_millis() as u64; + self.suspend_req_until + .store(suspend_until, Ordering::SeqCst); + suspended + } + + /// allow all requests to enter. + /// + /// # returns + /// + /// whether requests has already been previously suspended. 
+ pub fn allow_requests(&self) -> bool { + let now = TimeStamp::physical_now(); + let last_suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + let suspended = now < last_suspend_until; + self.suspend_req_until.store(0, Ordering::SeqCst); + suspended + } } #[macro_export] @@ -993,6 +1047,10 @@ impl ImportSst for ImportSstService { ) { let label = "ingest"; let timer = Instant::now_coarse(); + if let Err(err) = self.check_suspend() { + ctx.spawn(async move { crate::send_rpc_response!(Err(err), sink, label, timer) }); + return; + } let mut resp = IngestResponse::default(); let region_id = req.get_context().get_region_id(); @@ -1036,6 +1094,10 @@ impl ImportSst for ImportSstService { ) { let label = "multi-ingest"; let timer = Instant::now_coarse(); + if let Err(err) = self.check_suspend() { + ctx.spawn(async move { crate::send_rpc_response!(Err(err), sink, label, timer) }); + return; + } let mut resp = IngestResponse::default(); if let Some(errorpb) = self.check_write_stall(req.get_context().get_region_id()) { @@ -1240,6 +1302,37 @@ impl ImportSst for ImportSstService { RawChunk, new_raw_writer ); + + fn suspend_import_rpc( + &mut self, + ctx: RpcContext<'_>, + req: SuspendImportRpcRequest, + sink: UnarySink, + ) { + let label = "suspend_import_rpc"; + let timer = Instant::now_coarse(); + + if req.should_suspend_imports && req.get_duration_in_secs() > SUSPEND_REQUEST_MAX_SECS { + ctx.spawn(async move { + send_rpc_response!(Err(Error::Io( + std::io::Error::new(std::io::ErrorKind::InvalidInput, + format!("you are going to suspend the import RPCs too long. 
(for {} seconds, max acceptable duration is {} seconds)", + req.get_duration_in_secs(), SUSPEND_REQUEST_MAX_SECS)))), sink, label, timer); + }); + return; + } + + let suspended = if req.should_suspend_imports { + info!("suspend incoming import RPCs."; "for_second" => req.get_duration_in_secs(), "caller" => req.get_caller()); + self.suspend_requests(Duration::from_secs(req.get_duration_in_secs())) + } else { + info!("allow incoming import RPCs."; "caller" => req.get_caller()); + self.allow_requests() + }; + let mut resp = SuspendImportRpcResponse::default(); + resp.set_already_suspended(suspended); + ctx.spawn(async move { send_rpc_response!(Ok(resp), sink, label, timer) }); + } } // add error statistics from pb error response diff --git a/tests/integrations/import/test_sst_service.rs b/tests/integrations/import/test_sst_service.rs index 22ab9c7d7fe..6c56ab0018b 100644 --- a/tests/integrations/import/test_sst_service.rs +++ b/tests/integrations/import/test_sst_service.rs @@ -555,3 +555,97 @@ fn test_duplicate_and_close() { req.set_mode(SwitchMode::Normal); import.switch_mode(&req).unwrap(); } + +#[test] +fn test_suspend_import() { + let (_cluster, ctx, tikv, import) = new_cluster_and_tikv_import_client(); + let sst_range = (0, 10); + let write = |sst_range: (u8, u8)| { + let mut meta = new_sst_meta(0, 0); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + + let mut keys = vec![]; + let mut values = vec![]; + for i in sst_range.0..sst_range.1 { + keys.push(vec![i]); + values.push(vec![i]); + } + send_write_sst(&import, &meta, keys, values, 1) + }; + let ingest = |sst_meta: &SstMeta| { + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(sst_meta.clone()); + import.ingest(&ingest) + }; + let multi_ingest = |sst_metas: &[SstMeta]| { + let mut multi_ingest = MultiIngestRequest::default(); + multi_ingest.set_context(ctx.clone()); + 
multi_ingest.set_ssts(sst_metas.to_vec().into()); + import.multi_ingest(&multi_ingest) + }; + let suspendctl = |for_time| { + let mut req = SuspendImportRpcRequest::default(); + req.set_caller("test_suspend_import".to_owned()); + if for_time == 0 { + req.set_should_suspend_imports(false); + } else { + req.set_should_suspend_imports(true); + req.set_duration_in_secs(for_time); + } + req + }; + + let write_res = write(sst_range).unwrap(); + assert_eq!(write_res.metas.len(), 1); + let sst = write_res.metas[0].clone(); + + assert!( + !import + .suspend_import_rpc(&suspendctl(6000)) + .unwrap() + .already_suspended + ); + let write_res = write(sst_range); + write_res.unwrap(); + let ingest_res = ingest(&sst); + assert_to_string_contains!(ingest_res.unwrap_err(), "Suspended"); + let multi_ingest_res = multi_ingest(&[sst.clone()]); + assert_to_string_contains!(multi_ingest_res.unwrap_err(), "Suspended"); + + assert!( + import + .suspend_import_rpc(&suspendctl(0)) + .unwrap() + .already_suspended + ); + + let ingest_res = ingest(&sst); + assert!(ingest_res.is_ok(), "{:?} => {:?}", sst, ingest_res); + + check_ingested_txn_kvs(&tikv, &ctx, sst_range, 2); + + // test timeout. + assert!( + !import + .suspend_import_rpc(&suspendctl(1)) + .unwrap() + .already_suspended + ); + let sst_range = (10, 20); + let write_res = write(sst_range); + let sst = write_res.unwrap().metas; + let res = multi_ingest(&sst); + assert_to_string_contains!(res.unwrap_err(), "Suspended"); + std::thread::sleep(Duration::from_secs(1)); + multi_ingest(&sst).unwrap(); + + // check an insane value should be rejected. 
+ import + .suspend_import_rpc(&suspendctl(u64::MAX - 42)) + .unwrap_err(); + let sst_range = (20, 30); + let ssts = write(sst_range).unwrap(); + multi_ingest(ssts.get_metas()).unwrap(); +} From 4814a6129b8a4ae122bb6152c140a064787456bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:34:48 +0800 Subject: [PATCH 075/220] compaction_guard: split SST when detected possible huge compaction (#15379) close tikv/tikv#15058 This PR make the compaction guard splits SSTs when it find that there are possible huge compactions. It works by iterating the next of the output level (Let is be level L+1), when a SST crosses such a huge key range that making L+1 contains size greater than the `max-compaction-size`. Signed-off-by: hillium Co-authored-by: tonyxuqqi --- .../engine_rocks/src/sst_partitioner.rs | 2 + .../engine_traits/src/sst_partitioner.rs | 2 + .../raftstore/src/store/compaction_guard.rs | 314 ++++++++++++++++-- src/config/mod.rs | 1 + 4 files changed, 293 insertions(+), 26 deletions(-) diff --git a/components/engine_rocks/src/sst_partitioner.rs b/components/engine_rocks/src/sst_partitioner.rs index fc1dcd40270..f642a94f28f 100644 --- a/components/engine_rocks/src/sst_partitioner.rs +++ b/components/engine_rocks/src/sst_partitioner.rs @@ -23,6 +23,8 @@ impl rocksdb::SstPartitionerFactory output_level: context.output_level, smallest_key: context.smallest_key, largest_key: context.largest_key, + next_level_boundaries: context.next_level_boundaries.clone(), + next_level_sizes: context.next_level_sizes.clone(), }; self.0.create_partitioner(&ctx).map(RocksSstPartitioner) } diff --git a/components/engine_traits/src/sst_partitioner.rs b/components/engine_traits/src/sst_partitioner.rs index bc6ec13a4eb..4a8ee9e71bc 100644 --- a/components/engine_traits/src/sst_partitioner.rs +++ b/components/engine_traits/src/sst_partitioner.rs @@ -22,6 +22,8 @@ pub struct SstPartitionerContext<'a> { pub 
output_level: i32, pub smallest_key: &'a [u8], pub largest_key: &'a [u8], + pub next_level_boundaries: Vec<&'a [u8]>, + pub next_level_sizes: Vec, } pub trait SstPartitioner { diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index efee09be906..138d730fa29 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -23,10 +23,16 @@ pub struct CompactionGuardGeneratorFactory { cf_name: CfNames, provider: P, min_output_file_size: u64, + max_compaction_size: u64, } impl CompactionGuardGeneratorFactory

{ - pub fn new(cf: CfName, provider: P, min_output_file_size: u64) -> Result { + pub fn new( + cf: CfName, + provider: P, + min_output_file_size: u64, + max_compaction_size: u64, + ) -> Result { let cf_name = match cf { CF_DEFAULT => CfNames::default, CF_LOCK => CfNames::lock, @@ -43,6 +49,7 @@ impl CompactionGuardGeneratorFactory

{ cf_name, provider, min_output_file_size, + max_compaction_size, }) } } @@ -72,6 +79,15 @@ impl SstPartitionerFactory use_guard: false, boundaries: vec![], pos: 0, + next_level_pos: 0, + next_level_boundaries: context + .next_level_boundaries + .iter() + .map(|v| v.to_vec()) + .collect(), + next_level_size: context.next_level_sizes.clone(), + current_next_level_size: 0, + max_compaction_size: self.max_compaction_size, }) } } @@ -86,7 +102,20 @@ pub struct CompactionGuardGenerator { use_guard: bool, // The boundary keys are exclusive. boundaries: Vec>, + /// The SST boundaries overlapped with the compaction input at the next + /// level of output level (let we call it L+2). When the output level is the + /// bottom-most level(usually L6), this will be empty. The boundaries + /// are the first key of the first sst concatenating with all ssts' end key. + next_level_boundaries: Vec>, + /// The size of each "segment" of L+2. If the `next_level_boundaries`(let we + /// call it NLB) isn't empty, `next_level_size` will have length + /// `NLB.len() - 1`, and at the position `N` stores the size of range + /// `[NLB[N], NLB[N+1]]` in L+2. + next_level_size: Vec, pos: usize, + next_level_pos: usize, + current_next_level_size: u64, + max_compaction_size: u64, } impl CompactionGuardGenerator

{ @@ -153,27 +182,52 @@ impl SstPartitioner for CompactionGuardGenerator

{ if !self.use_guard { return SstPartitionerResult::NotRequired; } - let mut pos = self.pos; - let mut skip_count = 0; - while pos < self.boundaries.len() && self.boundaries[pos].as_slice() <= req.prev_user_key { - pos += 1; - skip_count += 1; - if skip_count >= COMPACTION_GUARD_MAX_POS_SKIP { - let prev_user_key = req.prev_user_key.to_vec(); - pos = match self.boundaries.binary_search(&prev_user_key) { - Ok(search_pos) => search_pos + 1, - Err(search_pos) => search_pos, - }; - break; - } + self.pos = seek_to(&self.boundaries, req.prev_user_key, self.pos); + // Generally this shall be a noop... because each time we are moving the cursor + // to the previous key. + let left_next_level_pos = seek_to( + &self.next_level_boundaries, + req.prev_user_key, + self.next_level_pos, + ); + let right_next_level_pos = seek_to( + &self.next_level_boundaries, + req.current_user_key, + left_next_level_pos, + ); + // The cursor has been moved. + if right_next_level_pos > left_next_level_pos { + self.current_next_level_size += self.next_level_size + [left_next_level_pos..right_next_level_pos - 1] + .iter() + .map(|x| *x as u64) + .sum::(); } - self.pos = pos; - if pos < self.boundaries.len() && self.boundaries[pos].as_slice() <= req.current_user_key { - if req.current_output_file_size >= self.min_output_file_size { + self.next_level_pos = right_next_level_pos; + + if self.pos < self.boundaries.len() + && self.boundaries[self.pos].as_slice() <= req.current_user_key + { + if req.current_output_file_size >= self.min_output_file_size + // Or, the output file may make a huge compaction even greater than the max compaction size. 
+ || self.current_next_level_size >= self.max_compaction_size + { COMPACTION_GUARD_ACTION_COUNTER .get(self.cf_name) .partition .inc(); + // The current pointer status should be like (let * be the current pos, ^ be + // where the previous user key is): + // boundaries: A B C D + // size: 1 3 2 + // ^ * + // You will notice that the previous user key is between B and C, which indices + // that there must still be something between previous user key and C. + // We still set `current_next_level_size` to zero here, so the segment will be + // forgotten. I think that will be acceptable given generally a segment won't be + // greater than the `max-sst-size`, which is tiny comparing to the + // `max-compaction-size` usually. + self.current_next_level_size = 0; SstPartitionerResult::Required } else { COMPACTION_GUARD_ACTION_COUNTER @@ -193,10 +247,28 @@ impl SstPartitioner for CompactionGuardGenerator

{ } } +fn seek_to(all_data: &Vec>, target_key: &[u8], from_pos: usize) -> usize { + let mut pos = from_pos; + let mut skip_count = 0; + while pos < all_data.len() && all_data[pos].as_slice() <= target_key { + pos += 1; + skip_count += 1; + if skip_count >= COMPACTION_GUARD_MAX_POS_SKIP { + pos = match all_data.binary_search_by(|probe| probe.as_slice().cmp(target_key)) { + Ok(search_pos) => search_pos + 1, + Err(search_pos) => search_pos, + }; + break; + } + } + pos +} + #[cfg(test)] mod tests { - use std::str; + use std::{path::Path, str}; + use collections::HashMap; use engine_rocks::{ raw::{BlockBasedOptions, DBCompressionType}, util::new_engine_opt, @@ -212,6 +284,13 @@ mod tests { use super::*; use crate::coprocessor::region_info_accessor::MockRegionInfoProvider; + impl CompactionGuardGenerator { + fn reset_next_level_size_state(&mut self) { + self.current_next_level_size = 0; + self.next_level_pos = 0; + } + } + #[test] fn test_compaction_guard_non_data() { let mut guard = CompactionGuardGenerator { @@ -224,6 +303,11 @@ mod tests { use_guard: false, boundaries: vec![], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: vec![], + next_level_size: vec![], + max_compaction_size: 1 << 30, }; guard.smallest_key = keys::LOCAL_MIN_KEY.to_vec(); @@ -267,8 +351,16 @@ mod tests { provider: MockRegionInfoProvider::new(vec![]), initialized: true, use_guard: true, - boundaries: vec![b"bbb".to_vec(), b"ccc".to_vec()], + boundaries: vec![b"bbb".to_vec(), b"ccc".to_vec(), b"ddd".to_vec()], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: (0..10) + .map(|x| format!("bbb{:02}", x).into_bytes()) + .chain((0..100).map(|x| format!("cccz{:03}", x).into_bytes())) + .collect(), + next_level_size: [&[1 << 18; 99][..], &[1 << 28; 10][..]].concat(), + max_compaction_size: 1 << 30, // 1GB }; // Crossing region boundary. 
let mut req = SstPartitionerRequest { @@ -277,7 +369,11 @@ mod tests { current_output_file_size: 32 << 20, }; assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); + assert_eq!(guard.next_level_pos, 10); assert_eq!(guard.pos, 0); + assert_eq!(guard.current_next_level_size, 0); + guard.reset_next_level_size_state(); + // Output file size too small. req = SstPartitionerRequest { prev_user_key: b"bba", @@ -289,6 +385,10 @@ mod tests { SstPartitionerResult::NotRequired ); assert_eq!(guard.pos, 0); + assert_eq!(guard.next_level_pos, 10); + assert_eq!(guard.current_next_level_size, 9 << 18); + guard.reset_next_level_size_state(); + // Not crossing boundary. req = SstPartitionerRequest { prev_user_key: b"aaa", @@ -300,6 +400,9 @@ mod tests { SstPartitionerResult::NotRequired ); assert_eq!(guard.pos, 0); + assert_eq!(guard.next_level_pos, 0); + guard.reset_next_level_size_state(); + // Move position req = SstPartitionerRequest { prev_user_key: b"cca", @@ -308,6 +411,30 @@ mod tests { }; assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); assert_eq!(guard.pos, 1); + assert_eq!(guard.next_level_pos, 110); + guard.reset_next_level_size_state(); + + // Move next level posistion + req = SstPartitionerRequest { + prev_user_key: b"cccz000", + current_user_key: b"cccz042", + current_output_file_size: 1 << 20, + }; + assert_eq!( + guard.should_partition(&req), + SstPartitionerResult::NotRequired + ); + assert_eq!(guard.pos, 2); + assert_eq!(guard.next_level_pos, 53); + + req = SstPartitionerRequest { + prev_user_key: b"cccz090", + current_user_key: b"dde", + current_output_file_size: 1 << 20, + }; + assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); + assert_eq!(guard.pos, 2); + assert_eq!(guard.next_level_pos, 110); } #[test] @@ -339,6 +466,11 @@ mod tests { b"aaa15".to_vec(), ], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: vec![], + next_level_size: vec![], + 
max_compaction_size: 1 << 30, }; // Binary search meet exact match. guard.pos = 0; @@ -365,15 +497,23 @@ mod tests { const MIN_OUTPUT_FILE_SIZE: u64 = 1024; const MAX_OUTPUT_FILE_SIZE: u64 = 4096; + const MAX_COMPACTION_SIZE: u64 = 10240; fn new_test_db(provider: MockRegionInfoProvider) -> (RocksEngine, TempDir) { let temp_dir = TempDir::new().unwrap(); let mut cf_opts = RocksCfOptions::default(); + cf_opts.set_max_bytes_for_level_base(MAX_OUTPUT_FILE_SIZE); + cf_opts.set_max_bytes_for_level_multiplier(5); cf_opts.set_target_file_size_base(MAX_OUTPUT_FILE_SIZE); cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory( - CompactionGuardGeneratorFactory::new(CF_DEFAULT, provider, MIN_OUTPUT_FILE_SIZE) - .unwrap(), + CompactionGuardGeneratorFactory::new( + CF_DEFAULT, + provider, + MIN_OUTPUT_FILE_SIZE, + MAX_COMPACTION_SIZE, + ) + .unwrap(), )); cf_opts.set_disable_auto_compactions(true); cf_opts.compression_per_level(&[ @@ -412,6 +552,16 @@ mod tests { ret } + fn get_sst_files(dir: &Path) -> Vec { + let files = dir.read_dir().unwrap(); + let mut sst_files = files + .map(|entry| entry.unwrap().path().to_str().unwrap().to_owned()) + .filter(|entry| entry.ends_with(".sst")) + .collect::>(); + sst_files.sort(); + sst_files + } + #[test] fn test_compaction_guard_with_rocks() { let provider = MockRegionInfoProvider::new(vec![ @@ -463,11 +613,7 @@ mod tests { ) .unwrap(); - let files = dir.path().read_dir().unwrap(); - let mut sst_files = files - .map(|entry| entry.unwrap().path().to_str().unwrap().to_owned()) - .filter(|entry| entry.ends_with(".sst")) - .collect::>(); + let mut sst_files = get_sst_files(dir.path()); sst_files.sort(); assert_eq!(3, sst_files.len()); assert_eq!(collect_keys(&sst_files[0]), [b"za1", b"zb1", b"zb2"]); @@ -477,4 +623,120 @@ mod tests { ); assert_eq!(collect_keys(&sst_files[2]), [b"zc6"]); } + + fn simple_regions() -> MockRegionInfoProvider { + MockRegionInfoProvider::new(vec![ + Region { + id: 1, + start_key: b"a".to_vec(), + end_key: 
b"b".to_vec(), + ..Default::default() + }, + Region { + id: 2, + start_key: b"b".to_vec(), + end_key: b"c".to_vec(), + ..Default::default() + }, + Region { + id: 3, + start_key: b"c".to_vec(), + end_key: b"d".to_vec(), + ..Default::default() + }, + ]) + } + + #[test] + fn test_next_level_compaction() { + let provider = simple_regions(); + let (db, _dir) = new_test_db(provider); + assert_eq!(b"z", DATA_PREFIX_KEY); + let tiny_value = [b'v'; 1]; + let value = vec![b'v'; 1024 * 10]; + ['a', 'b', 'c'] + .into_iter() + .flat_map(|x| (1..10).map(move |n| format!("z{x}{n}").into_bytes())) + .for_each(|key| db.put(&key, &value).unwrap()); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(2)).unwrap(); + db.put(b"za0", &tiny_value).unwrap(); + db.put(b"zd0", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 2, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[0].largestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[1].smallestkey, b"zd0", "{:?}", level_1); + assert_eq!(level_1[1].largestkey, b"zd0", "{:?}", level_1); + } + + #[test] + fn test_next_level_compaction_no_split() { + let provider = simple_regions(); + let (db, _dir) = new_test_db(provider); + assert_eq!(b"z", DATA_PREFIX_KEY); + let tiny_value = [b'v'; 1]; + let value = vec![b'v'; 1024 * 10]; + ['a', 'b', 'c'] + .into_iter() + .flat_map(|x| (1..10).map(move |n| format!("z{x}{n}").into_bytes())) + .for_each(|key| db.put(&key, &value).unwrap()); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(2)).unwrap(); + // So... the next-level size will be almost 1024 * 9, which doesn't exceeds the + // compaction size limit. 
+ db.put(b"za0", &tiny_value).unwrap(); + db.put(b"za9", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 1, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[0].largestkey, b"za9", "{:?}", level_1); + db.compact_range(None, None, false, 1).unwrap(); + + // So... the next-level size will be almost 1024 * 15, which should reach the + // limit. + db.put(b"za30", &tiny_value).unwrap(); + db.put(b"zb90", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 2, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za30", "{:?}", level_1); + assert_eq!(level_1[1].largestkey, b"zb90", "{:?}", level_1); + } + + #[derive(Debug)] + #[allow(dead_code)] + struct OwnedSstFileMetadata { + name: String, + size: usize, + smallestkey: Vec, + largestkey: Vec, + } + + #[allow(unused)] + fn level_files(db: &RocksEngine) -> HashMap> { + let db = db.as_inner(); + let cf = db.cf_handle("default").unwrap(); + let md = db.get_column_family_meta_data(cf); + let mut res: HashMap> = HashMap::default(); + for (i, level) in md.get_levels().into_iter().enumerate() { + for file in level.get_files() { + res.entry(i).or_default().push(OwnedSstFileMetadata { + name: file.get_name(), + size: file.get_size(), + smallestkey: file.get_smallestkey().to_owned(), + largestkey: file.get_largestkey().to_owned(), + }); + } + } + res + } } diff --git a/src/config/mod.rs b/src/config/mod.rs index 8318556483e..d18d6f8cda0 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -645,6 +645,7 @@ macro_rules! 
build_cf_opt { $cf_name, provider.clone(), $opt.compaction_guard_min_output_file_size.0, + $opt.max_compaction_bytes.0, ) .unwrap(); cf_opts.set_sst_partitioner_factory(factory); From 56091d5998745f7c741d1c6fa8aa1ba281e990ed Mon Sep 17 00:00:00 2001 From: lijie Date: Thu, 28 Sep 2023 11:48:41 +0800 Subject: [PATCH 076/220] chore: bump version to 7.5.0-alpha (#15708) Signed-off-by: lijie --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3842f92752..a10755f5a7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6526,7 +6526,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.4.0-alpha" +version = "7.5.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 81be4d36906..4d8cefa9fa4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.4.0-alpha" +version = "7.5.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From a7db07d72dcbf2c938ebd0b4661270fdc95f9a43 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 28 Sep 2023 17:40:50 +0800 Subject: [PATCH 077/220] raftstore-v2: gc removed_records and merged_records on tombstone store (#15677) close tikv/tikv#15669 Let leader directly GC removed_records and merged_records on tombstone store, instead of sending GcPeerRequests to such store. 
Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore-v2/src/fsm/peer.rs | 3 ++ components/raftstore-v2/src/operation/life.rs | 31 ++++++++++++ .../raftstore-v2/src/operation/ready/mod.rs | 7 +++ components/raftstore-v2/src/router/message.rs | 5 ++ components/test_pd_client/src/pd.rs | 4 +- components/test_raftstore-v2/src/server.rs | 5 ++ components/test_raftstore/src/server.rs | 4 +- components/tikv_kv/src/raft_extension.rs | 3 ++ src/server/lock_manager/deadlock.rs | 13 +---- src/server/lock_manager/mod.rs | 4 +- src/server/metrics.rs | 1 + src/server/raft_client.rs | 16 +++--- src/server/raftkv2/raft_extension.rs | 5 ++ src/server/resolve.rs | 50 +++++++++++++++++-- src/server/server.rs | 6 +-- tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_life.rs | 36 +++++++++++++ .../config/dynamic/pessimistic_txn.rs | 16 +----- tests/integrations/server/raft_client.rs | 44 ++++++++-------- 19 files changed, 188 insertions(+), 66 deletions(-) create mode 100644 tests/failpoints/cases/test_life.rs diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 872b2c4e7e6..54729787271 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -315,6 +315,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::StoreUnreachable { to_store_id } => { self.fsm.peer_mut().on_store_unreachable(to_store_id) } + PeerMsg::StoreMaybeTombstone { store_id } => { + self.fsm.peer_mut().on_store_maybe_tombstone(store_id) + } PeerMsg::SnapshotSent { to_peer_id, status } => { self.fsm.peer_mut().on_snapshot_sent(to_peer_id, status) } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 8591d5daf23..84bded8a9bb 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs 
@@ -713,6 +713,37 @@ impl Peer { ctx.confirmed_ids.push(gc_peer_id); } + // Clean up removed and merged records for peers on tombstone stores, + // otherwise it may keep sending gc peer request to the tombstone store. + pub fn on_store_maybe_tombstone_gc_peer(&mut self, store_id: u64) { + let mut peers_on_tombstone = vec![]; + let state = self.storage().region_state(); + for peer in state.get_removed_records() { + if peer.get_store_id() == store_id { + peers_on_tombstone.push(peer.clone()); + } + } + for record in state.get_merged_records() { + for peer in record.get_source_peers() { + if peer.get_store_id() == store_id { + peers_on_tombstone.push(peer.clone()); + } + } + } + if peers_on_tombstone.is_empty() { + return; + } + info!(self.logger, "gc peer on tombstone store"; + "tombstone_store_id" => store_id, + "peers" => ?peers_on_tombstone); + let ctx = self.gc_peer_context_mut(); + for peer in peers_on_tombstone { + if !ctx.confirmed_ids.contains(&peer.get_id()) { + ctx.confirmed_ids.push(peer.get_id()); + } + } + } + // Removes deleted peers from region state by proposing a `UpdateGcPeer` // command. 
pub fn on_gc_peer_tick(&mut self, ctx: &mut StoreContext) { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 17845b5d0b8..1ff07f2ccc1 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -247,6 +247,13 @@ impl Peer { } } + pub fn on_store_maybe_tombstone(&mut self, store_id: u64) { + if !self.is_leader() { + return; + } + self.on_store_maybe_tombstone_gc_peer(store_id); + } + pub fn on_raft_message( &mut self, ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 2d364af44e1..16d43970e7a 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -197,6 +197,11 @@ pub enum PeerMsg { StoreUnreachable { to_store_id: u64, }, + // A store may be tombstone. Use it with caution, it also means store not + // found, PD can not distinguish them now, as PD may delete tombstone stores. + StoreMaybeTombstone { + store_id: u64, + }, /// Reports whether the snapshot sending is successful or not. SnapshotSent { to_peer_id: u64, diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index c81230f6a16..a9141bf6299 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -547,7 +547,9 @@ impl PdCluster { fn get_store(&self, store_id: u64) -> Result { match self.stores.get(&store_id) { Some(s) if s.store.get_id() != 0 => Ok(s.store.clone()), - _ => Err(box_err!("store {} not found", store_id)), + // Matches PD error message. 
+ // See https://github.com/tikv/pd/blob/v7.3.0/server/grpc_service.go#L777-L780 + _ => Err(box_err!("invalid store ID {}, not found", store_id)), } } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 7b5d501a59f..299e93eb746 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -222,6 +222,11 @@ impl RaftExtension for TestExtension { self.extension.report_store_unreachable(store_id) } + #[inline] + fn report_store_maybe_tombstone(&self, store_id: u64) { + self.extension.report_store_maybe_tombstone(store_id) + } + #[inline] fn report_snapshot_status( &self, diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 8d26bae968d..0002f36d647 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -113,8 +113,8 @@ impl StoreAddrResolver for AddressMap { fn resolve( &self, store_id: u64, - cb: Box) + Send>, - ) -> ServerResult<()> { + cb: Box) + Send>, + ) -> resolve::Result<()> { let addr = self.get(store_id); match addr { Some(addr) => cb(Ok(addr)), diff --git a/components/tikv_kv/src/raft_extension.rs b/components/tikv_kv/src/raft_extension.rs index 26c9e687ef6..7ab4c1c030d 100644 --- a/components/tikv_kv/src/raft_extension.rs +++ b/components/tikv_kv/src/raft_extension.rs @@ -32,6 +32,9 @@ pub trait RaftExtension: Clone + Send { /// Report the target store is unreachable. fn report_store_unreachable(&self, _store_id: u64) {} + /// Report the target store may be tombstone. + fn report_store_maybe_tombstone(&self, _store_id: u64) {} + /// Report the status of snapshot. 
fn report_snapshot_status(&self, _region_id: u64, _to_peer_id: u64, _status: SnapshotStatus) {} diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 9583df80dd6..fd749cc3175 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -1119,7 +1119,7 @@ pub mod tests { use tikv_util::worker::FutureWorker; use super::*; - use crate::server::resolve::Callback; + use crate::server::resolve; #[test] fn test_detect_table() { @@ -1467,15 +1467,6 @@ pub mod tests { impl PdClient for MockPdClient {} - #[derive(Clone)] - pub(crate) struct MockResolver; - - impl StoreAddrResolver for MockResolver { - fn resolve(&self, _store_id: u64, _cb: Callback) -> Result<()> { - Err(Error::Other(box_err!("unimplemented"))) - } - } - fn start_deadlock_detector( host: &mut CoprocessorHost, ) -> (FutureWorker, Scheduler) { @@ -1485,7 +1476,7 @@ pub mod tests { let detector_runner = Detector::new( 1, Arc::new(MockPdClient {}), - MockResolver {}, + resolve::MockStoreAddrResolver::default(), Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()), waiter_mgr_scheduler, &Config::default(), diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 243d533a0e5..c42531ae0fd 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -318,7 +318,7 @@ mod tests { use self::{deadlock::tests::*, metrics::*, waiter_manager::tests::*}; use super::*; - use crate::storage::lock_manager::LockDigest; + use crate::{server::resolve::MockStoreAddrResolver, storage::lock_manager::LockDigest}; fn start_lock_manager() -> LockManager { let mut coprocessor_host = CoprocessorHost::::default(); @@ -336,7 +336,7 @@ mod tests { .start( 1, Arc::new(MockPdClient {}), - MockResolver {}, + MockStoreAddrResolver::default(), Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()), &cfg, ) diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 2745be59a71..122748cdfa9 
100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -86,6 +86,7 @@ make_auto_flush_static_metric! { failed, success, tombstone, + not_found, } pub label_enum ReplicaReadLockCheckResult { diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index f30e5b36045..b120011c490 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -46,8 +46,11 @@ use tikv_util::{ use yatp::{task::future::TaskCell, ThreadPool}; use crate::server::{ - self, load_statistics::ThreadLoadPool, metrics::*, snap::Task as SnapTask, Config, - StoreAddrResolver, + load_statistics::ThreadLoadPool, + metrics::*, + resolve::{Error as ResolveError, Result as ResolveResult}, + snap::Task as SnapTask, + Config, StoreAddrResolver, }; pub struct MetadataSourceStoreId {} @@ -642,7 +645,7 @@ where S: StoreAddrResolver, R: RaftExtension + Unpin + 'static, { - fn resolve(&self) -> impl Future> { + fn resolve(&self) -> impl Future> { let (tx, rx) = oneshot::channel(); let store_id = self.store_id; let res = self.builder.resolver.resolve( @@ -673,7 +676,7 @@ where res?; match rx.await { Ok(a) => a, - Err(_) => Err(server::Error::Other( + Err(_) => Err(ResolveError::Other( "failed to receive resolve result".into(), )), } @@ -824,8 +827,7 @@ async fn start( RESOLVE_STORE_COUNTER.with_label_values(&["failed"]).inc(); back_end.clear_pending_message("resolve"); error_unknown!(?e; "resolve store address failed"; "store_id" => back_end.store_id,); - // TOMBSTONE - if format!("{}", e).contains("has been removed") { + if let ResolveError::StoreTombstone(_) = e { let mut pool = pool.lock().unwrap(); if let Some(s) = pool.connections.remove(&(back_end.store_id, conn_id)) { s.set_conn_state(ConnState::Disconnected); @@ -940,7 +942,7 @@ struct CachedQueue { /// ```text /// for m in msgs { /// if !raft_client.send(m) { -/// // handle error. +/// // handle error. 
/// } /// } /// raft_client.flush(); diff --git a/src/server/raftkv2/raft_extension.rs b/src/server/raftkv2/raft_extension.rs index f6bb66e9e11..8b15c73fb65 100644 --- a/src/server/raftkv2/raft_extension.rs +++ b/src/server/raftkv2/raft_extension.rs @@ -49,6 +49,11 @@ impl tikv_kv::RaftExtension for Extension .send_control(StoreMsg::StoreUnreachable { to_store_id }); } + fn report_store_maybe_tombstone(&self, store_id: u64) { + self.router + .broadcast_normal(|| PeerMsg::StoreMaybeTombstone { store_id }); + } + fn report_snapshot_status( &self, region_id: u64, diff --git a/src/server/resolve.rs b/src/server/resolve.rs index c831ff28d17..013511183e2 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -1,6 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + error::Error as StdError, fmt::{self, Display, Formatter}, sync::{Arc, Mutex}, }; @@ -9,16 +10,28 @@ use collections::HashMap; use kvproto::replication_modepb::ReplicationMode; use pd_client::{take_peer_address, PdClient}; use raftstore::store::GlobalReplicationState; +use thiserror::Error; use tikv_kv::RaftExtension; use tikv_util::{ + info, time::Instant, worker::{Runnable, Scheduler, Worker}, }; -use super::{metrics::*, Result}; +use super::metrics::*; const STORE_ADDRESS_REFRESH_SECONDS: u64 = 60; +#[derive(Debug, Error)] +pub enum Error { + #[error("{0:?}")] + Other(#[from] Box), + #[error("store {0} has been removed")] + StoreTombstone(u64), +} + +pub type Result = std::result::Result; + pub type Callback = Box) + Send>; pub fn store_address_refresh_interval_secs() -> u64 { @@ -95,9 +108,21 @@ where // it explicitly. 
Err(pd_client::Error::StoreTombstone(_)) => { RESOLVE_STORE_COUNTER_STATIC.tombstone.inc(); - return Err(box_err!("store {} has been removed", store_id)); + self.router.report_store_maybe_tombstone(store_id); + return Err(Error::StoreTombstone(store_id)); + } + Err(e) => { + // Tombstone store may be removed manually or automatically + // after 30 days of deletion. PD returns + // "invalid store ID %d, not found" for such store id. + // See https://github.com/tikv/pd/blob/v7.3.0/server/grpc_service.go#L777-L780 + if format!("{:?}", e).contains("not found") { + RESOLVE_STORE_COUNTER_STATIC.not_found.inc(); + info!("resolve store not found"; "store_id" => store_id); + self.router.report_store_maybe_tombstone(store_id); + } + return Err(box_err!(e)); } - Err(e) => return Err(box_err!(e)), }; let mut group_id = None; let mut state = self.state.lock().unwrap(); @@ -181,6 +206,25 @@ impl StoreAddrResolver for PdStoreAddrResolver { } } +#[derive(Clone)] +pub struct MockStoreAddrResolver { + pub resolve_fn: Arc Result<()> + Send + Sync>, +} + +impl StoreAddrResolver for MockStoreAddrResolver { + fn resolve(&self, store_id: u64, cb: Callback) -> Result<()> { + (self.resolve_fn)(store_id, cb) + } +} + +impl Default for MockStoreAddrResolver { + fn default() -> MockStoreAddrResolver { + MockStoreAddrResolver { + resolve_fn: Arc::new(|_, _| unimplemented!()), + } + } +} + #[cfg(test)] mod tests { use std::{net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, time::Duration}; diff --git a/src/server/server.rs b/src/server/server.rs index 948930ae7ae..a886f1232f4 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -533,8 +533,8 @@ mod tests { use super::{ super::{ - resolve::{Callback as ResolveCallback, StoreAddrResolver}, - Config, Result, + resolve::{self, Callback as ResolveCallback, StoreAddrResolver}, + Config, }, *, }; @@ -552,7 +552,7 @@ mod tests { } impl StoreAddrResolver for MockResolver { - fn resolve(&self, _: u64, cb: ResolveCallback) -> 
Result<()> { + fn resolve(&self, _: u64, cb: ResolveCallback) -> resolve::Result<()> { if self.quick_fail.load(Ordering::SeqCst) { return Err(box_err!("quick fail")); } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index a9dbd36a81a..ed2b8d79f9c 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -17,6 +17,7 @@ mod test_gc_worker; mod test_hibernate; mod test_import_service; mod test_kv_service; +mod test_life; mod test_local_read; mod test_memory_usage_limit; mod test_merge; diff --git a/tests/failpoints/cases/test_life.rs b/tests/failpoints/cases/test_life.rs new file mode 100644 index 00000000000..2bc833075c6 --- /dev/null +++ b/tests/failpoints/cases/test_life.rs @@ -0,0 +1,36 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use test_raftstore::*; +use test_raftstore_macro::test_case; +use tikv_util::config::ReadableDuration; + +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_gc_peer_on_tombstone_store() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + + let peer_on_store1 = find_peer(®ion, 1).unwrap().clone(); + let peer_on_store3 = find_peer(®ion, 3).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + cluster.add_send_filter(IsolationFilterFactory::new(3)); + pd_client.must_remove_peer(region.get_id(), peer_on_store3); + + // Immediately invalidate store address cache. + fail::cfg("mock_store_refresh_interval_secs", "return(0)").unwrap(); + + // Shutdown store 3 and wait for gc peer ticks. 
+ cluster.stop_node(3); + cluster.clear_send_filters(); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + cluster.must_empty_region_removed_records(region.get_id()); +} diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index 7af5455a199..dc88bbd93a3 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -9,11 +9,7 @@ use security::SecurityManager; use test_pd_client::TestPdClient; use tikv::{ config::*, - server::{ - lock_manager::*, - resolve::{Callback, StoreAddrResolver}, - Error, Result, - }, + server::{lock_manager::*, resolve}, }; use tikv_util::config::ReadableDuration; @@ -27,14 +23,6 @@ fn test_config_validate() { invalid_cfg.validate().unwrap_err(); } -#[derive(Clone)] -struct MockResolver; -impl StoreAddrResolver for MockResolver { - fn resolve(&self, _store_id: u64, _cb: Callback) -> Result<()> { - Err(Error::Other(box_err!("unimplemented"))) - } -} - fn setup( cfg: TikvConfig, ) -> ( @@ -50,7 +38,7 @@ fn setup( .start( 1, pd_client, - MockResolver, + resolve::MockStoreAddrResolver::default(), security_mgr, &cfg.pessimistic_txn, ) diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index aad9ab7ceb1..2b51bb1f21b 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -21,8 +21,8 @@ use kvproto::{ use raft::eraftpb::Entry; use raftstore::errors::DiscardReason; use tikv::server::{ - self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, - Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, + load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, Config, ConnectionBuilder, + RaftClient, StoreAddrResolver, TestRaftStoreRouter, }; use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ @@ -32,24 +32,6 @@ use 
tikv_util::{ use super::*; -#[derive(Clone)] -pub struct StaticResolver { - port: u16, -} - -impl StaticResolver { - fn new(port: u16) -> StaticResolver { - StaticResolver { port } - } -} - -impl StoreAddrResolver for StaticResolver { - fn resolve(&self, _store_id: u64, cb: Callback) -> server::Result<()> { - cb(Ok(format!("localhost:{}", self.port))); - Ok(()) - } -} - fn get_raft_client(router: R, resolver: T) -> RaftClient where R: RaftExtension + Unpin + 'static, @@ -75,8 +57,16 @@ where RaftClient::new(0, builder) } -fn get_raft_client_by_port(port: u16) -> RaftClient { - get_raft_client(FakeExtension, StaticResolver::new(port)) +fn get_raft_client_by_port(port: u16) -> RaftClient { + get_raft_client( + FakeExtension, + resolve::MockStoreAddrResolver { + resolve_fn: Arc::new(move |_, cb| { + cb(Ok(format!("localhost:{}", port))); + Ok(()) + }), + }, + ) } #[derive(Clone)] @@ -177,7 +167,15 @@ fn test_raft_client_reconnect() { let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); let wrap = RaftRouterWrap::new(router); - let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); + let mut raft_client = get_raft_client( + wrap, + resolve::MockStoreAddrResolver { + resolve_fn: Arc::new(move |_, cb| { + cb(Ok(format!("localhost:{}", port))); + Ok(()) + }), + }, + ); (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); raft_client.flush(); From fda1b5caf19f3ee87ab26c4458c64a6b3f3ea5ca Mon Sep 17 00:00:00 2001 From: Juan Grande Date: Thu, 28 Sep 2023 02:55:50 -0700 Subject: [PATCH 078/220] logger: added thread_id to logs (#15638) close tikv/tikv#13395 Added thread_id to logs Signed-off-by: Juan Grande Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/tikv_util/src/logger/mod.rs | 126 +++++++++++++++++++------ 1 file changed, 96 insertions(+), 30 deletions(-) diff --git 
a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index 5ebe9468a50..c321f56a1b5 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -6,6 +6,7 @@ mod formatter; use std::{ env, fmt, io::{self, BufWriter}, + num::NonZeroU64, path::{Path, PathBuf}, sync::{ atomic::{AtomicUsize, Ordering}, @@ -15,7 +16,10 @@ use std::{ }; use log::{self, SetLoggerError}; -use slog::{self, slog_o, Drain, FnValue, Key, OwnedKVList, PushFnValue, Record, KV}; +use slog::{ + self, slog_o, Drain, FnValue, Key, OwnedKV, OwnedKVList, PushFnValue, Record, + SendSyncRefUnwindSafeKV, KV, +}; pub use slog::{FilterFn, Level}; use slog_async::{Async, AsyncGuard, OverflowStrategy}; use slog_term::{Decorator, PlainDecorator, RecordDecorator}; @@ -85,7 +89,7 @@ where }; let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - (slog::Logger::root(filtered, slog_o!()), Some(guard)) + (slog::Logger::root(filtered, get_values()), Some(guard)) } else { let drain = LogAndFuse(Mutex::new(drain)); let drain = SlowLogFilter { @@ -93,7 +97,7 @@ where inner: drain, }; let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - (slog::Logger::root(filtered, slog_o!()), None) + (slog::Logger::root(filtered, get_values()), None) }; set_global_logger(level, init_stdlog, logger, guard) @@ -628,6 +632,18 @@ fn write_log_fields( Ok(()) } +fn format_thread_id(thread_id: NonZeroU64) -> String { + format!("{:#0x}", thread_id) +} + +fn get_values() -> OwnedKV { + slog_o!( + "thread_id" => FnValue(|_| { + format_thread_id(std::thread::current().id().as_u64()) + }) + ) +} + struct Serializer<'a> { decorator: &'a mut dyn RecordDecorator, } @@ -679,7 +695,7 @@ impl<'a> slog::Serializer for Serializer<'a> { #[cfg(test)] mod tests { - use std::{cell::RefCell, io, io::Write, str::from_utf8}; + use std::{cell::RefCell, io, io::Write, str::from_utf8, sync::RwLock, time::Duration}; use chrono::DateTime; use regex::Regex; @@ -705,8 
+721,6 @@ mod tests { } fn log_format_cases(logger: slog::Logger) { - use std::time::Duration; - // Empty message is not recommend, just for test purpose here. slog_info!(logger, ""); slog_info!(logger, "Welcome"); @@ -763,21 +777,25 @@ mod tests { fn test_log_format_text() { let decorator = PlainSyncDecorator::new(TestWriter); let drain = TikvFormat::new(decorator, true).fuse(); - let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); + let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); log_format_cases(logger); - let expect = r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome TiKV"] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] -[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] -[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] -[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] -[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] -"#; + let thread_id = format_thread_id(std::thread::current().id().as_u64()); + let expect = format!( + r#"[2019/01/15 13:40:39.619 +08:00] [INFO] 
[mod.rs:469] [] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome TiKV"] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] [thread_id={0}] +[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] [thread_id={0}] +[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] [thread_id={0}] +"#, + thread_id + ); BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); @@ -811,21 +829,25 @@ mod tests { fn test_log_format_json() { use serde_json::{from_str, Value}; let drain = Mutex::new(json_format(TestWriter, true)).map(slog::Fuse); - let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); + let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); log_format_cases(logger); - let expect = r#"{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":""} -{"time":"2020/05/16 15:49:52.450 
+08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\""} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null} -{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote"} -"#; + let thread_id = format_thread_id(std::thread::current().id().as_u64()); + let expect = format!( + r#"{{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":"","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 
+08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\"","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null,"thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote","thread_id":"{0}"}} +"#, + thread_id + ); BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); @@ -1074,4 +1096,48 @@ mod tests { } }); } + + static THREAD_SAFE_BUFFER: RwLock> = RwLock::new(Vec::new()); + + struct ThreadSafeWriter; + impl Write for ThreadSafeWriter { + fn write(&mut self, data: &[u8]) -> io::Result { + let mut buffer = THREAD_SAFE_BUFFER.write().unwrap(); + buffer.write(data) + } + + fn flush(&mut self) -> io::Result<()> { + let mut buffer = THREAD_SAFE_BUFFER.write().unwrap(); + buffer.flush() + } + } + + #[test] + fn test_threadid() { + let drain = 
TikvFormat::new(PlainSyncDecorator::new(ThreadSafeWriter), true).fuse(); + let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); + + slog_info!(logger, "Hello from the first thread"); + let this_threadid = thread::current().id().as_u64(); + let this_threadid = format_thread_id(this_threadid); + + let handle = thread::spawn(move || { + slog_info!(logger, "Hello from the second thread"); + }); + let other_threadid = handle.thread().id().as_u64(); + let other_threadid = format_thread_id(other_threadid); + handle.join().unwrap(); + + let expected = vec![this_threadid, other_threadid]; + + let re = Regex::new(r"\[thread_id=(.*?)\]").unwrap(); + let buffer = THREAD_SAFE_BUFFER.read().unwrap(); + let output = from_utf8(&buffer).unwrap(); + let actual: Vec<&str> = output + .lines() + .map(|line| re.captures(line).unwrap()) + .map(|captures| captures.get(1).unwrap().as_str()) + .collect(); + assert_eq!(expected, actual); + } } From 58253e8b7cea59b414511753b75dd7fc980d99af Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Sun, 8 Oct 2023 13:03:22 +0800 Subject: [PATCH 079/220] raftstore: split bucket if the increment flow reach the limit (#15637) close tikv/tikv#15636 there are three reason may cause the bucket not split: 1. split check tick will refresh bucket info even info the bucket version not change 2. the suspect buckets only conside the increment flow 3. all the bucket increment flows are reset if one bucket is updated. To solve this, bucket stats only record the increment flow and reset it after meta size updated. 
Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- components/pd_client/src/lib.rs | 9 + .../raftstore-v2/src/operation/bucket.rs | 536 ++++++++++++------ components/raftstore-v2/src/worker/pd/mod.rs | 6 +- .../raftstore-v2/src/worker/pd/region.rs | 22 +- components/raftstore/src/store/fsm/peer.rs | 25 +- components/raftstore/src/store/util.rs | 14 + 6 files changed, 391 insertions(+), 221 deletions(-) diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 7a9d2cd2a61..21ae61ccd61 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -211,6 +211,15 @@ impl BucketStat { } } + pub fn clean_stats(&mut self, idx: usize) { + self.stats.write_keys[idx] = 0; + self.stats.write_bytes[idx] = 0; + self.stats.read_qps[idx] = 0; + self.stats.write_qps[idx] = 0; + self.stats.read_keys[idx] = 0; + self.stats.read_bytes[idx] = 0; + } + pub fn split(&mut self, idx: usize) { assert!(idx != 0); // inherit the traffic stats for splited bucket diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index 432ea72456a..242b9a9b33b 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -11,10 +11,10 @@ use kvproto::{ }; use pd_client::{BucketMeta, BucketStat}; use raftstore::{ - coprocessor::RegionChangeEvent, + coprocessor::{Config, RegionChangeEvent}, store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, }; -use slog::{error, info, warn}; +use slog::{error, info}; use crate::{ batch::StoreContext, @@ -26,15 +26,13 @@ use crate::{ #[derive(Debug, Clone, Default)] pub struct BucketStatsInfo { + // the stats is increment flow. bucket_stat: Option, - // the last buckets records the stats that the recently refreshed. - last_bucket_stat: Option, // the report bucket stat records the increment stats after last report pd. 
// it will be reset after report pd. report_bucket_stat: Option, - // last bucket count. - // BucketStat.meta is Arc so it cannot be used for last bucket count - last_bucket_count: usize, + // avoid the version roll back, it record the last bucket version if bucket stat isn't none. + last_bucket_version: u64, } impl BucketStatsInfo { @@ -42,55 +40,33 @@ impl BucketStatsInfo { /// diff_size_threshold. pub fn gen_bucket_range_for_update( &self, - diff_size_threshold: u64, + region_bucket_max_size: u64, ) -> Option> { let region_buckets = self.bucket_stat.as_ref()?; let stats = ®ion_buckets.stats; let keys = ®ion_buckets.meta.keys; + let sizes = ®ion_buckets.meta.sizes; - let empty_last_keys = vec![]; - let empty_last_stats = metapb::BucketStats::default(); - let (last_keys, last_stats, stats_reset) = self - .last_bucket_stat - .as_ref() - .map(|b| { - ( - &b.meta.keys, - &b.stats, - region_buckets.create_time != b.create_time, - ) - }) - .unwrap_or((&empty_last_keys, &empty_last_stats, false)); - - let mut bucket_ranges = vec![]; - let mut j = 0; + let mut suspect_bucket_ranges = vec![]; assert_eq!(keys.len(), stats.write_bytes.len() + 1); for i in 0..stats.write_bytes.len() { - let mut diff_in_bytes = stats.write_bytes[i]; - while j < last_keys.len() && keys[i] > last_keys[j] { - j += 1; - } - if j < last_keys.len() && keys[i] == last_keys[j] { - if !stats_reset { - diff_in_bytes -= last_stats.write_bytes[j]; - } - j += 1; - } - if diff_in_bytes >= diff_size_threshold { - bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); + let estimated_bucket_size = stats.write_bytes[i] + sizes[i]; + if estimated_bucket_size >= region_bucket_max_size { + suspect_bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); } } - Some(bucket_ranges) + Some(suspect_bucket_ranges) } #[inline] pub fn version(&self) -> u64 { self.bucket_stat .as_ref() - .or(self.last_bucket_stat.as_ref()) .map(|b| b.meta.version) + .or(Some(self.last_bucket_version)) 
.unwrap_or_default() } + #[inline] pub fn add_bucket_flow(&mut self, delta: &Option) { if let (Some(buckets), Some(report_buckets), Some(delta)) = ( @@ -105,21 +81,18 @@ impl BucketStatsInfo { #[inline] pub fn set_bucket_stat(&mut self, buckets: Option) { - if let Some(b) = self.bucket_stat.take() { - self.last_bucket_stat = Some(b); - } - self.report_bucket_stat = buckets.clone(); - self.bucket_stat = buckets; - self.last_bucket_count = self - .bucket_stat - .as_ref() - .map_or(0, |bucket_stat| bucket_stat.meta.keys.len() - 1); - } - - #[inline] - pub fn clear_bucket_stat(&mut self) { - if let Some(bucket) = self.report_bucket_stat.as_mut() { - bucket.clear_stats(); + self.bucket_stat = buckets.clone(); + if let Some(new_buckets) = buckets { + self.last_bucket_version = new_buckets.meta.version; + let mut new_report_buckets = BucketStat::from_meta(new_buckets.meta); + if let Some(old) = &mut self.report_bucket_stat { + new_report_buckets.merge(old); + *old = new_report_buckets; + } else { + self.report_bucket_stat = Some(new_report_buckets); + } + } else { + self.report_bucket_stat = None; } } @@ -136,142 +109,163 @@ impl BucketStatsInfo { &self.bucket_stat } - #[inline] - pub fn last_bucket_count(&self) -> usize { - self.last_bucket_count - } -} - -impl Peer { - #[inline] - pub fn on_refresh_region_buckets( + pub fn on_refresh_region_buckets( &mut self, - store_ctx: &mut StoreContext, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec, region_epoch: RegionEpoch, - mut buckets: Vec, + region: metapb::Region, bucket_ranges: Option>, - ) { - // bucket version layout - // term logical counter - // |-----------|-----------| - // high bits low bits - // term: given 10s election timeout, the 32 bit means 1362 year running time - let gen_bucket_version = |term, current_version| { - let current_version_term = current_version >> 32; - let bucket_version: u64 = if current_version_term == term { - current_version + 1 - } else { - if term > u32::MAX.into() { - error!( 
- self.logger, - "unexpected term {} more than u32::MAX. Bucket - version will be backward.", - term - ); - } - term << 32 - }; - bucket_version - }; - - let region = self.region(); - let current_version = self.region_buckets_info().version(); - let next_bucket_version = gen_bucket_version(self.term(), current_version); - let mut is_first_refresh = true; - let mut change_bucket_version = false; - let mut region_buckets: BucketStat; - + ) -> bool { + let change_bucket_version: bool; // The region buckets reset after this region happened split or merge. // The message should be dropped if it's epoch is lower than the regions. // The bucket ranges is none when the region buckets is also none. // So this condition indicates that the region buckets needs to refresh not // renew. - if let (Some(bucket_ranges), Some(peer_region_buckets)) = - (bucket_ranges, self.region_buckets_info().bucket_stat()) - { - is_first_refresh = false; + if let Some(bucket_ranges) = bucket_ranges&&self.bucket_stat.is_some(){ assert_eq!(buckets.len(), bucket_ranges.len()); - let mut meta_idx = 0; - region_buckets = peer_region_buckets.clone(); - let mut meta = (*region_buckets.meta).clone(); - meta.region_epoch = region_epoch; - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - // the bucket ranges maybe need to split or merge not all the meta keys, so it - // needs to find the first keys. 
- while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { - meta_idx += 1; - } - // meta_idx can't be not the last entry (which is end key) - if meta_idx >= meta.keys.len() - 1 { - warn!( - self.logger, - "can't find the bucket key"; - "bucket_range_key" => log_wrappers::Value::key(&bucket_range.0)); - break; - } - // the bucket size is small and does not have split keys, - // then it should be merged with its left neighbor - let region_bucket_merge_size = store_ctx - .coprocessor_host - .cfg - .region_bucket_merge_size_ratio - * (store_ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[meta_idx] = bucket.size; - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && meta_idx != 0 - && meta.sizes[meta_idx - 1] + bucket.size - < store_ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(meta_idx); - meta.left_merge(meta_idx); - change_bucket_version = true; - continue; - } - } else { - // update size - meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - meta_idx += 1; - region_buckets.split(meta_idx); - meta.split(meta_idx, bucket_key); - change_bucket_version = true; - } - } + change_bucket_version=self.update_buckets(cfg, next_bucket_version, buckets, region_epoch, &bucket_ranges); + }else{ + change_bucket_version = true; + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. 
+ self.init_buckets(cfg, next_bucket_version, buckets, region_epoch, region); + } + change_bucket_version + } + + fn update_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec, + region_epoch: RegionEpoch, + bucket_ranges: &Vec, + ) -> bool { + let origin_region_buckets = self.bucket_stat.as_ref().unwrap(); + let mut change_bucket_version = false; + let mut meta_idx = 0; + let mut region_buckets = origin_region_buckets.clone(); + let mut meta = (*region_buckets.meta).clone(); + meta.region_epoch = region_epoch; + + // bucket stats will clean if the bucket size is updated. + for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { + // the bucket ranges maybe need to split or merge not all the meta keys, so it + // needs to find the first keys. + while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { meta_idx += 1; } - if self.region_buckets_info().last_bucket_count() != region_buckets.meta.keys.len() - 1 - { - change_bucket_version = true; + // meta_idx can't be not the last entry (which is end key) + if meta_idx >= meta.keys.len() - 1 { + break; } - if change_bucket_version { - meta.version = next_bucket_version; + // the bucket size is small and does not have split keys, + // then it should be merged with its left neighbor + let region_bucket_merge_size = + cfg.region_bucket_merge_size_ratio * (cfg.region_bucket_size.0 as f64); + if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { + meta.sizes[meta_idx] = bucket.size; + region_buckets.clean_stats(meta_idx); + // the region has more than one bucket + // and the left neighbor + current bucket size is not very big + if meta.keys.len() > 2 + && meta_idx != 0 + && meta.sizes[meta_idx - 1] + bucket.size < cfg.region_bucket_size.0 * 2 + { + // bucket is too small + region_buckets.left_merge(meta_idx); + meta.left_merge(meta_idx); + change_bucket_version = true; + continue; + } + } else { + // update size + meta.sizes[meta_idx] = 
bucket.size / (bucket.keys.len() + 1) as u64; + region_buckets.clean_stats(meta_idx); + // insert new bucket keys (split the original bucket) + for bucket_key in bucket.keys { + meta_idx += 1; + region_buckets.split(meta_idx); + meta.split(meta_idx, bucket_key); + change_bucket_version = true; + } } - region_buckets.meta = Arc::new(meta); - } else { - // when the region buckets is none, the exclusive buckets includes all the - // bucket keys. - assert_eq!(buckets.len(), 1); - change_bucket_version = true; - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - let mut meta = BucketMeta { - region_id: self.region_id(), - region_epoch, - version: next_bucket_version, - keys: bucket_keys, - sizes: vec![store_ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], - }; - // padding the boundary keys and initialize the flow. - meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - region_buckets = BucketStat::from_meta(Arc::new(meta)); + meta_idx += 1; + } + if change_bucket_version { + meta.version = next_bucket_version; } + region_buckets.meta = Arc::new(meta); + self.set_bucket_stat(Some(region_buckets)); + change_bucket_version + } + + fn init_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + mut buckets: Vec, + region_epoch: RegionEpoch, + region: metapb::Region, + ) { + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. + assert_eq!(buckets.len(), 1); + let bucket_keys = buckets.pop().unwrap().keys; + let bucket_count = bucket_keys.len() + 1; + let mut meta = BucketMeta { + region_id: region.get_id(), + region_epoch, + version: next_bucket_version, + keys: bucket_keys, + sizes: vec![cfg.region_bucket_size.0; bucket_count], + }; + // padding the boundary keys and initialize the flow. 
+ meta.keys.insert(0, region.get_start_key().to_vec()); + meta.keys.push(region.get_end_key().to_vec()); + let bucket_stats = BucketStat::from_meta(Arc::new(meta)); + self.set_bucket_stat(Some(bucket_stats)); + } +} +impl Peer { + #[inline] + pub fn on_refresh_region_buckets( + &mut self, + store_ctx: &mut StoreContext, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + if self.term() > u32::MAX.into() { + error!( + self.logger, + "unexpected term {} more than u32::MAX. Bucket version will be backward.", + self.term() + ); + } + + let current_version = self.region_buckets_info().version(); + let next_bucket_version = util::gen_bucket_version(self.term(), current_version); + // let mut is_first_refresh = true; + let region = self.region().clone(); + let change_bucket_version = self.region_buckets_info_mut().on_refresh_region_buckets( + &store_ctx.coprocessor_host.cfg, + next_bucket_version, + buckets, + region_epoch, + region, + bucket_ranges, + ); + let region_buckets = self + .region_buckets_info() + .bucket_stat() + .as_ref() + .unwrap() + .clone(); let buckets_count = region_buckets.meta.keys.len() - 1; if change_bucket_version { // TODO: we may need to make it debug once the coprocessor timeout is resolved. @@ -281,17 +275,18 @@ impl Peer { "bucket_version" => next_bucket_version, "buckets_count" => buckets_count, "estimated_region_size" => region_buckets.meta.total_size(), - "first_refresh" => is_first_refresh, ); + } else { + // it means the buckets key range not any change, so don't need to refresh. 
+ return; } + store_ctx.coprocessor_host.on_region_changed( - region, + self.region(), RegionChangeEvent::UpdateBuckets(buckets_count), self.state_role(), ); let meta = region_buckets.meta.clone(); - self.region_buckets_info_mut() - .set_bucket_stat(Some(region_buckets.clone())); { let mut store_meta = store_ctx.store_meta.lock().unwrap(); if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { @@ -302,13 +297,13 @@ impl Peer { if let Some(apply_scheduler) = self.apply_scheduler() { apply_scheduler.send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); } + if !self.is_leader() { + return; + } let version = region_buckets.meta.version; let keys = region_buckets.meta.keys.clone(); // Notify followers to flush their relevant memtables let peers = self.region().get_peers().to_vec(); - if !self.is_leader() { - return; - } for p in peers { if p == *self.peer() || p.is_witness { continue; @@ -397,9 +392,9 @@ impl Peer { if !ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } - let bucket_update_diff_size_threshold = ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; + let region_bucket_max_size = ctx.coprocessor_host.cfg.region_bucket_size.0 * 2; self.region_buckets_info() - .gen_bucket_range_for_update(bucket_update_diff_size_threshold) + .gen_bucket_range_for_update(region_bucket_max_size) } } @@ -448,3 +443,178 @@ where self.schedule_tick(PeerTick::ReportBuckets); } } + +#[cfg(test)] +mod tests { + use super::*; + + // create BucketStatsInfo include three keys: ["","100","200",""]. 
+ fn mock_bucket_stats_info() -> BucketStatsInfo { + let mut bucket_stats_info = BucketStatsInfo::default(); + let cfg = Config::default(); + let next_bucket_version = 1; + let bucket_ranges = None; + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(1); + region_epoch.set_version(1); + let mut region = metapb::Region::default(); + region.set_id(1); + + let mut buckets = vec![]; + let mut bucket = Bucket::default(); + bucket.keys.push(vec![100]); + bucket.keys.push(vec![200]); + buckets.insert(0, bucket); + + let _ = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + region, + bucket_ranges, + ); + bucket_stats_info + } + + #[test] + pub fn test_version() { + let mut bucket_stats_info = mock_bucket_stats_info(); + assert_eq!(1, bucket_stats_info.version()); + bucket_stats_info.set_bucket_stat(None); + assert_eq!(1, bucket_stats_info.version()); + + let mut meta = BucketMeta::default(); + meta.version = 2; + meta.keys.push(vec![]); + meta.keys.push(vec![]); + let bucket_stat = BucketStat::from_meta(Arc::new(meta)); + bucket_stats_info.set_bucket_stat(Some(bucket_stat)); + assert_eq!(2, bucket_stats_info.version()); + } + + #[test] + pub fn test_insert_new_buckets() { + let bucket_stats_info = mock_bucket_stats_info(); + + let cfg = Config::default(); + let bucket_stat = bucket_stats_info.bucket_stat.unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stat.meta.keys + ); + for i in 0..bucket_stat.stats.write_bytes.len() { + assert_eq!(cfg.region_bucket_size.0, bucket_stat.meta.sizes[i]); + assert_eq!(0, bucket_stat.stats.write_bytes[i]); + } + } + + #[test] + pub fn test_report_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + 
bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats.clone())); + let bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![0, 0, 0], report_bucket_stats.stats.write_bytes); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + } + + #[test] + pub fn test_spilt_and_merge_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let next_bucket_version = 2; + let mut region = metapb::Region::default(); + region.set_id(1); + let cfg = Config::default(); + let bucket_size = cfg.region_bucket_size.0; + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let region_epoch = bucket_stats.meta.region_epoch.clone(); + + // step1: update buckets flow + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step2: tick not affect anything + let bucket_ranges = Some(vec![]); + let buckets = vec![]; + let mut change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch.clone(), + region.clone(), + bucket_ranges, + ); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert!(!change_bucket_version); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step3: split key 50 + let mut bucket_ranges = Some(vec![BucketRange(vec![], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![vec![50]]; + bucket.size = bucket_size; + let mut buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + 
&cfg, + next_bucket_version, + buckets.clone(), + region_epoch.clone(), + region.clone(), + bucket_ranges.clone(), + ); + assert!(change_bucket_version); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![50], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 0, 2], bucket_stats.stats.write_bytes); + + // step4: merge [50-100] to [0-50], + bucket_ranges = Some(vec![BucketRange(vec![50], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![]; + bucket.size = 0; + buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + region, + bucket_ranges, + ); + assert!(change_bucket_version); + + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 2], bucket_stats.stats.write_bytes); + + // report buckets doesn't be affected by the split and merge. + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![4, 0, 2], report_bucket_stats.stats.write_bytes); + } +} diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 061a5ad5126..77915dd0378 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -57,7 +57,6 @@ pub enum Task { }, // In region.rs. 
RegionHeartbeat(RegionHeartbeatTask), - ReportRegionBuckets(BucketStat), UpdateReadStats(ReadStats), UpdateWriteStats(WriteStats), UpdateRegionCpuRecords(Arc), @@ -85,6 +84,7 @@ pub enum Task { initial_status: u64, txn_ext: Arc, }, + // BucketStat is the delta write flow of the bucket. ReportBuckets(BucketStat), ReportMinResolvedTs { store_id: u64, @@ -123,7 +123,6 @@ impl Display for Task { hb_task.region, hb_task.peer.get_id(), ), - Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), Task::UpdateReadStats(ref stats) => { write!(f, "update read stats: {stats:?}") } @@ -314,7 +313,6 @@ where write_io_rates, } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), - Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), @@ -341,7 +339,7 @@ where initial_status, txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), - Task::ReportBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::ReportBuckets(delta_buckets) => self.handle_report_region_buckets(delta_buckets), Task::ReportMinResolvedTs { store_id, min_resolved_ts, diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index 763e12fff07..d3ef54bd75a 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -339,9 +339,9 @@ where self.is_hb_receiver_scheduled = true; } - pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { - let region_id = region_buckets.meta.region_id; - self.merge_buckets(region_buckets); + pub fn handle_report_region_buckets(&mut self, 
delta_buckets: BucketStat) { + let region_id = delta_buckets.meta.region_id; + self.merge_buckets(delta_buckets); let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); let last_report_ts = if report_buckets.last_report_ts.is_zero() { self.start_ts @@ -388,8 +388,8 @@ where .engine_total_query_num .add_query_stats(®ion_info.query_stats.0); } - for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { - self.merge_buckets(region_buckets); + for (_, delta_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(delta_buckets); } if !stats.region_infos.is_empty() { self.stats_monitor.maybe_send_read_stats(stats); @@ -424,18 +424,18 @@ where } } - fn merge_buckets(&mut self, mut buckets: BucketStat) { - let region_id = buckets.meta.region_id; + fn merge_buckets(&mut self, mut delta: BucketStat) { + let region_id = delta.meta.region_id; self.region_buckets .entry(region_id) .and_modify(|report_bucket| { let current = &mut report_bucket.current_stat; - if current.meta < buckets.meta { - std::mem::swap(current, &mut buckets); + if current.meta < delta.meta { + std::mem::swap(current, &mut delta); } - current.merge(&buckets); + current.merge(&delta); }) - .or_insert_with(|| ReportBucket::new(buckets)); + .or_insert_with(|| ReportBucket::new(delta)); } fn calculate_region_cpu_records( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 513e9c0636a..b6d7f8fcfcc 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5970,27 +5970,6 @@ where } }; - // bucket version layout - // term logical counter - // |-----------|-----------| - // high bits low bits - // term: given 10s election timeout, the 32 bit means 1362 year running time - let gen_bucket_version = |term, current_version| { - let current_version_term = current_version >> 32; - let bucket_version: u64 = if current_version_term == term { - current_version + 1 - } else 
{ - if term > u32::MAX.into() { - error!( - "unexpected term {} more than u32::MAX. Bucket version will be backward.", - term - ); - } - term << 32 - }; - bucket_version - }; - let region = self.fsm.peer.region(); if util::is_epoch_stale(®ion_epoch, region.get_region_epoch()) { info!( @@ -6042,7 +6021,7 @@ where region_buckets = self.fsm.peer.region_buckets.clone().unwrap(); let mut meta = (*region_buckets.meta).clone(); if !buckets.is_empty() { - meta.version = gen_bucket_version(self.fsm.peer.term(), current_version); + meta.version = util::gen_bucket_version(self.fsm.peer.term(), current_version); } meta.region_epoch = region_epoch; for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { @@ -6096,7 +6075,7 @@ where let mut meta = BucketMeta { region_id: self.fsm.region_id(), region_epoch, - version: gen_bucket_version(self.fsm.peer.term(), current_version), + version: util::gen_bucket_version(self.fsm.peer.term(), current_version), keys: bucket_keys, sizes: vec![self.ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], }; diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 519d486102c..d9076a67d8a 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -160,6 +160,20 @@ pub fn new_empty_snapshot( snapshot } +pub fn gen_bucket_version(term: u64, current_version: u64) -> u64 { + // term logical counter + // |-----------|-----------| + // high bits low bits + // term: given 10s election timeout, the 32 bit means 1362 year running time + let current_version_term = current_version >> 32; + let bucket_version: u64 = if current_version_term == term { + current_version + 1 + } else { + term << 32 + }; + bucket_version +} + const STR_CONF_CHANGE_ADD_NODE: &str = "AddNode"; const STR_CONF_CHANGE_REMOVE_NODE: &str = "RemoveNode"; const STR_CONF_CHANGE_ADDLEARNER_NODE: &str = "AddLearner"; From 64d2129a0c21bc1e8521c38dd144a327baa88965 Mon Sep 17 00:00:00 2001 
From: glorv Date: Tue, 10 Oct 2023 13:01:53 +0800 Subject: [PATCH 080/220] config: set a longer rocksdb io limiter smooth window for raft-v2 (#15734) ref tikv/tikv#11470 Signed-off-by: glorv --- Cargo.lock | 6 +++--- cmd/tikv-ctl/src/main.rs | 2 +- src/config/mod.rs | 21 ++++++++++++++++++--- src/server/engine_factory.rs | 2 +- tests/integrations/storage/test_titan.rs | 4 +++- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a10755f5a7f..c221af119e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2998,7 +2998,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#f04f4dd8eacc30e67c24bc2529a6d9c6edb85f8f" +source = "git+https://github.com/tikv/rust-rocksdb.git#b747689e1b94cb1507872e898b83553447e8f8de" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3017,7 +3017,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#f04f4dd8eacc30e67c24bc2529a6d9c6edb85f8f" +source = "git+https://github.com/tikv/rust-rocksdb.git#b747689e1b94cb1507872e898b83553447e8f8de" dependencies = [ "bzip2-sys", "cc", @@ -4936,7 +4936,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#f04f4dd8eacc30e67c24bc2529a6d9c6edb85f8f" +source = "git+https://github.com/tikv/rust-rocksdb.git#b747689e1b94cb1507872e898b83553447e8f8de" dependencies = [ "libc 0.2.146", "librocksdb_sys", diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 6baa1fe6c39..df17e81f1ef 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -1048,7 +1048,7 @@ fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - let resource = cfg.rocksdb.build_resources(env); + let resource = cfg.rocksdb.build_resources(env, 
cfg.storage.engine); cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } diff --git a/src/config/mod.rs b/src/config/mod.rs index d18d6f8cda0..911308809c6 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1412,14 +1412,25 @@ impl DbConfig { } } - pub fn build_resources(&self, env: Arc) -> DbResources { + pub fn build_resources(&self, env: Arc, engine: EngineType) -> DbResources { let rate_limiter = if self.rate_bytes_per_sec.0 > 0 { + // for raft-v2, we use a longer window to make the compaction io smoother + let (tune_per_secs, window_size, recent_size) = match engine { + // 1s tune duraion, long term window is 5m, short term window is 30s. + // this is the default settings. + EngineType::RaftKv => (1, 300, 30), + // 5s tune duraion, long term window is 1h, short term window is 5m + EngineType::RaftKv2 => (5, 720, 60), + }; Some(Arc::new(RateLimiter::new_writeampbased_with_auto_tuned( self.rate_bytes_per_sec.0 as i64, (self.rate_limiter_refill_period.as_millis() * 1000) as i64, 10, // fairness self.rate_limiter_mode, self.rate_limiter_auto_tuned, + tune_per_secs, + window_size, + recent_size, ))) } else { None @@ -4844,7 +4855,9 @@ mod tests { fn test_rocks_rate_limit_zero() { let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); - let resource = tikv_cfg.rocksdb.build_resources(Arc::new(Env::default())); + let resource = tikv_cfg + .rocksdb + .build_resources(Arc::new(Env::default()), tikv_cfg.storage.engine); tikv_cfg .rocksdb .build_opt(&resource, tikv_cfg.storage.engine); @@ -5008,7 +5021,9 @@ mod tests { Arc, ) { assert_eq!(F::TAG, cfg.storage.api_version()); - let resource = cfg.rocksdb.build_resources(Arc::default()); + let resource = cfg + .rocksdb + .build_resources(Arc::default(), cfg.storage.engine); let engine = RocksDBEngine::new( &cfg.storage.data_dir, Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 
85de282b137..3593c01ca7f 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -56,7 +56,7 @@ impl KvEngineFactoryBuilder { flow_listener: None, sst_recovery_sender: None, encryption_key_manager: key_manager, - db_resources: config.rocksdb.build_resources(env), + db_resources: config.rocksdb.build_resources(env, config.storage.engine), cf_resources: config.rocksdb.build_cf_resources(cache), state_storage: None, lite: false, diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 9c3eeec0c83..4bb8fee4087 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,7 +159,9 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let resource = cfg.rocksdb.build_resources(Default::default()); + let resource = cfg + .rocksdb + .build_resources(Default::default(), cfg.storage.engine); let kv_db_opts = cfg.rocksdb.build_opt(&resource, cfg.storage.engine); let kv_cfs_opts = cfg.rocksdb.build_cf_opts( &cfg.rocksdb.build_cf_resources(cache), From 905e8bffbee3a289198b31de70e418c101f3be78 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 10 Oct 2023 14:01:54 +0800 Subject: [PATCH 081/220] raftstore: disable region bucket for raftstore v1 by default (#15740) ref tikv/tikv#15719 disable region bucket for raftstore v1 by default Signed-off-by: SpadeA-Tang --- .../raftstore/src/coprocessor/config.rs | 21 ++++++++++--------- src/config/mod.rs | 11 +++++++--- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index e1246e8d59d..b1dc3830bbb 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ 
b/components/raftstore/src/coprocessor/config.rs @@ -168,7 +168,7 @@ impl Config { Ok(()) } - pub fn validate(&mut self) -> Result<()> { + pub fn validate(&mut self, raft_kv_v2: bool) -> Result<()> { if self.region_split_keys.is_none() { self.region_split_keys = Some((self.region_split_size().as_mb_f64() * 10000.0) as u64); } @@ -199,8 +199,9 @@ impl Config { None => self.region_max_keys = Some(self.region_split_keys() / 2 * 3), } let res = self.validate_bucket_size(); - // If it's OK to enable bucket, we will prefer to enable it if useful. - if let Ok(()) = res && self.enable_region_bucket.is_none() { + // If it's OK to enable bucket, we will prefer to enable it if useful for + // raftstore-v2. + if let Ok(()) = res && self.enable_region_bucket.is_none() && raft_kv_v2 { let useful = self.region_split_size() >= self.region_bucket_size * 2; self.enable_region_bucket = Some(useful); } else if let Err(e) = res && self.enable_region_bucket() { @@ -237,39 +238,39 @@ mod tests { #[test] fn test_config_validate() { let mut cfg = Config::default(); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); cfg = Config::default(); cfg.region_max_size = Some(ReadableSize(10)); cfg.region_split_size = Some(ReadableSize(20)); - cfg.validate().unwrap_err(); + cfg.validate(false).unwrap_err(); cfg = Config::default(); cfg.region_max_size = None; cfg.region_split_size = Some(ReadableSize(20)); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_max_size, Some(ReadableSize(30))); cfg = Config::default(); cfg.region_max_keys = Some(10); cfg.region_split_keys = Some(20); - cfg.validate().unwrap_err(); + cfg.validate(false).unwrap_err(); cfg = Config::default(); cfg.region_max_keys = None; cfg.region_split_keys = Some(20); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_max_keys, Some(30)); cfg = Config::default(); cfg.enable_region_bucket = Some(false); cfg.region_split_size = Some(ReadableSize(20)); 
cfg.region_bucket_size = ReadableSize(30); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); cfg = Config::default(); cfg.region_split_size = Some(ReadableSize::mb(20)); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_split_keys, Some(200000)); } } diff --git a/src/config/mod.rs b/src/config/mod.rs index 911308809c6..0eb006363f0 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3725,7 +3725,8 @@ impl TikvConfig { self.raft_engine.validate()?; self.server.validate()?; self.pd.validate()?; - self.coprocessor.validate()?; + self.coprocessor + .validate(self.storage.engine == EngineType::RaftKv2)?; self.raft_store.validate( self.coprocessor.region_split_size(), self.coprocessor.enable_region_bucket(), @@ -6238,21 +6239,25 @@ mod tests { let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); default_cfg.coprocessor.optimize_for(false); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.coprocessor.validate(false).unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(!default_cfg.coprocessor.enable_region_bucket()); + default_cfg.coprocessor.validate(true).unwrap(); assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); default_cfg.coprocessor.optimize_for(true); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.coprocessor.validate(false).unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(!default_cfg.coprocessor.enable_region_bucket()); + default_cfg.coprocessor.validate(true).unwrap(); assert!(default_cfg.coprocessor.enable_region_bucket()); } From 88aaaa3e7b1e194d389fee6a9831f7491d7f9acd Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 10 Oct 2023 05:18:25 -0500 Subject: [PATCH 082/220] status_server: Add symbol service 
to support remote fetching symbolized heap profile (#15695) close tikv/tikv#15732 Jeprof supports generating the svg by remote fetching, so we can add a symbol service following the [pprof format](https://gperftools.github.io/gperftools/pprof_remote_servers.html), then with ` jeprof --show_bytes http://:20180/debug/pprof/heap --svg` it can simply get the heap profiling svg from remote. With this PR, we can get rid of the limitation that the heap profile must be processed with the corresponding tikv binary and perl runtime which is used by `jeprof`. Later, we only need to install `jeprof` and `perl` in tidb_dashboard environment and collect the heap profile just like how CPU profile does. Signed-off-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 22 +++--- src/server/status_server/mod.rs | 136 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c221af119e9..fccff7d7822 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,9 +89,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.26" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7825f6833612eb2414095684fcf6c635becf3ce97fe48cf6421321e93bfbd53c" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "api_version" @@ -777,9 +777,9 @@ checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" [[package]] name = "byteorder" -version = "1.3.4" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" @@ -3168,9 +3168,9 @@ dependencies = [ [[package]] name = "memmap2" -version = "0.5.3" +version = "0.5.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" dependencies = [ "libc 0.2.146", ] @@ -5834,7 +5834,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac457d054f793cedfde6f32d21d692b8351cfec9084fefd0470c0373f6d799bc" dependencies = [ "debugid", - "memmap2 0.5.3", + "memmap2 0.5.10", "stable_deref_trait", "uuid 1.2.1", ] @@ -7237,9 +7237,13 @@ dependencies = [ [[package]] name = "twox-hash" -version = "1.5.0" +version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bfd5b7557925ce778ff9b9ef90e3ade34c524b5ff10e239c69a42d546d2af56" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if 1.0.0", + "static_assertions", +] [[package]] name = "txn_types" diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 98077d9e93f..3e68b0b6310 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -3,6 +3,7 @@ /// Provides profilers for TiKV. mod profile; use std::{ + env::args, error::Error as StdError, net::SocketAddr, path::PathBuf, @@ -308,6 +309,83 @@ where }) } + async fn get_cmdline(_req: Request) -> hyper::Result> { + let args = args().into_iter().fold(String::new(), |mut a, b| { + a.push_str(&b); + a.push('\x00'); + a + }); + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .body(args.into()) + .unwrap(); + Ok(response) + } + + async fn get_symbol_count(req: Request) -> hyper::Result> { + assert_eq!(req.method(), Method::GET); + // We don't know how many symbols we have, but we + // do have symbol information. pprof only cares whether + // this number is 0 (no symbols available) or > 0. 
+ let text = "num_symbols: 1\n"; + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .header("Content-Length", text.len()) + .body(text.into()) + .unwrap(); + Ok(response) + } + + // The request and response format follows pprof remote server + // https://gperftools.github.io/gperftools/pprof_remote_servers.html + // Here is the go pprof implementation: + // https://github.com/golang/go/blob/3857a89e7eb872fa22d569e70b7e076bec74ebbb/src/net/http/pprof/pprof.go#L191 + async fn get_symbol(req: Request) -> hyper::Result> { + assert_eq!(req.method(), Method::POST); + let mut text = String::new(); + let body_bytes = hyper::body::to_bytes(req.into_body()).await?; + let body = String::from_utf8(body_bytes.to_vec()).unwrap(); + + // The request body is a list of addr to be resolved joined by '+'. + // Resolve addrs with addr2line and write the symbols each per line in + // response. + for pc in body.split('+') { + let addr = usize::from_str_radix(pc.trim_start_matches("0x"), 16).unwrap_or(0); + if addr == 0 { + info!("invalid addr: {}", addr); + continue; + } + + // Would be multiple symbols if inlined. 
+ let mut syms = vec![]; + backtrace::resolve(addr as *mut std::ffi::c_void, |sym| { + let name = sym + .name() + .unwrap_or_else(|| backtrace::SymbolName::new(b"")); + syms.push(name.to_string()); + }); + + if !syms.is_empty() { + // join inline functions with '--' + let f = syms.join("--"); + // should be + text.push_str(format!("{:#x} {}\n", addr, f).as_str()); + } else { + info!("can't resolve mapped addr: {:#x}", addr); + text.push_str(format!("{:#x} ??\n", addr).as_str()); + } + } + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .header("Content-Length", text.len()) + .body(text.into()) + .unwrap(); + Ok(response) + } + async fn update_config( cfg_controller: ConfigController, req: Request, @@ -693,6 +771,11 @@ where (Method::GET, "/debug/pprof/heap") => { Self::dump_heap_prof_to_resp(req).await } + (Method::GET, "/debug/pprof/cmdline") => Self::get_cmdline(req).await, + (Method::GET, "/debug/pprof/symbol") => { + Self::get_symbol_count(req).await + } + (Method::POST, "/debug/pprof/symbol") => Self::get_symbol(req).await, (Method::GET, "/config") => { Self::get_config(req, &cfg_controller).await } @@ -1658,6 +1741,59 @@ mod tests { status_server.stop(); } + #[test] + fn test_pprof_symbol_service() { + let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); + let temp_dir = tempfile::TempDir::new().unwrap(); + let mut status_server = StatusServer::new( + 1, + ConfigController::default(), + Arc::new(SecurityConfig::default()), + MockRouter, + temp_dir.path().to_path_buf(), + None, + GrpcServiceManager::dummy(), + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let client = Client::new(); + + let mut addr = None; + backtrace::trace(|f| { + addr = Some(f.ip()); + false + }); + assert!(addr.is_some()); + + let uri = Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + 
.path_and_query("/debug/pprof/symbol") + .build() + .unwrap(); + let req = Request::builder() + .method(Method::POST) + .uri(uri) + .body(Body::from(format!("{:p}", addr.unwrap()))) + .unwrap(); + let handle = status_server + .thread_pool + .spawn(async move { client.request(req).await.unwrap() }); + let resp = block_on(handle).unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body_bytes = block_on(hyper::body::to_bytes(resp.into_body())).unwrap(); + assert!( + String::from_utf8(body_bytes.as_ref().to_owned()) + .unwrap() + .split(' ') + .last() + .unwrap() + .starts_with("backtrace::backtrace") + ); + status_server.stop(); + } + #[test] fn test_metrics() { let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); From 262845cefc4810aa8bdcdc7ec18fa3d4469547de Mon Sep 17 00:00:00 2001 From: lucasliang Date: Wed, 11 Oct 2023 13:27:24 +0800 Subject: [PATCH 083/220] raftstore-v2: support to make protection when disk full. (#15558) close tikv/tikv#15170 This pr is used to protect `raftstore-v2` when disk full. And all checking and validation is transplant from `raftstore`. 
--- components/raftstore-v2/src/batch/store.rs | 8 +- components/raftstore-v2/src/fsm/peer.rs | 1 + .../operation/command/admin/merge/prepare.rs | 36 +- .../src/operation/command/admin/mod.rs | 57 ++- .../src/operation/command/admin/split.rs | 16 + .../command/admin/transfer_leader.rs | 2 +- .../raftstore-v2/src/operation/command/mod.rs | 6 + .../src/operation/command/write/mod.rs | 10 +- components/raftstore-v2/src/operation/life.rs | 326 +++++++++++++- components/raftstore-v2/src/operation/pd.rs | 2 +- .../raftstore-v2/src/operation/query/lease.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 75 +++- .../raftstore-v2/src/operation/txn_ext.rs | 16 +- components/raftstore-v2/src/raft/peer.rs | 21 +- components/raftstore-v2/src/router/message.rs | 11 + components/raftstore/src/store/mod.rs | 4 +- components/raftstore/src/store/peer.rs | 9 + components/test_raftstore-v2/src/cluster.rs | 31 +- components/test_raftstore-v2/src/util.rs | 111 ++++- src/server/raftkv2/mod.rs | 1 + tests/failpoints/cases/test_disk_full.rs | 401 +++++++++--------- .../integrations/raftstore/test_stale_read.rs | 2 +- 22 files changed, 897 insertions(+), 251 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index cd5ae8f42f7..5ed84c70937 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -47,7 +47,7 @@ use tikv_util::{ box_err, config::{Tracker, VersionTrack}, log::SlogFormat, - sys::SysQuota, + sys::{disk::get_disk_status, SysQuota}, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, Limiter}, timer::{SteadyTimer, GLOBAL_TIMER_HANDLE}, worker::{Builder, LazyWorker, Scheduler, Worker}, @@ -104,6 +104,10 @@ pub struct StoreContext { /// Disk usage for the store itself. pub self_disk_usage: DiskUsage, + // TODO: how to remove offlined stores? + /// Disk usage for other stores. The store itself is not included. 
+ /// Only contains items which is not `DiskUsage::Normal`. + pub store_disk_usages: HashMap, pub snap_mgr: TabletSnapManager, pub global_stat: GlobalStoreStat, @@ -228,6 +232,7 @@ impl PollHandler PeerFsmDelegate<'a, EK, ER, write.header, write.data, write.ch, + Some(write.disk_full_opt), ); } PeerMsg::UnsafeWrite(write) => { diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index 6ff982eea8c..4a5875f7097 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -219,22 +219,7 @@ impl Peer { if r.is_ok() { self.proposal_control_mut().set_pending_prepare_merge(false); } else { - // Match v1::post_propose_fail. - // If we just failed to propose PrepareMerge, the pessimistic locks status - // may become MergingRegion incorrectly. So, we have to revert it here. - // Note: The `is_merging` check from v1 is removed because proposed - // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). - assert!( - !self.proposal_control().is_merging(), - "{}", - SlogFormat(&self.logger) - ); - self.take_merge_context(); - self.proposal_control_mut().set_pending_prepare_merge(false); - let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); - if pessimistic_locks.status == LocksStatus::MergingRegion { - pessimistic_locks.status = LocksStatus::Normal; - } + self.post_prepare_merge_fail(); } r } @@ -707,6 +692,25 @@ impl Peer { self.propose(store_ctx, cmd.write_to_bytes().unwrap())?; Ok(()) } + + pub fn post_prepare_merge_fail(&mut self) { + // Match v1::post_propose_fail. + // If we just failed to propose PrepareMerge, the pessimistic locks status + // may become MergingRegion incorrectly. So, we have to revert it here. 
+ // Note: The `is_merging` check from v1 is removed because proposed + // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). + assert!( + !self.proposal_control().is_merging(), + "{}", + SlogFormat(&self.logger) + ); + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); + if pessimistic_locks.status == LocksStatus::MergingRegion { + pessimistic_locks.status = LocksStatus::Normal; + } + } } impl Apply { diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index d59a564c696..9d7fee55ae4 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -12,6 +12,7 @@ use compact_log::CompactLogResult; use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ + kvrpcpb::DiskFullOpt, metapb::{PeerRole, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessageType, FlushMemtable, RaftMessage}, @@ -33,13 +34,13 @@ use raftstore::{ }, Error, }; -use slog::{error, info}; +use slog::{debug, error, info}; use split::SplitResult; pub use split::{ report_split_init_finish, temp_split_path, RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, SplitPendingAppend, SPLIT_PREFIX, }; -use tikv_util::{box_err, log::SlogFormat, slog_panic}; +use tikv_util::{box_err, log::SlogFormat, slog_panic, sys::disk::DiskUsage}; use txn_types::WriteBatchFlags; use self::flashback::FlashbackResult; @@ -103,6 +104,18 @@ impl Peer { let pre_transfer_leader = cmd_type == AdminCmdType::TransferLeader && !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL); + let is_conf_change = apply::is_conf_change_cmd(&req); + + // Check whether the admin request can 
be proposed when disk full. + let can_skip_check = is_transfer_leader || pre_transfer_leader || is_conf_change; + if !can_skip_check && let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + self.post_propose_fail(cmd_type); + return; + } // The admin request is rejected because it may need to update epoch checker // which introduces an uncertainty and may breaks the correctness of epoch @@ -134,9 +147,11 @@ impl Peer { ch.report_error(resp); return; } + // Prepare Merge need to be broadcast to as many as followers when disk full. + self.on_prepare_merge(cmd_type, ctx); // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); - let res = if apply::is_conf_change_cmd(&req) { + let res = if is_conf_change { self.propose_conf_change(ctx, req) } else { // propose other admin command. @@ -258,6 +273,42 @@ impl Peer { self.post_propose_command(ctx, res, vec![ch], true); } + fn on_prepare_merge( + &mut self, + cmd_type: AdminCmdType, + ctx: &StoreContext, + ) { + let is_merge_cmd = + cmd_type == AdminCmdType::PrepareMerge || cmd_type == AdminCmdType::RollbackMerge; + let has_disk_full_peers = self.abnormal_peer_context().disk_full_peers().is_empty(); + let proposal_index = self.next_proposal_index(); + if is_merge_cmd + && (!matches!(ctx.self_disk_usage, DiskUsage::Normal) || !has_disk_full_peers) + { + self.has_region_merge_proposal = true; + self.region_merge_proposal_index = proposal_index; + let mut peers = vec![]; + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .iter_mut() + .for_each(|(k, v)| { + if !matches!(v.0, DiskUsage::AlreadyFull) { + v.1 = true; + peers.push(*k); + } + }); + debug!( + self.logger, + "adjust max inflight msgs"; + "cmd_type" => ?cmd_type, + "raft_max_inflight_msgs" => ctx.cfg.raft_max_inflight_msgs, + "region" => self.region_id() + ); + 
self.adjust_peers_max_inflight_msgs(&peers, ctx.cfg.raft_max_inflight_msgs); + } + } + fn start_pre_flush( &mut self, ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 0f9cae7218d..cfbd7678c17 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -35,6 +35,7 @@ use engine_traits::{ use fail::fail_point; use futures::channel::oneshot; use kvproto::{ + kvrpcpb::DiskFullOpt, metapb::{self, Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, @@ -332,6 +333,14 @@ impl Peer { )))); return; } + // Check whether the admin request can be proposed when disk full. + if let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + info!(self.logger, "disk is full, skip split"; "err" => ?e); + ch.set_result(cmd_resp::new_error(e)); + return; + } if let Err(e) = util::validate_split_region( self.region_id(), self.peer_id(), @@ -365,6 +374,13 @@ impl Peer { info!(self.logger, "not leader, skip."); return; } + // Check whether the admin request can be proposed when disk full. 
+ if let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + info!(self.logger, "disk is full, skip half split"; "err" => ?e); + return; + } let region = self.region(); if util::is_epoch_stale(&rhs.epoch, region.get_region_epoch()) { diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 4cdeba3bc41..bf9cb426255 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -118,7 +118,7 @@ impl Peer { transferee } - fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + pub fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { if self.raft_group().raft.has_pending_conf() { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index e579d22c6da..70cdbfda237 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -481,6 +481,12 @@ impl Peer { } self.check_unsafe_recovery_state(ctx); } + + pub fn post_propose_fail(&mut self, cmd_type: AdminCmdType) { + if cmd_type == AdminCmdType::PrepareMerge { + self.post_prepare_merge_fail(); + } + } } #[derive(Debug)] diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index a9d8bd664fe..6eacc75c0f1 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -5,7 +5,7 @@ use engine_traits::{ }; use fail::fail_point; use futures::channel::oneshot; -use kvproto::raft_cmdpb::RaftRequestHeader; +use kvproto::{kvrpcpb::DiskFullOpt, raft_cmdpb::RaftRequestHeader}; use raftstore::{ store::{ cmd_resp, @@ -42,6 +42,7 @@ impl Peer { header: Box, data: 
SimpleWriteBinary, ch: CmdResChannel, + disk_full_opt: Option, ) { if !self.serving() { apply::notify_req_region_removed(self.region_id(), ch); @@ -59,6 +60,13 @@ impl Peer { ch.report_error(resp); return; } + // Check whether the write request can be proposed with the given disk full + // option. + if let Some(opt) = disk_full_opt && let Err(e) = self.check_proposal_with_disk_full_opt(ctx, opt) { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); if let Some(conflict) = self.proposal_control_mut().check_conflict(None) { diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 84bded8a9bb..5828a7bb661 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -26,28 +26,34 @@ //! `merged_records`, to avoid race between destroy and merge, leader needs to //! ask target peer to destroy source peer. 
-use std::{cmp, mem}; +use std::{cmp, collections::HashSet, mem}; use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb::{self, Region}, + kvrpcpb::DiskFullOpt, + metapb::{self, PeerRole, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore::store::{ - fsm::{ - apply, - life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, - Proposal, +use raft::eraftpb::MessageType; +use raftstore::{ + store::{ + fsm::{ + apply, + life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, + Proposal, + }, + metrics::RAFT_PEER_PENDING_DURATION, + util, DiskFullPeers, Transport, WriteTask, }, - metrics::RAFT_PEER_PENDING_DURATION, - util, Transport, WriteTask, + Error, Result, }; use slog::{debug, error, info, warn}; use tikv_util::{ store::find_peer, + sys::disk::DiskUsage, time::{duration_to_sec, Instant}, }; @@ -126,16 +132,22 @@ pub struct AbnormalPeerContext { pending_peers: Vec<(u64, Instant)>, /// A inaccurate cache about which peer is marked as down. down_peers: Vec, + // disk full peer set. + disk_full_peers: DiskFullPeers, + // show whether an already disk full TiKV appears in the potential majority set. + dangerous_majority_set: bool, } impl AbnormalPeerContext { #[inline] pub fn is_empty(&self) -> bool { - self.pending_peers.is_empty() && self.down_peers.is_empty() + self.pending_peers.is_empty() && self.down_peers.is_empty() /* && self.disk_full_peers.is_empty() */ } #[inline] pub fn reset(&mut self) { + // No need to refresh disk_full_peers as it will be refreshed + // automatically when the disk usage updated. 
self.pending_peers.clear(); self.down_peers.clear(); } @@ -174,6 +186,26 @@ impl AbnormalPeerContext { RAFT_PEER_PENDING_DURATION.observe(elapsed); }); } + + #[inline] + pub fn disk_full_peers(&self) -> &DiskFullPeers { + &self.disk_full_peers + } + + #[inline] + pub fn disk_full_peers_mut(&mut self) -> &mut DiskFullPeers { + &mut self.disk_full_peers + } + + #[inline] + pub fn is_dangerous_majority_set(&self) -> bool { + self.dangerous_majority_set + } + + #[inline] + pub fn setup_dangerous_majority_set(&mut self, is_dangerous: bool) { + self.dangerous_majority_set = is_dangerous; + } } #[derive(Default)] @@ -415,6 +447,20 @@ impl Store { ctx.raft_metrics.message_dropped.stale_msg.inc(); return false; } + // Check whether this message should be dropped when disk full. + let msg_type = msg.get_message().get_msg_type(); + if matches!(ctx.self_disk_usage, DiskUsage::AlreadyFull) + && MessageType::MsgTimeoutNow == msg_type + { + debug!( + self.logger(), + "skip {:?} because of disk full", msg_type; + "region_id" => region_id, "peer_id" => to_peer.id, + ); + ctx.raft_metrics.message_dropped.disk_full.inc(); + return false; + } + let destroyed = match check_if_to_peer_destroyed(&ctx.engine, &msg, self.store_id()) { Ok(d) => d, Err(e) => { @@ -836,6 +882,266 @@ impl Peer { self.maybe_schedule_gc_peer_tick(); } + pub fn adjust_peers_max_inflight_msgs(&mut self, peers: &[u64], raft_max_inflight_msgs: usize) { + peers.iter().for_each(|id| { + self.raft_group_mut() + .raft + .adjust_max_inflight_msgs(*id, raft_max_inflight_msgs); + debug!( + self.logger, + "adjust max inflight msgs"; + "raft_max_inflight_msgs" => raft_max_inflight_msgs, + "peer_id" => id + ); + }); + } + + // Check disk usages for the peer itself and other peers in the raft group. + // The return value indicates whether the proposal is allowed or not. 
+ pub fn check_proposal_with_disk_full_opt( + &mut self, + ctx: &StoreContext, + disk_full_opt: DiskFullOpt, + ) -> Result<()> { + let leader_allowed = match ctx.self_disk_usage { + DiskUsage::Normal => true, + DiskUsage::AlmostFull => !matches!(disk_full_opt, DiskFullOpt::NotAllowedOnFull), + DiskUsage::AlreadyFull => false, + }; + let mut disk_full_stores = Vec::new(); + let abnormal_peer_context = self.abnormal_peer_context(); + let disk_full_peers = abnormal_peer_context.disk_full_peers(); + if !leader_allowed { + disk_full_stores.push(ctx.store_id); + // Try to transfer leader to a node with disk usage normal to maintain write + // availability. If majority node is disk full, to transfer leader or not is not + // necessary. Note: Need to exclude learner node. + if !disk_full_peers.majority() { + let target_peer = self + .region() + .get_peers() + .iter() + .find(|x| { + !disk_full_peers.has(x.get_id()) + && x.get_id() != self.peer_id() + && !self + .abnormal_peer_context() + .down_peers() + .contains(&x.get_id()) + && !matches!(x.get_role(), PeerRole::Learner) + }) + .cloned(); + if let Some(p) = target_peer { + debug!( + self.logger, + "try to transfer leader because of current leader disk full"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_peer_id" => p.get_id(), + ); + self.pre_transfer_leader(&p); + } + } + } else { + // Check followers. + if disk_full_peers.is_empty() { + return Ok(()); + } + if !abnormal_peer_context.is_dangerous_majority_set() { + if !disk_full_peers.majority() { + return Ok(()); + } + // Majority peers are in disk full status but the request carries a special + // flag. 
+ if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) + && disk_full_peers.peers().values().any(|x| x.1) + { + return Ok(()); + } + } + for peer in self.region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + if disk_full_peers.peers().get(&peer_id).is_some() { + disk_full_stores.push(store_id); + } + } + } + let errmsg = format!( + "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", + disk_full_opt, ctx.self_disk_usage + ); + Err(Error::DiskFull(disk_full_stores, errmsg)) + } + + pub fn clear_disk_full_peers(&mut self, ctx: &StoreContext) { + let disk_full_peers = mem::take(self.abnormal_peer_context_mut().disk_full_peers_mut()); + let raft = &mut self.raft_group_mut().raft; + for peer in disk_full_peers.peers().iter() { + raft.adjust_max_inflight_msgs(*peer.0, ctx.cfg.raft_max_inflight_msgs); + } + } + + pub fn refill_disk_full_peers(&mut self, ctx: &StoreContext) { + self.clear_disk_full_peers(ctx); + debug!( + self.logger, + "region id {}, peer id {}, store id {}: refill disk full peers when peer disk usage status changed or merge triggered", + self.region().get_id(), + self.peer_id(), + ctx.store_id, + ); + + // Collect disk full peers and all peers' `next_idx` to find a potential quorum. + let peers_len = self.region().get_peers().len(); + let mut normal_peers = HashSet::default(); + let mut next_idxs = Vec::with_capacity(peers_len); + let mut min_peer_index = u64::MAX; + for peer in self.region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + let usage = ctx.store_disk_usages.get(&store_id); + if usage.is_none() { + // Always treat the leader itself as normal. + normal_peers.insert(peer_id); + } + if let Some(pr) = self.raft_group().raft.prs().get(peer_id) { + // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplying the sort + // func belowing. 
+ let mut status = 3; + if let Some(usg) = usage { + status = match usg { + DiskUsage::Normal => 3, + DiskUsage::AlmostFull => 2, + DiskUsage::AlreadyFull => 1, + }; + } + + if !self.abnormal_peer_context().down_peers().contains(&peer_id) { + next_idxs.push((peer_id, pr.next_idx, usage, status)); + if min_peer_index > pr.next_idx { + min_peer_index = pr.next_idx; + } + } + } + } + if self.has_region_merge_proposal { + debug!( + self.logger, + "region id {}, peer id {}, store id {} has a merge request, with region_merge_proposal_index {}", + self.region_id(), + self.peer_id(), + ctx.store_id, + self.region_merge_proposal_index + ); + if min_peer_index > self.region_merge_proposal_index { + self.has_region_merge_proposal = false; + } + } + + if normal_peers.len() == peers_len { + return; + } + + // Reverse sort peers based on `next_idx`, `usage` and `store healthy status`, + // then try to get a potential quorum. + next_idxs.sort_by(|x, y| { + if x.3 == y.3 { + y.1.cmp(&x.1) + } else { + y.3.cmp(&x.3) + } + }); + + let majority = !self.raft_group().raft.prs().has_quorum(&normal_peers); + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .set_majority(majority); + // Here set all peers can be sent when merging. + for &(peer, _, usage, ..) 
in &next_idxs { + if let Some(usage) = usage { + if self.has_region_merge_proposal && !matches!(*usage, DiskUsage::AlreadyFull) { + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .insert(peer, (*usage, true)); + self.raft_group_mut() + .raft + .adjust_max_inflight_msgs(peer, ctx.cfg.raft_max_inflight_msgs); + debug!( + self.logger, + "refill disk full peer max inflight to {} on a merging region: region id {}, peer id {}", + ctx.cfg.raft_max_inflight_msgs, + self.region_id(), + peer + ); + } else { + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .insert(peer, (*usage, false)); + self.raft_group_mut().raft.adjust_max_inflight_msgs(peer, 0); + debug!( + self.logger, + "refill disk full peer max inflight to {} on region without merging: region id {}, peer id {}", + 0, + self.region_id(), + peer + ); + } + } + } + + if !self.abnormal_peer_context().disk_full_peers().majority() { + // Less than majority peers are in disk full status. + return; + } + + let (mut potential_quorum, mut quorum_ok) = (HashSet::default(), false); + let mut is_dangerous_set = false; + for &(peer_id, _, _, status) in &next_idxs { + potential_quorum.insert(peer_id); + + if status == 1 { + // already full peer. + is_dangerous_set = true; + } + + if self.raft_group().raft.prs().has_quorum(&potential_quorum) { + quorum_ok = true; + break; + } + } + + self.abnormal_peer_context_mut() + .setup_dangerous_majority_set(is_dangerous_set); + + // For the Peer with AlreadFull in potential quorum set, we still need to send + // logs to it. To support incoming configure change. + if quorum_ok { + let has_region_merge_proposal = self.has_region_merge_proposal; + let peers = self + .abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut(); + let mut inflight_peers = vec![]; + for peer in potential_quorum { + if let Some(x) = peers.get_mut(&peer) { + // It can help to establish a quorum. 
+ x.1 = true; + // for merge region, all peers have been set to the max. + if !has_region_merge_proposal { + inflight_peers.push(peer); + } + } + } + debug!( + self.logger, + "refill disk full peer max inflight to 1 in potential quorum set: region id {}", + self.region_id(), + ); + self.adjust_peers_max_inflight_msgs(&inflight_peers, 1); + } + } + /// A peer can be destroyed in four cases: /// /// 1. Received a gc message; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 9bce8f3ba02..8e392755c5e 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -103,7 +103,7 @@ impl Peer { let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), - down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), + down_peers: self.collect_down_peers(ctx), peer: self.peer().clone(), pending_peers: self.collect_pending_peers(ctx), written_bytes: self.self_stat().written_bytes, diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 84a8ad09ed3..189986f93d2 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -168,7 +168,7 @@ impl Peer { header.set_term(self.term()); let empty_data = SimpleWriteEncoder::with_capacity(0).encode(); let (ch, _) = CmdResChannel::pair(); - self.on_simple_write(ctx, header, empty_data, ch); + self.on_simple_write(ctx, header, empty_data, ch, None); } /// response the read index request diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 1ff07f2ccc1..3ceb8693c0b 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -54,6 +54,7 @@ use tikv_util::{ log::SlogFormat, slog_panic, store::find_peer, + 
sys::disk::DiskUsage, time::{duration_to_sec, monotonic_raw_now, Duration}, }; @@ -265,6 +266,7 @@ impl Peer { "message_type" => %util::MsgType(&msg), "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), + "disk_usage" => ?msg.disk_usage, ); if self.pause_for_replay() && msg.get_message().get_msg_type() == MessageType::MsgAppend { ctx.raft_metrics.message_dropped.recovery.inc(); @@ -287,6 +289,9 @@ impl Peer { return; } } + + self.handle_reported_disk_usage(ctx, &msg); + if msg.get_to_peer().get_store_id() != self.peer().get_store_id() { ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); return; @@ -515,7 +520,11 @@ impl Peer { /// /// If the recipient can't be found, `None` is returned. #[inline] - fn build_raft_message(&mut self, msg: eraftpb::Message) -> Option { + fn build_raft_message( + &mut self, + msg: eraftpb::Message, + disk_usage: DiskUsage, + ) -> Option { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { @@ -530,6 +539,8 @@ impl Peer { }; let mut raft_msg = self.prepare_raft_message(); + // Fill in the disk usage. 
+ raft_msg.set_disk_usage(disk_usage); raft_msg.set_to_peer(to_peer); if msg.from != self.peer().id { @@ -772,8 +783,9 @@ impl Peer { if !ready.messages().is_empty() { debug_assert!(self.is_leader()); + let disk_usage = ctx.self_disk_usage; for msg in ready.take_messages() { - if let Some(msg) = self.build_raft_message(msg) { + if let Some(msg) = self.build_raft_message(msg, disk_usage) { self.send_raft_message_on_leader(ctx, msg); } } @@ -802,10 +814,11 @@ impl Peer { self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { + let disk_usage = ctx.self_disk_usage; write_task.messages = ready .take_persisted_messages() .into_iter() - .flat_map(|m| self.build_raft_message(m)) + .flat_map(|m| self.build_raft_message(m, disk_usage)) .collect(); } if self.has_pending_messages() { @@ -1069,6 +1082,16 @@ impl Peer { // Exit entry cache warmup state when the peer becomes leader. self.entry_storage_mut().clear_entry_cache_warmup_state(); + if !ctx.store_disk_usages.is_empty() { + self.refill_disk_full_peers(ctx); + debug!( + self.logger, + "become leader refills disk full peers to {:?}", + self.abnormal_peer_context().disk_full_peers(); + "region_id" => self.region_id(), + ); + } + self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); @@ -1209,6 +1232,52 @@ impl Peer { ); } } + + fn handle_reported_disk_usage( + &mut self, + ctx: &mut StoreContext, + msg: &RaftMessage, + ) { + let store_id = msg.get_from_peer().get_store_id(); + let peer_id = msg.get_from_peer().get_id(); + let disk_full_peers = self.abnormal_peer_context().disk_full_peers(); + let refill_disk_usages = if matches!(msg.disk_usage, DiskUsage::Normal) { + ctx.store_disk_usages.remove(&store_id); + if !self.is_leader() { + return; + } + disk_full_peers.has(peer_id) + } else { + ctx.store_disk_usages.insert(store_id, msg.disk_usage); + if !self.is_leader() { + return; + } + + 
disk_full_peers.is_empty() + || disk_full_peers + .get(peer_id) + .map_or(true, |x| x != msg.disk_usage) + }; + + if refill_disk_usages || self.has_region_merge_proposal { + let prev = disk_full_peers.get(peer_id); + if Some(msg.disk_usage) != prev { + info!( + self.logger, + "reported disk usage changes {:?} -> {:?}", prev, msg.disk_usage; + "region_id" => self.region_id(), + "peer_id" => peer_id, + ); + } + self.refill_disk_full_peers(ctx); + debug!( + self.logger, + "raft message refills disk full peers to {:?}", + self.abnormal_peer_context().disk_full_peers(); + "region_id" => self.region_id(), + ); + } + } } impl Storage { diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 272b2526b39..4c875a675ef 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -9,7 +9,11 @@ use std::sync::{atomic::Ordering, Arc}; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use kvproto::{ + kvrpcpb::{DiskFullOpt, ExtraOp}, + metapb::Region, + raft_cmdpb::RaftRequestHeader, +}; use parking_lot::RwLockWriteGuard; use raft::eraftpb; use raftstore::store::{ @@ -266,8 +270,14 @@ impl Peer { self.logger, "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; - self.on_simple_write(ctx, write.header, write.data, write.ch); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write_with_opt(header, encoder.encode(), DiskFullOpt::AllowedOnAlmostFull).0 else {unreachable!()}; + self.on_simple_write( + ctx, + write.header, + write.data, + write.ch, + Some(write.disk_full_opt), + ); true } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 4ff47c4b4bb..2c8b8cef1db 100644 --- 
a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -32,6 +32,7 @@ use tikv_util::{slog_panic, time::duration_to_sec}; use super::storage::Storage; use crate::{ + batch::StoreContext, fsm::ApplyScheduler, operation::{ AbnormalPeerContext, AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, @@ -126,6 +127,10 @@ pub struct Peer { abnormal_peer_context: AbnormalPeerContext, + // region merge logic need to be broadcast to all followers when disk full happens. + pub has_region_merge_proposal: bool, + pub region_merge_proposal_index: u64, + /// Force leader state is only used in online recovery when the majority of /// peers are missing. In this state, it forces one peer to become leader /// out of accordance with Raft election rule, and forbids any @@ -227,6 +232,8 @@ impl Peer { pending_messages: vec![], gc_peer_context: GcPeerContext::default(), abnormal_peer_context: AbnormalPeerContext::default(), + has_region_merge_proposal: false, + region_merge_proposal_index: 0_u64, force_leader_state: None, unsafe_recovery_state: None, }; @@ -600,7 +607,7 @@ impl Peer { ) } - pub fn collect_down_peers(&mut self, max_duration: Duration) -> Vec { + pub fn collect_down_peers(&mut self, ctx: &StoreContext) -> Vec { let mut down_peers = Vec::new(); let mut down_peer_ids = Vec::new(); let now = Instant::now(); @@ -610,7 +617,7 @@ impl Peer { } if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { let elapsed = now.saturating_duration_since(*instant); - if elapsed >= max_duration { + if elapsed >= ctx.cfg.max_peer_down_duration.0 { let mut stats = pdpb::PeerStats::default(); stats.set_peer(p.clone()); stats.set_down_seconds(elapsed.as_secs()); @@ -619,8 +626,11 @@ impl Peer { } } } + let exist_down_peers = !down_peer_ids.is_empty(); *self.abnormal_peer_context_mut().down_peers_mut() = down_peer_ids; - // TODO: `refill_disk_full_peers` + if exist_down_peers { + self.refill_disk_full_peers(ctx); + } down_peers } @@ -925,6 
+935,11 @@ impl Peer { self.last_sent_snapshot_index } + #[inline] + pub fn next_proposal_index(&self) -> u64 { + self.raft_group.raft.raft_log.last_index() + 1 + } + #[inline] pub fn index_term(&self, idx: u64) -> u64 { match self.raft_group.raft.raft_log.term(idx) { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 16d43970e7a..830286bb142 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -6,6 +6,7 @@ use std::sync::{mpsc::SyncSender, Arc}; use collections::HashSet; use kvproto::{ import_sstpb::SstMeta, + kvrpcpb::DiskFullOpt, metapb, metapb::RegionEpoch, pdpb, @@ -134,6 +135,7 @@ pub struct SimpleWrite { pub header: Box, pub data: SimpleWriteBinary, pub ch: CmdResChannel, + pub disk_full_opt: DiskFullOpt, } #[derive(Debug)] @@ -296,6 +298,14 @@ impl PeerMsg { pub fn simple_write( header: Box, data: SimpleWriteBinary, + ) -> (Self, CmdResSubscriber) { + PeerMsg::simple_write_with_opt(header, data, DiskFullOpt::default()) + } + + pub fn simple_write_with_opt( + header: Box, + data: SimpleWriteBinary, + disk_full_opt: DiskFullOpt, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -304,6 +314,7 @@ impl PeerMsg { header, data, ch, + disk_full_opt, }), sub, ) diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index efd149e7c41..0ca99efffc4 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -57,8 +57,8 @@ pub use self::{ }, peer::{ can_amend_read, get_sync_log_from_request, make_transfer_leader_response, - propose_read_index, should_renew_lease, Peer, PeerStat, ProposalContext, ProposalQueue, - RequestInspector, RequestPolicy, TRANSFER_LEADER_COMMAND_REPLY_CTX, + propose_read_index, should_renew_lease, DiskFullPeers, Peer, PeerStat, ProposalContext, + ProposalQueue, RequestInspector, RequestPolicy, TRANSFER_LEADER_COMMAND_REPLY_CTX, 
}, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8ef857bfa12..e9350ba7bb0 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -5049,6 +5049,15 @@ impl DiskFullPeers { pub fn majority(&self) -> bool { self.majority } + pub fn set_majority(&mut self, majority: bool) { + self.majority = majority; + } + pub fn peers(&self) -> &HashMap { + &self.peers + } + pub fn peers_mut(&mut self) -> &mut HashMap { + &mut self.peers + } pub fn has(&self, peer_id: u64) -> bool { !self.peers.is_empty() && self.peers.contains_key(&peer_id) } diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 9d61918bd1f..496f8cc87dc 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -37,7 +37,7 @@ use pd_client::PdClient; use raftstore::{ store::{ cmd_resp, initial_region, region_meta::RegionMeta, util::check_key_in_region, Bucket, - BucketRange, Callback, RegionSnapshot, TabletSnapManager, WriteResponse, + BucketRange, Callback, RaftCmdExtraOpts, RegionSnapshot, TabletSnapManager, WriteResponse, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, }, Error, Result, @@ -283,9 +283,18 @@ pub trait Simulator { } fn async_command_on_node( + &mut self, + node_id: u64, + request: RaftCmdRequest, + ) -> BoxFuture<'static, RaftCmdResponse> { + self.async_command_on_node_with_opts(node_id, request, RaftCmdExtraOpts::default()) + } + + fn async_command_on_node_with_opts( &mut self, node_id: u64, mut request: RaftCmdRequest, + opts: RaftCmdExtraOpts, ) -> BoxFuture<'static, RaftCmdResponse> { let region_id = request.get_header().get_region_id(); @@ -316,7 +325,11 @@ pub trait Simulator { _ => unreachable!(), } } - PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode()) + 
PeerMsg::simple_write_with_opt( + Box::new(request.take_header()), + write_encoder.encode(), + opts.disk_full_opt, + ) }; self.async_peer_msg_on_node(node_id, region_id, msg) @@ -1275,6 +1288,20 @@ impl, EK: KvEngine> Cluster { .async_command_on_node(leader.get_store_id(), req) } + pub fn async_request_with_opts( + &mut self, + mut req: RaftCmdRequest, + opts: RaftCmdExtraOpts, + ) -> Result> { + let region_id = req.get_header().get_region_id(); + let leader = self.leader_of_region(region_id).unwrap(); + req.mut_header().set_peer(leader.clone()); + Ok(self + .sim + .wl() + .async_command_on_node_with_opts(leader.get_store_id(), req, opts)) + } + pub fn async_put( &mut self, key: &[u8], diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index d83dff12e9a..af2bab26183 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -14,16 +14,19 @@ use engine_test::raft::RaftTestEngine; use engine_traits::{CfName, KvEngine, TabletRegistry, CF_DEFAULT}; use file_system::IoRateLimiter; use futures::future::BoxFuture; +use grpcio::{ChannelBuilder, Environment}; use kvproto::{ encryptionpb::EncryptionMethod, - kvrpcpb::Context, + kvrpcpb::{Context, DiskFullOpt, GetResponse, Mutation, PrewriteResponse}, metapb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, + tikvpb::TikvClient, }; use raftstore::{store::ReadResponse, Result}; use rand::{prelude::SliceRandom, RngCore}; use server::common::ConfiguredRaftEngine; use tempfile::TempDir; +use test_pd_client::TestPdClient; use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, sleep_ms, Config}; use tikv::{ server::KvEngineFactoryBuilder, @@ -479,3 +482,109 @@ pub fn wait_region_epoch_change, EK: KvEngine>( sleep_ms(10); } } + +pub struct PeerClient { + pub cli: TikvClient, + pub ctx: Context, +} + +impl PeerClient { + pub fn new( + cluster: &Cluster, EK>, + region_id: u64, + peer: metapb::Peer, + ) -> PeerClient 
{ + let cli = { + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(peer.get_store_id())); + TikvClient::new(channel) + }; + let ctx = { + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(peer); + ctx.set_region_epoch(epoch); + ctx + }; + PeerClient { cli, ctx } + } + + pub fn kv_read(&self, key: Vec, ts: u64) -> GetResponse { + test_raftstore::kv_read(&self.cli, self.ctx.clone(), key, ts) + } + + pub fn must_kv_read_equal(&self, key: Vec, val: Vec, ts: u64) { + test_raftstore::must_kv_read_equal(&self.cli, self.ctx.clone(), key, val, ts) + } + + pub fn must_kv_write(&self, pd_client: &TestPdClient, kvs: Vec, pk: Vec) -> u64 { + test_raftstore::must_kv_write(pd_client, &self.cli, self.ctx.clone(), kvs, pk) + } + + pub fn must_kv_prewrite(&self, muts: Vec, pk: Vec, ts: u64) { + test_raftstore::must_kv_prewrite(&self.cli, self.ctx.clone(), muts, pk, ts) + } + + pub fn try_kv_prewrite( + &self, + muts: Vec, + pk: Vec, + ts: u64, + opt: DiskFullOpt, + ) -> PrewriteResponse { + let mut ctx = self.ctx.clone(); + ctx.disk_full_opt = opt; + test_raftstore::try_kv_prewrite(&self.cli, ctx, muts, pk, ts) + } + + pub fn must_kv_prewrite_async_commit(&self, muts: Vec, pk: Vec, ts: u64) { + test_raftstore::must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + pk, + ts, + 0, + true, + false, + ) + } + + pub fn must_kv_prewrite_one_pc(&self, muts: Vec, pk: Vec, ts: u64) { + test_raftstore::must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + pk, + ts, + 0, + false, + true, + ) + } + + pub fn must_kv_commit(&self, keys: Vec>, start_ts: u64, commit_ts: u64) { + test_raftstore::must_kv_commit( + &self.cli, + self.ctx.clone(), + keys, + start_ts, + commit_ts, + commit_ts, + ) + } + + pub fn must_kv_rollback(&self, keys: Vec>, start_ts: u64) { + test_raftstore::must_kv_rollback(&self.cli, self.ctx.clone(), keys, 
start_ts) + } + + pub fn must_kv_pessimistic_lock(&self, key: Vec, ts: u64) { + test_raftstore::must_kv_pessimistic_lock(&self.cli, self.ctx.clone(), key, ts) + } + + pub fn must_kv_pessimistic_rollback(&self, key: Vec, ts: u64) { + test_raftstore::must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts, ts) + } +} diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 5183ecd6567..a80cdda392f 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -304,6 +304,7 @@ impl tikv_kv::Engine for RaftKv2 { data, ch, send_time: Instant::now_coarse(), + disk_full_opt: batch.disk_full_opt, }); let res = self .router diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index 217269bb5b8..d8b3fadb054 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -5,12 +5,12 @@ use std::{thread, time::Duration}; use kvproto::{ disk_usage::DiskUsage, kvrpcpb::{DiskFullOpt, Op}, - metapb::Region, raft_cmdpb::*, }; use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, future::block_on_timeout, time::Instant}; fn assert_disk_full(resp: &RaftCmdResponse) { @@ -34,148 +34,147 @@ fn get_fp(usage: DiskUsage, store_id: u64) -> String { } // check the region new leader is elected. -fn assert_region_leader_changed( - cluster: &mut Cluster, - region_id: u64, - original_leader: u64, -) { - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("Leader cannot change when the only disk full node is leader"); +macro_rules! 
assert_region_leader_changed { + ($cluster:expr, $region_id:expr, $original_leader:expr) => {{ + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("Leader cannot change when the only disk full node is leader"); + } + let new_leader = $cluster.query_leader(1, $region_id, Duration::from_secs(1)); + if new_leader.is_none() { + sleep_ms(10); + continue; + } + if new_leader.unwrap().get_id() == $original_leader { + sleep_ms(10); + continue; + } else { + break; + } } - let new_leader = cluster.query_leader(1, region_id, Duration::from_secs(1)); - if new_leader.is_none() { - sleep_ms(10); - continue; - } - if new_leader.unwrap().get_id() == original_leader { - sleep_ms(10); - continue; - } else { - break; - } - } + }}; } -fn ensure_disk_usage_is_reported( - cluster: &mut Cluster, - peer_id: u64, - store_id: u64, - region: &Region, -) { - let peer = new_peer(store_id, peer_id); - let key = region.get_start_key(); - let ch = async_read_on_peer(cluster, peer, region.clone(), key, true, true); - block_on_timeout(ch, Duration::from_secs(1)).unwrap(); +macro_rules! ensure_disk_usage_is_reported { + ($cluster:expr, $peer_id:expr, $store_id:expr, $region:expr) => {{ + let peer = new_peer($store_id, $peer_id); + let key = $region.get_start_key(); + let ch = async_read_on_peer($cluster, peer, $region.clone(), key, true, true); + block_on_timeout(ch, Duration::from_secs(1)).unwrap(); + }}; } -fn test_disk_full_leader_behaviors(usage: DiskUsage) { - let mut cluster = new_node_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // To ensure all replicas are not pending. 
- cluster.must_put(b"k1", b"v1"); - must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // Test new normal proposals won't be allowed when disk is full. - let old_last_index = cluster.raft_local_state(1, 1).last_index; - let rx = cluster.async_put(b"k2", b"v2").unwrap(); - assert_disk_full(&block_on_timeout(rx, Duration::from_secs(2)).unwrap()); - let new_last_index = cluster.raft_local_state(1, 1).last_index; - assert_eq!(old_last_index, new_last_index); - - assert_region_leader_changed(&mut cluster, 1, 1); - fail::remove(get_fp(usage, 1)); - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // merge/split is only allowed on disk almost full. - if usage != DiskUsage::AlreadyFull { - // Test split must be allowed when disk is full. - let region = cluster.get_region(b"k1"); - cluster.must_split(®ion, b"k1"); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_disk_full_leader_behaviors() { + for usage in [DiskUsage::AlmostFull, DiskUsage::AlreadyFull] { + let mut cluster = new_cluster(0, 3); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); // set gc duration for v2 + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // To ensure all replicas are not pending. + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 1), "return").unwrap(); + + // Test new normal proposals won't be allowed when disk is full. 
+ let old_last_index = cluster.raft_local_state(1, 1).last_index; + let rx = cluster.async_put(b"k2", b"v2").unwrap(); + assert_disk_full(&block_on_timeout(rx, Duration::from_secs(2)).unwrap()); + let new_last_index = cluster.raft_local_state(1, 1).last_index; + assert_eq!(old_last_index, new_last_index); + + assert_region_leader_changed!(&cluster, 1, 1); + fail::remove(get_fp(usage, 1)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 1), "return").unwrap(); + + // merge/split is only allowed on disk almost full. + if usage != DiskUsage::AlreadyFull { + // Test split must be allowed when disk is full. + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k1"); + } + // Test transfer leader should be allowed. + cluster.must_transfer_leader(1, new_peer(2, 2)); + + // Transfer the leadership back to store 1. + fail::remove(get_fp(usage, 1)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 1), "return").unwrap(); + + // Test remove peer should be allowed. + cluster.pd_client.must_remove_peer(1, new_peer(3, 3)); + // Sleep for a while until the disk usage and peer changes have been synced. + thread::sleep(Duration::from_secs(1)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // Test add peer should be allowed. It must be a higher peer-id in v2. + cluster.pd_client.must_add_peer(1, new_peer(3, 4)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + fail::remove(get_fp(usage, 1)); + // Sleep for a while before next case to make it clear. + thread::sleep(Duration::from_secs(1)); } - // Test transfer leader should be allowed. - cluster.must_transfer_leader(1, new_peer(2, 2)); - - // Transfer the leadership back to store 1. - fail::remove(get_fp(usage, 1)); - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // Test remove peer should be allowed. 
- cluster.pd_client.must_remove_peer(1, new_peer(3, 3)); - must_get_none(&cluster.get_engine(3), b"k1"); - - // Test add peer should be allowed. - cluster.pd_client.must_add_peer(1, new_peer(3, 3)); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - fail::remove(get_fp(usage, 1)); -} - -#[test] -fn test_disk_full_for_region_leader() { - test_disk_full_leader_behaviors(DiskUsage::AlmostFull); - test_disk_full_leader_behaviors(DiskUsage::AlreadyFull); -} - -fn test_disk_full_follower_behaviors(usage: DiskUsage) { - let mut cluster = new_node_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // To ensure all replicas are not pending. - cluster.must_put(b"k1", b"v1"); - must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 2), "return").unwrap(); - - // Test followers will reject pre-transfer-leader command. - let epoch = cluster.get_region_epoch(1); - let transfer = new_admin_request(1, &epoch, new_transfer_leader_cmd(new_peer(2, 2))); - cluster - .call_command_on_leader(transfer, Duration::from_secs(3)) - .unwrap(); - assert_eq!(cluster.leader_of_region(1).unwrap(), new_peer(1, 1)); - cluster.must_put(b"k2", b"v2"); - - // Test leader shouldn't append entries to disk full followers. - let old_last_index = cluster.raft_local_state(1, 2).last_index; - cluster.must_put(b"k3", b"v3"); - let new_last_index = cluster.raft_local_state(1, 2).last_index; - assert_eq!(old_last_index, new_last_index); - must_get_none(&cluster.get_engine(2), b"k3"); - - // Test followers will response votes when disk is full. 
- cluster.add_send_filter(CloneFilterFactory( - RegionPacketFilter::new(1, 1) - .direction(Direction::Send) - .msg_type(MessageType::MsgRequestVoteResponse), - )); - cluster.must_transfer_leader(1, new_peer(3, 3)); - - fail::remove(get_fp(usage, 2)); } -#[test] -fn test_disk_full_for_region_follower() { - test_disk_full_follower_behaviors(DiskUsage::AlmostFull); - test_disk_full_follower_behaviors(DiskUsage::AlreadyFull); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_disk_full_follower_behaviors() { + for usage in [DiskUsage::AlmostFull, DiskUsage::AlreadyFull] { + let mut cluster = new_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // To ensure all replicas are not pending. + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 2), "return").unwrap(); + + // Test followers will reject pre-transfer-leader command. + let epoch = cluster.get_region_epoch(1); + let transfer = new_admin_request(1, &epoch, new_transfer_leader_cmd(new_peer(2, 2))); + cluster + .call_command_on_leader(transfer, Duration::from_secs(3)) + .unwrap(); + assert_eq!(cluster.leader_of_region(1).unwrap(), new_peer(1, 1)); + cluster.must_put(b"k2", b"v2"); + + // Test leader shouldn't append entries to disk full followers. + let old_last_index = cluster.raft_local_state(1, 2).last_index; + cluster.must_put(b"k3", b"v3"); + let new_last_index = cluster.raft_local_state(1, 2).last_index; + assert_eq!(old_last_index, new_last_index); + must_get_none(&cluster.get_engine(2), b"k3"); + + // Test followers will response votes when disk is full. 
+ cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 1) + .direction(Direction::Send) + .msg_type(MessageType::MsgRequestVoteResponse), + )); + cluster.must_transfer_leader(1, new_peer(3, 3)); + + fail::remove(get_fp(usage, 2)); + } } -fn test_disk_full_txn_behaviors(usage: DiskUsage) { - let mut cluster = new_server_cluster(0, 3); +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_disk_full_txn_behaviors() { + let usage = DiskUsage::AlmostFull; + let mut cluster = new_cluster(0, 3); cluster.pd_client.disable_default_operator(); cluster.run(); @@ -199,7 +198,7 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) { DiskFullOpt::NotAllowedOnFull, ); assert!(res.get_region_error().has_disk_full()); - assert_region_leader_changed(&mut cluster, 1, 1); + assert_region_leader_changed!(&cluster, 1, 1); fail::remove(get_fp(usage, 1)); cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -269,16 +268,13 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) { fail::remove(get_fp(usage, 1)); } -#[test] -fn test_disk_full_for_txn_operations() { - test_disk_full_txn_behaviors(DiskUsage::AlmostFull); -} - -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_majority_disk_full() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // To ensure the thread has full store disk usage infomation. cluster.cfg.raft_store.store_batch_system.pool_size = 1; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); // set gc duration for v2 cluster.pd_client.disable_default_operator(); cluster.run(); @@ -295,7 +291,7 @@ fn test_majority_disk_full() { // To ensure followers have reported disk usages to the leader. 
for i in 1..3 { fail::cfg(get_fp(DiskUsage::AlmostFull, i + 1), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); } // Normal proposals will be rejected because of majority peers' disk full. @@ -319,14 +315,14 @@ fn test_majority_disk_full() { // new disk usages are reported. for i in 1..3 { fail::remove(get_fp(DiskUsage::AlmostFull, i + 1)); - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); } // To ensure followers have reported disk usages to the leader. for i in 1..3 { fail::cfg(get_fp(DiskUsage::AlreadyFull, i + 1), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); } // Proposals with special `DiskFullOpt`s will still be rejected if majority @@ -342,10 +338,12 @@ fn test_majority_disk_full() { // Peer 2 disk usage changes from already full to almost full. fail::remove(get_fp(DiskUsage::AlreadyFull, 2)); fail::cfg(get_fp(DiskUsage::AlmostFull, 2), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, 2, 2, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, 2, 2, ®ion); - // Configuration change should be alloed. + // Configuration change should be allowed. cluster.pd_client.must_remove_peer(1, new_peer(2, 2)); + // Sleep for a while until the disk usage and peer changes have been synced. + thread::sleep(Duration::from_secs(1)); // After the last configuration change is applied, the raft group will be like // `[(1, DiskUsage::AlmostFull), (3, DiskUsage::AlreadyFull)]`. 
So no more @@ -364,9 +362,10 @@ fn test_majority_disk_full() { } } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_disk_full_followers_with_hibernate_regions() { - let mut cluster = new_node_cluster(0, 2); + let mut cluster = new_cluster(0, 2); // To ensure the thread has full store disk usage infomation. cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.pd_client.disable_default_operator(); @@ -391,31 +390,13 @@ fn test_disk_full_followers_with_hibernate_regions() { must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); } -// check the region new leader is elected. -fn assert_region_merged( - cluster: &mut Cluster, - left_region_key: &[u8], - right_region_key: &[u8], -) { - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("region merge failed"); - } - let region_left = cluster.get_region(left_region_key); - let region_right = cluster.get_region(right_region_key); - if region_left.get_id() != region_right.get_id() { - sleep_ms(10); - continue; - } else { - break; - } - } -} - -#[test] +// #[test_case(test_raftstore_v2::new_server_cluster)] +// FIXME: #[test_case(test_raftstore_v2::new_server_cluster)] +// In v2 `must_try_merge` always return error. Also the last `must_merge` +// sometimes cannot get an updated min_matched. +#[test_case(test_raftstore::new_server_cluster)] fn test_merge_on_majority_disk_full() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // To ensure the thread has full store disk usage infomation. 
cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.pd_client.disable_default_operator(); @@ -448,23 +429,42 @@ fn test_merge_on_majority_disk_full() { fail::cfg(get_fp(DiskUsage::AlmostFull, i), "return").unwrap(); } for peer in region1.get_peers().iter() { - ensure_disk_usage_is_reported(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion1); + ensure_disk_usage_is_reported!(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion1); } for peer in region2.get_peers().iter() { - ensure_disk_usage_is_reported(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion2); + ensure_disk_usage_is_reported!(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion2); } cluster.must_try_merge(region1.get_id(), region2.get_id()); - assert_region_merged(&mut cluster, b"k1", b"k3"); + + // check the region new leader is elected. + let assert_region_merged = |left_region_key: &[u8], right_region_key: &[u8]| { + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("region merge failed"); + } + let region_left = cluster.get_region(left_region_key); + let region_right = cluster.get_region(right_region_key); + if region_left.get_id() != region_right.get_id() { + sleep_ms(10); + continue; + } else { + break; + } + } + }; + assert_region_merged(b"k1", b"k3"); for i in 1..3 { fail::remove(get_fp(DiskUsage::AlmostFull, i)); } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_almost_and_already_full_behavior() { - let mut cluster = new_server_cluster(0, 5); + let mut cluster = new_cluster(0, 5); // To ensure the thread has full store disk usage infomation. 
cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.pd_client.disable_default_operator(); @@ -481,7 +481,7 @@ fn test_almost_and_already_full_behavior() { fail::cfg(get_fp(DiskUsage::AlreadyFull, i), "return").unwrap(); } for i in 1..5 { - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); } let lead_client = PeerClient::new(&cluster, 1, new_peer(1, 1)); @@ -521,29 +521,10 @@ fn test_almost_and_already_full_behavior() { } } -fn wait_down_peers_reported( - cluster: &Cluster, - total_down_count: u64, - target_report_peer: u64, -) { - let mut peers = cluster.get_down_peers(); - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("Leader cannot change when the only disk full node is leader"); - } - - if peers.len() == total_down_count as usize && peers.contains_key(&target_report_peer) { - return; - } - sleep_ms(10); - peers = cluster.get_down_peers(); - } -} - -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_down_node_when_disk_full() { - let mut cluster = new_server_cluster(0, 5); + let mut cluster = new_cluster(0, 5); // To ensure the thread has full store disk usage infomation. 
cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.cfg.raft_store.max_peer_down_duration = ReadableDuration::secs(1); @@ -555,7 +536,7 @@ fn test_down_node_when_disk_full() { let region = cluster.get_region(b"k1"); for i in 3..6 { fail::cfg(get_fp(DiskUsage::AlmostFull, i), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, i, i, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i, i, ®ion); } let lead_client = PeerClient::new(&cluster, 1, new_peer(1, 1)); @@ -574,7 +555,23 @@ fn test_down_node_when_disk_full() { ); cluster.stop_node(2); - wait_down_peers_reported(&cluster, 1, 2u64); + + let wait_down_peers_reported = |total_down_count: u64, target_report_peer: u64| { + let mut peers = cluster.get_down_peers(); + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("Leader cannot change when the only disk full node is leader"); + } + + if peers.len() == total_down_count as usize && peers.contains_key(&target_report_peer) { + return; + } + sleep_ms(10); + peers = cluster.get_down_peers(); + } + }; + wait_down_peers_reported(1u64, 2u64); let prewrite_ts = get_tso(&cluster.pd_client); let res = lead_client.try_kv_prewrite( diff --git a/tests/integrations/raftstore/test_stale_read.rs b/tests/integrations/raftstore/test_stale_read.rs index 24e13003f7e..5de9bda1f64 100644 --- a/tests/integrations/raftstore/test_stale_read.rs +++ b/tests/integrations/raftstore/test_stale_read.rs @@ -8,7 +8,7 @@ use kvproto::{ metapb::{Peer, Region}, tikvpb_grpc::TikvClient, }; -use test_raftstore::{must_get_equal, new_mutation, new_peer, PeerClient}; +use test_raftstore::{must_get_equal, new_mutation, new_peer}; use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, time::Instant}; From e29d3a989d73f0a1c1534114dc530d3c3200d38d Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 11 Oct 2023 15:09:25 +0800 Subject: [PATCH 084/220] raftstore-v2: fix non-deterministic region merge 
(#15697) close tikv/tikv#15682 This commit addresses the issue where a "region corrupted" error still occurs in certain scenarios despite PR #15625 resolving the problem in the transfer leader scenario. The root cause of the issue is the non-deterministic nature of commit merge and rollback merge, allowing transient errors during propose to trigger the problem again. To fix this issue, the proposed solution ensures that TiKV only initiates rollback merge when either the target region is not found or the epoch has increased. Signed-off-by: Neil Shen Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../operation/command/admin/merge/commit.rs | 106 +++++++++--------- .../tests/failpoints/test_merge.rs | 9 +- components/raftstore/src/store/peer.rs | 2 + tests/failpoints/cases/test_merge.rs | 94 +++++++++++++++- 4 files changed, 153 insertions(+), 58 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index e95a13600fb..b12ba9eaf9d 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -178,6 +178,11 @@ impl Peer { self.region_id() == 2, |_| {} ); + fail::fail_point!( + "ask_target_peer_to_commit_merge_store_1", + store_ctx.store_id == 1, + |_| {} + ); let state = self.applied_merge_state().unwrap(); let target = state.get_target(); let target_id = target.get_id(); @@ -295,7 +300,10 @@ impl Peer { target_id: self.region_id(), }, ); - } else if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { + return; + } + // current region_epoch > region epoch in commit merge. 
+ if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { info!( self.logger, "reject commit merge because of stale"; @@ -306,63 +314,51 @@ impl Peer { let _ = store_ctx .router .force_send(source_id, PeerMsg::RejectCommitMerge { index }); - } else if expected_epoch == region.get_region_epoch() { - assert!( - util::is_sibling_regions(source_region, region), - "{}: {:?}, {:?}", - SlogFormat(&self.logger), - source_region, - region - ); - assert!( - region_on_same_stores(source_region, region), - "{:?}, {:?}", - source_region, - region - ); - assert!(!self.storage().has_dirty_data()); - if self.is_leader() && !self.leader_transferring() { - let index = commit_of_merge(req.get_admin_request().get_commit_merge()); - if self.proposal_control().is_merging() { - // `on_admin_command` may delay our request indefinitely. It's better to check - // directly. - info!( - self.logger, - "reject commit merge because of target is merging with another region"; - ); - } else { - let (ch, res) = CmdResChannel::pair(); - self.on_admin_command(store_ctx, req, ch); - if let Some(res) = res.take_result() - && res.get_header().has_error() - { - error!( - self.logger, - "failed to propose commit merge"; - "source" => source_id, - "res" => ?res, - ); - } else { - fail::fail_point!("on_propose_commit_merge_success"); - return; - } - } - let _ = store_ctx - .router - .force_send(source_id, PeerMsg::RejectCommitMerge { index }); - } else if self.leader_transferring() { - info!( - self.logger, - "not to propose commit merge when transferring leader"; - "transferee" => self.leader_transferee(), - ); - } - } else { + return; + } + // current region_epoch < region epoch in commit merge. 
+ if util::is_epoch_stale(region.get_region_epoch(), expected_epoch) { info!( self.logger, - "ignore commit merge because self epoch is stale"; + "target region still not catch up, skip."; "source" => ?source_region, + "target_region_epoch" => ?expected_epoch, + "exist_region_epoch" => ?self.region().get_region_epoch(), ); + return; + } + assert!( + util::is_sibling_regions(source_region, region), + "{}: {:?}, {:?}", + SlogFormat(&self.logger), + source_region, + region + ); + assert!( + region_on_same_stores(source_region, region), + "{:?}, {:?}", + source_region, + region + ); + assert!(!self.storage().has_dirty_data()); + let (ch, res) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + if let Some(res) = res.take_result() + && res.get_header().has_error() + { + error!( + self.logger, + "failed to propose commit merge"; + "source" => source_id, + "res" => ?res, + ); + fail::fail_point!( + "on_propose_commit_merge_fail_store_1", + store_ctx.store_id == 1, + |_| {} + ); + } else { + fail::fail_point!("on_propose_commit_merge_success"); } } @@ -691,6 +687,8 @@ impl Peer { info!( self.logger, "become follower for new logs"; + "first_log_term" => first.term, + "first_log_index" => first.index, "new_log_term" => last_log.term, "new_log_index" => last_log.index, "term" => self.term(), diff --git a/components/raftstore-v2/tests/failpoints/test_merge.rs b/components/raftstore-v2/tests/failpoints/test_merge.rs index 890b8c5e27a..11fe666b49b 100644 --- a/components/raftstore-v2/tests/failpoints/test_merge.rs +++ b/components/raftstore-v2/tests/failpoints/test_merge.rs @@ -7,7 +7,7 @@ use std::{ use engine_traits::Peekable; use raftstore_v2::router::{PeerMsg, PeerTick}; -use tikv_util::store::new_peer; +use tikv_util::{config::ReadableDuration, info, store::new_peer}; use crate::cluster::{ life_helper::assert_peer_not_exist, @@ -179,7 +179,9 @@ fn test_rollback() { // Target is merging. 
#[test] fn test_merge_conflict_0() { - let mut cluster = Cluster::default(); + let mut cluster = Cluster::with_configs(1, None, None, |cfg| { + cfg.merge_check_tick_interval = ReadableDuration::millis(100); + }); let store_id = cluster.node(0).id(); let router = &mut cluster.routers[0]; @@ -216,6 +218,7 @@ fn test_merge_conflict_0() { format!("k{}", region_3_id).as_bytes(), false, ); + info!("regions: {:?}, {:?}, {:?}", region_1, region_2, region_3); // pause merge progress of 2+3. let fp = fail::FailGuard::new("apply_commit_merge", "pause"); @@ -236,9 +239,9 @@ fn test_merge_conflict_0() { .unwrap(); let region_2 = cluster.routers[0].region_detail(region_2.get_id()); merge_region(&cluster, 0, region_1, peer_1, region_2, false); + drop(fp); // wait for rollback. rx.recv_timeout(std::time::Duration::from_secs(1)).unwrap(); - drop(fp); fail::remove("apply_rollback_merge"); // Check region 1 is not merged and can serve writes. diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index e9350ba7bb0..85b8798bfb1 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1086,6 +1086,8 @@ where // of term explicitly to get correct metadata. 
info!( "become follower for new logs"; + "first_log_term" => first.term, + "first_log_index" => first.index, "new_log_term" => last_log.term, "new_log_index" => last_log.index, "term" => self.term(), diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 861e4a658ce..ffbd69dc05e 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -20,7 +20,7 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::store::*; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::router::{PeerMsg, PeerTick}; use test_raftstore::*; use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; @@ -1848,6 +1848,98 @@ fn test_concurrent_between_transfer_leader_and_merge() { cluster.must_put(b"k4", b"v4"); } +#[test] +fn test_deterministic_commit_rollback_merge() { + use test_raftstore_v2::*; + let mut cluster = new_node_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + // Use a large election tick to stable test. + configure_for_lease_read(&mut cluster.cfg, None, Some(1000)); + // Use 2 threads for polling peers, so that they can run concurrently. 
+ cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.run(); + + let pd_client = Arc::clone(&cluster.pd_client); + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k3").unwrap(); + let right_1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_1); + let left_2 = find_peer(&left, 2).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_2); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + for i in 0..3 { + must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); + } + + // Delay 1003 apply by dropping append response, so that proposal will fail + // due to applied_term != current_term. + let target_region_id = left.get_id(); + cluster.add_recv_filter_on_node( + 1, + Box::new(DropMessageFilter::new(Arc::new(move |m| { + if m.get_region_id() == target_region_id { + return m.get_message().get_msg_type() != MessageType::MsgAppendResponse; + } + true + }))), + ); + + let left_1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_1); + + // left(1000) <- right(1). + let (tx1, rx1) = channel(); + let (tx2, rx2) = channel(); + let tx1 = Mutex::new(tx1); + let rx2 = Mutex::new(rx2); + fail::cfg_callback("on_propose_commit_merge_fail_store_1", move || { + tx1.lock().unwrap().send(()).unwrap(); + rx2.lock().unwrap().recv().unwrap(); + }) + .unwrap(); + cluster.merge_region(right.get_id(), left.get_id(), Callback::None); + + // Wait for target fails to propose commit merge. + rx1.recv_timeout(Duration::from_secs(5)).unwrap(); + // Let target apply continue, and new AskCommitMerge messages will propose + // commit merge successfully. 
+ cluster.clear_recv_filter_on_node(1); + + // Trigger a CheckMerge tick, so source will send a AskCommitMerge again. + fail::cfg("ask_target_peer_to_commit_merge_store_1", "pause").unwrap(); + let router = cluster.get_router(1).unwrap(); + router + .check_send(1, PeerMsg::Tick(PeerTick::CheckMerge)) + .unwrap(); + + // Send RejectCommitMerge to source. + tx2.send(()).unwrap(); + fail::remove("on_propose_commit_merge_fail_store_1"); + + // Wait for target applies to current term. + cluster.must_put(b"k1", b"v11"); + + // By remove the failpoint, CheckMerge tick sends a AskCommitMerge again. + fail::remove("ask_target_peer_to_commit_merge_store_1"); + // At this point, source region will propose rollback merge if commit merge + // is not deterministic. + + // Wait for source handle commit or rollback merge. + wait_region_epoch_change(&cluster, &left, Duration::from_secs(5)); + + // No matter commit merge or rollback merge, cluster must be available to + // process requests + cluster.must_put(b"k0", b"v0"); + cluster.must_put(b"k4", b"v4"); +} + struct MsgVoteFilter {} impl Filter for MsgVoteFilter { From 08a2d654549105104bb701179586256402dbcadd Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 12 Oct 2023 12:10:55 +0800 Subject: [PATCH 085/220] coprocessor: do not treat deadline exceeded error as other error (#15709) ref tikv/tikv#15566 Signed-off-by: glorv --- src/coprocessor/dag/mod.rs | 50 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index 31a6df181d5..bd077c5c0ba 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -143,7 +143,9 @@ fn handle_qe_response( can_be_cached: bool, data_version: Option, ) -> Result { - use tidb_query_common::error::ErrorInner; + use tidb_query_common::error::{ErrorInner, EvaluateError}; + + use crate::coprocessor::Error; match result { Ok((sel_resp, range)) => { @@ -162,6 +164,7 @@ fn handle_qe_response( 
} Err(err) => match *err.0 { ErrorInner::Storage(err) => Err(err.into()), + ErrorInner::Evaluate(EvaluateError::DeadlineExceeded) => Err(Error::DeadlineExceeded), ErrorInner::Evaluate(err) => { let mut resp = Response::default(); let mut sel_resp = SelectResponse::default(); @@ -179,7 +182,9 @@ fn handle_qe_response( fn handle_qe_stream_response( result: tidb_query_common::Result<(Option<(StreamResponse, IntervalRange)>, bool)>, ) -> Result<(Option, bool)> { - use tidb_query_common::error::ErrorInner; + use tidb_query_common::error::{ErrorInner, EvaluateError}; + + use crate::coprocessor::Error; match result { Ok((Some((s_resp, range)), finished)) => { @@ -192,6 +197,7 @@ fn handle_qe_stream_response( Ok((None, finished)) => Ok((None, finished)), Err(err) => match *err.0 { ErrorInner::Storage(err) => Err(err.into()), + ErrorInner::Evaluate(EvaluateError::DeadlineExceeded) => Err(Error::DeadlineExceeded), ErrorInner::Evaluate(err) => { let mut resp = Response::default(); let mut s_resp = StreamResponse::default(); @@ -203,3 +209,43 @@ fn handle_qe_stream_response( }, } } + +#[cfg(test)] +mod tests { + use anyhow::anyhow; + use protobuf::Message; + use tidb_query_common::error::{Error as CommonError, EvaluateError, StorageError}; + + use super::*; + use crate::coprocessor::Error; + + #[test] + fn test_handle_qe_response() { + // Ok Response + let ok_res = Ok((SelectResponse::default(), None)); + let res = handle_qe_response(ok_res, true, Some(1)).unwrap(); + assert!(res.can_be_cached); + assert_eq!(res.get_cache_last_version(), 1); + let mut select_res = SelectResponse::new(); + Message::merge_from_bytes(&mut select_res, res.get_data()).unwrap(); + assert!(!select_res.has_error()); + + // Storage Error + let storage_err = CommonError::from(StorageError(anyhow!("unknown"))); + let res = handle_qe_response(Err(storage_err), false, None); + assert!(matches!(res, Err(Error::Other(_)))); + + // Evaluate Error + let err = CommonError::from(EvaluateError::DeadlineExceeded); 
+ let res = handle_qe_response(Err(err), false, None); + assert!(matches!(res, Err(Error::DeadlineExceeded))); + + let err = CommonError::from(EvaluateError::InvalidCharacterString { + charset: "test".into(), + }); + let res = handle_qe_response(Err(err), false, None).unwrap(); + let mut select_res = SelectResponse::new(); + Message::merge_from_bytes(&mut select_res, res.get_data()).unwrap(); + assert_eq!(select_res.get_error().get_code(), 1300); + } +} From 2d7616e3f8e3d254bbfa8d82f3980547073d1948 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 12 Oct 2023 12:25:55 +0800 Subject: [PATCH 086/220] raftstore-v2: adjust max-background-flushes default value (#15723) ref tikv/tikv#14470 Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/config/mod.rs | 185 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 129 insertions(+), 56 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 0eb006363f0..74f25a22ef6 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -244,22 +244,30 @@ const RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS: BackgroundJobLimits = BackgroundJobL // `defaults` serves as an upper bound for returning limits. fn get_background_job_limits_impl( + engine_type: EngineType, cpu_num: u32, defaults: &BackgroundJobLimits, ) -> BackgroundJobLimits { // At the minimum, we should have two background jobs: one for flush and one for // compaction. Otherwise, the number of background jobs should not exceed // cpu_num - 1. - let max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); + let mut max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); // Scale flush threads proportionally to cpu cores. Also make sure the number of // flush threads doesn't exceed total jobs. 
let max_background_flushes = cmp::min( (max_background_jobs + 3) / 4, defaults.max_background_flushes, ); - // Cap max_sub_compactions to allow at least two compactions. - let max_compactions = max_background_jobs - max_background_flushes; + + // set the default compaction threads differently for v1 and v2: + // v1: cap max_sub_compactions to allow at least two compactions. + // v2: decrease the compaction threads to make the qps more stable. + let max_compactions = match engine_type { + EngineType::RaftKv => max_background_jobs - max_background_flushes, + EngineType::RaftKv2 => (max_background_jobs + 7) / 8, + }; let max_sub_compactions: u32 = (max_compactions - 1).clamp(1, defaults.max_sub_compactions); + max_background_jobs = max_background_flushes + max_compactions; // Maximum background GC threads for Titan let max_titan_background_gc = cmp::min(defaults.max_titan_background_gc, cpu_num); @@ -271,9 +279,12 @@ fn get_background_job_limits_impl( } } -fn get_background_job_limits(defaults: &BackgroundJobLimits) -> BackgroundJobLimits { +fn get_background_job_limits( + engine_type: EngineType, + defaults: &BackgroundJobLimits, +) -> BackgroundJobLimits { let cpu_num = cmp::max(SysQuota::cpu_cores_quota() as u32, 1); - get_background_job_limits_impl(cpu_num, defaults) + get_background_job_limits_impl(engine_type, cpu_num, defaults) } macro_rules! 
cf_config { @@ -1308,19 +1319,14 @@ pub struct DbResources { impl Default for DbConfig { fn default() -> DbConfig { - let bg_job_limits = get_background_job_limits(&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); - let titan_config = TitanDbConfig { - max_background_gc: bg_job_limits.max_titan_background_gc as i32, - ..Default::default() - }; DbConfig { wal_recovery_mode: DBRecoveryMode::PointInTime, wal_dir: "".to_owned(), wal_ttl_seconds: 0, wal_size_limit: ReadableSize::kb(0), max_total_wal_size: None, - max_background_jobs: bg_job_limits.max_background_jobs as i32, - max_background_flushes: bg_job_limits.max_background_flushes as i32, + max_background_jobs: 0, + max_background_flushes: 0, max_manifest_file_size: ReadableSize::mb(128), create_if_missing: true, max_open_files: 40960, @@ -1339,7 +1345,7 @@ impl Default for DbConfig { rate_limiter_auto_tuned: true, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(512), - max_sub_compactions: bg_job_limits.max_sub_compactions, + max_sub_compactions: 0, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: false, @@ -1354,7 +1360,7 @@ impl Default for DbConfig { writecf: WriteCfConfig::default(), lockcf: LockCfConfig::default(), raftcf: RaftCfConfig::default(), - titan: titan_config, + titan: TitanDbConfig::default(), } } } @@ -1410,6 +1416,19 @@ impl DbConfig { .get_or_insert(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT); } } + let bg_job_limits = get_background_job_limits(engine, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); + if self.max_background_jobs == 0 { + self.max_background_jobs = bg_job_limits.max_background_jobs as i32; + } + if self.max_background_flushes == 0 { + self.max_background_flushes = bg_job_limits.max_background_flushes as i32; + } + if self.max_sub_compactions == 0 { + self.max_sub_compactions = bg_job_limits.max_sub_compactions; + } + if self.titan.max_background_gc == 0 { + self.titan.max_background_gc = 
bg_job_limits.max_titan_background_gc as i32; + } } pub fn build_resources(&self, env: Arc, engine: EngineType) -> DbResources { @@ -1807,7 +1826,9 @@ pub struct RaftDbConfig { impl Default for RaftDbConfig { fn default() -> RaftDbConfig { - let bg_job_limits = get_background_job_limits(&RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS); + // raftdb should only be used for raftkv + let bg_job_limits = + get_background_job_limits(EngineType::RaftKv, &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS); let titan_config = TitanDbConfig { max_background_gc: bg_job_limits.max_titan_background_gc as i32, ..Default::default() @@ -5913,61 +5934,95 @@ mod tests { #[test] fn test_background_job_limits() { - // cpu num = 1 + for engine in [EngineType::RaftKv, EngineType::RaftKv2] { + // cpu num = 1 + assert_eq!( + get_background_job_limits_impl( + engine, + 1, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 1, + } + ); + assert_eq!( + get_background_job_limits_impl( + engine, + 1, // cpu_num + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 1, + } + ); + // cpu num = 2 + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, + 2, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 2, + } + ); + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, + 2, // cpu_num + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 2, + } + ); + } + + // cpu num = 4 assert_eq!( get_background_job_limits_impl( - 1, // cpu_num + EngineType::RaftKv, + 4, // cpu_num 
&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 2, - max_background_flushes: 1, - max_sub_compactions: 1, - max_titan_background_gc: 1, - } - ); - assert_eq!( - get_background_job_limits_impl( - 1, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS - ), - BackgroundJobLimits { - max_background_jobs: 2, + max_background_jobs: 3, max_background_flushes: 1, max_sub_compactions: 1, - max_titan_background_gc: 1, + max_titan_background_gc: 4, } ); - // cpu num = 2 assert_eq!( get_background_job_limits_impl( - 2, // cpu_num + EngineType::RaftKv2, + 4, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { max_background_jobs: 2, max_background_flushes: 1, max_sub_compactions: 1, - max_titan_background_gc: 2, - } - ); - assert_eq!( - get_background_job_limits_impl( - 2, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS - ), - BackgroundJobLimits { - max_background_jobs: 2, - max_background_flushes: 1, - max_sub_compactions: 1, - max_titan_background_gc: 2, + max_titan_background_gc: 4, } ); - // cpu num = 4 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 4, // cpu_num - &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { max_background_jobs: 3, @@ -5976,33 +6031,36 @@ mod tests { max_titan_background_gc: 4, } ); + // cpu num = 8 assert_eq!( get_background_job_limits_impl( - 4, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + EngineType::RaftKv, + 8, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 3, - max_background_flushes: 1, - max_sub_compactions: 1, + max_background_jobs: 7, + max_background_flushes: 2, + max_sub_compactions: 3, max_titan_background_gc: 4, } ); - // cpu num = 8 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv2, 8, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 7, + max_background_jobs: 3, max_background_flushes: 2, - 
max_sub_compactions: 3, + max_sub_compactions: 1, max_titan_background_gc: 4, } ); assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 8, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -6011,6 +6069,7 @@ mod tests { // cpu num = 16 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 16, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -6018,6 +6077,20 @@ mod tests { ); assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv2, + 16, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 5, + max_background_flushes: 3, + max_sub_compactions: 1, + max_titan_background_gc: 4, + } + ); + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, 16, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), From 272fcd04f645479c4fdc265e3083c250796c60df Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:18:25 +0800 Subject: [PATCH 087/220] raftstore-v2: avoid follower forwarding propose msg (#15704) ref tikv/tikv#14390 avoid follower forwarding propose msg Signed-off-by: SpadeA-Tang Co-authored-by: tonyxuqqi --- .../src/operation/command/write/mod.rs | 31 ++---- components/raftstore-v2/src/operation/mod.rs | 4 +- components/raftstore/src/store/fsm/apply.rs | 1 - .../raftstore/src/store/simple_write.rs | 30 +----- tests/failpoints/cases/test_transaction.rs | 101 +++++++++++++++++- 5 files changed, 112 insertions(+), 55 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 6eacc75c0f1..cc71533a29a 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -12,7 +12,7 @@ use raftstore::{ fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, metrics::PEER_WRITE_CMD_COUNTER, msg::ErrorCallback, - util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, 
+ util::{self}, }, Error, Result, }; @@ -80,13 +80,10 @@ impl Peer { ch.report_error(resp); return; } - // ProposalControl is reliable only when applied to current term. - let call_proposed_on_success = self.applied_to_current_term(); let mut encoder = SimpleWriteReqEncoder::new( header, data, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, - call_proposed_on_success, ); encoder.add_response_channel(ch); self.set_has_ready(); @@ -106,7 +103,6 @@ impl Peer { Box::::default(), data, ctx.cfg.raft_entry_max_size.0 as usize, - false, ) .encode() .0 @@ -118,30 +114,17 @@ impl Peer { pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { if let Some(encoder) = self.simple_write_encoder_mut().take() { - let call_proposed_on_success = if encoder.notify_proposed() { - // The request has pass conflict check and called all proposed callbacks. + let header = encoder.header(); + let res = self.validate_command(header, None, &mut ctx.raft_metrics); + let call_proposed_on_success = if matches!(res, Err(Error::EpochNotMatch { .. })) { false } else { - // Epoch may have changed since last check. - let from_epoch = encoder.header().get_region_epoch(); - let res = util::compare_region_epoch( - from_epoch, - self.region(), - NORMAL_REQ_CHECK_CONF_VER, - NORMAL_REQ_CHECK_VER, - true, - ); - if let Err(e) = res { - // TODO: query sibling regions. - ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); - encoder.encode().1.report_error(cmd_resp::new_error(e)); - return; - } - // Only when it applies to current term, the epoch check can be reliable. 
self.applied_to_current_term() }; + let (data, chs) = encoder.encode(); - let res = self.propose(ctx, data); + let res = res.and_then(|_| self.propose(ctx, data)); + fail_point!("after_propose_pending_writes"); self.post_propose_command(ctx, res, chs, call_proposed_on_success); diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 8ce592dd753..6d5cba9fff8 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -87,7 +87,7 @@ pub mod test_util { let mut header = Box::::default(); header.set_region_id(region_id); header.set_region_epoch(region_epoch); - let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512); let (bin, _) = req_encoder.encode(); let mut e = Entry::default(); e.set_entry_type(EntryType::EntryNormal); @@ -112,7 +112,7 @@ pub mod test_util { let mut header = Box::::default(); header.set_region_id(region_id); header.set_region_epoch(region_epoch); - let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512); let (bin, _) = req_encoder.encode(); let mut e = Entry::default(); e.set_entry_type(EntryType::EntryNormal); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index c170e5a35f9..038171d9715 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -5745,7 +5745,6 @@ mod tests { self.header.clone(), bin, 1000, - false, ); let (bytes, _) = req_encoder.encode(); self.entry.set_data(bytes.into()); diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index a303a586935..dd461e61867 100644 --- a/components/raftstore/src/store/simple_write.rs +++ 
b/components/raftstore/src/store/simple_write.rs @@ -49,7 +49,6 @@ where channels: Vec, size_limit: usize, write_type: WriteType, - notify_proposed: bool, } impl SimpleWriteReqEncoder @@ -57,14 +56,10 @@ where C: ErrorCallback + WriteCallback, { /// Create a request encoder. - /// - /// If `notify_proposed` is true, channels will be called `notify_proposed` - /// when it's appended. pub fn new( header: Box, bin: SimpleWriteBinary, size_limit: usize, - notify_proposed: bool, ) -> SimpleWriteReqEncoder { let mut buf = Vec::with_capacity(256); buf.push(MAGIC_PREFIX); @@ -77,7 +72,6 @@ where channels: vec![], size_limit, write_type: bin.write_type, - notify_proposed, } } @@ -112,18 +106,10 @@ where } #[inline] - pub fn add_response_channel(&mut self, mut ch: C) { - if self.notify_proposed { - ch.notify_proposed(); - } + pub fn add_response_channel(&mut self, ch: C) { self.channels.push(ch); } - #[inline] - pub fn notify_proposed(&self) -> bool { - self.notify_proposed - } - #[inline] pub fn header(&self) -> &RaftRequestHeader { &self.header @@ -558,7 +544,6 @@ mod tests { header.clone(), bin, usize::MAX, - false, ); let mut encoder = SimpleWriteEncoder::with_capacity(512); @@ -570,7 +555,6 @@ mod tests { header.clone(), bin, 0, - false, ); let (bytes, _) = req_encoder.encode(); @@ -619,9 +603,8 @@ mod tests { .collect(); encoder.ingest(exp.clone()); let bin = encoder.encode(); - let req_encoder = SimpleWriteReqEncoder::>::new( - header, bin, 0, false, - ); + let req_encoder = + SimpleWriteReqEncoder::>::new(header, bin, 0); let (bytes, _) = req_encoder.encode(); let mut decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); @@ -683,7 +666,6 @@ mod tests { header.clone(), bin.clone(), 512, - false, ); let mut header2 = Box::::default(); @@ -700,7 +682,6 @@ mod tests { header.clone(), bin2.clone(), 512, - false, ); assert!(!req_encoder2.amend(&header, &bin)); @@ -735,7 +716,6 @@ mod tests { header.clone(), 
SimpleWriteEncoder::with_capacity(512).encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); assert_eq!( @@ -753,7 +733,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -771,7 +750,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -788,7 +766,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -816,7 +793,6 @@ mod tests { header, encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 14f4161c7ae..0b6e6269e95 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -2,6 +2,7 @@ use std::{ sync::{ + atomic::{AtomicBool, Ordering}, mpsc::{channel, sync_channel}, Arc, Mutex, }, @@ -9,13 +10,15 @@ use std::{ time::Duration, }; -use futures::executor::block_on; +use engine_traits::CF_DEFAULT; +use futures::{executor::block_on, StreamExt}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ kvrpcpb::{ self as pb, AssertionLevel, Context, GetRequest, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, }, + raft_serverpb::RaftMessage, tikvpb::TikvClient, }; use raft::prelude::{ConfChangeType, MessageType}; @@ -45,7 +48,9 @@ use tikv::{ Snapshot, TestEngineBuilder, TestStorageBuilderApiV1, }, }; +use tikv_kv::{Engine, Modify, WriteData, WriteEvent}; use tikv_util::{ + config::ReadableDuration, store::{new_peer, peer::new_incoming_voter}, HandyRwLock, }; @@ -803,3 +808,97 @@ fn 
test_next_last_change_info_called_when_gc() { assert_eq!(h.join().unwrap().unwrap().as_slice(), b"v"); } + +fn must_put(ctx: &Context, engine: &E, key: &[u8], value: &[u8]) { + engine.put(ctx, Key::from_raw(key), value.to_vec()).unwrap(); +} + +fn must_delete(ctx: &Context, engine: &E, key: &[u8]) { + engine.delete(ctx, Key::from_raw(key)).unwrap(); +} + +// Before the fix, a proposal can be proposed twice, which is caused by that +// write proposal validation and propose are not atomic. So a raft message with +// higher term between them can make the proposal goes to msg proposal +// forwarding logic. However, raft proposal forawrd logic is not compatible with +// the raft store, as the failed proposal makes client retry. The retried +// proposal coupled with forward proposal makes the propsal applied twice. +#[test] +fn test_forbid_forward_propose() { + use test_raftstore_v2::*; + let count = 3; + let mut cluster = new_server_cluster(0, count); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.run(); + + let region = cluster.get_region(b""); + let peer1 = new_peer(1, 1); + let peer2 = new_peer(2, 2); + cluster.must_transfer_leader(region.id, peer2.clone()); + let storage = cluster.sim.rl().storages[&1].clone(); + let storage2 = cluster.sim.rl().storages[&2].clone(); + + let p = Arc::new(AtomicBool::new(false)); + let p2 = p.clone(); + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + cluster.add_recv_filter_on_node( + 2, + Box::new(DropMessageFilter::new(Arc::new(move |_| { + if p2.load(Ordering::Relaxed) { + tx.lock().unwrap().send(()).unwrap(); + // One msg is enough + p2.store(false, Ordering::Relaxed); + true + } else { + false + } + }))), + ); + + let k = Key::from_raw(b"k"); + let mut ctx = Context::default(); + ctx.set_region_id(region.get_id()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(peer2); + + // block node when 
collecting message to make async write proposal and a raft + // message with higher term occured in a single batch. + fail::cfg("on_peer_collect_message_2", "pause").unwrap(); + let mut res = storage2.async_write( + &ctx, + WriteData::from_modifies(vec![Modify::Put(CF_DEFAULT, k.clone(), b"val".to_vec())]), + WriteEvent::EVENT_PROPOSED, + None, + ); + + // Make node 1 become leader + let router = cluster.get_router(1).unwrap(); + let mut raft_msg = RaftMessage::default(); + raft_msg.set_region_id(1); + raft_msg.set_to_peer(peer1.clone()); + raft_msg.set_region_epoch(region.get_region_epoch().clone()); + raft_msg + .mut_message() + .set_msg_type(MessageType::MsgTimeoutNow); + router.send_raft_message(Box::new(raft_msg)).unwrap(); + + std::thread::sleep(Duration::from_secs(1)); + + ctx.set_peer(peer1); + must_put(&ctx, &storage, b"k", b"val"); + must_delete(&ctx, &storage, b"k"); + + p.store(true, Ordering::Release); + rx.recv().unwrap(); + // Ensure the msg is sent by router. + std::thread::sleep(Duration::from_millis(100)); + fail::remove("on_peer_collect_message_2"); + + let r = block_on(async { res.next().await }).unwrap(); + assert!(matches!(r, WriteEvent::Finished(Err { .. }))); + + std::thread::sleep(Duration::from_secs(1)); + assert_eq!(cluster.get(k.as_encoded()), None); +} From b3ffab6d4e4fc3278eec51df88b8571724ba12c5 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Thu, 12 Oct 2023 19:34:56 +0800 Subject: [PATCH 088/220] [Dynamic Regions] Supplement extra uts for test_storage.rs. (#15750) ref tikv/tikv#15409 This pr contains several necessary uts and some enhancements for `raftstore-v2`: - Supply extra test cases, including integration tests and unit tests for raftstore-v2 on `storage`. - Transplant the necessary options on setting `deadline` for `SimpleWrite` in raftstore-v2. 
Signed-off-by: lucasliang --- components/raftstore-v2/src/fsm/peer.rs | 2 +- .../src/operation/command/write/mod.rs | 24 ++++-- .../raftstore-v2/src/operation/txn_ext.rs | 9 ++- components/raftstore-v2/src/router/message.rs | 11 ++- components/test_raftstore-v2/src/cluster.rs | 11 ++- src/server/raftkv2/mod.rs | 50 +++++++++---- tests/failpoints/cases/test_storage.rs | 73 +++++++++++-------- 7 files changed, 117 insertions(+), 63 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 6896f8caa5e..1734b46b25a 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -267,7 +267,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, write.header, write.data, write.ch, - Some(write.disk_full_opt), + Some(write.extra_opts), ); } PeerMsg::UnsafeWrite(write) => { diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index cc71533a29a..5806614e192 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -5,7 +5,7 @@ use engine_traits::{ }; use fail::fail_point; use futures::channel::oneshot; -use kvproto::{kvrpcpb::DiskFullOpt, raft_cmdpb::RaftRequestHeader}; +use kvproto::raft_cmdpb::RaftRequestHeader; use raftstore::{ store::{ cmd_resp, @@ -13,6 +13,7 @@ use raftstore::{ metrics::PEER_WRITE_CMD_COUNTER, msg::ErrorCallback, util::{self}, + RaftCmdExtraOpts, }, Error, Result, }; @@ -42,7 +43,7 @@ impl Peer { header: Box, data: SimpleWriteBinary, ch: CmdResChannel, - disk_full_opt: Option, + extra_opts: Option, ) { if !self.serving() { apply::notify_req_region_removed(self.region_id(), ch); @@ -60,12 +61,19 @@ impl Peer { ch.report_error(resp); return; } - // Check whether the write request can be proposed with the given disk full - // option. 
- if let Some(opt) = disk_full_opt && let Err(e) = self.check_proposal_with_disk_full_opt(ctx, opt) { - let resp = cmd_resp::new_error(e); - ch.report_error(resp); - return; + if let Some(opts) = extra_opts { + if let Some(Err(e)) = opts.deadline.map(|deadline| deadline.check()) { + let resp = cmd_resp::new_error(e.into()); + ch.report_error(resp); + return; + } + // Check whether the write request can be proposed with the given disk full + // option. + if let Err(e) = self.check_proposal_with_disk_full_opt(ctx, opts.disk_full_opt) { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } } // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 4c875a675ef..6a379b9a1a2 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -17,7 +17,7 @@ use kvproto::{ use parking_lot::RwLockWriteGuard; use raft::eraftpb; use raftstore::store::{ - LocksStatus, PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, + LocksStatus, PeerPessimisticLocks, RaftCmdExtraOpts, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, }; use slog::{error, info, Logger}; @@ -270,13 +270,16 @@ impl Peer { self.logger, "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write_with_opt(header, encoder.encode(), DiskFullOpt::AllowedOnAlmostFull).0 else {unreachable!()}; + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write_with_opt(header, encoder.encode(), RaftCmdExtraOpts { + disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, + ..Default::default() + }).0 else {unreachable!()}; self.on_simple_write( ctx, write.header, write.data, write.ch, - Some(write.disk_full_opt), + Some(write.extra_opts), ); true } diff --git a/components/raftstore-v2/src/router/message.rs 
b/components/raftstore-v2/src/router/message.rs index 830286bb142..c9da5241fa8 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -6,7 +6,6 @@ use std::sync::{mpsc::SyncSender, Arc}; use collections::HashSet; use kvproto::{ import_sstpb::SstMeta, - kvrpcpb::DiskFullOpt, metapb, metapb::RegionEpoch, pdpb, @@ -15,7 +14,7 @@ use kvproto::{ }; use raftstore::store::{ fsm::ChangeObserver, metrics::RaftEventDurationType, simple_write::SimpleWriteBinary, - util::LatencyInspector, FetchedLogs, GenSnapRes, TabletSnapKey, + util::LatencyInspector, FetchedLogs, GenSnapRes, RaftCmdExtraOpts, TabletSnapKey, UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, }; @@ -135,7 +134,7 @@ pub struct SimpleWrite { pub header: Box, pub data: SimpleWriteBinary, pub ch: CmdResChannel, - pub disk_full_opt: DiskFullOpt, + pub extra_opts: RaftCmdExtraOpts, } #[derive(Debug)] @@ -299,13 +298,13 @@ impl PeerMsg { header: Box, data: SimpleWriteBinary, ) -> (Self, CmdResSubscriber) { - PeerMsg::simple_write_with_opt(header, data, DiskFullOpt::default()) + PeerMsg::simple_write_with_opt(header, data, RaftCmdExtraOpts::default()) } pub fn simple_write_with_opt( header: Box, data: SimpleWriteBinary, - disk_full_opt: DiskFullOpt, + extra_opts: RaftCmdExtraOpts, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -314,7 +313,7 @@ impl PeerMsg { header, data, ch, - disk_full_opt, + extra_opts, }), sub, ) diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 496f8cc87dc..53ff2c0f0b6 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -69,6 +69,9 @@ use tikv_util::{ }; use txn_types::WriteBatchFlags; +// MAX duration waiting for releasing store metas, default: 10s. 
+const MAX_WAIT_RELEASE_INTERVAL: u32 = 1000; + // We simulate 3 or 5 nodes, each has a store. // Sometimes, we use fixed id to test, which means the id // isn't allocated by pd, and node id, store id are same. @@ -328,7 +331,7 @@ pub trait Simulator { PeerMsg::simple_write_with_opt( Box::new(request.take_header()), write_encoder.encode(), - opts.disk_full_opt, + opts, ) }; @@ -1874,15 +1877,17 @@ impl, EK: KvEngine> Cluster { } self.leaders.clear(); for store_meta in self.store_metas.values() { - while Arc::strong_count(store_meta) != 1 { + // Limits the loop count of checking. + let mut idx = 0; + while Arc::strong_count(store_meta) != 1 && idx < MAX_WAIT_RELEASE_INTERVAL { std::thread::sleep(Duration::from_millis(10)); + idx += 1; } } self.store_metas.clear(); for sst_worker in self.sst_workers.drain(..) { sst_worker.stop_worker(); } - debug!("all nodes are shut down."); } diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index a80cdda392f..a9f7eb7586e 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -19,7 +19,13 @@ use kvproto::{ }; pub use node::NodeV2; pub use raft_extension::Extension; -use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; +use raftstore::{ + store::{ + cmd_resp, msg::ErrorCallback, util::encode_start_ts_into_flag_data, RaftCmdExtraOpts, + RegionSnapshot, + }, + Error, +}; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, @@ -265,6 +271,17 @@ impl tikv_kv::Engine for RaftKv2 { let region_id = ctx.region_id; ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); + + let inject_region_not_found = (|| { + // If rid is some, only the specified region reports error. + // If rid is None, all regions report error. 
+ fail_point!("raftkv_early_error_report", |rid| -> bool { + rid.and_then(|rid| rid.parse().ok()) + .map_or(true, |rid: u64| rid == region_id) + }); + false + })(); + let begin_instant = Instant::now_coarse(); let mut header = Box::new(new_request_header(ctx)); let mut flags = 0; @@ -299,18 +316,25 @@ impl tikv_kv::Engine for RaftKv2 { }); } let (ch, sub) = builder.build(); - let msg = PeerMsg::SimpleWrite(SimpleWrite { - header, - data, - ch, - send_time: Instant::now_coarse(), - disk_full_opt: batch.disk_full_opt, - }); - let res = self - .router - .store_router() - .check_send(region_id, msg) - .map_err(tikv_kv::Error::from); + let res = if inject_region_not_found { + ch.report_error(cmd_resp::new_error(Error::RegionNotFound(region_id))); + Err(tikv_kv::Error::from(Error::RegionNotFound(region_id))) + } else { + let msg = PeerMsg::SimpleWrite(SimpleWrite { + header, + data, + ch, + send_time: Instant::now_coarse(), + extra_opts: RaftCmdExtraOpts { + deadline: batch.deadline, + disk_full_opt: batch.disk_full_opt, + }, + }); + self.router + .store_router() + .check_send(region_id, msg) + .map_err(tikv_kv::Error::from) + }; (Transform { resp: CmdResStream::new(sub), early_err: res.err(), diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 57047bef9d4..533d8d0abd4 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -24,6 +24,7 @@ use kvproto::{ }; use resource_control::ResourceGroupManager; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::{ config::{ConfigController, Module}, storage::{ @@ -44,10 +45,11 @@ use tikv::{ use tikv_util::{future::paired_future_callback, worker::dummy_scheduler, HandyRwLock}; use txn_types::{Key, Mutation, TimeStamp}; -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_scheduler_leader_change_twice() { let snapshot_fp = "scheduler_async_snapshot_finish"; - 
let mut cluster = new_server_cluster(0, 2); + let mut cluster = new_cluster(0, 2); cluster.run(); let region0 = cluster.get_region(b""); let peers = region0.get_peers(); @@ -108,10 +110,11 @@ fn test_scheduler_leader_change_twice() { } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_server_catching_api_error() { let raftkv_fp = "raftkv_early_error_report"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let region = cluster.get_region(b""); let leader = region.get_peers()[0].clone(); @@ -168,10 +171,11 @@ fn test_server_catching_api_error() { must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_raftkv_early_error_report() { let raftkv_fp = "raftkv_early_error_report"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); cluster.must_split(&cluster.get_region(b"k0"), b"k1"); @@ -233,10 +237,12 @@ fn test_raftkv_early_error_report() { fail::remove(raftkv_fp); } -#[test] +// FIXME: #[test_case(test_raftstore_v2::new_server_cluster)] +// Raftstore-v2 not support get the storage engine, returning `None` currently. 
+#[test_case(test_raftstore::new_server_cluster)] fn test_scale_scheduler_pool() { let snapshot_fp = "scheduler_start_execute"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let origin_pool_size = cluster.cfg.storage.scheduler_worker_pool_size; @@ -332,9 +338,10 @@ fn test_scale_scheduler_pool() { fail::remove(snapshot_fp); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_scheduler_pool_auto_switch_for_resource_ctl() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1090,9 +1097,10 @@ fn test_async_apply_prewrite_impl( } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1149,7 +1157,6 @@ fn test_async_apply_prewrite() { true, true, ); - test_async_apply_prewrite_impl( &storage, ctx.clone(), @@ -1188,9 +1195,10 @@ fn test_async_apply_prewrite() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite_fallback() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1378,9 +1386,10 @@ fn test_async_apply_prewrite_1pc_impl( } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite_1pc() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1405,9 +1414,10 @@ fn test_async_apply_prewrite_1pc() { test_async_apply_prewrite_1pc_impl(&storage, ctx, b"key", b"value2", 20, true); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] 
+#[test_case(test_raftstore_v2::new_server_cluster)] fn test_atomic_cas_lock_by_latch() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1493,9 +1503,10 @@ fn test_atomic_cas_lock_by_latch() { assert_eq!(b"v2".to_vec(), ret); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_before_async_write_deadline() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1532,12 +1543,13 @@ fn test_before_async_write_deadline() { )); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_deadline_exceeded_on_get_and_batch_get() { use tikv_util::time::Instant; use tracker::INVALID_TRACKER_TOKEN; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1591,9 +1603,10 @@ fn test_deadline_exceeded_on_get_and_batch_get() { fail::remove("after-snapshot"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_before_propose_deadline() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster.sim.read().unwrap().storages[&1].clone(); @@ -1629,9 +1642,10 @@ fn test_before_propose_deadline() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_resolve_lock_deadline() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster.sim.read().unwrap().storages[&1].clone(); @@ -1789,10 +1803,11 @@ fn test_mvcc_concurrent_commit_and_rollback_at_shutdown() { assert_eq!(get_resp.value, v); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] 
+#[test_case(test_raftstore_v2::new_server_cluster)] fn test_raw_put_deadline() { let deadline_fp = "deadline_check_fail"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let region = cluster.get_region(b""); let leader = region.get_peers()[0].clone(); From b4e0bf7bab6ad395b74c0be938119d82ded4cd2a Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 12 Oct 2023 21:16:56 -0500 Subject: [PATCH 089/220] raftstore: Introduce failed state for unsafe recovery to fix rollback merge timeout (#15635) close tikv/tikv#15629 Introduce failed state for unsafe recovery to fix rollback merge timeout. To rollback merge, it has to be in force leader state when performing online recovery. Force leader state would exit after executing the plan no matter succeeded or failed. While rollback merge is triggered on check merge tick periodically. So there is a chance that check merge can't always be in the time window of being force leader state. To solve that, let it skip exiting force leader state when failed to demote, so later rollback merge can be triggered. 
Signed-off-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore-v2/src/fsm/peer.rs | 7 ++- .../src/operation/unsafe_recovery/demote.rs | 2 + .../operation/unsafe_recovery/force_leader.rs | 15 ++++- .../src/operation/unsafe_recovery/report.rs | 20 ++++-- components/raftstore/src/store/fsm/peer.rs | 63 +++++++++++++------ .../raftstore/src/store/unsafe_recovery.rs | 16 +++-- .../failpoints/cases/test_unsafe_recovery.rs | 44 ++++++++++++- 7 files changed, 132 insertions(+), 35 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 1734b46b25a..94506a8a19f 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -382,9 +382,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, syncer, failed_stores, ), - PeerMsg::ExitForceLeaderState => { - self.fsm.peer_mut().on_exit_force_leader(self.store_ctx) - } + PeerMsg::ExitForceLeaderState => self + .fsm + .peer_mut() + .on_exit_force_leader(self.store_ctx, false), PeerMsg::ExitForceLeaderStateCampaign => { self.fsm.peer_mut().on_exit_force_leader_campaign() } diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs index 37962a45452..20a42b9f978 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs @@ -75,6 +75,7 @@ impl Peer { "Unsafe recovery, fail to finish demotion"; "err" => ?resp.get_header().get_error(), ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::Failed); return; } *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::DemoteFailedVoters { @@ -129,6 +130,7 @@ impl Peer { "Unsafe recovery, fail to exit joint state"; "err" => ?resp.get_header().get_error(), ); + *self.unsafe_recovery_state_mut()= 
Some(UnsafeRecoveryState::Failed); } } else { error!(self.logger, diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs b/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs index ba7e391dbef..e6af0fddb7b 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs @@ -5,7 +5,9 @@ use std::mem; use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use raft::{eraftpb::MessageType, StateRole, Storage}; -use raftstore::store::{util::LeaseState, ForceLeaderState, UnsafeRecoveryForceLeaderSyncer}; +use raftstore::store::{ + util::LeaseState, ForceLeaderState, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, +}; use slog::{info, warn}; use tikv_util::time::Instant as TiInstant; @@ -182,11 +184,20 @@ impl Peer { self.set_has_ready(); } - pub fn on_exit_force_leader(&mut self, ctx: &StoreContext) { + // TODO: add exit force leader check tick for raftstore v2 + pub fn on_exit_force_leader(&mut self, ctx: &StoreContext, force: bool) { if !self.has_force_leader() { return; } + if let Some(UnsafeRecoveryState::Failed) = self.unsafe_recovery_state() && !force { + // Skip force leader if the plan failed, so wait for the next retry of plan with force leader state holding + info!( + self.logger, "skip exiting force leader state" + ); + return; + } + info!(self.logger, "exit force leader state"); *self.force_leader_mut() = None; // leader lease shouldn't be renewed in force leader state. 
diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/report.rs b/components/raftstore-v2/src/operation/unsafe_recovery/report.rs index 7173d00363a..90c8e3db34d 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/report.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/report.rs @@ -44,11 +44,19 @@ impl Peer { self.raft_group().raft.raft_log.committed }; - *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::WaitApply { - target_index, - syncer, - }); - self.unsafe_recovery_maybe_finish_wait_apply(!self.serving()); + if target_index > self.raft_group().raft.raft_log.applied { + info!( + self.logger, + "Unsafe recovery, start wait apply"; + "target_index" => target_index, + "applied" => self.raft_group().raft.raft_log.applied, + ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.unsafe_recovery_maybe_finish_wait_apply(!self.serving()); + } } pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { @@ -113,7 +121,7 @@ impl Peer { Some(UnsafeRecoveryState::DemoteFailedVoters { .. 
}) => { self.unsafe_recovery_maybe_finish_demote_failed_voters(ctx) } - Some(UnsafeRecoveryState::Destroy(_)) | None => {} + Some(UnsafeRecoveryState::Destroy(_)) | Some(UnsafeRecoveryState::Failed) | None => {} } } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index b6d7f8fcfcc..584db92e8be 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -824,6 +824,8 @@ where target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), demote_after_exit: true, }); + } else { + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Failed); } } else { self.unsafe_recovery_demote_failed_voters(syncer, failed_voters); @@ -863,6 +865,8 @@ where target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), demote_after_exit: false, }); + } else { + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Failed); } } else { warn!( @@ -913,13 +917,22 @@ where self.fsm.peer.raft_group.raft.raft_log.committed }; - self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { - target_index, - syncer, - }); - self.fsm - .peer - .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + if target_index > self.fsm.peer.raft_group.raft.raft_log.applied { + info!( + "Unsafe recovery, start wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied" => self.fsm.peer.raft_group.raft.raft_log.applied, + ); + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + } } // func be invoked firstly after assigned leader by BR, wait all leader apply to @@ -1466,7 +1479,7 @@ where } => { self.on_enter_pre_force_leader(syncer, failed_stores); } - SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), 
+ SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(false), SignificantMsg::UnsafeRecoveryDemoteFailedVoters { syncer, failed_voters, @@ -1700,10 +1713,19 @@ where self.fsm.has_ready = true; } - fn on_exit_force_leader(&mut self) { + fn on_exit_force_leader(&mut self, force: bool) { if self.fsm.peer.force_leader.is_none() { return; } + if let Some(UnsafeRecoveryState::Failed) = self.fsm.peer.unsafe_recovery_state && !force { + // Skip force leader if the plan failed, so wait for the next retry of plan with force leader state holding + info!( + "skip exiting force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } info!( "exit force leader state"; @@ -1712,7 +1734,7 @@ where ); self.fsm.peer.force_leader = None; // make sure it's not hibernated - assert_eq!(self.fsm.hibernate_state.group_state(), GroupState::Ordered); + assert_ne!(self.fsm.hibernate_state.group_state(), GroupState::Idle); // leader lease shouldn't be renewed in force leader state. assert_eq!( self.fsm.peer.leader_lease().inspect(None), @@ -2274,7 +2296,10 @@ where } } // Destroy does not need be processed, the state is cleaned up together with peer. - Some(_) | None => {} + Some(UnsafeRecoveryState::Destroy { .. }) + | Some(UnsafeRecoveryState::Failed) + | Some(UnsafeRecoveryState::WaitInitialize(..)) + | None => {} } } @@ -6360,13 +6385,6 @@ where return; } - if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { - // Clean up the force leader state after a timeout, since the PD recovery - // process may have been aborted for some reasons. 
- if time.saturating_elapsed() > UNSAFE_RECOVERY_STATE_TIMEOUT { - self.on_exit_force_leader(); - } - } if let Some(state) = &mut self.fsm.peer.unsafe_recovery_state { let unsafe_recovery_state_timeout_failpoint = || -> bool { fail_point!("unsafe_recovery_state_timeout", |_| true); @@ -6379,6 +6397,15 @@ where { info!("timeout, abort unsafe recovery"; "state" => ?state); state.abort(); + self.fsm.peer.unsafe_recovery_state = None; + } + } + + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // Clean up the force leader state after a timeout, since the PD recovery + // process may have been aborted for some reasons. + if time.saturating_elapsed() > UNSAFE_RECOVERY_STATE_TIMEOUT { + self.on_exit_force_leader(true); } } diff --git a/components/raftstore/src/store/unsafe_recovery.rs b/components/raftstore/src/store/unsafe_recovery.rs index f98fcaea581..28943ae7339 100644 --- a/components/raftstore/src/store/unsafe_recovery.rs +++ b/components/raftstore/src/store/unsafe_recovery.rs @@ -241,7 +241,7 @@ pub struct UnsafeRecoveryForceLeaderSyncer(Arc); impl UnsafeRecoveryForceLeaderSyncer { pub fn new(report_id: u64, router: Arc) -> Self { let inner = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, force leader finished."); + info!("Unsafe recovery, force leader finished."; "report_id" => report_id); start_unsafe_recovery_report(router, report_id, false); }))); UnsafeRecoveryForceLeaderSyncer(Arc::new(inner)) @@ -260,11 +260,11 @@ impl UnsafeRecoveryExecutePlanSyncer { let abort = Arc::new(Mutex::new(false)); let abort_clone = abort.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, plan execution finished"); if *abort_clone.lock().unwrap() { - warn!("Unsafe recovery, plan execution aborted"); + warn!("Unsafe recovery, plan execution aborted"; "report_id" => report_id); return; } + info!("Unsafe recovery, plan execution finished"; "report_id" => report_id); 
start_unsafe_recovery_report(router, report_id, true); }))); UnsafeRecoveryExecutePlanSyncer { @@ -330,7 +330,7 @@ impl UnsafeRecoveryWaitApplySyncer { let abort_clone = abort.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { if *abort_clone.lock().unwrap() { - warn!("Unsafe recovery, wait apply aborted"); + warn!("Unsafe recovery, wait apply aborted"; "report_id" => report_id); return; } info!("Unsafe recovery, wait apply finished"); @@ -363,7 +363,7 @@ impl UnsafeRecoveryFillOutReportSyncer { let reports = Arc::new(Mutex::new(vec![])); let reports_clone = reports.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, peer reports collected"); + info!("Unsafe recovery, peer reports collected"; "report_id" => report_id); let mut store_report = StoreReport::default(); { let mut reports_ptr = reports_clone.lock().unwrap(); @@ -420,6 +420,9 @@ pub enum UnsafeRecoveryState { }, Destroy(UnsafeRecoveryExecutePlanSyncer), WaitInitialize(UnsafeRecoveryExecutePlanSyncer), + // DemoteFailedVoter may fail due to some reasons. It's just a marker to avoid exiting force + // leader state + Failed, } impl UnsafeRecoveryState { @@ -429,6 +432,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. } | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => syncer.time, + UnsafeRecoveryState::Failed => return false, }; time.saturating_elapsed() >= timeout } @@ -439,6 +443,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. } | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => &syncer.abort, + UnsafeRecoveryState::Failed => return true, }; *abort.lock().unwrap() } @@ -449,6 +454,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. 
} | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => syncer.abort(), + UnsafeRecoveryState::Failed => (), } } } diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index 978489b5cd6..9e5a5dffcd9 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -458,7 +458,7 @@ fn test_unsafe_recovery_rollback_merge() { } // Block merge commit, let go of the merge prepare. - fail::cfg("on_schedule_merge_ret_err", "return()").unwrap(); + fail::cfg("on_schedule_merge", "return()").unwrap(); let region = pd_client.get_region(b"k1").unwrap(); cluster.must_split(®ion, b"k2"); @@ -521,6 +521,48 @@ fn test_unsafe_recovery_rollback_merge() { pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); cluster.must_send_store_heartbeat(nodes[0]); + // Can't propose demotion as it's in merging mode + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + let has_force_leader = store_report + .unwrap() + .get_peer_reports() + .iter() + .any(|p| p.get_is_force_leader()); + // Force leader is not exited due to demotion failure + assert!(has_force_leader); + + fail::remove("on_schedule_merge"); + fail::cfg("on_schedule_merge_ret_err", "return()").unwrap(); + + // Make sure merge check is scheduled, and rollback merge is triggered + sleep_ms(50); + + // Re-triggers the unsafe recovery plan execution. 
+ pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + // No force leader + for peer_report in store_report.unwrap().get_peer_reports() { + assert!(!peer_report.get_is_force_leader()); + } + + // Demotion is done let mut demoted = false; for _ in 0..10 { let new_left = block_on(pd_client.get_region_by_id(left.get_id())) From cb27f24b89c8107c9ead57be5016ee779996ac25 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Mon, 16 Oct 2023 12:36:27 +0800 Subject: [PATCH 090/220] retry leader read when stale read encounters data not ready (#15726) ref tikv/tikv#14553 Signed-off-by: cfzjywxk Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../raftstore/src/store/worker/metrics.rs | 20 ++ components/raftstore/src/store/worker/read.rs | 330 +++++++++++++++--- tests/failpoints/cases/test_kv_service.rs | 57 ++- .../cases/test_replica_stale_read.rs | 18 +- 4 files changed, 358 insertions(+), 67 deletions(-) diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index fd3f54d239d..8dca3bcfd44 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -70,6 +70,8 @@ make_static_metric! { pub struct LocalReadMetrics { pub local_executed_requests: LocalIntCounter, pub local_executed_stale_read_requests: LocalIntCounter, + pub local_executed_stale_read_fallback_success_requests: LocalIntCounter, + pub local_executed_stale_read_fallback_failure_requests: LocalIntCounter, pub local_executed_replica_read_requests: LocalIntCounter, pub local_executed_snapshot_cache_hit: LocalIntCounter, pub reject_reason: LocalReadRejectCounter, @@ -82,6 +84,8 @@ thread_local! 
{ LocalReadMetrics { local_executed_requests: LOCAL_READ_EXECUTED_REQUESTS.local(), local_executed_stale_read_requests: LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.local(), + local_executed_stale_read_fallback_success_requests: LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_SUCCESS_REQUESTS.local(), + local_executed_stale_read_fallback_failure_requests: LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_FAILURE_REQUESTS.local(), local_executed_replica_read_requests: LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS.local(), local_executed_snapshot_cache_hit: LOCAL_READ_EXECUTED_CACHE_REQUESTS.local(), reject_reason: LocalReadRejectCounter::from(&LOCAL_READ_REJECT_VEC), @@ -100,6 +104,10 @@ pub fn maybe_tls_local_read_metrics_flush() { if m.last_flush_time.saturating_elapsed() >= Duration::from_millis(METRICS_FLUSH_INTERVAL) { m.local_executed_requests.flush(); m.local_executed_stale_read_requests.flush(); + m.local_executed_stale_read_fallback_success_requests + .flush(); + m.local_executed_stale_read_fallback_failure_requests + .flush(); m.local_executed_replica_read_requests.flush(); m.local_executed_snapshot_cache_hit.flush(); m.reject_reason.flush(); @@ -189,6 +197,18 @@ lazy_static! { "Total number of stale read requests directly executed by local reader." ) .unwrap(); + pub static ref LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_SUCCESS_REQUESTS: IntCounter = + register_int_counter!( + "tikv_raftstore_local_read_executed_stale_read_fallback_success_requests", + "Total number of stale read requests executed by local leader peer as snapshot read." + ) + .unwrap(); + pub static ref LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_FAILURE_REQUESTS: IntCounter = + register_int_counter!( + "tikv_raftstore_local_read_executed_stale_read_fallback_failure_requests", + "Total number of stale read requests failed to be executed by local leader peer as snapshot read." 
+ ) + .unwrap(); pub static ref LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS: IntCounter = register_int_counter!( "tikv_raftstore_local_read_executed_replica_read_requests", "Total number of stale read requests directly executed by local reader." diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 5d6ede9c193..2d54c00baa6 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -30,7 +30,7 @@ use tikv_util::{ }; use time::Timespec; use tracker::GLOBAL_TRACKERS; -use txn_types::TimeStamp; +use txn_types::{TimeStamp, WriteBatchFlags}; use super::metrics::*; use crate::{ @@ -974,80 +974,155 @@ where cmd.callback.set_result(read_resp); } + /// Try to handle the read request using local read, if the leader is valid + /// the read response is returned, otherwise None is returned. + fn try_local_leader_read( + &mut self, + req: &RaftCmdRequest, + delegate: &mut CachedReadDelegate, + read_id: Option, + snap_updated: &mut bool, + last_valid_ts: Timespec, + ) -> Option> { + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, read_id); + + (*snap_updated) = + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); + + let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); + if !delegate.is_in_leader_lease(snapshot_ts) { + return None; + } + + let region = Arc::clone(&delegate.region); + let mut response = delegate.execute(req, ®ion, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Try renew lease in advance + delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); + Some(response) + } + + /// Try to handle the stale read request, if the read_ts < safe_ts the read + /// response is returned, otherwise the raft command response with + /// `DataIsNotReady` error is returned. 
+ fn try_local_stale_read( + &mut self, + req: &RaftCmdRequest, + delegate: &mut CachedReadDelegate, + snap_updated: &mut bool, + last_valid_ts: Timespec, + ) -> std::result::Result, RaftCmdResponse> { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + delegate.check_stale_read_safe(read_ts)?; + + // Stale read does not use cache, so we pass None for read_id + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); + (*snap_updated) = + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); + + let region = Arc::clone(&delegate.region); + // Getting the snapshot + let mut response = delegate.execute(req, ®ion, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Double check in case `safe_ts` change after the first check and before + // getting snapshot + delegate.check_stale_read_safe(read_ts)?; + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); + Ok(response) + } + pub fn propose_raft_command( &mut self, read_id: Option, - req: RaftCmdRequest, + mut req: RaftCmdRequest, cb: Callback, ) { match self.pre_propose_raft_command(&req) { Ok(Some((mut delegate, policy))) => { - let snap_updated; + let mut snap_updated = false; let last_valid_ts = delegate.last_valid_ts; let mut response = match policy { // Leader can read local if and only if it is in lease. 
RequestPolicy::ReadLocal => { - let mut local_read_ctx = - LocalReadContext::new(&mut self.snap_cache, read_id); - - snap_updated = local_read_ctx - .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - - let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); - if !delegate.is_in_leader_lease(snapshot_ts) { + if let Some(read_resp) = self.try_local_leader_read( + &req, + &mut delegate, + read_id, + &mut snap_updated, + last_valid_ts, + ) { + read_resp + } else { fail_point!("localreader_before_redirect", |_| {}); // Forward to raftstore. self.redirect(RaftCommand::new(req, cb)); return; } - - let region = Arc::clone(&delegate.region); - let mut response = - delegate.execute(&req, ®ion, None, Some(local_read_ctx)); - if let Some(snap) = response.snapshot.as_mut() { - snap.bucket_meta = delegate.bucket_meta.clone(); - } - // Try renew lease in advance - delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); - response } // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { - let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.set_result(ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }); - return; + match self.try_local_stale_read( + &req, + &mut delegate, + &mut snap_updated, + last_valid_ts, + ) { + Ok(read_resp) => read_resp, + Err(err_resp) => { + // It's safe to change the header of the `RaftCmdRequest`, as it + // would not affect the `SnapCtx` used in upper layer like. + let unset_stale_flag = req.get_header().get_flags() + & (!WriteBatchFlags::STALE_READ.bits()); + req.mut_header().set_flags(unset_stale_flag); + let mut inspector = Inspector { + delegate: &delegate, + }; + // The read request could be handled using snapshot read if the + // local peer is a valid leader. 
+ let allow_fallback_leader_read = inspector + .inspect(&req) + .map_or(false, |r| r == RequestPolicy::ReadLocal); + if !allow_fallback_leader_read { + cb.set_result(ReadResponse { + response: err_resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); + return; + } + if let Some(read_resp) = self.try_local_leader_read( + &req, + &mut delegate, + None, + &mut snap_updated, + last_valid_ts, + ) { + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut() + .local_executed_stale_read_fallback_success_requests + .inc() + }); + read_resp + } else { + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut() + .local_executed_stale_read_fallback_failure_requests + .inc() + }); + cb.set_result(ReadResponse { + response: err_resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); + return; + } + } } - - // Stale read does not use cache, so we pass None for read_id - let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); - snap_updated = local_read_ctx - .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - - let region = Arc::clone(&delegate.region); - // Getting the snapshot - let mut response = - delegate.execute(&req, ®ion, None, Some(local_read_ctx)); - if let Some(snap) = response.snapshot.as_mut() { - snap.bucket_meta = delegate.bucket_meta.clone(); - } - // Double check in case `safe_ts` change after the first check and before - // getting snapshot - if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.set_result(ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }); - return; - } - TLS_LOCAL_READ_METRICS - .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); - response } _ => unreachable!(), }; @@ -1598,6 +1673,8 @@ mod tests { read_progress.update_safe_ts(1, 1); assert_eq!(read_progress.safe_ts(), 1); + // Expire lease manually to avoid local retry on leader peer. 
+ lease.expire(); let data = { let mut d = [0u8; 8]; (&mut d[..]).encode_u64(2).unwrap(); @@ -1755,13 +1832,14 @@ mod tests { assert_eq!(kv_engine.path(), tablet.path()); } - fn prepare_read_delegate( + fn prepare_read_delegate_with_lease( store_id: u64, region_id: u64, term: u64, pr_ids: Vec, region_epoch: RegionEpoch, store_meta: Arc>, + max_lease: Duration, ) { let mut region = metapb::Region::default(); region.set_id(region_id); @@ -1770,7 +1848,7 @@ mod tests { let leader = prs[0].clone(); region.set_region_epoch(region_epoch); - let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. + let mut lease = Lease::new(max_lease, Duration::milliseconds(250)); // 1s is long enough. let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, 1)); // Register region @@ -1799,6 +1877,25 @@ mod tests { } } + fn prepare_read_delegate( + store_id: u64, + region_id: u64, + term: u64, + pr_ids: Vec, + region_epoch: RegionEpoch, + store_meta: Arc>, + ) { + prepare_read_delegate_with_lease( + store_id, + region_id, + term, + pr_ids, + region_epoch, + store_meta, + Duration::seconds(1), + ) + } + #[test] fn test_snap_across_regions() { let store_id = 2; @@ -2165,4 +2262,123 @@ mod tests { must_not_redirect(&mut reader, &rx, task); notify_rx.recv().unwrap(); } + + #[test] + fn test_stale_read_local_leader_fallback() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader( + "test-stale-local-leader-fallback", + store_id, + store_meta.clone(), + ); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1. + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + // Ensure the leader lease is long enough so the fallback would work. 
+ prepare_read_delegate_with_lease( + store_id, + 1, + term6, + pr_ids1.clone(), + epoch13.clone(), + store_meta.clone(), + Duration::seconds(10), + ); + let leader1 = prs1[0].clone(); + + // Local read. + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13.clone()); + header.set_term(term6); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // A peer can serve read_ts < safe_ts. + let safe_ts = TimeStamp::compose(2, 0); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_safe_ts(1, safe_ts.into_inner()); + assert_eq!(delegate.read_progress.safe_ts(), safe_ts.into_inner()); + } + let read_ts_1 = TimeStamp::compose(1, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_1.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // When read_ts > safe_ts, the leader peer could still serve if its lease is + // valid. 
+ let read_ts_2 = TimeStamp::compose(safe_ts.physical() + 201, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_2.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // The fallback would not happen if the lease is not valid. + prepare_read_delegate_with_lease( + store_id, + 1, + term6, + pr_ids1, + epoch13, + store_meta, + Duration::milliseconds(1), + ); + thread::sleep(std::time::Duration::from_millis(50)); + let (snap_tx, snap_rx) = channel(); + let task2 = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task2); + assert!( + snap_rx + .recv() + .unwrap() + .response + .get_header() + .get_error() + .has_data_is_not_ready() + ); + } } diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs index f3831bb984b..00f5c3c778e 100644 --- a/tests/failpoints/cases/test_kv_service.rs +++ b/tests/failpoints/cases/test_kv_service.rs @@ -5,9 +5,10 @@ use std::{sync::Arc, time::Duration}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use test_raftstore::{ - must_kv_prewrite, must_new_cluster_and_kv_client, must_new_cluster_mul, - try_kv_prewrite_with_impl, + configure_for_lease_read, must_kv_commit, must_kv_prewrite, must_new_cluster_and_kv_client, + must_new_cluster_mul, new_server_cluster, try_kv_prewrite_with_impl, }; +use tikv_util::{config::ReadableDuration, HandyRwLock}; #[test] fn test_batch_get_memory_lock() { @@ -103,3 +104,55 @@ fn test_undetermined_write_err() { // The previous panic hasn't been captured. 
assert!(std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| drop(cluster))).is_err()); } +#[test] +fn test_stale_read_on_local_leader() { + let mut cluster = new_server_cluster(0, 1); + // Increase the election tick to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); + let max_lease = Duration::from_secs(2); + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); + cluster.pd_client.disable_default_operator(); + cluster.run(); + + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader.clone()); + ctx.set_region_epoch(epoch); + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + let v1 = b"value1".to_vec(); + + // Write record. + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 10); + must_kv_commit(&client, ctx.clone(), vec![k.clone()], 10, 30, 30); + + // Prewrite and leave a lock. + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v1); + must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 50); + + let mut req = GetRequest::default(); + req.set_context(ctx); + req.set_key(k); + req.version = 40; + req.mut_context().set_stale_read(true); + + // The stale read should fallback and succeed on the leader peer. 
+ let resp = client.kv_get(&req).unwrap(); + assert!(resp.error.is_none()); + assert!(resp.region_error.is_none()); + assert_eq!(v, resp.get_value()); +} diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs index b7d436d92d7..cb986250d82 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -288,9 +288,11 @@ fn test_update_resoved_ts_before_apply_index() { sleep_ms(100); // The leader can't handle stale read with `commit_ts2` because its `safe_ts` - // can't update due to its `apply_index` not update + // can't update due to its `apply_index` not update. + // The request would be handled as a snapshot read on the valid leader peer + // after fallback. let resp = leader_client.kv_read(b"key1".to_vec(), commit_ts2); - assert!(resp.get_region_error().has_data_is_not_ready(),); + assert_eq!(resp.get_value(), b"value2"); // The follower can't handle stale read with `commit_ts2` because it don't // have enough data let resp = follower_client2.kv_read(b"key1".to_vec(), commit_ts2); @@ -667,10 +669,10 @@ fn test_stale_read_future_ts_not_update_max_ts() { b"key1".to_vec(), ); - // Perform stale read with a future ts should return error + // Perform stale read with a future ts, the stale read could be processed + // falling back to snapshot read on the leader peer. 
let read_ts = get_tso(&pd_client) + 10000000; - let resp = leader_client.kv_read(b"key1".to_vec(), read_ts); - assert!(resp.get_region_error().has_data_is_not_ready()); + leader_client.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), read_ts); // The `max_ts` should not updated by the stale read request, so we can prewrite // and commit `async_commit` transaction with a ts that smaller than the @@ -687,10 +689,10 @@ fn test_stale_read_future_ts_not_update_max_ts() { leader_client.must_kv_commit(vec![b"key2".to_vec()], prewrite_ts, commit_ts); leader_client.must_kv_read_equal(b"key2".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); - // Perform stale read with a future ts should return error + // Perform stale read with a future ts, the stale read could be processed + // falling back to snapshot read on the leader peer. let read_ts = get_tso(&pd_client) + 10000000; - let resp = leader_client.kv_read(b"key1".to_vec(), read_ts); - assert!(resp.get_region_error().has_data_is_not_ready()); + leader_client.must_kv_read_equal(b"key2".to_vec(), b"value1".to_vec(), read_ts); // The `max_ts` should not updated by the stale read request, so 1pc transaction // with a ts that smaller than the `read_ts` should not be fallbacked to 2pc From 9fb1ce63a079cd486f0fc4661ff28abb76d0e734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 16 Oct 2023 18:18:29 +0800 Subject: [PATCH 091/220] snap_restore: Abort last recover region (#15685) close tikv/tikv#15684 This PR will make `recover_region` return `ABORTED` once there are new `recover_region` RPCs in. 
Signed-off-by: hillium Co-authored-by: qupeng --- Cargo.toml | 2 +- components/raftstore/src/store/transport.rs | 16 +- components/snap_recovery/Cargo.toml | 7 + components/snap_recovery/src/leader_keeper.rs | 16 +- components/snap_recovery/src/services.rs | 147 ++++++++++++++++-- 5 files changed, 165 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4d8cefa9fa4..bd2b4946950 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -334,7 +334,7 @@ resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } server = { path = "components/server" } service = { path = "components/service" } -snap_recovery = { path = "components/snap_recovery" } +snap_recovery = { path = "components/snap_recovery", default-features = false } sst_importer = { path = "components/sst_importer", default-features = false } test_backup = { path = "components/test_backup" } test_coprocessor = { path = "components/test_coprocessor", default-features = false } diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 7f10e7cd249..2ca19fbe5fe 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use std::sync::mpsc; +use std::sync::{mpsc, Mutex}; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; @@ -46,6 +46,13 @@ where fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()>; } +impl<'a, T: SignificantRouter, EK: KvEngine> SignificantRouter for &'a Mutex { + #[inline] + fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { + Mutex::lock(self).unwrap().significant_send(region_id, msg) + } +} + /// Routes proposal to target region. 
pub trait ProposalRouter where @@ -79,6 +86,13 @@ where } } +impl<'a, EK: KvEngine, T: CasualRouter> CasualRouter for &'a Mutex { + #[inline] + fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { + CasualRouter::send(&*Mutex::lock(self).unwrap(), region_id, msg) + } +} + impl SignificantRouter for RaftRouter where EK: KvEngine, diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml index 8b0b0ec4c3a..23cbdcfe098 100644 --- a/components/snap_recovery/Cargo.toml +++ b/components/snap_recovery/Cargo.toml @@ -5,6 +5,13 @@ edition = "2021" publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] + [dependencies] chrono = "0.4" encryption = { workspace = true } diff --git a/components/snap_recovery/src/leader_keeper.rs b/components/snap_recovery/src/leader_keeper.rs index 417d5becca3..ca2623c82ca 100644 --- a/components/snap_recovery/src/leader_keeper.rs +++ b/components/snap_recovery/src/leader_keeper.rs @@ -9,18 +9,17 @@ use std::{ use engine_traits::KvEngine; use futures::compat::Future01CompatExt; -use itertools::Itertools; use raftstore::{ errors::{Error, Result}, store::{Callback, CasualMessage, CasualRouter, SignificantMsg, SignificantRouter}, }; use tikv_util::{future::paired_future_callback, timer::GLOBAL_TIMER_HANDLE}; -pub struct LeaderKeeper { +pub struct LeaderKeeper<'a, EK, Router: 'a> { router: Router, not_leader: HashSet, - _ek: PhantomData, + _ek: PhantomData<&'a EK>, } #[derive(Default)] @@ -51,10 +50,10 @@ impl std::fmt::Debug for StepResult { } } -impl LeaderKeeper +impl<'a, EK, Router> LeaderKeeper<'a, EK, Router> where EK: KvEngine, - 
Router: CasualRouter + SignificantRouter + 'static, + Router: CasualRouter + SignificantRouter + 'a, { pub fn new(router: Router, to_keep: impl IntoIterator) -> Self { Self { @@ -85,8 +84,9 @@ where const CONCURRENCY: usize = 256; let r = Mutex::new(StepResult::default()); let success = Mutex::new(HashSet::new()); - for batch in &self.not_leader.iter().chunks(CONCURRENCY) { - let tasks = batch.map(|region_id| async { + let regions = self.not_leader.iter().copied().collect::>(); + for batch in regions.as_slice().chunks(CONCURRENCY) { + let tasks = batch.iter().map(|region_id| async { match self.check_leader(*region_id).await { Ok(_) => { success.lock().unwrap().insert(*region_id); @@ -150,7 +150,7 @@ mod test { leaders: RefCell>, } - impl LeaderKeeper { + impl<'a, EK, Router> LeaderKeeper<'a, EK, Router> { fn mut_router(&mut self) -> &mut Router { &mut self.router } diff --git a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs index 10f82d64917..daf6e7ed30f 100644 --- a/components/snap_recovery/src/services.rs +++ b/components/snap_recovery/src/services.rs @@ -2,8 +2,14 @@ use std::{ error::Error as StdError, + fmt::Display, + future::Future, result, - sync::mpsc::{sync_channel, SyncSender}, + sync::{ + atomic::{AtomicBool, Ordering}, + mpsc::{sync_channel, SyncSender}, + Arc, Mutex, + }, thread::Builder, time::Instant, }; @@ -17,10 +23,12 @@ use engine_traits::{CfNamesExt, CfOptionsExt, Engines, Peekable, RaftEngine}; use futures::{ channel::mpsc, executor::{ThreadPool, ThreadPoolBuilder}, + stream::{AbortHandle, Aborted}, FutureExt, SinkExt, StreamExt, }; use grpcio::{ - ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, + ClientStreamingSink, RequestStream, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, + UnarySink, WriteFlags, }; use kvproto::{raft_serverpb::StoreIdent, recoverdatapb::*}; use raftstore::{ @@ -65,6 +73,44 @@ pub struct RecoveryService { engines: Engines, 
router: RaftRouter, threads: ThreadPool, + + /// The handle to last call of recover region RPC. + /// + /// We need to make sure the execution of keeping leader exits before next + /// `RecoverRegion` rpc gets in. Or the previous call may stuck at keep + /// leader forever, once the second caller request the leader to be at + /// another store. + // NOTE: Perhaps it would be better to abort the procedure as soon as the client + // stream has been closed, but yet it seems there isn't such hook like + // `on_client_go` for us, and the current implementation only start + // work AFTER the client closes their sender part(!) + last_recovery_region_rpc: Arc>>, +} + +struct RecoverRegionState { + start_at: Instant, + finished: Arc, + abort: AbortHandle, +} + +impl RecoverRegionState { + /// Create the state by wrapping a execution of recover region. + fn wrap_task, T>( + task: F, + ) -> (Self, impl Future>) { + let finished = Arc::new(AtomicBool::new(false)); + let (cancelable_task, abort) = futures::future::abortable(task); + let state = Self { + start_at: Instant::now(), + finished: Arc::clone(&finished), + abort, + }; + (state, async move { + let res = cancelable_task.await; + finished.store(true, Ordering::SeqCst); + res + }) + } } impl RecoveryService { @@ -99,6 +145,7 @@ impl RecoveryService { engines, router, threads, + last_recovery_region_rpc: Arc::default(), } } @@ -140,6 +187,34 @@ impl RecoveryService { Ok(store_id) } + fn abort_last_recover_region(&self, place: impl Display) { + let mut last_state_lock = self.last_recovery_region_rpc.lock().unwrap(); + Self::abort_last_recover_region_of(place, &mut last_state_lock) + } + + fn replace_last_recover_region(&self, place: impl Display, new_state: RecoverRegionState) { + let mut last_state_lock = self.last_recovery_region_rpc.lock().unwrap(); + Self::abort_last_recover_region_of(place, &mut last_state_lock); + *last_state_lock = Some(new_state); + } + + fn abort_last_recover_region_of( + place: impl Display, + 
last_state_lock: &mut Option, + ) { + if let Some(last_state) = last_state_lock.take() { + info!("Another task enter, checking last task."; + "finished" => ?last_state.finished, + "start_before" => ?last_state.start_at.elapsed(), + "abort_by" => %place, + ); + if !last_state.finished.load(Ordering::SeqCst) { + last_state.abort.abort(); + warn!("Last task not finished, aborting it."); + } + } + } + // a new wait apply syncer share with all regions, // when all region reached the target index, share reference decreased to 0, // trigger closure to send finish info back. @@ -190,7 +265,7 @@ impl RecoverData for RecoveryService { // 1. br start to ready region meta fn read_region_meta( &mut self, - _ctx: RpcContext<'_>, + ctx: RpcContext<'_>, _req: ReadRegionMetaRequest, mut sink: ServerStreamingSink, ) { @@ -215,6 +290,11 @@ impl RecoverData for RecoveryService { } }); + // Hacking: Sometimes, the client may omit the RPC call to `recover_region` if + // no leader should be register to some (unfortunate) store. So we abort + // last recover region here too, anyway this RPC implies a consequent + // `recover_region` for now. 
+ self.abort_last_recover_region(format_args!("read_region_meta by {}", ctx.peer())); self.threads.spawn_ok(send_task); } @@ -222,11 +302,11 @@ impl RecoverData for RecoveryService { // assign region leader and wait leader apply to last log fn recover_region( &mut self, - _ctx: RpcContext<'_>, + ctx: RpcContext<'_>, mut stream: RequestStream, sink: ClientStreamingSink, ) { - let raft_router = self.router.clone(); + let mut raft_router = Mutex::new(self.router.clone()); let store_id = self.get_store_id(); info!("start to recover the region"); let task = async move { @@ -241,17 +321,15 @@ impl RecoverData for RecoveryService { } } - let mut lk = LeaderKeeper::new(raft_router.clone(), leaders.clone()); + let mut lk = LeaderKeeper::new(&raft_router, leaders.clone()); // We must use the tokio runtime here because there isn't a `block_in_place` // like thing in the futures executor. It simply panics when block // on the block_on context. // It is also impossible to directly `await` here, because that will make // borrowing to the raft router crosses the await point. 
- tokio::runtime::Builder::new_current_thread() - .build() - .expect("failed to build temporary tokio runtime.") - .block_on(lk.elect_and_wait_all_ready()); + lk.elect_and_wait_all_ready().await; info!("all region leader assigned done"; "count" => %leaders.len()); + drop(lk); let now = Instant::now(); // wait apply to the last log @@ -260,7 +338,7 @@ impl RecoverData for RecoveryService { let (tx, rx) = sync_channel(1); REGION_EVENT_COUNTER.start_wait_leader_apply.inc(); let wait_apply = SnapshotRecoveryWaitApplySyncer::new(region_id, tx.clone()); - if let Err(e) = raft_router.significant_send( + if let Err(e) = raft_router.get_mut().unwrap().significant_send( region_id, SignificantMsg::SnapshotRecoveryWaitApply(wait_apply.clone()), ) { @@ -277,6 +355,10 @@ impl RecoverData for RecoveryService { for (rid, rx) in leaders.iter().zip(rx_apply) { if let Some(rx) = rx { CURRENT_WAIT_APPLY_LEADER.set(*rid as _); + // FIXME: we cannot the former RPC when we get stuck at here. + // Perhaps we need to make `SnapshotRecoveryWaitApplySyncer` be able to support + // asynchronous channels. But for now, waiting seems won't cause live lock, so + // we are keeping it unchanged. match rx.recv() { Ok(region_id) => { debug!("leader apply to last log"; "region_id" => region_id); @@ -301,10 +383,20 @@ impl RecoverData for RecoveryService { Err(e) => error!("failed to get store id"; "error" => ?e), }; - let _ = sink.success(resp).await; + resp }; - self.threads.spawn_ok(task); + let (state, task) = RecoverRegionState::wrap_task(task); + self.replace_last_recover_region(format!("recover_region by {}", ctx.peer()), state); + self.threads.spawn_ok(async move { + let res = match task.await { + Ok(resp) => sink.success(resp), + Err(Aborted) => sink.fail(RpcStatus::new(RpcStatusCode::ABORTED)), + }; + if let Err(err) = res.await { + warn!("failed to response recover region rpc"; "err" => %err); + } + }); } // 3. 
ensure all region peer/follower apply to last @@ -381,3 +473,32 @@ impl RecoverData for RecoveryService { self.threads.spawn_ok(send_task); } } + +#[cfg(test)] +mod test { + use std::{sync::atomic::Ordering, time::Duration}; + + use futures::never::Never; + + use super::RecoverRegionState; + + #[test] + fn test_state() { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_time() + .build() + .unwrap(); + let (state, task) = RecoverRegionState::wrap_task(futures::future::pending::()); + let hnd = rt.spawn(task); + state.abort.abort(); + rt.block_on(async { tokio::time::timeout(Duration::from_secs(10), hnd).await }) + .unwrap() + .unwrap() + .unwrap_err(); + + let (state, task) = RecoverRegionState::wrap_task(futures::future::ready(42)); + assert_eq!(state.finished.load(Ordering::SeqCst), false); + assert_eq!(rt.block_on(task), Ok(42)); + assert_eq!(state.finished.load(Ordering::SeqCst), true); + } +} From 8c7d9e3b7d71b012fdf2a7e50423b61af1bf6092 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 16 Oct 2023 21:00:29 +0800 Subject: [PATCH 092/220] config: adjust rocksdb background compaction threads (#15769) ref tikv/tikv#14470 Signed-off-by: glorv Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/config/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 74f25a22ef6..d1fb1e4f8d8 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -264,7 +264,7 @@ fn get_background_job_limits_impl( // v2: decrease the compaction threads to make the qps more stable. 
let max_compactions = match engine_type { EngineType::RaftKv => max_background_jobs - max_background_flushes, - EngineType::RaftKv2 => (max_background_jobs + 7) / 8, + EngineType::RaftKv2 => (max_background_jobs + 3) / 4, }; let max_sub_compactions: u32 = (max_compactions - 1).clamp(1, defaults.max_sub_compactions); max_background_jobs = max_background_flushes + max_compactions; @@ -6052,7 +6052,7 @@ mod tests { &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 3, + max_background_jobs: 4, max_background_flushes: 2, max_sub_compactions: 1, max_titan_background_gc: 4, @@ -6082,9 +6082,9 @@ mod tests { &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 5, + max_background_jobs: 6, max_background_flushes: 3, - max_sub_compactions: 1, + max_sub_compactions: 2, max_titan_background_gc: 4, } ); From d8756403ef730142d7eb5b3b79567b1576d5ed50 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 16 Oct 2023 12:56:00 -0500 Subject: [PATCH 093/220] import: write RPC will check region epoch before continue (#15013) close tikv/tikv#15003 Signed-off-by: lance6716 Co-authored-by: tonyxuqqi --- Makefile | 8 + .../src/operation/command/write/ingest.rs | 9 +- components/raftstore/src/store/fsm/store.rs | 87 ++------ components/raftstore/src/store/msg.rs | 6 - .../raftstore/src/store/worker/cleanup.rs | 19 +- .../raftstore/src/store/worker/cleanup_sst.rs | 120 +----------- components/server/src/server.rs | 2 + components/server/src/server2.rs | 2 + components/sst_importer/src/import_file.rs | 49 +++-- components/sst_importer/src/lib.rs | 2 +- components/sst_importer/src/sst_importer.rs | 8 +- components/test_raftstore-v2/src/server.rs | 1 + components/test_raftstore/src/server.rs | 1 + src/import/sst_service.rs | 185 +++++++++++++++++- 14 files changed, 271 insertions(+), 228 deletions(-) diff --git a/Makefile b/Makefile index bb1d7316e1b..ce8d4e8b793 100644 --- a/Makefile +++ b/Makefile @@ -406,6 +406,14 @@ 
docker_test: ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ make test +docker_shell: + docker build -f Dockerfile.test \ + -t ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ + . + docker run -it -v $(shell pwd):/tikv \ + ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ + /bin/bash + ## The driver for script/run-cargo.sh ## ---------------------------------- diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index e963434fe83..3d39c9a7369 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -43,6 +43,11 @@ impl Store { let import_size = box_try!(ctx.sst_importer.get_total_size()); STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(ctx.sst_importer.list_ssts()); + // filter old version SSTs + let ssts: Vec<_> = ssts + .into_iter() + .filter(|sst| sst.api_version >= sst_importer::API_VERSION_2) + .collect(); if ssts.is_empty() { return Ok(()); } @@ -50,9 +55,9 @@ impl Store { let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); for sst in ssts { region_ssts - .entry(sst.get_region_id()) + .entry(sst.meta.get_region_id()) .or_default() - .push(sst); + .push(sst.meta); } let ranges = ctx.sst_importer.ranges_in_import(); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 2434dfdd8e6..33010a993a2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -36,14 +36,13 @@ use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use kvproto::{ - import_sstpb::{SstMeta, SwitchMode}, metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, 
RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; -use pd_client::{metrics::STORE_SIZE_EVENT_INT_VEC, Feature, FeatureGate, PdClient}; +use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; use resource_control::{channel::unbounded, ResourceGroupManager}; @@ -810,9 +809,6 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } StoreMsg::CompactedEvent(event) => self.on_compaction_finished(event), - StoreMsg::ValidateSstResult { invalid_ssts } => { - self.on_validate_sst_result(invalid_ssts) - } StoreMsg::ClearRegionSizeInRange { start_key, end_key } => { self.clear_region_size_in_range(&start_key, &end_key) } @@ -1652,12 +1648,7 @@ impl RaftBatchSystem { ); let compact_runner = CompactRunner::new(engines.kv.clone()); - let cleanup_sst_runner = CleanupSstRunner::new( - meta.get_id(), - self.router.clone(), - Arc::clone(&importer), - Arc::clone(&pd_client), - ); + let cleanup_sst_runner = CleanupSstRunner::new(Arc::clone(&importer)); let gc_snapshot_runner = GcSnapshotRunner::new( meta.get_id(), self.router.clone(), // RaftRouter @@ -2755,44 +2746,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { - fn on_validate_sst_result(&mut self, ssts: Vec) { - if ssts.is_empty() || self.ctx.importer.get_mode() == SwitchMode::Import { - return; - } - // A stale peer can still ingest a stale Sst before it is - // destroyed. We need to make sure that no stale peer exists. 
- let mut delete_ssts = Vec::new(); - { - let meta = self.ctx.store_meta.lock().unwrap(); - for sst in ssts { - if !meta.regions.contains_key(&sst.get_region_id()) { - delete_ssts.push(sst); - } - } - } - if delete_ssts.is_empty() { - return; - } - - let task = CleanupSstTask::DeleteSst { ssts: delete_ssts }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to delete ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } - fn on_cleanup_import_sst(&mut self) -> Result<()> { let mut delete_ssts = Vec::new(); - let mut validate_ssts = Vec::new(); - let import_size = box_try!(self.ctx.importer.get_total_size()); - STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(self.ctx.importer.list_ssts()); if ssts.is_empty() { @@ -2801,15 +2756,22 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER { let meta = self.ctx.store_meta.lock().unwrap(); for sst in ssts { - if let Some(r) = meta.regions.get(&sst.get_region_id()) { + if sst.api_version < sst_importer::API_VERSION_2 { + // SST of old versions are created by old TiKV and have different prerequisite + // we can't delete them here. They can only be deleted manually + continue; + } + if let Some(r) = meta.regions.get(&sst.meta.get_region_id()) { let region_epoch = r.get_region_epoch(); - if util::is_epoch_stale(sst.get_region_epoch(), region_epoch) { + if util::is_epoch_stale(sst.meta.get_region_epoch(), region_epoch) { // If the SST epoch is stale, it will not be ingested anymore. - delete_ssts.push(sst); + delete_ssts.push(sst.meta); } } else { - // If the peer doesn't exist, we need to validate the SST through PD. - validate_ssts.push(sst); + // The write RPC of import sst service have make sure the region do exist at the + // write time, and now the region is not found, sst can be + // deleted because it won't be used by ingest in future. 
+ delete_ssts.push(sst.meta); } } } @@ -2829,27 +2791,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - // When there is an import job running, the region which this sst belongs may - // has not been split from the origin region because the apply thread is so busy - // that it can not apply SplitRequest as soon as possible. So we can not - // delete this sst file. - if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { - let task = CleanupSstTask::ValidateSst { - ssts: validate_ssts, - }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to validate ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } - Ok(()) } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 64c5be6d7e1..f7bf7f6d297 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -10,7 +10,6 @@ use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; use kvproto::{ brpb::CheckAdminResponse, - import_sstpb::SstMeta, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, metapb::RegionEpoch, @@ -824,10 +823,6 @@ where { RaftMessage(InspectedRaftMessage), - ValidateSstResult { - invalid_ssts: Vec, - }, - // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. ClearRegionSizeInRange { @@ -884,7 +879,6 @@ where write!(fmt, "Store {} is unreachable", store_id) } StoreMsg::CompactedEvent(ref event) => write!(fmt, "CompactedEvent cf {}", event.cf()), - StoreMsg::ValidateSstResult { .. 
} => write!(fmt, "Validate SST Result"), StoreMsg::ClearRegionSizeInRange { ref start_key, ref end_key, diff --git a/components/raftstore/src/store/worker/cleanup.rs b/components/raftstore/src/store/worker/cleanup.rs index 632e85f40cc..726b7abe5ce 100644 --- a/components/raftstore/src/store/worker/cleanup.rs +++ b/components/raftstore/src/store/worker/cleanup.rs @@ -3,7 +3,6 @@ use std::fmt::{self, Display, Formatter}; use engine_traits::{KvEngine, RaftEngine}; -use pd_client::PdClient; use tikv_util::worker::Runnable; use super::{ @@ -11,7 +10,6 @@ use super::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, }; -use crate::store::StoreRouter; pub enum Task { Compact(CompactTask), @@ -29,29 +27,26 @@ impl Display for Task { } } -pub struct Runner +pub struct Runner where E: KvEngine, R: RaftEngine, - S: StoreRouter, { compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, } -impl Runner +impl Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { pub fn new( compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, - ) -> Runner { + ) -> Runner { Runner { compact, cleanup_sst, @@ -60,12 +55,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { type Task = Task; diff --git a/components/raftstore/src/store/worker/cleanup_sst.rs b/components/raftstore/src/store/worker/cleanup_sst.rs index 8174b872f4b..44f188e6f8f 100644 --- a/components/raftstore/src/store/worker/cleanup_sst.rs +++ b/components/raftstore/src/store/worker/cleanup_sst.rs @@ -1,62 +1,30 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{error::Error, fmt, marker::PhantomData, sync::Arc}; +use std::{fmt, sync::Arc}; -use engine_traits::KvEngine; -use kvproto::{import_sstpb::SstMeta, metapb::Region}; -use pd_client::PdClient; +use kvproto::import_sstpb::SstMeta; use sst_importer::SstImporter; -use tikv_util::{error, worker::Runnable}; - -use crate::store::{util::is_epoch_stale, StoreMsg, StoreRouter}; - -type Result = std::result::Result>; +use tikv_util::worker::Runnable; pub enum Task { DeleteSst { ssts: Vec }, - ValidateSst { ssts: Vec }, } impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Task::DeleteSst { ref ssts } => write!(f, "Delete {} ssts", ssts.len()), - Task::ValidateSst { ref ssts } => write!(f, "Validate {} ssts", ssts.len()), } } } -pub struct Runner -where - EK: KvEngine, - S: StoreRouter, -{ - store_id: u64, - store_router: S, +pub struct Runner { importer: Arc, - pd_client: Arc, - _engine: PhantomData, } -impl Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ - pub fn new( - store_id: u64, - store_router: S, - importer: Arc, - pd_client: Arc, - ) -> Runner { - Runner { - store_id, - store_router, - importer, - pd_client, - _engine: PhantomData, - } +impl Runner { + pub fn new(importer: Arc) -> Runner { + Runner { importer } } /// Deletes SST files from the importer. @@ -65,78 +33,9 @@ where let _ = self.importer.delete(sst); } } - - fn get_region_by_meta(&self, sst: &SstMeta) -> Result { - // The SST meta has been delivered with a range, use it directly. - // For now, no case will reach this. But this still could be a guard for - // reducing the superise in the future... - if !sst.get_range().get_start().is_empty() || !sst.get_range().get_end().is_empty() { - return self - .pd_client - .get_region(sst.get_range().get_start()) - .map_err(Into::into); - } - // Once there isn't range provided. - let query_by_start_key_of_full_meta = || { - let start_key = self - .importer - .load_start_key_by_meta::(sst)? 
- .ok_or_else(|| -> Box { - "failed to load start key from sst, the sst might be empty".into() - })?; - let region = self.pd_client.get_region(&start_key)?; - Result::Ok(region) - }; - query_by_start_key_of_full_meta() - .map_err(|err| - format!("failed to load full sst meta from disk for {:?} and there isn't extra information provided: {err}", sst.get_uuid()).into() - ) - } - - /// Validates whether the SST is stale or not. - fn handle_validate_sst(&self, ssts: Vec) { - let store_id = self.store_id; - let mut invalid_ssts = Vec::new(); - for sst in ssts { - match self.get_region_by_meta(&sst) { - Ok(r) => { - // The region id may or may not be the same as the - // SST file, but it doesn't matter, because the - // epoch of a range will not decrease anyway. - if is_epoch_stale(r.get_region_epoch(), sst.get_region_epoch()) { - // Region has not been updated. - continue; - } - if r.get_id() == sst.get_region_id() - && r.get_peers().iter().any(|p| p.get_store_id() == store_id) - { - // The SST still belongs to this store. - continue; - } - invalid_ssts.push(sst); - } - Err(e) => { - error!("get region failed"; "err" => %e); - } - } - } - - // We need to send back the result to check for the stale - // peer, which may ingest the stale SST before it is - // destroyed. 
- let msg = StoreMsg::ValidateSstResult { invalid_ssts }; - if let Err(e) = self.store_router.send(msg) { - error!(%e; "send validate sst result failed"); - } - } } -impl Runnable for Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { @@ -144,9 +43,6 @@ where Task::DeleteSst { ssts } => { self.handle_delete_sst(ssts); } - Task::ValidateSst { ssts } => { - self.handle_validate_sst(ssts); - } } } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 8d44890e5a6..a4b6276a587 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -366,6 +366,7 @@ where router.clone(), config.coprocessor.clone(), )); + let region_info_accessor = RegionInfoAccessor::new(coprocessor_host.as_mut().unwrap()); // Initialize concurrency manager @@ -1080,6 +1081,7 @@ where servers.importer.clone(), None, self.resource_manager.clone(), + Arc::new(self.region_info_accessor.clone()), ); let import_cfg_mgr = import_service.get_config_manager(); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 2593035618d..65d02f58c08 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -948,6 +948,7 @@ where backup_worker.start(backup_endpoint); // Import SST service. 
+ let region_info_accessor = self.region_info_accessor.as_ref().unwrap().clone(); let import_service = ImportSstService::new( self.core.config.import.clone(), self.core.config.raft_store.raft_entry_max_size, @@ -956,6 +957,7 @@ where servers.importer.clone(), Some(self.router.as_ref().unwrap().store_meta().clone()), self.resource_manager.clone(), + Arc::new(region_info_accessor), ); let import_cfg_mgr = import_service.get_config_manager(); diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index b270d26a411..ae81cf01646 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -440,7 +440,7 @@ impl ImportDir { Ok(real_key.map(ToOwned::to_owned)) } - pub fn list_ssts(&self) -> Result> { + pub fn list_ssts(&self) -> Result> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? { let e = e?; @@ -458,20 +458,33 @@ impl ImportDir { } const SST_SUFFIX: &str = ".sst"; - +// version 2: compared to version 1 which is the default version, we will check +// epoch of request and local region in write API. +pub const API_VERSION_2: i32 = 2; + +/// sst_meta_to_path will encode the filepath with default api version (current +/// is 2). So when the SstMeta is created in old version of TiKV and filepath +/// will not correspond to the real file, in the deletion logic we can't remove +/// these files. 
pub fn sst_meta_to_path(meta: &SstMeta) -> Result { Ok(PathBuf::from(format!( - "{}_{}_{}_{}_{}{}", + "{}_{}_{}_{}_{}_{}{}", UuidBuilder::from_slice(meta.get_uuid())?.build(), meta.get_region_id(), meta.get_region_epoch().get_conf_ver(), meta.get_region_epoch().get_version(), meta.get_cf_name(), + API_VERSION_2, SST_SUFFIX, ))) } -pub fn parse_meta_from_path>(path: P) -> Result { +pub struct SstMetaWithApiVersion { + pub meta: SstMeta, + pub api_version: i32, // in future we may move api_version into SstMeta +} + +pub fn parse_meta_from_path>(path: P) -> Result { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { Some(name) => name, @@ -500,7 +513,11 @@ pub fn parse_meta_from_path>(path: P) -> Result { // cf_name to path. meta.set_cf_name(elems[4].to_owned()); } - Ok(meta) + let mut api_version = 1; + if elems.len() > 5 { + api_version = elems[5].parse()?; + } + Ok(SstMetaWithApiVersion { meta, api_version }) } #[cfg(test)] @@ -520,11 +537,12 @@ mod test { meta.mut_region_epoch().set_version(3); let path = sst_meta_to_path(&meta).unwrap(); - let expected_path = format!("{}_1_2_3_default.sst", uuid); + let expected_path = format!("{}_1_2_3_default_2.sst", uuid); assert_eq!(path.to_str().unwrap(), &expected_path); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.meta); + assert_eq!(2, meta_with_ver.api_version); } #[test] @@ -543,8 +561,9 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.meta); + assert_eq!(1, meta_with_ver.api_version); } #[cfg(feature = "test-engines-rocksdb")] @@ -596,14 +615,20 @@ mod test { w.finish().unwrap(); dp.save(arcmgr.as_deref()).unwrap(); let mut ssts = dir.list_ssts().unwrap(); - 
ssts.iter_mut().for_each(|meta| { + ssts.iter_mut().for_each(|meta_with_ver| { + let meta = &mut meta_with_ver.meta; let start = dir .load_start_key_by_meta::(meta, arcmgr.clone()) .unwrap() .unwrap(); meta.mut_range().set_start(start) }); - assert_eq!(ssts, vec![meta]); + assert_eq!( + ssts.iter() + .map(|meta_with_ver| { meta_with_ver.meta.clone() }) + .collect(), + vec![meta] + ); } #[test] diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index 0cfc3bab774..ff137005b09 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -27,7 +27,7 @@ pub mod sst_importer; pub use self::{ config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, - import_file::sst_meta_to_path, + import_file::{sst_meta_to_path, API_VERSION_2}, import_mode2::range_overlaps, sst_importer::SstImporter, sst_writer::{RawSstWriter, TxnSstWriter}, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 5530862e6a3..f36016eb309 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -51,7 +51,7 @@ use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ caching::cache_map::{CacheMap, ShareOwned}, - import_file::{ImportDir, ImportFile}, + import_file::{ImportDir, ImportFile, SstMetaWithApiVersion}, import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, import_mode2::{HashRange, ImportModeSwitcherV2}, metrics::*, @@ -1387,7 +1387,7 @@ impl SstImporter { /// List the basic information of the current SST files. /// The information contains UUID, region ID, region Epoch. /// Other fields may be left blank. 
- pub fn list_ssts(&self) -> Result> { + pub fn list_ssts(&self) -> Result> { self.dir.list_ssts() } @@ -1587,9 +1587,9 @@ mod tests { for sst in &ssts { ingested .iter() - .find(|s| s.get_uuid() == sst.get_uuid()) + .find(|s| s.get_uuid() == sst.meta.get_uuid()) .unwrap(); - dir.delete(sst, key_manager.as_deref()).unwrap(); + dir.delete(&sst.meta, key_manager.as_deref()).unwrap(); } assert!(dir.list_ssts().unwrap().is_empty()); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 299e93eb746..5073304e17a 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -561,6 +561,7 @@ impl ServerCluster { Arc::clone(&importer), Some(store_meta), resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 0002f36d647..f5c64fa86e9 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -451,6 +451,7 @@ impl ServerCluster { Arc::clone(&importer), None, resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. 
diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 68403e226f8..6f9f22c9cb4 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -15,6 +15,7 @@ use std::{ use engine_traits::{CompactExt, MiscExt, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; use futures::{sink::SinkExt, stream::TryStreamExt, FutureExt, TryFutureExt}; +use futures_executor::block_on; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -27,7 +28,9 @@ use kvproto::{ WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, + metapb::RegionEpoch, }; +use raftstore::{coprocessor::RegionInfoProvider, store::util::is_epoch_stale, RegionInfoAccessor}; use raftstore_v2::StoreMeta; use resource_control::{with_resource_limiter, ResourceGroupManager}; use sst_importer::{ @@ -39,7 +42,7 @@ use tikv_kv::{ }; use tikv_util::{ config::ReadableSize, - future::create_stream_with_buffer, + future::{create_stream_with_buffer, paired_future_callback}, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, HandyRwLock, @@ -124,6 +127,7 @@ pub struct ImportSstService { limiter: Limiter, task_slots: Arc>>, raft_entry_max_size: ReadableSize, + region_info_accessor: Arc, writer: raft_writer::ThrottledTlsEngineWriter, @@ -318,6 +322,7 @@ impl ImportSstService { importer: Arc, store_meta: Option>>>, resource_manager: Option>, + region_info_accessor: Arc, ) -> Self { let props = tikv_util::thread_group::current_properties(); let eng = Mutex::new(engine.clone()); @@ -365,6 +370,7 @@ impl ImportSstService { limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), raft_entry_max_size, + region_info_accessor, writer, store_meta, resource_manager, @@ -675,6 +681,59 @@ impl ImportSstService { } } +fn check_local_region_stale( + region_id: u64, + epoch: &RegionEpoch, + region_info_accessor: Arc, +) -> Result<()> { + let (cb, f) = paired_future_callback(); + 
region_info_accessor + .find_region_by_id(region_id, cb) + .map_err(|e| { + Error::Engine(format!("failed to find region {} err {:?}", region_id, e).into()) + })?; + match block_on(f)? { + Some(local_region_info) => { + let local_region_epoch = local_region_info.region.region_epoch.unwrap(); + + // TODO(lance6717): we should only need to check conf_ver because we require all + // peers have SST on the disk, and does not care about which one is + // leader. But since check_sst_for_ingestion also checks epoch version, + // we just keep it here for now. + + // when local region epoch is stale, client can retry write later + if is_epoch_stale(&local_region_epoch, epoch) { + return Err(Error::Engine( + format!("request region {} is ahead of local region, local epoch {:?}, request epoch {:?}, please retry write later", + region_id, local_region_epoch, epoch).into(), + )); + } + // when local region epoch is ahead, client need to rescan region from PD to get + // latest region later + if is_epoch_stale(epoch, &local_region_epoch) { + return Err(Error::Engine( + format!("request region {} is staler than local region, local epoch {:?}, request epoch {:?}, please rescan region later", + region_id, local_region_epoch, epoch).into(), + )); + } + + // not match means to rescan + Ok(()) + } + None => { + // when region not found, we can't tell whether it's stale or ahead, so we just + // return the safest case + Err(Error::Engine( + format!( + "region {} is not found, please rescan region later", + region_id + ) + .into(), + )) + } + } +} + #[macro_export] macro_rules! impl_write { ($fn:ident, $req_ty:ident, $resp_ty:ident, $chunk_ty:ident, $writer_fn:ident) => { @@ -686,6 +745,7 @@ macro_rules! 
impl_write { ) { let import = self.importer.clone(); let tablets = self.tablets.clone(); + let region_info_accessor = self.region_info_accessor.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -713,7 +773,15 @@ macro_rules! impl_write { } _ => return Err(Error::InvalidChunk), }; + // wait the region epoch on this TiKV to catch up with the epoch + // in request, which comes from PD and represents the majority + // peers' status. let region_id = meta.get_region_id(); + check_local_region_stale( + region_id, + meta.get_region_epoch(), + region_info_accessor, + )?; let tablet = match tablets.get(region_id) { Some(t) => t, None => { @@ -1387,19 +1455,30 @@ fn write_needs_restore(write: &[u8]) -> bool { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::{ + collections::HashMap, + sync::{Arc, Mutex}, + }; use engine_traits::{CF_DEFAULT, CF_WRITE}; use kvproto::{ kvrpcpb::Context, - metapb::RegionEpoch, + metapb::{Region, RegionEpoch}, raft_cmdpb::{RaftCmdRequest, Request}, }; - use protobuf::Message; + use protobuf::{Message, SingularPtrField}; + use raft::StateRole::Follower; + use raftstore::{ + coprocessor::{region_info_accessor::Callback, RegionInfoProvider}, + RegionInfo, + }; use tikv_kv::{Modify, WriteData}; use txn_types::{Key, TimeStamp, Write, WriteBatchFlags, WriteType}; - use crate::{import::sst_service::RequestCollector, server::raftkv}; + use crate::{ + import::sst_service::{check_local_region_stale, RequestCollector}, + server::raftkv, + }; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1683,4 +1762,100 @@ mod test { } assert_eq!(total, 100); } + + #[test] + fn test_write_rpc_check_region_epoch() { + struct MockRegionInfoProvider { + map: Mutex>, + } + impl RegionInfoProvider for MockRegionInfoProvider { + fn find_region_by_id( + &self, + 
region_id: u64, + callback: Callback>, + ) -> Result<(), raftstore::coprocessor::Error> { + callback(self.map.lock().unwrap().get(®ion_id).cloned()); + Ok(()) + } + } + + let mock_provider = Arc::new(MockRegionInfoProvider { + map: Mutex::new(HashMap::new()), + }); + + let mut req_epoch = RegionEpoch { + conf_ver: 10, + version: 10, + ..Default::default() + }; + // test for region not found + let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + let mut local_region_info = RegionInfo { + region: Region { + id: 1, + region_epoch: SingularPtrField::some(req_epoch.clone()), + ..Default::default() + }, + role: Follower, + buckets: 1, + }; + mock_provider + .map + .lock() + .unwrap() + .insert(1, local_region_info.clone()); + // test the local region epoch is same as request + let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); + result.unwrap(); + + // test the local region epoch is ahead of request + local_region_info + .region + .region_epoch + .as_mut() + .unwrap() + .conf_ver = 11; + mock_provider + .map + .lock() + .unwrap() + .insert(1, local_region_info.clone()); + let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + req_epoch.conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); + result.unwrap(); + + // test the local region epoch is staler than request + req_epoch.version = 12; + let result = check_local_region_stale(1, &req_epoch, mock_provider); + assert!(result.is_err()); + // check error message contains 
"retry write later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("retry write later") + ); + } } From 6e826308b9ca246ee5572bcdd24e6b26fd19c156 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 16 Oct 2023 12:28:57 -0700 Subject: [PATCH 094/220] add more metrics for slow commit log diagnostics (#15716) ref tikv/tikv#15175 Add more metrics for slow commit log duration investigation. In this PR, it adds raft message process wait duration and exposes raft message recv by store counter. Together with raft-engine write duration, we can further narrow reason of the commit log duration. With this PR, we still cannot tell if the slowness comes from network or raft-client's (grpc client). Signed-off-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore-v2/src/batch/store.rs | 8 +- components/raftstore-v2/src/fsm/peer.rs | 6 +- components/raftstore-v2/src/operation/life.rs | 6 +- .../raftstore-v2/src/operation/ready/mod.rs | 10 +- components/raftstore-v2/src/router/message.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 15 +- components/raftstore/src/store/fsm/store.rs | 24 ++- .../raftstore/src/store/local_metrics.rs | 5 + components/raftstore/src/store/metrics.rs | 7 + components/raftstore/src/store/msg.rs | 4 +- metrics/grafana/tikv_details.json | 194 +++++++++++++++++- src/server/server.rs | 11 +- tests/failpoints/cases/test_merge.rs | 5 +- 13 files changed, 261 insertions(+), 36 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 5ed84c70937..23e41914012 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -990,16 +990,16 @@ impl StoreRouter { msg: Box, ) -> std::result::Result<(), TrySendError>> { let id = msg.get_region_id(); - let peer_msg = PeerMsg::RaftMessage(msg); + let peer_msg = 
PeerMsg::RaftMessage(msg, Some(TiInstant::now())); let store_msg = match self.router.try_send(id, peer_msg) { Either::Left(Ok(())) => return Ok(()), - Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m, _)))) => { return Err(TrySendError::Full(m)); } - Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m)))) => { + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m, _)))) => { return Err(TrySendError::Disconnected(m)); } - Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), + Either::Right(PeerMsg::RaftMessage(m, _)) => StoreMsg::RaftMessage(m), _ => unreachable!(), }; match self.router.send_control(store_msg) { diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 94506a8a19f..47a1aee1ef4 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -247,8 +247,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { for msg in peer_msgs_buf.drain(..) { match msg { - PeerMsg::RaftMessage(msg) => { - self.fsm.peer.on_raft_message(self.store_ctx, msg); + PeerMsg::RaftMessage(msg, send_time) => { + self.fsm + .peer + .on_raft_message(self.store_ctx, msg, send_time); } PeerMsg::RaftQuery(cmd) => { self.on_receive_command(cmd.send_time, cmd.ch.read_tracker()); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 5828a7bb661..00df317f73a 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -416,8 +416,8 @@ impl Store { ); let region_id = msg.get_region_id(); // The message can be sent when the peer is being created, so try send it first. 
- let mut msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m))) = - ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) + let mut msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m, _))) = + ctx.router.send(region_id, PeerMsg::RaftMessage(msg, None)) { m } else { @@ -562,7 +562,7 @@ impl Store { if from_peer.id != raft::INVALID_ID { // For now the peer only exists in memory. It will persist its states when // handling its first readiness. - let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg)); + let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg, None)); } true } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3ceb8693c0b..a2697f29f02 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -55,7 +55,7 @@ use tikv_util::{ slog_panic, store::find_peer, sys::disk::DiskUsage, - time::{duration_to_sec, monotonic_raw_now, Duration}, + time::{duration_to_sec, monotonic_raw_now, Duration, Instant as TiInstant}, }; pub use self::{ @@ -259,6 +259,7 @@ impl Peer { &mut self, ctx: &mut StoreContext, mut msg: Box, + send_time: Option, ) { debug!( self.logger, @@ -268,6 +269,13 @@ impl Peer { "to_peer_id" => msg.get_to_peer().get_id(), "disk_usage" => ?msg.disk_usage, ); + if let Some(send_time) = send_time { + let process_wait_time = send_time.saturating_elapsed(); + ctx.raft_metrics + .process_wait_time + .observe(duration_to_sec(process_wait_time)); + } + if self.pause_for_replay() && msg.get_message().get_msg_type() == MessageType::MsgAppend { ctx.raft_metrics.message_dropped.recovery.inc(); return; diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index c9da5241fa8..59d1edd8198 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -157,7 +157,7 @@ pub enum PeerMsg { 
/// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(Box), + RaftMessage(Box, Option), /// Query won't change any state. A typical query is KV read. In most cases, /// it will be processed using lease or read index. RaftQuery(RaftRequest), diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 584db92e8be..7504f746abe 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -206,7 +206,7 @@ where let callback = match msg { PeerMsg::RaftCommand(cmd) => cmd.callback, PeerMsg::CasualMessage(CasualMessage::SplitRegion { callback, .. }) => callback, - PeerMsg::RaftMessage(im) => { + PeerMsg::RaftMessage(im, _) => { raft_messages_size += im.heap_size; continue; } @@ -617,10 +617,16 @@ where let count = msgs.len(); for m in msgs.drain(..) { match m { - PeerMsg::RaftMessage(msg) => { + PeerMsg::RaftMessage(msg, sent_time) => { + if let Some(sent_time) = sent_time { + let wait_time = sent_time.saturating_elapsed().as_secs_f64(); + self.ctx.raft_metrics.process_wait_time.observe(wait_time); + } + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { continue; } + if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -4298,7 +4304,10 @@ where .pending_msgs .swap_remove_front(|m| m.get_to_peer() == &meta_peer) { - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size: 0, msg }); + let peer_msg = PeerMsg::RaftMessage( + InspectedRaftMessage { heap_size: 0, msg }, + Some(TiInstant::now()), + ); if let Err(e) = self.ctx.router.force_send(new_region_id, peer_msg) { warn!("handle first requset failed"; "region_id" => region_id, "error" => ?e); } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 33010a993a2..3a22ef8434d 100644 --- 
a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -382,7 +382,10 @@ where for e in msg.get_message().get_entries() { heap_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = PeerMsg::RaftMessage( + InspectedRaftMessage { heap_size, msg }, + Some(TiInstant::now()), + ); let event = TraceEvent::Add(heap_size); let send_failed = Cell::new(true); @@ -397,13 +400,13 @@ where send_failed.set(false); return Ok(()); } - Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(im)))) => { + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(im, _)))) => { return Err(TrySendError::Full(im.msg)); } - Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im)))) => { + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im, _)))) => { return Err(TrySendError::Disconnected(im.msg)); } - Either::Right(PeerMsg::RaftMessage(im)) => StoreMsg::RaftMessage(im), + Either::Right(PeerMsg::RaftMessage(im, _)) => StoreMsg::RaftMessage(im), _ => unreachable!(), }; match self.send_control(store_msg) { @@ -2067,14 +2070,18 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER }); let region_id = msg.msg.get_region_id(); - let msg = match self.ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) { + let msg = match self + .ctx + .router + .send(region_id, PeerMsg::RaftMessage(msg, None)) + { Ok(()) => { forwarded.set(true); return Ok(()); } Err(TrySendError::Full(_)) => return Ok(()), Err(TrySendError::Disconnected(_)) if self.ctx.router.is_shutdown() => return Ok(()), - Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im))) => im.msg, + Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im, None))) => im.msg, Err(_) => unreachable!(), }; @@ -2146,7 +2153,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER check_msg_status == 
CheckMsgStatus::NewPeerFirst, )? { // Peer created, send the message again. - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = + PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); if self.ctx.router.send(region_id, peer_msg).is_ok() { forwarded.set(true); } @@ -2169,7 +2177,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER store_meta.pending_msgs.push(msg); } else { drop(store_meta); - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); if let Err(e) = self.ctx.router.force_send(region_id, peer_msg) { warn!("handle first request failed"; "region_id" => region_id, "error" => ?e); } else { diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 5460a57ae0f..aceacdb81ee 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -112,7 +112,10 @@ pub struct RaftMetrics { // local histogram pub store_time: LocalHistogram, + // the wait time for processing a raft command pub propose_wait_time: LocalHistogram, + // the wait time for processing a raft message + pub process_wait_time: LocalHistogram, pub process_ready: LocalHistogram, pub event_time: RaftEventDurationVec, pub peer_msg_len: LocalHistogram, @@ -152,6 +155,7 @@ impl RaftMetrics { raft_log_gc_skipped: RaftLogGcSkippedCounterVec::from(&RAFT_LOG_GC_SKIPPED_VEC), store_time: STORE_TIME_HISTOGRAM.local(), propose_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), + process_wait_time: RAFT_MESSAGE_WAIT_TIME_HISTOGRAM.local(), process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), @@ -190,6 +194,7 @@ impl RaftMetrics { self.store_time.flush(); self.propose_wait_time.flush(); + self.process_wait_time.flush(); self.process_ready.flush(); self.event_time.flush(); 
self.peer_msg_len.flush(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index a5aa164e63e..a4f2b7820cb 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -551,6 +551,13 @@ lazy_static! { exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref RAFT_MESSAGE_WAIT_TIME_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_raft_msg_wait_time_duration_secs", + "Bucketed histogram of raft message wait time duration.", + exponential_buckets(0.00001, 2.0, 26).unwrap() + ).unwrap(); + pub static ref PEER_GC_RAFT_LOG_COUNTER: IntCounter = register_int_counter!( "tikv_raftstore_gc_raft_log_total", diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index f7bf7f6d297..a92e5169549 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -740,7 +740,7 @@ pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(InspectedRaftMessage), + RaftMessage(InspectedRaftMessage, Option), /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. @@ -778,7 +778,7 @@ impl ResourceMetered for PeerMsg {} impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + PeerMsg::RaftMessage(..) => write!(fmt, "Raft Message"), PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), PeerMsg::Tick(tick) => write! 
{ fmt, diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 57c88782031..f2654ba3da1 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -179,6 +179,14 @@ "interval": "", "legendFormat": "Apply Duration .99", "refId": "E" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_msg_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "Raft Message Wait .99", + "refId": "F" } ], "thresholds": [], @@ -5819,7 +5827,7 @@ "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 37 }, @@ -5908,6 +5916,111 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The count of gRPC raft message", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 37 + }, + "hiddenSeries": false, + "id": 24763573092, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_raftstore_message_recv_by_store{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, store)", + "format": "time_series", + 
"interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - {{store}}", + "metric": "tikv_raftstore_message_recv_by_store", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gRPC message count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, @@ -13892,7 +14005,7 @@ "format": "heatmap", "intervalFactor": 2, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_apply_wait_time_duration_secs_bucket", "refId": "A", "step": 4 } @@ -14070,7 +14183,7 @@ "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_store_write_handle_msg_duration_secs_bucket", "refId": "A", "step": 4 } @@ -14144,7 +14257,7 @@ "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_store_write_trigger_wb_bytes_bucket", "refId": "A", "step": 4 } @@ -14333,7 +14446,7 @@ "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{type}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "metric": "tikv_raftstore_store_perf_context_time_duration_secs_bucket", "refId": "A", "step": 4 }, @@ -14387,6 +14500,77 @@ "align": false, "alignLevel": null } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": 
"#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The wait time of each raft message", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 62 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 1977, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(delta(tikv_raftstore_raft_msg_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "tikv_raftstore_raft_msg_wait_time_duration_secs_bucket", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Raft message wait duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null } ], "repeat": null, diff --git a/src/server/server.rs b/src/server/server.rs index a886f1232f4..09782be4e16 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -437,6 +437,7 @@ pub mod test_router { use engine_rocks::{RocksEngine, RocksSnapshot}; use kvproto::raft_serverpb::RaftMessage; use raftstore::{router::RaftStoreRouter, store::*, Result as RaftStoreResult}; + use tikv_util::time::Instant as TiInstant; use super::*; @@ -496,12 +497,10 @@ pub mod test_router { impl RaftStoreRouter for TestRaftStoreRouter { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { - let _ = self - .tx - 
.send(Either::Left(PeerMsg::RaftMessage(InspectedRaftMessage { - heap_size: 0, - msg, - }))); + let _ = self.tx.send(Either::Left(PeerMsg::RaftMessage( + InspectedRaftMessage { heap_size: 0, msg }, + Some(TiInstant::now()), + ))); Ok(()) } diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index ffbd69dc05e..eb15c7e16fa 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1831,7 +1831,10 @@ fn test_concurrent_between_transfer_leader_and_merge() { // Actually, store 1 should not reach the line of propose_commit_merge_1 let _ = rx.recv_timeout(Duration::from_secs(2)); router - .force_send(msg.get_region_id(), PeerMsg::RaftMessage(Box::new(msg))) + .force_send( + msg.get_region_id(), + PeerMsg::RaftMessage(Box::new(msg), None), + ) .unwrap(); // Wait region 1 of node 2 to become leader From 327231947f969388cc51b19f1eb53faaeba0c64b Mon Sep 17 00:00:00 2001 From: lijie Date: Tue, 17 Oct 2023 11:19:59 +0800 Subject: [PATCH 095/220] chore: bump version to 7.5.0 (#15783) Signed-off-by: lijie --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fccff7d7822..003ccaf39e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6526,7 +6526,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.5.0-alpha" +version = "7.5.0" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index bd2b4946950..8abff4f9ca8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.5.0-alpha" +version = "7.5.0" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 8a9cf21df55d14a4ec413f85a1ffdd0ed3062c89 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 26 Oct 2023 15:35:04 +0800 Subject: [PATCH 096/220] raftstore: calculate group id for every peer when it is syn-recover status 
(#15785) (#15823) close tikv/tikv#15784 Signed-off-by: TonsnakeLin Co-authored-by: TonsnakeLin --- components/raftstore/src/store/peer.rs | 35 ++++---- components/test_pd_client/src/pd.rs | 22 ++++- .../raftstore/test_replication_mode.rs | 86 +++++++++++++++++-- 3 files changed, 119 insertions(+), 24 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 85b8798bfb1..52932573d7e 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -974,29 +974,32 @@ where pub fn switch_replication_mode(&mut self, state: &Mutex) { self.replication_sync = false; let guard = state.lock().unwrap(); - let enable_group_commit = if guard.status().get_mode() == ReplicationMode::Majority { - self.replication_mode_version = 0; - self.dr_auto_sync_state = DrAutoSyncState::Async; - false - } else { - self.dr_auto_sync_state = guard.status().get_dr_auto_sync().get_state(); - self.replication_mode_version = guard.status().get_dr_auto_sync().state_id; - match guard.status().get_dr_auto_sync().get_state() { - // SyncRecover will enable group commit after it catches up logs. - DrAutoSyncState::Async | DrAutoSyncState::SyncRecover => false, - _ => true, - } - }; + let (enable_group_commit, calculate_group_id) = + if guard.status().get_mode() == ReplicationMode::Majority { + self.replication_mode_version = 0; + self.dr_auto_sync_state = DrAutoSyncState::Async; + (false, false) + } else { + self.dr_auto_sync_state = guard.status().get_dr_auto_sync().get_state(); + self.replication_mode_version = guard.status().get_dr_auto_sync().state_id; + match guard.status().get_dr_auto_sync().get_state() { + // SyncRecover will enable group commit after it catches up logs. 
+ DrAutoSyncState::Async => (false, false), + DrAutoSyncState::SyncRecover => (false, true), + _ => (true, true), + } + }; drop(guard); - self.switch_group_commit(enable_group_commit, state); + self.switch_group_commit(enable_group_commit, calculate_group_id, state); } fn switch_group_commit( &mut self, enable_group_commit: bool, + calculate_group_id: bool, state: &Mutex, ) { - if enable_group_commit { + if enable_group_commit || calculate_group_id { let mut guard = state.lock().unwrap(); let ids = mem::replace( guard.calculate_commit_group( @@ -5140,7 +5143,7 @@ where // should enable group commit to promise `IntegrityOverLabel`. then safe // to switch to the `Sync` phase. if self.dr_auto_sync_state == DrAutoSyncState::SyncRecover { - self.switch_group_commit(true, &ctx.global_replication_state) + self.switch_group_commit(true, true, &ctx.global_replication_state) } self.replication_sync = true; } diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index a9141bf6299..7f00cf35ccd 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -1437,15 +1437,33 @@ impl TestPdClient { cluster.replication_status = Some(status); } - pub fn switch_replication_mode(&self, state: DrAutoSyncState, available_stores: Vec) { + pub fn switch_replication_mode( + &self, + state: Option, + available_stores: Vec, + ) { let mut cluster = self.cluster.wl(); let status = cluster.replication_status.as_mut().unwrap(); + if state.is_none() { + status.set_mode(ReplicationMode::Majority); + let mut dr = status.mut_dr_auto_sync(); + dr.state_id += 1; + return; + } let mut dr = status.mut_dr_auto_sync(); dr.state_id += 1; - dr.set_state(state); + dr.set_state(state.unwrap()); dr.available_stores = available_stores; } + pub fn switch_to_drautosync_mode(&self) { + let mut cluster = self.cluster.wl(); + let status = cluster.replication_status.as_mut().unwrap(); + status.set_mode(ReplicationMode::DrAutoSync); + let mut dr = 
status.mut_dr_auto_sync(); + dr.state_id += 1; + } + pub fn region_replication_status(&self, region_id: u64) -> RegionReplicationStatus { self.cluster .rl() diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index 367ac63aabb..38054c1a995 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -34,6 +34,18 @@ fn run_cluster(cluster: &mut Cluster) { cluster.must_put(b"k1", b"v0"); } +fn prepare_labels(cluster: &mut Cluster) { + cluster.add_label(1, "dc", "dc1"); + cluster.add_label(2, "dc", "dc1"); + cluster.add_label(3, "dc", "dc2"); + cluster.add_label(1, "zone", "z1"); + cluster.add_label(2, "zone", "z2"); + cluster.add_label(3, "zone", "z3"); + cluster.add_label(1, "host", "h1"); + cluster.add_label(2, "host", "h2"); + cluster.add_label(3, "host", "h3"); +} + /// When using DrAutoSync replication mode, data should be replicated to /// different labels before committed. 
#[test] @@ -119,7 +131,7 @@ fn test_sync_recover_after_apply_snapshot() { // swith to async cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); thread::sleep(Duration::from_millis(100)); @@ -136,7 +148,7 @@ fn test_sync_recover_after_apply_snapshot() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::SyncRecover, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); // Add node 3 back, snapshot will apply cluster.clear_send_filters(); @@ -265,7 +277,7 @@ fn test_switching_replication_mode() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); thread::sleep(Duration::from_millis(100)); @@ -275,7 +287,7 @@ fn test_switching_replication_mode() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::SyncRecover, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); let mut request = new_request( region.get_id(), @@ -331,7 +343,7 @@ fn test_replication_mode_allowlist() { run_cluster(&mut cluster); cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![1]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![1]); thread::sleep(Duration::from_millis(100)); // 2,3 are paused, so it should not be able to write. @@ -357,7 +369,7 @@ fn test_replication_mode_allowlist() { // clear allowlist. 
cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); } @@ -456,6 +468,68 @@ fn test_migrate_replication_mode() { assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); } +#[test] +fn test_migrate_majority_to_drautosync() { + // 1. start cluster, enable dr-auto-sync and set labels. + let mut cluster = new_server_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_gc_threshold = 10; + prepare_labels(&mut cluster); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(b"k1", b"v0"); + cluster.pd_client.configure_dr_auto_sync("dc"); + thread::sleep(Duration::from_millis(100)); + let region = cluster.get_region(b"k1"); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_put_cf_cmd("default", b"k2", b"v2")], + false, + ); + request.mut_header().set_peer(new_peer(1, 1)); + let (cb, mut rx) = make_cb(&request); + cluster + .sim + .rl() + .async_command_on_node(1, request, cb) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_millis(100)).is_ok(), true); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); + let state = cluster.pd_client.region_replication_status(region.get_id()); + assert_eq!(state.state_id, 1); + assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); + + // 2. swith to marjority mode. + cluster.pd_client.switch_replication_mode(None, vec![]); + thread::sleep(Duration::from_millis(150)); + + // 3. spilt the region and make a new region, the regions status must be + // SimpleMajority. 
+ cluster.must_split(®ion, b"m1"); + thread::sleep(Duration::from_millis(150)); + cluster.must_put(b"n4", b"v4"); + must_get_equal(&cluster.get_engine(1), b"n4", b"v4"); + let region_m = cluster.get_region(b"n4"); + let region_k = cluster.get_region(b"k1"); + + // 4. switch to dy-auto-sync mode, the new region generated at marjority mode + // becomes IntegrityOverLabel again. + cluster.pd_client.switch_to_drautosync_mode(); + thread::sleep(Duration::from_millis(100)); + let state_m = cluster + .pd_client + .region_replication_status(region_m.get_id()); + let state_k = cluster + .pd_client + .region_replication_status(region_k.get_id()); + assert_eq!(state_m.state_id, 3); + assert_eq!(state_m.state, RegionReplicationState::IntegrityOverLabel); + assert_eq!(state_k.state_id, 3); + assert_eq!(state_k.state, RegionReplicationState::IntegrityOverLabel); +} + /// Tests if labels are loaded correctly after rolling start. #[test] fn test_loading_label_after_rolling_start() { From 16454ca067e0a10d0b1761210fd4af448b655ac0 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 26 Oct 2023 18:17:04 +0800 Subject: [PATCH 097/220] Revert "import: write RPC will check region epoch before continue" (#15787) (#15792) close tikv/tikv#15791 Signed-off-by: lance6716 Co-authored-by: lance6716 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Makefile | 8 - .../src/operation/command/write/ingest.rs | 9 +- components/raftstore/src/store/fsm/store.rs | 87 ++++++-- components/raftstore/src/store/msg.rs | 6 + .../raftstore/src/store/worker/cleanup.rs | 19 +- .../raftstore/src/store/worker/cleanup_sst.rs | 120 +++++++++++- components/server/src/server.rs | 2 - components/server/src/server2.rs | 2 - components/sst_importer/src/import_file.rs | 49 ++--- components/sst_importer/src/lib.rs | 2 +- components/sst_importer/src/sst_importer.rs | 8 +- components/test_raftstore-v2/src/server.rs | 1 - components/test_raftstore/src/server.rs | 1 - 
src/import/sst_service.rs | 185 +----------------- 14 files changed, 228 insertions(+), 271 deletions(-) diff --git a/Makefile b/Makefile index ce8d4e8b793..bb1d7316e1b 100644 --- a/Makefile +++ b/Makefile @@ -406,14 +406,6 @@ docker_test: ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ make test -docker_shell: - docker build -f Dockerfile.test \ - -t ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ - . - docker run -it -v $(shell pwd):/tikv \ - ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ - /bin/bash - ## The driver for script/run-cargo.sh ## ---------------------------------- diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 3d39c9a7369..e963434fe83 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -43,11 +43,6 @@ impl Store { let import_size = box_try!(ctx.sst_importer.get_total_size()); STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(ctx.sst_importer.list_ssts()); - // filter old version SSTs - let ssts: Vec<_> = ssts - .into_iter() - .filter(|sst| sst.api_version >= sst_importer::API_VERSION_2) - .collect(); if ssts.is_empty() { return Ok(()); } @@ -55,9 +50,9 @@ impl Store { let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); for sst in ssts { region_ssts - .entry(sst.meta.get_region_id()) + .entry(sst.get_region_id()) .or_default() - .push(sst.meta); + .push(sst); } let ranges = ctx.sst_importer.ranges_in_import(); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3a22ef8434d..aa8fa7c318e 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -36,13 +36,14 @@ use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use 
kvproto::{ + import_sstpb::{SstMeta, SwitchMode}, metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; -use pd_client::{Feature, FeatureGate, PdClient}; +use pd_client::{metrics::STORE_SIZE_EVENT_INT_VEC, Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; use resource_control::{channel::unbounded, ResourceGroupManager}; @@ -812,6 +813,9 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } StoreMsg::CompactedEvent(event) => self.on_compaction_finished(event), + StoreMsg::ValidateSstResult { invalid_ssts } => { + self.on_validate_sst_result(invalid_ssts) + } StoreMsg::ClearRegionSizeInRange { start_key, end_key } => { self.clear_region_size_in_range(&start_key, &end_key) } @@ -1651,7 +1655,12 @@ impl RaftBatchSystem { ); let compact_runner = CompactRunner::new(engines.kv.clone()); - let cleanup_sst_runner = CleanupSstRunner::new(Arc::clone(&importer)); + let cleanup_sst_runner = CleanupSstRunner::new( + meta.get_id(), + self.router.clone(), + Arc::clone(&importer), + Arc::clone(&pd_client), + ); let gc_snapshot_runner = GcSnapshotRunner::new( meta.get_id(), self.router.clone(), // RaftRouter @@ -2754,8 +2763,44 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { + fn on_validate_sst_result(&mut self, ssts: Vec) { + if ssts.is_empty() || self.ctx.importer.get_mode() == SwitchMode::Import { + return; + } + // A stale peer can still ingest a stale Sst before it is + // destroyed. We need to make sure that no stale peer exists. 
+ let mut delete_ssts = Vec::new(); + { + let meta = self.ctx.store_meta.lock().unwrap(); + for sst in ssts { + if !meta.regions.contains_key(&sst.get_region_id()) { + delete_ssts.push(sst); + } + } + } + if delete_ssts.is_empty() { + return; + } + + let task = CleanupSstTask::DeleteSst { ssts: delete_ssts }; + if let Err(e) = self + .ctx + .cleanup_scheduler + .schedule(CleanupTask::CleanupSst(task)) + { + error!( + "schedule to delete ssts failed"; + "store_id" => self.fsm.store.id, + "err" => ?e, + ); + } + } + fn on_cleanup_import_sst(&mut self) -> Result<()> { let mut delete_ssts = Vec::new(); + let mut validate_ssts = Vec::new(); + let import_size = box_try!(self.ctx.importer.get_total_size()); + STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(self.ctx.importer.list_ssts()); if ssts.is_empty() { @@ -2764,22 +2809,15 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER { let meta = self.ctx.store_meta.lock().unwrap(); for sst in ssts { - if sst.api_version < sst_importer::API_VERSION_2 { - // SST of old versions are created by old TiKV and have different prerequisite - // we can't delete them here. They can only be deleted manually - continue; - } - if let Some(r) = meta.regions.get(&sst.meta.get_region_id()) { + if let Some(r) = meta.regions.get(&sst.get_region_id()) { let region_epoch = r.get_region_epoch(); - if util::is_epoch_stale(sst.meta.get_region_epoch(), region_epoch) { + if util::is_epoch_stale(sst.get_region_epoch(), region_epoch) { // If the SST epoch is stale, it will not be ingested anymore. - delete_ssts.push(sst.meta); + delete_ssts.push(sst); } } else { - // The write RPC of import sst service have make sure the region do exist at the - // write time, and now the region is not found, sst can be - // deleted because it won't be used by ingest in future. - delete_ssts.push(sst.meta); + // If the peer doesn't exist, we need to validate the SST through PD. 
+ validate_ssts.push(sst); } } } @@ -2799,6 +2837,27 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } + // When there is an import job running, the region which this sst belongs may + // has not been split from the origin region because the apply thread is so busy + // that it can not apply SplitRequest as soon as possible. So we can not + // delete this sst file. + if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { + let task = CleanupSstTask::ValidateSst { + ssts: validate_ssts, + }; + if let Err(e) = self + .ctx + .cleanup_scheduler + .schedule(CleanupTask::CleanupSst(task)) + { + error!( + "schedule to validate ssts failed"; + "store_id" => self.fsm.store.id, + "err" => ?e, + ); + } + } + Ok(()) } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a92e5169549..a33ca0e476e 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -10,6 +10,7 @@ use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; use kvproto::{ brpb::CheckAdminResponse, + import_sstpb::SstMeta, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, metapb::RegionEpoch, @@ -823,6 +824,10 @@ where { RaftMessage(InspectedRaftMessage), + ValidateSstResult { + invalid_ssts: Vec, + }, + // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. ClearRegionSizeInRange { @@ -879,6 +884,7 @@ where write!(fmt, "Store {} is unreachable", store_id) } StoreMsg::CompactedEvent(ref event) => write!(fmt, "CompactedEvent cf {}", event.cf()), + StoreMsg::ValidateSstResult { .. 
} => write!(fmt, "Validate SST Result"), StoreMsg::ClearRegionSizeInRange { ref start_key, ref end_key, diff --git a/components/raftstore/src/store/worker/cleanup.rs b/components/raftstore/src/store/worker/cleanup.rs index 726b7abe5ce..632e85f40cc 100644 --- a/components/raftstore/src/store/worker/cleanup.rs +++ b/components/raftstore/src/store/worker/cleanup.rs @@ -3,6 +3,7 @@ use std::fmt::{self, Display, Formatter}; use engine_traits::{KvEngine, RaftEngine}; +use pd_client::PdClient; use tikv_util::worker::Runnable; use super::{ @@ -10,6 +11,7 @@ use super::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, }; +use crate::store::StoreRouter; pub enum Task { Compact(CompactTask), @@ -27,26 +29,29 @@ impl Display for Task { } } -pub struct Runner +pub struct Runner where E: KvEngine, R: RaftEngine, + S: StoreRouter, { compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, } -impl Runner +impl Runner where E: KvEngine, R: RaftEngine, + C: PdClient, + S: StoreRouter, { pub fn new( compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, - ) -> Runner { + ) -> Runner { Runner { compact, cleanup_sst, @@ -55,10 +60,12 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where E: KvEngine, R: RaftEngine, + C: PdClient, + S: StoreRouter, { type Task = Task; diff --git a/components/raftstore/src/store/worker/cleanup_sst.rs b/components/raftstore/src/store/worker/cleanup_sst.rs index 44f188e6f8f..8174b872f4b 100644 --- a/components/raftstore/src/store/worker/cleanup_sst.rs +++ b/components/raftstore/src/store/worker/cleanup_sst.rs @@ -1,30 +1,62 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{fmt, sync::Arc}; +use std::{error::Error, fmt, marker::PhantomData, sync::Arc}; -use kvproto::import_sstpb::SstMeta; +use engine_traits::KvEngine; +use kvproto::{import_sstpb::SstMeta, metapb::Region}; +use pd_client::PdClient; use sst_importer::SstImporter; -use tikv_util::worker::Runnable; +use tikv_util::{error, worker::Runnable}; + +use crate::store::{util::is_epoch_stale, StoreMsg, StoreRouter}; + +type Result = std::result::Result>; pub enum Task { DeleteSst { ssts: Vec }, + ValidateSst { ssts: Vec }, } impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Task::DeleteSst { ref ssts } => write!(f, "Delete {} ssts", ssts.len()), + Task::ValidateSst { ref ssts } => write!(f, "Validate {} ssts", ssts.len()), } } } -pub struct Runner { +pub struct Runner +where + EK: KvEngine, + S: StoreRouter, +{ + store_id: u64, + store_router: S, importer: Arc, + pd_client: Arc, + _engine: PhantomData, } -impl Runner { - pub fn new(importer: Arc) -> Runner { - Runner { importer } +impl Runner +where + EK: KvEngine, + C: PdClient, + S: StoreRouter, +{ + pub fn new( + store_id: u64, + store_router: S, + importer: Arc, + pd_client: Arc, + ) -> Runner { + Runner { + store_id, + store_router, + importer, + pd_client, + _engine: PhantomData, + } } /// Deletes SST files from the importer. @@ -33,9 +65,78 @@ impl Runner { let _ = self.importer.delete(sst); } } + + fn get_region_by_meta(&self, sst: &SstMeta) -> Result { + // The SST meta has been delivered with a range, use it directly. + // For now, no case will reach this. But this still could be a guard for + // reducing the superise in the future... + if !sst.get_range().get_start().is_empty() || !sst.get_range().get_end().is_empty() { + return self + .pd_client + .get_region(sst.get_range().get_start()) + .map_err(Into::into); + } + // Once there isn't range provided. 
+ let query_by_start_key_of_full_meta = || { + let start_key = self + .importer + .load_start_key_by_meta::(sst)? + .ok_or_else(|| -> Box { + "failed to load start key from sst, the sst might be empty".into() + })?; + let region = self.pd_client.get_region(&start_key)?; + Result::Ok(region) + }; + query_by_start_key_of_full_meta() + .map_err(|err| + format!("failed to load full sst meta from disk for {:?} and there isn't extra information provided: {err}", sst.get_uuid()).into() + ) + } + + /// Validates whether the SST is stale or not. + fn handle_validate_sst(&self, ssts: Vec) { + let store_id = self.store_id; + let mut invalid_ssts = Vec::new(); + for sst in ssts { + match self.get_region_by_meta(&sst) { + Ok(r) => { + // The region id may or may not be the same as the + // SST file, but it doesn't matter, because the + // epoch of a range will not decrease anyway. + if is_epoch_stale(r.get_region_epoch(), sst.get_region_epoch()) { + // Region has not been updated. + continue; + } + if r.get_id() == sst.get_region_id() + && r.get_peers().iter().any(|p| p.get_store_id() == store_id) + { + // The SST still belongs to this store. + continue; + } + invalid_ssts.push(sst); + } + Err(e) => { + error!("get region failed"; "err" => %e); + } + } + } + + // We need to send back the result to check for the stale + // peer, which may ingest the stale SST before it is + // destroyed. 
+ let msg = StoreMsg::ValidateSstResult { invalid_ssts }; + if let Err(e) = self.store_router.send(msg) { + error!(%e; "send validate sst result failed"); + } + } } -impl Runnable for Runner { +impl Runnable for Runner +where + EK: KvEngine, + C: PdClient, + S: StoreRouter, +{ type Task = Task; fn run(&mut self, task: Task) { @@ -43,6 +144,9 @@ impl Runnable for Runner { Task::DeleteSst { ssts } => { self.handle_delete_sst(ssts); } + Task::ValidateSst { ssts } => { + self.handle_validate_sst(ssts); + } } } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index a4b6276a587..8d44890e5a6 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -366,7 +366,6 @@ where router.clone(), config.coprocessor.clone(), )); - let region_info_accessor = RegionInfoAccessor::new(coprocessor_host.as_mut().unwrap()); // Initialize concurrency manager @@ -1081,7 +1080,6 @@ where servers.importer.clone(), None, self.resource_manager.clone(), - Arc::new(self.region_info_accessor.clone()), ); let import_cfg_mgr = import_service.get_config_manager(); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 65d02f58c08..2593035618d 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -948,7 +948,6 @@ where backup_worker.start(backup_endpoint); // Import SST service. 
- let region_info_accessor = self.region_info_accessor.as_ref().unwrap().clone(); let import_service = ImportSstService::new( self.core.config.import.clone(), self.core.config.raft_store.raft_entry_max_size, @@ -957,7 +956,6 @@ where servers.importer.clone(), Some(self.router.as_ref().unwrap().store_meta().clone()), self.resource_manager.clone(), - Arc::new(region_info_accessor), ); let import_cfg_mgr = import_service.get_config_manager(); diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index ae81cf01646..b270d26a411 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -440,7 +440,7 @@ impl ImportDir { Ok(real_key.map(ToOwned::to_owned)) } - pub fn list_ssts(&self) -> Result> { + pub fn list_ssts(&self) -> Result> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? { let e = e?; @@ -458,33 +458,20 @@ impl ImportDir { } const SST_SUFFIX: &str = ".sst"; -// version 2: compared to version 1 which is the default version, we will check -// epoch of request and local region in write API. -pub const API_VERSION_2: i32 = 2; - -/// sst_meta_to_path will encode the filepath with default api version (current -/// is 2). So when the SstMeta is created in old version of TiKV and filepath -/// will not correspond to the real file, in the deletion logic we can't remove -/// these files. 
+ pub fn sst_meta_to_path(meta: &SstMeta) -> Result { Ok(PathBuf::from(format!( - "{}_{}_{}_{}_{}_{}{}", + "{}_{}_{}_{}_{}{}", UuidBuilder::from_slice(meta.get_uuid())?.build(), meta.get_region_id(), meta.get_region_epoch().get_conf_ver(), meta.get_region_epoch().get_version(), meta.get_cf_name(), - API_VERSION_2, SST_SUFFIX, ))) } -pub struct SstMetaWithApiVersion { - pub meta: SstMeta, - pub api_version: i32, // in future we may move api_version into SstMeta -} - -pub fn parse_meta_from_path>(path: P) -> Result { +pub fn parse_meta_from_path>(path: P) -> Result { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { Some(name) => name, @@ -513,11 +500,7 @@ pub fn parse_meta_from_path>(path: P) -> Result 5 { - api_version = elems[5].parse()?; - } - Ok(SstMetaWithApiVersion { meta, api_version }) + Ok(meta) } #[cfg(test)] @@ -537,12 +520,11 @@ mod test { meta.mut_region_epoch().set_version(3); let path = sst_meta_to_path(&meta).unwrap(); - let expected_path = format!("{}_1_2_3_default_2.sst", uuid); + let expected_path = format!("{}_1_2_3_default.sst", uuid); assert_eq!(path.to_str().unwrap(), &expected_path); - let meta_with_ver = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, meta_with_ver.meta); - assert_eq!(2, meta_with_ver.api_version); + let new_meta = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, new_meta); } #[test] @@ -561,9 +543,8 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let meta_with_ver = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, meta_with_ver.meta); - assert_eq!(1, meta_with_ver.api_version); + let new_meta = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, new_meta); } #[cfg(feature = "test-engines-rocksdb")] @@ -615,20 +596,14 @@ mod test { w.finish().unwrap(); dp.save(arcmgr.as_deref()).unwrap(); let mut ssts = dir.list_ssts().unwrap(); - ssts.iter_mut().for_each(|meta_with_ver| { - let meta = &mut meta_with_ver.meta; + 
ssts.iter_mut().for_each(|meta| { let start = dir .load_start_key_by_meta::(meta, arcmgr.clone()) .unwrap() .unwrap(); meta.mut_range().set_start(start) }); - assert_eq!( - ssts.iter() - .map(|meta_with_ver| { meta_with_ver.meta.clone() }) - .collect(), - vec![meta] - ); + assert_eq!(ssts, vec![meta]); } #[test] diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index ff137005b09..0cfc3bab774 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -27,7 +27,7 @@ pub mod sst_importer; pub use self::{ config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, - import_file::{sst_meta_to_path, API_VERSION_2}, + import_file::sst_meta_to_path, import_mode2::range_overlaps, sst_importer::SstImporter, sst_writer::{RawSstWriter, TxnSstWriter}, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index f36016eb309..5530862e6a3 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -51,7 +51,7 @@ use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ caching::cache_map::{CacheMap, ShareOwned}, - import_file::{ImportDir, ImportFile, SstMetaWithApiVersion}, + import_file::{ImportDir, ImportFile}, import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, import_mode2::{HashRange, ImportModeSwitcherV2}, metrics::*, @@ -1387,7 +1387,7 @@ impl SstImporter { /// List the basic information of the current SST files. /// The information contains UUID, region ID, region Epoch. /// Other fields may be left blank. 
- pub fn list_ssts(&self) -> Result> { + pub fn list_ssts(&self) -> Result> { self.dir.list_ssts() } @@ -1587,9 +1587,9 @@ mod tests { for sst in &ssts { ingested .iter() - .find(|s| s.get_uuid() == sst.meta.get_uuid()) + .find(|s| s.get_uuid() == sst.get_uuid()) .unwrap(); - dir.delete(&sst.meta, key_manager.as_deref()).unwrap(); + dir.delete(sst, key_manager.as_deref()).unwrap(); } assert!(dir.list_ssts().unwrap().is_empty()); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 5073304e17a..299e93eb746 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -561,7 +561,6 @@ impl ServerCluster { Arc::clone(&importer), Some(store_meta), resource_manager.clone(), - Arc::new(region_info_accessor.clone()), ); // Create deadlock service. diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index f5c64fa86e9..0002f36d647 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -451,7 +451,6 @@ impl ServerCluster { Arc::clone(&importer), None, resource_manager.clone(), - Arc::new(region_info_accessor.clone()), ); // Create deadlock service. 
diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 6f9f22c9cb4..68403e226f8 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -15,7 +15,6 @@ use std::{ use engine_traits::{CompactExt, MiscExt, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; use futures::{sink::SinkExt, stream::TryStreamExt, FutureExt, TryFutureExt}; -use futures_executor::block_on; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -28,9 +27,7 @@ use kvproto::{ WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, - metapb::RegionEpoch, }; -use raftstore::{coprocessor::RegionInfoProvider, store::util::is_epoch_stale, RegionInfoAccessor}; use raftstore_v2::StoreMeta; use resource_control::{with_resource_limiter, ResourceGroupManager}; use sst_importer::{ @@ -42,7 +39,7 @@ use tikv_kv::{ }; use tikv_util::{ config::ReadableSize, - future::{create_stream_with_buffer, paired_future_callback}, + future::create_stream_with_buffer, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, HandyRwLock, @@ -127,7 +124,6 @@ pub struct ImportSstService { limiter: Limiter, task_slots: Arc>>, raft_entry_max_size: ReadableSize, - region_info_accessor: Arc, writer: raft_writer::ThrottledTlsEngineWriter, @@ -322,7 +318,6 @@ impl ImportSstService { importer: Arc, store_meta: Option>>>, resource_manager: Option>, - region_info_accessor: Arc, ) -> Self { let props = tikv_util::thread_group::current_properties(); let eng = Mutex::new(engine.clone()); @@ -370,7 +365,6 @@ impl ImportSstService { limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), raft_entry_max_size, - region_info_accessor, writer, store_meta, resource_manager, @@ -681,59 +675,6 @@ impl ImportSstService { } } -fn check_local_region_stale( - region_id: u64, - epoch: &RegionEpoch, - region_info_accessor: Arc, -) -> Result<()> { - let (cb, f) = paired_future_callback(); - 
region_info_accessor - .find_region_by_id(region_id, cb) - .map_err(|e| { - Error::Engine(format!("failed to find region {} err {:?}", region_id, e).into()) - })?; - match block_on(f)? { - Some(local_region_info) => { - let local_region_epoch = local_region_info.region.region_epoch.unwrap(); - - // TODO(lance6717): we should only need to check conf_ver because we require all - // peers have SST on the disk, and does not care about which one is - // leader. But since check_sst_for_ingestion also checks epoch version, - // we just keep it here for now. - - // when local region epoch is stale, client can retry write later - if is_epoch_stale(&local_region_epoch, epoch) { - return Err(Error::Engine( - format!("request region {} is ahead of local region, local epoch {:?}, request epoch {:?}, please retry write later", - region_id, local_region_epoch, epoch).into(), - )); - } - // when local region epoch is ahead, client need to rescan region from PD to get - // latest region later - if is_epoch_stale(epoch, &local_region_epoch) { - return Err(Error::Engine( - format!("request region {} is staler than local region, local epoch {:?}, request epoch {:?}, please rescan region later", - region_id, local_region_epoch, epoch).into(), - )); - } - - // not match means to rescan - Ok(()) - } - None => { - // when region not found, we can't tell whether it's stale or ahead, so we just - // return the safest case - Err(Error::Engine( - format!( - "region {} is not found, please rescan region later", - region_id - ) - .into(), - )) - } - } -} - #[macro_export] macro_rules! impl_write { ($fn:ident, $req_ty:ident, $resp_ty:ident, $chunk_ty:ident, $writer_fn:ident) => { @@ -745,7 +686,6 @@ macro_rules! 
impl_write { ) { let import = self.importer.clone(); let tablets = self.tablets.clone(); - let region_info_accessor = self.region_info_accessor.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -773,15 +713,7 @@ macro_rules! impl_write { } _ => return Err(Error::InvalidChunk), }; - // wait the region epoch on this TiKV to catch up with the epoch - // in request, which comes from PD and represents the majority - // peers' status. let region_id = meta.get_region_id(); - check_local_region_stale( - region_id, - meta.get_region_epoch(), - region_info_accessor, - )?; let tablet = match tablets.get(region_id) { Some(t) => t, None => { @@ -1455,30 +1387,19 @@ fn write_needs_restore(write: &[u8]) -> bool { #[cfg(test)] mod test { - use std::{ - collections::HashMap, - sync::{Arc, Mutex}, - }; + use std::collections::HashMap; use engine_traits::{CF_DEFAULT, CF_WRITE}; use kvproto::{ kvrpcpb::Context, - metapb::{Region, RegionEpoch}, + metapb::RegionEpoch, raft_cmdpb::{RaftCmdRequest, Request}, }; - use protobuf::{Message, SingularPtrField}; - use raft::StateRole::Follower; - use raftstore::{ - coprocessor::{region_info_accessor::Callback, RegionInfoProvider}, - RegionInfo, - }; + use protobuf::Message; use tikv_kv::{Modify, WriteData}; use txn_types::{Key, TimeStamp, Write, WriteBatchFlags, WriteType}; - use crate::{ - import::sst_service::{check_local_region_stale, RequestCollector}, - server::raftkv, - }; + use crate::{import::sst_service::RequestCollector, server::raftkv}; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1762,100 +1683,4 @@ mod test { } assert_eq!(total, 100); } - - #[test] - fn test_write_rpc_check_region_epoch() { - struct MockRegionInfoProvider { - map: Mutex>, - } - impl RegionInfoProvider for MockRegionInfoProvider { - fn find_region_by_id( - &self, - 
region_id: u64, - callback: Callback>, - ) -> Result<(), raftstore::coprocessor::Error> { - callback(self.map.lock().unwrap().get(®ion_id).cloned()); - Ok(()) - } - } - - let mock_provider = Arc::new(MockRegionInfoProvider { - map: Mutex::new(HashMap::new()), - }); - - let mut req_epoch = RegionEpoch { - conf_ver: 10, - version: 10, - ..Default::default() - }; - // test for region not found - let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); - assert!(result.is_err()); - // check error message contains "rescan region later", client will match this - // string pattern - assert!( - result - .unwrap_err() - .to_string() - .contains("rescan region later") - ); - - let mut local_region_info = RegionInfo { - region: Region { - id: 1, - region_epoch: SingularPtrField::some(req_epoch.clone()), - ..Default::default() - }, - role: Follower, - buckets: 1, - }; - mock_provider - .map - .lock() - .unwrap() - .insert(1, local_region_info.clone()); - // test the local region epoch is same as request - let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); - result.unwrap(); - - // test the local region epoch is ahead of request - local_region_info - .region - .region_epoch - .as_mut() - .unwrap() - .conf_ver = 11; - mock_provider - .map - .lock() - .unwrap() - .insert(1, local_region_info.clone()); - let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); - assert!(result.is_err()); - // check error message contains "rescan region later", client will match this - // string pattern - assert!( - result - .unwrap_err() - .to_string() - .contains("rescan region later") - ); - - req_epoch.conf_ver = 11; - let result = check_local_region_stale(1, &req_epoch, mock_provider.clone()); - result.unwrap(); - - // test the local region epoch is staler than request - req_epoch.version = 12; - let result = check_local_region_stale(1, &req_epoch, mock_provider); - assert!(result.is_err()); - // check error message contains 
"retry write later", client will match this - // string pattern - assert!( - result - .unwrap_err() - .to_string() - .contains("retry write later") - ); - } } From a2b2bd0ec5ad86155f47c37f0294a2749d74d7ad Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 27 Oct 2023 14:53:03 +0800 Subject: [PATCH 098/220] import: write RPC will check region epoch before continue (#15795) (#15856) close tikv/tikv#15003 Signed-off-by: lance6716 Co-authored-by: lance6716 --- Makefile | 8 + components/error_code/src/sst_importer.rs | 4 +- .../src/operation/command/write/ingest.rs | 9 +- components/raftstore/src/store/fsm/store.rs | 103 +++------ components/raftstore/src/store/msg.rs | 6 - .../raftstore/src/store/worker/cleanup.rs | 19 +- .../raftstore/src/store/worker/cleanup_sst.rs | 120 +---------- components/server/src/server.rs | 2 + components/server/src/server2.rs | 2 + components/sst_importer/src/errors.rs | 8 + components/sst_importer/src/import_file.rs | 50 +++-- components/sst_importer/src/lib.rs | 2 +- components/sst_importer/src/sst_importer.rs | 12 +- components/test_raftstore-v2/src/server.rs | 1 + components/test_raftstore/src/server.rs | 1 + src/import/sst_service.rs | 197 ++++++++++++++++-- 16 files changed, 300 insertions(+), 244 deletions(-) diff --git a/Makefile b/Makefile index bb1d7316e1b..ce8d4e8b793 100644 --- a/Makefile +++ b/Makefile @@ -406,6 +406,14 @@ docker_test: ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ make test +docker_shell: + docker build -f Dockerfile.test \ + -t ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ + . 
+ docker run -it -v $(shell pwd):/tikv \ + ${DEV_DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} \ + /bin/bash + ## The driver for script/run-cargo.sh ## ---------------------------------- diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 117400e8aff..9e568ee00c1 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -25,5 +25,7 @@ define_error_codes!( RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", ""), SUSPENDED => ("Suspended", "this request has been suspended.", - "Probably there are some export tools don't support exporting data inserted by `ingest`(say, snapshot backup). Check the user manual and stop them.") + "Probably there are some export tools don't support exporting data inserted by `ingest`(say, snapshot backup). Check the user manual and stop them."), + REQUEST_TOO_NEW => ("RequestTooNew", "", ""), + REQUEST_TOO_OLD => ("RequestTooOld", "", "") ); diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index e963434fe83..45247b3f36f 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -43,6 +43,11 @@ impl Store { let import_size = box_try!(ctx.sst_importer.get_total_size()); STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(ctx.sst_importer.list_ssts()); + // filter old version SSTs + let ssts: Vec<_> = ssts + .into_iter() + .filter(|sst| sst.1 >= sst_importer::API_VERSION_2) + .collect(); if ssts.is_empty() { return Ok(()); } @@ -50,9 +55,9 @@ impl Store { let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); for sst in ssts { region_ssts - .entry(sst.get_region_id()) + .entry(sst.0.get_region_id()) .or_default() - .push(sst); + .push(sst.0); } let ranges = ctx.sst_importer.ranges_in_import(); diff --git 
a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index aa8fa7c318e..2efcbf87b09 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -14,7 +14,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, }, - time::{Duration, Instant}, + time::{Duration, Instant, SystemTime}, u64, }; @@ -36,14 +36,13 @@ use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use kvproto::{ - import_sstpb::{SstMeta, SwitchMode}, metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; -use pd_client::{metrics::STORE_SIZE_EVENT_INT_VEC, Feature, FeatureGate, PdClient}; +use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; use resource_control::{channel::unbounded, ResourceGroupManager}; @@ -813,9 +812,6 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } StoreMsg::CompactedEvent(event) => self.on_compaction_finished(event), - StoreMsg::ValidateSstResult { invalid_ssts } => { - self.on_validate_sst_result(invalid_ssts) - } StoreMsg::ClearRegionSizeInRange { start_key, end_key } => { self.clear_region_size_in_range(&start_key, &end_key) } @@ -1655,12 +1651,7 @@ impl RaftBatchSystem { ); let compact_runner = CompactRunner::new(engines.kv.clone()); - let cleanup_sst_runner = CleanupSstRunner::new( - meta.get_id(), - self.router.clone(), - Arc::clone(&importer), - Arc::clone(&pd_client), - ); + let cleanup_sst_runner = CleanupSstRunner::new(Arc::clone(&importer)); let gc_snapshot_runner = GcSnapshotRunner::new( meta.get_id(), self.router.clone(), // RaftRouter @@ -2762,62 +2753,47 @@ impl<'a, EK: KvEngine, ER: 
RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } -impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { - fn on_validate_sst_result(&mut self, ssts: Vec) { - if ssts.is_empty() || self.ctx.importer.get_mode() == SwitchMode::Import { - return; - } - // A stale peer can still ingest a stale Sst before it is - // destroyed. We need to make sure that no stale peer exists. - let mut delete_ssts = Vec::new(); - { - let meta = self.ctx.store_meta.lock().unwrap(); - for sst in ssts { - if !meta.regions.contains_key(&sst.get_region_id()) { - delete_ssts.push(sst); - } - } - } - if delete_ssts.is_empty() { - return; - } - - let task = CleanupSstTask::DeleteSst { ssts: delete_ssts }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to delete ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } +// we will remove 1-week old version 1 SST files. +const VERSION_1_SST_CLEANUP_DURATION: Duration = Duration::from_secs(7 * 24 * 60 * 60); +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { fn on_cleanup_import_sst(&mut self) -> Result<()> { let mut delete_ssts = Vec::new(); - let mut validate_ssts = Vec::new(); - let import_size = box_try!(self.ctx.importer.get_total_size()); - STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(self.ctx.importer.list_ssts()); if ssts.is_empty() { return Ok(()); } + let now = SystemTime::now(); { let meta = self.ctx.store_meta.lock().unwrap(); for sst in ssts { - if let Some(r) = meta.regions.get(&sst.get_region_id()) { + if let Some(r) = meta.regions.get(&sst.0.get_region_id()) { let region_epoch = r.get_region_epoch(); - if util::is_epoch_stale(sst.get_region_epoch(), region_epoch) { + if util::is_epoch_stale(sst.0.get_region_epoch(), region_epoch) { // If the SST epoch is stale, it will not be ingested anymore. 
- delete_ssts.push(sst); + delete_ssts.push(sst.0); } + } else if sst.1 >= sst_importer::API_VERSION_2 { + // The write RPC of import sst service have make sure the region do exist at + // the write time, and now the region is not found, + // sst can be deleted because it won't be used by + // ingest in future. + delete_ssts.push(sst.0); } else { - // If the peer doesn't exist, we need to validate the SST through PD. - validate_ssts.push(sst); + // in the old protocol, we can't easily know if the SST will be used in the + // committed raft log, so we only delete the SST + // files that has not be modified for 1 week. + if let Ok(duration) = now.duration_since(sst.2) { + if duration > VERSION_1_SST_CLEANUP_DURATION { + warn!( + "found 1-week old SST file of version 1, will delete it"; + "sst_meta" => ?sst.0, + "last_modified" => ?sst.2 + ); + delete_ssts.push(sst.0); + } + } } } } @@ -2837,27 +2813,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - // When there is an import job running, the region which this sst belongs may - // has not been split from the origin region because the apply thread is so busy - // that it can not apply SplitRequest as soon as possible. So we can not - // delete this sst file. 
- if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { - let task = CleanupSstTask::ValidateSst { - ssts: validate_ssts, - }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to validate ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } - Ok(()) } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a33ca0e476e..a92e5169549 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -10,7 +10,6 @@ use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; use kvproto::{ brpb::CheckAdminResponse, - import_sstpb::SstMeta, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, metapb::RegionEpoch, @@ -824,10 +823,6 @@ where { RaftMessage(InspectedRaftMessage), - ValidateSstResult { - invalid_ssts: Vec, - }, - // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. ClearRegionSizeInRange { @@ -884,7 +879,6 @@ where write!(fmt, "Store {} is unreachable", store_id) } StoreMsg::CompactedEvent(ref event) => write!(fmt, "CompactedEvent cf {}", event.cf()), - StoreMsg::ValidateSstResult { .. 
} => write!(fmt, "Validate SST Result"), StoreMsg::ClearRegionSizeInRange { ref start_key, ref end_key, diff --git a/components/raftstore/src/store/worker/cleanup.rs b/components/raftstore/src/store/worker/cleanup.rs index 632e85f40cc..726b7abe5ce 100644 --- a/components/raftstore/src/store/worker/cleanup.rs +++ b/components/raftstore/src/store/worker/cleanup.rs @@ -3,7 +3,6 @@ use std::fmt::{self, Display, Formatter}; use engine_traits::{KvEngine, RaftEngine}; -use pd_client::PdClient; use tikv_util::worker::Runnable; use super::{ @@ -11,7 +10,6 @@ use super::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, }; -use crate::store::StoreRouter; pub enum Task { Compact(CompactTask), @@ -29,29 +27,26 @@ impl Display for Task { } } -pub struct Runner +pub struct Runner where E: KvEngine, R: RaftEngine, - S: StoreRouter, { compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, } -impl Runner +impl Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { pub fn new( compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, - ) -> Runner { + ) -> Runner { Runner { compact, cleanup_sst, @@ -60,12 +55,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { type Task = Task; diff --git a/components/raftstore/src/store/worker/cleanup_sst.rs b/components/raftstore/src/store/worker/cleanup_sst.rs index 8174b872f4b..44f188e6f8f 100644 --- a/components/raftstore/src/store/worker/cleanup_sst.rs +++ b/components/raftstore/src/store/worker/cleanup_sst.rs @@ -1,62 +1,30 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{error::Error, fmt, marker::PhantomData, sync::Arc}; +use std::{fmt, sync::Arc}; -use engine_traits::KvEngine; -use kvproto::{import_sstpb::SstMeta, metapb::Region}; -use pd_client::PdClient; +use kvproto::import_sstpb::SstMeta; use sst_importer::SstImporter; -use tikv_util::{error, worker::Runnable}; - -use crate::store::{util::is_epoch_stale, StoreMsg, StoreRouter}; - -type Result = std::result::Result>; +use tikv_util::worker::Runnable; pub enum Task { DeleteSst { ssts: Vec }, - ValidateSst { ssts: Vec }, } impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Task::DeleteSst { ref ssts } => write!(f, "Delete {} ssts", ssts.len()), - Task::ValidateSst { ref ssts } => write!(f, "Validate {} ssts", ssts.len()), } } } -pub struct Runner -where - EK: KvEngine, - S: StoreRouter, -{ - store_id: u64, - store_router: S, +pub struct Runner { importer: Arc, - pd_client: Arc, - _engine: PhantomData, } -impl Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ - pub fn new( - store_id: u64, - store_router: S, - importer: Arc, - pd_client: Arc, - ) -> Runner { - Runner { - store_id, - store_router, - importer, - pd_client, - _engine: PhantomData, - } +impl Runner { + pub fn new(importer: Arc) -> Runner { + Runner { importer } } /// Deletes SST files from the importer. @@ -65,78 +33,9 @@ where let _ = self.importer.delete(sst); } } - - fn get_region_by_meta(&self, sst: &SstMeta) -> Result { - // The SST meta has been delivered with a range, use it directly. - // For now, no case will reach this. But this still could be a guard for - // reducing the superise in the future... - if !sst.get_range().get_start().is_empty() || !sst.get_range().get_end().is_empty() { - return self - .pd_client - .get_region(sst.get_range().get_start()) - .map_err(Into::into); - } - // Once there isn't range provided. - let query_by_start_key_of_full_meta = || { - let start_key = self - .importer - .load_start_key_by_meta::(sst)? 
- .ok_or_else(|| -> Box { - "failed to load start key from sst, the sst might be empty".into() - })?; - let region = self.pd_client.get_region(&start_key)?; - Result::Ok(region) - }; - query_by_start_key_of_full_meta() - .map_err(|err| - format!("failed to load full sst meta from disk for {:?} and there isn't extra information provided: {err}", sst.get_uuid()).into() - ) - } - - /// Validates whether the SST is stale or not. - fn handle_validate_sst(&self, ssts: Vec) { - let store_id = self.store_id; - let mut invalid_ssts = Vec::new(); - for sst in ssts { - match self.get_region_by_meta(&sst) { - Ok(r) => { - // The region id may or may not be the same as the - // SST file, but it doesn't matter, because the - // epoch of a range will not decrease anyway. - if is_epoch_stale(r.get_region_epoch(), sst.get_region_epoch()) { - // Region has not been updated. - continue; - } - if r.get_id() == sst.get_region_id() - && r.get_peers().iter().any(|p| p.get_store_id() == store_id) - { - // The SST still belongs to this store. - continue; - } - invalid_ssts.push(sst); - } - Err(e) => { - error!("get region failed"; "err" => %e); - } - } - } - - // We need to send back the result to check for the stale - // peer, which may ingest the stale SST before it is - // destroyed. 
- let msg = StoreMsg::ValidateSstResult { invalid_ssts }; - if let Err(e) = self.store_router.send(msg) { - error!(%e; "send validate sst result failed"); - } - } } -impl Runnable for Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { @@ -144,9 +43,6 @@ where Task::DeleteSst { ssts } => { self.handle_delete_sst(ssts); } - Task::ValidateSst { ssts } => { - self.handle_validate_sst(ssts); - } } } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 8d44890e5a6..a4b6276a587 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -366,6 +366,7 @@ where router.clone(), config.coprocessor.clone(), )); + let region_info_accessor = RegionInfoAccessor::new(coprocessor_host.as_mut().unwrap()); // Initialize concurrency manager @@ -1080,6 +1081,7 @@ where servers.importer.clone(), None, self.resource_manager.clone(), + Arc::new(self.region_info_accessor.clone()), ); let import_cfg_mgr = import_service.get_config_manager(); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 2593035618d..65d02f58c08 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -948,6 +948,7 @@ where backup_worker.start(backup_endpoint); // Import SST service. 
+ let region_info_accessor = self.region_info_accessor.as_ref().unwrap().clone(); let import_service = ImportSstService::new( self.core.config.import.clone(), self.core.config.raft_store.raft_entry_max_size, @@ -956,6 +957,7 @@ where servers.importer.clone(), Some(self.router.as_ref().unwrap().store_meta().clone()), self.resource_manager.clone(), + Arc::new(region_info_accessor), ); let import_cfg_mgr = import_service.get_config_manager(); diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index acca7523427..e03288bb3e1 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -118,6 +118,12 @@ pub enum Error { #[error("Importing a SST file with imcompatible api version")] IncompatibleApiVersion, + #[error("{0}, please retry write later")] + RequestTooNew(String), + + #[error("{0}, please rescan region later")] + RequestTooOld(String), + #[error("Key mode mismatched with the request mode, writer: {:?}, storage: {:?}, key: {}", .writer, .storage_api_version, .key)] InvalidKeyMode { writer: SstWriterType, @@ -213,6 +219,8 @@ impl ErrorCodeExt for Error { Error::InvalidKeyMode { .. } => error_code::sst_importer::INVALID_KEY_MODE, Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, Error::Suspended { .. 
} => error_code::sst_importer::SUSPENDED, + Error::RequestTooNew(_) => error_code::sst_importer::REQUEST_TOO_NEW, + Error::RequestTooOld(_) => error_code::sst_importer::REQUEST_TOO_OLD, } } } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index b270d26a411..b3b7c051ce4 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -6,6 +6,7 @@ use std::{ io::{self, Write}, path::{Path, PathBuf}, sync::Arc, + time::SystemTime, }; use api_version::api_v2::TIDB_RANGES_COMPLEMENT; @@ -440,7 +441,7 @@ impl ImportDir { Ok(real_key.map(ToOwned::to_owned)) } - pub fn list_ssts(&self) -> Result> { + pub fn list_ssts(&self) -> Result> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? { let e = e?; @@ -449,7 +450,10 @@ impl ImportDir { } let path = e.path(); match parse_meta_from_path(&path) { - Ok(sst) => ssts.push(sst), + Ok(sst) => { + let last_modify = e.metadata()?.modified()?; + ssts.push((sst.0, sst.1, last_modify)) + } Err(e) => error!(%e; "path_to_sst_meta failed"; "path" => %path.display(),), } } @@ -458,20 +462,28 @@ impl ImportDir { } const SST_SUFFIX: &str = ".sst"; - +// version 2: compared to version 1 which is the default version, we will check +// epoch of request and local region in write API. +pub const API_VERSION_2: i32 = 2; + +/// sst_meta_to_path will encode the filepath with default api version (current +/// is 2). So when the SstMeta is created in old version of TiKV and filepath +/// will not correspond to the real file, in the deletion logic we can't remove +/// these files. 
pub fn sst_meta_to_path(meta: &SstMeta) -> Result { Ok(PathBuf::from(format!( - "{}_{}_{}_{}_{}{}", + "{}_{}_{}_{}_{}_{}{}", UuidBuilder::from_slice(meta.get_uuid())?.build(), meta.get_region_id(), meta.get_region_epoch().get_conf_ver(), meta.get_region_epoch().get_version(), meta.get_cf_name(), + API_VERSION_2, SST_SUFFIX, ))) } -pub fn parse_meta_from_path>(path: P) -> Result { +pub fn parse_meta_from_path>(path: P) -> Result<(SstMeta, i32)> { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { Some(name) => name, @@ -500,7 +512,11 @@ pub fn parse_meta_from_path>(path: P) -> Result { // cf_name to path. meta.set_cf_name(elems[4].to_owned()); } - Ok(meta) + let mut api_version = 1; + if elems.len() > 5 { + api_version = elems[5].parse()?; + } + Ok((meta, api_version)) } #[cfg(test)] @@ -520,11 +536,12 @@ mod test { meta.mut_region_epoch().set_version(3); let path = sst_meta_to_path(&meta).unwrap(); - let expected_path = format!("{}_1_2_3_default.sst", uuid); + let expected_path = format!("{}_1_2_3_default_2.sst", uuid); assert_eq!(path.to_str().unwrap(), &expected_path); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.0); + assert_eq!(2, meta_with_ver.1); } #[test] @@ -543,8 +560,9 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.0); + assert_eq!(1, meta_with_ver.1); } #[cfg(feature = "test-engines-rocksdb")] @@ -596,14 +614,20 @@ mod test { w.finish().unwrap(); dp.save(arcmgr.as_deref()).unwrap(); let mut ssts = dir.list_ssts().unwrap(); - ssts.iter_mut().for_each(|meta| { + ssts.iter_mut().for_each(|meta_with_ver| { + let meta = &mut meta_with_ver.0; let start = dir .load_start_key_by_meta::(meta, 
arcmgr.clone()) .unwrap() .unwrap(); meta.mut_range().set_start(start) }); - assert_eq!(ssts, vec![meta]); + assert_eq!( + ssts.iter() + .map(|meta_with_ver| { meta_with_ver.0.clone() }) + .collect(), + vec![meta] + ); } #[test] diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index 0cfc3bab774..ff137005b09 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -27,7 +27,7 @@ pub mod sst_importer; pub use self::{ config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, - import_file::sst_meta_to_path, + import_file::{sst_meta_to_path, API_VERSION_2}, import_mode2::range_overlaps, sst_importer::SstImporter, sst_writer::{RawSstWriter, TxnSstWriter}, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 5530862e6a3..d97dddcb642 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -11,7 +11,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, }, - time::Duration, + time::{Duration, SystemTime}, }; use collections::HashSet; @@ -1385,9 +1385,9 @@ impl SstImporter { } /// List the basic information of the current SST files. - /// The information contains UUID, region ID, region Epoch. - /// Other fields may be left blank. - pub fn list_ssts(&self) -> Result> { + /// The information contains UUID, region ID, region Epoch, api version, + /// last modified time. Other fields may be left blank. 
+ pub fn list_ssts(&self) -> Result> { self.dir.list_ssts() } @@ -1587,9 +1587,9 @@ mod tests { for sst in &ssts { ingested .iter() - .find(|s| s.get_uuid() == sst.get_uuid()) + .find(|s| s.get_uuid() == sst.0.get_uuid()) .unwrap(); - dir.delete(sst, key_manager.as_deref()).unwrap(); + dir.delete(&sst.0, key_manager.as_deref()).unwrap(); } assert!(dir.list_ssts().unwrap().is_empty()); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 299e93eb746..5073304e17a 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -561,6 +561,7 @@ impl ServerCluster { Arc::clone(&importer), Some(store_meta), resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 0002f36d647..f5c64fa86e9 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -451,6 +451,7 @@ impl ServerCluster { Arc::clone(&importer), None, resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. 
diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 68403e226f8..92e73ca9f8f 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -27,6 +27,12 @@ use kvproto::{ WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, + metapb::RegionEpoch, +}; +use raftstore::{ + coprocessor::{RegionInfo, RegionInfoProvider}, + store::util::is_epoch_stale, + RegionInfoAccessor, }; use raftstore_v2::StoreMeta; use resource_control::{with_resource_limiter, ResourceGroupManager}; @@ -39,7 +45,7 @@ use tikv_kv::{ }; use tikv_util::{ config::ReadableSize, - future::create_stream_with_buffer, + future::{create_stream_with_buffer, paired_future_callback}, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, HandyRwLock, @@ -124,6 +130,7 @@ pub struct ImportSstService { limiter: Limiter, task_slots: Arc>>, raft_entry_max_size: ReadableSize, + region_info_accessor: Arc, writer: raft_writer::ThrottledTlsEngineWriter, @@ -318,6 +325,7 @@ impl ImportSstService { importer: Arc, store_meta: Option>>>, resource_manager: Option>, + region_info_accessor: Arc, ) -> Self { let props = tikv_util::thread_group::current_properties(); let eng = Mutex::new(engine.clone()); @@ -365,6 +373,7 @@ impl ImportSstService { limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), raft_entry_max_size, + region_info_accessor, writer, store_meta, resource_manager, @@ -675,6 +684,45 @@ impl ImportSstService { } } +fn check_local_region_stale( + region_id: u64, + epoch: &RegionEpoch, + local_region_info: Option, +) -> Result<()> { + match local_region_info { + Some(local_region_info) => { + let local_region_epoch = local_region_info.region.region_epoch.unwrap(); + + // when local region epoch is stale, client can retry write later + if is_epoch_stale(&local_region_epoch, epoch) { + return Err(Error::RequestTooNew(format!( + "request region {} is ahead of local region, local epoch {:?}, request epoch {:?}, please retry write later", + 
region_id, local_region_epoch, epoch + ))); + } + // when local region epoch is ahead, client need to rescan region from PD to get + // latest region later + if is_epoch_stale(epoch, &local_region_epoch) { + return Err(Error::RequestTooOld(format!( + "request region {} is staler than local region, local epoch {:?}, request epoch {:?}", + region_id, local_region_epoch, epoch + ))); + } + + // not match means to rescan + Ok(()) + } + None => { + // when region not found, we can't tell whether it's stale or ahead, so we just + // return the safest case + Err(Error::RequestTooOld(format!( + "region {} is not found", + region_id + ))) + } + } +} + #[macro_export] macro_rules! impl_write { ($fn:ident, $req_ty:ident, $resp_ty:ident, $chunk_ty:ident, $writer_fn:ident) => { @@ -686,6 +734,7 @@ macro_rules! impl_write { ) { let import = self.importer.clone(); let tablets = self.tablets.clone(); + let region_info_accessor = self.region_info_accessor.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -694,8 +743,11 @@ macro_rules! impl_write { let label = stringify!($fn); let resource_manager = self.resource_manager.clone(); let handle_task = async move { - let res = async move { - let first_req = rx.try_next().await?; + let (res, rx) = async move { + let first_req = match rx.try_next().await { + Ok(r) => r, + Err(e) => return (Err(e), Some(rx)), + }; let (meta, resource_limiter) = match first_req { Some(r) => { let limiter = resource_manager.as_ref().and_then(|m| { @@ -708,18 +760,49 @@ macro_rules! 
impl_write { }); match r.chunk { Some($chunk_ty::Meta(m)) => (m, limiter), - _ => return Err(Error::InvalidChunk), + _ => return (Err(Error::InvalidChunk), Some(rx)), } } - _ => return Err(Error::InvalidChunk), + _ => return (Err(Error::InvalidChunk), Some(rx)), }; + // wait the region epoch on this TiKV to catch up with the epoch + // in request, which comes from PD and represents the majority + // peers' status. let region_id = meta.get_region_id(); + let (cb, f) = paired_future_callback(); + if let Err(e) = region_info_accessor + .find_region_by_id(region_id, cb) + .map_err(|e| { + // when region not found, we can't tell whether it's stale or ahead, so + // we just return the safest case + Error::RequestTooOld(format!( + "failed to find region {} err {:?}", + region_id, e + )) + }) + { + return (Err(e), Some(rx)); + }; + let res = match f.await { + Ok(r) => r, + Err(e) => return (Err(From::from(e)), Some(rx)), + }; + if let Err(e) = + check_local_region_stale(region_id, meta.get_region_epoch(), res) + { + return (Err(e), Some(rx)); + }; + let tablet = match tablets.get(region_id) { Some(t) => t, None => { - return Err(Error::Engine( - format!("region {} not found", region_id).into(), - )); + return ( + Err(Error::RequestTooOld(format!( + "region {} not found", + region_id + ))), + Some(rx), + ); } }; @@ -727,10 +810,10 @@ macro_rules! impl_write { Ok(w) => w, Err(e) => { error!("build writer failed {:?}", e); - return Err(Error::InvalidChunk); + return (Err(Error::InvalidChunk), Some(rx)); } }; - let (writer, resource_limiter) = rx + let result = rx .try_fold( (writer, resource_limiter), |(mut writer, limiter), req| async move { @@ -747,7 +830,11 @@ macro_rules! impl_write { .map(|w| (w, limiter)) }, ) - .await?; + .await; + let (writer, resource_limiter) = match result { + Ok(r) => r, + Err(e) => return (Err(e), None), + }; let finish_fn = async { let metas = writer.finish()?; @@ -756,13 +843,18 @@ macro_rules! 
impl_write { }; let metas: Result<_> = with_resource_limiter(finish_fn, resource_limiter).await; - let metas = metas?; + let metas = match metas { + Ok(r) => r, + Err(e) => return (Err(e), None), + }; let mut resp = $resp_ty::default(); resp.set_metas(metas.into()); - Ok(resp) + (Ok(resp), None) } .await; $crate::send_rpc_response!(res, sink, label, timer); + // don't drop rx before send response + _ = rx; }; self.threads.spawn(buf_driver); @@ -1392,14 +1484,19 @@ mod test { use engine_traits::{CF_DEFAULT, CF_WRITE}; use kvproto::{ kvrpcpb::Context, - metapb::RegionEpoch, + metapb::{Region, RegionEpoch}, raft_cmdpb::{RaftCmdRequest, Request}, }; - use protobuf::Message; + use protobuf::{Message, SingularPtrField}; + use raft::StateRole::Follower; + use raftstore::RegionInfo; use tikv_kv::{Modify, WriteData}; use txn_types::{Key, TimeStamp, Write, WriteBatchFlags, WriteType}; - use crate::{import::sst_service::RequestCollector, server::raftkv}; + use crate::{ + import::sst_service::{check_local_region_stale, RequestCollector}, + server::raftkv, + }; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1683,4 +1780,72 @@ mod test { } assert_eq!(total, 100); } + + #[test] + fn test_write_rpc_check_region_epoch() { + let mut req_epoch = RegionEpoch { + conf_ver: 10, + version: 10, + ..Default::default() + }; + // test for region not found + let result = check_local_region_stale(1, &req_epoch, None); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + let mut local_region_info = RegionInfo { + region: Region { + id: 1, + region_epoch: SingularPtrField::some(req_epoch.clone()), + ..Default::default() + }, + role: Follower, + buckets: 1, + }; + // test the local region epoch is same as request + let result 
= check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + result.unwrap(); + + // test the local region epoch is ahead of request + local_region_info + .region + .region_epoch + .as_mut() + .unwrap() + .conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + req_epoch.conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + result.unwrap(); + + // test the local region epoch is staler than request + req_epoch.version = 12; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info)); + assert!(result.is_err()); + // check error message contains "retry write later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("retry write later") + ); + } } From 629a0a9a6d4f95f02dbd05e75f0c182afe06acfb Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 31 Oct 2023 20:59:06 +0800 Subject: [PATCH 099/220] txn: Fix to the prewrite requests retry problem by using TxnStatusCache (#15658) (#15871) ref tikv/tikv#11187 Signed-off-by: MyonKeminta Co-authored-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Co-authored-by: MyonKeminta --- components/test_raftstore-v2/src/util.rs | 2 + components/test_raftstore/src/util.rs | 106 +- components/tikv_util/src/lru.rs | 199 +++- src/storage/config.rs | 10 + src/storage/metrics.rs | 15 + src/storage/mod.rs | 505 +++++++++ src/storage/mvcc/metrics.rs | 16 + .../txn/commands/acquire_pessimistic_lock.rs | 1 + .../acquire_pessimistic_lock_resumed.rs | 3 + src/storage/txn/commands/atomic_store.rs | 6 +- .../txn/commands/check_secondary_locks.rs | 14 +- src/storage/txn/commands/check_txn_status.rs | 10 + 
src/storage/txn/commands/cleanup.rs | 1 + src/storage/txn/commands/commit.rs | 1 + src/storage/txn/commands/compare_and_swap.rs | 8 +- .../txn/commands/flashback_to_version.rs | 1 + src/storage/txn/commands/mod.rs | 12 +- src/storage/txn/commands/pause.rs | 1 + .../txn/commands/pessimistic_rollback.rs | 3 + src/storage/txn/commands/prewrite.rs | 48 +- src/storage/txn/commands/resolve_lock.rs | 10 +- src/storage/txn/commands/resolve_lock_lite.rs | 6 + src/storage/txn/commands/rollback.rs | 1 + src/storage/txn/commands/txn_heart_beat.rs | 8 +- src/storage/txn/mod.rs | 1 + src/storage/txn/scheduler.rs | 26 + src/storage/txn/txn_status_cache.rs | 980 ++++++++++++++++++ tests/failpoints/cases/test_kv_service.rs | 113 +- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 30 files changed, 2075 insertions(+), 34 deletions(-) create mode 100644 src/storage/txn/txn_status_cache.rs diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index af2bab26183..315150e29c2 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -544,6 +544,7 @@ impl PeerClient { &self.cli, self.ctx.clone(), muts, + vec![], pk, ts, 0, @@ -557,6 +558,7 @@ impl PeerClient { &self.cli, self.ctx.clone(), muts, + vec![], pk, ts, 0, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index e88df1fb0ca..ff47525ea37 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -958,6 +958,7 @@ pub fn must_kv_prewrite_with( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -967,7 +968,7 @@ pub fn must_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; + prewrite_req.pessimistic_actions = 
pessimistic_actions; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -994,6 +995,7 @@ pub fn try_kv_prewrite_with( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -1004,6 +1006,7 @@ pub fn try_kv_prewrite_with( client, ctx, muts, + pessimistic_actions, pk, ts, for_update_ts, @@ -1017,6 +1020,7 @@ pub fn try_kv_prewrite_with_impl( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -1026,7 +1030,7 @@ pub fn try_kv_prewrite_with_impl( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; + prewrite_req.pessimistic_actions = pessimistic_actions; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -1046,7 +1050,7 @@ pub fn try_kv_prewrite( pk: Vec, ts: u64, ) -> PrewriteResponse { - try_kv_prewrite_with(client, ctx, muts, pk, ts, 0, false, false) + try_kv_prewrite_with(client, ctx, muts, vec![], pk, ts, 0, false, false) } pub fn try_kv_prewrite_pessimistic( @@ -1056,7 +1060,18 @@ pub fn try_kv_prewrite_pessimistic( pk: Vec, ts: u64, ) -> PrewriteResponse { - try_kv_prewrite_with(client, ctx, muts, pk, ts, ts, false, false) + let len = muts.len(); + try_kv_prewrite_with( + client, + ctx, + muts, + vec![DoPessimisticCheck; len], + pk, + ts, + ts, + false, + false, + ) } pub fn must_kv_prewrite( @@ -1066,7 +1081,7 @@ pub fn must_kv_prewrite( pk: Vec, ts: u64, ) { - must_kv_prewrite_with(client, ctx, muts, pk, ts, 0, false, false) + must_kv_prewrite_with(client, ctx, muts, vec![], pk, ts, 0, false, false) } pub fn must_kv_prewrite_pessimistic( @@ -1076,7 +1091,18 @@ pub fn must_kv_prewrite_pessimistic( pk: Vec, ts: u64, ) { - must_kv_prewrite_with(client, ctx, muts, pk, ts, ts, false, false) + let len = muts.len(); + 
must_kv_prewrite_with( + client, + ctx, + muts, + vec![DoPessimisticCheck; len], + pk, + ts, + ts, + false, + false, + ) } pub fn must_kv_commit( @@ -1232,6 +1258,50 @@ pub fn must_check_txn_status( resp } +pub fn must_kv_have_locks( + client: &TikvClient, + ctx: Context, + ts: u64, + start_key: &[u8], + end_key: &[u8], + expected_locks: &[( + // key + &[u8], + Op, + // start_ts + u64, + // for_update_ts + u64, + )], +) { + let mut req = ScanLockRequest::default(); + req.set_context(ctx); + req.set_limit(100); + req.set_start_key(start_key.to_vec()); + req.set_end_key(end_key.to_vec()); + req.set_max_version(ts); + let resp = client.kv_scan_lock(&req).unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(resp.error.is_none(), "{:?}", resp.get_error()); + + assert_eq!( + resp.locks.len(), + expected_locks.len(), + "lock count not match, expected: {:?}; got: {:?}", + expected_locks, + resp.locks + ); + + for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in + resp.locks.into_iter().zip(expected_locks.iter()) + { + assert_eq!(lock_info.get_key(), *expected_key); + assert_eq!(lock_info.get_lock_type(), *expected_op); + assert_eq!(lock_info.get_lock_version(), *expected_start_ts); + assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); + } +} + pub fn get_tso(pd_client: &TestPdClient) -> u64 { block_on(pd_client.get_tso()).unwrap().into_inner() } @@ -1440,11 +1510,31 @@ impl PeerClient { } pub fn must_kv_prewrite_async_commit(&self, muts: Vec, pk: Vec, ts: u64) { - must_kv_prewrite_with(&self.cli, self.ctx.clone(), muts, pk, ts, 0, true, false) + must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + true, + false, + ) } pub fn must_kv_prewrite_one_pc(&self, muts: Vec, pk: Vec, ts: u64) { - must_kv_prewrite_with(&self.cli, self.ctx.clone(), muts, pk, ts, 0, false, true) + must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + 
vec![], + pk, + ts, + 0, + false, + true, + ) } pub fn must_kv_commit(&self, keys: Vec>, start_ts: u64, commit_ts: u64) { diff --git a/components/tikv_util/src/lru.rs b/components/tikv_util/src/lru.rs index 76fad6e8a34..302bfc9264b 100644 --- a/components/tikv_util/src/lru.rs +++ b/components/tikv_util/src/lru.rs @@ -135,6 +135,10 @@ impl Trace { r.key.as_ptr().read() } } + + fn get_tail(&self) -> &K { + unsafe { self.tail.as_ref().prev.as_ref().key.assume_init_ref() } + } } impl Drop for Trace { @@ -174,14 +178,52 @@ impl SizePolicy for CountTracker { } } -pub struct LruCache +/// Some [`EvictPolicy`] (e.g. the `TxnStatusCache` in +/// `tikv::storage::txn::txn_status_cache` module) may need to know what the +/// entry being popped out is to determine if it really can be popped. But there +/// is a performance cost to always get the tail entry. So we pass this interface +/// to the `should_evict` function. An implementation of `EvictPolicy` can read +/// the tail entry only when it really needs to. +pub trait GetTailEntry { + fn get_tail_entry(&self) -> Option<(&K, &V)>; +} + +/// An [`EvictPolicy`] defines how the [`LruCache`] should determine whether an entry +/// at the tail should be popped out. +pub trait EvictPolicy { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + get_tail_entry: &impl GetTailEntry, + ) -> bool; +} + +/// The default [`EvictPolicy`] of [`LruCache`], which pops out entries at the +/// tail when the limit specified by `capacity` is exceeded. 
+pub struct EvictOnFull; + +impl EvictPolicy for EvictOnFull { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + _: &impl GetTailEntry, + ) -> bool { + capacity < current_size + } +} + +pub struct LruCache where T: SizePolicy, + E: EvictPolicy, { map: HashMap>, trace: Trace, capacity: usize, size_policy: T, + evict_policy: E, } impl LruCache @@ -189,18 +231,30 @@ where T: SizePolicy, { pub fn with_capacity_sample_and_trace( - mut capacity: usize, + capacity: usize, sample_mask: usize, size_policy: T, ) -> LruCache { + Self::new(capacity, sample_mask, size_policy, EvictOnFull) + } +} + +impl LruCache +where + T: SizePolicy, + E: EvictPolicy, +{ + pub fn new(mut capacity: usize, sample_mask: usize, size_policy: T, evict_policy: E) -> Self { + // The capacity is at least 1. if capacity == 0 { capacity = 1; } - LruCache { + Self { map: HashMap::default(), trace: Trace::new(sample_mask), capacity, size_policy, + evict_policy, } } @@ -215,10 +269,18 @@ where self.trace.clear(); self.size_policy.on_reset(0); } + + /// Get the capacity limited on the `LruCache`. #[inline] pub fn capacity(&self) -> usize { self.capacity } + + /// Get the capacity actually allocated by the internal data structure. + #[inline] + pub fn internal_allocated_capacity(&self) -> usize { + self.map.capacity() + } } impl LruCache @@ -234,25 +296,36 @@ where } } -impl LruCache +impl LruCache where K: Eq + Hash + Clone + std::fmt::Debug, T: SizePolicy, + E: EvictPolicy, { #[inline] - pub fn insert(&mut self, key: K, value: V) { + fn insert_impl(&mut self, key: K, value: V, replace: bool) -> bool { + let mut inserted = true; let mut old_key = None; let current_size = SizePolicy::::current(&self.size_policy); + // In case the current size exactly equals to capacity, we also expect to reuse + // tail when inserting. Use `current_size + 1` to include the case. 
+ let should_evict_on_insert = + self.evict_policy + .should_evict(current_size + 1, self.capacity, self); match self.map.entry(key) { HashMapEntry::Occupied(mut e) => { - self.size_policy.on_remove(e.key(), &e.get().value); - self.size_policy.on_insert(e.key(), &value); - let mut entry = e.get_mut(); - self.trace.promote(entry.record); - entry.value = value; + if replace { + self.size_policy.on_remove(e.key(), &e.get().value); + self.size_policy.on_insert(e.key(), &value); + let mut entry = e.get_mut(); + self.trace.promote(entry.record); + entry.value = value; + } else { + inserted = false; + } } HashMapEntry::Vacant(v) => { - let record = if self.capacity <= current_size { + let record = if should_evict_on_insert { let res = self.trace.reuse_tail(v.key().clone()); old_key = Some(res.0); res.1 @@ -274,7 +347,8 @@ where // Perhaps we can reject entries larger than capacity goes in the LRU cache, but // that is impossible for now: the `SizePolicy` trait doesn't provide the // interface of querying the actual size of an item. - self.evict_until_fit() + self.evict_until_fit(); + inserted } fn evict_until_fit(&mut self) { @@ -283,7 +357,7 @@ where let current_size = self.size_policy.current(); // Should we keep at least one entry? So our users won't lose their fresh record // once it exceeds the capacity. - if current_size <= cap || self.map.is_empty() { + if !self.evict_policy.should_evict(current_size, cap, self) || self.map.is_empty() { break; } let key = self.trace.remove_tail(); @@ -292,6 +366,18 @@ where } } + #[inline] + pub fn insert(&mut self, key: K, value: V) { + self.insert_impl(key, value, true); + } + + /// Insert an entry if the key doesn't exist before. The existing entry + /// won't be replaced and won't be promoted to the most-recent place. 
+ #[inline] + pub fn insert_if_not_exist(&mut self, key: K, value: V) -> bool { + self.insert_impl(key, value, false) + } + #[inline] pub fn remove(&mut self, key: &K) -> Option { if let Some(v) = self.map.remove(key) { @@ -313,6 +399,12 @@ where } } + /// Get an item by key without promoting the item. + #[inline] + pub fn get_no_promote(&self, key: &K) -> Option<&V> { + self.map.get(key).map(|v| &v.value) + } + #[inline] pub fn get_mut(&mut self, key: &K) -> Option<&mut V> { match self.map.get_mut(key) { @@ -355,17 +447,37 @@ where } } -unsafe impl Send for LruCache +impl GetTailEntry for LruCache +where + K: Eq + Hash + Clone + std::fmt::Debug, + T: SizePolicy, + E: EvictPolicy, +{ + fn get_tail_entry(&self) -> Option<(&K, &V)> { + if self.is_empty() { + return None; + } + + let k = self.trace.get_tail(); + self.map + .get_key_value(k) + .map(|(k, entry)| (k, &entry.value)) + } +} + +unsafe impl Send for LruCache where K: Send, V: Send, T: Send + SizePolicy, + E: Send + EvictPolicy, { } -impl Drop for LruCache +impl Drop for LruCache where T: SizePolicy, + E: EvictPolicy, { fn drop(&mut self) { self.clear(); @@ -626,4 +738,61 @@ mod tests { assert!(cache.size() <= 42); } } + + #[test] + fn test_get_no_promote() { + let mut cache = LruCache::with_capacity_sample_and_trace(3, 0, CountTracker::default()); + cache.insert(1, 1); + cache.insert(2, 2); + cache.insert(3, 3); + assert_eq!(cache.size(), 3); + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + cache.insert(4, 4); + assert_eq!(cache.size(), 3); + // Key 1 is not promoted, so it's popped out first. + assert!(cache.get_no_promote(&1).is_none()); + // Other entries are not affected. 
+ assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + assert_eq!(*cache.get_no_promote(&4).unwrap(), 4); + } + + #[test] + fn test_insert_if_not_exist() { + let mut cache = LruCache::with_capacity_sample_and_trace(4, 0, CountTracker::default()); + assert!(cache.insert_if_not_exist(1, 1)); + assert!(cache.insert_if_not_exist(2, 2)); + assert!(cache.insert_if_not_exist(3, 3)); + assert_eq!(cache.size(), 3); + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + + assert!(!cache.insert_if_not_exist(1, 11)); + // Not updated. + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + + assert!(cache.insert_if_not_exist(4, 4)); + assert!(!cache.insert_if_not_exist(2, 22)); + // Not updated. + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + + assert_eq!(cache.size(), 4); + assert!(cache.insert_if_not_exist(5, 5)); + assert_eq!(cache.size(), 4); + // key 1 is not promoted, so it's first popped out. + assert!(cache.get_no_promote(&1).is_none()); + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + + assert!(cache.insert_if_not_exist(6, 6)); + assert_eq!(cache.size(), 4); + // key 2 is not promoted either, so it's first popped out. + assert!(cache.get_no_promote(&2).is_none()); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + + assert!(cache.insert_if_not_exist(7, 7)); + assert_eq!(cache.size(), 4); + assert!(cache.get_no_promote(&3).is_none()); + assert_eq!(*cache.get_no_promote(&4).unwrap(), 4); + } } diff --git a/src/storage/config.rs b/src/storage/config.rs index a40db2c424b..91c98ebf57b 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -31,6 +31,13 @@ const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; +// In tests, we've observed 1.2M entries in the TxnStatusCache. 
We +// conservatively set the limit to 5M entries in total. +// As TxnStatusCache have 128 slots by default. We round it to 5.12M. +// This consumes at most around 300MB memory theoretically, but usually it's +// much less as it's hard to see the capacity being used up. +const DEFAULT_TXN_STATUS_CACHE_CAPACITY: usize = 40_000 * 128; + // Block cache capacity used when TikvConfig isn't validated. It should only // occur in tests. const FALLBACK_BLOCK_CACHE_CAPACITY: ReadableSize = ReadableSize::mb(128); @@ -76,6 +83,8 @@ pub struct Config { pub background_error_recovery_window: ReadableDuration, /// Interval to check TTL for all SSTs, pub ttl_check_poll_interval: ReadableDuration, + #[online_config(skip)] + pub txn_status_cache_capacity: usize, #[online_config(submodule)] pub flow_control: FlowControlConfig, #[online_config(submodule)] @@ -105,6 +114,7 @@ impl Default for Config { api_version: 1, enable_ttl: false, ttl_check_poll_interval: ReadableDuration::hours(12), + txn_status_cache_capacity: DEFAULT_TXN_STATUS_CACHE_CAPACITY, flow_control: FlowControlConfig::default(), block_cache: BlockCacheConfig::default(), io_rate_limit: IoRateLimitConfig::default(), diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e9477b56b0f..cf7956d76b7 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -370,6 +370,13 @@ make_static_metric! { keys, }, } + + pub struct TxnStatusCacheSizeGauge: IntGauge { + "type" => { + used, + allocated, + } + } } lazy_static! { @@ -601,4 +608,12 @@ lazy_static! 
{ exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); + + pub static ref SCHED_TXN_STATUS_CACHE_SIZE: TxnStatusCacheSizeGauge = register_static_int_gauge_vec!( + TxnStatusCacheSizeGauge, + "tikv_scheduler_txn_status_cache_size", + "Statistics of size and capacity of txn status cache (represented in count of entries)", + &["type"] + ) + .unwrap(); } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index cb4057bfd7e..cc48d9e36e3 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3853,6 +3853,7 @@ mod tests { commands, commands::{AcquirePessimisticLock, Prewrite}, tests::must_rollback, + txn_status_cache::TxnStatusCache, Error as TxnError, ErrorInner as TxnErrorInner, }, types::{PessimisticLockKeyResult, PessimisticLockResults}, @@ -3884,6 +3885,7 @@ mod tests { statistics: &mut Statistics::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -10869,4 +10871,507 @@ mod tests { // Prewrite still succeeds rx.recv().unwrap().unwrap(); } + + #[test] + fn test_prewrite_cached_committed_transaction_do_not_skip_constraint_check() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let cm = storage.concurrency_manager.clone(); + let k1 = Key::from_raw(b"k1"); + let pk = b"pk"; + // Simulate the case that the current TiKV instance have a non-unique + // index key of a pessimistic transaction. It won't be pessimistic + // locked, and prewrite skips constraint checks. + // Simulate the case that a prewrite is performed twice, with async + // commit enabled, and max_ts changes when the second request arrives. + + // A retrying prewrite request arrives. 
+ cm.update_max_ts(20.into()); + let mut ctx = Context::default(); + ctx.set_is_retry_request(true); + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + SkipPessimisticCheck, + )], + pk.to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + ctx, + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.min_commit_ts, 21.into()); + + // Commit it. + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::Commit::new(vec![k1.clone()], 10.into(), 21.into(), Context::default()), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // The txn's status is cached + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 21.into() + ); + + // Check committed; push max_ts to 30 + assert_eq!( + block_on(storage.get(Context::default(), k1.clone(), 30.into())) + .unwrap() + .0, + Some(b"v".to_vec()) + ); + + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + SkipPessimisticCheck, + )], + pk.to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + Context::default(), + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.min_commit_ts, 21.into()); + + // Key must not be locked. 
+ assert_eq!( + block_on(storage.get(Context::default(), k1, 50.into())) + .unwrap() + .0, + Some(b"v".to_vec()) + ); + } + + #[test] + fn test_updating_txn_status_cache() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let cm = storage.concurrency_manager.clone(); + + // Commit + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), + SkipPessimisticCheck, + )], + b"k1".to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .is_none() + ); + + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k1")], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 20.into() + ); + + // Unsuccessful commit won't update cache + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k2")], + 30.into(), + 40.into(), + Context::default(), + ), + expect_fail_callback(tx, 0, |_| ()), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(30.into()) + .is_none() + ); + + // 1PC update + let (tx, rx) = channel(); + cm.update_max_ts(59.into()); + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k3"), b"v3".to_vec())], + b"k3".to_vec(), + 50.into(), + 3000, + false, + 1, + 51.into(), + 0.into(), + Some(vec![]), + true, + AssertionLevel::Off, + Context::default(), + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + let 
res = rx.recv().unwrap().unwrap(); + assert_eq!(res.one_pc_commit_ts, 60.into()); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(50.into()) + .unwrap(), + 60.into() + ); + + // Resolve lock commit + let (tx, rx) = channel(); + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k4"), b"v4".to_vec())], + b"pk".to_vec(), + 70.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + storage + .sched_txn_command( + commands::ResolveLockReadPhase::new( + vec![(TimeStamp::from(70), TimeStamp::from(80))] + .into_iter() + .collect(), + None, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(70.into()) + .unwrap(), + 80.into() + ); + + // Resolve lock lite + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k5"), b"v5".to_vec())], + b"pk".to_vec(), + 90.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + storage + .sched_txn_command( + commands::ResolveLockLite::new( + 90.into(), + 100.into(), + vec![Key::from_raw(b"k5")], + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(90.into()) + .unwrap(), + 100.into() + ); + + // CheckTxnStatus: uncommitted transaction + storage + .sched_txn_command( + commands::CheckTxnStatus::new( + Key::from_raw(b"k1"), + 9.into(), + 110.into(), + 110.into(), + true, + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + 
assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(9.into()) + .is_none() + ); + + // CheckTxnStatus: committed transaction + storage.sched.get_txn_status_cache().remove(10.into()); + storage + .sched_txn_command( + commands::CheckTxnStatus::new( + Key::from_raw(b"k1"), + 10.into(), + 110.into(), + 110.into(), + true, + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 20.into() + ); + + // CheckSecondaryLocks: uncommitted transaction + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k6"), b"v6".to_vec())], + b"pk".to_vec(), + 120.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock exists but the transaction status is still unknown + storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k6")], + 120.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(120.into()) + .is_none() + ); + + // One of the lock doesn't exist so the transaction becomes rolled-back status. 
+ storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k6"), Key::from_raw(b"k7")], + 120.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(120.into()) + .is_none() + ); + + // CheckSecondaryLocks: committed transaction + storage + .sched_txn_command( + Prewrite::new( + vec![ + Mutation::make_put(Key::from_raw(b"k8"), b"v8".to_vec()), + Mutation::make_put(Key::from_raw(b"k9"), b"v9".to_vec()), + ], + b"pk".to_vec(), + 130.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + // Commit one of the key + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k9")], + 130.into(), + 140.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .remove(130.into()) + .unwrap(), + 140.into() + ); + + storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k8"), Key::from_raw(b"k9")], + 130.into(), + Context::default(), + ), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(130.into()) + .unwrap(), + 140.into() + ); + } } diff --git a/src/storage/mvcc/metrics.rs b/src/storage/mvcc/metrics.rs index 3c4bda63f7e..22d2760a769 100644 --- a/src/storage/mvcc/metrics.rs +++ b/src/storage/mvcc/metrics.rs @@ -51,6 +51,13 @@ make_static_metric! { pub struct MvccPrewriteAssertionPerfCounterVec: IntCounter { "type" => MvccPrewriteAssertionPerfKind, } + + pub struct MvccPrewriteRequestAfterCommitCounterVec: IntCounter { + "type" => { + non_retry_req, + retry_req, + }, + } } lazy_static! 
{ @@ -104,4 +111,13 @@ lazy_static! { ) .unwrap() }; + pub static ref MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC: MvccPrewriteRequestAfterCommitCounterVec = { + register_static_int_counter_vec!( + MvccPrewriteRequestAfterCommitCounterVec, + "tikv_storage_mvcc_prewrite_request_after_commit_counter", + "Counter of prewrite requests of already-committed transactions that are determined by checking TxnStatusCache", + &["type"] + ) + .unwrap() + }; } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 58c33706bbc..ceb7957c926 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -183,6 +183,7 @@ impl WriteCommand for AcquirePessimisticLock new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs index 7640edd7c0c..a1e2e6fc119 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -194,6 +194,7 @@ impl WriteCommand for AcquirePessimisticLockR new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, + known_txn_status: vec![], }) } } @@ -239,6 +240,7 @@ mod tests { txn::{ commands::pessimistic_rollback::tests::must_success as must_pessimistic_rollback, tests::{must_commit, must_pessimistic_locked, must_prewrite_put, must_rollback}, + txn_status_cache::TxnStatusCache, }, TestEngineBuilder, }; @@ -275,6 +277,7 @@ mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 9a54895e7e2..4bca5d514c5 100---
a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -63,6 +63,7 @@ impl WriteCommand for RawAtomicStore { new_acquired_locks: vec![], lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -77,7 +78,9 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Statistics, TestEngineBuilder, + lock_manager::MockLockManager, + txn::{scheduler::get_raw_ext, txn_status_cache::TxnStatusCache}, + Statistics, TestEngineBuilder, }; #[test] @@ -116,6 +119,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 92985c4d90d..ceb169f79b2 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -201,6 +201,12 @@ impl WriteCommand for CheckSecondaryLocks { } } + let write_result_known_txn_status = + if let SecondaryLocksStatus::Committed(commit_ts) = &result { + vec![(self.start_ts, *commit_ts)] + } else { + vec![] + }; let mut rows = 0; if let SecondaryLocksStatus::RolledBack = &result { // One row is mutated only when a secondary lock is rolled back. 
@@ -220,6 +226,7 @@ impl WriteCommand for CheckSecondaryLocks { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: write_result_known_txn_status, }) } } @@ -235,7 +242,10 @@ pub mod tests { kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, + }, Engine, }; @@ -265,6 +275,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -303,6 +314,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index dc99ebf3b01..9e9a6cc0895 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -131,6 +131,12 @@ impl WriteCommand for CheckTxnStatus { let mut released_locks = ReleasedLocks::new(); released_locks.push(released); + let write_result_known_txn_status = if let TxnStatus::Committed { commit_ts } = &txn_status + { + vec![(self.lock_ts, *commit_ts)] + } else { + vec![] + }; let pr = ProcessResult::TxnStatus { txn_status }; let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); @@ -145,6 +151,7 @@ impl WriteCommand for CheckTxnStatus { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: write_result_known_txn_status, }) } } @@ -168,6 +175,7 @@ pub mod tests { commands::{pessimistic_rollback, WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + 
txn_status_cache::TxnStatusCache, }, types::TxnStatus, ProcessResult, TestEngineBuilder, @@ -211,6 +219,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -259,6 +268,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .map(|r| { diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index 302c4fe1308..886094a7f34 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -80,6 +80,7 @@ impl WriteCommand for Cleanup { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 4f05df8fe83..8daff9b2aee 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -80,6 +80,7 @@ impl WriteCommand for Commit { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![(self.lock_ts, self.commit_ts)], }) } } diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index ca9213b57d3..3725de47273 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -117,6 +117,7 @@ impl WriteCommand for RawCompareAndSwap { new_acquired_locks: vec![], lock_guards, response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -134,8 +135,9 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Engine, Statistics, - TestEngineBuilder, + lock_manager::MockLockManager, + txn::{scheduler::get_raw_ext, txn_status_cache::TxnStatusCache}, + Engine, Statistics, TestEngineBuilder, }; #[test] @@ 
-215,6 +217,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; match ret.pr { @@ -269,6 +272,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 37d288fa266..efbeefa2494 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -185,6 +185,7 @@ impl WriteCommand for FlashbackToVersion { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 5896d6562f1..dabef707e61 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -70,7 +70,7 @@ use crate::storage::{ }, metrics, mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, - txn::{latch, ProcessResult, Result}, + txn::{latch, txn_status_cache::TxnStatusCache, ProcessResult, Result}, types::{ MvccInfo, PessimisticLockParameters, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, StorageCallbackType, TxnStatus, @@ -422,6 +422,12 @@ pub struct WriteResult { pub new_acquired_locks: Vec, pub lock_guards: Vec, pub response_policy: ResponsePolicy, + /// The txn status that can be inferred by the successful writing. This will + /// be used to update the cache. + /// + /// Currently only commit_ts of committed transactions will be collected. + /// Rolled-back transactions may also be collected in the future. 
+ pub known_txn_status: Vec<(TimeStamp, TimeStamp)>, } pub struct WriteResultLockInfo { @@ -573,6 +579,7 @@ pub struct WriteContext<'a, L: LockManager> { pub statistics: &'a mut Statistics, pub async_apply_prewrite: bool, pub raw_ext: Option, // use for apiv2 + pub txn_status_cache: &'a TxnStatusCache, } pub struct ReaderWithStats<'a, S: Snapshot> { @@ -823,6 +830,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; let res = match ret.pr { @@ -983,6 +991,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; @@ -1008,6 +1017,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 5d3aa7f6d2f..1f5d40b2d4e 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -53,6 +53,7 @@ impl WriteCommand for Pause { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index 4e0bf8c8c56..531eb256c40 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -96,6 +96,7 @@ impl WriteCommand for PessimisticRollback { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -116,6 +117,7 @@ pub mod tests { commands::{WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, }, TestEngineBuilder, }; @@ 
-146,6 +148,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let result = command.process_write(snapshot, write_context).unwrap(); write(engine, &ctx, result.to_be_write.modifies); diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 10446db6292..34c98dab156 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -24,7 +24,7 @@ use crate::storage::{ kv::WriteData, lock_manager::LockManager, mvcc::{ - has_data_in_range, Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, + has_data_in_range, metrics::*, Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, TxnCommitRecord, }, txn::{ @@ -489,6 +489,36 @@ impl Prewriter { snapshot: impl Snapshot, mut context: WriteContext<'_, impl LockManager>, ) -> Result { + // Handle special cases about retried prewrite requests for pessimistic + // transactions. + if let TransactionKind::Pessimistic(_) = self.kind.txn_kind() { + if let Some(commit_ts) = context.txn_status_cache.get_no_promote(self.start_ts) { + fail_point!("before_prewrite_txn_status_cache_hit"); + if self.ctx.is_retry_request { + MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC + .retry_req + .inc(); + } else { + MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC + .non_retry_req + .inc(); + } + warn!("prewrite request received due to transaction is known to be already committed"; "start_ts" => %self.start_ts, "commit_ts" => %commit_ts); + // In normal cases if the transaction is committed, then the key should have + // been already prewritten successfully. But in order to + // simplify code as well as prevent possible corner cases or + // special cases in the future, we disallow skipping constraint + // check in this case. 
+ // We regard this request as a retried request no matter if it really is (the + // original request may arrive later than retried request due to + // network latency, in which case we'd better handle it like a + // retried request). + self.ctx.is_retry_request = true; + } else { + fail_point!("before_prewrite_txn_status_cache_miss"); + } + } + self.kind .can_skip_constraint_check(&mut self.mutations, &snapshot, &mut context)?; self.check_max_ts_synced(&snapshot)?; @@ -748,6 +778,11 @@ impl Prewriter { new_acquired_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, + known_txn_status: if !one_pc_commit_ts.is_zero() { + vec![(self.start_ts, one_pc_commit_ts)] + } else { + vec![] + }, } } else { // Skip write stage if some keys are locked. @@ -768,6 +803,7 @@ impl Prewriter { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], } }; @@ -1002,6 +1038,7 @@ mod tests { must_acquire_pessimistic_lock, must_acquire_pessimistic_lock_err, must_commit, must_prewrite_put_err_impl, must_prewrite_put_impl, must_rollback, }, + txn_status_cache::TxnStatusCache, Error, ErrorInner, }, types::TxnStatus, @@ -1647,6 +1684,7 @@ mod tests { statistics: &mut Statistics::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), } }; } @@ -1818,6 +1856,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: case.async_apply_prewrite, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let mut engine = TestEngineBuilder::new().build().unwrap(); let snap = engine.snapshot(Default::default()).unwrap(); @@ -1932,6 +1971,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -1960,6 +2000,7 @@ mod tests { statistics: &mut 
statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2043,6 +2084,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2075,6 +2117,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2345,6 +2388,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2369,6 +2413,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2575,6 +2620,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let res = prewrite_cmd.cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index f3d141807e8..cd01fc60475 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -83,6 +83,7 @@ impl WriteCommand for ResolveLock { let mut scan_key = self.scan_key.take(); let rows = key_locks.len(); 
let mut released_locks = ReleasedLocks::new(); + let mut known_txn_status = vec![]; for (current_key, current_lock) in key_locks { txn.start_ts = current_lock.ts; reader.start_ts = current_lock.ts; @@ -103,7 +104,10 @@ impl WriteCommand for ResolveLock { // type. They could be left if the transaction is finally committed and // pessimistic conflict retry happens during execution. match commit(&mut txn, &mut reader, current_key.clone(), commit_ts) { - Ok(res) => res, + Ok(res) => { + known_txn_status.push((current_lock.ts, commit_ts)); + res + } Err(MvccError(box MvccErrorInner::TxnLockNotFound { .. })) if current_lock.is_pessimistic_lock() => { @@ -125,6 +129,9 @@ impl WriteCommand for ResolveLock { } } + known_txn_status.sort(); + known_txn_status.dedup(); + let pr = if scan_key.is_none() { ProcessResult::Res } else { @@ -151,6 +158,7 @@ impl WriteCommand for ResolveLock { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status, }) } } diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index 63fe201596d..318e5d57313 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -63,6 +63,11 @@ impl WriteCommand for ResolveLockLite { }); } + let known_txn_status = if !self.commit_ts.is_zero() { + vec![(self.start_ts, self.commit_ts)] + } else { + vec![] + }; let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -76,6 +81,7 @@ impl WriteCommand for ResolveLockLite { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status, }) } } diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index f3b674f4916..df60767e716 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -71,6 +71,7 @@ 
impl WriteCommand for Rollback { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 448395fc436..c900464099a 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -96,6 +96,7 @@ impl WriteCommand for TxnHeartBeat { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -111,7 +112,10 @@ pub mod tests { kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, + }, Engine, }; @@ -143,6 +147,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -185,6 +190,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .is_err() diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 640c534fc86..8c30ae0a068 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -6,6 +6,7 @@ pub mod commands; pub mod flow_controller; pub mod sched_pool; pub mod scheduler; +pub mod txn_status_cache; mod actions; mod latch; diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 3c6a66c3941..36492f22701 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -83,6 +83,7 @@ use crate::{ flow_controller::FlowController, latch::{Latches, Lock}, sched_pool::{tls_collect_query, tls_collect_scan_details, SchedPool}, + txn_status_cache::TxnStatusCache, Error, ErrorInner, ProcessResult, }, 
types::StorageCallback, @@ -293,6 +294,8 @@ struct TxnSchedulerInner { quota_limiter: Arc, resource_manager: Option>, feature_gate: FeatureGate, + + txn_status_cache: TxnStatusCache, } #[inline] @@ -484,6 +487,7 @@ impl TxnScheduler { quota_limiter, resource_manager, feature_gate, + txn_status_cache: TxnStatusCache::new(config.txn_status_cache_capacity), }); slow_log!( @@ -815,6 +819,7 @@ impl TxnScheduler { pipelined: bool, async_apply_prewrite: bool, new_acquired_locks: Vec, + known_txn_status: Vec<(TimeStamp, TimeStamp)>, tag: CommandKind, metadata: TaskMetadata<'_>, sched_details: &SchedulerDetails, @@ -837,6 +842,17 @@ impl TxnScheduler { debug!("write command finished"; "cid" => cid, "pipelined" => pipelined, "async_apply_prewrite" => async_apply_prewrite); drop(lock_guards); + + if result.is_ok() && !known_txn_status.is_empty() { + // Update cache before calling the callback. + // Reversing the order can lead to test failures as the cache may still + // remain not updated after receiving signal from the callback. 
+ let now = std::time::SystemTime::now(); + for (start_ts, commit_ts) in known_txn_status { + self.inner.txn_status_cache.insert(start_ts, commit_ts, now); + } + } + let tctx = self.inner.dequeue_task_context(cid); let mut do_wake_up = !tctx.woken_up_resumable_lock_requests.is_empty(); @@ -1258,6 +1274,7 @@ impl TxnScheduler { statistics: &mut sched_details.stat, async_apply_prewrite: self.inner.enable_async_apply_prewrite, raw_ext, + txn_status_cache: &self.inner.txn_status_cache, }; let begin_instant = Instant::now(); let res = unsafe { @@ -1328,6 +1345,7 @@ impl TxnScheduler { new_acquired_locks, lock_guards, response_policy, + known_txn_status, } = match deadline .check() .map_err(StorageError::from) @@ -1406,6 +1424,7 @@ impl TxnScheduler { false, false, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1441,6 +1460,7 @@ impl TxnScheduler { false, false, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1636,6 +1656,7 @@ impl TxnScheduler { pipelined, is_async_apply_prewrite, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1879,6 +1900,11 @@ impl TxnScheduler { .push_lock_wait(entry, Default::default()); } } + + #[cfg(test)] + pub fn get_txn_status_cache(&self) -> &TxnStatusCache { + &self.inner.txn_status_cache + } } pub async fn get_raw_ext( diff --git a/src/storage/txn/txn_status_cache.rs b/src/storage/txn/txn_status_cache.rs new file mode 100644 index 00000000000..c9b231c60ec --- /dev/null +++ b/src/storage/txn/txn_status_cache.rs @@ -0,0 +1,980 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements a cache for the status of recent finished +//! transactions. When a transaction is committed or rolled back, we store the +//! information in the cache for a while. Later, in some cases, one can find +//! the transaction status without accessing the physical storage. This helps +//! to quickly find out the transaction status in some cases. 
+//! +//! > **Note:** +//! > * Currently, only committed transactions are cached. We may also cache +//! > rolled-back transactions in the future. +//! > * Currently, the cache is only used to filter unnecessary stale prewrite +//! > requests. We will also consider use the cache for other purposes in the +//! > future. +//! +//! ## Why we need this? +//! +//! ### For filtering out unwanted late-arrived stale prewrite requests +//! +//! This solves a problem which has a complicated background. +//! +//! There's such an optimization in pessimistic transactions when TiKV runs +//! accompanied with TiDB: non-unique index keys don't need to be pessimistic- +//! locked, and WRITE CF don't need to be checked either when prewriting. The +//! correctness in case there's any kinds of conflicts will be protected by +//! the corresponding row key, as the index key is never written without +//! writing the corresponding row key. +//! +//! However, it's later found to be problematic, especially with async commit +//! and 1PC, as the prewrite requests on these index keys lost its idempotency. +//! You can see [this issue](https://github.com/tikv/tikv/issues/11187) to see +//! how it causes problems, including those that affects transaction +//! correctness. +//! +//! The problem happens when the prewrite request to the same index key is +//! sent more than once. Our first solution is to add a `is_retry_request` flag +//! to the second (or even more) requests, which is sent due to retrying from +//! the client side. But it's still imperfect, considering that it's +//! theoretically possible that the original request arrives to TiKV later than +//! the retried one. In fact, we once observed this happens in an environment +//! where the network is terribly unstable. +//! +//! Our second solution, additional to the previous one, is to use this cache. +//! Each committed transaction should be guaranteed to be kept in the cache for +//! 
[a long-enough time](CACHE_ITEMS_REQUIRED_KEEP_TIME). When a prewrite +//! request is received, it should check the cache before executing. If it finds +//! its belonging transaction is already committed, it won't skip constraint +//! check in WRITE CF. Note that if the index key is already committed but the +//! transaction info is not cached, then a late-arrived prewrite request cannot +//! be protected by this mechanism. This means we shouldn't miss any cacheable +//! transactions, and it is the reason why committed transactions should be +//! cached for *a long-enough time*. +//! +//! Unfortunately, the solution is still imperfect. As it's already known, it +//! may still be problematic due to the following reasons: +//! +//! 1. We don't have mechanism to refuse requests that have +//! past more than [CACHE_ITEMS_REQUIRED_KEEP_TIME] since they were sent. +//! 2. To prevent the cache from consuming too much more memory than expected, +//! we have a limit to the capacity (though the limit is very large), and it's +//! configurable (so the cache can be disabled, see how the `capacity` parameter +//! of function [TxnStatusCache::new] is used) as a way to escape from potential +//! faults. +//! 3. The cache can't be synced across different TiKV instances. +//! +//! The third case above needs detailed explanation to be clarified. This is +//! an example of the problem: +//! +//! 1. Client try to send prewrite request to TiKV A, who has the leader of the +//! region containing a index key. The request is not received by TiKV and the +//! client retries. +//! 2. The leader is transferred to TiKV B, and the retries prewrite request +//! is sent to it and processed successfully. +//! 3. The transaction is committed on TiKV B, not being known by TiKV A. +//! 4. The leader transferred back to TiKV A. +//! 5. The original request arrives to TiKV A and being executed. As the +//! status of the transaction is not in the cache in TiKV A, the prewrite +//! 
request will be handled in normal way, skipping constraint checks. +//! +//! As of the time when this module is written, the above remaining cases have +//! not yet been handled, considering the extremely low possibility to happen +//! and high complexity to fix. +//! +//! The perfect and most elegant way to fix all of these problem is never to +//! skip constraint checks or never skipping pessimistic locks for index keys. +//! Or to say, totally remove the optimization mentioned above on index keys. +//! But for historical reason, this may lead to significant performance +//! regression in existing clusters. +//! +//! ### For read data locked by large transactions more efficiently +//! +//! * Note: the `TxnStatusCache` is designed prepared for this usage, but not +//! used yet for now. +//! +//! Consider the case that a very-large transaction locked a lot of keys after +//! prewriting, while many simple reads and writes executes frequently, thus +//! these simple transactions frequently meets the lock left by the large +//! transaction. It will be very inefficient for these small transactions to +//! come back to the client and start resolve lock procedure. Even if the client +//! side has the cache of that transaction, it still wastes an RTT. +//! +//! There would be more possibilities if we have such a cache in TiKV side: for +//! read requests, it can check the cache to know whether it can read from the +//! lock; and for write requests, if it finds the transaction of that lock is +//! already committed, it can merge together the resolve-lock-committing and the +//! write operation that the request needs to perform. 
+ +use std::{ + sync::{atomic::AtomicU64, Arc}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; + +use crossbeam::utils::CachePadded; +use parking_lot::Mutex; +use tikv_util::{ + lru, + lru::{GetTailEntry, LruCache}, +}; +use txn_types::TimeStamp; + +use crate::storage::metrics::*; + +const TXN_STATUS_CACHE_SLOTS: usize = 128; + +/// An cache item should be kept for at least this time. +/// Actually this should be guaranteed only for committed transactions. See +/// [this section](# +/// for-filtering-out-unwanted-late-arrived-stale-prewrite-requests) for details +/// about why this is needed. +const CACHE_ITEMS_REQUIRED_KEEP_TIME: Duration = Duration::from_secs(30); + +struct CacheEntry { + commit_ts: TimeStamp, + /// The system timestamp in milliseconds when the entry is inserted to the + /// cache. + insert_time: u64, +} + +/// Defines the policy to evict expired entries from the cache. +/// [`TxnStatusCache`] needs to keep entries for a while, so the common +/// policy that only limiting capacity is not proper to be used here. +struct TxnStatusCacheEvictPolicy { + required_keep_time_millis: u64, + #[cfg(test)] + simulated_system_time: Option>, +} + +impl TxnStatusCacheEvictPolicy { + fn new( + required_keep_time: Duration, + #[allow(unused_variables)] simulated_system_time: Option>, + ) -> Self { + Self { + required_keep_time_millis: required_keep_time.as_millis() as u64, + #[cfg(test)] + simulated_system_time, + } + } + + #[inline] + #[cfg(not(test))] + fn now(&self) -> SystemTime { + SystemTime::now() + } + + /// When used in tests, the system time can be simulated by controlling the + /// field `simulated_system_time`. + #[inline] + #[cfg(test)] + fn now(&self) -> SystemTime { + // Always get the system time to simulate the latency. 
+ let now = SystemTime::now(); + if let Some(pseudo_system_time) = &self.simulated_system_time { + UNIX_EPOCH + + std::time::Duration::from_millis( + pseudo_system_time.load(std::sync::atomic::Ordering::Acquire), + ) + } else { + now + } + } +} + +impl lru::EvictPolicy for TxnStatusCacheEvictPolicy { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + get_tail_entry: &impl GetTailEntry, + ) -> bool { + // See how much time has been elapsed since the tail entry is inserted. + // If it's long enough, remove it. + if let Some((_, v)) = get_tail_entry.get_tail_entry() { + if self.now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + > self.required_keep_time_millis + v.insert_time + { + return true; + } + } + + // If the capacity limit is exceeded, remove it. + current_size > capacity + } +} + +type TxnStatusCacheSlot = + LruCache; + +/// The cache for storing transaction status. It holds recent +/// `start_ts` -> `commit_ts` pairs for a while, which can be useful for quickly +/// but not strictly determining transaction status. +/// +/// `TxnStatusCache` is divided into several slots +/// to make the lock more fine-grained. Each slot uses an [`LruCache`] as the +/// internal implementation, with customized evict policy. However, we do not +/// always adopt the LRU behavior. Some operation to an existing entry in the +/// cache won't promote it to the most-recent place. +/// +/// Note that the `TxnStatusCache` updates metrics in some operations assuming +/// there's at most one instance of `TxnStatusCache` in a process. +pub struct TxnStatusCache { + slots: Vec>>, + is_enabled: bool, +} + +unsafe impl Sync for TxnStatusCache {} + +impl TxnStatusCache { + fn new_impl( + slots: usize, + required_keep_time: Duration, + capacity: usize, + simulated_system_time: Option>, + ) -> Self { + if capacity == 0 { + return Self { + slots: vec![], + is_enabled: false, + }; + } + + // The limit of the LruCache of each slot. 
+ let allowed_capacity_per_slot = capacity / slots; + // The total memory allocated initially by the LruCache's internal data + // structure for all slots. + + let mut initial_allocated_capacity_total = 0; + let res = Self { + slots: (0..slots) + .map(|_| { + let cache = LruCache::new( + allowed_capacity_per_slot, + 0, + lru::CountTracker::default(), + TxnStatusCacheEvictPolicy::new( + required_keep_time, + simulated_system_time.clone(), + ), + ); + let allocated_capacity = cache.internal_allocated_capacity(); + initial_allocated_capacity_total += allocated_capacity; + Mutex::new(cache).into() + }) + .collect(), + is_enabled: true, + }; + SCHED_TXN_STATUS_CACHE_SIZE + .allocated + .set(initial_allocated_capacity_total as i64); + res + } + + pub fn new(capacity: usize) -> Self { + Self::with_slots_and_time_limit( + TXN_STATUS_CACHE_SLOTS, + CACHE_ITEMS_REQUIRED_KEEP_TIME, + capacity, + ) + } + + #[cfg(test)] + pub fn new_for_test() -> Self { + // 1M capacity should be enough for tests. + Self::with_slots_and_time_limit(16, CACHE_ITEMS_REQUIRED_KEEP_TIME, 1 << 20) + } + + pub fn with_slots_and_time_limit( + slots: usize, + required_keep_time: Duration, + capacity: usize, + ) -> Self { + Self::new_impl(slots, required_keep_time, capacity, None) + } + + /// Create a `TxnStatusCache` instance for test purpose, with simulating + /// system time enabled. This helps when testing functionalities that are + /// related to system time. + /// + /// An `AtomicU64` will be returned. Store timestamps + /// in milliseconds in it to control the time. 
+ #[cfg(test)] + fn with_simulated_system_time( + slots: usize, + requried_keep_time: Duration, + capacity: usize, + ) -> (Self, Arc) { + let system_time = Arc::new(AtomicU64::new(0)); + let res = Self::new_impl( + slots, + requried_keep_time, + capacity, + Some(system_time.clone()), + ); + (res, system_time) + } + + fn slot_index(&self, start_ts: TimeStamp) -> usize { + fxhash::hash(&start_ts) % self.slots.len() + } + + /// Insert a transaction status into the cache. The current system time + /// should be passed from outside to avoid getting system time repeatedly + /// when multiple items is being inserted. + /// + /// If the transaction's information is already in the cache, it will + /// **NOT** be promoted to the most-recent place of the internal LRU. + pub fn insert(&self, start_ts: TimeStamp, commit_ts: TimeStamp, now: SystemTime) { + if !self.is_enabled { + return; + } + + let insert_time = now.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + let previous_size = slot.size(); + let previous_allocated = slot.internal_allocated_capacity(); + slot.insert_if_not_exist( + start_ts, + CacheEntry { + commit_ts, + insert_time, + }, + ); + let size = slot.size(); + let allocated = slot.internal_allocated_capacity(); + drop(slot); + + // Update statistics. + // CAUTION: Assuming that only one TxnStatusCache instance is in a TiKV process. + SCHED_TXN_STATUS_CACHE_SIZE + .used + .add(size as i64 - previous_size as i64); + SCHED_TXN_STATUS_CACHE_SIZE + .allocated + .add(allocated as i64 - previous_allocated as i64); + } + + /// Try to get an item from the cache, without promoting the item (if + /// exists) to the most recent place. 
+ pub fn get_no_promote(&self, start_ts: TimeStamp) -> Option { + if !self.is_enabled { + return None; + } + + let slot = self.slots[self.slot_index(start_ts)].lock(); + slot.get_no_promote(&start_ts).map(|entry| entry.commit_ts) + } + + pub fn get(&self, start_ts: TimeStamp) -> Option { + if !self.is_enabled { + return None; + } + + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + slot.get(&start_ts).map(|entry| entry.commit_ts) + } + + /// Remove an entry from the cache. We usually don't need to remove anything + /// from the `TxnStatusCache`, but it's useful in tests to construct cache- + /// miss cases. + #[cfg(test)] + pub fn remove(&self, start_ts: TimeStamp) -> Option { + if !self.is_enabled { + return None; + } + + let res = { + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + slot.remove(&start_ts).map(|e| e.commit_ts) + }; + debug_assert!(self.get_no_promote(start_ts).is_none()); + res + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant, SystemTime}, + }; + + use rand::{prelude::SliceRandom, Rng}; + + use super::*; + + fn bench_insert_impl(b: &mut test::Bencher, init_size: usize) { + let (c, time) = TxnStatusCache::with_simulated_system_time( + TXN_STATUS_CACHE_SLOTS, + Duration::from_millis(init_size as u64), + 1 << 20, + ); + let start_time = SystemTime::now(); + // Spread these items evenly in a specific time limit, so that every time + // a new item is inserted, an item will be popped out. + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + start_time + Duration::from_millis(i as u64), + ); + } + let mut current_time_shift = (init_size + 1) as u64; + b.iter(|| { + let simulated_now = start_time + Duration::from_millis(current_time_shift); + // Simulate the system time advancing. 
+ time.store( + simulated_now + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64, + Ordering::Release, + ); + c.insert( + current_time_shift.into(), + (current_time_shift + 1).into(), + simulated_now, + ); + current_time_shift += 1; + }); + test::black_box(&c); + } + + fn bench_get_impl(b: &mut test::Bencher, init_size: usize) { + let c = TxnStatusCache::with_slots_and_time_limit( + TXN_STATUS_CACHE_SLOTS, + CACHE_ITEMS_REQUIRED_KEEP_TIME, + 1 << 20, + ); + let now = SystemTime::now(); + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + now + Duration::from_millis(i as u64), + ); + } + let rand_range = if init_size == 0 { 10000 } else { init_size } as u64; + b.iter(|| { + let ts = rand::thread_rng().gen_range(0u64, rand_range); + let res = c.get_no_promote(ts.into()); + test::black_box(&res); + }) + } + + #[bench] + fn bench_insert_empty(b: &mut test::Bencher) { + bench_insert_impl(b, 0); + } + + #[bench] + fn bench_insert_100000(b: &mut test::Bencher) { + bench_insert_impl(b, 100000); + } + + #[bench] + fn bench_get_empty(b: &mut test::Bencher) { + bench_get_impl(b, 0); + } + + #[bench] + fn bench_get_100000(b: &mut test::Bencher) { + bench_get_impl(b, 100000); + } + + /// A simple statistic tool for collecting a set of data and calculating the + /// average, stddev, and percentiles (by using a linear histogram). + /// Data is collected in u128, and results are given in f64. 
+ struct SimpleStatistics { + sum: u128, + sum_square: u128, + count: usize, + bucket_width: u128, + buckets: Vec, + } + + impl SimpleStatistics { + fn new(bucket_width: u128) -> Self { + Self { + sum: 0, + sum_square: 0, + count: 0, + bucket_width, + buckets: vec![], + } + } + + /// Merge another instance into the current one + fn add(&mut self, other: Self) { + self.sum += other.sum; + self.sum_square += other.sum_square; + self.count += other.count; + assert_eq!(self.bucket_width, other.bucket_width); + if self.buckets.len() < other.buckets.len() { + self.buckets.resize(other.buckets.len(), 0); + } + for (count, other_count) in self.buckets.iter_mut().zip(other.buckets.iter()) { + *count += *other_count + } + } + + fn avg(&self) -> f64 { + self.sum as f64 / (self.count as f64) + } + + fn stddev(&self) -> f64 { + let avg = self.avg(); + let sum_sqr_diff: f64 = + (self.sum_square as f64) - (self.sum as f64 * avg * 2.0) + avg * self.count as f64; + (sum_sqr_diff / (self.count - 1) as f64).sqrt() + } + + /// Calculate the percentile value at specified position (should be in + /// range [0, 1]) + fn percentile(&self, position: f64) -> f64 { + let mut bucket = self.buckets.len(); + let mut prefix_sum = self.count; + while bucket > 0 { + bucket -= 1; + prefix_sum -= self.buckets[bucket]; + let prefix_percentile = prefix_sum as f64 / self.count as f64; + if prefix_percentile <= position { + assert_le!(prefix_sum as f64, position * self.count as f64); + assert_lt!( + position * self.count as f64, + (prefix_sum + self.buckets[bucket]) as f64 + ); + break; + } + } + + bucket as f64 * self.bucket_width as f64 + + (position * self.count as f64 - prefix_sum as f64) * self.bucket_width as f64 + / self.buckets[bucket] as f64 + } + + fn observe(&mut self, value: u128) { + self.sum += value; + self.sum_square += value * value; + self.count += 1; + let bucket = (value / self.bucket_width) as usize; + if self.buckets.len() <= bucket { + self.buckets.resize(bucket + 1, 0); + } + 
self.buckets[bucket] += 1; + } + } + + fn bench_concurrent_impl( + name: &str, + threads: usize, + function: impl Fn(u64) -> T + Send + Sync + 'static, + ) { + let start_time = Instant::now(); + // Run the benchmark code repeatedly for 10 seconds. + const TIME_LIMIT: Duration = Duration::from_secs(10); + let iteration = Arc::new(AtomicU64::new(0)); + + // Make the lifetime checker happy. + let function = Arc::new(function); + + let mut handles = Vec::with_capacity(threads); + for _ in 0..threads { + let f = function.clone(); + let iteration = iteration.clone(); + let handle = std::thread::spawn(move || { + let mut stats = SimpleStatistics::new(20); + loop { + if start_time.elapsed() > TIME_LIMIT { + break; + } + let i = iteration.fetch_add(1, Ordering::SeqCst); + let iter_start_time = Instant::now(); + test::black_box(f(i)); + let duration = iter_start_time.elapsed(); + stats.observe(duration.as_nanos()); + } + stats + }); + handles.push(handle); + } + + let mut total_stats = SimpleStatistics::new(20); + for h in handles { + total_stats.add(h.join().unwrap()); + } + + println!( + "benchmark {}: duration per iter: avg: {:?}, stddev: {:?}, percentile .99: {:?}, percentile .999: {:?}", + name, + Duration::from_nanos(total_stats.avg() as u64), + Duration::from_nanos(total_stats.stddev() as u64), + Duration::from_nanos(total_stats.percentile(0.99) as u64), + Duration::from_nanos(total_stats.percentile(0.999) as u64), + ); + } + + fn bench_txn_status_cache_concurrent_impl( + threads: usize, + init_size: usize, + simulate_contention: bool, + get_before_insert: bool, + ) { + let slots = if simulate_contention { + 1 + } else { + TXN_STATUS_CACHE_SLOTS + }; + let (c, time) = TxnStatusCache::with_simulated_system_time( + slots, + Duration::from_millis(init_size as u64), + 1 << 20, + ); + let start_time = SystemTime::now(); + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + start_time + Duration::from_millis(i as u64), + ); + } + + let name 
= format!( + "bench_concurrent_{}_{}_size{}{}", + if get_before_insert { + "get_and_insert" + } else { + "insert" + }, + threads, + init_size, + if simulate_contention { + "_contention" + } else { + "" + }, + ); + + bench_concurrent_impl(&name, threads, move |iter| { + let time_shift = init_size as u64 + iter; + let now = start_time + Duration::from_millis(time_shift); + time.store( + now.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64, + Ordering::Release, + ); + + if get_before_insert { + test::black_box(c.get_no_promote(time_shift.into())); + } + c.insert(time_shift.into(), (time_shift + 1).into(), now); + test::black_box(&c); + }); + } + + #[bench] + #[ignore] + fn bench_txn_status_cache_concurrent(_b: &mut test::Bencher) { + // This case is implemented to run the concurrent benchmark in a handy way + // just like running other normal benchmarks. However, it doesn't seem + // to be possible to benchmark an operation in concurrent way by using + // either the built-in bencher or criterion. + // Here we test it in our own way without using the built-in bencher, + // and output the result by stdout. + // When you need to run this benchmark, comment out the `#[ignore]` and + // add --nocapture in your benchmark command line to get the result. 
+ bench_txn_status_cache_concurrent_impl(16, 10000, false, false); + bench_txn_status_cache_concurrent_impl(16, 10000, true, false); + bench_txn_status_cache_concurrent_impl(16, 10000, false, true); + bench_txn_status_cache_concurrent_impl(16, 10000, true, true); + bench_txn_status_cache_concurrent_impl(64, 10000, false, false); + bench_txn_status_cache_concurrent_impl(64, 10000, true, false); + bench_txn_status_cache_concurrent_impl(64, 10000, false, true); + bench_txn_status_cache_concurrent_impl(64, 10000, true, true); + } + + #[test] + fn test_insert_and_get() { + let c = TxnStatusCache::new_for_test(); + assert!(c.get_no_promote(1.into()).is_none()); + + let now = SystemTime::now(); + + c.insert(1.into(), 2.into(), now); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + c.insert(3.into(), 4.into(), now); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // This won't actually happen, since a transaction will never have commit info + // with two different commit_ts. We just use this to check replacing + // won't happen. 
+ c.insert(1.into(), 4.into(), now); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + + let mut start_ts_list: Vec<_> = (1..100).step_by(2).map(TimeStamp::from).collect(); + start_ts_list.shuffle(&mut rand::thread_rng()); + for &start_ts in &start_ts_list { + let commit_ts = start_ts.next(); + c.insert(start_ts, commit_ts, now); + } + start_ts_list.shuffle(&mut rand::thread_rng()); + for &start_ts in &start_ts_list { + let commit_ts = start_ts.next(); + assert_eq!(c.get_no_promote(start_ts).unwrap(), commit_ts); + } + } + + #[test] + fn test_evicting_expired() { + let (c, time) = + TxnStatusCache::with_simulated_system_time(1, Duration::from_millis(1000), 1000); + let time_base = SystemTime::now(); + let set_time = |offset_millis: u64| { + time.store( + time_base.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + offset_millis, + Ordering::Release, + ) + }; + let now = || UNIX_EPOCH + Duration::from_millis(time.load(Ordering::Acquire)); + + set_time(0); + assert_lt!( + time_base.duration_since(now()).unwrap(), + Duration::from_millis(1) + ); + + c.insert(1.into(), 2.into(), now()); + set_time(1); + c.insert(3.into(), 4.into(), now()); + set_time(2); + c.insert(5.into(), 6.into(), now()); + // Size should be calculated by count. + assert_eq!(c.slots[0].lock().size(), 3); + + // Insert entry 1 again. So if entry 1 is the first one to be popped out, it + // verifies that inserting an existing key won't promote it. + c.insert(1.into(), 2.into(), now()); + + // All the 3 entries are kept + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + assert_eq!(c.get_no_promote(5.into()).unwrap(), 6.into()); + + set_time(1001); + c.insert(7.into(), 8.into(), now()); + // Entry 1 will be popped out. 
+ assert!(c.get_no_promote(1.into()).is_none()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + assert_eq!(c.get_no_promote(5.into()).unwrap(), 6.into()); + set_time(1004); + c.insert(9.into(), 10.into(), now()); + // It pops more than 1 entries if there are many expired items at the tail. + // Entry 3 and 5 will be popped out. + assert!(c.get_no_promote(1.into()).is_none()); + assert!(c.get_no_promote(3.into()).is_none()); + assert!(c.get_no_promote(5.into()).is_none()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + assert_eq!(c.get_no_promote(9.into()).unwrap(), 10.into()); + + // Now the cache's contents are: + // 7@1001, 9@1004 + // Test `get` promotes an entry and entries are not in order on insert time. + assert_eq!(c.get(7.into()).unwrap(), 8.into()); + set_time(2003); + c.insert(11.into(), 12.into(), now()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + assert_eq!(c.get_no_promote(9.into()).unwrap(), 10.into()); + assert_eq!(c.get_no_promote(11.into()).unwrap(), 12.into()); + + set_time(2005); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(7.into()).is_none()); + assert!(c.get_no_promote(9.into()).is_none()); + assert_eq!(c.get_no_promote(11.into()).unwrap(), 12.into()); + + // Now the cache's contents are: + // 11@2003, 13@2005 + // Test inserting existed entries. + // According to the implementation of LruCache, though it won't do any update to + // the content, it still check the tail to see if anything can be + // evicted. + set_time(3004); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(11.into()).is_none()); + assert_eq!(c.get_no_promote(13.into()).unwrap(), 14.into()); + + set_time(3006); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(13.into()).is_none()); + + // Now the cache is empty. + c.insert(15.into(), 16.into(), now()); + set_time(3008); + c.insert(17.into(), 18.into(), now()); + // Test inserting existed entry doesn't promote it. 
+ // Re-insert 15. + set_time(3009); + c.insert(15.into(), 16.into(), now()); + set_time(4007); + c.insert(19.into(), 20.into(), now()); + // 15's insert time is not updated, and is at the tail of the LRU, so it should + // be popped. + assert!(c.get_no_promote(15.into()).is_none()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + + // Now the cache's contents are: + // 17@3008, 19@4007 + // Test system time being changed, which can lead to current time being less + // than entries' insert time. + set_time(2000); + c.insert(21.into(), 22.into(), now()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + assert_eq!(c.get_no_promote(21.into()).unwrap(), 22.into()); + set_time(3500); + c.insert(23.into(), 24.into(), now()); + assert_eq!(c.get_no_promote(21.into()).unwrap(), 22.into()); + assert_eq!(c.get(17.into()).unwrap(), 18.into()); + assert_eq!(c.get(19.into()).unwrap(), 20.into()); + assert_eq!(c.get(23.into()).unwrap(), 24.into()); + // `get` promotes the entries, and entry 21 is put to the tail. + c.insert(23.into(), 24.into(), now()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + assert!(c.get_no_promote(21.into()).is_none()); + assert_eq!(c.get_no_promote(23.into()).unwrap(), 24.into()); + + // Now the cache's contents are: + // 17@3008, 19@4007, 23@3500 + // The time passed to `insert` may differ from the time fetched in + // the `TxnStatusCacheEvictPolicy` as they are fetched at different time. 
+ set_time(4009);
+ // Insert with time 4007, but check with time 4009
+ c.insert(25.into(), 26.into(), now() - Duration::from_millis(2));
+ assert!(c.get_no_promote(17.into()).is_none());
+ assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into());
+
+ // The cache's contents:
+ // 19@4007, 23@3500, 25@4007
+ set_time(4010);
+ c.insert(27.into(), 28.into(), now());
+ // The cache's contents:
+ // 19@4007, 23@3500, 25@4007, 27@4010
+
+ // It's also possible to check with a lower time considering that system time
+ // may be changed. Insert with time 5018, but check with time 5008
+ set_time(5008);
+ c.insert(29.into(), 30.into(), now() + Duration::from_millis(10));
+ assert!(c.get_no_promote(19.into()).is_none());
+ assert!(c.get_no_promote(23.into()).is_none());
+ assert!(c.get_no_promote(25.into()).is_none());
+ assert_eq!(c.get_no_promote(27.into()).unwrap(), 28.into());
+ assert_eq!(c.get_no_promote(29.into()).unwrap(), 30.into());
+
+ // Now the cache's contents are:
+ // 27@4010, 29@5018
+ // Considering the case that system time is being changed, it's even
+ // possible that the entry being inserted is already expired
+ // compared to the current time. It doesn't matter whether the
+ // entry will be dropped immediately or not. We just ensure it won't
+ // trigger more troubles.
+ set_time(7000); + c.insert(31.into(), 32.into(), now() - Duration::from_millis(1001)); + assert!(c.get_no_promote(27.into()).is_none()); + assert!(c.get_no_promote(29.into()).is_none()); + assert!(c.get_no_promote(31.into()).is_none()); + assert_eq!(c.slots[0].lock().size(), 0); + } + + #[test] + fn test_setting_capacity() { + let c = TxnStatusCache::new_impl(2, Duration::from_millis(1000), 10, None); + assert!(c.is_enabled); + assert_eq!(c.slots.len(), 2); + assert_eq!(c.slots[0].lock().capacity(), 5); + assert_eq!(c.slots[1].lock().capacity(), 5); + + let c = TxnStatusCache::new_impl(2, Duration::from_millis(1000), 0, None); + assert!(!c.is_enabled); + assert_eq!(c.slots.len(), 0); + // All operations are noops and won't cause panic or return any incorrect + // result. + c.insert(1.into(), 2.into(), SystemTime::now()); + assert!(c.get_no_promote(1.into()).is_none()); + assert!(c.get(1.into()).is_none()); + } + + #[test] + fn test_evicting_by_capacity() { + let (c, time) = + TxnStatusCache::with_simulated_system_time(1, Duration::from_millis(1000), 5); + let time_base = SystemTime::now(); + let set_time = |offset_millis: u64| { + time.store( + time_base.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + offset_millis, + Ordering::Release, + ) + }; + let now = || UNIX_EPOCH + Duration::from_millis(time.load(Ordering::Acquire)); + + set_time(0); + c.insert(1.into(), 2.into(), now()); + set_time(2); + c.insert(3.into(), 4.into(), now()); + set_time(4); + c.insert(5.into(), 6.into(), now()); + set_time(6); + c.insert(7.into(), 8.into(), now()); + + // The cache can keep at most 5 entries. + set_time(8); + c.insert(9.into(), 10.into(), now()); + // Entry 1 not evicted. 5 entries in the cache currently + assert_eq!(c.slots[0].lock().len(), 5); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + set_time(10); + c.insert(11.into(), 12.into(), now()); + // Entry 1 evicted. Still 5 entries in the cache. 
+ assert_eq!(c.slots[0].lock().len(), 5);
+ assert!(c.get_no_promote(1.into()).is_none());
+ assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into());
+
+ // Nothing will be evicted after trying to insert an existing key.
+ c.insert(11.into(), 12.into(), now());
+ assert_eq!(c.slots[0].lock().len(), 5);
+ assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into());
+
+ // Current contents (key@time):
+ // 3@2, 5@4, 7@6, 9@8, 11@10
+ // Evicting by time works as well.
+ set_time(1005);
+ c.insert(13.into(), 14.into(), now());
+ assert_eq!(c.slots[0].lock().len(), 4);
+ assert!(c.get_no_promote(3.into()).is_none());
+ assert!(c.get_no_promote(5.into()).is_none());
+ assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into());
+
+ // Reorder the entries by `get` to prepare for testing the next case.
+ assert_eq!(c.get(7.into()).unwrap(), 8.into());
+ assert_eq!(c.get(9.into()).unwrap(), 10.into());
+ assert_eq!(c.get(11.into()).unwrap(), 12.into());
+
+ c.insert(15.into(), 16.into(), now());
+ // Current contents:
+ // 13@1005, 7@6, 9@8, 11@10, 15@1005
+ assert_eq!(c.slots[0].lock().len(), 5);
+ // Expired entries that are not the tail can be evicted after the tail
+ // is evicted due to capacity exceeded.
+ set_time(1011); + c.insert(17.into(), 18.into(), now()); + assert_eq!(c.slots[0].lock().len(), 2); + assert!(c.get_no_promote(13.into()).is_none()); + assert!(c.get_no_promote(7.into()).is_none()); + assert!(c.get_no_promote(9.into()).is_none()); + assert!(c.get_no_promote(11.into()).is_none()); + assert_eq!(c.get(15.into()).unwrap(), 16.into()); + assert_eq!(c.get(17.into()).unwrap(), 18.into()); + } +} diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs index 00f5c3c778e..2ec1109edd4 100644 --- a/tests/failpoints/cases/test_kv_service.rs +++ b/tests/failpoints/cases/test_kv_service.rs @@ -3,10 +3,14 @@ use std::{sync::Arc, time::Duration}; use grpcio::{ChannelBuilder, Environment}; -use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; +use kvproto::{ + kvrpcpb::{PrewriteRequestPessimisticAction::SkipPessimisticCheck, *}, + tikvpb::TikvClient, +}; use test_raftstore::{ - configure_for_lease_read, must_kv_commit, must_kv_prewrite, must_new_cluster_and_kv_client, - must_new_cluster_mul, new_server_cluster, try_kv_prewrite_with_impl, + configure_for_lease_read, must_kv_commit, must_kv_have_locks, must_kv_prewrite, + must_kv_prewrite_with, must_new_cluster_and_kv_client, must_new_cluster_mul, + new_server_cluster, try_kv_prewrite_with, try_kv_prewrite_with_impl, }; use tikv_util::{config::ReadableDuration, HandyRwLock}; @@ -92,6 +96,7 @@ fn test_undetermined_write_err() { &client, ctx, vec![mutation], + vec![], b"k".to_vec(), 10, 0, @@ -156,3 +161,105 @@ fn test_stale_read_on_local_leader() { assert!(resp.region_error.is_none()); assert_eq!(v, resp.get_value()); } + +#[test] +fn test_storage_do_not_update_txn_status_cache_on_write_error() { + let cache_hit_fp = "before_prewrite_txn_status_cache_hit"; + let cache_miss_fp = "before_prewrite_txn_status_cache_miss"; + + let (cluster, leader, ctx) = must_new_cluster_mul(1); + let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(env) + 
.connect(&cluster.sim.read().unwrap().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + let pk = b"pk".to_vec(); + + // Case 1: Test write successfully. + + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(b"k1".to_vec()); + mutation.set_value(b"v1".to_vec()); + must_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation.clone()], + vec![SkipPessimisticCheck], + pk.clone(), + 10, + 10, + true, + false, + ); + must_kv_commit(&client, ctx.clone(), vec![b"k1".to_vec()], 10, 15, 15); + + // Expect cache hit + fail::cfg(cache_miss_fp, "panic").unwrap(); + must_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation], + vec![SkipPessimisticCheck], + pk.clone(), + 10, + 10, + true, + false, + ); + // Key not locked. + must_kv_have_locks(&client, ctx.clone(), 19, b"k1", b"k2", &[]); + fail::remove(cache_miss_fp); + + // Case 2: Write failed. + + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(b"k2".to_vec()); + mutation.set_value(b"v2".to_vec()); + + try_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation.clone()], + vec![SkipPessimisticCheck], + pk.clone(), + 20, + 20, + true, + false, + ); + fail::cfg("raftkv_early_error_report", "return").unwrap(); + let mut commit_req = CommitRequest::default(); + commit_req.set_context(ctx.clone()); + commit_req.set_start_version(20); + commit_req.set_commit_version(25); + commit_req.set_keys(vec![b"k2".to_vec()].into()); + let commit_resp = client.kv_commit(&commit_req).unwrap(); + assert!(commit_resp.has_region_error()); + fail::remove("raftkv_early_error_report"); + must_kv_have_locks( + &client, + ctx.clone(), + 29, + b"k2", + b"k3", + &[(b"k2", Op::Put, 20, 20)], + ); + + // Expect cache miss + fail::cfg(cache_hit_fp, "panic").unwrap(); + try_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation], + vec![SkipPessimisticCheck], + pk, + 20, + 20, + true, + false, + ); + must_kv_have_locks(&client, ctx, 
29, b"k2", b"k3", &[(b"k2", Op::Put, 20, 20)]); + fail::remove(cache_hit_fp); +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 1ac6e3840f1..2f4f5ba7695 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -761,6 +761,7 @@ fn test_serde_custom_tikv_config() { other_priority: IoPriority::Low, }, background_error_recovery_window: ReadableDuration::hours(1), + txn_status_cache_capacity: 1000, }; value.coprocessor = CopConfig { split_region_on_table: false, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index fe1fa066ae8..1bb52fad5fc 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -101,6 +101,7 @@ reserve-space = "10GB" reserve-raft-space = "2GB" enable-ttl = true ttl-check-poll-interval = "0s" +txn-status-cache-capacity = 1000 [storage.block-cache] capacity = "40GB" From 9f62d8f97e61052fed00dea3748967a71b7ceb4a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 1 Nov 2023 12:23:08 +0800 Subject: [PATCH 100/220] tidb_query_expr: fix the behavior of `field` function (#15879) (#15884) close tikv/tikv#15878 Signed-off-by: Yang Keao Co-authored-by: Yang Keao --- components/tidb_query_expr/src/impl_string.rs | 52 +++++++++++++++---- components/tidb_query_expr/src/lib.rs | 10 +++- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index f3b9b03c287..25c9294d533 100644 --- a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -635,15 +635,22 @@ fn field(args: &[Option<&T>]) -> Result #[rpn_fn(nullable, varg, min_args = 1)] #[inline] -fn field_bytes(args: &[Option]) -> Result> { +fn field_bytes(args: &[Option]) -> Result> { Ok(Some(match args[0] { // As per the MySQL doc, if the first argument is NULL, this function always returns 0. 
None => 0, - Some(val) => args - .iter() - .skip(1) - .position(|&i| i == Some(val)) - .map_or(0, |pos| (pos + 1) as i64), + Some(val) => { + for (pos, arg) in args.iter().enumerate().skip(1) { + if arg.is_none() { + continue; + } + match C::sort_compare(val, arg.unwrap()) { + Ok(Ordering::Equal) => return Ok(Some(pos as i64)), + _ => continue, + } + } + 0 + } })) } @@ -3214,6 +3221,7 @@ mod tests { Some(b"baz".to_vec()), ], Some(1), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3223,6 +3231,7 @@ mod tests { Some(b"hello".to_vec()), ], Some(0), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3232,6 +3241,7 @@ mod tests { Some(b"hello".to_vec()), ], Some(3), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3244,6 +3254,7 @@ mod tests { Some(b"Hello".to_vec()), ], Some(6), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3252,14 +3263,37 @@ mod tests { Some(b"Hello World!".to_vec()), ], Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![None, None, Some(b"Hello World!".to_vec())], + Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![Some(b"Hello World!".to_vec())], + Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![ + Some(b"a".to_vec()), + Some(b"A".to_vec()), + Some(b"a".to_vec()), + ], + Some(1), + Collation::Utf8Mb4GeneralCi, ), - (vec![None, None, Some(b"Hello World!".to_vec())], Some(0)), - (vec![Some(b"Hello World!".to_vec())], Some(0)), ]; - for (args, expect_output) in test_cases { + for (args, expect_output, collation) in test_cases { let output = RpnFnScalarEvaluator::new() .push_params(args) + .return_field_type( + FieldTypeBuilder::new() + .tp(FieldTypeTp::Long) + .collation(collation), + ) .evaluate(ScalarFuncSig::FieldString) .unwrap(); assert_eq!(output, expect_output); diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index c2ef6722148..61fb3612b63 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -409,6 +409,14 @@ fn map_lower_utf8_sig(value: ScalarFuncSig, children: &[Expr]) -> Result Result { 
+ Ok(match_template_collator! { + TT, match ret_field_type.as_accessor().collation().map_err(tidb_query_datatype::codec::Error::from)? { + Collation::TT => field_bytes_fn_meta::() + } + }) +} + #[rustfmt::skip] fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { let value = expr.get_sig(); @@ -787,7 +795,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::Locate3Args => locate_3_args_fn_meta(), ScalarFuncSig::FieldInt => field_fn_meta::(), ScalarFuncSig::FieldReal => field_fn_meta::(), - ScalarFuncSig::FieldString => field_bytes_fn_meta(), + ScalarFuncSig::FieldString => map_field_string_sig(ft)?, ScalarFuncSig::Elt => elt_fn_meta(), ScalarFuncSig::MakeSet => make_set_fn_meta(), ScalarFuncSig::Space => space_fn_meta(), From d07fcc0d3f59e847c4d30ec5ccfd3a04240ac083 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 3 Nov 2023 17:32:09 +0800 Subject: [PATCH 101/220] raftstore: improve the bucket split strategy (#15798) (#15843) close tikv/tikv#13671 there are three reason may cause the bucket not split: 1. split check tick will refresh bucket info even info the bucket version not change 2. the suspect buckets only conside the increment flow 3. all the bucket increment flows are reset if one bucket is updated. To solve this, bucket stats only record the increment flow and reset it after meta size updated. 
Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: tongjian <1045931706@qq.com> Co-authored-by: bufferflies <1045931706@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../raftstore-v2/src/operation/bucket.rs | 391 +---------------- components/raftstore-v2/src/operation/mod.rs | 1 - components/raftstore-v2/src/raft/peer.rs | 11 +- components/raftstore/src/store/fsm/apply.rs | 6 +- components/raftstore/src/store/fsm/peer.rs | 217 +++------- components/raftstore/src/store/mod.rs | 18 +- components/raftstore/src/store/peer.rs | 37 +- components/raftstore/src/store/worker/mod.rs | 3 +- components/raftstore/src/store/worker/read.rs | 6 +- .../raftstore/src/store/worker/split_check.rs | 396 +++++++++++++++++- components/test_raftstore/src/cluster.rs | 2 +- .../raftstore/test_split_region.rs | 65 ++- 12 files changed, 537 insertions(+), 616 deletions(-) diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index 242b9a9b33b..920a4e68e8c 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -6,12 +6,12 @@ use std::sync::Arc; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - metapb::{self, RegionEpoch}, + metapb::RegionEpoch, raft_serverpb::{ExtraMessageType, RaftMessage, RefreshBuckets}, }; -use pd_client::{BucketMeta, BucketStat}; +use pd_client::BucketMeta; use raftstore::{ - coprocessor::{Config, RegionChangeEvent}, + coprocessor::RegionChangeEvent, store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, }; use slog::{error, info}; @@ -24,213 +24,6 @@ use crate::{ worker::pd, }; -#[derive(Debug, Clone, Default)] -pub struct BucketStatsInfo { - // the stats is increment flow. - bucket_stat: Option, - // the report bucket stat records the increment stats after last report pd. - // it will be reset after report pd. 
- report_bucket_stat: Option, - // avoid the version roll back, it record the last bucket version if bucket stat isn't none. - last_bucket_version: u64, -} - -impl BucketStatsInfo { - /// returns all bucket ranges those's write_bytes exceed the given - /// diff_size_threshold. - pub fn gen_bucket_range_for_update( - &self, - region_bucket_max_size: u64, - ) -> Option> { - let region_buckets = self.bucket_stat.as_ref()?; - let stats = ®ion_buckets.stats; - let keys = ®ion_buckets.meta.keys; - let sizes = ®ion_buckets.meta.sizes; - - let mut suspect_bucket_ranges = vec![]; - assert_eq!(keys.len(), stats.write_bytes.len() + 1); - for i in 0..stats.write_bytes.len() { - let estimated_bucket_size = stats.write_bytes[i] + sizes[i]; - if estimated_bucket_size >= region_bucket_max_size { - suspect_bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); - } - } - Some(suspect_bucket_ranges) - } - - #[inline] - pub fn version(&self) -> u64 { - self.bucket_stat - .as_ref() - .map(|b| b.meta.version) - .or(Some(self.last_bucket_version)) - .unwrap_or_default() - } - - #[inline] - pub fn add_bucket_flow(&mut self, delta: &Option) { - if let (Some(buckets), Some(report_buckets), Some(delta)) = ( - self.bucket_stat.as_mut(), - self.report_bucket_stat.as_mut(), - delta, - ) { - buckets.merge(delta); - report_buckets.merge(delta); - } - } - - #[inline] - pub fn set_bucket_stat(&mut self, buckets: Option) { - self.bucket_stat = buckets.clone(); - if let Some(new_buckets) = buckets { - self.last_bucket_version = new_buckets.meta.version; - let mut new_report_buckets = BucketStat::from_meta(new_buckets.meta); - if let Some(old) = &mut self.report_bucket_stat { - new_report_buckets.merge(old); - *old = new_report_buckets; - } else { - self.report_bucket_stat = Some(new_report_buckets); - } - } else { - self.report_bucket_stat = None; - } - } - - #[inline] - pub fn report_bucket_stat(&mut self) -> BucketStat { - let current = self.report_bucket_stat.as_mut().unwrap(); - 
let delta = current.clone(); - current.clear_stats(); - delta - } - - #[inline] - pub fn bucket_stat(&self) -> &Option { - &self.bucket_stat - } - - pub fn on_refresh_region_buckets( - &mut self, - cfg: &Config, - next_bucket_version: u64, - buckets: Vec, - region_epoch: RegionEpoch, - region: metapb::Region, - bucket_ranges: Option>, - ) -> bool { - let change_bucket_version: bool; - // The region buckets reset after this region happened split or merge. - // The message should be dropped if it's epoch is lower than the regions. - // The bucket ranges is none when the region buckets is also none. - // So this condition indicates that the region buckets needs to refresh not - // renew. - if let Some(bucket_ranges) = bucket_ranges&&self.bucket_stat.is_some(){ - assert_eq!(buckets.len(), bucket_ranges.len()); - change_bucket_version=self.update_buckets(cfg, next_bucket_version, buckets, region_epoch, &bucket_ranges); - }else{ - change_bucket_version = true; - // when the region buckets is none, the exclusive buckets includes all the - // bucket keys. - self.init_buckets(cfg, next_bucket_version, buckets, region_epoch, region); - } - change_bucket_version - } - - fn update_buckets( - &mut self, - cfg: &Config, - next_bucket_version: u64, - buckets: Vec, - region_epoch: RegionEpoch, - bucket_ranges: &Vec, - ) -> bool { - let origin_region_buckets = self.bucket_stat.as_ref().unwrap(); - let mut change_bucket_version = false; - let mut meta_idx = 0; - let mut region_buckets = origin_region_buckets.clone(); - let mut meta = (*region_buckets.meta).clone(); - meta.region_epoch = region_epoch; - - // bucket stats will clean if the bucket size is updated. - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - // the bucket ranges maybe need to split or merge not all the meta keys, so it - // needs to find the first keys. 
- while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { - meta_idx += 1; - } - // meta_idx can't be not the last entry (which is end key) - if meta_idx >= meta.keys.len() - 1 { - break; - } - // the bucket size is small and does not have split keys, - // then it should be merged with its left neighbor - let region_bucket_merge_size = - cfg.region_bucket_merge_size_ratio * (cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[meta_idx] = bucket.size; - region_buckets.clean_stats(meta_idx); - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && meta_idx != 0 - && meta.sizes[meta_idx - 1] + bucket.size < cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(meta_idx); - meta.left_merge(meta_idx); - change_bucket_version = true; - continue; - } - } else { - // update size - meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; - region_buckets.clean_stats(meta_idx); - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - meta_idx += 1; - region_buckets.split(meta_idx); - meta.split(meta_idx, bucket_key); - change_bucket_version = true; - } - } - meta_idx += 1; - } - if change_bucket_version { - meta.version = next_bucket_version; - } - region_buckets.meta = Arc::new(meta); - self.set_bucket_stat(Some(region_buckets)); - change_bucket_version - } - - fn init_buckets( - &mut self, - cfg: &Config, - next_bucket_version: u64, - mut buckets: Vec, - region_epoch: RegionEpoch, - region: metapb::Region, - ) { - // when the region buckets is none, the exclusive buckets includes all the - // bucket keys. 
- assert_eq!(buckets.len(), 1); - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - let mut meta = BucketMeta { - region_id: region.get_id(), - region_epoch, - version: next_bucket_version, - keys: bucket_keys, - sizes: vec![cfg.region_bucket_size.0; bucket_count], - }; - // padding the boundary keys and initialize the flow. - meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - let bucket_stats = BucketStat::from_meta(Arc::new(meta)); - self.set_bucket_stat(Some(bucket_stats)); - } -} - impl Peer { #[inline] pub fn on_refresh_region_buckets( @@ -250,14 +43,13 @@ impl Peer { let current_version = self.region_buckets_info().version(); let next_bucket_version = util::gen_bucket_version(self.term(), current_version); - // let mut is_first_refresh = true; let region = self.region().clone(); let change_bucket_version = self.region_buckets_info_mut().on_refresh_region_buckets( &store_ctx.coprocessor_host.cfg, next_bucket_version, buckets, region_epoch, - region, + ®ion, bucket_ranges, ); let region_buckets = self @@ -443,178 +235,3 @@ where self.schedule_tick(PeerTick::ReportBuckets); } } - -#[cfg(test)] -mod tests { - use super::*; - - // create BucketStatsInfo include three keys: ["","100","200",""]. 
- fn mock_bucket_stats_info() -> BucketStatsInfo { - let mut bucket_stats_info = BucketStatsInfo::default(); - let cfg = Config::default(); - let next_bucket_version = 1; - let bucket_ranges = None; - let mut region_epoch = RegionEpoch::default(); - region_epoch.set_conf_ver(1); - region_epoch.set_version(1); - let mut region = metapb::Region::default(); - region.set_id(1); - - let mut buckets = vec![]; - let mut bucket = Bucket::default(); - bucket.keys.push(vec![100]); - bucket.keys.push(vec![200]); - buckets.insert(0, bucket); - - let _ = bucket_stats_info.on_refresh_region_buckets( - &cfg, - next_bucket_version, - buckets, - region_epoch, - region, - bucket_ranges, - ); - bucket_stats_info - } - - #[test] - pub fn test_version() { - let mut bucket_stats_info = mock_bucket_stats_info(); - assert_eq!(1, bucket_stats_info.version()); - bucket_stats_info.set_bucket_stat(None); - assert_eq!(1, bucket_stats_info.version()); - - let mut meta = BucketMeta::default(); - meta.version = 2; - meta.keys.push(vec![]); - meta.keys.push(vec![]); - let bucket_stat = BucketStat::from_meta(Arc::new(meta)); - bucket_stats_info.set_bucket_stat(Some(bucket_stat)); - assert_eq!(2, bucket_stats_info.version()); - } - - #[test] - pub fn test_insert_new_buckets() { - let bucket_stats_info = mock_bucket_stats_info(); - - let cfg = Config::default(); - let bucket_stat = bucket_stats_info.bucket_stat.unwrap(); - assert_eq!( - vec![vec![], vec![100], vec![200], vec![]], - bucket_stat.meta.keys - ); - for i in 0..bucket_stat.stats.write_bytes.len() { - assert_eq!(cfg.region_bucket_size.0, bucket_stat.meta.sizes[i]); - assert_eq!(0, bucket_stat.stats.write_bytes[i]); - } - } - - #[test] - pub fn test_report_buckets() { - let mut bucket_stats_info = mock_bucket_stats_info(); - let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); - let mut delta_bucket_stats = bucket_stats.clone(); - delta_bucket_stats.write_key(&[1], 1); - delta_bucket_stats.write_key(&[201], 1); - 
bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats.clone())); - let bucket_stats = bucket_stats_info.report_bucket_stat(); - assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); - - let report_bucket_stats = bucket_stats_info.report_bucket_stat(); - assert_eq!(vec![0, 0, 0], report_bucket_stats.stats.write_bytes); - bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); - assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); - } - - #[test] - pub fn test_spilt_and_merge_buckets() { - let mut bucket_stats_info = mock_bucket_stats_info(); - let next_bucket_version = 2; - let mut region = metapb::Region::default(); - region.set_id(1); - let cfg = Config::default(); - let bucket_size = cfg.region_bucket_size.0; - let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); - let region_epoch = bucket_stats.meta.region_epoch.clone(); - - // step1: update buckets flow - let mut delta_bucket_stats = bucket_stats.clone(); - delta_bucket_stats.write_key(&[1], 1); - delta_bucket_stats.write_key(&[201], 1); - bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); - let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); - assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); - - // step2: tick not affect anything - let bucket_ranges = Some(vec![]); - let buckets = vec![]; - let mut change_bucket_version = bucket_stats_info.on_refresh_region_buckets( - &cfg, - next_bucket_version, - buckets, - region_epoch.clone(), - region.clone(), - bucket_ranges, - ); - let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); - assert!(!change_bucket_version); - assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); - - // step3: split key 50 - let mut bucket_ranges = Some(vec![BucketRange(vec![], vec![100])]); - let mut bucket = Bucket::default(); - bucket.keys = vec![vec![50]]; - bucket.size = bucket_size; - let mut buckets = vec![bucket]; - change_bucket_version = bucket_stats_info.on_refresh_region_buckets( - 
&cfg, - next_bucket_version, - buckets.clone(), - region_epoch.clone(), - region.clone(), - bucket_ranges.clone(), - ); - assert!(change_bucket_version); - let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); - assert_eq!( - vec![vec![], vec![50], vec![100], vec![200], vec![]], - bucket_stats.meta.keys - ); - assert_eq!( - vec![bucket_size / 2, bucket_size / 2, bucket_size, bucket_size], - bucket_stats.meta.sizes - ); - assert_eq!(vec![0, 0, 0, 2], bucket_stats.stats.write_bytes); - - // step4: merge [50-100] to [0-50], - bucket_ranges = Some(vec![BucketRange(vec![50], vec![100])]); - let mut bucket = Bucket::default(); - bucket.keys = vec![]; - bucket.size = 0; - buckets = vec![bucket]; - change_bucket_version = bucket_stats_info.on_refresh_region_buckets( - &cfg, - next_bucket_version, - buckets, - region_epoch, - region, - bucket_ranges, - ); - assert!(change_bucket_version); - - let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); - assert_eq!( - vec![vec![], vec![100], vec![200], vec![]], - bucket_stats.meta.keys - ); - assert_eq!( - vec![bucket_size / 2, bucket_size, bucket_size], - bucket_stats.meta.sizes - ); - assert_eq!(vec![0, 0, 2], bucket_stats.stats.write_bytes); - - // report buckets doesn't be affected by the split and merge. 
- let report_bucket_stats = bucket_stats_info.report_bucket_stat(); - assert_eq!(vec![4, 0, 2], report_bucket_stats.stats.write_bytes); - } -} diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 6d5cba9fff8..9ccf08d6d54 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -24,7 +24,6 @@ pub use ready::{ }; pub(crate) use self::{ - bucket::BucketStatsInfo, command::SplitInit, query::{LocalReader, ReadDelegatePair, SharedReadTablet}, txn_ext::TxnContext, diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 2c8b8cef1db..9b095b872e7 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -23,8 +23,9 @@ use raftstore::{ fsm::ApplyMetrics, metrics::RAFT_PEER_PENDING_DURATION, util::{Lease, RegionReadProgress}, - Config, EntryStorage, ForceLeaderState, PeerStat, ProposalQueue, ReadDelegate, - ReadIndexQueue, ReadProgress, TabletSnapManager, UnsafeRecoveryState, WriteTask, + BucketStatsInfo, Config, EntryStorage, ForceLeaderState, PeerStat, ProposalQueue, + ReadDelegate, ReadIndexQueue, ReadProgress, TabletSnapManager, UnsafeRecoveryState, + WriteTask, }, }; use slog::{debug, info, Logger}; @@ -35,9 +36,9 @@ use crate::{ batch::StoreContext, fsm::ApplyScheduler, operation::{ - AbnormalPeerContext, AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, - GcPeerContext, MergeContext, ProposalControl, ReplayWatch, SimpleWriteReqEncoder, - SplitFlowControl, SplitPendingAppend, TxnContext, + AbnormalPeerContext, AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, + MergeContext, ProposalControl, ReplayWatch, SimpleWriteReqEncoder, SplitFlowControl, + SplitPendingAppend, TxnContext, }, router::{ApplyTask, CmdResChannel, PeerTick, QueryResChannel}, Result, diff --git a/components/raftstore/src/store/fsm/apply.rs 
b/components/raftstore/src/store/fsm/apply.rs index 038171d9715..339dff68e76 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -679,7 +679,7 @@ where exec_res: results, metrics: mem::take(&mut delegate.metrics), applied_term: delegate.applied_term, - bucket_stat: delegate.buckets.clone().map(Box::new), + bucket_stat: delegate.buckets.clone(), }); if !self.kv_wb().is_empty() { // Pending writes not flushed, need to set seqno to following ApplyRes later @@ -3874,7 +3874,7 @@ where pub applied_term: u64, pub exec_res: VecDeque>, pub metrics: ApplyMetrics, - pub bucket_stat: Option>, + pub bucket_stat: Option, pub write_seqno: Vec, } @@ -6929,7 +6929,7 @@ mod tests { router.schedule_task(1, Msg::apply(apply2)); let res = fetch_apply_res(&rx); - let bucket_version = res.bucket_stat.unwrap().as_ref().meta.version; + let bucket_version = res.bucket_stat.unwrap().meta.version; assert_eq!(bucket_version, 2); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 7504f746abe..42241e46475 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -41,7 +41,7 @@ use kvproto::{ replication_modepb::{DrAutoSyncState, ReplicationMode}, }; use parking_lot::RwLockWriteGuard; -use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::BucketMeta; use protobuf::Message; use raft::{ self, @@ -2327,10 +2327,11 @@ where return; } let applied_index = res.apply_state.applied_index; - let buckets = self.fsm.peer.region_buckets.as_mut(); - if let (Some(delta), Some(buckets)) = (res.bucket_stat, buckets) { - buckets.merge(&delta); - } + self.fsm + .peer + .region_buckets_info_mut() + .add_bucket_flow(&res.bucket_stat); + self.fsm.has_ready |= self.fsm.peer.post_apply( self.ctx, res.apply_state, @@ -5989,7 +5990,7 @@ where fn on_refresh_region_buckets( &mut self, region_epoch: RegionEpoch, - mut buckets: Vec, + buckets: 
Vec, bucket_ranges: Option>, _cb: Callback, ) { @@ -6017,14 +6018,14 @@ where // test purpose #[cfg(any(test, feature = "testexport"))] { - let default_buckets = BucketStat::default(); test_only_callback( _cb, self.fsm .peer - .region_buckets + .region_buckets_info() + .bucket_stat() .as_ref() - .unwrap_or(&default_buckets) + .unwrap() .meta .clone(), ); @@ -6032,108 +6033,53 @@ where return; } - let mut current_version = self + let current_version = self.fsm.peer.region_buckets_info().version(); + let next_bucket_version = util::gen_bucket_version(self.fsm.peer.term(), current_version); + let region = self.region().clone(); + let change_bucket_version = self + .fsm + .peer + .region_buckets_info_mut() + .on_refresh_region_buckets( + &self.ctx.coprocessor_host.cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + let region_buckets = self .fsm .peer - .region_buckets + .region_buckets_info() + .bucket_stat() .as_ref() - .map(|b| b.meta.version) - .unwrap_or_default(); - if current_version == 0 { - current_version = self - .fsm - .peer - .last_region_buckets - .as_ref() - .map(|b| b.meta.version) - .unwrap_or_default(); - } - let mut region_buckets: BucketStat; - if let Some(bucket_ranges) = bucket_ranges { - assert_eq!(buckets.len(), bucket_ranges.len()); - let mut i = 0; - region_buckets = self.fsm.peer.region_buckets.clone().unwrap(); - let mut meta = (*region_buckets.meta).clone(); - if !buckets.is_empty() { - meta.version = util::gen_bucket_version(self.fsm.peer.term(), current_version); - } - meta.region_epoch = region_epoch; - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - while i < meta.keys.len() && meta.keys[i] != bucket_range.0 { - i += 1; - } - assert!(i != meta.keys.len()); - // the bucket size is small and does not have split keys, - // then it should be merged with its left neighbor - let region_bucket_merge_size = - self.ctx.coprocessor_host.cfg.region_bucket_merge_size_ratio - * 
(self.ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[i] = bucket.size; - // i is not the last entry (which is end key) - assert!(i < meta.keys.len() - 1); - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && i != 0 - && meta.sizes[i - 1] + bucket.size - < self.ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(i); - meta.left_merge(i); - continue; - } - } else { - // update size - meta.sizes[i] = bucket.size / (bucket.keys.len() + 1) as u64; - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - i += 1; - region_buckets.split(i); - meta.split(i, bucket_key); - } - } - i += 1; - } - region_buckets.meta = Arc::new(meta); - } else { - debug!( - "refresh_region_buckets re-generates buckets"; + .unwrap() + .clone(); + let buckets_count = region_buckets.meta.keys.len() - 1; + if change_bucket_version { + // TODO: we may need to make it debug once the coprocessor timeout is resolved. + info!( + "finished on_refresh_region_buckets"; "region_id" => self.fsm.region_id(), + "buckets_count" => buckets_count, + "buckets_size" => ?region_buckets.meta.sizes, ); - assert_eq!(buckets.len(), 1); - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - - let mut meta = BucketMeta { - region_id: self.fsm.region_id(), - region_epoch, - version: util::gen_bucket_version(self.fsm.peer.term(), current_version), - keys: bucket_keys, - sizes: vec![self.ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], - }; - meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - region_buckets = BucketStat::from_meta(Arc::new(meta)); + } else { + // it means the buckets key range not any change, so don't need to refresh. 
+ test_only_callback(_cb, region_buckets.meta); + return; } - - let buckets_count = region_buckets.meta.keys.len() - 1; self.ctx.coprocessor_host.on_region_changed( - region, + self.region(), RegionChangeEvent::UpdateBuckets(buckets_count), self.fsm.peer.get_role(), ); let keys = region_buckets.meta.keys.clone(); - let old_region_buckets: Option = - self.fsm.peer.region_buckets.replace(region_buckets); - self.fsm.peer.last_region_buckets = old_region_buckets; + let version = region_buckets.meta.version; let mut store_meta = self.ctx.store_meta.lock().unwrap(); - let version = self.fsm.peer.region_buckets.as_ref().unwrap().meta.version; if let Some(reader) = store_meta.readers.get_mut(&self.fsm.region_id()) { - reader.update(ReadProgress::region_buckets( - self.fsm.peer.region_buckets.as_ref().unwrap().meta.clone(), - )); + reader.update(ReadProgress::region_buckets(region_buckets.meta.clone())); } // Notify followers to refresh their buckets version @@ -6154,19 +6100,9 @@ where .send_extra_message(extra_msg, &mut self.ctx.trans, &p); } } - - debug!( - "finished on_refresh_region_buckets"; - "region_id" => self.fsm.region_id(), - "buckets_count" => buckets_count, - "buckets_size" => ?self.fsm.peer.region_buckets.as_ref().unwrap().meta.sizes, - ); // test purpose #[cfg(any(test, feature = "testexport"))] - test_only_callback( - _cb, - self.fsm.peer.region_buckets.as_ref().unwrap().meta.clone(), - ); + test_only_callback(_cb, region_buckets.meta); } pub fn on_msg_refresh_buckets(&mut self, msg: RaftMessage) { @@ -6205,50 +6141,11 @@ where if !self.ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } - let region_buckets = self.fsm.peer.region_buckets.as_ref()?; - let stats = ®ion_buckets.stats; - let keys = ®ion_buckets.meta.keys; - - let empty_last_keys = vec![]; - let empty_last_stats = metapb::BucketStats::default(); - let (last_keys, last_stats, stats_reset) = self - .fsm + let region_bucket_max_size = self.ctx.coprocessor_host.cfg.region_bucket_size.0 
* 2; + self.fsm .peer - .last_region_buckets - .as_ref() - .map(|b| { - ( - &b.meta.keys, - &b.stats, - region_buckets.create_time != b.create_time, - ) - }) - .unwrap_or((&empty_last_keys, &empty_last_stats, false)); - - let mut bucket_ranges = vec![]; - let mut j = 0; - assert_eq!(keys.len(), stats.write_bytes.len() + 1); - for i in 0..stats.write_bytes.len() { - let mut diff_in_bytes = stats.write_bytes[i]; - while j < last_keys.len() && keys[i] > last_keys[j] { - j += 1; - } - if j < last_keys.len() && keys[i] == last_keys[j] { - if !stats_reset { - diff_in_bytes -= last_stats.write_bytes[j]; - } - j += 1; - } - - // if the bucket's write_bytes exceed half of the configured region_bucket_size, - // add it to the bucket_ranges for checking update - let bucket_update_diff_size_threshold = - self.ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; - if diff_in_bytes >= bucket_update_diff_size_threshold { - bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); - } - } - Some(bucket_ranges) + .region_buckets_info() + .gen_bucket_range_for_update(region_bucket_max_size) } fn on_schedule_half_split_region( @@ -6308,6 +6205,12 @@ where cb(peer_stat); } } + + // only check the suspect buckets, not split region. 
+ if source == "bucket" { + return; + } + let task = SplitCheckTask::split_check_key_range( region.clone(), start_key, @@ -6544,7 +6447,7 @@ where fn on_report_region_buckets_tick(&mut self) { if !self.fsm.peer.is_leader() - || self.fsm.peer.region_buckets.is_none() + || self.fsm.peer.region_buckets_info().bucket_stat().is_none() || self.fsm.hibernate_state.group_state() == GroupState::Idle { return; @@ -6552,11 +6455,11 @@ where let region_id = self.region_id(); let peer_id = self.fsm.peer_id(); - let region_buckets = self.fsm.peer.region_buckets.as_mut().unwrap(); + let region_buckets = self.fsm.peer.region_buckets_info_mut().report_bucket_stat(); if let Err(e) = self .ctx .pd_scheduler - .schedule(PdTask::ReportBuckets(region_buckets.clone())) + .schedule(PdTask::ReportBuckets(region_buckets)) { error!( "failed to report region buckets"; @@ -6565,8 +6468,6 @@ where "err" => ?e, ); } - // todo: it will delete in next pr. - region_buckets.stats = new_bucket_stats(®ion_buckets.meta); self.register_report_region_buckets_tick(); } @@ -6640,7 +6541,7 @@ where self.fsm.peer.approximate_keys = Some(self.fsm.peer.approximate_keys.unwrap_or_default() + keys); - if let Some(buckets) = &mut self.fsm.peer.region_buckets { + if let Some(buckets) = &mut self.fsm.peer.region_buckets_info_mut().bucket_stat_mut() { buckets.ingest_sst(keys, size); } // The ingested file may be overlapped with the data in engine, so we need to diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 0ca99efffc4..4cae84d1d25 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -85,14 +85,14 @@ pub use self::{ util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ metrics as worker_metrics, need_compact, AutoSplitController, BatchComponent, Bucket, - BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, CompactThreshold, - FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, 
LocalReader, - LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, - ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, - SplitConfig, SplitConfigManager, SplitInfo, StoreMetaDelegate, StoreStatsReporter, - TrackVer, WriteStats, WriterContoller, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, - DEFAULT_BIG_REGION_BYTE_THRESHOLD, DEFAULT_BIG_REGION_QPS_THRESHOLD, - DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + BucketRange, BucketStatsInfo, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, + CompactThreshold, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, + LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, + ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, WriterContoller, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }, }; diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 52932573d7e..57ad30785f6 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -43,7 +43,7 @@ use kvproto::{ }, }; use parking_lot::RwLockUpgradableReadGuard; -use pd_client::{BucketStat, INVALID_ID}; +use pd_client::INVALID_ID; use protobuf::Message; use raft::{ self, @@ -80,6 +80,7 @@ use super::{ self, check_req_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, + worker::BucketStatsInfo, DestroyPeerJob, LocalReadContext, }; use crate::{ @@ 
-780,9 +781,8 @@ where persisted_number: u64, /// The context of applying snapshot. apply_snap_ctx: Option, - /// region buckets. - pub region_buckets: Option, - pub last_region_buckets: Option, + /// region buckets info in this region. + region_buckets_info: BucketStatsInfo, /// lead_transferee if this peer(leader) is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, @@ -931,8 +931,7 @@ where unpersisted_ready: None, persisted_number: 0, apply_snap_ctx: None, - region_buckets: None, - last_region_buckets: None, + region_buckets_info: BucketStatsInfo::default(), lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, snapshot_recovery_state: None, @@ -1317,6 +1316,16 @@ where self.get_store().region() } + #[inline] + pub fn region_buckets_info_mut(&mut self) -> &mut BucketStatsInfo { + &mut self.region_buckets_info + } + + #[inline] + pub fn region_buckets_info(&self) -> &BucketStatsInfo { + &self.region_buckets_info + } + /// Check whether the peer can be hibernated. /// /// This should be used with `check_after_tick` to get a correct conclusion. @@ -2845,7 +2854,10 @@ where commit_term, committed_entries, cbs, - self.region_buckets.as_ref().map(|b| b.meta.clone()), + self.region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), ); apply.on_schedule(&ctx.raft_metrics); self.mut_store() @@ -3385,10 +3397,7 @@ where } pub fn reset_region_buckets(&mut self) { - if self.region_buckets.is_some() { - self.last_region_buckets = self.region_buckets.take(); - self.region_buckets = None; - } + self.region_buckets_info_mut().set_bucket_stat(None); } /// Try to renew leader lease. 
@@ -4705,7 +4714,11 @@ where let mut resp = reader.execute(&req, &Arc::new(region), read_index, None); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); - snap.bucket_meta = self.region_buckets.as_ref().map(|b| b.meta.clone()); + snap.bucket_meta = self + .region_buckets_info() + .bucket_stat() + .as_ref() + .map(|s| s.meta.clone()); } resp.txn_extra_op = self.txn_extra_op.load(); cmd_resp::bind_term(&mut resp.response, self.term()); diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index e79f37a4bc4..c6783238520 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -42,7 +42,8 @@ pub use self::{ }, region::{Runner as RegionRunner, Task as RegionTask}, split_check::{ - Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, + Bucket, BucketRange, BucketStatsInfo, KeyEntry, Runner as SplitCheckRunner, + Task as SplitCheckTask, }, split_config::{ SplitConfig, SplitConfigManager, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 2d54c00baa6..2694481494f 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -440,7 +440,11 @@ impl ReadDelegate { read_progress: peer.read_progress.clone(), pending_remove: false, wait_data: false, - bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), + bucket_meta: peer + .region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), track_ver: TrackVer::new(), } } diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 4ff853f70a0..94708e84f7a 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -5,6 +5,7 @@ use std::{ 
collections::BinaryHeap, fmt::{self, Display, Formatter}, mem, + sync::Arc, }; use engine_traits::{ @@ -12,21 +13,23 @@ use engine_traits::{ }; use file_system::{IoType, WithIoType}; use itertools::Itertools; -use kvproto::{metapb::Region, pdpb::CheckPolicy}; +use kvproto::{ + metapb::{Region, RegionEpoch}, + pdpb::CheckPolicy, +}; use online_config::{ConfigChange, OnlineConfig}; +use pd_client::{BucketMeta, BucketStat}; use tikv_util::{ box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable, Either, }; use txn_types::Key; use super::metrics::*; -#[cfg(any(test, feature = "testexport"))] -use crate::coprocessor::Config; use crate::{ coprocessor::{ dispatcher::StoreHandle, split_observer::{is_valid_split_key, strip_timestamp_if_exists}, - CoprocessorHost, SplitCheckerHost, + Config, CoprocessorHost, SplitCheckerHost, }, Result, }; @@ -144,6 +147,216 @@ pub struct Bucket { pub size: u64, } +#[derive(Debug, Clone, Default)] +pub struct BucketStatsInfo { + // the stats is increment flow. + bucket_stat: Option, + // the report bucket stat records the increment stats after last report pd. + // it will be reset after report pd. + report_bucket_stat: Option, + // avoid the version roll back, it record the last bucket version if bucket stat isn't none. + last_bucket_version: u64, +} + +impl BucketStatsInfo { + /// returns all bucket ranges those's write_bytes exceed the given + /// diff_size_threshold. 
+ pub fn gen_bucket_range_for_update( + &self, + region_bucket_max_size: u64, + ) -> Option> { + let region_buckets = self.bucket_stat.as_ref()?; + let stats = ®ion_buckets.stats; + let keys = ®ion_buckets.meta.keys; + let sizes = ®ion_buckets.meta.sizes; + + let mut suspect_bucket_ranges = vec![]; + assert_eq!(keys.len(), stats.write_bytes.len() + 1); + for i in 0..stats.write_bytes.len() { + let estimated_bucket_size = stats.write_bytes[i] + sizes[i]; + if estimated_bucket_size >= region_bucket_max_size { + suspect_bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); + } + } + Some(suspect_bucket_ranges) + } + + #[inline] + pub fn version(&self) -> u64 { + self.bucket_stat + .as_ref() + .map_or(self.last_bucket_version, |b| b.meta.version) + } + + #[inline] + pub fn add_bucket_flow(&mut self, delta: &Option) { + if let (Some(buckets), Some(report_buckets), Some(delta)) = ( + self.bucket_stat.as_mut(), + self.report_bucket_stat.as_mut(), + delta, + ) { + buckets.merge(delta); + report_buckets.merge(delta); + } + } + + #[inline] + pub fn set_bucket_stat(&mut self, buckets: Option) { + self.bucket_stat = buckets.clone(); + if let Some(new_buckets) = buckets { + self.last_bucket_version = new_buckets.meta.version; + let mut new_report_buckets = BucketStat::from_meta(new_buckets.meta); + if let Some(old) = &mut self.report_bucket_stat { + new_report_buckets.merge(old); + *old = new_report_buckets; + } else { + self.report_bucket_stat = Some(new_report_buckets); + } + } else { + self.report_bucket_stat = None; + } + } + + #[inline] + pub fn report_bucket_stat(&mut self) -> BucketStat { + let current = self.report_bucket_stat.as_mut().unwrap(); + let delta = current.clone(); + current.clear_stats(); + delta + } + + #[inline] + pub fn bucket_stat(&self) -> &Option { + &self.bucket_stat + } + + #[inline] + pub fn bucket_stat_mut(&mut self) -> Option<&mut BucketStat> { + self.bucket_stat.as_mut() + } + + pub fn on_refresh_region_buckets( + &mut self, + 
cfg: &Config, + next_bucket_version: u64, + buckets: Vec, + region_epoch: RegionEpoch, + region: &Region, + bucket_ranges: Option>, + ) -> bool { + let change_bucket_version: bool; + // The region buckets reset after this region happened split or merge. + // The message should be dropped if it's epoch is lower than the regions. + // The bucket ranges is none when the region buckets is also none. + // So this condition indicates that the region buckets needs to refresh not + // renew. + if let Some(bucket_ranges) = bucket_ranges&&self.bucket_stat.is_some(){ + assert_eq!(buckets.len(), bucket_ranges.len()); + change_bucket_version=self.update_buckets(cfg, next_bucket_version, buckets, region_epoch, &bucket_ranges); + }else{ + change_bucket_version = true; + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. + self.init_buckets(cfg, next_bucket_version, buckets, region_epoch, region); + } + change_bucket_version + } + + fn update_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec, + region_epoch: RegionEpoch, + bucket_ranges: &Vec, + ) -> bool { + let origin_region_buckets = self.bucket_stat.as_ref().unwrap(); + let mut change_bucket_version = false; + let mut meta_idx = 0; + let mut region_buckets = origin_region_buckets.clone(); + let mut meta = (*region_buckets.meta).clone(); + meta.region_epoch = region_epoch; + + // bucket stats will clean if the bucket size is updated. + for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { + // the bucket ranges maybe need to split or merge not all the meta keys, so it + // needs to find the first keys. 
+ while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { + meta_idx += 1; + } + // meta_idx can't be not the last entry (which is end key) + if meta_idx >= meta.keys.len() - 1 { + break; + } + // the bucket size is small and does not have split keys, + // then it should be merged with its left neighbor + let region_bucket_merge_size = + cfg.region_bucket_merge_size_ratio * (cfg.region_bucket_size.0 as f64); + if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { + meta.sizes[meta_idx] = bucket.size; + region_buckets.clean_stats(meta_idx); + // the region has more than one bucket + // and the left neighbor + current bucket size is not very big + if meta.keys.len() > 2 + && meta_idx != 0 + && meta.sizes[meta_idx - 1] + bucket.size < cfg.region_bucket_size.0 * 2 + { + // bucket is too small + region_buckets.left_merge(meta_idx); + meta.left_merge(meta_idx); + change_bucket_version = true; + continue; + } + } else { + // update size + meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; + region_buckets.clean_stats(meta_idx); + // insert new bucket keys (split the original bucket) + for bucket_key in bucket.keys { + meta_idx += 1; + region_buckets.split(meta_idx); + meta.split(meta_idx, bucket_key); + change_bucket_version = true; + } + } + meta_idx += 1; + } + if change_bucket_version { + meta.version = next_bucket_version; + } + region_buckets.meta = Arc::new(meta); + self.set_bucket_stat(Some(region_buckets)); + change_bucket_version + } + + fn init_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + mut buckets: Vec, + region_epoch: RegionEpoch, + region: &Region, + ) { + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. 
+ assert_eq!(buckets.len(), 1); + let bucket_keys = buckets.pop().unwrap().keys; + let bucket_count = bucket_keys.len() + 1; + let mut meta = BucketMeta { + region_id: region.get_id(), + region_epoch, + version: next_bucket_version, + keys: bucket_keys, + sizes: vec![cfg.region_bucket_size.0; bucket_count], + }; + // padding the boundary keys and initialize the flow. + meta.keys.insert(0, region.get_start_key().to_vec()); + meta.keys.push(region.get_end_key().to_vec()); + let bucket_stats = BucketStat::from_meta(Arc::new(meta)); + self.set_bucket_stat(Some(bucket_stats)); + } +} + pub enum Task { SplitCheckTask { region: Region, @@ -702,3 +915,178 @@ where } } } + +#[cfg(test)] +mod tests { + use super::*; + + // create BucketStatsInfo include three keys: ["","100","200",""]. + fn mock_bucket_stats_info() -> BucketStatsInfo { + let mut bucket_stats_info = BucketStatsInfo::default(); + let cfg = Config::default(); + let next_bucket_version = 1; + let bucket_ranges = None; + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(1); + region_epoch.set_version(1); + let mut region = Region::default(); + region.set_id(1); + + let mut buckets = vec![]; + let mut bucket = Bucket::default(); + bucket.keys.push(vec![100]); + bucket.keys.push(vec![200]); + buckets.insert(0, bucket); + + let _ = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + bucket_stats_info + } + + #[test] + pub fn test_version() { + let mut bucket_stats_info = mock_bucket_stats_info(); + assert_eq!(1, bucket_stats_info.version()); + bucket_stats_info.set_bucket_stat(None); + assert_eq!(1, bucket_stats_info.version()); + + let mut meta = BucketMeta::default(); + meta.version = 2; + meta.keys.push(vec![]); + meta.keys.push(vec![]); + let bucket_stat = BucketStat::from_meta(Arc::new(meta)); + bucket_stats_info.set_bucket_stat(Some(bucket_stat)); + assert_eq!(2, bucket_stats_info.version()); + } + + 
#[test] + pub fn test_insert_new_buckets() { + let bucket_stats_info = mock_bucket_stats_info(); + + let cfg = Config::default(); + let bucket_stat = bucket_stats_info.bucket_stat.unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stat.meta.keys + ); + for i in 0..bucket_stat.stats.write_bytes.len() { + assert_eq!(cfg.region_bucket_size.0, bucket_stat.meta.sizes[i]); + assert_eq!(0, bucket_stat.stats.write_bytes[i]); + } + } + + #[test] + pub fn test_report_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats.clone())); + let bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![0, 0, 0], report_bucket_stats.stats.write_bytes); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + } + + #[test] + pub fn test_spilt_and_merge_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let next_bucket_version = 2; + let mut region = Region::default(); + region.set_id(1); + let cfg = Config::default(); + let bucket_size = cfg.region_bucket_size.0; + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let region_epoch = bucket_stats.meta.region_epoch.clone(); + + // step1: update buckets flow + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!(vec![2, 0, 2], 
bucket_stats.stats.write_bytes); + + // step2: tick not affect anything + let bucket_ranges = Some(vec![]); + let buckets = vec![]; + let mut change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch.clone(), + ®ion, + bucket_ranges, + ); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert!(!change_bucket_version); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step3: split key 50 + let mut bucket_ranges = Some(vec![BucketRange(vec![], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![vec![50]]; + bucket.size = bucket_size; + let mut buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets.clone(), + region_epoch.clone(), + ®ion, + bucket_ranges.clone(), + ); + assert!(change_bucket_version); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![50], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 0, 2], bucket_stats.stats.write_bytes); + + // step4: merge [50-100] to [0-50], + bucket_ranges = Some(vec![BucketRange(vec![50], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![]; + bucket.size = 0; + buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + assert!(change_bucket_version); + + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 2], bucket_stats.stats.write_bytes); + + 
// report buckets doesn't be affected by the split and merge. + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![4, 0, 2], report_bucket_stats.stats.write_bytes); + } +} diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 2a4082893e7..a08f858c031 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1938,7 +1938,7 @@ impl Cluster { start_key: None, end_key: None, policy: CheckPolicy::Scan, - source: "test", + source: "bucket", cb, }, ) diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index c0f75487998..0324b57e724 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -976,14 +976,13 @@ fn test_refresh_region_bucket_keys() { cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); + // case: init bucket info cluster.must_put(b"k11", b"v1"); let mut region = pd_client.get_region(b"k11").unwrap(); - let bucket = Bucket { keys: vec![b"k11".to_vec()], size: 1024 * 1024 * 200, }; - let mut expected_buckets = metapb::Buckets::default(); expected_buckets.set_keys(bucket.clone().keys.into()); expected_buckets @@ -997,6 +996,8 @@ fn test_refresh_region_bucket_keys() { Option::None, Some(expected_buckets.clone()), ); + + // case: bucket range should refresh if epoch changed let conf_ver = region.get_region_epoch().get_conf_ver() + 1; region.mut_region_epoch().set_conf_ver(conf_ver); @@ -1018,6 +1019,7 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version2, bucket_version + 1); + // case: stale epoch will not refresh buckets info let conf_ver = 0; region.mut_region_epoch().set_conf_ver(conf_ver); let bucket_version3 = cluster.refresh_region_bucket_keys( @@ -1028,6 +1030,7 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version3, bucket_version2); + // case: bucket split // 
now the buckets is ["", "k12", ""]. further split ["", k12], [k12, ""] // buckets into more buckets let region = pd_client.get_region(b"k11").unwrap(); @@ -1066,6 +1069,7 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version4, bucket_version3 + 1); + // case: merge buckets // remove k11~k12, k12~k121, k122~[] bucket let buckets = vec![ Bucket { @@ -1107,7 +1111,7 @@ fn test_refresh_region_bucket_keys() { assert_eq!(bucket_version5, bucket_version4 + 1); - // split the region + // case: split the region pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![b"k11".to_vec()]); let mut buckets = vec![Bucket { keys: vec![b"k10".to_vec()], @@ -1132,7 +1136,7 @@ fn test_refresh_region_bucket_keys() { cluster.refresh_region_bucket_keys(®ion, buckets, None, Some(expected_buckets.clone())); assert_eq!(bucket_version6, bucket_version5 + 1); - // merge the region + // case: merge the region pd_client.must_merge(left_id, right.get_id()); let region = pd_client.get_region(b"k10").unwrap(); let buckets = vec![Bucket { @@ -1145,6 +1149,7 @@ fn test_refresh_region_bucket_keys() { cluster.refresh_region_bucket_keys(®ion, buckets, None, Some(expected_buckets.clone())); assert_eq!(bucket_version7, bucket_version6 + 1); + // case: nothing changed let bucket_version8 = cluster.refresh_region_bucket_keys( ®ion, vec![], @@ -1157,26 +1162,24 @@ fn test_refresh_region_bucket_keys() { #[test] fn test_gen_split_check_bucket_ranges() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); + let mut cluster = new_server_cluster(0, 1); + let region_bucket_size = ReadableSize::kb(1); + cluster.cfg.coprocessor.region_bucket_size = region_bucket_size; cluster.cfg.coprocessor.enable_region_bucket = Some(true); // disable report buckets; as it will reset the user traffic stats to randomize // the test result - cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::secs(5); - // 
Make merge check resume quickly. - cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); - cluster.must_put(b"k11", b"v1"); - let region = pd_client.get_region(b"k11").unwrap(); + let mut range = 1..; + let mid_key = put_till_size(&mut cluster, region_bucket_size.0, &mut range); + let second_key = put_till_size(&mut cluster, region_bucket_size.0, &mut range); + let region = pd_client.get_region(&second_key).unwrap(); let bucket = Bucket { - keys: vec![b"k11".to_vec()], - size: 1024 * 1024 * 200, + keys: vec![mid_key.clone()], + size: region_bucket_size.0 * 2, }; - let mut expected_buckets = metapb::Buckets::default(); expected_buckets.set_keys(bucket.clone().keys.into()); expected_buckets @@ -1184,7 +1187,6 @@ fn test_gen_split_check_bucket_ranges() { .insert(0, region.get_start_key().to_vec()); expected_buckets.keys.push(region.get_end_key().to_vec()); let buckets = vec![bucket]; - // initialize fsm.peer.bucket_regions cluster.refresh_region_bucket_keys( ®ion, @@ -1192,32 +1194,28 @@ fn test_gen_split_check_bucket_ranges() { Option::None, Some(expected_buckets.clone()), ); - cluster.must_put(b"k10", b"v1"); - cluster.must_put(b"k12", b"v1"); - let expected_bucket_ranges = vec![ - BucketRange(vec![], b"k11".to_vec()), - BucketRange(b"k11".to_vec(), vec![]), - ]; + // put some data into the right buckets, so the bucket range will be check by + // split check. + let latest_key = put_till_size(&mut cluster, region_bucket_size.0 + 100, &mut range); + let expected_bucket_ranges = vec![BucketRange(mid_key.clone(), vec![])]; cluster.send_half_split_region_message(®ion, Some(expected_bucket_ranges)); - // set fsm.peer.last_bucket_regions + // reset bucket stats. 
cluster.refresh_region_bucket_keys( ®ion, buckets, Option::None, Some(expected_buckets.clone()), ); - // because the diff between last_bucket_regions and bucket_regions is zero, - // bucket range for split check should be empty. - let expected_bucket_ranges = vec![]; - cluster.send_half_split_region_message(®ion, Some(expected_bucket_ranges)); - // split the region - pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![b"k11".to_vec()]); + thread::sleep(Duration::from_millis(100)); + cluster.send_half_split_region_message(®ion, Some(vec![])); - let left = pd_client.get_region(b"k10").unwrap(); - let right = pd_client.get_region(b"k12").unwrap(); + // split the region + pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![second_key]); + let left = pd_client.get_region(&mid_key).unwrap(); + let right = pd_client.get_region(&latest_key).unwrap(); if right.get_id() == 1 { // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(&right, None); @@ -1225,11 +1223,10 @@ fn test_gen_split_check_bucket_ranges() { // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(&left, None); } - + thread::sleep(Duration::from_millis(300)); // merge the region pd_client.must_merge(left.get_id(), right.get_id()); - let region = pd_client.get_region(b"k10").unwrap(); - // the bucket_ranges should be None to refresh the bucket + let region = pd_client.get_region(&mid_key).unwrap(); cluster.send_half_split_region_message(®ion, None); } From 32d043fae26d1aa755ba104f92efb0586506e5e5 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 6 Nov 2023 11:18:39 +0800 Subject: [PATCH 102/220] raftstore: make release work (#15850) (#15923) close tikv/tikv#15851 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: tongjian <1045931706@qq.com> --- components/raftstore/src/store/fsm/peer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/components/raftstore/src/store/fsm/peer.rs 
b/components/raftstore/src/store/fsm/peer.rs index 42241e46475..f4be67260f3 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -6067,6 +6067,7 @@ where ); } else { // it means the buckets key range not any change, so don't need to refresh. + #[cfg(any(test, feature = "testexport"))] test_only_callback(_cb, region_buckets.meta); return; } From c5e0ce3dc4ade2fcc75fd4d9832c507d8dc0bdf6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 7 Nov 2023 13:05:11 +0800 Subject: [PATCH 103/220] sst_importer: join can fallback to version 1 filename (#15913) (#15921) close tikv/tikv#15912 Signed-off-by: lance6716 Co-authored-by: lance6716 Co-authored-by: tonyxuqqi --- components/sst_importer/src/import_file.rs | 121 ++++++++++++-------- components/sst_importer/src/sst_importer.rs | 46 +++----- 2 files changed, 89 insertions(+), 78 deletions(-) diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index b3b7c051ce4..850df867da8 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -13,8 +13,7 @@ use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use encryption::{DataKeyManager, EncrypterWriter}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ - iter_option, EncryptionKeyManager, IterOptions, Iterator, KvEngine, RefIterable, SstExt, - SstMetaInfo, SstReader, + iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, }; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; use keys::data_key; @@ -261,17 +260,36 @@ impl ImportDir { }) } - pub fn join(&self, meta: &SstMeta) -> Result { + pub fn join_for_write(&self, meta: &SstMeta) -> Result { let file_name = sst_meta_to_path(meta)?; self.get_import_path(file_name.to_str().unwrap()) } + /// Different with join_for_write, join_for_read will also handle the api + /// version 1 filenames which can be generated by 
old version TiKV. + pub fn join_for_read(&self, meta: &SstMeta) -> Result { + let file_name = sst_meta_to_path(meta)?; + let files_result = self.get_import_path(file_name.to_str().unwrap()); + // if files does not exists, it means the SstMeta is generated by old version + // TiKV, we try sst_meta_to_path_v1 + match files_result { + Ok(path) => { + if path.save.exists() { + return Ok(path); + } + let file_name = sst_meta_to_path_v1(meta)?; + self.get_import_path(file_name.to_str().unwrap()) + } + Err(e) => Err(e), + } + } + pub fn create( &self, meta: &SstMeta, key_manager: Option>, ) -> Result { - let path = self.join(meta)?; + let path = self.join_for_write(meta)?; if path.save.exists() { return Err(Error::FileExists(path.save, "create SST upload cache")); } @@ -290,7 +308,7 @@ impl ImportDir { } pub fn delete(&self, meta: &SstMeta, manager: Option<&DataKeyManager>) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; self.delete_file(&path.save, manager)?; self.delete_file(&path.temp, manager)?; self.delete_file(&path.clone, manager)?; @@ -298,7 +316,7 @@ impl ImportDir { } pub fn exist(&self, meta: &SstMeta) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; Ok(path.save.exists()) } @@ -307,7 +325,7 @@ impl ImportDir { meta: &SstMeta, key_manager: Option>, ) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); let env = get_env(key_manager, get_io_rate_limiter())?; let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; @@ -334,7 +352,7 @@ impl ImportDir { // otherwise we are upgrade/downgrade between V1 and V2 // this can be done if all keys are written by TiDB _ => { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); let env = get_env(key_manager.clone(), get_io_rate_limiter())?; let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; 
@@ -382,7 +400,7 @@ impl ImportDir { let mut paths = HashMap::new(); let mut ingest_bytes = 0; for info in metas { - let path = self.join(&info.meta)?; + let path = self.join_for_read(&info.meta)?; let cf = info.meta.get_cf_name(); super::prepare_sst_for_ingestion(&path.save, &path.clone, key_manager.as_deref())?; ingest_bytes += info.total_bytes; @@ -407,7 +425,7 @@ impl ImportDir { key_manager: Option>, ) -> Result<()> { for meta in metas { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); let env = get_env(key_manager.clone(), get_io_rate_limiter())?; let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; @@ -416,31 +434,6 @@ impl ImportDir { Ok(()) } - pub fn load_start_key_by_meta( - &self, - meta: &SstMeta, - km: Option>, - ) -> Result>> { - let path = self.join(meta)?; - let r = match km { - Some(km) => E::SstReader::open_encrypted(&path.save.to_string_lossy(), km)?, - None => E::SstReader::open(&path.save.to_string_lossy())?, - }; - let opts = IterOptions::new(None, None, false); - let mut i = r.iter(opts)?; - if !i.seek_to_first()? || !i.valid()? { - return Ok(None); - } - // Should we warn if the key doesn't start with the prefix key? (Is that - // possible?) - // Also note this brings implicit coupling between this and - // RocksEngine. Perhaps it is better to make the engine to provide - // decode functions. Anyway we have directly used the RocksSstReader - // somewhere... This won't make things worse. - let real_key = i.key().strip_prefix(keys::DATA_PREFIX_KEY); - Ok(real_key.map(ToOwned::to_owned)) - } - pub fn list_ssts(&self) -> Result> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? 
{ @@ -483,6 +476,18 @@ pub fn sst_meta_to_path(meta: &SstMeta) -> Result { ))) } +pub fn sst_meta_to_path_v1(meta: &SstMeta) -> Result { + Ok(PathBuf::from(format!( + "{}_{}_{}_{}_{}{}", + UuidBuilder::from_slice(meta.get_uuid())?.build(), + meta.get_region_id(), + meta.get_region_epoch().get_conf_ver(), + meta.get_region_epoch().get_version(), + meta.get_cf_name(), + SST_SUFFIX, + ))) +} + pub fn parse_meta_from_path>(path: P) -> Result<(SstMeta, i32)> { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { @@ -521,6 +526,8 @@ pub fn parse_meta_from_path>(path: P) -> Result<(SstMeta, i32)> { #[cfg(test)] mod test { + use std::fs; + use engine_traits::CF_DEFAULT; use super::*; @@ -565,6 +572,35 @@ mod test { assert_eq!(1, meta_with_ver.1); } + #[test] + fn test_join_for_rw() { + use tempfile::TempDir; + use uuid::Uuid; + + let tmp = TempDir::new().unwrap(); + let dir = ImportDir::new(tmp.path()).unwrap(); + let mut meta = SstMeta::default(); + meta.set_uuid(Uuid::new_v4().as_bytes().to_vec()); + let filename_v1 = sst_meta_to_path_v1(&meta).unwrap(); + let path_v1 = tmp.path().join(filename_v1); + + let got = dir + .join_for_read(&meta) + .expect("fallback to version 1 because version 2 file does not exist"); + assert_eq!(got.save, path_v1); + + let filename_v2 = sst_meta_to_path(&meta).unwrap(); + let path_v2 = tmp.path().join(filename_v2); + fs::File::create(&path_v2).expect("create empty file"); + let got = dir.join_for_read(&meta).expect("read should succeed"); + assert_eq!(got.save, path_v2); + fs::remove_file(path_v2).expect("delete file"); + + fs::File::create(&path_v1).expect("create empty file"); + let got = dir.join_for_read(&meta).expect("read should succeed"); + assert_eq!(got.save, path_v1); + } + #[cfg(feature = "test-engines-rocksdb")] fn test_path_with_range_and_km(km: Option) { use engine_rocks::{RocksEngine, RocksSstWriterBuilder}; @@ -613,21 +649,6 @@ mod test { .unwrap(); w.finish().unwrap(); 
dp.save(arcmgr.as_deref()).unwrap(); - let mut ssts = dir.list_ssts().unwrap(); - ssts.iter_mut().for_each(|meta_with_ver| { - let meta = &mut meta_with_ver.0; - let start = dir - .load_start_key_by_meta::(meta, arcmgr.clone()) - .unwrap() - .unwrap(); - meta.mut_range().set_start(start) - }); - assert_eq!( - ssts.iter() - .map(|meta_with_ver| { meta_with_ver.0.clone() }) - .collect(), - vec![meta] - ); } #[test] diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index d97dddcb642..54a41cea15b 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -289,7 +289,7 @@ impl SstImporter { } pub fn get_path(&self, meta: &SstMeta) -> PathBuf { - let path = self.dir.join(meta).unwrap(); + let path = self.dir.join_for_read(meta).unwrap(); path.save } @@ -1116,7 +1116,7 @@ impl SstImporter { engine: E, ext: DownloadExt<'_>, ) -> Result> { - let path = self.dir.join(meta)?; + let path = self.dir.join_for_write(meta)?; let file_crypter = crypter.map(|c| FileEncryptionInfo { method: to_engine_encryption_method(c.cipher_type), @@ -1391,20 +1391,10 @@ impl SstImporter { self.dir.list_ssts() } - /// Load the start key by a metadata. - /// This will open the internal SST and try to load the first user key. - /// (For RocksEngine, that is the key without the 'z' prefix.) - /// When the SST is empty or the first key cannot be parsed as user key, - /// return None. 
- pub fn load_start_key_by_meta(&self, meta: &SstMeta) -> Result>> { - self.dir - .load_start_key_by_meta::(meta, self.key_manager.clone()) - } - pub fn new_txn_writer(&self, db: &E, meta: SstMeta) -> Result> { let mut default_meta = meta.clone(); default_meta.set_cf_name(CF_DEFAULT.to_owned()); - let default_path = self.dir.join(&default_meta)?; + let default_path = self.dir.join_for_write(&default_meta)?; let default = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_DEFAULT) @@ -1414,7 +1404,7 @@ impl SstImporter { let mut write_meta = meta; write_meta.set_cf_name(CF_WRITE.to_owned()); - let write_path = self.dir.join(&write_meta)?; + let write_path = self.dir.join_for_write(&write_meta)?; let write = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_WRITE) @@ -1440,7 +1430,7 @@ impl SstImporter { mut meta: SstMeta, ) -> Result> { meta.set_cf_name(CF_DEFAULT.to_owned()); - let default_path = self.dir.join(&meta)?; + let default_path = self.dir.join_for_write(&meta)?; let default = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_DEFAULT) @@ -1521,7 +1511,7 @@ mod tests { let mut meta = SstMeta::default(); meta.set_uuid(Uuid::new_v4().as_bytes().to_vec()); - let path = dir.join(&meta).unwrap(); + let path = dir.join_for_write(&meta).unwrap(); // Test ImportDir::create() { @@ -2335,7 +2325,7 @@ mod tests { assert_eq!(range.get_end(), b"t123_r13"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2395,7 +2385,7 @@ mod tests { assert_eq!(range.get_end(), b"t123_r13"); // verifies that the file is saved to the correct place. 
- let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2445,7 +2435,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2490,7 +2480,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2534,7 +2524,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2676,7 +2666,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size is changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2720,7 +2710,7 @@ mod tests { assert_eq!(range.get_end(), b"t5_r07"); // verifies that the file is saved to the correct place. 
- let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2853,7 +2843,7 @@ mod tests { assert_eq!(range.get_end(), b"d"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2912,7 +2902,7 @@ mod tests { assert_eq!(range.get_end(), b"c\x00"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); @@ -2967,7 +2957,7 @@ mod tests { assert_eq!(range.get_end(), b"c"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); @@ -3013,7 +3003,7 @@ mod tests { .unwrap(); // verifies the SST is compressed using Snappy. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); @@ -3060,7 +3050,7 @@ mod tests { // verifies SST compression algorithm... 
for meta in metas { - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); From 5a307f134917978ec34349c4d25af0e8f1ef7102 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 7 Nov 2023 17:30:42 +0800 Subject: [PATCH 104/220] titan: update titan to avoid manifest io mutex (#15914) (#15915) close tikv/tikv#15351 titan: update titan to avoid manifest io mutex Signed-off-by: Connor1996 Co-authored-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 003ccaf39e3..901b768d24d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2998,7 +2998,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#b747689e1b94cb1507872e898b83553447e8f8de" +source = "git+https://github.com/tikv/rust-rocksdb.git#aa41eb102d373f56846be88ffd250c2b581b48d4" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3017,7 +3017,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#b747689e1b94cb1507872e898b83553447e8f8de" +source = "git+https://github.com/tikv/rust-rocksdb.git#aa41eb102d373f56846be88ffd250c2b581b48d4" dependencies = [ "bzip2-sys", "cc", @@ -4936,7 +4936,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#b747689e1b94cb1507872e898b83553447e8f8de" +source = "git+https://github.com/tikv/rust-rocksdb.git#aa41eb102d373f56846be88ffd250c2b581b48d4" dependencies = [ "libc 0.2.146", "librocksdb_sys", From 33122ce4b5a6741e473c43d1479c12d6b115cba5 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 7 Nov 2023 18:05:12 +0800 Subject: 
[PATCH 105/220] server: make cpu and heap profiling can be running concurrently (#15761) (#15933) close tikv/tikv#15760 Make cpu and heap profiling can be running concurrently. Beside, change the behavior of - "debug/pprof/heap_activate": do not dump heap profile periodically by default - "debug/pprof/heap": dump heap profile without any delay and use embedded jeprof script Signed-off-by: Connor1996 Co-authored-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/server/status_server/jeprof.in | 5727 +++++++++++++++++++++++++++ src/server/status_server/mod.rs | 45 +- src/server/status_server/profile.rs | 185 +- 3 files changed, 5855 insertions(+), 102 deletions(-) create mode 100644 src/server/status_server/jeprof.in diff --git a/src/server/status_server/jeprof.in b/src/server/status_server/jeprof.in new file mode 100644 index 00000000000..cadf15d7d8e --- /dev/null +++ b/src/server/status_server/jeprof.in @@ -0,0 +1,5727 @@ +#! /usr/bin/env perl + +# Copyright (c) 1998-2007, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# --- +# Program for printing the profile generated by common/profiler.cc, +# or by the heap profiler (common/debugallocation.cc) +# +# The profile contains a sequence of entries of the form: +# +# This program parses the profile, and generates user-readable +# output. +# +# Examples: +# +# % tools/jeprof "program" "profile" +# Enters "interactive" mode +# +# % tools/jeprof --text "program" "profile" +# Generates one line per procedure +# +# % tools/jeprof --gv "program" "profile" +# Generates annotated call-graph and displays via "gv" +# +# % tools/jeprof --gv --focus=Mutex "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# +# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# and does not match "string" +# +# % tools/jeprof --list=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --list= pattern. The listing is +# annotated with the flat and cumulative sample counts at each line. 
+# +# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --disasm= pattern. The listing is +# annotated with the flat and cumulative sample counts at each PC value. +# +# TODO: Use color to indicate files? + +use strict; +use warnings; +use Getopt::Long; +use Cwd; + +my $JEPROF_VERSION = "unknown"; +my $PPROF_VERSION = "2.0"; + +# These are the object tools we use which can come from a +# user-specified location using --tools, from the JEPROF_TOOLS +# environment variable, or from the environment. +my %obj_tool_map = ( + "objdump" => "objdump", + "nm" => "nm", + "addr2line" => "addr2line", + "c++filt" => "c++filt", + ## ConfigureObjTools may add architecture-specific entries: + #"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables + #"addr2line_pdb" => "addr2line-pdb", # ditto + #"otool" => "otool", # equivalent of objdump on OS X +); +# NOTE: these are lists, so you can put in commandline flags if you want. 
+my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local +my @GV = ("gv"); +my @EVINCE = ("evince"); # could also be xpdf or perhaps acroread +my @KCACHEGRIND = ("kcachegrind"); +my @PS2PDF = ("ps2pdf"); +# These are used for dynamic profiles +my @URL_FETCHER = ("curl", "-s", "--fail"); + +# These are the web pages that servers need to support for dynamic profiles +my $HEAP_PAGE = "/pprof/heap"; +my $PROFILE_PAGE = "/pprof/profile"; # must support cgi-param "?seconds=#" +my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param + # ?seconds=#&event=x&period=n +my $GROWTH_PAGE = "/pprof/growth"; +my $CONTENTION_PAGE = "/pprof/contention"; +my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter +my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param + # "?seconds=#", + # "?tags_regexp=#" and + # "?type=#". +my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST +my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; + +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . + "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . + "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; + +# default binary name +my $UNKNOWN_BINARY = "(unknown)"; + +# There is a pervasive dependency on the length (in hex characters, +# i.e., nibbles) of an address, distinguishing between 32-bit and +# 64-bit profiles. To err on the safe size, default to 64-bit here: +my $address_length = 16; + +my $dev_null = "/dev/null"; +if (! -e $dev_null && $^O =~ /MSWin/) { # $^O is the OS perl was built for + $dev_null = "nul"; +} + +# A list of paths to search for shared object files +my @prefix_list = (); + +# Special routine name that should not have any symbols. +# Used as separator to parse "addr2line -i" output. 
+my $sep_symbol = '_fini'; +my $sep_address = undef; + +##### Argument parsing ##### + +sub usage_string { + return < + is a space separated list of profile names. +jeprof [options] + is a list of profile files where each file contains + the necessary symbol mappings as well as profile data (likely generated + with --raw). +jeprof [options] + is a remote form. Symbols are obtained from host:port$SYMBOL_PAGE + + Each name can be: + /path/to/profile - a path to a profile file + host:port[/] - a location of a service to get profile from + + The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, + $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. + For instance: + jeprof http://myserver.com:80$HEAP_PAGE + If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). +jeprof --symbols + Maps addresses to symbol names. In this mode, stdin should be a + list of library mappings, in the same format as is found in the heap- + and cpu-profile files (this loosely matches that of /proc/self/maps + on linux), followed by a list of hex addresses to map, one per line. + + For more help with querying remote servers, including how to add the + necessary server-side support code, see this filename (or one like it): + + /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html + +Options: + --cum Sort by cumulative data + --base= Subtract from before display + --interactive Run in interactive mode (interactive "help" gives help) [default] + --seconds= Length of time for dynamic profiles [default=30 secs] + --add_lib= Read additional symbols and line info from the given library + --lib_prefix=

Comma separated list of library path prefixes + +Reporting Granularity: + --addresses Report at address level + --lines Report at source line level + --functions Report at function level [default] + --files Report at source file level + +Output type: + --text Generate text report + --callgrind Generate callgrind format to stdout + --gv Generate Postscript and display + --evince Generate PDF and display + --web Generate SVG and display + --list= Generate source listing of matching routines + --disasm= Generate disassembly of matching routines + --symbols Print demangled symbol names found at given addresses + --dot Generate DOT file to stdout + --ps Generate Postcript to stdout + --pdf Generate PDF to stdout + --svg Generate SVG to stdout + --gif Generate GIF to stdout + --raw Generate symbolized jeprof data (useful with remote fetch) + --collapsed Generate collapsed stacks for building flame graphs + (see http://www.brendangregg.com/flamegraphs.html) + +Heap-Profile Options: + --inuse_space Display in-use (mega)bytes [default] + --inuse_objects Display in-use objects + --alloc_space Display allocated (mega)bytes + --alloc_objects Display allocated objects + --show_bytes Display space in bytes + --drop_negative Ignore negative differences + +Contention-profile options: + --total_delay Display total delay at each region [default] + --contentions Display number of delays at each region + --mean_delay Display mean delay at each region + +Call-graph Options: + --nodecount= Show at most so many nodes [default=80] + --nodefraction= Hide nodes below *total [default=.005] + --edgefraction= Hide edges below *total [default=.001] + --maxdegree= Max incoming/outgoing edges per node [default=8] + --focus= Focus on backtraces with nodes matching + --thread= Show profile for thread + --ignore= Ignore backtraces with nodes matching + --scale= Set GV scaling [default=0] + --heapcheck Make nodes with non-0 object counts + (i.e. 
direct leak generators) more visible + --retain= Retain only nodes that match + --exclude= Exclude all nodes that match + +Miscellaneous: + --tools=[,...] \$PATH for object tool pathnames + --test Run unit tests + --help This message + --version Version information + --debug-syms-by-id (Linux only) Find debug symbol files by build ID as well as by name + +Environment Variables: + JEPROF_TMPDIR Profiles directory. Defaults to \$HOME/jeprof + JEPROF_TOOLS Prefix for object tools pathnames + +Examples: + +jeprof /bin/ls ls.prof + Enters "interactive" mode +jeprof --text /bin/ls ls.prof + Outputs one line per procedure +jeprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser +jeprof --gv /bin/ls ls.prof + Displays annotated call-graph via 'gv' +jeprof --gv --focus=Mutex /bin/ls ls.prof + Restricts to code paths including a .*Mutex.* entry +jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof + Code paths including Mutex but not string +jeprof --list=getdir /bin/ls ls.prof + (Per-line) annotated source listing for getdir() +jeprof --disasm=getdir /bin/ls ls.prof + (Per-PC) annotated disassembly for getdir() + +jeprof http://localhost:1234/ + Enters "interactive" mode +jeprof --text localhost:1234 + Outputs one line per procedure for localhost:1234 +jeprof --raw localhost:1234 > ./local.raw +jeprof --text ./local.raw + Fetches a remote profile for later analysis and then + analyzes it in text mode. +EOF +} + +sub version_string { + return < \$main::opt_help, + "version!" => \$main::opt_version, + "cum!" => \$main::opt_cum, + "base=s" => \$main::opt_base, + "seconds=i" => \$main::opt_seconds, + "add_lib=s" => \$main::opt_lib, + "lib_prefix=s" => \$main::opt_lib_prefix, + "functions!" => \$main::opt_functions, + "lines!" => \$main::opt_lines, + "addresses!" => \$main::opt_addresses, + "files!" => \$main::opt_files, + "text!" => \$main::opt_text, + "callgrind!" 
=> \$main::opt_callgrind, + "list=s" => \$main::opt_list, + "disasm=s" => \$main::opt_disasm, + "symbols!" => \$main::opt_symbols, + "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, + "web!" => \$main::opt_web, + "dot!" => \$main::opt_dot, + "ps!" => \$main::opt_ps, + "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, + "gif!" => \$main::opt_gif, + "raw!" => \$main::opt_raw, + "collapsed!" => \$main::opt_collapsed, + "interactive!" => \$main::opt_interactive, + "nodecount=i" => \$main::opt_nodecount, + "nodefraction=f" => \$main::opt_nodefraction, + "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, + "focus=s" => \$main::opt_focus, + "thread=s" => \$main::opt_thread, + "ignore=s" => \$main::opt_ignore, + "scale=i" => \$main::opt_scale, + "heapcheck" => \$main::opt_heapcheck, + "retain=s" => \$main::opt_retain, + "exclude=s" => \$main::opt_exclude, + "inuse_space!" => \$main::opt_inuse_space, + "inuse_objects!" => \$main::opt_inuse_objects, + "alloc_space!" => \$main::opt_alloc_space, + "alloc_objects!" => \$main::opt_alloc_objects, + "show_bytes!" => \$main::opt_show_bytes, + "drop_negative!" => \$main::opt_drop_negative, + "total_delay!" => \$main::opt_total_delay, + "contentions!" => \$main::opt_contentions, + "mean_delay!" => \$main::opt_mean_delay, + "tools=s" => \$main::opt_tools, + "test!" => \$main::opt_test, + "debug!" => \$main::opt_debug, + "debug-syms-by-id!" 
=> \$main::opt_debug_syms_by_id, + # Undocumented flags used only by unittests: + "test_stride=i" => \$main::opt_test_stride, + ) || usage("Invalid option(s)"); + + # Deal with the standard --help and --version + if ($main::opt_help) { + print usage_string(); + exit(0); + } + + if ($main::opt_version) { + print version_string(); + exit(0); + } + + # Disassembly/listing/symbols mode requires address-level info + if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) { + $main::opt_functions = 0; + $main::opt_lines = 0; + $main::opt_addresses = 1; + $main::opt_files = 0; + } + + # Check heap-profiling flags + if ($main::opt_inuse_space + + $main::opt_inuse_objects + + $main::opt_alloc_space + + $main::opt_alloc_objects > 1) { + usage("Specify at most on of --inuse/--alloc options"); + } + + # Check output granularities + my $grains = + $main::opt_functions + + $main::opt_lines + + $main::opt_addresses + + $main::opt_files + + 0; + if ($grains > 1) { + usage("Only specify one output granularity option"); + } + if ($grains == 0) { + $main::opt_functions = 1; + } + + # Check output modes + my $modes = + $main::opt_text + + $main::opt_callgrind + + ($main::opt_list eq '' ? 0 : 1) + + ($main::opt_disasm eq '' ? 0 : 1) + + ($main::opt_symbols == 0 ? 
0 : 1) + + $main::opt_gv + + $main::opt_evince + + $main::opt_web + + $main::opt_dot + + $main::opt_ps + + $main::opt_pdf + + $main::opt_svg + + $main::opt_gif + + $main::opt_raw + + $main::opt_collapsed + + $main::opt_interactive + + 0; + if ($modes > 1) { + usage("Only specify one output mode"); + } + if ($modes == 0) { + if (-t STDOUT) { # If STDOUT is a tty, activate interactive mode + $main::opt_interactive = 1; + } else { + $main::opt_text = 1; + } + } + + if ($main::opt_test) { + RunUnitTests(); + # Should not return + exit(1); + } + + # Binary name and profile arguments list + $main::prog = ""; + @main::pfile_args = (); + + # Remote profiling without a binary (using $SYMBOL_PAGE instead) + if (@ARGV > 0) { + if (IsProfileURL($ARGV[0])) { + $main::use_symbol_page = 1; + } elsif (IsSymbolizedProfileFile($ARGV[0])) { + $main::use_symbolized_profile = 1; + $main::prog = $UNKNOWN_BINARY; # will be set later from the profile file + } + } + + if ($main::use_symbol_page || $main::use_symbolized_profile) { + # We don't need a binary! + my %disabled = ('--lines' => $main::opt_lines, + '--disasm' => $main::opt_disasm); + for my $option (keys %disabled) { + usage("$option cannot be used without a binary") if $disabled{$option}; + } + # Set $main::prog later... 
+ scalar(@ARGV) || usage("Did not specify profile file"); + } elsif ($main::opt_symbols) { + # --symbols needs a binary-name (to run nm on, etc) but not profiles + $main::prog = shift(@ARGV) || usage("Did not specify program"); + } else { + $main::prog = shift(@ARGV) || usage("Did not specify program"); + scalar(@ARGV) || usage("Did not specify profile file"); + } + + # Parse profile file/location arguments + foreach my $farg (@ARGV) { + if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) { + my $machine = $1; + my $num_machines = $2; + my $path = $3; + for (my $i = 0; $i < $num_machines; $i++) { + unshift(@main::pfile_args, "$i.$machine$path"); + } + } else { + unshift(@main::pfile_args, $farg); + } + } + + if ($main::use_symbol_page) { + unless (IsProfileURL($main::pfile_args[0])) { + error("The first profile should be a remote form to use $SYMBOL_PAGE\n"); + } + CheckSymbolPage(); + $main::prog = FetchProgramName(); + } elsif (!$main::use_symbolized_profile) { # may not need objtools! + ConfigureObjTools($main::prog) + } + + # Break the opt_lib_prefix into the prefix_list array + @prefix_list = split (',', $main::opt_lib_prefix); + + # Remove trailing / from the prefixes, in the list to prevent + # searching things like /my/path//lib/mylib.so + foreach (@prefix_list) { + s|/+$||; + } + + # Flag to prevent us from trying over and over to use + # elfutils if it's not installed (used only with + # --debug-syms-by-id option). + $main::gave_up_on_elfutils = 0; +} + +sub FilterAndPrint { + my ($profile, $symbols, $libs, $thread) = @_; + + # Get total data in profile + my $total = TotalProfile($profile); + + # Remove uniniteresting stack items + $profile = RemoveUninterestingFrames($symbols, $profile); + + # Focus? + if ($main::opt_focus ne '') { + $profile = FocusProfile($symbols, $profile, $main::opt_focus); + } + + # Ignore? 
+ if ($main::opt_ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore); + } + + my $calls = ExtractCalls($symbols, $profile); + + # Reduce profiles to required output granularity, and also clean + # each stack trace so a given entry exists at most once. + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + # Print + if (!$main::opt_interactive) { + if ($main::opt_disasm) { + PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm); + } elsif ($main::opt_list) { + PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0); + } elsif ($main::opt_text) { + # Make sure the output is empty when have nothing to report + # (only matters when --heapcheck is given but we must be + # compatible with old branches that did not pass --heapcheck always): + if ($total != 0) { + printf("Total%s: %s %s\n", + (defined($thread) ? " (t$thread)" : ""), + Unparse($total), Units()); + } + PrintText($symbols, $flat, $cumulative, -1); + } elsif ($main::opt_raw) { + PrintSymbolizedProfile($symbols, $profile, $main::prog); + } elsif ($main::opt_collapsed) { + PrintCollapsedStacks($symbols, $profile); + } elsif ($main::opt_callgrind) { + PrintCallgrind($calls); + } else { + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. 
+ delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } + } + } else { + cleanup(); + exit(1); + } + } + } else { + InteractiveMode($profile, $symbols, $libs, $total); + } +} + +sub Main() { + Init(); + $main::collected_profile = undef; + @main::profile_files = (); + $main::op_time = time(); + + # Printing symbols is special and requires a lot less info that most. + if ($main::opt_symbols) { + PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin + return; + } + + # Fetch all profile data + FetchDynamicProfiles(); + + # this will hold symbols that we read from the profile files + my $symbol_map = {}; + + # Read one profile, pick the last item on the list + my $data = ReadProfile($main::prog, pop(@main::profile_files)); + my $profile = $data->{profile}; + my $pcs = $data->{pcs}; + my $libs = $data->{libs}; # Info about main program and shared libraries + $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); + + # Add additional profiles, if available. 
+ if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $data2 = ReadProfile($main::prog, $pname); + $profile = AddProfile($profile, $data2->{profile}); + $pcs = AddPcs($pcs, $data2->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); + } + } + + # Subtract base from profile, if specified + if ($main::opt_base ne '') { + my $base = ReadProfile($main::prog, $main::opt_base); + $profile = SubtractProfile($profile, $base->{profile}); + $pcs = AddPcs($pcs, $base->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); + } + + # Collect symbols + my $symbols; + if ($main::use_symbolized_profile) { + $symbols = FetchSymbols($pcs, $symbol_map); + } elsif ($main::use_symbol_page) { + $symbols = FetchSymbols($pcs); + } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. + $symbols = ExtractSymbols($libs, $pcs); + } + + if (!defined($main::opt_thread)) { + FilterAndPrint($profile, $symbols, $libs); + } + if (defined($data->{threads})) { + foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { + if (defined($main::opt_thread) && + ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { + my $thread_profile = $data->{threads}{$thread}; + FilterAndPrint($thread_profile, $symbols, $libs, $thread); + } + } + } + + cleanup(); + exit(0); +} + +##### Entry Point ##### + +Main(); + +# Temporary code to detect if we're running on a Goobuntu system. 
+# These systems don't have the right stuff installed for the special
+# Readline libraries to work, so as a temporary workaround, we default
+# to using the normal stdio code, rather than the fancier readline-based
+# code
+
+# Heuristic: returns 1 (readline likely to fail) on Goobuntu-style systems
+# that lack libtermcap, 0 otherwise.
+sub ReadlineMightFail {
+  if (-e '/lib/libtermcap.so.2') {
+    return 0;  # libtermcap exists, so readline should be okay
+  } else {
+    return 1;
+  }
+}
+
+# Display a postscript file with gv. $bg is "" or " &" to run in background.
+sub RunGV {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) {
+    # Options using double dash are supported by this gv version.
+    # Also, turn on noantialias to better handle bug in gv for
+    # postscript files with large dimensions.
+    # TODO: Maybe we should not pass the --noantialias flag
+    # if the gv version is known to work properly without the flag.
+    system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname)
+           . $bg);
+  } else {
+    # Old gv version - only supports options that use single dash.
+    print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n";
+    system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg);
+  }
+}
+
+# Display a PDF file with evince. $bg is "" or " &" (background).
+sub RunEvince {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  system(ShellEscape(@EVINCE, $fname) . $bg);
+}
+
+# Open an SVG report in a web browser: OS X `open`, then the generic
+# /etc/alternatives symlinks, then specific browsers, first hit wins.
+sub RunWeb {
+  my $fname = shift;
+  print STDERR "Loading web page file:///$fname\n";
+
+  if (`uname` =~ /Darwin/) {
+    # OS X: open will use standard preference for SVG files.
+    system("/usr/bin/open", $fname);
+    return;
+  }
+
+  # Some kind of Unix; try generic symlinks, then specific browsers.
+  # (Stop once we find one.)
+  # Works best if the browser is already running.
+  my @alt = (
+    "/etc/alternatives/gnome-www-browser",
+    "/etc/alternatives/x-www-browser",
+    "google-chrome",
+    "firefox",
+  );
+  foreach my $b (@alt) {
+    if (system($b, $fname) == 0) {
+      return;
+    }
+  }
+
+  print STDERR "Could not load web browser.\n";
+}
+
+# Launch kcachegrind on a callgrind-format file. $bg as above.
+sub RunKcachegrind {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n";
+  system(ShellEscape(@KCACHEGRIND, $fname) . $bg);
+}
+
+
+##### Interactive helper routines #####
+
+# Read-eval loop for interactive mode: prompts with "(jeprof) " and feeds
+# each command line to InteractiveCommand() until it asks to quit.
+# Uses Term::ReadLine when available and STDIN is a tty; otherwise falls
+# back to plain STDIN reads.
+sub InteractiveMode {
+  $| = 1;  # Make output unbuffered for interactive mode
+  my ($orig_profile, $symbols, $libs, $total) = @_;
+
+  print STDERR "Welcome to jeprof! For help, type 'help'.\n";
+
+  # Use ReadLine if it's installed and input comes from a console.
+  if ( -t STDIN &&
+       !ReadlineMightFail() &&
+       defined(eval {require Term::ReadLine}) ) {
+    my $term = new Term::ReadLine 'jeprof';
+    while ( defined ($_ = $term->readline('(jeprof) '))) {
+      $term->addhistory($_) if /\S/;
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
+      }
+    }
+  } else {       # don't have readline
+    while (1) {
+      print STDERR "(jeprof) ";
+      # BUGFIX: the diamond read was lost in extraction ("$_ = ;", a syntax
+      # error); restore the line read from standard input.
+      $_ = <STDIN>;
+      last if ! defined $_ ;
+      s/\r//g;         # turn windows-looking lines into unix-looking lines
+
+      # Save some flags that might be reset by InteractiveCommand()
+      my $save_opt_lines = $main::opt_lines;
+
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
+      }
+
+      # Restore flags
+      $main::opt_lines = $save_opt_lines;
+    }
+  }
+}
+
+# Takes two args: orig profile, and command to run.
+# Returns 1 if we should keep going, or 0 if we were asked to quit +sub InteractiveCommand { + my($orig_profile, $symbols, $libs, $total, $command) = @_; + $_ = $command; # just to make future m//'s easier + if (!defined($_)) { + print STDERR "\n"; + return 0; + } + if (m/^\s*quit/) { + return 0; + } + if (m/^\s*help/) { + InteractiveHelpMessage(); + return 1; + } + # Clear all the mode options -- mode is controlled by "$command" + $main::opt_text = 0; + $main::opt_callgrind = 0; + $main::opt_disasm = 0; + $main::opt_list = 0; + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_cum = 0; + + if (m/^\s*(text|top)(\d*)\s*(.*)/) { + $main::opt_text = 1; + + my $line_limit = ($2 ne "") ? int($2) : 10; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($3); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintText($symbols, $flat, $cumulative, $line_limit); + return 1; + } + if (m/^\s*callgrind\s*([^ \n]*)/) { + $main::opt_callgrind = 1; + + # Get derived profiles + my $calls = ExtractCalls($symbols, $orig_profile); + my $filename = $1; + if ( $1 eq '' ) { + $filename = TempName($main::next_tmpfile, "callgrind"); + } + PrintCallgrind($calls, $filename); + if ( $1 eq '' ) { + RunKcachegrind($filename, " & "); + $main::next_tmpfile++; + } + + return 1; + } + if (m/^\s*(web)?list\s*(.+)/) { + my $html = (defined($1) && ($1 eq "web")); + $main::opt_list = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($2); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintListing($total, $libs, $flat, $cumulative, $routine, $html); + 
return 1; + } + if (m/^\s*disasm\s*(.+)/) { + $main::opt_disasm = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintDisassembly($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^\s*(gv|web|evince)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } + + my $focus; + my $ignore; + ($focus, $ignore) = ParseInteractiveArgs($2); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, + $focus, $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } + $main::next_tmpfile++; + } + return 1; + } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; + return 1; +} + + +sub ProcessProfile { + my $total_count = shift; + my $orig_profile = shift; + my $symbols = shift; + my $focus = shift; + my $ignore = shift; + + # Process current profile to account for various settings + my $profile = $orig_profile; + printf("Total: %s %s\n", Unparse($total_count), Units()); + if ($focus ne '') { + $profile = FocusProfile($symbols, 
$profile, $focus); + my $focus_count = TotalProfile($profile); + printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", + $focus, + Unparse($focus_count), Units(), + Unparse($total_count), ($focus_count*100.0) / $total_count); + } + if ($ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $ignore); + my $ignore_count = TotalProfile($profile); + printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", + $ignore, + Unparse($ignore_count), Units(), + Unparse($total_count), + ($ignore_count*100.0) / $total_count); + } + + return $profile; +} + +sub InteractiveHelpMessage { + print STDERR <{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + my $depth = $#addrs + 1; + # int(foo / 2**32) is the only reliable way to get rid of bottom + # 32 bits on both 32- and 64-bit systems. + print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32)); + print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32)); + + foreach my $full_addr (@addrs) { + my $addr = $full_addr; + $addr =~ s/0x0*//; # strip off leading 0x, zeroes + if (length($addr) > 16) { + print STDERR "Invalid address in profile: $full_addr\n"; + next; + } + my $low_addr = substr($addr, -8); # get last 8 hex chars + my $high_addr = substr($addr, -16, 8); # get up to 8 more hex chars + print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr)); + } + } + } +} + +# Print symbols and profile data +sub PrintSymbolizedProfile { + my $symbols = shift; + my $profile = shift; + my $prog = shift; + + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + + print '--- ', $symbol_marker, "\n"; + if (defined($prog)) { + print 'binary=', $prog, "\n"; + } + while (my ($pc, $name) = each(%{$symbols})) { + my $sep = ' '; + print '0x', $pc; + # We have a list of function names, which include the inlined + # calls. They are separated (and terminated) by --, which is + # illegal in function names. 
+ for (my $j = 2; $j <= $#{$name}; $j += 3) { + print $sep, $name->[$j]; + $sep = '--'; + } + print "\n"; + } + print '---', "\n"; + + my $profile_marker; + if ($main::profile_type eq 'heap') { + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'growth') { + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'contention') { + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } else { # elsif ($main::profile_type eq 'cpu') + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } + + print '--- ', $profile_marker, "\n"; + if (defined($main::collected_profile)) { + # if used with remote fetch, simply dump the collected profile to output. + open(SRC, "<$main::collected_profile"); + while () { + print $_; + } + close(SRC); + } else { + # --raw/http: For everything to work correctly for non-remote profiles, we + # would need to extend PrintProfileData() to handle all possible profile + # types, re-enable the code that is currently disabled in ReadCPUProfile() + # and FixCallerAddresses(), and remove the remote profile dumping code in + # the block above. + die "--raw/http: jeprof can only dump remote profiles for --raw\n"; + # dump a cpu-format profile to standard out + PrintProfileData($profile); + } +} + +# Print text output +sub PrintText { + my $symbols = shift; + my $flat = shift; + my $cumulative = shift; + my $line_limit = shift; + + my $total = TotalProfile($flat); + + # Which profile to sort by? + my $s = $main::opt_cum ? 
$cumulative : $flat; + + my $running_sum = 0; + my $lines = 0; + foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b } + keys(%{$cumulative})) { + my $f = GetEntry($flat, $k); + my $c = GetEntry($cumulative, $k); + $running_sum += $f; + + my $sym = $k; + if (exists($symbols->{$k})) { + $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1]; + if ($main::opt_addresses) { + $sym = $k . " " . $sym; + } + } + + if ($f != 0 || $c != 0) { + printf("%8s %6s %6s %8s %6s %s\n", + Unparse($f), + Percent($f, $total), + Percent($running_sum, $total), + Unparse($c), + Percent($c, $total), + $sym); + } + $lines++; + last if ($line_limit >= 0 && $lines >= $line_limit); + } +} + +# Callgrind format has a compression for repeated function and file +# names. You show the name the first time, and just use its number +# subsequently. This can cut down the file to about a third or a +# quarter of its uncompressed size. $key and $val are the key/value +# pair that would normally be printed by callgrind; $map is a map from +# value to number. +sub CompressedCGName { + my($key, $val, $map) = @_; + my $idx = $map->{$val}; + # For very short keys, providing an index hurts rather than helps. + if (length($val) <= 3) { + return "$key=$val\n"; + } elsif (defined($idx)) { + return "$key=($idx)\n"; + } else { + # scalar(keys $map) gives the number of items in the map. + $idx = scalar(keys(%{$map})) + 1; + $map->{$val} = $idx; + return "$key=($idx) $val\n"; + } +} + +# Print the call graph in a way that's suiteable for callgrind. 
+sub PrintCallgrind { + my $calls = shift; + my $filename; + my %filename_to_index_map; + my %fnname_to_index_map; + + if ($main::opt_interactive) { + $filename = shift; + print STDERR "Writing callgrind file to '$filename'.\n" + } else { + $filename = "&STDOUT"; + } + open(CG, ">$filename"); + printf CG ("events: Hits\n\n"); + foreach my $call ( map { $_->[0] } + sort { $a->[1] cmp $b ->[1] || + $a->[2] <=> $b->[2] } + map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + [$_, $1, $2] } + keys %$calls ) { + my $count = int($calls->{$call}); + $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + my ( $caller_file, $caller_line, $caller_function, + $callee_file, $callee_line, $callee_function ) = + ( $1, $2, $3, $5, $6, $7 ); + + # TODO(csilvers): for better compression, collect all the + # caller/callee_files and functions first, before printing + # anything, and only compress those referenced more than once. + printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map); + printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map); + if (defined $6) { + printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map); + printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map); + printf CG ("calls=$count $callee_line\n"); + } + printf CG ("$caller_line $count\n\n"); + } +} + +# Print disassembly for all all routines that match $main::opt_disasm +sub PrintDisassembly { + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $disasm_opts = shift; + + my $total = TotalProfile($flat); + + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + # See if there are any samples in this routine + my $length = hex(AddressSub($end_addr, 
$start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + PrintDisassembledFunction($lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, $total); + last; + } + $addr = AddressInc($addr); + } + } + } +} + +# Return reference to array of tuples of the form: +# [start_address, filename, linenumber, instruction, limit_address] +# E.g., +# ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"] +sub Disassemble { + my $prog = shift; + my $offset = shift; + my $start_addr = shift; + my $end_addr = shift; + + my $objdump = $obj_tool_map{"objdump"}; + my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn", + "--start-address=0x$start_addr", + "--stop-address=0x$end_addr", $prog); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + my @result = (); + my $filename = ""; + my $linenumber = -1; + my $last = ["", "", "", ""]; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + chop; + if (m|\s*([^:\s]+):(\d+)\s*$|) { + # Location line of the form: + # : + $filename = $1; + $linenumber = $2; + } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { + # Disassembly line -- zero-extend address to full length + my $addr = HexExtend($1); + my $k = AddressAdd($addr, $offset); + $last->[4] = $k; # Store ending address for previous instruction + $last = [$k, $filename, $linenumber, $2, $end_addr]; + push(@result, $last); + } + } + close(OBJDUMP); + return @result; +} + +# The input file should contain lines of the form /proc/maps-like +# output (same format as expected from the profiles) or that looks +# like hex addresses (like "0xDEADBEEF"). We will parse all +# /proc/maps output, and for all the hex addresses, we will output +# "short" symbol names, one per line, in the same order as the input. +sub PrintSymbols { + my $maps_and_symbols_file = shift; + + # ParseLibraries expects pcs to be in a set. Fine by us... 
+ my @pclist = (); # pcs in sorted order + my $pcs = {}; + my $map = ""; + foreach my $line (<$maps_and_symbols_file>) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /\b(0x[0-9a-f]+)\b/i) { + push(@pclist, HexExtend($1)); + $pcs->{$pclist[-1]} = 1; + } else { + $map .= $line; + } + } + + my $libs = ParseLibraries($main::prog, $map, $pcs); + my $symbols = ExtractSymbols($libs, $pcs); + + foreach my $pc (@pclist) { + # ->[0] is the shortname, ->[2] is the full name + print(($symbols->{$pc}->[0] || "??") . "\n"); + } +} + + +# For sorting functions by name +sub ByName { + return ShortFunctionName($a) cmp ShortFunctionName($b); +} + +# Print source-listing for all all routines that match $list_opts +sub PrintListing { + my $total = shift; + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $list_opts = shift; + my $html = shift; + + my $output = \*STDOUT; + my $fname = ""; + + if ($html) { + # Arrange to write the output to a temporary file + $fname = TempName($main::next_tmpfile, "html"); + $main::next_tmpfile++; + if (!open(TEMP, ">$fname")) { + print STDERR "$fname: $!\n"; + return; + } + $output = \*TEMP; + print $output HtmlListingHeader(); + printf $output ("
%s
Total: %s %s
\n", + $main::prog, Unparse($total), Units()); + } + + my $listed = 0; + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + # Print if there are any samples in this routine + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + $listed += PrintSource( + $lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, + $html, + $output); + last; + } + $addr = AddressInc($addr); + } + } + } + + if ($html) { + if ($listed > 0) { + print $output HtmlListingFooter(); + close($output); + RunWeb($fname); + } else { + close($output); + unlink($fname); + } + } +} + +sub HtmlListingHeader { + return <<'EOF'; + + + +Pprof listing + + + + +EOF +} + +sub HtmlListingFooter { + return <<'EOF'; + + +EOF +} + +sub HtmlEscape { + my $text = shift; + $text =~ s/&/&/g; + $text =~ s//>/g; + return $text; +} + +# Returns the indentation of the line, if it has any non-whitespace +# characters. Otherwise, returns -1. +sub Indentation { + my $line = shift; + if (m/^(\s*)\S/) { + return length($1); + } else { + return -1; + } +} + +# If the symbol table contains inlining info, Disassemble() may tag an +# instruction with a location inside an inlined function. But for +# source listings, we prefer to use the location in the function we +# are listing. So use MapToSymbols() to fetch full location +# information for each instruction and then pick out the first +# location from a location list (location list contains callers before +# callees in case of inlining). 
+# +# After this routine has run, each entry in $instructions contains: +# [0] start address +# [1] filename for function we are listing +# [2] line number for function we are listing +# [3] disassembly +# [4] limit address +# [5] most specific filename (may be different from [1] due to inlining) +# [6] most specific line number (may be different from [2] due to inlining) +sub GetTopLevelLineNumbers { + my ($lib, $offset, $instructions) = @_; + my $pcs = []; + for (my $i = 0; $i <= $#{$instructions}; $i++) { + push(@{$pcs}, $instructions->[$i]->[0]); + } + my $symbols = {}; + MapToSymbols($lib, $offset, $pcs, $symbols); + for (my $i = 0; $i <= $#{$instructions}; $i++) { + my $e = $instructions->[$i]; + push(@{$e}, $e->[1]); + push(@{$e}, $e->[2]); + my $addr = $e->[0]; + my $sym = $symbols->{$addr}; + if (defined($sym)) { + if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) { + $e->[1] = $1; # File name + $e->[2] = $2; # Line number + } + } + } +} + +# Print source-listing for one routine +sub PrintSource { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $html = shift; + my $output = shift; + + # Disassemble all instructions (just to get line numbers) + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + GetTopLevelLineNumbers($prog, $offset, \@instructions); + + # Hack 1: assume that the first source file encountered in the + # disassembly contains the routine + my $filename = undef; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[2] >= 0) { + $filename = $instructions[$i]->[1]; + last; + } + } + if (!defined($filename)) { + print STDERR "no filename found in $routine\n"; + return 0; + } + + # Hack 2: assume that the largest line number from $filename is the + # end of the procedure. 
This is typically safe since if P1 contains + # an inlined call to P2, then P2 usually occurs earlier in the + # source file. If this does not work, we might have to compute a + # density profile or just print all regions we find. + my $lastline = 0; + for (my $i = 0; $i <= $#instructions; $i++) { + my $f = $instructions[$i]->[1]; + my $l = $instructions[$i]->[2]; + if (($f eq $filename) && ($l > $lastline)) { + $lastline = $l; + } + } + + # Hack 3: assume the first source location from "filename" is the start of + # the source code. + my $firstline = 1; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[1] eq $filename) { + $firstline = $instructions[$i]->[2]; + last; + } + } + + # Hack 4: Extend last line forward until its indentation is less than + # the indentation we saw on $firstline + my $oldlastline = $lastline; + { + if (!open(FILE, "<$filename")) { + print STDERR "$filename: $!\n"; + return 0; + } + my $l = 0; + my $first_indentation = -1; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + $l++; + my $indent = Indentation($_); + if ($l >= $firstline) { + if ($first_indentation < 0 && $indent >= 0) { + $first_indentation = $indent; + last if ($first_indentation == 0); + } + } + if ($l >= $lastline && $indent >= 0) { + if ($indent >= $first_indentation) { + $lastline = $l+1; + } else { + last; + } + } + } + close(FILE); + } + + # Assign all samples to the range $firstline,$lastline, + # Hack 4: If an instruction does not occur in the range, its samples + # are moved to the next instruction that occurs in the range. 
+ my $samples1 = {}; # Map from line number to flat count + my $samples2 = {}; # Map from line number to cumulative count + my $running1 = 0; # Unassigned flat counts + my $running2 = 0; # Unassigned cumulative counts + my $total1 = 0; # Total flat counts + my $total2 = 0; # Total cumulative counts + my %disasm = (); # Map from line number to disassembly + my $running_disasm = ""; # Unassigned disassembly + my $skip_marker = "---\n"; + if ($html) { + $skip_marker = ""; + for (my $l = $firstline; $l <= $lastline; $l++) { + $disasm{$l} = ""; + } + } + my $last_dis_filename = ''; + my $last_dis_linenum = -1; + my $last_touched_line = -1; # To detect gaps in disassembly for a line + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + + if ($html) { + my $dis = sprintf(" %6s %6s \t\t%8s: %s ", + HtmlPrintNumber($c1), + HtmlPrintNumber($c2), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + + # Append the most specific source line associated with this instruction + if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; + $dis = HtmlEscape($dis); + my $f = $e->[5]; + my $l = $e->[6]; + if ($f ne $last_dis_filename) { + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } elsif ($l ne $last_dis_linenum) { + # De-emphasize the unchanged file name portion + $dis .= sprintf("%s" . 
+ ":%d", + HtmlEscape(CleanFileName($f)), $l); + } else { + # De-emphasize the entire location + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } + $last_dis_filename = $f; + $last_dis_linenum = $l; + $running_disasm .= $dis; + $running_disasm .= "\n"; + } + + $running1 += $c1; + $running2 += $c2; + $total1 += $c1; + $total2 += $c2; + my $file = $e->[1]; + my $line = $e->[2]; + if (($file eq $filename) && + ($line >= $firstline) && + ($line <= $lastline)) { + # Assign all accumulated samples to this line + AddEntry($samples1, $line, $running1); + AddEntry($samples2, $line, $running2); + $running1 = 0; + $running2 = 0; + if ($html) { + if ($line != $last_touched_line && $disasm{$line} ne '') { + $disasm{$line} .= "\n"; + } + $disasm{$line} .= $running_disasm; + $running_disasm = ''; + $last_touched_line = $line; + } + } + } + + # Assign any leftover samples to $lastline + AddEntry($samples1, $lastline, $running1); + AddEntry($samples2, $lastline, $running2); + if ($html) { + if ($lastline != $last_touched_line && $disasm{$lastline} ne '') { + $disasm{$lastline} .= "\n"; + } + $disasm{$lastline} .= $running_disasm; + } + + if ($html) { + printf $output ( + "

%s

%s\n
\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while () {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disasembly
+        # Also emit an optional span containing disassembly.
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "" . $dis . "";
+        }
+        my $source_class = (($n1 + $n2 > 0)
+                            ? "livesrc"
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "%5d " .
+          "%6s %6s %s%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output(
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
+  if ($html) {
+    print $output "
\n"; + } + return 1; +} + +# Return the source line for the specified file/linenumber. +# Returns undef if not found. +sub SourceLine { + my $file = shift; + my $line = shift; + + # Look in cache + if (!defined($main::source_cache{$file})) { + if (100 < scalar keys(%main::source_cache)) { + # Clear the cache when it gets too big + $main::source_cache = (); + } + + # Read all lines from the file + if (!open(FILE, "<$file")) { + print STDERR "$file: $!\n"; + $main::source_cache{$file} = []; # Cache the negative result + return undef; + } + my $lines = []; + push(@{$lines}, ""); # So we can use 1-based line numbers as indices + while () { + push(@{$lines}, $_); + } + close(FILE); + + # Save the lines in the cache + $main::source_cache{$file} = $lines; + } + + my $lines = $main::source_cache{$file}; + if (($line < 0) || ($line > $#{$lines})) { + return undef; + } else { + return $lines->[$line]; + } +} + +# Print disassembly for one routine with interspersed source if available +sub PrintDisassembledFunction { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $total = shift; + + # Disassemble all instructions + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + + # Make array of counts per instruction + my @flat_count = (); + my @cum_count = (); + my $flat_total = 0; + my $cum_total = 0; + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + push(@flat_count, $c1); + push(@cum_count, $c2); + $flat_total += $c1; + $cum_total += $c2; + } + + # Print header with total counts + printf("ROUTINE ====================== %s\n" . 
+ "%6s %6s %s (flat, cumulative) %.1f%% of total\n", + ShortFunctionName($routine), + Unparse($flat_total), + Unparse($cum_total), + Units(), + ($cum_total * 100.0) / $total); + + # Process instructions in order + my $current_file = ""; + for (my $i = 0; $i <= $#instructions; ) { + my $e = $instructions[$i]; + + # Print the new file name whenever we switch files + if ($e->[1] ne $current_file) { + $current_file = $e->[1]; + my $fname = $current_file; + $fname =~ s|^\./||; # Trim leading "./" + + # Shorten long file names + if (length($fname) >= 58) { + $fname = "..." . substr($fname, -55); + } + printf("-------------------- %s\n", $fname); + } + + # TODO: Compute range of lines to print together to deal with + # small reorderings. + my $first_line = $e->[2]; + my $last_line = $first_line; + my %flat_sum = (); + my %cum_sum = (); + for (my $l = $first_line; $l <= $last_line; $l++) { + $flat_sum{$l} = 0; + $cum_sum{$l} = 0; + } + + # Find run of instructions for this range of source lines + my $first_inst = $i; + while (($i <= $#instructions) && + ($instructions[$i]->[2] >= $first_line) && + ($instructions[$i]->[2] <= $last_line)) { + $e = $instructions[$i]; + $flat_sum{$e->[2]} += $flat_count[$i]; + $cum_sum{$e->[2]} += $cum_count[$i]; + $i++; + } + my $last_inst = $i - 1; + + # Print source lines + for (my $l = $first_line; $l <= $last_line; $l++) { + my $line = SourceLine($current_file, $l); + if (!defined($line)) { + $line = "?\n"; + next; + } else { + $line =~ s/^\s+//; + } + printf("%6s %6s %5d: %s", + UnparseAlt($flat_sum{$l}), + UnparseAlt($cum_sum{$l}), + $l, + $line); + } + + # Print disassembly + for (my $x = $first_inst; $x <= $last_inst; $x++) { + my $e = $instructions[$x]; + printf("%6s %6s %8s: %6s\n", + UnparseAlt($flat_count[$x]), + UnparseAlt($cum_count[$x]), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + } + } +} + +# Print DOT graph +sub PrintDot { + my $prog = shift; + my $symbols = shift; + my $raw = shift; + my $flat = 
shift; + my $cumulative = shift; + my $overall_total = shift; + + # Get total + my $local_total = TotalProfile($flat); + my $nodelimit = int($main::opt_nodefraction * $local_total); + my $edgelimit = int($main::opt_edgefraction * $local_total); + my $nodecount = $main::opt_nodecount; + + # Find nodes to include + my @list = (sort { abs(GetEntry($cumulative, $b)) <=> + abs(GetEntry($cumulative, $a)) + || $a cmp $b } + keys(%{$cumulative})); + my $last = $nodecount - 1; + if ($last > $#list) { + $last = $#list; + } + while (($last >= 0) && + (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) { + $last--; + } + if ($last < 0) { + print STDERR "No nodes to print\n"; + return 0; + } + + if ($nodelimit > 0 || $edgelimit > 0) { + printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n", + Unparse($nodelimit), Units(), + Unparse($edgelimit), Units()); + } + + # Open DOT output file + my $output; + my $escaped_dot = ShellEscape(@DOT); + my $escaped_ps2pdf = ShellEscape(@PS2PDF); + if ($main::opt_gv) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps")); + $output = "| $escaped_dot -Tps2 >$escaped_outfile"; + } elsif ($main::opt_evince) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf")); + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile"; + } elsif ($main::opt_ps) { + $output = "| $escaped_dot -Tps2"; + } elsif ($main::opt_pdf) { + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. 
+ my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg")); + $output = "| $escaped_dot -Tsvg >$escaped_outfile"; + } elsif ($main::opt_gif) { + $output = "| $escaped_dot -Tgif"; + } else { + $output = ">&STDOUT"; + } + open(DOT, $output) || error("$output: $!\n"); + + # Title + printf DOT ("digraph \"%s; %s %s\" {\n", + $prog, + Unparse($overall_total), + Units()); + if ($main::opt_pdf) { + # The output is more printable if we set the page size for dot. + printf DOT ("size=\"8,11\"\n"); + } + printf DOT ("node [width=0.375,height=0.25];\n"); + + # Print legend + printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," . + "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n", + $prog, + sprintf("Total %s: %s", Units(), Unparse($overall_total)), + sprintf("Focusing on: %s", Unparse($local_total)), + sprintf("Dropped nodes with <= %s abs(%s)", + Unparse($nodelimit), Units()), + sprintf("Dropped edges with <= %s %s", + Unparse($edgelimit), Units()) + ); + + # Print nodes + my %node = (); + my $nextnode = 1; + foreach my $a (@list[0..$last]) { + # Pick font size + my $f = GetEntry($flat, $a); + my $c = GetEntry($cumulative, $a); + + my $fs = 8; + if ($local_total > 0) { + $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total))); + } + + $node{$a} = $nextnode++; + my $sym = $a; + $sym =~ s/\s+/\\n/g; + $sym =~ s/::/\\n/g; + + # Extra cumulative info to print for non-leaves + my $extra = ""; + if ($f != $c) { + $extra = sprintf("\\rof %s (%s)", + Unparse($c), + Percent($c, $local_total)); + } + my $style = ""; + if ($main::opt_heapcheck) { + if ($f > 0) { + # make leak-causing nodes more visible (add a background) + $style = ",style=filled,fillcolor=gray" + } elsif ($f < 0) { + # make anti-leak-causing nodes (which almost never occur) + # stand out as well (triple border) + $style = ",peripheries=3" + } + } + + printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" . 
+ "\",shape=box,fontsize=%.1f%s];\n", + $node{$a}, + $sym, + Unparse($f), + Percent($f, $local_total), + $extra, + $fs, + $style, + ); + } + + # Get edges and counts per edge + my %edge = (); + my $n; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$raw})) { + # TODO: omit low %age edges + $n = $raw->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + for (my $i = 1; $i <= $#translated; $i++) { + my $src = $translated[$i]; + my $dst = $translated[$i-1]; + #next if ($src eq $dst); # Avoid self-edges? + if (exists($node{$src}) && exists($node{$dst})) { + my $edge_label = "$src\001$dst"; + if (!exists($edge{$edge_label})) { + $edge{$edge_label} = 0; + } + $edge{$edge_label} += $n; + } + } + } + + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { + my @x = split(/\001/, $e); + $n = $edge{$e}; + + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + + # Compute line width based on edge count + my $fraction = abs($local_total ? 
(3 * ($n / $local_total)) : 0); + if ($fraction > 1) { $fraction = 1; } + my $w = $fraction * 2; + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. + $w = 1; + } + + # Dot sometimes segfaults if given edge weights that are too large, so + # we cap the weights at a large value + my $edgeweight = abs($n) ** 0.7; + if ($edgeweight > 100000) { $edgeweight = 100000; } + $edgeweight = int($edgeweight); + + my $style = sprintf("setlinewidth(%f)", $w); + if ($x[1] =~ m/\(inline\)/) { + $style .= ",dashed"; + } + + # Use a slightly squashed function of the edge count as the weight + printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n", + $node{$x[0]}, + $node{$x[1]}, + Unparse($n), + $edgeweight, + $style); + } + } + + print DOT ("}\n"); + close(DOT); + + if ($main::opt_web || $main::opt_svg) { + # Rewrite SVG to be more usable inside web browser. + RewriteSvg(TempName($main::next_tmpfile, "svg")); + } + + return 1; +} + +sub RewriteSvg { + my $svgfile = shift; + + open(SVG, $svgfile) || die "open temp svg: $!"; + my @svg = ; + close(SVG); + unlink $svgfile; + my $svg = join('', @svg); + + # Dot's SVG output is + # + # + # + # ... + # + # + # + # Change it to + # + # + # $svg_javascript + # + # + # ... + # + # + # + + # Fix width, height; drop viewBox. + $svg =~ s/(?s) above first + my $svg_javascript = SvgJavascript(); + my $viewport = "\n"; + $svg =~ s/ above . + $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; + $svg =~ s/$svgfile") || die "open $svgfile: $!"; + print SVG $svg; + close(SVG); + } +} + +sub SvgJavascript { + return <<'EOF'; + +EOF +} + +# Provides a map from fullname to shortname for cases where the +# shortname is ambiguous. The symlist has both the fullname and +# shortname for all symbols, which is usually fine, but sometimes -- +# such as overloaded functions -- two different fullnames can map to +# the same shortname. In that case, we use the address of the +# function to disambiguate the two. 
This function fills in a map that +# maps fullnames to modified shortnames in such cases. If a fullname +# is not present in the map, the 'normal' shortname provided by the +# symlist is the appropriate one to use. +sub FillFullnameToShortnameMap { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $shortnames_seen_once = {}; + my $shortnames_seen_more_than_once = {}; + + foreach my $symlist (values(%{$symbols})) { + # TODO(csilvers): deal with inlined symbols too. + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + if ($fullname !~ /<[0-9a-fA-F]+>$/) { # fullname doesn't end in an address + next; # the only collisions we care about are when addresses differ + } + if (defined($shortnames_seen_once->{$shortname}) && + $shortnames_seen_once->{$shortname} ne $fullname) { + $shortnames_seen_more_than_once->{$shortname} = 1; + } else { + $shortnames_seen_once->{$shortname} = $fullname; + } + } + + foreach my $symlist (values(%{$symbols})) { + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + # TODO(csilvers): take in a list of addresses we care about, and only + # store in the map if $symlist->[1] is in that list. Saves space. + next if defined($fullname_to_shortname_map->{$fullname}); + if (defined($shortnames_seen_more_than_once->{$shortname})) { + if ($fullname =~ /<0*([^>]*)>$/) { # fullname has address at end of it + $fullname_to_shortname_map->{$fullname} = "$shortname\@$1"; + } + } + } +} + +# Return a small number that identifies the argument. +# Multiple calls with the same argument will return the same number. +# Calls with different arguments will return different numbers. 
+sub ShortIdFor { + my $key = shift; + my $id = $main::uniqueid{$key}; + if (!defined($id)) { + $id = keys(%main::uniqueid) + 1; + $main::uniqueid{$key} = $id; + } + return $id; +} + +# Translate a stack of addresses into a stack of symbols +sub TranslateStack { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $k = shift; + + my @addrs = split(/\n/, $k); + my @result = (); + for (my $i = 0; $i <= $#addrs; $i++) { + my $a = $addrs[$i]; + + # Skip large addresses since they sometimes show up as fake entries on RH9 + if (length($a) > 8 && $a gt "7fffffffffffffff") { + next; + } + + if ($main::opt_disasm || $main::opt_list) { + # We want just the address for the key + push(@result, $a); + next; + } + + my $symlist = $symbols->{$a}; + if (!defined($symlist)) { + $symlist = [$a, "", $a]; + } + + # We can have a sequence of symbols for a particular entry + # (more than one symbol in the case of inlining). Callers + # come before callees in symlist, so walk backwards since + # the translated stack should contain callees before callers. + for (my $j = $#{$symlist}; $j >= 2; $j -= 3) { + my $func = $symlist->[$j-2]; + my $fileline = $symlist->[$j-1]; + my $fullfunc = $symlist->[$j]; + if (defined($fullname_to_shortname_map->{$fullfunc})) { + $func = $fullname_to_shortname_map->{$fullfunc}; + } + if ($j > 2) { + $func = "$func (inline)"; + } + + # Do not merge nodes corresponding to Callback::Run since that + # causes confusing cycles in dot display. Instead, we synthesize + # a unique name for this frame per caller. + if ($func =~ m/Callback.*::Run$/) { + my $caller = ($i > 0) ? $addrs[$i-1] : 0; + $func = "Run#" . ShortIdFor($caller); + } + + if ($main::opt_addresses) { + push(@result, "$a $func $fileline"); + } elsif ($main::opt_lines) { + if ($func eq '??' 
&& $fileline eq '??:0') { + push(@result, "$a"); + } else { + push(@result, "$func $fileline"); + } + } elsif ($main::opt_functions) { + if ($func eq '??') { + push(@result, "$a"); + } else { + push(@result, $func); + } + } elsif ($main::opt_files) { + if ($fileline eq '??:0' || $fileline eq '') { + push(@result, "$a"); + } else { + my $f = $fileline; + $f =~ s/:\d+$//; + push(@result, $f); + } + } else { + push(@result, $a); + last; # Do not print inlined info + } + } + } + + # print join(",", @addrs), " => ", join(",", @result), "\n"; + return @result; +} + +# Generate percent string for a number and a total +sub Percent { + my $num = shift; + my $tot = shift; + if ($tot != 0) { + return sprintf("%.1f%%", $num * 100.0 / $tot); + } else { + return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf"); + } +} + +# Generate pretty-printed form of number +sub Unparse { + my $num = shift; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return sprintf("%d", $num); + } else { + if ($main::opt_show_bytes) { + return sprintf("%d", $num); + } else { + return sprintf("%.1f", $num / 1048576.0); + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds + } else { + return sprintf("%d", $num); + } +} + +# Alternate pretty-printed form: 0 maps to "." 
+sub UnparseAlt { + my $num = shift; + if ($num == 0) { + return "."; + } else { + return Unparse($num); + } +} + +# Alternate pretty-printed form: 0 maps to "" +sub HtmlPrintNumber { + my $num = shift; + if ($num == 0) { + return ""; + } else { + return Unparse($num); + } +} + +# Return output units +sub Units { + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return "objects"; + } else { + if ($main::opt_show_bytes) { + return "B"; + } else { + return "MB"; + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return "seconds"; + } else { + return "samples"; + } +} + +##### Profile manipulation code ##### + +# Generate flattened profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a] +sub FlatProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + AddEntry($result, $addrs[0], $count); + } + } + return $result; +} + +# Generate cumulative profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a], [b], [c], [d] +sub CumulativeProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + AddEntry($result, $a, $count); + } + } + return $result; +} + +# If the second-youngest PC on the stack is always the same, returns +# that pc. Otherwise, returns undef. 
+sub IsSecondPcAlwaysTheSame { + my $profile = shift; + + my $second_pc = undef; + foreach my $k (keys(%{$profile})) { + my @addrs = split(/\n/, $k); + if ($#addrs < 1) { + return undef; + } + if (not defined $second_pc) { + $second_pc = $addrs[1]; + } else { + if ($second_pc ne $addrs[1]) { + return undef; + } + } + } + return $second_pc; +} + +sub ExtractSymbolNameInlineStack { + my $symbols = shift; + my $address = shift; + + my @stack = (); + + if (exists $symbols->{$address}) { + my @localinlinestack = @{$symbols->{$address}}; + for (my $i = $#localinlinestack; $i > 0; $i-=3) { + my $file = $localinlinestack[$i-1]; + my $fn = $localinlinestack[$i-0]; + + if ($file eq "?" || $file eq ":0") { + $file = "??:0"; + } + if ($fn eq '??') { + # If we can't get the symbol name, at least use the file information. + $fn = $file; + } + my $suffix = "[inline]"; + if ($i == 2) { + $suffix = ""; + } + push (@stack, $fn.$suffix); + } + } + else { + # If we can't get a symbol name, at least fill in the address. + push (@stack, $address); + } + + return @stack; +} + +sub ExtractSymbolLocation { + my $symbols = shift; + my $address = shift; + # 'addr2line' outputs "??:0" for unknown locations; we do the + # same to be consistent. + my $location = "??:0:unknown"; + if (exists $symbols->{$address}) { + my $file = $symbols->{$address}->[1]; + if ($file eq "?") { + $file = "??:0" + } + $location = $file . ":" . $symbols->{$address}->[0]; + } + return $location; +} + +# Extracts a graph of calls. 
+sub ExtractCalls { + my $symbols = shift; + my $profile = shift; + + my $calls = {}; + while( my ($stack_trace, $count) = each %$profile ) { + my @address = split(/\n/, $stack_trace); + my $destination = ExtractSymbolLocation($symbols, $address[0]); + AddEntry($calls, $destination, $count); + for (my $i = 1; $i <= $#address; $i++) { + my $source = ExtractSymbolLocation($symbols, $address[$i]); + my $call = "$source -> $destination"; + AddEntry($calls, $call, $count); + $destination = $source; + } + } + + return $calls; +} + +sub FilterFrames { + my $symbols = shift; + my $profile = shift; + + if ($main::opt_retain eq '' && $main::opt_exclude eq '') { + return $profile; + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + my $sym; + if (exists($symbols->{$a})) { + $sym = $symbols->{$a}->[0]; + } else { + $sym = $a; + } + if ($main::opt_retain ne '' && $sym !~ m/$main::opt_retain/) { + next; + } + if ($main::opt_exclude ne '' && $sym =~ m/$main::opt_exclude/) { + next; + } + push(@path, $a); + } + if (scalar(@path) > 0) { + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + } + + return $result; +} + +sub PrintCollapsedStacks { + my $symbols = shift; + my $profile = shift; + + while (my ($stack_trace, $count) = each %$profile) { + my @address = split(/\n/, $stack_trace); + my @names = reverse ( map { ExtractSymbolNameInlineStack($symbols, $_) } @address ); + printf("%s %d\n", join(";", @names), $count); + } +} + +sub RemoveUninterestingFrames { + my $symbols = shift; + my $profile = shift; + + # List of function names to skip + my %skip = (); + my $skip_regexp = 'NOMATCH'; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + foreach my $name ('@JEMALLOC_PREFIX@calloc', + 'cfree', + '@JEMALLOC_PREFIX@malloc', + 'je_malloc_default', + 'newImpl', + 'void* newImpl', + 'fallbackNewImpl', + 'void* 
fallbackNewImpl', + '@JEMALLOC_PREFIX@free', + '@JEMALLOC_PREFIX@memalign', + '@JEMALLOC_PREFIX@posix_memalign', + '@JEMALLOC_PREFIX@aligned_alloc', + 'pvalloc', + '@JEMALLOC_PREFIX@valloc', + '@JEMALLOC_PREFIX@realloc', + '@JEMALLOC_PREFIX@mallocx', + '@JEMALLOC_PREFIX@rallocx', + 'do_rallocx', + '@JEMALLOC_PREFIX@xallocx', + '@JEMALLOC_PREFIX@dallocx', + '@JEMALLOC_PREFIX@sdallocx', + '@JEMALLOC_PREFIX@sdallocx_noflags', + 'tc_calloc', + 'tc_cfree', + 'tc_malloc', + 'tc_free', + 'tc_memalign', + 'tc_posix_memalign', + 'tc_pvalloc', + 'tc_valloc', + 'tc_realloc', + 'tc_new', + 'tc_delete', + 'tc_newarray', + 'tc_deletearray', + 'tc_new_nothrow', + 'tc_newarray_nothrow', + 'do_malloc', + '::do_malloc', # new name -- got moved to an unnamed ns + '::do_malloc_or_cpp_alloc', + 'DoSampledAllocation', + 'simple_alloc::allocate', + '__malloc_alloc_template::allocate', + '__builtin_delete', + '__builtin_new', + '__builtin_vec_delete', + '__builtin_vec_new', + 'operator new', + 'operator new[]', + # The entry to our memory-allocation routines on OS X + 'malloc_zone_malloc', + 'malloc_zone_calloc', + 'malloc_zone_valloc', + 'malloc_zone_realloc', + 'malloc_zone_memalign', + 'malloc_zone_free', + # These mark the beginning/end of our custom sections + '__start_google_malloc', + '__stop_google_malloc', + '__start_malloc_hook', + '__stop_malloc_hook') { + $skip{$name} = 1; + $skip{"_" . $name} = 1; # Mach (OS X) adds a _ prefix to everything + } + # TODO: Remove TCMalloc once everything has been + # moved into the tcmalloc:: namespace and we have flushed + # old code out of the system. 
+ $skip_regexp = "TCMalloc|^tcmalloc::"; + } elsif ($main::profile_type eq 'contention') { + foreach my $vname ('base::RecordLockProfileData', + 'base::SubmitMutexProfileData', + 'base::SubmitSpinLockProfileData', + 'Mutex::Unlock', + 'Mutex::UnlockSlow', + 'Mutex::ReaderUnlock', + 'MutexLock::~MutexLock', + 'SpinLock::Unlock', + 'SpinLock::SlowUnlock', + 'SpinLockHolder::~SpinLockHolder') { + $skip{$vname} = 1; + } + } elsif ($main::profile_type eq 'cpu') { + # Drop signal handlers used for CPU profile collection + # TODO(dpeng): this should not be necessary; it's taken + # care of by the general 2nd-pc mechanism below. + foreach my $name ('ProfileData::Add', # historical + 'ProfileData::prof_handler', # historical + 'CpuProfiler::prof_handler', + '__FRAME_END__', + '__pthread_sighandler', + '__restore') { + $skip{$name} = 1; + } + } else { + # Nothing skipped for unknown types + } + + if ($main::profile_type eq 'cpu') { + # If all the second-youngest program counters are the same, + # this STRONGLY suggests that it is an artifact of measurement, + # i.e., stack frames pushed by the CPU profiler signal handler. + # Hence, we delete them. + # (The topmost PC is read from the signal structure, not from + # the stack, so it does not get involved.) 
+ while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) { + my $result = {}; + my $func = ''; + if (exists($symbols->{$second_pc})) { + $second_pc = $symbols->{$second_pc}->[0]; + } + print STDERR "Removing $second_pc from all stack traces.\n"; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + splice @addrs, 1, 1; + my $reduced_path = join("\n", @addrs); + AddEntry($result, $reduced_path, $count); + } + $profile = $result; + } + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + if (exists($symbols->{$a})) { + my $func = $symbols->{$a}->[0]; + if ($skip{$func} || ($func =~ m/$skip_regexp/)) { + # Throw away the portion of the backtrace seen so far, under the + # assumption that previous frames were for functions internal to the + # allocator. + @path = (); + next; + } + } + push(@path, $a); + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + + $result = FilterFrames($symbols, $result); + + return $result; +} + +# Reduce profile to granularity given by user +sub ReduceProfile { + my $symbols = shift; + my $profile = shift; + my $result = {}; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + my @path = (); + my %seen = (); + $seen{''} = 1; # So that empty keys are skipped + foreach my $e (@translated) { + # To avoid double-counting due to recursion, skip a stack-trace + # entry if it has already been seen + if (!$seen{$e}) { + $seen{$e} = 1; + push(@path, $e); + } + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + return $result; +} + +# Does the specified symbol array match the regexp? 
+sub SymbolMatches { + my $sym = shift; + my $re = shift; + if (defined($sym)) { + for (my $i = 0; $i < $#{$sym}; $i += 3) { + if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) { + return 1; + } + } + } + return 0; +} + +# Focus only on paths involving specified regexps +sub FocusProfile { + my $symbols = shift; + my $profile = shift; + my $focus = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + # Reply if it matches either the address/shortname/fileline + if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) { + AddEntry($result, $k, $count); + last; + } + } + } + return $result; +} + +# Focus only on paths not involving specified regexps +sub IgnoreProfile { + my $symbols = shift; + my $profile = shift; + my $ignore = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my $matched = 0; + foreach my $a (@addrs) { + # Reply if it matches either the address/shortname/fileline + if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) { + $matched = 1; + last; + } + } + if (!$matched) { + AddEntry($result, $k, $count); + } + } + return $result; +} + +# Get total count in profile +sub TotalProfile { + my $profile = shift; + my $result = 0; + foreach my $k (keys(%{$profile})) { + $result += $profile->{$k}; + } + return $result; +} + +# Add A to B +sub AddProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + my $v = $A->{$k}; + AddEntry($R, $k, $v); + } + # add all keys in B + foreach my $k (keys(%{$B})) { + my $v = $B->{$k}; + AddEntry($R, $k, $v); + } + return $R; +} + +# Merges symbol maps +sub MergeSymbols { + my $A = shift; + my $B = shift; + + my $R = {}; + foreach my $k (keys(%{$A})) { + $R->{$k} = $A->{$k}; + } + if (defined($B)) { + foreach my $k (keys(%{$B})) { + $R->{$k} = $B->{$k}; + } + } + return 
$R; +} + + +# Add A to B +sub AddPcs { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + $R->{$k} = 1 + } + # add all keys in B + foreach my $k (keys(%{$B})) { + $R->{$k} = 1 + } + return $R; +} + +# Subtract B from A +sub SubtractProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + foreach my $k (keys(%{$A})) { + my $v = $A->{$k} - GetEntry($B, $k); + if ($v < 0 && $main::opt_drop_negative) { + $v = 0; + } + AddEntry($R, $k, $v); + } + if (!$main::opt_drop_negative) { + # Take care of when subtracted profile has more entries + foreach my $k (keys(%{$B})) { + if (!exists($A->{$k})) { + AddEntry($R, $k, 0 - $B->{$k}); + } + } + } + return $R; +} + +# Get entry from profile; zero if not present +sub GetEntry { + my $profile = shift; + my $k = shift; + if (exists($profile->{$k})) { + return $profile->{$k}; + } else { + return 0; + } +} + +# Add entry to specified profile +sub AddEntry { + my $profile = shift; + my $k = shift; + my $n = shift; + if (!exists($profile->{$k})) { + $profile->{$k} = 0; + } + $profile->{$k} += $n; +} + +# Add a stack of entries to specified profile, and add them to the $pcs +# list. +sub AddEntries { + my $profile = shift; + my $pcs = shift; + my $stack = shift; + my $count = shift; + my @k = (); + + foreach my $e (split(/\s+/, $stack)) { + my $pc = HexExtend($e); + $pcs->{$pc} = 1; + push @k, $pc; + } + AddEntry($profile, (join "\n", @k), $count); +} + +##### Code to profile a server dynamically ##### + +sub CheckSymbolPage { + my $url = SymbolPageURL(); + my $command = ShellEscape(@URL_FETCHER, $url); + open(SYMBOL, "$command |") or error($command); + my $line = ; + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + close(SYMBOL); + unless (defined($line)) { + error("$url doesn't exist\n"); + } + + if ($line =~ /^num_symbols:\s+(\d+)$/) { + if ($1 == 0) { + error("Stripped binary. 
No symbols available.\n");
+    }
+  } else {
+    error("Failed to get the number of symbols from $url\n");
+  }
+}
+
+sub IsProfileURL {
+  my $profile_name = shift;
+  if (-f $profile_name) {
+    printf STDERR "Using local file $profile_name.\n";
+    return 0;
+  }
+  return 1;
+}
+
+sub ParseProfileURL {
+  my $profile_name = shift;
+
+  if (!defined($profile_name) || $profile_name eq "") {
+    return ();
+  }
+
+  # Split profile URL - matches all non-empty strings, so no test.
+  $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;
+
+  my $proto = $1 || "http://";
+  my $hostport = $2;
+  my $prefix = $3;
+  my $profile = $4 || "/";
+
+  my $host = $hostport;
+  $host =~ s/:.*//;
+
+  my $baseurl = "$proto$hostport$prefix";
+  return ($host, $baseurl, $profile);
+}
+
+# We fetch symbols from the first profile argument.
+sub SymbolPageURL {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  return "$baseURL$SYMBOL_PAGE";
+}
+
+sub FetchProgramName() {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  my $url = "$baseURL$PROGRAM_NAME_PAGE";
+  my $command_line = ShellEscape(@URL_FETCHER, $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  my $cmdline = <CMDLINE>;
+  $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+  close(CMDLINE);
+  error("Failed to get program name from $url\n") unless defined($cmdline);
+  $cmdline =~ s/\x00.+//;  # Remove argv[1] and latters.
+  $cmdline =~ s!\n!!g;  # Remove LFs.
+  return $cmdline;
+}
+
+# Gee, curl's -L (--location) option isn't reliable at least
+# with its 7.12.3 version.  Curl will forget to post data if
+# there is a redirection.  This function is a workaround for
+# curl.  Redirection happens on borg hosts.
+sub ResolveRedirectionForCurl {
+  my $url = shift;
+  my $command_line = ShellEscape(@URL_FETCHER, "--head", $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  while (<CMDLINE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (/^Location: (.*)/) {
+      $url = $1;
+    }
+  }
+  close(CMDLINE);
+  return $url;
+}
+
+# Add a timeout flag to URL_FETCHER.  Returns a new list.
+sub AddFetchTimeout {
+  my $timeout = shift;
+  my @fetcher = @_;
+  if (defined($timeout)) {
+    if (join(" ", @fetcher) =~ m/\bcurl -s/) {
+      push(@fetcher, "--max-time", sprintf("%d", $timeout));
+    } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) {
+      push(@fetcher, sprintf("--deadline=%d", $timeout));
+    }
+  }
+  return @fetcher;
+}
+
+# Reads a symbol map from the file handle name given as $1, returning
+# the resulting symbol map.  Also processes variables relating to symbols.
+# Currently, the only variable processed is 'binary=' which updates
+# $main::prog to have the correct program name.
+sub ReadSymbols {
+  my $in = shift;
+  my $map = {};
+  while (<$in>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Removes all the leading zeroes from the symbols, see comment below.
+    if (m/^0x0*([0-9a-f]+)\s+(.+)/) {
+      $map->{$1} = $2;
+    } elsif (m/^---/) {
+      last;
+    } elsif (m/^([a-z][^=]*)=(.*)$/ ) {
+      my ($variable, $value) = ($1, $2);
+      for ($variable, $value) {
+        s/^\s+//;
+        s/\s+$//;
+      }
+      if ($variable eq "binary") {
+        if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) {
+          printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n",
+                         $main::prog, $value);
+        }
+        $main::prog = $value;
+      } else {
+        printf STDERR ("Ignoring unknown variable in symbols list: " .
+ "'%s' = '%s'\n", $variable, $value); + } + } + } + return $map; +} + +sub URLEncode { + my $str = shift; + $str =~ s/([^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%02x", ord $1 /eg; + return $str; +} + +sub AppendSymbolFilterParams { + my $url = shift; + my @params = (); + if ($main::opt_retain ne '') { + push(@params, sprintf("retain=%s", URLEncode($main::opt_retain))); + } + if ($main::opt_exclude ne '') { + push(@params, sprintf("exclude=%s", URLEncode($main::opt_exclude))); + } + if (scalar @params > 0) { + $url = sprintf("%s?%s", $url, join("&", @params)); + } + return $url; +} + +# Fetches and processes symbols to prepare them for use in the profile output +# code. If the optional 'symbol_map' arg is not given, fetches symbols from +# $SYMBOL_PAGE for all PC values found in profile. Otherwise, the raw symbols +# are assumed to have already been fetched into 'symbol_map' and are simply +# extracted and processed. +sub FetchSymbols { + my $pcset = shift; + my $symbol_map = shift; + + my %seen = (); + my @pcs = grep { !$seen{$_}++ } keys(%$pcset); # uniq + + if (!defined($symbol_map)) { + my $post_data = join("+", sort((map {"0x" . "$_"} @pcs))); + + open(POSTFILE, ">$main::tmpfile_sym"); + print POSTFILE $post_data; + close(POSTFILE); + + my $url = SymbolPageURL(); + + my $command_line; + if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) { + $url = ResolveRedirectionForCurl($url); + $url = AppendSymbolFilterParams($url); + $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym", + $url); + } else { + $url = AppendSymbolFilterParams($url); + $command_line = (ShellEscape(@URL_FETCHER, "--post", $url) + . " < " . ShellEscape($main::tmpfile_sym)); + } + # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. 
+ my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"}); + open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line); + $symbol_map = ReadSymbols(*SYMBOL{IO}); + close(SYMBOL); + } + + my $symbols = {}; + foreach my $pc (@pcs) { + my $fullname; + # For 64 bits binaries, symbols are extracted with 8 leading zeroes. + # Then /symbol reads the long symbols in as uint64, and outputs + # the result with a "0x%08llx" format which get rid of the zeroes. + # By removing all the leading zeroes in both $pc and the symbols from + # /symbol, the symbols match and are retrievable from the map. + my $shortpc = $pc; + $shortpc =~ s/^0*//; + # Each line may have a list of names, which includes the function + # and also other functions it has inlined. They are separated (in + # PrintSymbolizedProfile), by --, which is illegal in function names. + my $fullnames; + if (defined($symbol_map->{$shortpc})) { + $fullnames = $symbol_map->{$shortpc}; + } else { + $fullnames = "0x" . $pc; # Just use addresses + } + my $sym = []; + $symbols->{$pc} = $sym; + foreach my $fullname (split("--", $fullnames)) { + my $name = ShortFunctionName($fullname); + push(@{$sym}, $name, "?", $fullname); + } + } + return $symbols; +} + +sub BaseName { + my $file_name = shift; + $file_name =~ s!^.*/!!; # Remove directory name + return $file_name; +} + +sub MakeProfileBaseName { + my ($binary_name, $profile_name) = @_; + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + my $binary_shortname = BaseName($binary_name); + return sprintf("%s.%s.%s", + $binary_shortname, $main::op_time, $host); +} + +sub FetchDynamicProfile { + my $binary_name = shift; + my $profile_name = shift; + my $fetch_name_only = shift; + my $encourage_patience = shift; + + if (!IsProfileURL($profile_name)) { + return $profile_name; + } else { + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + if ($path eq "" || $path eq "/") { + # Missing type specifier defaults to cpu-profile + $path = 
$PROFILE_PAGE; + } + + my $profile_file = MakeProfileBaseName($binary_name, $profile_name); + + my $url = "$baseURL$path"; + my $fetch_timeout = undef; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { + if ($path =~ m/[?]/) { + $url .= "&"; + } else { + $url .= "?"; + } + $url .= sprintf("seconds=%d", $main::opt_seconds); + $fetch_timeout = $main::opt_seconds * 1.01 + 60; + # Set $profile_type for consumption by PrintSymbolizedProfile. + $main::profile_type = 'cpu'; + } else { + # For non-CPU profiles, we add a type-extension to + # the target profile file name. + my $suffix = $path; + $suffix =~ s,/,.,g; + $profile_file .= $suffix; + # Set $profile_type for consumption by PrintSymbolizedProfile. + if ($path =~ m/$HEAP_PAGE/) { + $main::profile_type = 'heap'; + } elsif ($path =~ m/$GROWTH_PAGE/) { + $main::profile_type = 'growth'; + } elsif ($path =~ m/$CONTENTION_PAGE/) { + $main::profile_type = 'contention'; + } + } + + my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof"); + if (! -d $profile_dir) { + mkdir($profile_dir) + || die("Unable to create profile directory $profile_dir: $!\n"); + } + my $tmp_profile = "$profile_dir/.tmp.$profile_file"; + my $real_profile = "$profile_dir/$profile_file"; + + if ($fetch_name_only > 0) { + return $real_profile; + } + + my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER); + my $cmd = ShellEscape(@fetcher, $url) . " > " . 
ShellEscape($tmp_profile); + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ + print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; + if ($encourage_patience) { + print STDERR "Be patient...\n"; + } + } else { + print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; + } + + (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); + (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n"); + print STDERR "Wrote profile to $real_profile\n"; + $main::collected_profile = $real_profile; + return $main::collected_profile; + } +} + +# Collect profiles in parallel +sub FetchDynamicProfiles { + my $items = scalar(@main::pfile_args); + my $levels = log($items) / log(2); + + if ($items == 1) { + $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1); + } else { + # math rounding issues + if ((2 ** $levels) < $items) { + $levels++; + } + my $count = scalar(@main::pfile_args); + for (my $i = 0; $i < $count; $i++) { + $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0); + } + print STDERR "Fetching $count profiles, Be patient...\n"; + FetchDynamicProfilesRecurse($levels, 0, 0); + $main::collected_profile = join(" \\\n ", @main::profile_files); + } +} + +# Recursively fork a process to get enough processes +# collecting profiles +sub FetchDynamicProfilesRecurse { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if (my $pid = fork()) { + $position = 0 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + wait; + } else { + $position = 1 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + cleanup(); + exit(0); + } +} + +# Collect a single profile +sub TryCollectProfile { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if ($level >= ($maxlevel - 1)) { + if ($position < scalar(@main::pfile_args)) { + 
FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0); + } + } else { + FetchDynamicProfilesRecurse($maxlevel, $level+1, $position); + } +} + +##### Parsing code ##### + +# Provide a small streaming-read module to handle very large +# cpu-profile files. Stream in chunks along a sliding window. +# Provides an interface to get one 'slot', correctly handling +# endian-ness differences. A slot is one 32-bit or 64-bit word +# (depending on the input profile). We tell endianness and bit-size +# for the profile by looking at the first 8 bytes: in cpu profiles, +# the second slot is always 3 (we'll accept anything that's not 0). +BEGIN { + package CpuProfileStream; + + sub new { + my ($class, $file, $fname) = @_; + my $self = { file => $file, + base => 0, + stride => 512 * 1024, # must be a multiple of bitsize/8 + slots => [], + unpack_code => "", # N for big-endian, V for little + perl_is_64bit => 1, # matters if profile is 64-bit + }; + bless $self, $class; + # Let unittests adjust the stride + if ($main::opt_test_stride > 0) { + $self->{stride} = $main::opt_test_stride; + } + # Read the first two slots to figure out bitsize and endianness. + my $slots = $self->{slots}; + my $str; + read($self->{file}, $str, 8); + # Set the global $address_length based on what we see here. + # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars). + $address_length = ($str eq (chr(0)x8)) ? 16 : 8; + if ($address_length == 8) { + if (substr($str, 6, 2) eq chr(0)x2) { + $self->{unpack_code} = 'V'; # Little-endian. + } elsif (substr($str, 4, 2) eq chr(0)x2) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**16\n"); + } + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # If we're a 64-bit profile, check if we're a 64-bit-capable + # perl. Otherwise, each slot will be represented as a float + # instead of an int64, losing precision and making all the + # 64-bit addresses wrong. 
We won't complain yet, but will + # later if we ever see a value that doesn't fit in 32 bits. + my $has_q = 0; + eval { $has_q = pack("Q", "1") ? 1 : 1; }; + if (!$has_q) { + $self->{perl_is_64bit} = 0; + } + read($self->{file}, $str, 8); + if (substr($str, 4, 4) eq chr(0)x4) { + # We'd love to use 'Q', but it's a) not universal, b) not endian-proof. + $self->{unpack_code} = 'V'; # Little-endian. + } elsif (substr($str, 0, 4) eq chr(0)x4) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**32\n"); + } + my @pair = unpack($self->{unpack_code} . "*", $str); + # Since we know one of the pair is 0, it's fine to just add them. + @$slots = (0, $pair[0] + $pair[1]); + } + return $self; + } + + # Load more data when we access slots->get(X) which is not yet in memory. + sub overflow { + my ($self) = @_; + my $slots = $self->{slots}; + $self->{base} += $#$slots + 1; # skip over data we're replacing + my $str; + read($self->{file}, $str, $self->{stride}); + if ($address_length == 8) { # the 32-bit case + # This is the easy case: unpack provides 32-bit unpacking primitives. + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # We need to unpack 32 bits at a time and combine. + my @b32_values = unpack($self->{unpack_code} . "*", $str); + my @b64_values = (); + for (my $i = 0; $i < $#b32_values; $i += 2) { + # TODO(csilvers): if this is a 32-bit perl, the math below + # could end up in a too-large int, which perl will promote + # to a double, losing necessary precision. Deal with that. + # Right now, we just die. 
+ my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); + if ($self->{unpack_code} eq 'N') { # big-endian + ($lo, $hi) = ($hi, $lo); + } + my $value = $lo + $hi * (2**32); + if (!$self->{perl_is_64bit} && # check value is exactly represented + (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { + ::error("Need a 64-bit perl to process this 64-bit profile.\n"); + } + push(@b64_values, $value); + } + @$slots = @b64_values; + } + } + + # Access the i-th long in the file (logically), or -1 at EOF. + sub get { + my ($self, $idx) = @_; + my $slots = $self->{slots}; + while ($#$slots >= 0) { + if ($idx < $self->{base}) { + # The only time we expect a reference to $slots[$i - something] + # after referencing $slots[$i] is reading the very first header. + # Since $stride > |header|, that shouldn't cause any lookback + # errors. And everything after the header is sequential. + print STDERR "Unexpected look-back reading CPU profile"; + return -1; # shrug, don't know what better to return + } elsif ($idx > $self->{base} + $#$slots) { + $self->overflow(); + } else { + return $slots->[$idx - $self->{base}]; + } + } + # If we get here, $slots is [], which means we've reached EOF + return -1; # unique since slots is supposed to hold unsigned numbers + } +} + +# Reads the top, 'header' section of a profile, and returns the last +# line of the header, commonly called a 'header line'. The header +# section of a profile consists of zero or more 'command' lines that +# are instructions to jeprof, which jeprof executes when reading the +# header. All 'command' lines start with a %. After the command +# lines is the 'header line', which is a profile-specific line that +# indicates what type of profile it is, and perhaps other global +# information about the profile. For instance, here's a header line +# for a heap profile: +# heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile +# For historical reasons, the CPU profile does not contain a text- +# readable header line. 
If the profile looks like a CPU profile,
+# this function returns "".  If no header line could be found, this
+# function returns undef.
+#
+# The following commands are recognized:
+#   %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:'
+#
+# The input file should be in binmode.
+sub ReadProfileHeader {
+  local *PROFILE = shift;
+  my $firstchar = "";
+  my $line = "";
+  read(PROFILE, $firstchar, 1);
+  seek(PROFILE, -1, 1);          # unread the firstchar
+  if ($firstchar !~ /[[:print:]]/) {  # is not a text character
+    return "";
+  }
+  while (defined($line = <PROFILE>)) {
+    $line =~ s/\r//g;  # turn windows-looking lines into unix-looking lines
+    if ($line =~ /^%warn\s+(.*)/) {  # 'warn' command
+      # Note this matches both '%warn blah\n' and '%warn\n'.
+      print STDERR "WARNING: $1\n";  # print the rest of the line
+    } elsif ($line =~ /^%/) {
+      print STDERR "Ignoring unknown command from profile header: $line";
+    } else {
+      # End of commands, must be the header line.
+      return $line;
+    }
+  }
+  return undef;  # got to EOF without seeing a header line
+}
+
+sub IsSymbolizedProfileFile {
+  my $file_name = shift;
+  if (!(-e $file_name) || !(-r $file_name)) {
+    return 0;
+  }
+  # Check if the file contains a symbol-section marker.
+ open(TFILE, "<$file_name"); + binmode TFILE; + my $firstline = ReadProfileHeader(*TFILE); + close(TFILE); + if (!$firstline) { + return 0; + } + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + return $firstline =~ /^--- *$symbol_marker/; +} + +# Parse profile generated by common/profiler.cc and return a reference +# to a map: +# $result->{version} Version number of profile file +# $result->{period} Sampling period (in microseconds) +# $result->{profile} Profile object +# $result->{threads} Map of thread IDs to profile objects +# $result->{map} Memory map info from profile +# $result->{pcs} Hash of all PC values seen, key is hex address +sub ReadProfile { + my $prog = shift; + my $fname = shift; + my $result; # return value + + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $contention_marker = $&; + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $growth_marker = $&; + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $profile_marker = $&; + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $heap_marker = $&; + + # Look at first line to see if it is a heap or a CPU profile. + # CPU profile may start with no header at all, and just binary data + # (starting with \0\0\0\0) -- in that case, don't try to read the + # whole firstline, since it may be gigabytes(!) of data. 
+ open(PROFILE, "<$fname") || error("$fname: $!\n"); + binmode PROFILE; # New perls do UTF-8 processing + my $header = ReadProfileHeader(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); + } + + my $symbols; + if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } + # Read the symbol section of the symbolized profile file. + $symbols = ReadSymbols(*PROFILE{IO}); + # Read the next line to get the header for the remaining profile. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + if ($header =~ m/^--- *($heap_marker|$growth_marker)/o) { + # Skip "--- ..." line for profile types that have their own headers. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + $main::profile_type = ''; + + if ($header =~ m/^heap profile:.*$growth_marker/o) { + $main::profile_type = 'growth'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap profile:/) { + $main::profile_type = 'heap'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap/) { + $main::profile_type = 'heap'; + $result = ReadThreadedHeapProfile($prog, $fname, $header); + } elsif ($header =~ m/^--- *$contention_marker/o) { + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *Stacks:/) { + print STDERR + "Old format contention profile: mistakenly reports " . 
+ "condition variable signals as lock contentions.\n"; + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *$profile_marker/) { + # the binary cpu profile data starts immediately after this line + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } else { + if (defined($symbols)) { + # a symbolized profile contains a format we don't recognize, bail out + error("$fname: Cannot recognize profile section after symbols.\n"); + } + # no ascii header present -- must be a CPU profile + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } + + close(PROFILE); + + # if we got symbols along with the profile, return those as well + if (defined($symbols)) { + $result->{symbols} = $symbols; + } + + return $result; +} + +# Subtract one from caller pc so we map back to call instr. +# However, don't do this if we're reading a symbolized profile +# file, in which case the subtract-one was done when the file +# was written. +# +# We apply the same logic to all readers, though ReadCPUProfile uses an +# independent implementation. +sub FixCallerAddresses { + my $stack = shift; + # --raw/http: Always subtract one from pc's, because PrintSymbolizedProfile() + # dumps unadjusted profiles. + { + $stack =~ /(\s)/; + my $delimiter = $1; + my @addrs = split(' ', $stack); + my @fixedaddrs; + $#fixedaddrs = $#addrs; + if ($#addrs >= 0) { + $fixedaddrs[0] = $addrs[0]; + } + for (my $i = 1; $i <= $#addrs; $i++) { + $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1"); + } + return join $delimiter, @fixedaddrs; + } +} + +# CPU profile reader +sub ReadCPUProfile { + my $prog = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; + my $version; + my $period; + my $i; + my $profile = {}; + my $pcs = {}; + + # Parse string into array of slots. + my $slots = CpuProfileStream->new(*PROFILE, $fname); + + # Read header. 
The current header version is a 5-element structure + # containing: + # 0: header count (always 0) + # 1: header "words" (after this one: 3) + # 2: format version (0) + # 3: sampling period (usec) + # 4: unused padding (always 0) + if ($slots->get(0) != 0 ) { + error("$fname: not a profile file, or old format profile file\n"); + } + $i = 2 + $slots->get(1); + $version = $slots->get(2); + $period = $slots->get(3); + # Do some sanity checking on these header values. + if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) { + error("$fname: not a profile file, or corrupted profile file\n"); + } + + # Parse profile + while ($slots->get($i) != -1) { + my $n = $slots->get($i++); + my $d = $slots->get($i++); + if ($d > (2**16)) { # TODO(csilvers): what's a reasonable max-stack-depth? + my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8)); + print STDERR "At index $i (address $addr):\n"; + error("$fname: stack trace depth >= 2**32\n"); + } + if ($slots->get($i) == 0) { + # End of profile data marker + $i += $d; + last; + } + + # Make key out of the stack entries + my @k = (); + for (my $j = 0; $j < $d; $j++) { + my $pc = $slots->get($i+$j); + # Subtract one from caller pc so we map back to call instr. 
+      $pc--;
+      $pc = sprintf("%0*x", $address_length, $pc);
+      $pcs->{$pc} = 1;
+      push @k, $pc;
+    }
+
+    AddEntry($profile, (join "\n", @k), $n);
+    $i += $d;
+  }
+
+  # Parse map
+  my $map = '';
+  seek(PROFILE, $i * 4, 0);
+  read(PROFILE, $map, (stat PROFILE)[7]);
+
+  my $r = {};
+  $r->{version} = $version;
+  $r->{period} = $period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+
+  return $r;
+}
+
+sub HeapProfileIndex {
+  my $index = 1;
+  if ($main::opt_inuse_space) {
+    $index = 1;
+  } elsif ($main::opt_inuse_objects) {
+    $index = 0;
+  } elsif ($main::opt_alloc_space) {
+    $index = 3;
+  } elsif ($main::opt_alloc_objects) {
+    $index = 2;
+  }
+  return $index;
+}
+
+sub ReadMappedLibraries {
+  my $fh = shift;
+  my $map = "";
+  # Read the /proc/self/maps data
+  while (<$fh>) {
+    s/\r//g;  # turn windows-looking lines into unix-looking lines
+    $map .= $_;
+  }
+  return $map;
+}
+
+sub ReadMemoryMap {
+  my $fh = shift;
+  my $map = "";
+  # Read /proc/self/maps data as formatted by DumpAddressMap()
+  my $buildvar = "";
+  while (<$fh>) {
+    s/\r//g;  # turn windows-looking lines into unix-looking lines
+    # Parse "build=" specification if supplied
+    if (m/^\s*build=(.*)\n/) {
+      $buildvar = $1;
+    }
+
+    # Expand "$build" variable if available
+    $_ =~ s/\$build\b/$buildvar/g;
+
+    $map .= $_;
+  }
+  return $map;
+}
+
+sub AdjustSamples {
+  my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_;
+  if ($sample_adjustment) {
+    if ($sampling_algorithm == 2) {
+      # Remote-heap version 2
+      # The sampling frequency is the rate of a Poisson process.
+ # This means that the probability of sampling an allocation of + # size X with sampling rate Y is 1 - exp(-X/Y) + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n2 *= $scale_factor; + $s2 *= $scale_factor; + } + } else { + # Remote-heap version 1 + my $ratio; + $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + if ($ratio < 1) { + $n1 /= $ratio; + $s1 /= $ratio; + } + $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + if ($ratio < 1) { + $n2 /= $ratio; + $s2 /= $ratio; + } + } + } + return ($n1, $s1, $n2, $s2); +} + +sub ReadHeapProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $index = HeapProfileIndex(); + + # Find the type of this profile. The header line looks like: + # heap profile: 1246: 8800744 [ 1246: 8800744] @ /266053 + # There are two pairs , the first inuse objects/space, and the + # second allocated objects/space. This is followed optionally by a profile + # type, and if that is present, optionally by a sampling frequency. + # For remote heap profiles (v1): + # The interpretation of the sampling frequency is that the profiler, for + # each sample, calculates a uniformly distributed random integer less than + # the given value, and records the next sample after that many bytes have + # been allocated. Therefore, the expected sample interval is half of the + # given frequency. By default, if not specified, the expected sample + # interval is 128KB. Only remote-heap-page profiles are adjusted for + # sample size. + # For remote heap profiles (v2): + # The sampling frequency is the rate of a Poisson process. 
This means that + # the probability of sampling an allocation of size X with sampling rate Y + # is 1 - exp(-X/Y) + # For version 2, a typical header line might look like this: + # heap profile: 1922: 127792360 [ 1922: 127792360] @ _v2/524288 + # the trailing number (524288) is the sampling rate. (Version 1 showed + # double the 'rate' here) + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") { + if (defined($6) && ($6 ne '')) { + $type = $6; + my $sample_period = $8; + # $type is "heapprofile" for profiles generated by the + # heap-profiler, and either "heap" or "heap_v2" for profiles + # generated by sampling directly within tcmalloc. It can also + # be "growth" for heap-growth profiles. The first is typically + # found for profiles generated locally, and the others for + # remote profiles. + if (($type eq "heapprofile") || ($type !~ /heap/) ) { + # No need to adjust for the sampling rate with heap-profiler-derived data + $sampling_algorithm = 0; + } elsif ($type =~ /_v2/) { + $sampling_algorithm = 2; # version 2 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period); + } + } else { + $sampling_algorithm = 1; # version 1 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period)/2; + } + } + } else { + # We detect whether or not this is a remote-heap profile by checking + # that the total-allocated stats ($n2,$s2) are exactly the + # same as the in-use stats ($n1,$s1). It is remotely conceivable + # that a non-remote-heap profile may pass this check, but it is hard + # to imagine how that could happen. + # In this case it's so old it's guaranteed to be remote-heap version 1. 
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      if (($n1 == $n2) && ($s1 == $s2)) {
+        # This is likely to be a remote-heap based sample profile
+        $sampling_algorithm = 1;
+      }
+    }
+  }
+
+  if ($sampling_algorithm > 0) {
+    # For remote-heap generated profiles, adjust the counts and sizes to
+    # account for the sample rate (we sample once every 128KB by default).
+    if ($sample_adjustment == 0) {
+      # Turn on profile adjustment.
+      $sample_adjustment = 128*1024;
+      print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
+    } else {
+      printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
+                     $sample_adjustment);
+    }
+    if ($sampling_algorithm > 1) {
+      # We don't bother printing anything for the original version (version 1)
+      printf STDERR "Heap version $sampling_algorithm\n";
+    }
+  }
+
+  my $profile = {};
+  my $pcs = {};
+  my $map = "";
+
+  while (<PROFILE>) {
+    s/\r//g;  # turn windows-looking lines into unix-looking lines
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    #  <count1>: <bytes1> [<count2>: <bytes2>] @ a1 a2 a3 ... an
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
+      my $stack = $5;
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadThreadedHeapProfile {
+  my ($prog, $fname, $header) = @_;
+
+  my $index = HeapProfileIndex();
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  # Assuming a very specific type of header for now.
+  if ($header =~ m"^heap_v2/(\d+)") {
+    $type = "_v2";
+    $sampling_algorithm = 2;
+    $sample_adjustment = int($1);
+  }
+  if ($type ne "_v2" || !defined($sample_adjustment)) {
+    die "Threaded heap profiles require v2 sampling with a sample rate\n";
+  }
+
+  my $profile = {};
+  my $thread_profiles = {};
+  my $pcs = {};
+  my $map = "";
+  my $stack = "";
+
+  while (<PROFILE>) {
+    s/\r//g;
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    # @ a1 a2 ... an
+    #   t*: <count1>: <bytes1> [<count2>: <bytes2>]
+    #   t1: <count1>: <bytes1> [<count2>: <bytes2>]
+    #     ...
+    #   tn: <count1>: <bytes1> [<count2>: <bytes2>]
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^@\s+(.*)$/) {
+      $stack = $1;
+    } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) {
+      if ($stack eq "") {
+        # Still in the header, so this is just a per-thread summary.
+        next;
+      }
+      my $thread = $2;
+      my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      if ($thread eq "*") {
+        AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+      } else {
+        if (!exists($thread_profiles->{$thread})) {
+          $thread_profiles->{$thread} = {};
+        }
+        AddEntries($thread_profiles->{$thread}, $pcs,
+                   FixCallerAddresses($stack), $counts[$index]);
+      }
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{threads} = $thread_profiles;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadSynchProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $map = '';
+  my $profile = {};
+  my $pcs = {};
+  my $sampling_period = 1;
+  my $cyclespernanosec = 2.8;   # Default assumption for old binaries
+  my $seen_clockrate = 0;
+  my $line;
+
+  my $index = 0;
+  if ($main::opt_total_delay) {
+    $index = 0;
+  } elsif ($main::opt_contentions) {
+    $index = 1;
+  } elsif ($main::opt_mean_delay) {
+    $index = 2;
+  }
+ + while ( $line = ) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $count, $stack) = ($1, $2, $3); + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + $count *= $sampling_period; + + my @values = ($cycles, $count, $cycles / $count); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]); + + } elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ || + $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $stack) = ($1, $2); + if ($cycles !~ /^\d+$/) { + next; + } + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + + AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles); + + } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1,$2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "cycles/second") { + $cyclespernanosec = $value / 1e9; + $seen_clockrate = 1; + } elsif ($variable eq "sampling period") { + $sampling_period = $value; + } elsif ($variable eq "ms since reset") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } elsif ($variable eq "discarded samples") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } else { + printf STDERR ("Ignoring unnknown variable in /contention output: " . 
+ "'%s' = '%s'\n",$variable,$value); + } + } else { + # Memory map entry + $map .= $line; + } + } + + if (!$seen_clockrate) { + printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", + $cyclespernanosec); + } + + my $r = {}; + $r->{version} = 0; + $r->{period} = $sampling_period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +# Given a hex value in the form "0x1abcd" or "1abcd", return either +# "0001abcd" or "000000000001abcd", depending on the current (global) +# address length. +sub HexExtend { + my $addr = shift; + + $addr =~ s/^(0x)?0*//; + my $zeros_needed = $address_length - length($addr); + if ($zeros_needed < 0) { + printf STDERR "Warning: address $addr is longer than address length $address_length\n"; + return $addr; + } + return ("0" x $zeros_needed) . $addr; +} + +##### Symbol extraction ##### + +# Aggressively search the lib_prefix values for the given library +# If all else fails, just return the name of the library unmodified. +# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so" +# it will search the following locations in this order, until it finds a file: +# /my/path/lib/dir/mylib.so +# /other/path/lib/dir/mylib.so +# /my/path/dir/mylib.so +# /other/path/dir/mylib.so +# /my/path/mylib.so +# /other/path/mylib.so +# /lib/dir/mylib.so (returned as last resort) +sub FindLibrary { + my $file = shift; + my $suffix = $file; + + # Search for the library as described above + do { + foreach my $prefix (@prefix_list) { + my $fullpath = $prefix . $suffix; + if (-e $fullpath) { + return $fullpath; + } + } + } while ($suffix =~ s|^/[^/]+/|/|); + return $file; +} + +# Return path to library with debugging symbols. +# For libc libraries, the copy in /usr/lib/debug contains debugging symbols +sub DebuggingLibrary { + my $file = shift; + + if ($file !~ m|^/|) { + return undef; + } + + # Find debug symbol file if it's named after the library's name. 
+ + if (-f "/usr/lib/debug$file") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; } + return "/usr/lib/debug$file"; + } elsif (-f "/usr/lib/debug$file.debug") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; } + return "/usr/lib/debug$file.debug"; + } + + if(!$main::opt_debug_syms_by_id) { + if($main::opt_debug) { print STDERR "no debug symbols found for $file\n" }; + return undef; + } + + # Find debug file if it's named after the library's build ID. + + my $readelf = ''; + if (!$main::gave_up_on_elfutils) { + $readelf = qx/eu-readelf -n ${file}/; + if ($?) { + print STDERR "Cannot run eu-readelf. To use --debug-syms-by-id you must be on Linux, with elfutils installed.\n"; + $main::gave_up_on_elfutils = 1; + return undef; + } + my $buildID = $1 if $readelf =~ /Build ID: ([A-Fa-f0-9]+)/s; + if (defined $buildID && length $buildID > 0) { + my $symbolFile = '/usr/lib/debug/.build-id/' . substr($buildID, 0, 2) . '/' . substr($buildID, 2) . '.debug'; + if (-e $symbolFile) { + if($main::opt_debug) { print STDERR "found debug symbol file $symbolFile for $file\n" }; + return $symbolFile; + } else { + if($main::opt_debug) { print STDERR "no debug symbol file found for $file, build ID: $buildID\n" }; + return undef; + } + } + } + + if($main::opt_debug) { print STDERR "no debug symbols found for $file, build ID unknown\n" }; + return undef; +} + + +# Parse text section header of a library using objdump +sub ParseTextSectionHeaderFromObjdump { + my $lib = shift; + + my $size = undef; + my $vma; + my $file_offset; + # Get objdump output from the library file to figure out how to + # map between mapped addresses and addresses in the library. 
+ my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Idx Name Size VMA LMA File off Algn + # 10 .text 00104b2c 420156f0 420156f0 000156f0 2**4 + # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file + # offset may still be 8. But AddressSub below will still handle that. + my @x = split; + if (($#x >= 6) && ($x[1] eq '.text')) { + $size = $x[2]; + $vma = $x[3]; + $file_offset = $x[5]; + last; + } + } + close(OBJDUMP); + + if (!defined($size)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +# Parse text section header of a library using otool (on OS X) +sub ParseTextSectionHeaderFromOtool { + my $lib = shift; + + my $size = undef; + my $vma = undef; + my $file_offset = undef; + # Get otool output from the library file to figure out how to + # map between mapped addresses and addresses in the library. + my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib); + open(OTOOL, "$command |") || error("$command: $!\n"); + my $cmd = ""; + my $sectname = ""; + my $segname = ""; + foreach my $line () { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + # Load command <#> + # cmd LC_SEGMENT + # [...] + # Section + # sectname __text + # segname __TEXT + # addr 0x000009f8 + # size 0x00018b9e + # offset 2552 + # align 2^2 (4) + # We will need to strip off the leading 0x from the hex addresses, + # and convert the offset into hex. 
+ if ($line =~ /Load command/) { + $cmd = ""; + $sectname = ""; + $segname = ""; + } elsif ($line =~ /Section/) { + $sectname = ""; + $segname = ""; + } elsif ($line =~ /cmd (\w+)/) { + $cmd = $1; + } elsif ($line =~ /sectname (\w+)/) { + $sectname = $1; + } elsif ($line =~ /segname (\w+)/) { + $segname = $1; + } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") && + $sectname eq "__text" && + $segname eq "__TEXT")) { + next; + } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) { + $vma = $1; + } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) { + $size = $1; + } elsif ($line =~ /\boffset ([0-9]+)/) { + $file_offset = sprintf("%016x", $1); + } + if (defined($vma) && defined($size) && defined($file_offset)) { + last; + } + } + close(OTOOL); + + if (!defined($vma) || !defined($size) || !defined($file_offset)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +sub ParseTextSectionHeader { + # obj_tool_map("otool") is only defined if we're in a Mach-O environment + if (defined($obj_tool_map{"otool"})) { + my $r = ParseTextSectionHeaderFromOtool(@_); + if (defined($r)){ + return $r; + } + } + # If otool doesn't work, or we don't have it, fall back to objdump + return ParseTextSectionHeaderFromObjdump(@_); +} + +# Split /proc/pid/maps dump into a list of libraries +sub ParseLibraries { + return if $main::use_symbol_page; # We don't need libraries info. + my $prog = Cwd::abs_path(shift); + my $map = shift; + my $pcs = shift; + + my $result = []; + my $h = "[a-f0-9]+"; + my $zero_offset = HexExtend("0"); + + my $buildvar = ""; + foreach my $l (split("\n", $map)) { + if ($l =~ m/^\s*build=(.*)$/) { + $buildvar = $1; + } + + my $start; + my $finish; + my $offset; + my $lib; + if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) { + # Full line from /proc/self/maps. 
Example: + # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { + # Cooked line from DumpAddressMap. Example: + # 40000000-40015000: /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = $3; + } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) { + # PIEs and address space randomization do not play well with our + # default assumption that main executable is at lowest + # addresses. So we're detecting main executable in + # /proc/self/maps as well. + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } + # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in + # function procfs_doprocmap (sys/fs/procfs/procfs_map.c) + # + # Example: + # 0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC vnode /libexec/ld-elf.s + # o.1 NCH -1 + elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) { + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = FindLibrary($5); + + } else { + next; + } + + # Expand "$build" variable if available + $lib =~ s/\$build\b/$buildvar/g; + + $lib = FindLibrary($lib); + + # Check for pre-relocated libraries, which use pre-relocated symbol tables + # and thus require adjusting the offset that we'll use to translate + # VM addresses into symbol table addresses. + # Only do this if we're not going to fetch the symbol table from a + # debugging copy of the library. 
+ if (!DebuggingLibrary($lib)) { + my $text = ParseTextSectionHeader($lib); + if (defined($text)) { + my $vma_offset = AddressSub($text->{vma}, $text->{file_offset}); + $offset = AddressAdd($offset, $vma_offset); + } + } + + if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; } + push(@{$result}, [$lib, $start, $finish, $offset]); + } + + # Append special entry for additional library (not relocated) + if ($main::opt_lib ne "") { + my $text = ParseTextSectionHeader($main::opt_lib); + if (defined($text)) { + my $start = $text->{vma}; + my $finish = AddressAdd($start, $text->{size}); + + push(@{$result}, [$main::opt_lib, $start, $finish, $start]); + } + } + + # Append special entry for the main program. This covers + # 0..max_pc_value_seen, so that we assume pc values not found in one + # of the library ranges will be treated as coming from the main + # program binary. + my $min_pc = HexExtend("0"); + my $max_pc = $min_pc; # find the maximal PC value in any sample + foreach my $pc (keys(%{$pcs})) { + if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); } + } + push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]); + + return $result; +} + +# Add two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressAdd { + my $addr1 = shift; + my $addr2 = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. 
+ + if ($main::opt_debug and $main::opt_test) { + print STDERR "AddressAdd $addr1 + $addr2 = "; + } + + my $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + my $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2); + my $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + my $r = sprintf("%07x", $sum); + + $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2) + $c; + $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + $r = sprintf("%07x", $sum) . $r; + + $sum = hex($addr1) + hex($addr2) + $c; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; } + + return $r; + } +} + + +# Subtract two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressSub { + my $addr1 = shift; + my $addr2 = shift; + my $diff; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $diff); + + } else { + # Do the addition in 7-nibble chunks to trivialize borrow handling. + # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; } + + my $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + my $a2 = hex(substr($addr2,-7)); + $addr2 = substr($addr2,0,-7); + my $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + my $r = sprintf("%07x", $diff); + + $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + $a2 = hex(substr($addr2,-7)) + $b; + $addr2 = substr($addr2,0,-7); + $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + $r = sprintf("%07x", $diff) . 
$r; + + $a1 = hex($addr1); + $a2 = hex($addr2) + $b; + if ($a2 > $a1) { $a1 += 0x100; } + $diff = $a1 - $a2; + $r = sprintf("%02x", $diff) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + + return $r; + } +} + +# Increment a hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressInc { + my $addr = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr)+1) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + # We are always doing this to step through the addresses in a function, + # and will almost never overflow the first chunk, so we check for this + # case and exit early. + + # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; } + + my $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + my $r = sprintf("%07x", $sum); + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "0000000"; + } + + $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + $r = sprintf("%07x", $sum) . $r; + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "00000000000000"; + } + + $sum = hex($addr) + 1; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + return $r; + } +} + +# Extract symbols for all PC values found in profile +sub ExtractSymbols { + my $libs = shift; + my $pcset = shift; + + my $symbols = {}; + + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. 
Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { + my $libname = $lib->[0]; + my $start = $lib->[1]; + my $finish = $lib->[2]; + my $offset = $lib->[3]; + + # Use debug library if it exists + my $debug_libname = DebuggingLibrary($libname); + if ($debug_libname) { + $libname = $debug_libname; + } + + # Get list of pcs that belong in this library. + my $contained = []; + my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; + } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; + } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. 
+ @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); + # Map to symbols + MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); + } + + return $symbols; +} + +# Map list of PC values to symbols for a given image +sub MapToSymbols { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + my $debug = 0; + + # Ignore empty binaries + if ($#{$pclist} < 0) { return; } + + # Figure out the addr2line command to use + my $addr2line = $obj_tool_map{"addr2line"}; + my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image); + if (exists $obj_tool_map{"addr2line_pdb"}) { + $addr2line = $obj_tool_map{"addr2line_pdb"}; + $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image); + } + + # If "addr2line" isn't installed on the system at all, just use + # nm to get what info we can (function names, but not line numbers). + if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) { + MapSymbolsWithNM($image, $offset, $pclist, $symbols); + return; + } + + # "addr2line -i" can produce a variable number of lines per input + # address, with no separator that allows us to tell when data for + # the next address starts. So we find the address for a special + # symbol (_fini) and interleave this address between all real + # addresses passed to addr2line. The name of this special symbol + # can then be used as a separator. + $sep_address = undef; # May be filled in by MapSymbolsWithNM() + my $nm_symbols = {}; + MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols); + if (defined($sep_address)) { + # Only add " -i" to addr2line if the binary supports it. + # addr2line --help returns 0, but not if it sees an unknown flag first. 
+ if (system("$cmd -i --help >$dev_null 2>&1") == 0) { + $cmd .= " -i"; + } else { + $sep_address = undef; # no need for sep_address if we don't support -i + } + } + + # Make file with all PC values with intervening 'sep_address' so + # that we can reliably detect the end of inlined function list + open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n"); + if ($debug) { print("---- $image ---\n"); } + for (my $i = 0; $i <= $#{$pclist}; $i++) { + # addr2line always reads hex addresses, and does not need '0x' prefix. + if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); } + printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset)); + if (defined($sep_address)) { + printf ADDRESSES ("%s\n", $sep_address); + } + } + close(ADDRESSES); + if ($debug) { + print("----\n"); + system("cat", $main::tmpfile_sym); + print("----\n"); + system("$cmd < " . ShellEscape($main::tmpfile_sym)); + print("----\n"); + } + + open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |") + || error("$cmd: $!\n"); + my $count = 0; # Index in pclist + while () { + # Read fullfunction and filelineinfo from next pair of lines + s/\r?\n$//g; + my $fullfunction = $_; + $_ = ; + s/\r?\n$//g; + my $filelinenum = $_; + + if (defined($sep_address) && $fullfunction eq $sep_symbol) { + # Terminating marker for data for this address + $count++; + next; + } + + $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths + + my $pcstr = $pclist->[$count]; + my $function = ShortFunctionName($fullfunction); + my $nms = $nm_symbols->{$pcstr}; + if (defined($nms)) { + if ($fullfunction eq '??') { + # nm found a symbol for us. + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } else { + # MapSymbolsWithNM tags each routine with its starting address, + # useful in case the image has multiple occurrences of this + # routine. (It uses a syntax that resembles template parameters, + # that are automatically stripped out by ShortFunctionName().) 
+ # addr2line does not provide the same information. So we check + # if nm disambiguated our symbol, and if so take the annotated + # (nm) version of the routine-name. TODO(csilvers): this won't + # catch overloaded, inlined symbols, which nm doesn't see. + # Better would be to do a check similar to nm's, in this fn. + if ($nms->[2] =~ m/^\Q$function\E/) { # sanity check it's the right fn + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } + } + } + + # Prepend to accumulated symbols for pcstr + # (so that caller comes before callee) + my $sym = $symbols->{$pcstr}; + if (!defined($sym)) { + $sym = []; + $symbols->{$pcstr} = $sym; + } + unshift(@{$sym}, $function, $filelinenum, $fullfunction); + if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); } + if (!defined($sep_address)) { + # Inlining is off, so this entry ends immediately + $count++; + } + } + close(SYMBOLS); +} + +# Use nm to map the list of referenced PCs to symbols. Return true iff we +# are able to read procedure information via nm. +sub MapSymbolsWithNM { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + # Get nm output sorted by increasing address + my $symbol_table = GetProcedureBoundaries($image, "."); + if (!%{$symbol_table}) { + return 0; + } + # Start addresses are already the right length (8 or 16 hex digits). + my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] } + keys(%{$symbol_table}); + + if ($#names < 0) { + # No symbols: just use addresses + foreach my $pc (@{$pclist}) { + my $pcstr = "0x" . 
$pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + return 0; + } + + # Sort addresses so we can do a join against nm output + my $index = 0; + my $fullname = $names[0]; + my $name = ShortFunctionName($fullname); + foreach my $pc (sort { $a cmp $b } @{$pclist}) { + # Adjust for mapped offset + my $mpc = AddressSub($pc, $offset); + while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){ + $index++; + $fullname = $names[$index]; + $name = ShortFunctionName($fullname); + } + if ($mpc lt $symbol_table->{$fullname}->[1]) { + $symbols->{$pc} = [$name, "?", $fullname]; + } else { + my $pcstr = "0x" . $pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + } + return 1; +} + +sub ShortFunctionName { + my $function = shift; + while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types + while ($function =~ s/<[^<>]*>//g) { } # Remove template arguments + $function =~ s/^.*\s+(\w+::)/$1/; # Remove leading type + return $function; +} + +# Trim overly long symbols found in disassembler output +sub CleanDisassembly { + my $d = shift; + while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax) + while ($d =~ s/(\w+)<[^<>]*>/$1/g) { } # Remove template arguments + return $d; +} + +# Clean file name for display +sub CleanFileName { + my ($f) = @_; + $f =~ s|^/proc/self/cwd/||; + $f =~ s|^\./||; + return $f; +} + +# Make address relative to section and clean up for display +sub UnparseAddress { + my ($offset, $address) = @_; + $address = AddressSub($address, $offset); + $address =~ s/^0x//; + $address =~ s/^0*//; + return $address; +} + +##### Miscellaneous ##### + +# Find the right versions of the above object tools to use. The +# argument is the program file being analyzed, and should be an ELF +# 32-bit or ELF 64-bit executable file. 
The location of the tools +# is determined by considering the following options in this order: +# 1) --tools option, if set +# 2) JEPROF_TOOLS environment variable, if set +# 3) the environment +sub ConfigureObjTools { + my $prog_file = shift; + + # Check for the existence of $prog_file because /usr/bin/file does not + # predictably return error status in prod. + (-e $prog_file) || error("$prog_file does not exist.\n"); + + my $file_type = undef; + if (-e "/usr/bin/file") { + # Follow symlinks (at least for systems where "file" supports that). + my $escaped_prog_file = ShellEscape($prog_file); + $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null || + /usr/bin/file $escaped_prog_file`; + } elsif ($^O == "MSWin32") { + $file_type = "MS Windows"; + } else { + print STDERR "WARNING: Can't determine the file type of $prog_file"; + } + + if ($file_type =~ /64-bit/) { + # Change $address_length to 16 if the program file is ELF 64-bit. + # We can't detect this from many (most?) heap or lock contention + # profiles, since the actual addresses referenced are generally in low + # memory even for 64-bit programs. + $address_length = 16; + } + + if ($file_type =~ /MS Windows/) { + # For windows, we provide a version of nm and addr2line as part of + # the opensource release, which is capable of parsing + # Windows-style PDB executables. It should live in the path, or + # in the same directory as jeprof. + $obj_tool_map{"nm_pdb"} = "nm-pdb"; + $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb"; + } + + if ($file_type =~ /Mach-O/) { + # OS X uses otool to examine Mach-O files, rather than objdump. + $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump + } + + # Go fill in %obj_tool_map with the pathnames to use: + foreach my $tool (keys %obj_tool_map) { + $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool}); + } +} + +# Returns the path of a caller-specified object tool. 
If --tools or +# JEPROF_TOOLS are specified, then returns the full path to the tool +# with that prefix. Otherwise, returns the path unmodified (which +# means we will look for it on PATH). +sub ConfigureTool { + my $tool = shift; + my $path; + + # --tools (or $JEPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$JEPROF_TOOLS) '$tools'\n"); + } + } else { + # ... otherwise use the version that exists in the same directory as + # jeprof. If there's nothing there, use $PATH. + $0 =~ m,[^/]*$,; # this is everything after the last slash + my $dirname = $`; # this is everything up to and including the last slash + if (-x "$dirname$tool") { + $path = "$dirname$tool"; + } else { + $path = $tool; + } + } + if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; } + return $path; +} + +sub ShellEscape { + my @escaped_words = (); + foreach my $word (@_) { + my $escaped_word = $word; + if ($word =~ m![^a-zA-Z0-9/.,_=-]!) 
{ # check for anything not in whitelist + $escaped_word =~ s/'/'\\''/; + $escaped_word = "'$escaped_word'"; + } + push(@escaped_words, $escaped_word); + } + return join(" ", @escaped_words); +} + +sub cleanup { + unlink($main::tmpfile_sym); + unlink(keys %main::tempnames); + + # We leave any collected profiles in $HOME/jeprof in case the user wants + # to look at them later. We print a message informing them of this. + if ((scalar(@main::profile_files) > 0) && + defined($main::collected_profile)) { + if (scalar(@main::profile_files) == 1) { + print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; + } + print STDERR "If you want to investigate this profile further, you can do:\n"; + print STDERR "\n"; + print STDERR " jeprof \\\n"; + print STDERR " $main::prog \\\n"; + print STDERR " $main::collected_profile\n"; + print STDERR "\n"; + } +} + +sub sighandler { + cleanup(); + exit(1); +} + +sub error { + my $msg = shift; + print STDERR $msg; + cleanup(); + exit(1); +} + + +# Run $nm_command and get all the resulting procedure boundaries whose +# names match "$regexp" and returns them in a hashtable mapping from +# procedure name to a two-element vector of [start address, end address] +sub GetProcedureBoundariesViaNm { + my $escaped_nm_command = shift; # shell-escaped + my $regexp = shift; + + my $symbol_table = {}; + open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n"); + my $last_start = "0"; + my $routine = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (m/^\s*([0-9a-f]+) (.) (..*)/) { + my $start_val = $1; + my $type = $2; + my $this_routine = $3; + + # It's possible for two symbols to share the same address, if + # one is a zero-length variable (like __start_google_malloc) or + # one symbol is a weak alias to another (like __libc_malloc). + # In such cases, we want to ignore all values except for the + # actual symbol, which in nm-speak has type "T". 
The logic + # below does this, though it's a bit tricky: what happens when + # we have a series of lines with the same address, is the first + # one gets queued up to be processed. However, it won't + # *actually* be processed until later, when we read a line with + # a different address. That means that as long as we're reading + # lines with the same address, we have a chance to replace that + # item in the queue, which we do whenever we see a 'T' entry -- + # that is, a line with type 'T'. If we never see a 'T' entry, + # we'll just go ahead and process the first entry (which never + # got touched in the queue), and ignore the others. + if ($start_val eq $last_start && $type =~ /t/i) { + # We are the 'T' symbol at this address, replace previous symbol. + $routine = $this_routine; + next; + } elsif ($start_val eq $last_start) { + # We're not the 'T' symbol at this address, so ignore us. + next; + } + + if ($this_routine eq $sep_symbol) { + $sep_address = HexExtend($start_val); + } + + # Tag this routine with the starting address in case the image + # has multiple occurrences of this routine. We use a syntax + # that resembles template parameters that are automatically + # stripped out by ShortFunctionName() + $this_routine .= "<$start_val>"; + + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($start_val)]; + } + $last_start = $start_val; + $routine = $this_routine; + } elsif (m/^Loaded image name: (.+)/) { + # The win32 nm workalike emits information about the binary it is using. + if ($main::opt_debug) { print STDERR "Using Image $1\n"; } + } elsif (m/^PDB file name: (.+)/) { + # The win32 nm workalike emits information about the pdb it is using. + if ($main::opt_debug) { print STDERR "Using PDB $1\n"; } + } + } + close(NM); + # Handle the last line in the nm output. Unfortunately, we don't know + # how big this last symbol is, because we don't know how big the file + # is. 
For now, we just give it a size of 0. + # TODO(csilvers): do better here. + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($last_start)]; + } + return $symbol_table; +} + +# Gets the procedure boundaries for all routines in "$image" whose names +# match "$regexp" and returns them in a hashtable mapping from procedure +# name to a two-element vector of [start address, end address]. +# Will return an empty map if nm is not installed or not working properly. +sub GetProcedureBoundaries { + my $image = shift; + my $regexp = shift; + + # If $image doesn't start with /, then put ./ in front of it. This works + # around an obnoxious bug in our probing of nm -f behavior. + # "nm -f $image" is supposed to fail on GNU nm, but if: + # + # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND + # b. you have a.out in your current directory (a not uncommon occurrence) + # + # then "nm -f $image" succeeds because -f only looks at the first letter of + # the argument, which looks valid because it's [BbSsPp], and then since + # there's no image provided, it looks for a.out and finds it. + # + # This regex makes sure that $image starts with . or /, forcing the -f + # parsing to fail since . and / are not valid formats. + $image =~ s#^[^/]#./$&#; + + # For libc libraries, the copy in /usr/lib/debug contains debugging symbols + my $debugging = DebuggingLibrary($image); + if ($debugging) { + $image = $debugging; + } + + my $nm = $obj_tool_map{"nm"}; + my $cppfilt = $obj_tool_map{"c++filt"}; + + # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm + # binary doesn't support --demangle. In addition, for OS X we need + # to use the -f flag to get 'flat' nm output (otherwise we don't sort + # properly and get incorrect results). Unfortunately, GNU nm uses -f + # in an incompatible way. So first we test whether our nm supports + # --demangle and -f. 
+ my $demangle_flag = ""; + my $cppfilt_flag = ""; + my $to_devnull = ">$dev_null 2>&1"; + if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) { + # In this mode, we do "nm --demangle " + $demangle_flag = "--demangle"; + $cppfilt_flag = ""; + } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) { + # In this mode, we do "nm | c++filt" + $cppfilt_flag = " | " . ShellEscape($cppfilt); + }; + my $flatten_flag = ""; + if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) { + $flatten_flag = "-f"; + } + + # Finally, in the case $imagie isn't a debug library, we try again with + # -D to at least get *exported* symbols. If we can't use --demangle, + # we use c++filt instead, if it exists on this system. + my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + # 6nm is for Go binaries + ShellEscape("6nm", "$image") . " 2>$dev_null | sort", + ); + + # If the executable is an MS Windows PDB-format executable, we'll + # have set up obj_tool_map("nm_pdb"). In this case, we actually + # want to use both unix nm and windows-specific nm_pdb, since + # PDB-format executables can apparently include dwarf .o files. + if (exists $obj_tool_map{"nm_pdb"}) { + push(@nm_commands, + ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image) + . " 2>$dev_null"); + } + + foreach my $nm_command (@nm_commands) { + my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp); + return $symbol_table if (%{$symbol_table}); + } + my $symbol_table = {}; + return $symbol_table; +} + + +# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. +# To make them more readable, we add underscores at interesting places. +# This routine removes the underscores, producing the canonical representation +# used by jeprof to represent addresses, particularly in the tested routines. 
+sub CanonicalHex { + my $arg = shift; + return join '', (split '_',$arg); +} + + +# Unit test for AddressAdd: +sub AddressAddUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd ($row->[0], $row->[1]); + if ($sum ne $row->[2]) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + my $expected = join '', (split '_',$row->[2]); + if ($sum ne CanonicalHex($row->[2])) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressSub: +sub AddressSubUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. 
Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub ($row->[0], $row->[1]); + if ($sum ne $row->[3]) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + if ($sum ne CanonicalHex($row->[3])) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressInc: +sub AddressIncUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. 
Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc ($row->[0]); + if ($sum ne $row->[4]) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc (CanonicalHex($row->[0])); + if ($sum ne CanonicalHex($row->[4])) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Driver for unit tests. +# Currently just the address add/subtract/increment routines for 64-bit. +sub RunUnitTests { + my $error_count = 0; + + # This is a list of tuples [a, b, a+b, a-b, a+1] + my $unit_test_data_8 = [ + [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], + [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], + [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], + [qw(00000001 ffffffff 00000000 00000002 00000002)], + [qw(00000001 fffffff0 fffffff1 00000011 00000002)], + ]; + my $unit_test_data_16 = [ + # The implementation handles data in 7-nibble chunks, so those are the + # interesting boundaries. 
+ [qw(aaaaaaaa 50505050 + 00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)], + [qw(50505050 aaaaaaaa + 00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)], + [qw(ffffffff aaaaaaaa + 00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)], + [qw(00000001 ffffffff + 00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)], + [qw(00000001 fffffff0 + 00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)], + + [qw(00_a00000a_aaaaaaa 50505050 + 00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)], + [qw(0f_fff0005_0505050 aaaaaaaa + 0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)], + [qw(00_000000f_fffffff 01_800000a_aaaaaaa + 01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)], + [qw(00_0000000_0000001 ff_fffffff_fffffff + 00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)], + [qw(00_0000000_0000001 ff_fffffff_ffffff0 + ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)], + ]; + + $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16); + if ($error_count > 0) { + print STDERR $error_count, " errors: FAILED\n"; + } else { + print STDERR "PASS\n"; + } + exit ($error_count); +} \ No newline at end of file diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 3e68b0b6310..b76454ffab8 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -40,11 +40,7 @@ use openssl::{ x509::X509, }; use pin_project::pin_project; -pub use profile::{ - activate_heap_profile, deactivate_heap_profile, heap_profiles_dir, jeprof_heap_profile, - list_heap_profiles, read_file, start_one_cpu_profile, start_one_heap_profile, - HEAP_PROFILE_REGEX, -}; +use profile::*; use prometheus::TEXT_FORMAT; use regex::Regex; use resource_control::ResourceGroupManager; @@ -170,16 +166,22 @@ where Ok(val) => val, Err(err) 
=> return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())), }, - None => 60, + None => 0, }; - let interval = Duration::from_secs(interval); - let period = GLOBAL_TIMER_HANDLE - .interval(Instant::now() + interval, interval) - .compat() - .map_ok(|_| ()) - .map_err(|_| TIMER_CANCELED.to_owned()) - .into_stream(); + let period = if interval == 0 { + None + } else { + let interval = Duration::from_secs(interval); + Some( + GLOBAL_TIMER_HANDLE + .interval(Instant::now() + interval, interval) + .compat() + .map_ok(|_| ()) + .map_err(|_| TIMER_CANCELED.to_owned()) + .into_stream(), + ) + }; let (tx, rx) = oneshot::channel(); let callback = move || tx.send(()).unwrap_or_default(); let res = Handle::current().spawn(activate_heap_profile(period, store_path, callback)); @@ -201,7 +203,6 @@ where Ok(make_response(StatusCode::OK, body)) } - #[allow(dead_code)] async fn dump_heap_prof_to_resp(req: Request) -> hyper::Result> { let query = req.uri().query().unwrap_or(""); let query_pairs: HashMap<_, _> = url::form_urlencoded::parse(query.as_bytes()).collect(); @@ -239,21 +240,7 @@ where return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); } } else { - let mut seconds = 10; - if let Some(s) = query_pairs.get("seconds") { - match s.parse() { - Ok(val) => seconds = val, - Err(_) => { - let errmsg = "request should have seconds argument".to_owned(); - return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); - } - } - } - let timer = GLOBAL_TIMER_HANDLE.delay(Instant::now() + Duration::from_secs(seconds)); - let end = Compat01As03::new(timer) - .map_err(|_| TIMER_CANCELED.to_owned()) - .into_future(); - start_one_heap_profile(end, use_jeprof).await + dump_one_heap_profile() }; match result { diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index dd49c394046..3941c6c12b6 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -1,11 +1,11 @@ // Copyright 2021 TiKV Project Authors. 
Licensed under Apache-2.0. use std::{ fs::{File, Metadata}, - io::Read, + io::{Read, Write}, path::PathBuf, pin::Pin, - process::Command, - sync::Mutex as StdMutex, + process::{Command, Stdio}, + sync::Mutex, time::{Duration, UNIX_EPOCH}, }; @@ -23,7 +23,6 @@ use regex::Regex; use tempfile::{NamedTempFile, TempDir}; #[cfg(not(test))] use tikv_alloc::{activate_prof, deactivate_prof, dump_prof}; -use tokio::sync::{Mutex, MutexGuard}; #[cfg(test)] pub use self::test_utils::TEST_PROFILE_MUTEX; @@ -35,10 +34,10 @@ pub const HEAP_PROFILE_SUFFIX: &str = ".heap"; pub const HEAP_PROFILE_REGEX: &str = r"^[0-9]{6,6}\.heap$"; lazy_static! { - // If it's locked it means there are already a heap or CPU profiling. - static ref PROFILE_MUTEX: Mutex<()> = Mutex::new(()); - // The channel is used to deactivate a profiling. - static ref PROFILE_ACTIVE: StdMutex, TempDir)>> = StdMutex::new(None); + // If it's some it means there are already a CPU profiling. + static ref CPU_PROFILE_ACTIVE: Mutex> = Mutex::new(None); + // If it's some it means there are already a heap profiling. The channel is used to deactivate a profiling. + static ref HEAP_PROFILE_ACTIVE: Mutex>, TempDir)>> = Mutex::new(None); // To normalize thread names. static ref THREAD_NAME_RE: Regex = @@ -48,32 +47,26 @@ lazy_static! 
{ type OnEndFn = Box Result + Send + 'static>; -struct ProfileGuard<'a, I, T> { - _guard: MutexGuard<'a, ()>, +struct ProfileRunner { item: Option, on_end: Option>, end: BoxFuture<'static, Result<(), String>>, } -impl<'a, I, T> Unpin for ProfileGuard<'a, I, T> {} +impl Unpin for ProfileRunner {} -impl<'a, I, T> ProfileGuard<'a, I, T> { +impl ProfileRunner { fn new( on_start: F1, on_end: F2, end: BoxFuture<'static, Result<(), String>>, - ) -> Result, String> + ) -> Result where F1: FnOnce() -> Result, F2: FnOnce(I) -> Result + Send + 'static, { - let _guard = match PROFILE_MUTEX.try_lock() { - Ok(guard) => guard, - _ => return Err("Already in Profiling".to_owned()), - }; let item = on_start()?; - Ok(ProfileGuard { - _guard, + Ok(ProfileRunner { item: Some(item), on_end: Some(Box::new(on_end) as OnEndFn), end, @@ -81,7 +74,7 @@ impl<'a, I, T> ProfileGuard<'a, I, T> { } } -impl<'a, I, T> Future for ProfileGuard<'a, I, T> { +impl Future for ProfileRunner { type Output = Result; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { match self.end.as_mut().poll(cx) { @@ -99,34 +92,22 @@ impl<'a, I, T> Future for ProfileGuard<'a, I, T> { } } -/// Trigger a heap profie and return the content. -#[allow(dead_code)] -pub async fn start_one_heap_profile(end: F, use_jeprof: bool) -> Result, String> -where - F: Future> + Send + 'static, -{ - let on_start = || activate_prof().map_err(|e| format!("activate_prof: {}", e)); - - let on_end = move |_| { - deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e))?; - let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?; - let path = f.path().to_str().unwrap(); - dump_prof(path).map_err(|e| format!("dump_prof: {}", e))?; - if use_jeprof { - jeprof_heap_profile(path) - } else { - read_file(path) - } - }; - - ProfileGuard::new(on_start, on_end, end.boxed())?.await +/// Trigger a heap profile and return the content. 
+pub fn dump_one_heap_profile() -> Result, String> { + if HEAP_PROFILE_ACTIVE.lock().unwrap().is_none() { + return Err("heap profiling is not activated".to_owned()); + } + let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?; + let path = f.path().to_str().unwrap(); + dump_prof(path).map_err(|e| format!("dump_prof: {}", e))?; + read_file(path) } /// Activate heap profile and call `callback` if successfully. /// `deactivate_heap_profile` can only be called after it's notified from /// `callback`. pub async fn activate_heap_profile( - dump_period: S, + dump_period: Option, store_path: PathBuf, callback: F, ) -> Result<(), String> @@ -134,6 +115,10 @@ where S: Stream> + Send + Unpin + 'static, F: FnOnce() + Send + 'static, { + if HEAP_PROFILE_ACTIVE.lock().unwrap().is_some() { + return Err("Already in Heap Profiling".to_owned()); + } + let (tx, rx) = oneshot::channel(); let dir = tempfile::Builder::new() .prefix("heap-") @@ -142,40 +127,55 @@ where let dir_path = dir.path().to_str().unwrap().to_owned(); let on_start = move || { - let mut activate = PROFILE_ACTIVE.lock().unwrap(); + let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); assert!(activate.is_none()); + *activate = Some((Some(tx), dir)); activate_prof().map_err(|e| format!("activate_prof: {}", e))?; - *activate = Some((tx, dir)); callback(); info!("periodical heap profiling is started"); Ok(()) }; let on_end = |_| { - deactivate_heap_profile(); - deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e)) + let res = deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e)); + *HEAP_PROFILE_ACTIVE.lock().unwrap() = None; + res }; let end = async move { - select! { - _ = rx.fuse() => { - info!("periodical heap profiling is canceled"); - Ok(()) - }, - res = dump_heap_profile_periodically(dump_period, dir_path).fuse() => { - warn!("the heap profiling dump loop shouldn't break"; "res" => ?res); - res + if let Some(dump_period) = dump_period { + select! 
{ + _ = rx.fuse() => { + info!("periodical heap profiling is canceled"); + Ok(()) + }, + res = dump_heap_profile_periodically(dump_period, dir_path).fuse() => { + warn!("the heap profiling dump loop shouldn't break"; "res" => ?res); + res + } } + } else { + let _ = rx.await; + info!("periodical heap profiling is canceled"); + Ok(()) } }; - ProfileGuard::new(on_start, on_end, end.boxed())?.await + ProfileRunner::new(on_start, on_end, end.boxed())?.await } /// Deactivate heap profile. Return `false` if it hasn't been activated. pub fn deactivate_heap_profile() -> bool { - let mut activate = PROFILE_ACTIVE.lock().unwrap(); - activate.take().is_some() + let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); + match activate.as_mut() { + Some((tx, _)) => { + if let Some(tx) = tx.take() { + let _ = tx.send(()); + } + true + } + None => false, + } } /// Trigger one cpu profile. @@ -187,7 +187,14 @@ pub async fn start_one_cpu_profile( where F: Future> + Send + 'static, { + if CPU_PROFILE_ACTIVE.lock().unwrap().is_some() { + return Err("Already in CPU Profiling".to_owned()); + } + let on_start = || { + let mut activate = CPU_PROFILE_ACTIVE.lock().unwrap(); + assert!(activate.is_none()); + *activate = Some(()); let guard = pprof::ProfilerGuardBuilder::default() .frequency(frequency) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) @@ -218,10 +225,13 @@ where .flamegraph(&mut body) .map_err(|e| format!("generate flamegraph from report fail: {}", e))?; } + drop(guard); + *CPU_PROFILE_ACTIVE.lock().unwrap() = None; + Ok(body) }; - ProfileGuard::new(on_start, on_end, end.boxed())?.await + ProfileRunner::new(on_start, on_end, end.boxed())?.await } pub fn read_file(path: &str) -> Result, String> { @@ -234,9 +244,26 @@ pub fn read_file(path: &str) -> Result, String> { pub fn jeprof_heap_profile(path: &str) -> Result, String> { info!("using jeprof to process {}", path); - let output = Command::new("./jeprof") - .args(["--show_bytes", "./bin/tikv-server", path, "--svg"]) - 
.output() + let bin = std::env::current_exe().map_err(|e| format!("get current exe path fail: {}", e))?; + let mut jeprof = Command::new("perl") + .args([ + "/dev/stdin", + "--show_bytes", + &bin.as_os_str().to_string_lossy(), + path, + "--svg", + ]) + .stdin(Stdio::piped()) + .spawn() + .map_err(|e| format!("spawn jeprof fail: {}", e))?; + jeprof + .stdin + .take() + .unwrap() + .write_all(include_bytes!("jeprof.in")) + .unwrap(); + let output = jeprof + .wait_with_output() .map_err(|e| format!("jeprof: {}", e))?; if !output.status.success() { let stderr = std::str::from_utf8(&output.stderr).unwrap_or("invalid utf8"); @@ -246,7 +273,7 @@ pub fn jeprof_heap_profile(path: &str) -> Result, String> { } pub fn heap_profiles_dir() -> Option { - PROFILE_ACTIVE + HEAP_PROFILE_ACTIVE .lock() .unwrap() .as_ref() @@ -381,7 +408,7 @@ mod tests { .build() .unwrap(); - let expected = "Already in Profiling"; + let expected = "Already in CPU Profiling"; let (tx1, rx1) = oneshot::channel(); let rx1 = rx1.map_err(|_| "channel canceled".to_owned()); @@ -393,17 +420,29 @@ mod tests { let res2 = rt.spawn(start_one_cpu_profile(rx2, 99, false)); assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); - let (_tx2, rx2) = oneshot::channel(); - let rx2 = rx2.map_err(|_| "channel canceled".to_owned()); - let res2 = rt.spawn(start_one_heap_profile(rx2, false)); - assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); + drop(tx1); + block_on(res1).unwrap().unwrap_err(); + + let expected = "Already in Heap Profiling"; + + let (tx1, rx1) = mpsc::channel(1); + let res1 = rt.spawn(activate_heap_profile( + Some(rx1), + std::env::temp_dir(), + || {}, + )); + thread::sleep(Duration::from_millis(100)); let (_tx2, rx2) = mpsc::channel(1); - let res2 = rt.spawn(activate_heap_profile(rx2, std::env::temp_dir(), || {})); + let res2 = rt.spawn(activate_heap_profile( + Some(rx2), + std::env::temp_dir(), + || {}, + )); assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); drop(tx1); - 
block_on(res1).unwrap().unwrap_err(); + block_on(res1).unwrap().unwrap(); } #[test] @@ -416,7 +455,7 @@ mod tests { // Test activated profiling can be stopped by canceling the period stream. let (tx, rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); + let res = rt.spawn(activate_heap_profile(Some(rx), std::env::temp_dir(), || {})); drop(tx); block_on(res).unwrap().unwrap(); @@ -427,7 +466,7 @@ mod tests { let (_tx, _rx) = mpsc::channel(1); let res = rt.spawn(activate_heap_profile( - _rx, + Some(_rx), std::env::temp_dir(), on_activated, )); @@ -446,7 +485,7 @@ mod tests { // Test heap profiling can be stopped by sending an error. let (mut tx, rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); + let res = rt.spawn(activate_heap_profile(Some(rx), std::env::temp_dir(), || {})); block_on(tx.send(Err("test".to_string()))).unwrap(); block_on(res).unwrap().unwrap_err(); @@ -457,7 +496,7 @@ mod tests { let (_tx, _rx) = mpsc::channel(1); let res = rt.spawn(activate_heap_profile( - _rx, + Some(_rx), std::env::temp_dir(), on_activated, )); From 2918ffb94e221c3c0c92ef12127ee96c5bf885b2 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 7 Nov 2023 18:23:11 +0800 Subject: [PATCH 106/220] log-backup: use row-level memory usage statistic for initial scan (#15872) (#15930) close tikv/tikv#15714 Signed-off-by: hillium Co-authored-by: hillium Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup-stream/src/endpoint.rs | 8 +- components/backup-stream/src/event_loader.rs | 129 ++++++----- components/backup-stream/src/lib.rs | 2 +- components/backup-stream/src/router.rs | 9 + .../backup-stream/tests/failpoints/mod.rs | 77 +++++-- .../backup-stream/tests/integration/mod.rs | 76 +++---- components/backup-stream/tests/suite.rs | 212 +++++++++++------- components/tikv_util/src/memory.rs | 52 ++++- 8 files changed, 362 insertions(+), 
203 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 834a40f8bdd..6c19edc9f93 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -29,6 +29,7 @@ use tikv_util::{ box_err, config::ReadableDuration, debug, defer, info, + memory::MemoryQuota, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, warn, @@ -51,7 +52,7 @@ use crate::{ GetCheckpointResult, RegionIdWithVersion, Subscription, }, errors::{Error, Result}, - event_loader::{InitialDataLoader, PendingMemoryQuota}, + event_loader::InitialDataLoader, future, metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask}, metrics::{self, TaskStatus}, @@ -139,8 +140,9 @@ where pool.spawn(Self::starts_flush_ticks(range_router.clone())); - let initial_scan_memory_quota = - PendingMemoryQuota::new(config.initial_scan_pending_memory_quota.0 as _); + let initial_scan_memory_quota = Arc::new(MemoryQuota::new( + config.initial_scan_pending_memory_quota.0 as _, + )); let limit = if config.initial_scan_rate_limit.0 > 0 { config.initial_scan_rate_limit.0 as f64 } else { diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index bfb88d5cd5f..0a957ea87ed 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -12,15 +12,16 @@ use raftstore::{ use tikv::storage::{ kv::StatisticsSummary, mvcc::{DeltaScanner, ScannerBuilder}, - txn::{EntryBatch, TxnEntry, TxnEntryScanner}, + txn::{TxnEntry, TxnEntryScanner}, Snapshot, Statistics, }; use tikv_util::{ box_err, + memory::{MemoryQuota, OwnedAllocated}, time::{Instant, Limiter}, worker::Scheduler, }; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tokio::sync::Semaphore; use txn_types::{Key, Lock, TimeStamp}; use crate::{ @@ -34,41 +35,17 @@ use crate::{ const MAX_GET_SNAPSHOT_RETRY: usize = 5; -#[derive(Clone)] -pub struct 
PendingMemoryQuota(Arc); - -impl std::fmt::Debug for PendingMemoryQuota { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PendingMemoryQuota") - .field("remain", &self.0.available_permits()) - .field("total", &self.0) - .finish() - } -} - -pub struct PendingMemory(OwnedSemaphorePermit); - -impl PendingMemoryQuota { - pub fn new(quota: usize) -> Self { - Self(Arc::new(Semaphore::new(quota))) - } - - pub async fn pending(&self, size: usize) -> PendingMemory { - PendingMemory( - self.0 - .clone() - .acquire_many_owned(size as _) - .await - .expect("BUG: the semaphore is closed unexpectedly."), - ) - } +struct ScanResult { + more: bool, + out_of_memory: bool, + statistics: Statistics, } /// EventLoader transforms data from the snapshot into ApplyEvent. pub struct EventLoader { scanner: DeltaScanner, // pooling the memory. - entry_batch: EntryBatch, + entry_batch: Vec, } const ENTRY_BATCH_SIZE: usize = 1024; @@ -97,20 +74,48 @@ impl EventLoader { Ok(Self { scanner, - entry_batch: EntryBatch::with_capacity(ENTRY_BATCH_SIZE), + entry_batch: Vec::with_capacity(ENTRY_BATCH_SIZE), }) } + fn scan_result(&mut self, more: bool) -> ScanResult { + ScanResult { + more, + out_of_memory: false, + statistics: self.scanner.take_statistics(), + } + } + + fn out_of_memory(&mut self) -> ScanResult { + ScanResult { + more: true, + out_of_memory: true, + statistics: self.scanner.take_statistics(), + } + } + /// Scan a batch of events from the snapshot, and save them into the /// internal buffer. - fn fill_entries(&mut self) -> Result { + fn fill_entries(&mut self, memory_quota: &mut OwnedAllocated) -> Result { assert!( self.entry_batch.is_empty(), - "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `omit_entries` first. (len = {})", + "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `emit_entries_to` first. 
(len = {})", self.entry_batch.len() ); - self.scanner.scan_entries(&mut self.entry_batch)?; - Ok(self.scanner.take_statistics()) + let batch = &mut self.entry_batch; + while batch.len() < batch.capacity() { + match self.scanner.next_entry()? { + Some(entry) => { + let size = entry.size(); + batch.push(entry); + if memory_quota.alloc(size).is_err() { + return Ok(self.out_of_memory()); + } + } + None => return Ok(self.scan_result(false)), + } + } + Ok(self.scan_result(true)) } /// Drain the internal buffer, converting them to the [`ApplyEvents`], @@ -120,7 +125,7 @@ impl EventLoader { result: &mut ApplyEvents, resolver: &mut TwoPhaseResolver, ) -> Result<()> { - for entry in self.entry_batch.drain() { + for entry in self.entry_batch.drain(..) { match entry { TxnEntry::Prewrite { default: (key, value), @@ -180,7 +185,7 @@ pub struct InitialDataLoader { pub(crate) tracing: SubscriptionTracer, pub(crate) scheduler: Scheduler, - pub(crate) quota: PendingMemoryQuota, + pub(crate) quota: Arc, pub(crate) limit: Limiter, // If there are too many concurrent initial scanning, the limit of disk speed or pending memory // quota will probably be triggered. Then the whole scanning will be pretty slow. And when @@ -202,7 +207,7 @@ where sink: Router, tracing: SubscriptionTracer, sched: Scheduler, - quota: PendingMemoryQuota, + quota: Arc, limiter: Limiter, cdc_handle: H, concurrency_limit: Arc, @@ -384,40 +389,44 @@ where let mut events = ApplyEvents::with_capacity(1024, region.id); // Note: the call of `fill_entries` is the only step which would read the disk. // we only need to record the disk throughput of this. - let (stat, disk_read) = - utils::with_record_read_throughput(|| event_loader.fill_entries()); - // We must use the size of entry batch here to check whether we have progress. - // Or we may exit too early if there are only records: - // - can be inlined to `write` CF (hence it won't be written to default CF) - // - are prewritten. 
(hence it will only contains `Prewrite` records). - // In this condition, ALL records generate no ApplyEvent(only lock change), - // and we would exit after the first run of loop :( - let no_progress = event_loader.entry_batch.is_empty(); - let stat = stat?; + let mut allocated = OwnedAllocated::new(Arc::clone(&self.quota)); + let (res, disk_read) = + utils::with_record_read_throughput(|| event_loader.fill_entries(&mut allocated)); + let res = res?; self.with_resolver(region, handle, |r| { event_loader.emit_entries_to(&mut events, r) })?; - if no_progress { - metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); - return Ok(stats.stat); - } - stats.add_statistics(&stat); + stats.add_statistics(&res.statistics); let region_id = region.get_id(); let sink = self.sink.clone(); let event_size = events.size(); let sched = self.scheduler.clone(); - let permit = self.quota.pending(event_size).await; self.limit.consume(disk_read as _).await; debug!("sending events to router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); metrics::INCREMENTAL_SCAN_DISK_READ.inc_by(disk_read as f64); metrics::HEAP_MEMORY.add(event_size as _); + fail::fail_point!("scan_and_async_send::about_to_consume"); join_handles.push(tokio::spawn(async move { utils::handle_on_event_result(&sched, sink.on_events(events).await); metrics::HEAP_MEMORY.sub(event_size as _); + drop(allocated); debug!("apply event done"; "size" => %event_size, "region" => %region_id); - drop(permit); })); + if !res.more { + metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); + return Ok(stats.stat); + } + if res.out_of_memory { + futures::future::try_join_all(join_handles.drain(..)) + .await + .map_err(|err| { + annotate!( + err, + "failed to join tokio runtime during out-of-memory-quota" + ) + })?; + } } } @@ -465,10 +474,13 @@ where #[cfg(test)] mod tests { + use std::sync::Arc; + use futures::executor::block_on; use 
kvproto::metapb::*; use tikv::storage::{txn::tests::*, TestEngineBuilder}; use tikv_kv::SnapContext; + use tikv_util::memory::{MemoryQuota, OwnedAllocated}; use txn_types::TimeStamp; use super::EventLoader; @@ -498,10 +510,13 @@ mod tests { let snap = block_on(async { tikv_kv::snapshot(&mut engine, SnapContext::default()).await }) .unwrap(); + let quota_inf = Arc::new(MemoryQuota::new(usize::MAX)); let mut loader = EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); - let (r, data_load) = with_record_read_throughput(|| loader.fill_entries()); + let (r, data_load) = with_record_read_throughput(|| { + loader.fill_entries(&mut OwnedAllocated::new(quota_inf)) + }); r.unwrap(); let mut events = ApplyEvents::with_capacity(1024, 42); let mut res = TwoPhaseResolver::new(42, None); diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index 3d4690d7f48..0402e5d2ee3 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -10,7 +10,7 @@ mod endpoint; pub mod errors; mod event_loader; pub mod metadata; -pub(crate) mod metrics; +pub mod metrics; pub mod observer; pub mod router; mod service; diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 1786d513dc8..d43a58d3965 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -540,6 +540,15 @@ impl RouterInner { let task_info = self.get_task_info(&task).await?; task_info.on_events(events).await?; let file_size_limit = self.temp_file_size_limit.load(Ordering::SeqCst); + #[cfg(features = "failpoints")] + { + let delayed = (|| { + fail::fail_point!("router_on_event_delay_ms", |v| { + v.and_then(|v| v.parse::().ok()).unwrap_or(0) + }) + })(); + tokio::time::sleep(Duration::from_millis(delayed)).await; + } // When this event make the size of temporary files exceeds the size limit, make // a flush. 
Note that we only flush if the size is less than the limit before diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index ff9b9f82ba1..8dfc21529e4 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -9,7 +9,13 @@ pub use suite::*; mod all { - use std::time::Duration; + use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, + }; use backup_stream::{ metadata::{ @@ -19,7 +25,7 @@ mod all { GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; use futures::executor::block_on; - use tikv_util::defer; + use tikv_util::{config::ReadableSize, defer}; use super::{ make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, @@ -30,7 +36,7 @@ mod all { let mut suite = SuiteBuilder::new_named("basic").build(); fail::cfg("try_start_observe", "1*return").unwrap(); - run_async_test(async { + let (round1, round2) = run_async_test(async { // write data before the task starting, for testing incremental scanning. 
let round1 = suite.write_records(0, 128, 1).await; suite.must_register_task(1, "test_basic"); @@ -38,13 +44,13 @@ mod all { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_basic"); suite.wait_for_flush(); - suite - .check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ) - .await; + (round1, round2) }); + suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ); + suite.cluster.shutdown(); } #[test] @@ -97,10 +103,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("region_failure"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); } #[test] fn initial_scan_failure() { @@ -121,10 +127,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("initial_scan_failure"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); } #[test] fn failed_during_refresh_region() { @@ -147,10 +153,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("fail_to_refresh_region"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); let leader = suite.cluster.leader_of_region(1).unwrap().store_id; let (tx, rx) = std::sync::mpsc::channel(); suite.endpoints[&leader] @@ -212,12 +218,7 @@ mod all { let items = run_async_test(suite.write_records(0, 128, 1)); suite.force_flush_files("retry_abort"); suite.wait_for_flush(); - run_async_test( - suite.check_for_write_records( - suite.flushed_files.path(), - 
items.iter().map(Vec::as_slice), - ), - ); + suite.check_for_write_records(suite.flushed_files.path(), items.iter().map(Vec::as_slice)); } #[test] fn failure_and_split() { @@ -240,12 +241,42 @@ mod all { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("failure_and_split"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - )); + ); let cp = suite.global_checkpoint(); assert!(cp > 512, "it is {}", cp); suite.cluster.shutdown(); } + + #[test] + fn memory_quota() { + let mut suite = SuiteBuilder::new_named("memory_quota") + .cfg(|cfg| cfg.initial_scan_pending_memory_quota = ReadableSize::kb(2)) + .build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + let failed = Arc::new(AtomicBool::new(false)); + fail::cfg("router_on_event_delay_ms", "6*return(1000)").unwrap(); + fail::cfg_callback("scan_and_async_send::about_to_consume", { + let failed = failed.clone(); + move || { + let v = backup_stream::metrics::HEAP_MEMORY.get(); + // Not greater than max key length * concurrent initial scan number. 
+ if v > 4096 * 6 { + println!("[[ FAILED ]] The memory usage is {v} which exceeds the quota"); + failed.store(true, Ordering::SeqCst); + } + } + }) + .unwrap(); + suite.must_register_task(1, "memory_quota"); + suite.force_flush_files("memory_quota"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys.iter().map(|v| v.as_slice()), + ); + assert!(!failed.load(Ordering::SeqCst)); + } } diff --git a/components/backup-stream/tests/integration/mod.rs b/components/backup-stream/tests/integration/mod.rs index 79a756f684d..395159060c1 100644 --- a/components/backup-stream/tests/integration/mod.rs +++ b/components/backup-stream/tests/integration/mod.rs @@ -28,20 +28,19 @@ mod all { #[test] fn with_split() { let mut suite = SuiteBuilder::new_named("with_split").build(); - run_async_test(async { + let (round1, round2) = run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; suite.must_split(&make_split_key_at_record(1, 42)); suite.must_register_task(1, "test_with_split"); let round2 = suite.write_records(256, 128, 1).await; - suite.force_flush_files("test_with_split"); - suite.wait_for_flush(); - suite - .check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ) - .await; + (round1, round2) }); + suite.force_flush_files("test_with_split"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ); suite.cluster.shutdown(); } @@ -63,7 +62,7 @@ mod all { #[test] fn with_split_txn() { let mut suite = SuiteBuilder::new_named("split_txn").build(); - run_async_test(async { + let (commit_ts, start_ts, keys) = run_async_test(async { let start_ts = suite.cluster.pd_client.get_tso().await.unwrap(); let keys = (1..1960).map(|i| make_record_key(1, i)).collect::>(); suite.must_kv_prewrite( @@ -76,26 +75,25 @@ mod all { start_ts, ); let commit_ts = suite.cluster.pd_client.get_tso().await.unwrap(); - 
suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); - suite.must_register_task(1, "test_split_txn"); - suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); - suite.force_flush_files("test_split_txn"); - suite.wait_for_flush(); - let keys_encoded = keys - .iter() - .map(|v| { - Key::from_raw(v.as_slice()) - .append_ts(commit_ts) - .into_encoded() - }) - .collect::>(); - suite - .check_for_write_records( - suite.flushed_files.path(), - keys_encoded.iter().map(Vec::as_slice), - ) - .await; + (commit_ts, start_ts, keys) }); + suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); + suite.must_register_task(1, "test_split_txn"); + suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); + suite.force_flush_files("test_split_txn"); + suite.wait_for_flush(); + let keys_encoded = keys + .iter() + .map(|v| { + Key::from_raw(v.as_slice()) + .append_ts(commit_ts) + .into_encoded() + }) + .collect::>(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys_encoded.iter().map(Vec::as_slice), + ); suite.cluster.shutdown(); } @@ -111,10 +109,10 @@ mod all { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("test_leader_down"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - )); + ); suite.cluster.shutdown(); } @@ -346,10 +344,10 @@ mod all { } assert_eq!(items.last().unwrap().end_key, Vec::::default()); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(|x| x.as_slice()), - )); + ); } #[test] @@ -373,18 +371,18 @@ mod all { .unwrap(); suite.sync(); std::thread::sleep(Duration::from_secs(2)); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|x| x.as_slice()), - )); + ); assert!(suite.global_checkpoint() > 
256); suite.force_flush_files("r"); suite.wait_for_flush(); assert!(suite.global_checkpoint() > 512); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(|x| x.as_slice()), - )); + ); } #[test] @@ -426,10 +424,10 @@ mod all { ts, cps ); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|k| k.as_slice()), - )) + ) } #[test] diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index 41a57f5858b..0e4038d07a0 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -2,7 +2,8 @@ use std::{ collections::{HashMap, HashSet}, - path::Path, + fmt::Display, + path::{Path, PathBuf}, sync::Arc, time::Duration, }; @@ -30,7 +31,6 @@ use kvproto::{ tikvpb::*, }; use pd_client::PdClient; -use protobuf::parse_from_bytes; use raftstore::{router::CdcRaftRouter, RegionInfoAccessor}; use resolved_ts::LeadershipResolver; use tempdir::TempDir; @@ -43,13 +43,25 @@ use tikv_util::{ number::NumberEncoder, stream_event::{EventIterator, Iterator}, }, - info, + debug, info, worker::LazyWorker, HandyRwLock, }; use txn_types::{Key, TimeStamp, WriteRef}; use walkdir::WalkDir; +#[derive(Debug)] +pub struct FileSegments { + path: PathBuf, + segments: Vec<(usize, usize)>, +} + +#[derive(Default, Debug)] +pub struct LogFiles { + default_cf: Vec, + write_cf: Vec, +} + pub type TestEndpoint = Endpoint< ErrorStore, RegionInfoAccessor, @@ -453,7 +465,12 @@ impl Suite { for ts in (from..(from + n)).map(|x| x * 2) { let ts = ts as u64; let key = make_record_key(for_table, ts); - let muts = vec![mutation(key.clone(), b"hello, world".to_vec())]; + let value = if ts % 4 == 0 { + b"hello, world".to_vec() + } else { + [0xdd; 4096].to_vec() + }; + let muts = vec![mutation(key.clone(), value)]; let enc_key = Key::from_raw(&key).into_encoded(); let region = 
self.cluster.get_region_id(&enc_key); let start_ts = self.cluster.pd_client.get_tso().await.unwrap(); @@ -510,45 +527,53 @@ impl Suite { } } - pub fn load_metadata_for_write_records( - &self, - path: &Path, - ) -> HashMap> { - let mut meta_map: HashMap> = HashMap::new(); - for entry in WalkDir::new(path) { - let entry = entry.unwrap(); - if entry.file_type().is_file() - && entry - .file_name() - .to_str() - .map_or(false, |s| s.ends_with(".meta")) - { - let content = std::fs::read(entry.path()).unwrap(); - let meta = parse_from_bytes::(content.as_ref()).unwrap(); - for g in meta.file_groups.into_iter() { - let path = g.path.split('/').last().unwrap(); - for f in g.data_files_info.into_iter() { - let file_info = meta_map.get_mut(path); - if let Some(v) = file_info { - v.push(( - f.range_offset as usize, - (f.range_offset + f.range_length) as usize, - )); + pub fn get_files_to_check(&self, path: &Path) -> std::io::Result { + let mut res = LogFiles::default(); + for entry in WalkDir::new(path.join("v1/backupmeta")) { + let entry = entry?; + println!("reading {}", entry.path().display()); + if entry.file_name().to_str().unwrap().ends_with(".meta") { + let content = std::fs::read(entry.path())?; + let meta = protobuf::parse_from_bytes::(&content)?; + for fg in meta.get_file_groups() { + let mut default_segs = vec![]; + let mut write_segs = vec![]; + for file in fg.get_data_files_info() { + let v = if file.cf == "default" || file.cf.is_empty() { + Some(&mut default_segs) + } else if file.cf == "write" { + Some(&mut write_segs) } else { - let v = vec![( - f.range_offset as usize, - (f.range_offset + f.range_length) as usize, - )]; - meta_map.insert(String::from(path), v); - } + None + }; + v.into_iter().for_each(|v| { + v.push(( + file.get_range_offset() as usize, + (file.get_range_offset() + file.get_range_length()) as usize, + )) + }); + } + let p = path.join(fg.get_path()); + if !default_segs.is_empty() { + res.default_cf.push(FileSegments { + path: p.clone(), + 
segments: default_segs, + }) + } + if !write_segs.is_empty() { + res.write_cf.push(FileSegments { + path: p, + segments: write_segs, + }) } } } } - meta_map + Ok(res) } - pub async fn check_for_write_records<'a>( + #[track_caller] + pub fn check_for_write_records<'a>( &self, path: &Path, key_set: impl std::iter::Iterator, @@ -557,45 +582,72 @@ impl Suite { let n = remain_keys.len(); let mut extra_key = 0; let mut extra_len = 0; - let meta_map = self.load_metadata_for_write_records(path); - for entry in WalkDir::new(path) { - let entry = entry.unwrap(); - println!("checking: {:?}", entry); - if entry.file_type().is_file() - && entry - .file_name() - .to_str() - .map_or(false, |s| s.ends_with(".log")) - { - let buf = std::fs::read(entry.path()).unwrap(); - let file_infos = meta_map.get(entry.file_name().to_str().unwrap()).unwrap(); - for &file_info in file_infos { - let mut decoder = ZstdDecoder::new(Vec::new()); - let pbuf: &[u8] = &buf[file_info.0..file_info.1]; - decoder.write_all(pbuf).await.unwrap(); - decoder.flush().await.unwrap(); - decoder.close().await.unwrap(); - let content = decoder.into_inner(); - - let mut iter = EventIterator::new(&content); - loop { - if !iter.valid() { - break; - } - iter.next().unwrap(); - if !remain_keys.remove(iter.key()) { - extra_key += 1; - extra_len += iter.key().len() + iter.value().len(); - } + let files = self.get_files_to_check(path).unwrap_or_default(); + let mut default_keys = HashSet::new(); + let content_of = |buf: &[u8], range: (usize, usize)| { + let mut decoder = ZstdDecoder::new(Vec::new()); + let pbuf: &[u8] = &buf[range.0..range.1]; + run_async_test(async { + decoder.write_all(pbuf).await.unwrap(); + decoder.flush().await.unwrap(); + decoder.close().await.unwrap(); + }); + decoder.into_inner() + }; + for entry in files.write_cf { + debug!("checking write: {:?}", entry); + + let buf = std::fs::read(&entry.path).unwrap(); + for &file_info in entry.segments.iter() { + let data = content_of(&buf, file_info); + let 
mut iter = EventIterator::new(&data); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !remain_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } + + let value = iter.value(); + let wf = WriteRef::parse(value).unwrap(); + if wf.short_value.is_none() { + let mut key = Key::from_encoded_slice(iter.key()).truncate_ts().unwrap(); + key.append_ts_inplace(wf.start_ts); - let value = iter.value(); - let wf = WriteRef::parse(value).unwrap(); + default_keys.insert(key.into_encoded()); + } else { assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); } } } } + for entry in files.default_cf { + debug!("checking default: {:?}", entry); + + let buf = std::fs::read(&entry.path).unwrap(); + for &file_info in entry.segments.iter() { + let data = content_of(&buf, file_info); + let mut iter = EventIterator::new(&data); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !default_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } + + let value = iter.value(); + assert_eq!(value, &[0xdd; 4096]); + } + } + } + if extra_key != 0 { println!( "check_for_write_records of “{}”: extra {} keys ({:.02}% of recorded keys), extra {} bytes.", @@ -605,17 +657,19 @@ impl Suite { extra_len ) } - if !remain_keys.is_empty() { - panic!( - "not all keys are recorded: it remains {:?} (total = {})", - remain_keys - .iter() - .take(3) - .map(|v| hex::encode(v)) - .collect::>(), - remain_keys.len() - ); - } + assert_empty(&remain_keys, "not all keys are recorded"); + assert_empty(&default_keys, "some keys don't have default entry"); + } +} + +#[track_caller] +fn assert_empty(v: &HashSet>, msg: impl Display) { + if !v.is_empty() { + panic!( + "{msg}: it remains {:?}... 
(total = {})", + v.iter().take(3).map(|v| hex::encode(v)).collect::>(), + v.len() + ); } } diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 291254c5227..15ffece4425 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -2,7 +2,10 @@ use std::{ mem, - sync::atomic::{AtomicUsize, Ordering}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, }; use kvproto::{ @@ -87,6 +90,32 @@ pub struct MemoryQuota { capacity: AtomicUsize, } +pub struct OwnedAllocated { + allocated: usize, + from: Arc, +} + +impl OwnedAllocated { + pub fn new(target: Arc) -> Self { + Self { + allocated: 0, + from: target, + } + } + + pub fn alloc(&mut self, bytes: usize) -> Result<(), MemoryQuotaExceeded> { + self.from.alloc(bytes)?; + self.allocated += bytes; + Ok(()) + } +} + +impl Drop for OwnedAllocated { + fn drop(&mut self) { + self.from.free(self.allocated) + } +} + impl MemoryQuota { pub fn new(capacity: usize) -> MemoryQuota { MemoryQuota { @@ -182,4 +211,25 @@ mod tests { quota.alloc(40).unwrap(); assert_eq!(quota.in_use(), 50); } + + #[test] + fn test_allocated() { + let quota = Arc::new(MemoryQuota::new(100)); + let mut allocated = OwnedAllocated::new(Arc::clone("a)); + allocated.alloc(42).unwrap(); + assert_eq!(quota.in_use(), 42); + quota.alloc(59).unwrap_err(); + allocated.alloc(16).unwrap(); + assert_eq!(quota.in_use(), 58); + let mut allocated2 = OwnedAllocated::new(Arc::clone("a)); + allocated2.alloc(8).unwrap(); + allocated2.alloc(40).unwrap_err(); + assert_eq!(quota.in_use(), 66); + quota.alloc(4).unwrap(); + assert_eq!(quota.in_use(), 70); + drop(allocated); + assert_eq!(quota.in_use(), 12); + drop(allocated2); + assert_eq!(quota.in_use(), 4); + } } From dbfc91898ccfc6bf3737a92ab79be121a4227a55 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 8 Nov 2023 12:35:13 +0800 Subject: [PATCH 107/220] cdc: limit cdc event fetching speed to reduce RocksDB read load (#15849) (#15893) close 
tikv/tikv#11390 None Signed-off-by: qupeng Co-authored-by: qupeng --- components/cdc/src/endpoint.rs | 50 ++++++++++++++++++++-- components/cdc/src/initializer.rs | 28 +++++++----- src/config/mod.rs | 5 +++ tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 5 files changed, 71 insertions(+), 14 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index a5f00a08028..e62650c77c6 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -384,6 +384,7 @@ pub struct Endpoint { workers: Runtime, scan_concurrency_semaphore: Arc, scan_speed_limiter: Limiter, + fetch_speed_limiter: Limiter, max_scan_batch_bytes: usize, max_scan_batch_size: usize, sink_memory_quota: Arc, @@ -439,11 +440,16 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint 0 { + let scan_speed_limiter = Limiter::new(if config.incremental_scan_speed_limit.0 > 0 { config.incremental_scan_speed_limit.0 as f64 } else { f64::INFINITY }); + let fetch_speed_limiter = Limiter::new(if config.incremental_fetch_speed_limit.0 > 0 { + config.incremental_fetch_speed_limit.0 as f64 + } else { + f64::INFINITY + }); CDC_SINK_CAP.set(sink_memory_quota.capacity() as i64); // For scan efficiency, the scan batch bytes should be around 1MB. 
@@ -469,7 +475,8 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint 0 { + self.config.incremental_fetch_speed_limit.0 as f64 + } else { + f64::INFINITY + }; + + self.fetch_speed_limiter.set_speed_limit(new_speed_limit); + } } pub fn set_max_scan_batch_size(&mut self, max_scan_batch_size: usize) { @@ -793,7 +809,8 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, - pub(crate) speed_limiter: Limiter, + pub(crate) scan_speed_limiter: Limiter, + pub(crate) fetch_speed_limiter: Limiter, + pub(crate) max_scan_batch_bytes: usize, pub(crate) max_scan_batch_size: usize, @@ -404,16 +406,14 @@ impl Initializer { perf_delta, } = self.do_scan(scanner, old_value_cursors, &mut entries)?; - CDC_SCAN_BYTES.inc_by(emit as _); TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += perf_delta); tls_flush_perf_stats(); - let require = if let Some(bytes) = disk_read { + if let Some(bytes) = disk_read { CDC_SCAN_DISK_READ_BYTES.inc_by(bytes as _); - bytes - } else { - perf_delta.block_read_byte as usize - }; - self.speed_limiter.consume(require).await; + self.scan_speed_limiter.consume(bytes).await; + } + CDC_SCAN_BYTES.inc_by(emit as _); + self.fetch_speed_limiter.consume(emit as _).await; if let Some(resolver) = resolver { // Track the locks. @@ -624,7 +624,8 @@ mod tests { } fn mock_initializer( - speed_limit: usize, + scan_limit: usize, + fetch_limit: usize, buffer: usize, engine: Option, kv_api: ChangeDataRequestKvApi, @@ -665,7 +666,8 @@ mod tests { conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), - speed_limiter: Limiter::new(speed_limit as _), + scan_speed_limiter: Limiter::new(scan_limit as _), + fetch_speed_limiter: Limiter::new(fetch_limit as _), max_scan_batch_bytes: 1024 * 1024, max_scan_batch_size: 1024, build_resolver: true, @@ -717,6 +719,7 @@ mod tests { // Buffer must be large enough to unblock async incremental scan. 
let buffer = 1000; let (mut worker, pool, mut initializer, rx, mut drain) = mock_initializer( + total_bytes, total_bytes, buffer, engine.kv_engine(), @@ -832,6 +835,7 @@ mod tests { // Buffer must be large enough to unblock async incremental scan. let buffer = 1000; let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + total_bytes, total_bytes, buffer, engine.kv_engine(), @@ -914,6 +918,7 @@ mod tests { // Do incremental scan with different `hint_min_ts` values. for checkpoint_ts in [200, 100, 150] { let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + usize::MAX, usize::MAX, 1000, engine.kv_engine(), @@ -979,6 +984,7 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer( + total_bytes, total_bytes, buffer, None, @@ -1034,7 +1040,7 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, pool, mut initializer, _rx, _drain) = - mock_initializer(total_bytes, buffer, None, kv_api, false); + mock_initializer(total_bytes, total_bytes, buffer, None, kv_api, false); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); diff --git a/src/config/mod.rs b/src/config/mod.rs index d1fb1e4f8d8..237ac3c7a72 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2940,7 +2940,11 @@ pub struct CdcConfig { #[online_config(skip)] pub incremental_scan_threads: usize, pub incremental_scan_concurrency: usize, + /// Limit scan speed based on disk I/O traffic. pub incremental_scan_speed_limit: ReadableSize, + /// Limit scan speed based on memory accessing traffic. + #[doc(hidden)] + pub incremental_fetch_speed_limit: ReadableSize, /// `TsFilter` can increase speed and decrease resource usage when /// incremental content is much less than total content.
However in /// other cases, `TsFilter` can make performance worse because it needs @@ -2979,6 +2983,7 @@ impl Default for CdcConfig { // TiCDC requires a SSD, the typical write speed of SSD // is more than 500MB/s, so 128MB/s is enough. incremental_scan_speed_limit: ReadableSize::mb(128), + incremental_fetch_speed_limit: ReadableSize::mb(512), incremental_scan_ts_filter_ratio: 0.2, tso_worker_threads: 1, // 512MB memory for CDC sink. diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 2f4f5ba7695..639e05a02c3 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -849,6 +849,7 @@ fn test_serde_custom_tikv_config() { incremental_scan_threads: 3, incremental_scan_concurrency: 4, incremental_scan_speed_limit: ReadableSize(7), + incremental_fetch_speed_limit: ReadableSize(8), incremental_scan_ts_filter_ratio: 0.7, tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 1bb52fad5fc..ef7a4809168 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -699,6 +699,7 @@ hibernate-regions-compatible = false incremental-scan-threads = 3 incremental-scan-concurrency = 4 incremental-scan-speed-limit = 7 +incremental-fetch-speed-limit = 8 incremental-scan-ts-filter-ratio = 0.7 tso-worker-threads = 2 old-value-cache-memory-quota = "14MB" From 92454e4b743db17ae64e7de8a24911b4071a5b74 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 8 Nov 2023 13:39:41 +0800 Subject: [PATCH 108/220] raftstore: enhance split check (#15900) (#15935) close tikv/tikv#15863 Signed-off-by: Qi Xu Signed-off-by: tonyxuqqi Co-authored-by: Qi Xu Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore-v2/src/router/imp.rs | 14 +- .../raftstore/src/coprocessor/dispatcher.rs | 26 ++- 
.../src/coprocessor/split_check/keys.rs | 56 +++++-- .../src/coprocessor/split_check/size.rs | 43 ++++- components/raftstore/src/router.rs | 8 +- components/raftstore/src/store/fsm/peer.rs | 109 +++++++------ components/raftstore/src/store/msg.rs | 22 ++- components/raftstore/src/store/peer.rs | 150 ++++++++++++++---- .../raftstore/src/store/worker/split_check.rs | 32 +++- components/test_raftstore/src/util.rs | 40 ++++- components/tikv_util/src/log.rs | 12 ++ tests/failpoints/cases/test_split_region.rs | 62 ++++++++ .../raftstore/test_split_region.rs | 2 +- 13 files changed, 457 insertions(+), 119 deletions(-) diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 23a8a3c7d4e..e7a63f6d48f 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -43,12 +43,18 @@ impl AsyncReadNotifier for StoreRouter { } impl raftstore::coprocessor::StoreHandle for StoreRouter { - fn update_approximate_size(&self, region_id: u64, size: u64) { - let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + // TODO: add splitable logic in raftstore-v2 + fn update_approximate_size(&self, region_id: u64, size: Option, _may_split: Option) { + if let Some(size) = size { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + } } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { - let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + // TODO: add splitable logic in raftstore-v2 + fn update_approximate_keys(&self, region_id: u64, keys: Option, _may_split: Option) { + if let Some(keys) = keys { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + } } fn ask_split( diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index d082013cd2c..c7d6731d3e9 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs 
@@ -19,8 +19,8 @@ use crate::store::BucketRange; /// A handle for coprocessor to schedule some command back to raftstore. pub trait StoreHandle: Clone + Send { - fn update_approximate_size(&self, region_id: u64, size: u64); - fn update_approximate_keys(&self, region_id: u64, keys: u64); + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option); + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option); fn ask_split( &self, region_id: u64, @@ -48,11 +48,13 @@ pub trait StoreHandle: Clone + Send { pub enum SchedTask { UpdateApproximateSize { region_id: u64, - size: u64, + splitable: Option, + size: Option, }, UpdateApproximateKeys { region_id: u64, - keys: u64, + splitable: Option, + keys: Option, }, AskSplit { region_id: u64, @@ -75,12 +77,20 @@ pub enum SchedTask { } impl StoreHandle for std::sync::mpsc::SyncSender { - fn update_approximate_size(&self, region_id: u64, size: u64) { - let _ = self.try_send(SchedTask::UpdateApproximateSize { region_id, size }); + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option) { + let _ = self.try_send(SchedTask::UpdateApproximateSize { + region_id, + splitable, + size, + }); } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { - let _ = self.try_send(SchedTask::UpdateApproximateKeys { region_id, keys }); + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option) { + let _ = self.try_send(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + keys, + }); } fn ask_split( diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 2c0e71dd8cb..d6a49175441 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -157,9 +157,11 @@ impl SplitCheckObserver for KeysCheckObserver } }; - self.router.update_approximate_keys(region_id, region_keys); + 
self.router + .update_approximate_keys(region_id, Some(region_keys), None); REGION_KEYS_HISTOGRAM.observe(region_keys as f64); + // if bucket checker using scan is added, to utilize the scan, // add keys checker as well for free // It has the assumption that the size's checker is before the keys's check in @@ -299,12 +301,28 @@ mod tests { None, )); // keys has not reached the max_keys 100 yet. - match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { - assert_eq!(region_id, region.get_id()); + let mut recv_cnt = 0; + loop { + match rx.try_recv() { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, region.get_id()); + assert!(splitable.is_none()); + recv_cnt += 1; + if recv_cnt == 2 { + break; + } + } + others => panic!("expect recv empty, but got {:?}", others), } - others => panic!("expect recv empty, but got {:?}", others), } put_data(&engine, 90, 160, true); @@ -403,12 +421,28 @@ mod tests { None, )); // keys has not reached the max_keys 100 yet. - match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { - assert_eq!(region_id, region.get_id()); + let mut recv_cnt = 0; + loop { + match rx.try_recv() { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. 
+ }) => { + assert_eq!(region_id, region.get_id()); + assert!(splitable.is_none()); + recv_cnt += 1; + if recv_cnt == 2 { + break; + } + } + others => panic!("expect recv empty, but got {:?}", others), } - others => panic!("expect recv empty, but got {:?}", others), } put_data(&engine, 90, 160, true); diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 4b320bef1b6..e5048a83826 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -158,13 +158,14 @@ impl SplitCheckObserver for SizeCheckObserver }; // send it to raftstore to update region approximate size - self.router.update_approximate_size(region_id, region_size); + self.router + .update_approximate_size(region_id, Some(region_size), None); + let need_split_region = region_size >= host.cfg.region_max_size().0; let need_bucket_checker = host.cfg.enable_region_bucket() && region_size >= 2 * host.cfg.region_bucket_size.0; REGION_SIZE_HISTOGRAM.observe(region_size as f64); - let need_split_region = region_size >= host.cfg.region_max_size().0; if need_split_region || need_bucket_checker { // when it's a large region use approximate way to produce split keys if need_split_region { @@ -265,11 +266,23 @@ pub mod tests { exp_split_keys: Vec>, ignore_split_keys: bool, ) { + let mut split = false; loop { match rx.try_recv() { - Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) - | Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { + Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, exp_region.get_id()); + split = split || splitable.unwrap_or(false); + } + Ok(SchedTask::RefreshRegionBuckets { region_id, .. 
}) => { assert_eq!(region_id, exp_region.get_id()); } Ok(SchedTask::AskSplit { @@ -283,6 +296,7 @@ pub mod tests { if !ignore_split_keys { assert_eq!(split_keys, exp_split_keys); } + assert!(split); break; } others => panic!("expect split check result, but got {:?}", others), @@ -303,11 +317,23 @@ pub mod tests { exp_region: &Region, exp_split_keys_count: usize, ) { + let mut split = false; loop { match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) - | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, exp_region.get_id()); + split = split || splitable.unwrap_or(false); + } + Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } Ok(SchedTask::AskSplit { @@ -319,6 +345,7 @@ pub mod tests { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); assert_eq!(split_keys.len(), exp_split_keys_count); + assert!(split); break; } others => panic!("expect split check result, but got {:?}", others), diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 77d3a35e306..fd50357fa38 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -289,11 +289,11 @@ impl RaftStoreRouter for RaftRouter { // duplicated codes. 
impl crate::coprocessor::StoreHandle for RaftRouter { - fn update_approximate_size(&self, region_id: u64, size: u64) { + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option) { if let Err(e) = CasualRouter::send( self, region_id, - CasualMessage::RegionApproximateSize { size }, + CasualMessage::RegionApproximateSize { size, splitable }, ) { warn!( "failed to send approximate region size"; @@ -304,11 +304,11 @@ impl crate::coprocessor::StoreHandle for RaftRoute } } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option) { if let Err(e) = CasualRouter::send( self, region_id, - CasualMessage::RegionApproximateKeys { keys }, + CasualMessage::RegionApproximateKeys { keys, splitable }, ) { warn!( "failed to send approximate region keys"; diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index f4be67260f3..79e02fd8272 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -51,7 +51,7 @@ use raft::{ use smallvec::SmallVec; use tikv_alloc::trace::TraceEvent; use tikv_util::{ - box_err, debug, defer, error, escape, info, is_zero_duration, + box_err, debug, defer, error, escape, info, info_or_debug, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, @@ -1085,11 +1085,11 @@ where } => { self.on_hash_computed(index, context, hash); } - CasualMessage::RegionApproximateSize { size } => { - self.on_approximate_region_size(size); + CasualMessage::RegionApproximateSize { size, splitable } => { + self.on_approximate_region_size(size, splitable); } - CasualMessage::RegionApproximateKeys { keys } => { - self.on_approximate_region_keys(keys); + CasualMessage::RegionApproximateKeys { keys, splitable } => { + self.on_approximate_region_keys(keys, splitable); } 
CasualMessage::RefreshRegionBuckets { region_epoch, @@ -1367,9 +1367,7 @@ where } fn on_clear_region_size(&mut self) { - self.fsm.peer.approximate_size = None; - self.fsm.peer.approximate_keys = None; - self.fsm.peer.may_skip_split_check = false; + self.fsm.peer.split_check_trigger.on_clear_region_size(); self.register_split_region_check_tick(); } @@ -4113,8 +4111,18 @@ where // if share_source_region_size is true, it means the new region contains any // data from the origin region if share_source_region_size { - share_size = self.fsm.peer.approximate_size.map(|v| v / new_region_count); - share_keys = self.fsm.peer.approximate_keys.map(|v| v / new_region_count); + share_size = self + .fsm + .peer + .split_check_trigger + .approximate_size + .map(|v| v / new_region_count); + share_keys = self + .fsm + .peer + .split_check_trigger + .approximate_keys + .map(|v| v / new_region_count); } let mut meta = self.ctx.store_meta.lock().unwrap(); @@ -4126,14 +4134,11 @@ where ); self.fsm.peer.post_split(); - // It's not correct anymore, so set it to false to schedule a split check task. - self.fsm.peer.may_skip_split_check = false; - let is_leader = self.fsm.peer.is_leader(); if is_leader { if share_source_region_size { - self.fsm.peer.approximate_size = share_size; - self.fsm.peer.approximate_keys = share_keys; + self.fsm.peer.split_check_trigger.approximate_size = share_size; + self.fsm.peer.split_check_trigger.approximate_keys = share_keys; } self.fsm.peer.heartbeat_pd(self.ctx); // Notify pd immediately to let it update the region meta. 
@@ -4162,7 +4167,6 @@ where if meta.region_ranges.remove(&last_key).is_none() { panic!("{} original region should exist", self.fsm.peer.tag); } - let last_region_id = regions.last().unwrap().get_id(); for (new_region, locks) in regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); @@ -4269,8 +4273,8 @@ where new_peer.has_ready |= campaigned; if is_leader { - new_peer.peer.approximate_size = share_size; - new_peer.peer.approximate_keys = share_keys; + new_peer.peer.split_check_trigger.approximate_size = share_size; + new_peer.peer.split_check_trigger.approximate_keys = share_keys; *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. @@ -4288,11 +4292,6 @@ where .insert(new_region_id, ReadDelegate::from_peer(new_peer.get_peer())); meta.region_read_progress .insert(new_region_id, new_peer.peer.read_progress.clone()); - if last_region_id == new_region_id { - // To prevent from big region, the right region needs run split - // check again after split. - new_peer.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; - } let mailbox = BasicMailbox::new(sender, new_peer, self.ctx.router.state_cnt().clone()); self.ctx.router.register(new_region_id, mailbox); self.ctx @@ -4787,7 +4786,7 @@ where // make approximate size and keys updated in time. // the reason why follower need to update is that there is a issue that after // merge and then transfer leader, the new leader may have stale size and keys. 
- self.fsm.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; + self.fsm.peer.split_check_trigger.reset_skip_check(); self.fsm.peer.reset_region_buckets(); if self.fsm.peer.is_leader() { info!( @@ -5248,6 +5247,14 @@ where &mut self, msg: &RaftCmdRequest, ) -> Result> { + // failpoint + fail_point!( + "fail_pre_propose_split", + msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSplit, + |_| Err(Error::Other(box_err!("fail_point"))) + ); + // Check store_id, make sure that the msg is dispatched to the right place. if let Err(e) = util::check_store_id(msg.get_header(), self.store_id()) { self.ctx @@ -5472,7 +5479,10 @@ where return; } Err(e) => { - debug!( + // log for admin requests + let is_admin_request = msg.has_admin_request(); + info_or_debug!( + is_admin_request; "failed to propose"; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -5840,9 +5850,11 @@ where // whether the region should split. // We assume that `may_skip_split_check` is only set true after the split check // task is scheduled. - if self.fsm.peer.may_skip_split_check - && self.fsm.peer.compaction_declined_bytes < self.ctx.cfg.region_split_check_diff().0 - && self.fsm.peer.size_diff_hint < self.ctx.cfg.region_split_check_diff().0 + if self + .fsm + .peer + .split_check_trigger + .should_skip(self.ctx.cfg.region_split_check_diff().0) { return; } @@ -5857,6 +5869,11 @@ where return; } + // To avoid run the check if it's splitting. + if self.fsm.peer.is_splitting() { + return; + } + // When Lightning or BR is importing data to TiKV, their ingest-request may fail // because of region-epoch not matched. So we hope TiKV do not check region size // and split region during importing. @@ -5895,10 +5912,7 @@ where ); return; } - self.fsm.peer.size_diff_hint = 0; - self.fsm.peer.compaction_declined_bytes = 0; - // the task is scheduled, next tick may skip it. 
- self.fsm.peer.may_skip_split_check = true; + self.fsm.peer.split_check_trigger.post_triggered(); } fn on_prepare_split_region( @@ -5974,15 +5988,21 @@ where } } - fn on_approximate_region_size(&mut self, size: u64) { - self.fsm.peer.approximate_size = Some(size); + fn on_approximate_region_size(&mut self, size: Option, splitable: Option) { + self.fsm + .peer + .split_check_trigger + .on_approximate_region_size(size, splitable); self.register_split_region_check_tick(); self.register_pd_heartbeat_tick(); fail_point!("on_approximate_region_size"); } - fn on_approximate_region_keys(&mut self, keys: u64) { - self.fsm.peer.approximate_keys = Some(keys); + fn on_approximate_region_keys(&mut self, keys: Option, splitable: Option) { + self.fsm + .peer + .split_check_trigger + .on_approximate_region_keys(keys, splitable); self.register_split_region_check_tick(); self.register_pd_heartbeat_tick(); } @@ -6130,8 +6150,10 @@ where } fn on_compaction_declined_bytes(&mut self, declined_bytes: u64) { - self.fsm.peer.compaction_declined_bytes += declined_bytes; - if self.fsm.peer.compaction_declined_bytes >= self.ctx.cfg.region_split_check_diff().0 { + self.fsm.peer.split_check_trigger.compaction_declined_bytes += declined_bytes; + if self.fsm.peer.split_check_trigger.compaction_declined_bytes + >= self.ctx.cfg.region_split_check_diff().0 + { UPDATE_REGION_SIZE_BY_COMPACTION_COUNTER.inc(); } self.register_split_region_check_tick(); @@ -6537,17 +6559,14 @@ where size += sst.total_bytes; keys += sst.total_kvs; } - self.fsm.peer.approximate_size = - Some(self.fsm.peer.approximate_size.unwrap_or_default() + size); - self.fsm.peer.approximate_keys = - Some(self.fsm.peer.approximate_keys.unwrap_or_default() + keys); + self.fsm + .peer + .split_check_trigger + .on_ingest_sst_result(size, keys); if let Some(buckets) = &mut self.fsm.peer.region_buckets_info_mut().bucket_stat_mut() { buckets.ingest_sst(keys, size); } - // The ingested file may be overlapped with the data in engine, so we 
need to - // check it again to get the accurate value. - self.fsm.peer.may_skip_split_check = false; if self.fsm.peer.is_leader() { self.on_pd_heartbeat_tick(); self.register_split_region_check_tick(); diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a92e5169549..0dca9793d35 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -557,12 +557,14 @@ pub enum CasualMessage { /// Approximate size of target region. This message can only be sent by /// split-check thread. RegionApproximateSize { - size: u64, + size: Option, + splitable: Option, }, /// Approximate key count of target region. RegionApproximateKeys { - keys: u64, + keys: Option, + splitable: Option, }, CompactionDeclinedBytes { bytes: u64, @@ -647,11 +649,19 @@ impl fmt::Debug for CasualMessage { KeysInfoFormatter(split_keys.iter()), source, ), - CasualMessage::RegionApproximateSize { size } => { - write!(fmt, "Region's approximate size [size: {:?}]", size) + CasualMessage::RegionApproximateSize { size, splitable } => { + write!( + fmt, + "Region's approximate size [size: {:?}], [splitable: {:?}]", + size, splitable + ) } - CasualMessage::RegionApproximateKeys { keys } => { - write!(fmt, "Region's approximate keys [keys: {:?}]", keys) + CasualMessage::RegionApproximateKeys { keys, splitable } => { + write!( + fmt, + "Region's approximate keys [keys: {:?}], [splitable: {:?}", + keys, splitable + ) } CasualMessage::CompactionDeclinedBytes { bytes } => { write!(fmt, "compaction declined bytes {}", bytes) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 57ad30785f6..1e78be03be9 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -570,6 +570,119 @@ pub fn can_amend_read( false } +/// The SplitCheckTrigger maintains the internal status to determine +/// if a split check task should be triggered. 
+#[derive(Default, Debug)] +pub struct SplitCheckTrigger { + /// An inaccurate difference in region size since last reset. + /// It is used to decide whether split check is needed. + size_diff_hint: u64, + /// An inaccurate difference in region size after compaction. + /// It is used to trigger check split to update approximate size and keys + /// after space reclamation of deleted entries. + pub compaction_declined_bytes: u64, + /// Approximate size of the region. + pub approximate_size: Option, + may_split_size: Option, + /// Approximate keys of the region. + pub approximate_keys: Option, + may_split_keys: Option, + /// Whether this region has scheduled a split check task. If we just + /// splitted the region or ingested one file which may be overlapped + /// with the existed data, reset the flag so that the region can be + /// splitted again. + may_skip_split_check: bool, +} + +impl SplitCheckTrigger { + pub fn should_skip(&self, threshold: u64) -> bool { + self.may_skip_split_check + && self.compaction_declined_bytes < threshold + && self.size_diff_hint < threshold + } + + pub fn post_triggered(&mut self) { + self.size_diff_hint = 0; + self.compaction_declined_bytes = 0; + // The task is scheduled, the next tick may skip it only when the size and keys + // are small. + // If either size or keys are big enough to do a split, + // keep split check tick until split is done + if !matches!(self.may_split_size, Some(true)) && !matches!(self.may_split_keys, Some(true)) + { + self.may_skip_split_check = true; + } + } + + pub fn post_split(&mut self) { + self.size_diff_hint = 0; + self.may_split_keys = None; + self.may_split_size = None; + // It's not correct anymore, so set it to false to schedule a split check task. 
+ self.may_skip_split_check = false; + } + + pub fn add_size_diff(&mut self, size_diff: i64) { + let diff = self.size_diff_hint as i64 + size_diff; + self.size_diff_hint = cmp::max(diff, 0) as u64; + } + + pub fn reset_skip_check(&mut self) { + self.may_skip_split_check = false; + } + + pub fn on_clear_region_size(&mut self) { + self.approximate_size = None; + self.approximate_keys = None; + self.may_split_size = None; + self.may_split_keys = None; + self.may_skip_split_check = false; + } + + pub fn on_approximate_region_size(&mut self, size: Option, splitable: Option) { + // If size is none, it means no estimated size + if size.is_some() { + self.approximate_size = size; + } + + if splitable.is_some() { + self.may_split_size = splitable; + } + + // If the region is truly splitable, + // may_skip_split_check should be false + if matches!(splitable, Some(true)) { + self.may_skip_split_check = false; + } + } + + pub fn on_approximate_region_keys(&mut self, keys: Option, splitable: Option) { + // if keys is none, it means no estimated keys + if keys.is_some() { + self.approximate_keys = keys; + } + + if splitable.is_some() { + self.may_split_keys = splitable; + } + + // If the region is truly splitable, + // may_skip_split_check should be false + if matches!(splitable, Some(true)) { + self.may_skip_split_check = false; + } + } + + pub fn on_ingest_sst_result(&mut self, size: u64, keys: u64) { + self.approximate_size = Some(self.approximate_size.unwrap_or_default() + size); + self.approximate_keys = Some(self.approximate_keys.unwrap_or_default() + keys); + + // The ingested file may be overlapped with the data in engine, so we need to + // check it again to get the accurate value. + self.may_skip_split_check = false; + } +} + #[derive(Getters, MutGetters)] pub struct Peer where @@ -657,25 +770,10 @@ where pub peers_start_pending_time: Vec<(u64, Instant)>, /// A inaccurate cache about which peer is marked as down. 
down_peer_ids: Vec, - - /// An inaccurate difference in region size since last reset. - /// It is used to decide whether split check is needed. - pub size_diff_hint: u64, + /// the split check trigger + pub split_check_trigger: SplitCheckTrigger, /// The count of deleted keys since last reset. delete_keys_hint: u64, - /// An inaccurate difference in region size after compaction. - /// It is used to trigger check split to update approximate size and keys - /// after space reclamation of deleted entries. - pub compaction_declined_bytes: u64, - /// Approximate size of the region. - pub approximate_size: Option, - /// Approximate keys of the region. - pub approximate_keys: Option, - /// Whether this region has scheduled a split check task. If we just - /// splitted the region or ingested one file which may be overlapped - /// with the existed data, reset the flag so that the region can be - /// splitted again. - pub may_skip_split_check: bool, /// The state for consistency check. pub consistency_state: ConsistencyState, @@ -861,12 +959,8 @@ where wait_data_peers: Vec::default(), peers_start_pending_time: vec![], down_peer_ids: vec![], - size_diff_hint: 0, + split_check_trigger: SplitCheckTrigger::default(), delete_keys_hint: 0, - approximate_size: None, - approximate_keys: None, - may_skip_split_check: false, - compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, wait_data, @@ -3357,8 +3451,8 @@ where self.peer_stat.written_keys += apply_metrics.written_keys; self.peer_stat.written_bytes += apply_metrics.written_bytes; self.delete_keys_hint += apply_metrics.delete_keys_hint; - let diff = self.size_diff_hint as i64 + apply_metrics.size_diff_hint; - self.size_diff_hint = cmp::max(diff, 0) as u64; + self.split_check_trigger + .add_size_diff(apply_metrics.size_diff_hint); if self.has_pending_snapshot() && self.ready_to_handle_pending_snap() { has_ready = true; @@ -3390,9 +3484,9 @@ where } pub fn post_split(&mut self) { - // Reset 
delete_keys_hint and size_diff_hint. self.delete_keys_hint = 0; - self.size_diff_hint = 0; + self.split_check_trigger.post_split(); + self.reset_region_buckets(); } @@ -5207,8 +5301,8 @@ where pending_peers: self.collect_pending_peers(ctx), written_bytes: self.peer_stat.written_bytes, written_keys: self.peer_stat.written_keys, - approximate_size: self.approximate_size, - approximate_keys: self.approximate_keys, + approximate_size: self.split_check_trigger.approximate_size, + approximate_keys: self.split_check_trigger.approximate_keys, replication_status: self.region_replication_status(ctx), wait_data_peers: self.wait_data_peers.clone(), }); diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 94708e84f7a..e3c0042acf0 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -695,6 +695,19 @@ impl Runner { }; if !split_keys.is_empty() { + // Notify peer that if the region is truly splitable. 
+ // If it's truly splitable, then skip_split_check should be false; + self.router.update_approximate_size( + region.get_id(), + None, + Some(!split_keys.is_empty()), + ); + self.router.update_approximate_keys( + region.get_id(), + None, + Some(!split_keys.is_empty()), + ); + let region_epoch = region.get_region_epoch().clone(); self.router .ask_split(region_id, region_epoch, split_keys, "split checker".into()); @@ -736,6 +749,7 @@ impl Runner { } else { (!host.enable_region_bucket(), &empty_bucket) }; + let mut split_keys = vec![]; MergedIterator::<::Iterator>::new( tablet, LARGE_CFS, start_key, end_key, false, @@ -748,6 +762,7 @@ impl Runner { let mut skip_on_kv = false; while let Some(e) = iter.next() { if skip_on_kv && skip_check_bucket { + split_keys = host.split_keys(); return; } if !skip_on_kv && host.on_kv(region, &e) { @@ -810,6 +825,8 @@ impl Runner { } } + split_keys = host.split_keys(); + // if we scan the whole range, we can update approximate size and keys with // accurate value. 
if is_key_range { @@ -823,8 +840,17 @@ impl Runner { "bucket_count" => buckets.len(), "bucket_size" => bucket_size, ); - self.router.update_approximate_size(region.get_id(), size); - self.router.update_approximate_keys(region.get_id(), keys); + + self.router.update_approximate_size( + region.get_id(), + Some(size), + Some(!split_keys.is_empty()), + ); + self.router.update_approximate_keys( + region.get_id(), + Some(keys), + Some(!split_keys.is_empty()), + ); })?; if host.enable_region_bucket() { @@ -839,7 +865,7 @@ impl Runner { } timer.observe_duration(); - Ok(host.split_keys()) + Ok(split_keys) } fn change_cfg(&mut self, change: ConfigChange) { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index ff47525ea37..5eb7d97796e 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -17,7 +17,7 @@ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatist use engine_test::raft::RaftTestEngine; use engine_traits::{ CfName, CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, - CF_DEFAULT, CF_RAFT, + CF_DEFAULT, CF_RAFT, CF_WRITE, }; use file_system::IoRateLimiter; use futures::{executor::block_on, future::BoxFuture, StreamExt}; @@ -783,6 +783,14 @@ pub fn put_till_size( put_cf_till_size(cluster, CF_DEFAULT, limit, range) } +pub fn put_till_count( + cluster: &mut Cluster, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + put_cf_till_count(cluster, CF_WRITE, limit, range) +} + pub fn put_cf_till_size( cluster: &mut Cluster, cf: &'static str, @@ -815,6 +823,36 @@ pub fn put_cf_till_size( key.into_bytes() } +pub fn put_cf_till_count( + cluster: &mut Cluster, + cf: &'static str, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + assert!(limit > 0); + let mut len = 0; + let mut rng = rand::thread_rng(); + let mut key = String::new(); + let mut value = vec![0; 64]; + while len < limit { + let batch_size = std::cmp::min(5, 
limit - len); + let mut reqs = vec![]; + for _ in 0..batch_size { + key.clear(); + let key_id = range.next().unwrap(); + write!(key, "{:09}", key_id).unwrap(); + rng.fill_bytes(&mut value); + reqs.push(new_put_cf_cmd(cf, key.as_bytes(), &value)); + } + len += batch_size; + cluster.batch_put(key.as_bytes(), reqs).unwrap(); + // Approximate size of memtable is inaccurate for small data, + // we flush it to SST so we can use the size properties instead. + cluster.must_flush_cf(cf, true); + } + key.into_bytes() +} + pub fn new_mutation(op: Op, k: &[u8], v: &[u8]) -> Mutation { let mut mutation = Mutation::default(); mutation.set_op(op); diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index fd351eecbd4..91bd5013c1e 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -83,6 +83,18 @@ macro_rules! trace(($($args:tt)+) => { ::slog_global::trace!($($args)+) };); +/// Logs a infor or debug level message using the slog global logger. +#[macro_export] +macro_rules! 
info_or_debug{ + ($cond:expr; $($args:tt)+) => { + if $cond { + info!($($args)+) + } else { + debug!($($args)+) + } + }; +} + use std::fmt::{self, Display, Write}; use slog::{BorrowedKV, OwnedKVList, Record, KV}; diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 65c50793d7a..2ef3d499d22 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1549,3 +1549,65 @@ fn test_split_region_with_no_valid_split_keys() { rx.recv_timeout(Duration::from_secs(5)).unwrap(); rx.try_recv().unwrap_err(); } + +/// This test case test if a split failed for some reason, +/// it can continue run split check and eventually the split will finish +#[test_case(test_raftstore::new_node_cluster)] +fn test_split_by_split_check_on_size() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_size = 1440; + let region_split_size = 960; + cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // make first split fail + // 1*return means it would run "return" action once + fail::cfg("fail_pre_propose_split", "1*return").unwrap(); + + // Insert region_max_size into the cluster. 
+ // It should trigger the split + let mut range = 1..; + let key = put_till_size(&mut cluster, region_max_size / 2, &mut range); + let region = pd_client.get_region(&key).unwrap(); + put_till_size(&mut cluster, region_max_size / 2 + 100, &mut range); + // waiting the split, + cluster.wait_region_split(®ion); +} + +/// This test case test if a split failed for some reason, +/// it can continue run split check and eventually the split will finish +#[test_case(test_raftstore::new_node_cluster)] +fn test_split_by_split_check_on_keys() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_keys = 15; + let region_split_keys = 10; + cluster.cfg.coprocessor.region_max_keys = Some(region_max_keys); + cluster.cfg.coprocessor.region_split_keys = Some(region_split_keys); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // make first split fail + // 1*return means it would run "return" action once + fail::cfg("fail_pre_propose_split", "1*return").unwrap(); + + // Insert region_max_size into the cluster. 
+ // It should trigger the split + let mut range = 1..; + let key = put_till_count(&mut cluster, region_max_keys / 2, &mut range); + let region = pd_client.get_region(&key).unwrap(); + put_till_count(&mut cluster, region_max_keys / 2 + 3, &mut range); + // waiting the split, + cluster.wait_region_split(®ion); +} diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 0324b57e724..8e957190f7b 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -609,7 +609,7 @@ fn test_node_split_region_after_reboot_with_config_change() { sleep_ms(200); assert_eq!(pd_client.get_split_count(), 0); - // change the config to make the region splittable + // change the config to make the region splitable cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size / 3)); cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size / 3)); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(region_split_size / 3); From e4a3db11a46baf68694a099e3a3199ca542913cd Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 8 Nov 2023 16:39:43 +0800 Subject: [PATCH 109/220] cdc: incremental scans acquire snapshots before semaphores to avoid useless queue (#15865) (#15903) close tikv/tikv#15866 cdc: incremental scans acquire snapshots before semaphores to avoid useless queue Signed-off-by: qupeng Co-authored-by: qupeng --- components/cdc/src/delegate.rs | 7 ++- components/cdc/src/endpoint.rs | 7 +-- components/cdc/src/initializer.rs | 93 ++++++++++--------------------- 3 files changed, 37 insertions(+), 70 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index c82c4cb6f13..780cfe8dea6 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -423,10 +423,15 @@ impl Delegate { downstream.state.store(DownstreamState::Stopped); let error_event = error.clone(); if let 
Err(err) = downstream.sink_error_event(region_id, error_event) { - warn!("cdc broadcast error failed"; + warn!("cdc send region error failed"; "region_id" => region_id, "error" => ?err, "origin_error" => ?error, "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); + } else { + info!("cdc send region error success"; + "region_id" => region_id, "origin_error" => ?error, + "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, + "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); } Ok(()) }; diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index e62650c77c6..82233af8f14 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -809,6 +809,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { CDC_SCAN_TASKS.with_label_values(&["finish"]).inc(); } diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index bd8f5e4e637..2882d2e975e 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -35,7 +35,7 @@ use tikv_kv::Iterator; use tikv_util::{ box_err, codec::number, - debug, error, info, + debug, defer, error, info, memory::MemoryQuota, sys::inspector::{self_thread_inspector, ThreadInspector}, time::{Instant, Limiter}, @@ -90,6 +90,7 @@ pub(crate) struct Initializer { pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, + pub(crate) scan_concurrency_semaphore: Arc, pub(crate) scan_speed_limiter: Limiter, pub(crate) fetch_speed_limiter: Limiter, @@ -109,30 +110,9 @@ impl Initializer { &mut self, change_observer: ChangeObserver, cdc_handle: T, - concurrency_semaphore: Arc, memory_quota: Arc, ) -> Result<()> { fail_point!("cdc_before_initialize"); - let _permit = concurrency_semaphore.acquire().await; - - // When downstream_state is Stopped, it means the corresponding delegate - // is 
stopped. The initialization can be safely canceled. - // - // Acquiring a permit may take some time, it is possible that - // initialization can be canceled. - if self.downstream_state.load() == DownstreamState::Stopped { - info!("cdc async incremental scan canceled"; - "region_id" => self.region_id, - "downstream_id" => ?self.downstream_id, - "observe_id" => ?self.observe_id, - "conn_id" => ?self.conn_id); - return Err(box_err!("scan canceled")); - } - - CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); - tikv_util::defer!({ - CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec(); - }); // To avoid holding too many snapshots and holding them too long, // we need to acquire scan concurrency permit before taking snapshot. @@ -187,8 +167,8 @@ impl Initializer { memory_quota: Arc, ) -> Result<()> { if let Some(region_snapshot) = resp.snapshot { - assert_eq!(self.region_id, region_snapshot.get_region().get_id()); let region = region_snapshot.get_region().clone(); + assert_eq!(self.region_id, region.get_id()); self.async_incremental_scan(region_snapshot, region, memory_quota) .await } else { @@ -208,10 +188,29 @@ impl Initializer { region: Region, memory_quota: Arc, ) -> Result<()> { - let downstream_id = self.downstream_id; + let scan_concurrency_semaphore = self.scan_concurrency_semaphore.clone(); + let _permit = scan_concurrency_semaphore.acquire().await; + CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); + defer!(CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec()); + let region_id = region.get_id(); + let downstream_id = self.downstream_id; let observe_id = self.observe_id; + let conn_id = self.conn_id; let kv_api = self.kv_api; + let on_cancel = || -> Result<()> { + info!("cdc async incremental scan canceled"; + "region_id" => region_id, + "downstream_id" => ?downstream_id, + "observe_id" => ?observe_id, + "conn_id" => ?conn_id); + Err(box_err!("scan canceled")) + }; + + if self.downstream_state.load() == DownstreamState::Stopped { + return on_cancel(); + 
} + self.observed_range.update_region_key_range(®ion); debug!("cdc async incremental scan"; "region_id" => region_id, @@ -260,7 +259,6 @@ impl Initializer { }; fail_point!("cdc_incremental_scan_start"); - let conn_id = self.conn_id; let mut done = false; let start = Instant::now_coarse(); @@ -270,15 +268,6 @@ impl Initializer { DownstreamState::Initializing | DownstreamState::Stopped )); - let on_cancel = || -> Result<()> { - info!("cdc async incremental scan canceled"; - "region_id" => region_id, - "downstream_id" => ?downstream_id, - "observe_id" => ?observe_id, - "conn_id" => ?conn_id); - Err(box_err!("scan canceled")) - }; - while !done { // When downstream_state is Stopped, it means the corresponding // delegate is stopped. The initialization can be safely canceled. @@ -666,6 +655,7 @@ mod tests { conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), + scan_concurrency_semaphore: Arc::new(Semaphore::new(1)), scan_speed_limiter: Limiter::new(scan_limit as _), fetch_speed_limiter: Limiter::new(fetch_limit as _), max_scan_batch_bytes: 1024 * 1024, @@ -1044,51 +1034,26 @@ mod tests { let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); - let concurrency_semaphore = Arc::new(Semaphore::new(1)); let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.initialize( - change_cmd, - raft_router.clone(), - concurrency_semaphore.clone(), - memory_quota.clone(), - )) - .unwrap_err(); - - let (tx, rx) = sync_channel(1); - let concurrency_semaphore_ = concurrency_semaphore.clone(); - pool.spawn(async move { - let _permit = concurrency_semaphore_.acquire().await; - tx.send(()).unwrap(); - tx.send(()).unwrap(); - tx.send(()).unwrap(); - }); - rx.recv_timeout(Duration::from_millis(200)).unwrap(); + block_on(initializer.initialize(change_cmd, raft_router.clone(), memory_quota.clone())) + .unwrap_err(); let 
(tx1, rx1) = sync_channel(1); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); pool.spawn(async move { // Migrated to 2021 migration. This let statement is probably not needed, see // https://doc.rust-lang.org/edition-guide/rust-2021/disjoint-capture-in-closures.html - let _ = ( - &initializer, - &change_cmd, - &raft_router, - &concurrency_semaphore, - ); let res = initializer - .initialize(change_cmd, raft_router, concurrency_semaphore, memory_quota) + .initialize(change_cmd, raft_router, memory_quota) .await; tx1.send(res).unwrap(); }); - // Must timeout because there is no enough permit. - rx1.recv_timeout(Duration::from_millis(200)).unwrap_err(); - // Release the permit - rx.recv_timeout(Duration::from_millis(200)).unwrap(); + // Shouldn't timeout, gets an error instead. let res = rx1.recv_timeout(Duration::from_millis(200)).unwrap(); - res.unwrap_err(); + assert!(res.is_err()); worker.stop(); } From b760000ed14fe267d4f9da97a700a20d8220a03a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 9 Nov 2023 15:05:13 +0800 Subject: [PATCH 110/220] grafana: Fix wrong scheduler command variables of grafana in cloud env (#15833) (#15936) close tikv/tikv#15832 Fix wrong scheduler command variables of grafana in cloud env by adding a `\b` to regex to make sure it's at the word boundary. 
Signed-off-by: Connor1996 Signed-off-by: Neil Shen Co-authored-by: Connor1996 Co-authored-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- metrics/grafana/tikv_details.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index f2654ba3da1..60eead841d7 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -48470,7 +48470,7 @@ "refId": "StandardVariableQuery" }, "refresh": 1, - "regex": "/type=\"([^\"]+)\"/", + "regex": "/\\btype=\"([^\"]+)\"/", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", @@ -48568,4 +48568,4 @@ "title": "Test-Cluster-TiKV-Details", "uid": "RDVQiEzZz", "version": 1 -} \ No newline at end of file +} From 71b0f78c89df358c9a0e23c3353661e2f9b34b0f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 9 Nov 2023 22:29:13 +0800 Subject: [PATCH 111/220] resource_control: add quota limiter per priority (#15918) (#15951) ref tikv/tikv#15917 Signed-off-by: glorv Co-authored-by: glorv --- components/backup/src/endpoint.rs | 2 +- components/resource_control/src/future.rs | 25 +- .../resource_control/src/resource_group.rs | 256 +++++++++++++++++- components/resource_control/src/service.rs | 7 +- components/resource_control/src/worker.rs | 20 +- src/coprocessor/endpoint.rs | 8 + src/import/sst_service.rs | 4 +- src/storage/mod.rs | 18 ++ src/storage/txn/scheduler.rs | 4 + 9 files changed, 319 insertions(+), 25 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index a4efc162092..956455e523e 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -928,7 +928,7 @@ impl Endpoint { let sst_max_size = self.config_manager.0.read().unwrap().sst_max_size.0; let limit = self.softlimit.limit(); let resource_limiter = self.resource_ctl.as_ref().and_then(|r| { - 
r.get_resource_limiter(&request.resource_group_name, &request.source_tag) + r.get_background_resource_limiter(&request.resource_group_name, &request.source_tag) }); self.pool.borrow_mut().spawn(async move { diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs index fd98fc9a092..a935c3b41fa 100644 --- a/components/resource_control/src/future.rs +++ b/components/resource_control/src/future.rs @@ -16,7 +16,7 @@ use tokio_timer::Delay; use crate::{ resource_group::{ResourceConsumeType, ResourceController}, - resource_limiter::ResourceLimiter, + resource_limiter::{ResourceLimiter, ResourceType}, }; const MAX_WAIT_DURATION: Duration = Duration::from_secs(10); @@ -125,13 +125,24 @@ impl Future for LimitedFuture { if this.res.is_ready() { return std::mem::replace(this.res, Poll::Pending); } - let last_io_bytes = match get_thread_io_bytes_stats() { - Ok(b) => Some(b), - Err(e) => { - warn!("load thread io bytes failed"; "err" => e); - None + // get io stats is very expensive, so we only do so if only io control is + // enabled. 
+ let mut last_io_bytes = None; + if this + .resource_limiter + .get_limiter(ResourceType::Io) + .get_rate_limit() + .is_finite() + { + match get_thread_io_bytes_stats() { + Ok(b) => { + last_io_bytes = Some(b); + } + Err(e) => { + warn!("load thread io bytes failed"; "err" => e); + } } - }; + } let start = Instant::now(); let res = this.f.poll(cx); let dur = start.saturating_elapsed(); diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index a4b30e3d4ad..b7e7ca28705 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -22,6 +22,7 @@ use kvproto::{ resource_manager::{GroupMode, ResourceGroup as PbResourceGroup}, }; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; +use strum::{EnumCount, EnumIter, IntoEnumIterator}; use tikv_util::{info, time::Instant}; use yatp::queue::priority::TaskPriorityProvider; @@ -56,21 +57,70 @@ pub enum ResourceConsumeType { IoBytes(u64), } +#[derive(Copy, Clone, Eq, PartialEq, EnumCount, EnumIter)] +#[repr(usize)] +pub enum TaskPriority { + High = 0, + Medium = 1, + Low = 2, +} + +impl TaskPriority { + pub fn as_str(&self) -> &'static str { + match *self { + TaskPriority::High => "high", + TaskPriority::Medium => "medium", + TaskPriority::Low => "low", + } + } +} + +impl From for TaskPriority { + fn from(value: u32) -> Self { + // map the resource group priority value (1,8,16) to (Low,Medium,High) + if value < 6 { + Self::Low + } else if value < 11 { + Self::Medium + } else { + Self::High + } + } +} + /// ResourceGroupManager manages the metadata of each resource group. pub struct ResourceGroupManager { pub(crate) resource_groups: DashMap, + // the count of all groups, a fast path because call `DashMap::len` is a little slower. + group_count: AtomicU64, registry: RwLock>>, // auto incremental version generator used for mark the background // resource limiter has changed. 
version_generator: AtomicU64, + // the shared resource limiter of each priority + priority_limiters: [Arc; TaskPriority::COUNT], } impl Default for ResourceGroupManager { fn default() -> Self { + let priority_limiters = TaskPriority::iter() + .map(|p| { + Arc::new(ResourceLimiter::new( + p.as_str().to_owned(), + f64::INFINITY, + f64::INFINITY, + 0, + )) + }) + .collect::>() + .try_into() + .unwrap(); let manager = Self { resource_groups: Default::default(), + group_count: AtomicU64::new(0), registry: Default::default(), version_generator: AtomicU64::new(0), + priority_limiters, }; // init the default resource group by default. @@ -90,6 +140,11 @@ impl Default for ResourceGroupManager { } impl ResourceGroupManager { + #[inline] + pub fn get_group_count(&self) -> u64 { + self.group_count.load(Ordering::Relaxed) + } + fn get_ru_setting(rg: &PbResourceGroup, is_read: bool) -> u64 { match (rg.get_mode(), is_read) { // RU mode, read and write use the same setting. @@ -129,8 +184,13 @@ impl ResourceGroupManager { .and_then(|g| g.limiter.clone()); let limiter = self.build_resource_limiter(&rg, prev_limiter); - self.resource_groups - .insert(group_name, ResourceGroup::new(rg, limiter)); + if self + .resource_groups + .insert(group_name, ResourceGroup::new(rg, limiter)) + .is_none() + { + self.group_count.fetch_add(1, Ordering::Relaxed); + } } fn build_resource_limiter( @@ -161,6 +221,7 @@ impl ResourceGroupManager { if self.resource_groups.remove(&group_name).is_some() { deregister_metrics(name); info!("remove resource group"; "name"=> name); + self.group_count.fetch_sub(1, Ordering::Relaxed); } } @@ -184,6 +245,8 @@ impl ResourceGroupManager { controller.remove_resource_group(name.as_bytes()); } }); + self.group_count + .fetch_sub(removed_names.len() as u64, Ordering::Relaxed); } } @@ -234,24 +297,79 @@ impl ResourceGroupManager { } } + // only enable priority quota limiter when there is at least 1 user-defined + // resource group. 
+ #[inline] + fn enable_priority_limiter(&self) -> bool { + self.get_group_count() > 1 + } + + // Always return the background resource limiter if any; + // Only return the foregroup limiter when priority is enabled. pub fn get_resource_limiter( &self, rg: &str, request_source: &str, + override_priority: u64, + ) -> Option> { + let (limiter, group_priority) = + self.get_background_resource_limiter_with_priority(rg, request_source); + if limiter.is_some() { + return limiter; + } + + // if there is only 1 resource group, priority quota limiter is useless so just + // return None for better performance. + if !self.enable_priority_limiter() { + return None; + } + + // request priority has higher priority, 0 means priority is not set. + let mut task_priority = override_priority as u32; + if task_priority == 0 { + task_priority = group_priority; + } + Some(self.priority_limiters[TaskPriority::from(task_priority) as usize].clone()) + } + + // return a ResourceLimiter for background tasks only. 
+ pub fn get_background_resource_limiter( + &self, + rg: &str, + request_source: &str, ) -> Option> { + self.get_background_resource_limiter_with_priority(rg, request_source) + .0 + } + + fn get_background_resource_limiter_with_priority( + &self, + rg: &str, + request_source: &str, + ) -> (Option>, u32) { fail_point!("only_check_source_task_name", |name| { assert_eq!(&name.unwrap(), request_source); - None + (None, 8) }); + let mut group_priority = None; if let Some(group) = self.resource_groups.get(rg) { + group_priority = Some(group.group.priority); if !group.fallback_default { - return group.get_resource_limiter(request_source); + return ( + group.get_background_resource_limiter(request_source), + group.group.priority, + ); } } - self.resource_groups + let default_group = self + .resource_groups .get(DEFAULT_RESOURCE_GROUP_NAME) - .and_then(|g| g.get_resource_limiter(request_source)) + .unwrap(); + ( + default_group.get_background_resource_limiter(request_source), + group_priority.unwrap_or(default_group.group.priority), + ) } } @@ -286,7 +404,10 @@ impl ResourceGroup { .get_fill_rate() } - fn get_resource_limiter(&self, request_source: &str) -> Option> { + fn get_background_resource_limiter( + &self, + request_source: &str, + ) -> Option> { self.limiter.as_ref().and_then(|limiter| { // the source task name is the last part of `request_source` separated by "_" // the request_source is @@ -871,6 +992,35 @@ pub(crate) mod tests { ); } + #[test] + fn test_resource_group_crud() { + let resource_manager = ResourceGroupManager::default(); + assert_eq!(resource_manager.get_group_count(), 1); + + let group1 = new_resource_group_ru("test1".into(), 100, HIGH_PRIORITY); + resource_manager.add_resource_group(group1); + assert_eq!(resource_manager.get_group_count(), 2); + + let group2 = new_resource_group_ru("test2".into(), 200, LOW_PRIORITY); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.get_group_count(), 3); + + let group1 = 
new_resource_group_ru("test1".into(), 150, HIGH_PRIORITY); + resource_manager.add_resource_group(group1.clone()); + assert_eq!(resource_manager.get_group_count(), 3); + assert_eq!( + resource_manager.get_resource_group("test1").unwrap().group, + group1 + ); + + resource_manager.remove_resource_group("test2"); + assert!(resource_manager.get_resource_group("test2").is_none()); + assert_eq!(resource_manager.get_group_count(), 2); + + resource_manager.remove_resource_group("test2"); + assert_eq!(resource_manager.get_group_count(), 2); + } + #[test] fn test_resource_group_priority() { let resource_manager = ResourceGroupManager::default(); @@ -1165,4 +1315,96 @@ pub(crate) mod tests { assert_eq!(metadata1.group_name(), group_name.as_bytes()); } } + + #[test] + fn test_get_resource_limiter() { + let mgr = ResourceGroupManager::default(); + + let default_group = new_background_resource_group_ru( + "default".into(), + 200, + MEDIUM_PRIORITY, + vec!["br".into(), "stats".into()], + ); + mgr.add_resource_group(default_group); + let default_limiter = mgr + .get_resource_group("default") + .unwrap() + .limiter + .clone() + .unwrap(); + + assert!(mgr.get_resource_limiter("default", "query", 0).is_none()); + assert!( + mgr.get_resource_limiter("default", "query", HIGH_PRIORITY as u64) + .is_none() + ); + + let group1 = new_resource_group("test1".into(), true, 100, 100, HIGH_PRIORITY); + mgr.add_resource_group(group1); + + let bg_group = new_background_resource_group_ru( + "bg".into(), + 50, + LOW_PRIORITY, + vec!["ddl".into(), "stats".into()], + ); + mgr.add_resource_group(bg_group); + let bg_limiter = mgr + .get_resource_group("bg") + .unwrap() + .limiter + .clone() + .unwrap(); + + assert!( + mgr.get_background_resource_limiter("test1", "ddl") + .is_none() + ); + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("test1", "stats") + .unwrap(), + &default_limiter + )); + + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("bg", "stats").unwrap(), + &bg_limiter 
+ )); + assert!(mgr.get_background_resource_limiter("bg", "br").is_none()); + assert!( + mgr.get_background_resource_limiter("bg", "invalid") + .is_none() + ); + + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("unknown", "stats") + .unwrap(), + &default_limiter + )); + + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "stats", 0).unwrap(), + &default_limiter + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "query", 0).unwrap(), + &mgr.priority_limiters[0] + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "query", LOW_PRIORITY as u64) + .unwrap(), + &mgr.priority_limiters[2] + )); + + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("default", "query", LOW_PRIORITY as u64) + .unwrap(), + &mgr.priority_limiters[2] + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("unknown", "query", 0).unwrap(), + &mgr.priority_limiters[1] + )); + } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index 5ecac9d74c4..2c2bbdc5549 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -565,7 +565,10 @@ pub mod tests { s_clone.report_ru_metrics().await; }); // Mock consume. 
- let bg_limiter = s.manager.get_resource_limiter("background", "br").unwrap(); + let bg_limiter = s + .manager + .get_background_resource_limiter("background", "br") + .unwrap(); bg_limiter.consume( Duration::from_secs(2), IoBytes { @@ -584,7 +587,7 @@ pub mod tests { s.manager.add_resource_group(background_group); let new_bg_limiter = s .manager - .get_resource_limiter("background", "lightning") + .get_background_resource_limiter("background", "lightning") .unwrap(); new_bg_limiter.consume( Duration::from_secs(5), diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs index deb1b2e44de..7bc76691e1f 100644 --- a/components/resource_control/src/worker.rs +++ b/components/resource_control/src/worker.rs @@ -340,7 +340,11 @@ mod tests { let resource_ctl = Arc::new(ResourceGroupManager::default()); let rg1 = new_resource_group_ru("test".into(), 1000, 14); resource_ctl.add_resource_group(rg1); - assert!(resource_ctl.get_resource_limiter("test", "br").is_none()); + assert!( + resource_ctl + .get_background_resource_limiter("test", "br") + .is_none() + ); let test_provider = TestResourceStatsProvider::new(8.0, 10000.0); let mut worker = @@ -351,10 +355,12 @@ mod tests { resource_ctl.add_resource_group(default_bg); assert!( resource_ctl - .get_resource_limiter("default", "lightning") + .get_background_resource_limiter("default", "lightning") .is_none() ); - let limiter = resource_ctl.get_resource_limiter("default", "br").unwrap(); + let limiter = resource_ctl + .get_background_resource_limiter("default", "br") + .unwrap(); assert!( limiter .get_limiter(ResourceType::Cpu) @@ -513,13 +519,15 @@ mod tests { let default = new_background_resource_group_ru("default".into(), 2000, 8, vec!["br".into()]); resource_ctl.add_resource_group(default); - let new_limiter = resource_ctl.get_resource_limiter("default", "br").unwrap(); + let new_limiter = resource_ctl + .get_background_resource_limiter("default", "br") + .unwrap(); 
assert_eq!(&*new_limiter as *const _, &*limiter as *const _); let bg = new_background_resource_group_ru("background".into(), 1000, 15, vec!["br".into()]); resource_ctl.add_resource_group(bg); let bg_limiter = resource_ctl - .get_resource_limiter("background", "br") + .get_background_resource_limiter("background", "br") .unwrap(); reset_quota(&mut worker, 5.0, 7000.0, Duration::from_secs(1)); @@ -581,7 +589,7 @@ mod tests { new_background_resource_group_ru("background".into(), 1000, 15, vec!["br".into()]); resource_ctl.add_resource_group(new_bg); let new_bg_limiter = resource_ctl - .get_resource_limiter("background", "br") + .get_background_resource_limiter("background", "br") .unwrap(); assert_ne!(&*bg_limiter as *const _, &*new_bg_limiter as *const _); assert!( diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 7a12c7493e5..521e5a8e2cd 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -511,6 +511,10 @@ impl Endpoint { .get_resource_control_context() .get_resource_group_name(), req_ctx.context.get_request_source(), + req_ctx + .context + .get_resource_control_context() + .get_override_priority(), ) }); // box the tracker so that moving it is cheap. @@ -756,6 +760,10 @@ impl Endpoint { .get_resource_control_context() .get_resource_group_name(), req_ctx.context.get_request_source(), + req_ctx + .context + .get_resource_control_context() + .get_override_priority(), ) }); let key_ranges = req_ctx diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 92e73ca9f8f..2dc4f76b194 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -751,7 +751,7 @@ macro_rules! 
impl_write { let (meta, resource_limiter) = match first_req { Some(r) => { let limiter = resource_manager.as_ref().and_then(|m| { - m.get_resource_limiter( + m.get_background_resource_limiter( r.get_context() .get_resource_control_context() .get_resource_group_name(), @@ -1060,7 +1060,7 @@ impl ImportSst for ImportSstService { let tablets = self.tablets.clone(); let start = Instant::now(); let resource_limiter = self.resource_manager.as_ref().and_then(|r| { - r.get_resource_limiter( + r.get_background_resource_limiter( req.get_context() .get_resource_control_context() .get_resource_group_name(), diff --git a/src/storage/mod.rs b/src/storage/mod.rs index cc48d9e36e3..c0d6e6fc4a3 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -609,6 +609,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -782,6 +783,10 @@ impl Storage { .get_resource_control_context() .get_resource_group_name(), requests[0].get_context().get_request_source(), + requests[0] + .get_context() + .get_resource_control_context() + .get_override_priority(), ) }); let concurrency_manager = self.concurrency_manager.clone(); @@ -978,6 +983,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1170,6 +1176,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1346,6 +1353,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + 
ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1662,6 +1670,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1754,6 +1763,10 @@ impl Storage { .get_resource_control_context() .get_resource_group_name(), gets[0].get_context().get_request_source(), + gets[0] + .get_context() + .get_resource_control_context() + .get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1893,6 +1906,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2399,6 +2413,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2536,6 +2551,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2698,6 +2714,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2879,6 +2896,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); diff --git 
a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 36492f22701..995c361e163 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1236,6 +1236,10 @@ impl TxnScheduler { .get_resource_control_context() .get_resource_group_name(), task.cmd.ctx().get_request_source(), + task.cmd + .ctx() + .get_resource_control_context() + .get_override_priority(), ) }); let mut sample = quota_limiter.new_sample(true); From ba575397c5bbcbb161e44eb0c44b9e18f44c9085 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 10 Nov 2023 11:44:12 +0800 Subject: [PATCH 112/220] alert: tackle the false-postive case where alerting `gc not work`. (#15948) (#15957) close tikv/tikv#15796 Signed-off-by: lucasliang Co-authored-by: lucasliang --- metrics/alertmanager/tikv.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index e43ca401d42..a4de231de7b 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -15,7 +15,7 @@ groups: - alert: TiKV_GC_can_not_work expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 and (sum(increase(tikv_gc_compaction_filter_perform[1d])) < 1 and sum(increase(tikv_engine_event_total{db="kv", cf="write", type="compaction"}[1d])) >= 1) - for: 1m + for: 5m labels: env: ENV_LABELS_ENV level: emergency From c0d7a1bc39cd4dfea9ce6a6f0f58bfcc92cdad88 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 10 Nov 2023 12:09:42 +0800 Subject: [PATCH 113/220] server: Introduce heap profiling config (#15883) (#15959) close tikv/tikv#15958 introduce heap profiling config Signed-off-by: Connor1996 Co-authored-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- cmd/tikv-server/src/main.rs | 3 + components/raftstore/src/store/config.rs | 23 ++- components/server/src/server.rs | 5 +- components/server/src/server2.rs | 3 +- 
components/server/src/setup.rs | 2 - components/tikv_alloc/src/default.rs | 9 ++ components/tikv_alloc/src/jemalloc.rs | 69 +++++++-- etc/config-template.toml | 12 ++ src/config/mod.rs | 142 ++++++++++++++++-- src/server/config.rs | 13 +- src/server/status_server/mod.rs | 1 + src/server/status_server/profile.rs | 12 +- tests/integrations/config/mod.rs | 17 +-- .../config/test-cache-compatible.toml | 2 + tests/integrations/config/test-custom.toml | 8 +- tests/integrations/config/test-default.toml | 2 + 16 files changed, 256 insertions(+), 67 deletions(-) diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 9fdcad81c58..3f4372c32cc 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -217,6 +217,9 @@ fn main() { process::exit(1) } + // Init memory related settings. + config.memory.init(); + let (service_event_tx, service_event_rx) = tikv_util::mpsc::unbounded(); // pipe for controling service match config.storage.engine { EngineType::RaftKv => server::server::run_tikv(config, service_event_tx, service_event_rx), diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 95c4aed9349..81009dd5d59 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -104,12 +104,11 @@ pub struct Config { pub max_manual_flush_rate: f64, // When a peer is not responding for this time, leader will not keep entry cache for it. pub raft_entry_cache_life_time: ReadableDuration, - // Deprecated! The configuration has no effect. - // They are preserved for compatibility check. // When a peer is newly added, reject transferring leader to the peer for a while. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. It has no effect"] pub raft_reject_transfer_leader_duration: ReadableDuration, /// Whether to disable checking quorum for the raft group. 
This will make @@ -320,27 +319,26 @@ pub struct Config { pub io_reschedule_concurrent_max_count: usize, pub io_reschedule_hotpot_duration: ReadableDuration, - // Deprecated! Batch is done in raft client. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. Batch is done in raft client."] pub raft_msg_flush_interval: ReadableDuration, - // Deprecated! These configuration has been moved to Coprocessor. - // They are preserved for compatibility check. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to coprocessor.region_max_size."] pub region_max_size: ReadableSize, #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to coprocessor.region_split_size."] pub region_split_size: ReadableSize, - // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot - // sequence number. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. The time to clean stale peer safely can be decided based on RocksDB snapshot sequence number."] pub clean_stale_peer_delay: ReadableDuration, // Interval to inspect the latency of raftstore for slow store detection. 
@@ -400,6 +398,7 @@ pub struct Config { } impl Default for Config { + #[allow(deprecated)] fn default() -> Config { Config { prevote: true, diff --git a/components/server/src/server.rs b/components/server/src/server.rs index a4b6276a587..006750fd518 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -75,7 +75,9 @@ use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use snap_recovery::RecoveryService; use tikv::{ - config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, + config::{ + ConfigController, DbConfigManger, DbType, LogConfigManager, MemoryConfigManager, TikvConfig, + }, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, @@ -506,6 +508,7 @@ where ); cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + cfg_controller.register(tikv::config::Module::Memory, Box::new(MemoryConfigManager)); // Create cdc. 
let mut cdc_worker = Box::new(LazyWorker::new("cdc")); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 65d02f58c08..fdbb18b6205 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -73,7 +73,7 @@ use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use tikv::{ config::{ loop_registry, ConfigController, ConfigurableDb, DbConfigManger, DbType, LogConfigManager, - TikvConfig, + MemoryConfigManager, TikvConfig, }, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, @@ -441,6 +441,7 @@ where ); cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + cfg_controller.register(tikv::config::Module::Memory, Box::new(MemoryConfigManager)); let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); cfg_controller.register( diff --git a/components/server/src/setup.rs b/components/server/src/setup.rs index b758b9e39df..b11ffbc45b6 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -245,12 +245,10 @@ pub fn initial_metric(cfg: &MetricConfig) { pub fn overwrite_config_with_cmd_args(config: &mut TikvConfig, matches: &ArgMatches<'_>) { if let Some(level) = matches.value_of("log-level") { config.log.level = logger::get_level_by_string(level).unwrap().into(); - config.log_level = slog::Level::Info.into(); } if let Some(file) = matches.value_of("log-file") { config.log.file.filename = file.to_owned(); - config.log_file = "".to_owned(); } if let Some(addr) = matches.value_of("addr") { diff --git a/components/tikv_alloc/src/default.rs b/components/tikv_alloc/src/default.rs index 2674331c3cd..5133d76e172 100644 --- a/components/tikv_alloc/src/default.rs +++ b/components/tikv_alloc/src/default.rs @@ -8,6 +8,7 @@ use crate::AllocStats; pub fn dump_stats() -> String { String::new() } + pub fn dump_prof(_path: &str) -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } @@ -24,6 +25,14 @@ pub fn 
deactivate_prof() -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } +pub fn set_prof_sample(_rate: u64) -> ProfResult<()> { + Err(ProfError::MemProfilingNotEnabled) +} + +pub fn is_profiling_active() -> bool { + false +} + /// # Safety /// /// It is safe. The unsafe marker is just for matching the function signature. diff --git a/components/tikv_alloc/src/jemalloc.rs b/components/tikv_alloc/src/jemalloc.rs index 876afa9fcd5..245f6280b71 100644 --- a/components/tikv_alloc/src/jemalloc.rs +++ b/components/tikv_alloc/src/jemalloc.rs @@ -133,7 +133,7 @@ pub fn remove_thread_memory_accessor() { use std::thread::ThreadId; -pub use self::profiling::{activate_prof, deactivate_prof, dump_prof}; +pub use self::profiling::*; pub fn dump_stats() -> String { let mut buf = Vec::with_capacity(1024); @@ -311,6 +311,21 @@ mod profiling { // C string should end with a '\0'. const PROF_ACTIVE: &[u8] = b"prof.active\0"; const PROF_DUMP: &[u8] = b"prof.dump\0"; + const PROF_RESET: &[u8] = b"prof.reset\0"; + const OPT_PROF: &[u8] = b"opt.prof\0"; + + pub fn set_prof_sample(rate: u64) -> ProfResult<()> { + let rate = (rate as f64).log2().ceil() as usize; + unsafe { + if let Err(e) = tikv_jemalloc_ctl::raw::write(PROF_RESET, rate) { + return Err(ProfError::JemallocError(format!( + "failed to set prof sample: {}", + e + ))); + } + } + Ok(()) + } pub fn activate_prof() -> ProfResult<()> { unsafe { @@ -351,22 +366,44 @@ mod profiling { Ok(()) } + pub fn is_profiling_active() -> bool { + match unsafe { tikv_jemalloc_ctl::raw::read(PROF_ACTIVE) } { + Err(e) => { + panic!("is_profiling_active: {:?}", e); + } + Ok(prof) => prof, + } + } + + pub fn is_profiling_enabled() -> bool { + match unsafe { tikv_jemalloc_ctl::raw::read(OPT_PROF) } { + Err(e) => { + // Shouldn't be possible since mem-profiling is set + panic!("is_profiling_enabled: {:?}", e); + } + Ok(prof) => prof, + } + } + #[cfg(test)] mod tests { use std::fs; use tempfile::Builder; - const OPT_PROF: &[u8] = b"opt.prof\0"; + 
use super::*; - fn is_profiling_on() -> bool { - match unsafe { tikv_jemalloc_ctl::raw::read(OPT_PROF) } { - Err(e) => { - // Shouldn't be possible since mem-profiling is set - panic!("is_profiling_on: {:?}", e); - } - Ok(prof) => prof, - } + #[test] + #[ignore = "#ifdef MALLOC_CONF"] + fn test_profiling_active() { + // Make sure somebody has turned on profiling + assert!(is_profiling_enabled(), "set MALLOC_CONF=prof:true"); + activate_prof().unwrap(); + assert!(is_profiling_active()); + deactivate_prof().unwrap(); + assert!(!is_profiling_active()); + + super::set_prof_sample(512 * 1024 * 1024).unwrap(); } // Only trigger this test with jemallocs `opt.prof` set to @@ -382,7 +419,7 @@ mod profiling { #[ignore = "#ifdef MALLOC_CONF"] fn test_profiling_memory_ifdef_malloc_conf() { // Make sure somebody has turned on profiling - assert!(is_profiling_on(), "set MALLOC_CONF=prof:true"); + assert!(is_profiling_enabled(), "set MALLOC_CONF=prof:true"); let dir = Builder::new() .prefix("test_profiling_memory") @@ -391,11 +428,11 @@ mod profiling { let os_path = dir.path().to_path_buf().join("test1.dump").into_os_string(); let path = os_path.into_string().unwrap(); - super::dump_prof(&path).unwrap(); + dump_prof(&path).unwrap(); let os_path = dir.path().to_path_buf().join("test2.dump").into_os_string(); let path = os_path.into_string().unwrap(); - super::dump_prof(&path).unwrap(); + dump_prof(&path).unwrap(); let files = fs::read_dir(dir.path()).unwrap().count(); assert_eq!(files, 2); @@ -431,4 +468,10 @@ mod profiling { pub fn deactivate_prof() -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } + pub fn set_prof_sample(_rate: u64) -> ProfResult<()> { + Err(ProfError::MemProfilingNotEnabled) + } + pub fn is_profiling_active() -> bool { + false + } } diff --git a/etc/config-template.toml b/etc/config-template.toml index 3c8a6015910..3e55004feb2 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -83,6 +83,18 @@ ## maximum number of old log files 
to retain # max-backups = 0 +[memory] +## Whether enable the heap profiling which may have a bit performance overhead about 2% for the +## default sample rate. +# enable-heap-profiling = true + +## Average interval between allocation samples, as measured in bytes of allocation activity. +## Increasing the sampling interval decreases profile fidelity, but also decreases the +## computational overhead. +## The default sample interval is 512 KB. It only accepts power of two, otherwise it will be +## rounded up to the next power of two. +# profiling-sample-per-bytes = "512KB" + ## Configurations for the single thread pool serving read requests. [readpool.unified] ## The minimal working thread count of the thread pool. diff --git a/src/config/mod.rs b/src/config/mod.rs index 237ac3c7a72..b192a7ac5f7 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -82,6 +82,7 @@ use crate::{ server::{ gc_worker::{GcConfig, RawCompactionFilterFactory, WriteCompactionFilterFactory}, lock_manager::Config as PessimisticTxnConfig, + status_server::HEAP_PROFILE_ACTIVE, ttl::TtlCompactionFilterFactory, Config as ServerConfig, CONFIG_ROCKSDB_GAUGE, }, @@ -1263,10 +1264,10 @@ pub struct DbConfig { #[serde(with = "rocks_config::rate_limiter_mode_serde")] #[online_config(skip)] pub rate_limiter_mode: DBRateLimiterMode, - // deprecated. use rate_limiter_auto_tuned. - #[online_config(skip)] + #[online_config(hidden)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "The configuration has been removed. Use `rate_limiter_auto_tuned` instead"] pub auto_tuned: Option, pub rate_limiter_auto_tuned: bool, pub bytes_per_sync: ReadableSize, @@ -1318,6 +1319,7 @@ pub struct DbResources { } impl Default for DbConfig { + #[allow(deprecated)] fn default() -> DbConfig { DbConfig { wal_recovery_mode: DBRecoveryMode::PointInTime, @@ -2965,13 +2967,15 @@ pub struct CdcConfig { pub old_value_cache_memory_quota: ReadableSize, // Deprecated! preserved for compatibility check. 
- #[online_config(skip)] + #[online_config(hidden)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "The configuration has been removed."] pub old_value_cache_size: usize, } impl Default for CdcConfig { + #[allow(deprecated)] fn default() -> Self { Self { min_ts_interval: ReadableDuration::secs(1), @@ -3211,6 +3215,72 @@ impl ConfigManager for LogConfigManager { } } +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct MemoryConfig { + // Whether enable the heap profiling which may have a bit performance overhead about 2% for the + // default sample rate. + pub enable_heap_profiling: bool, + + // Average interval between allocation samples, as measured in bytes of allocation activity. + // Increasing the sampling interval decreases profile fidelity, but also decreases the + // computational overhead. + // The default sample interval is 512 KB. It only accepts power of two, otherwise it will be + // rounded up to the next power of two. 
+ pub profiling_sample_per_bytes: ReadableSize, +} + +impl Default for MemoryConfig { + fn default() -> Self { + Self { + enable_heap_profiling: true, + profiling_sample_per_bytes: ReadableSize::kb(512), + } + } +} + +impl MemoryConfig { + pub fn init(&self) { + if self.enable_heap_profiling { + let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); + if let Err(e) = tikv_alloc::activate_prof() { + error!("failed to enable heap profiling"; "err" => ?e); + return; + } + *activate = Some(None); + tikv_alloc::set_prof_sample(self.profiling_sample_per_bytes.0).unwrap(); + } + } +} + +pub struct MemoryConfigManager; + +impl ConfigManager for MemoryConfigManager { + fn dispatch(&mut self, changes: ConfigChange) -> CfgResult<()> { + if let Some(ConfigValue::Bool(enable)) = changes.get("enable_heap_profiling") { + if *enable { + let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); + // already enabled by HTTP API, do nothing + if activate.is_none() { + tikv_alloc::activate_prof()?; + *activate = Some(None); + } + } else { + let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); + tikv_alloc::deactivate_prof()?; + *activate = None; + } + } + + if let Some(ConfigValue::Size(sample_rate)) = changes.get("profiling_sample_per_bytes") { + tikv_alloc::set_prof_sample(*sample_rate).unwrap(); + } + info!("update memory config"; "config" => ?changes); + Ok(()) + } +} + #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -3261,21 +3331,29 @@ pub struct TikvConfig { #[online_config(hidden)] pub cfg_path: String, - // Deprecated! These configuration has been moved to LogConfig. - // They are preserved for compatibility check. 
#[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.level."] pub log_level: LogLevel, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.filename."] pub log_file: String, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.format."] pub log_format: LogFormat, - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.max_days."] pub log_rotation_timespan: ReadableDuration, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.max_size."] pub log_rotation_size: ReadableSize, #[online_config(skip)] @@ -3306,6 +3384,9 @@ pub struct TikvConfig { #[online_config(submodule)] pub log: LogConfig, + #[online_config(submodule)] + pub memory: MemoryConfig, + #[online_config(submodule)] pub quota: QuotaConfig, @@ -3383,6 +3464,7 @@ pub struct TikvConfig { } impl Default for TikvConfig { + #[allow(deprecated)] fn default() -> TikvConfig { TikvConfig { cfg_path: "".to_owned(), @@ -3399,6 +3481,7 @@ impl Default for TikvConfig { memory_usage_limit: None, memory_usage_high_water: 0.9, log: LogConfig::default(), + memory: MemoryConfig::default(), quota: QuotaConfig::default(), readpool: ReadPoolConfig::default(), server: ServerConfig::default(), @@ -3777,6 +3860,7 @@ impl TikvConfig { // As the init of `logger` is very early, this adjust needs to be separated and // called immediately after parsing the command line. 
+ #[allow(deprecated)] pub fn logger_compatible_adjust(&mut self) { let default_tikv_cfg = TikvConfig::default(); let default_log_cfg = LogConfig::default(); @@ -3828,6 +3912,7 @@ impl TikvConfig { } } + #[allow(deprecated)] pub fn compatible_adjust(&mut self) { let default_raft_store = RaftstoreConfig::default(); let default_coprocessor = CopConfig::default(); @@ -4435,6 +4520,7 @@ pub enum Module { BackupStream, Quota, Log, + Memory, Unknown(String), } @@ -4463,6 +4549,7 @@ impl From<&str> for Module { "resource_metering" => Module::ResourceMetering, "quota" => Module::Quota, "log" => Module::Log, + "memory" => Module::Memory, n => Module::Unknown(n.to_owned()), } } @@ -4766,7 +4853,7 @@ mod tests { assert_eq!(last_cfg_metadata.modified().unwrap(), first_modified); // write to file when config is the inequivalent of last one. - cfg.log_level = slog::Level::Warning.into(); + cfg.log.level = slog::Level::Warning.into(); persist_config(&cfg).unwrap(); last_cfg_metadata = last_cfg_path.metadata().unwrap(); assert_ne!(last_cfg_metadata.modified().unwrap(), first_modified); @@ -5364,7 +5451,7 @@ mod tests { } #[test] - fn test_change_logconfig() { + fn test_change_log_config() { let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); @@ -5386,6 +5473,37 @@ mod tests { ); } + #[test] + #[cfg(feature = "mem-profiling")] + fn test_change_memory_config() { + let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); + let cfg_controller = ConfigController::new(cfg); + + cfg_controller.register(Module::Memory, Box::new(MemoryConfigManager)); + cfg_controller + .update_config("memory.enable_heap_profiling", "false") + .unwrap(); + assert_eq!(tikv_alloc::is_profiling_active(), false); + cfg_controller + .update_config("memory.enable_heap_profiling", "true") + .unwrap(); + assert_eq!(tikv_alloc::is_profiling_active(), true); + + cfg_controller + .update_config("memory.profiling_sample_per_bytes", "1MB") + .unwrap(); + assert_eq!( + 
cfg_controller + .get_current() + .memory + .profiling_sample_per_bytes, + ReadableSize::mb(1), + ); + cfg_controller + .update_config("memory.profiling_sample_per_bytes", "invalid") + .unwrap_err(); + } + #[test] fn test_dispatch_titan_blob_run_mode_config() { let mut cfg = TikvConfig::default(); diff --git a/src/server/config.rs b/src/server/config.rs index 013d1a66238..4e66e5802c0 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -187,26 +187,27 @@ pub struct Config { #[online_config(skip)] pub labels: HashMap, - // deprecated. use readpool.coprocessor.xx_concurrency. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.*_concurrency."] pub end_point_concurrency: Option, - // deprecated. use readpool.coprocessor.stack_size. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.stack_size."] pub end_point_stack_size: Option, - // deprecated. use readpool.coprocessor.max_tasks_per_worker_xx. 
#[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.max_tasks_per_worker_*."] pub end_point_max_tasks: Option, } impl Default for Config { + #[allow(deprecated)] fn default() -> Config { let cpu_num = SysQuota::cpu_cores_quota(); let background_thread_count = if cpu_num > 16.0 { 3 } else { 2 }; diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index b76454ffab8..60b267a6d94 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -40,6 +40,7 @@ use openssl::{ x509::X509, }; use pin_project::pin_project; +pub use profile::HEAP_PROFILE_ACTIVE; use profile::*; use prometheus::TEXT_FORMAT; use regex::Regex; diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index 3941c6c12b6..dbf819b35fe 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -37,7 +37,7 @@ lazy_static! { // If it's some it means there are already a CPU profiling. static ref CPU_PROFILE_ACTIVE: Mutex> = Mutex::new(None); // If it's some it means there are already a heap profiling. The channel is used to deactivate a profiling. - static ref HEAP_PROFILE_ACTIVE: Mutex>, TempDir)>> = Mutex::new(None); + pub static ref HEAP_PROFILE_ACTIVE: Mutex, TempDir)>>> = Mutex::new(None); // To normalize thread names. 
static ref THREAD_NAME_RE: Regex = @@ -129,7 +129,7 @@ where let on_start = move || { let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); assert!(activate.is_none()); - *activate = Some((Some(tx), dir)); + *activate = Some(Some((tx, dir))); activate_prof().map_err(|e| format!("activate_prof: {}", e))?; callback(); info!("periodical heap profiling is started"); @@ -168,9 +168,11 @@ where pub fn deactivate_heap_profile() -> bool { let mut activate = HEAP_PROFILE_ACTIVE.lock().unwrap(); match activate.as_mut() { - Some((tx, _)) => { - if let Some(tx) = tx.take() { + Some(tx) => { + if let Some((tx, _)) = tx.take() { let _ = tx.send(()); + } else { + *activate = None; } true } @@ -277,7 +279,7 @@ pub fn heap_profiles_dir() -> Option { .lock() .unwrap() .as_ref() - .map(|(_, dir)| dir.path().to_owned()) + .and_then(|v| v.as_ref().map(|(_, dir)| dir.path().to_owned())) } pub fn list_heap_profiles() -> Result, String> { diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 639e05a02c3..8a1def7d7e7 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -65,7 +65,6 @@ fn read_file_in_project_dir(path: &str) -> String { #[test] fn test_serde_custom_tikv_config() { let mut value = TikvConfig::default(); - value.log_rotation_timespan = ReadableDuration::days(1); value.log.level = Level::Critical.into(); value.log.file.filename = "foo".to_owned(); value.log.format = LogFormat::Json; @@ -77,6 +76,8 @@ fn test_serde_custom_tikv_config() { value.abort_on_panic = true; value.memory_usage_limit = Some(ReadableSize::gb(10)); value.memory_usage_high_water = 0.65; + value.memory.enable_heap_profiling = false; + value.memory.profiling_sample_per_bytes = ReadableSize::mb(1); value.server = ServerConfig { cluster_id: 0, // KEEP IT ZERO, it is skipped by serde. 
addr: "example.com:443".to_owned(), @@ -103,9 +104,6 @@ fn test_serde_custom_tikv_config() { grpc_stream_initial_window_size: ReadableSize(12_345), grpc_keepalive_time: ReadableDuration::secs(3), grpc_keepalive_timeout: ReadableDuration::secs(60), - end_point_concurrency: None, - end_point_max_tasks: None, - end_point_stack_size: None, end_point_recursion_limit: 100, end_point_stream_channel_size: 16, end_point_batch_row_limit: 64, @@ -125,6 +123,7 @@ fn test_serde_custom_tikv_config() { forward_max_connections_per_address: 5, reject_messages_on_memory_ratio: 0.8, simplify_metrics: false, + ..Default::default() }; value.readpool = ReadPoolConfig { unified: UnifiedReadPoolConfig { @@ -191,11 +190,9 @@ fn test_serde_custom_tikv_config() { raft_engine_purge_interval: ReadableDuration::minutes(20), max_manual_flush_rate: 5.0, raft_entry_cache_life_time: ReadableDuration::secs(12), - raft_reject_transfer_leader_duration: ReadableDuration::secs(3), split_region_check_tick_interval: ReadableDuration::secs(12), region_split_check_diff: Some(ReadableSize::mb(20)), region_compact_check_interval: ReadableDuration::secs(12), - clean_stale_peer_delay: ReadableDuration::secs(0), region_compact_check_step: Some(1_234), region_compact_min_tombstones: 999, region_compact_tombstones_percent: 33, @@ -231,8 +228,6 @@ fn test_serde_custom_tikv_config() { use_delete_range: true, snap_generator_pool_size: 2, cleanup_import_sst_interval: ReadableDuration::minutes(12), - region_max_size: ReadableSize(0), - region_split_size: ReadableSize(0), local_read_batch_size: 33, apply_batch_system, store_batch_system, @@ -253,7 +248,6 @@ fn test_serde_custom_tikv_config() { io_reschedule_hotpot_duration: ReadableDuration::secs(4321), inspect_interval: ReadableDuration::millis(444), report_min_resolved_ts_interval: ReadableDuration::millis(233), - raft_msg_flush_interval: ReadableDuration::micros(250), check_leader_lease_interval: ReadableDuration::millis(123), renew_leader_lease_advance_duration: 
ReadableDuration::millis(456), reactive_memory_lock_tick_interval: ReadableDuration::millis(566), @@ -270,6 +264,7 @@ fn test_serde_custom_tikv_config() { slow_trend_unsensitive_result: 0.5, enable_v2_compatible_learner: false, unsafe_disable_check_quorum: false, + ..Default::default() }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { @@ -316,7 +311,6 @@ fn test_serde_custom_tikv_config() { rate_bytes_per_sec: ReadableSize::kb(1), rate_limiter_refill_period: ReadableDuration::millis(10), rate_limiter_mode: DBRateLimiterMode::AllIo, - auto_tuned: None, rate_limiter_auto_tuned: false, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(32), @@ -614,6 +608,7 @@ fn test_serde_custom_tikv_config() { write_buffer_limit: None, }, titan: titan_db_config.clone(), + ..Default::default() }; value.raftdb = RaftDbConfig { info_log_level: LogLevel::Info, @@ -844,7 +839,6 @@ fn test_serde_custom_tikv_config() { }; value.cdc = CdcConfig { min_ts_interval: ReadableDuration::secs(4), - old_value_cache_size: 0, hibernate_regions_compatible: false, incremental_scan_threads: 3, incremental_scan_concurrency: 4, @@ -854,6 +848,7 @@ fn test_serde_custom_tikv_config() { tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), + ..Default::default() }; value.resolved_ts = ResolvedTsConfig { enable: true, diff --git a/tests/integrations/config/test-cache-compatible.toml b/tests/integrations/config/test-cache-compatible.toml index 9fce88833ed..f91b5cdafc3 100644 --- a/tests/integrations/config/test-cache-compatible.toml +++ b/tests/integrations/config/test-cache-compatible.toml @@ -2,6 +2,8 @@ [log.file] +[memory] + [readpool.coprocessor] [readpool.storage] diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index ef7a4809168..a9772e285af 100644 --- a/tests/integrations/config/test-custom.toml +++ 
b/tests/integrations/config/test-custom.toml @@ -1,9 +1,5 @@ -log-level = "info" -log-file = "" -log-format = "text" slow-log-file = "slow_foo" slow-log-threshold = "1s" -log-rotation-timespan = "1d" panic-when-unexpected-key-or-data = true abort-on-panic = true memory-usage-limit = "10GB" @@ -19,6 +15,10 @@ max-size = 1 max-backups = 2 max-days = 3 +[memory] +enable-heap-profiling = false +profiling-sample-per-bytes = "1MB" + [readpool.unified] min-thread-count = 5 max-thread-count = 10 diff --git a/tests/integrations/config/test-default.toml b/tests/integrations/config/test-default.toml index 23e53b9daf3..ca1abc0081b 100644 --- a/tests/integrations/config/test-default.toml +++ b/tests/integrations/config/test-default.toml @@ -2,6 +2,8 @@ [log.file] +[memory] + [readpool.unified] [readpool.storage] From 9bece34a3b3e3eb2fb5d8296cdbd8a459eeddbd6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 10 Nov 2023 17:59:14 +0800 Subject: [PATCH 114/220] cdc: notify pending tasks if associated regions change (#15947) (#15964) close tikv/tikv#15910 Signed-off-by: qupeng Signed-off-by: qupeng Co-authored-by: qupeng Co-authored-by: qupeng Co-authored-by: Ping Yu --- components/cdc/src/endpoint.rs | 7 +++- components/cdc/src/initializer.rs | 41 +++++++++++++++---- components/cdc/src/observer.rs | 30 ++++++++------ .../cdc/tests/failpoints/test_endpoint.rs | 26 ++++++++++++ .../cdc/tests/failpoints/test_register.rs | 6 ++- 5 files changed, 86 insertions(+), 24 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 82233af8f14..e62650c77c6 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -809,7 +809,6 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { CDC_SCAN_TASKS.with_label_values(&["finish"]).inc(); } diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 2882d2e975e..ef39a693e3e 100644 --- 
a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -90,7 +90,6 @@ pub(crate) struct Initializer { pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, - pub(crate) scan_concurrency_semaphore: Arc, pub(crate) scan_speed_limiter: Limiter, pub(crate) fetch_speed_limiter: Limiter, @@ -110,9 +109,11 @@ impl Initializer { &mut self, change_observer: ChangeObserver, cdc_handle: T, + concurrency_semaphore: Arc, memory_quota: Arc, ) -> Result<()> { fail_point!("cdc_before_initialize"); + let _permit = concurrency_semaphore.acquire().await; // To avoid holding too many snapshots and holding them too long, // we need to acquire scan concurrency permit before taking snapshot. @@ -188,8 +189,6 @@ impl Initializer { region: Region, memory_quota: Arc, ) -> Result<()> { - let scan_concurrency_semaphore = self.scan_concurrency_semaphore.clone(); - let _permit = scan_concurrency_semaphore.acquire().await; CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); defer!(CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec()); @@ -655,7 +654,6 @@ mod tests { conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), - scan_concurrency_semaphore: Arc::new(Semaphore::new(1)), scan_speed_limiter: Limiter::new(scan_limit as _), fetch_speed_limiter: Limiter::new(fetch_limit as _), max_scan_batch_bytes: 1024 * 1024, @@ -1034,26 +1032,51 @@ mod tests { let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); + let concurrency_semaphore = Arc::new(Semaphore::new(1)); let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.initialize(change_cmd, raft_router.clone(), memory_quota.clone())) - .unwrap_err(); + block_on(initializer.initialize( + change_cmd, + raft_router.clone(), + concurrency_semaphore.clone(), + memory_quota.clone(), + )) + .unwrap_err(); + + let (tx, rx) = sync_channel(1); + let 
concurrency_semaphore_ = concurrency_semaphore.clone(); + pool.spawn(async move { + let _permit = concurrency_semaphore_.acquire().await; + tx.send(()).unwrap(); + tx.send(()).unwrap(); + tx.send(()).unwrap(); + }); + rx.recv_timeout(Duration::from_millis(200)).unwrap(); let (tx1, rx1) = sync_channel(1); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); pool.spawn(async move { // Migrated to 2021 migration. This let statement is probably not needed, see // https://doc.rust-lang.org/edition-guide/rust-2021/disjoint-capture-in-closures.html + let _ = ( + &initializer, + &change_cmd, + &raft_router, + &concurrency_semaphore, + ); let res = initializer - .initialize(change_cmd, raft_router, memory_quota) + .initialize(change_cmd, raft_router, concurrency_semaphore, memory_quota) .await; tx1.send(res).unwrap(); }); + // Must timeout because there is no enough permit. + rx1.recv_timeout(Duration::from_millis(200)).unwrap_err(); - // Shouldn't timeout, gets an error instead. + // Release the permit + rx.recv_timeout(Duration::from_millis(200)).unwrap(); let res = rx1.recv_timeout(Duration::from_millis(200)).unwrap(); - assert!(res.is_err()); + res.unwrap_err(); worker.stop(); } diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index aac2842e404..cfcedfeb59d 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -177,20 +177,26 @@ impl RegionChangeObserver for CdcObserver { event: RegionChangeEvent, _: StateRole, ) { - if let RegionChangeEvent::Destroy = event { - let region_id = ctx.region().get_id(); - if let Some(observe_id) = self.is_subscribed(region_id) { - // Unregister all downstreams. 
- let store_err = RaftStoreError::RegionNotFound(region_id); - let deregister = Deregister::Delegate { - region_id, - observe_id, - err: CdcError::request(store_err.into()), - }; - if let Err(e) = self.sched.schedule(Task::Deregister(deregister)) { - error!("cdc schedule cdc task failed"; "error" => ?e); + match event { + RegionChangeEvent::Destroy + | RegionChangeEvent::Update( + RegionChangeReason::Split | RegionChangeReason::CommitMerge, + ) => { + let region_id = ctx.region().get_id(); + if let Some(observe_id) = self.is_subscribed(region_id) { + // Unregister all downstreams. + let store_err = RaftStoreError::RegionNotFound(region_id); + let deregister = Deregister::Delegate { + region_id, + observe_id, + err: CdcError::request(store_err.into()), + }; + if let Err(e) = self.sched.schedule(Task::Deregister(deregister)) { + error!("cdc schedule cdc task failed"; "error" => ?e); + } } } + _ => {} } } } diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index f7cc387625d..42977cc3856 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -569,3 +569,29 @@ fn test_cdc_stream_multiplexing() { } assert!(request_2_ready); } + +// This case tests pending regions can still get region split/merge +// notifications. 
+#[test] +fn test_cdc_notify_pending_regions() { + let cluster = new_server_cluster(0, 1); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + + fail::cfg("cdc_before_initialize", "pause").unwrap(); + let mut req = suite.new_changedata_request(rid); + req.request_id = 1; + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + thread::sleep(Duration::from_millis(100)); + suite.cluster.must_split(®ion, b"x"); + let event = receive_event(false); + matches!( + event.get_events()[0].event, + Some(Event_oneof_event::Error(ref e)) if e.has_region_not_found(), + ); + fail::remove("cdc_before_initialize"); +} diff --git a/components/cdc/tests/failpoints/test_register.rs b/components/cdc/tests/failpoints/test_register.rs index 4558397f8a9..2b6be3744af 100644 --- a/components/cdc/tests/failpoints/test_register.rs +++ b/components/cdc/tests/failpoints/test_register.rs @@ -165,7 +165,11 @@ fn test_connections_register_impl() { let mut events = receive_event(false).events.to_vec(); match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(err) => { - assert!(err.has_epoch_not_match(), "{:?}", err); + assert!( + err.has_epoch_not_match() || err.has_region_not_found(), + "{:?}", + err + ); } other => panic!("unknown event {:?}", other), } From 076b79c54b9e7656f9119865d4ed75bec3bd58d6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 10 Nov 2023 21:42:42 +0800 Subject: [PATCH 115/220] resource_control: support automatically tuning priority resource limiters (#15929) (#15966) close tikv/tikv#15917 Signed-off-by: glorv Co-authored-by: glorv --- components/resource_control/src/future.rs | 8 +- components/resource_control/src/lib.rs | 14 +- components/resource_control/src/metrics.rs | 6 + .../resource_control/src/resource_group.rs | 
29 +- .../resource_control/src/resource_limiter.rs | 28 +- components/resource_control/src/worker.rs | 356 +++++++++++++++++- components/server/src/server.rs | 3 +- components/server/src/server2.rs | 3 +- components/tikv_util/src/yatp_pool/metrics.rs | 4 +- components/tikv_util/src/yatp_pool/mod.rs | 60 ++- src/read_pool.rs | 42 ++- src/server/service/kv.rs | 62 +-- src/storage/mod.rs | 23 +- src/storage/txn/sched_pool.rs | 30 +- src/storage/txn/scheduler.rs | 29 +- tests/failpoints/cases/test_storage.rs | 4 +- 16 files changed, 603 insertions(+), 98 deletions(-) diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs index a935c3b41fa..53bca48b301 100644 --- a/components/resource_control/src/future.rs +++ b/components/resource_control/src/future.rs @@ -274,7 +274,13 @@ mod tests { .name_prefix("test") .build_future_pool(); - let resource_limiter = Arc::new(ResourceLimiter::new("".into(), f64::INFINITY, 1000.0, 0)); + let resource_limiter = Arc::new(ResourceLimiter::new( + "".into(), + f64::INFINITY, + 1000.0, + 0, + true, + )); fn spawn_and_wait(pool: &FuturePool, f: F, limiter: Arc) where diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 6cfd24914a1..a7b4cf03192 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(test)] #![feature(local_key_cell_methods)] +#![feature(array_zip)] use std::sync::Arc; @@ -10,8 +11,8 @@ use serde::{Deserialize, Serialize}; mod resource_group; pub use resource_group::{ - ResourceConsumeType, ResourceController, ResourceGroupManager, TaskMetadata, - MIN_PRIORITY_UPDATE_INTERVAL, + priority_from_task_meta, ResourceConsumeType, ResourceController, ResourceGroupManager, + TaskMetadata, MIN_PRIORITY_UPDATE_INTERVAL, }; mod future; @@ -29,7 +30,9 @@ pub use channel::ResourceMetered; mod resource_limiter; pub use resource_limiter::ResourceLimiter; use tikv_util::worker::Worker; -use worker::{GroupQuotaAdjustWorker, BACKGROUND_LIMIT_ADJUST_DURATION}; +use worker::{ + GroupQuotaAdjustWorker, PriorityLimiterAdjustWorker, BACKGROUND_LIMIT_ADJUST_DURATION, +}; mod metrics; pub mod worker; @@ -66,10 +69,13 @@ pub fn start_periodic_tasks( bg_worker.spawn_async_task(async move { resource_mgr_service_clone.watch_resource_groups().await; }); - // spawn a task to auto adjust background quota limiter. + // spawn a task to auto adjust background quota limiter and priority quota + // limiter. let mut worker = GroupQuotaAdjustWorker::new(mgr.clone(), io_bandwidth); + let mut priority_worker = PriorityLimiterAdjustWorker::new(mgr.clone()); bg_worker.spawn_interval_task(BACKGROUND_LIMIT_ADJUST_DURATION, move || { worker.adjust_quota(); + priority_worker.adjust(); }); // spawn a task to periodically upload resource usage statistics to PD. bg_worker.spawn_async_task(async move { diff --git a/components/resource_control/src/metrics.rs b/components/resource_control/src/metrics.rs index 16338f41c6c..c9404092501 100644 --- a/components/resource_control/src/metrics.rs +++ b/components/resource_control/src/metrics.rs @@ -22,6 +22,12 @@ lazy_static! 
{ &["name"] ) .unwrap(); + pub static ref PRIORITY_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( + "tikv_resource_control_priority_quota_limit", + "The quota limiter for each priority in resource control", + &["priority"] + ) + .unwrap(); } pub fn deregister_metrics(name: &str) { diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index b7e7ca28705..b45a9833bb8 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -41,7 +41,6 @@ const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; /// The maximum RU quota that can be configured. const MAX_RU_QUOTA: u64 = i32::MAX as u64; -#[cfg(test)] const LOW_PRIORITY: u32 = 1; const MEDIUM_PRIORITY: u32 = 8; #[cfg(test)] @@ -57,7 +56,7 @@ pub enum ResourceConsumeType { IoBytes(u64), } -#[derive(Copy, Clone, Eq, PartialEq, EnumCount, EnumIter)] +#[derive(Copy, Clone, Eq, PartialEq, EnumCount, EnumIter, Debug)] #[repr(usize)] pub enum TaskPriority { High = 0, @@ -110,6 +109,7 @@ impl Default for ResourceGroupManager { f64::INFINITY, f64::INFINITY, 0, + false, )) }) .collect::>() @@ -206,6 +206,7 @@ impl ResourceGroupManager { f64::INFINITY, f64::INFINITY, version, + true, ))) }) } else { @@ -304,6 +305,14 @@ impl ResourceGroupManager { self.get_group_count() > 1 } + /// return the priority of target resource group. + #[inline] + pub fn get_resource_group_priority(&self, group: &str) -> u32 { + self.resource_groups + .get(group) + .map_or(LOW_PRIORITY, |g| g.group.priority) + } + // Always return the background resource limiter if any; // Only return the foregroup limiter when priority is enabled. 
pub fn get_resource_limiter( @@ -371,6 +380,11 @@ impl ResourceGroupManager { group_priority.unwrap_or(default_group.group.priority), ) } + + #[inline] + pub fn get_priority_resource_limiters(&self) -> [Arc; 3] { + self.priority_limiters.clone() + } } pub(crate) struct ResourceGroup { @@ -708,7 +722,7 @@ impl<'a> TaskMetadata<'a> { self.metadata.into_owned() } - fn override_priority(&self) -> u32 { + pub fn override_priority(&self) -> u32 { if self.metadata.is_empty() { return 0; } @@ -734,6 +748,15 @@ impl<'a> TaskMetadata<'a> { } } +// return the TaskPriority value from task metadata. +// This function is used for handling thread pool task waiting metrics. +pub fn priority_from_task_meta(meta: &[u8]) -> usize { + let priority = TaskMetadata::from_bytes(meta).override_priority(); + // mapping (high(15), medium(8), low(1)) -> (0, 1, 2) + debug_assert!(priority <= 16); + TaskPriority::from(priority) as usize +} + impl TaskPriorityProvider for ResourceController { fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { let metadata = TaskMetadata::from_bytes(extras.metadata()); diff --git a/components/resource_control/src/resource_limiter.rs b/components/resource_control/src/resource_limiter.rs index 8898b4eba23..bce6867ac2e 100644 --- a/components/resource_control/src/resource_limiter.rs +++ b/components/resource_control/src/resource_limiter.rs @@ -39,6 +39,8 @@ pub struct ResourceLimiter { name: String, version: u64, limiters: [QuotaLimiter; ResourceType::COUNT], + // whether the resource limiter is a background limiter or priority limiter. 
+ is_background: bool, } impl std::fmt::Debug for ResourceLimiter { @@ -48,16 +50,27 @@ impl std::fmt::Debug for ResourceLimiter { } impl ResourceLimiter { - pub fn new(name: String, cpu_limit: f64, io_limit: f64, version: u64) -> Self { + pub fn new( + name: String, + cpu_limit: f64, + io_limit: f64, + version: u64, + is_background: bool, + ) -> Self { let cpu_limiter = QuotaLimiter::new(cpu_limit); let io_limiter = QuotaLimiter::new(io_limit); Self { name, version, limiters: [cpu_limiter, io_limiter], + is_background, } } + pub fn is_background(&self) -> bool { + self.is_background + } + pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration { let cpu_dur = self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64); @@ -86,7 +99,7 @@ impl ResourceLimiter { } pub(crate) fn get_limit_statistics(&self, ty: ResourceType) -> GroupStatistics { - let (total_consumed, total_wait_dur_us, read_consumed, write_consumed) = + let (total_consumed, total_wait_dur_us, read_consumed, write_consumed, request_count) = self.limiters[ty as usize].get_statistics(); GroupStatistics { version: self.version, @@ -94,6 +107,7 @@ impl ResourceLimiter { total_wait_dur_us, read_consumed, write_consumed, + request_count, } } } @@ -104,6 +118,7 @@ pub(crate) struct QuotaLimiter { total_wait_dur_us: AtomicU64, read_bytes: AtomicU64, write_bytes: AtomicU64, + req_count: AtomicU64, } impl QuotaLimiter { @@ -113,6 +128,7 @@ impl QuotaLimiter { total_wait_dur_us: AtomicU64::new(0), read_bytes: AtomicU64::new(0), write_bytes: AtomicU64::new(0), + req_count: AtomicU64::new(0), } } @@ -128,12 +144,13 @@ impl QuotaLimiter { self.limiter.set_speed_limit(limit); } - fn get_statistics(&self) -> (u64, u64, u64, u64) { + fn get_statistics(&self) -> (u64, u64, u64, u64, u64) { ( self.limiter.total_bytes_consumed() as u64, self.total_wait_dur_us.load(Ordering::Relaxed), self.read_bytes.load(Ordering::Relaxed), self.write_bytes.load(Ordering::Relaxed), + 
self.req_count.load(Ordering::Relaxed), ) } @@ -146,6 +163,7 @@ impl QuotaLimiter { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } + self.req_count.fetch_add(1, Ordering::Relaxed); dur } @@ -162,6 +180,7 @@ impl QuotaLimiter { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } + self.req_count.fetch_add(1, Ordering::Relaxed); dur } } @@ -173,6 +192,7 @@ pub struct GroupStatistics { pub total_wait_dur_us: u64, pub read_consumed: u64, pub write_consumed: u64, + pub request_count: u64, } impl std::ops::Sub for GroupStatistics { @@ -184,6 +204,7 @@ impl std::ops::Sub for GroupStatistics { total_wait_dur_us: self.total_wait_dur_us.saturating_sub(rhs.total_wait_dur_us), read_consumed: self.read_consumed.saturating_sub(rhs.read_consumed), write_consumed: self.write_consumed.saturating_sub(rhs.write_consumed), + request_count: self.request_count.saturating_sub(rhs.request_count), } } } @@ -198,6 +219,7 @@ impl std::ops::Div for GroupStatistics { total_wait_dur_us: (self.total_wait_dur_us as f64 / rhs) as u64, read_consumed: (self.read_consumed as f64 / rhs) as u64, write_consumed: (self.write_consumed as f64 / rhs) as u64, + request_count: (self.request_count as f64 / rhs) as u64, } } } diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs index 7bc76691e1f..79dea73d0ae 100644 --- a/components/resource_control/src/worker.rs +++ b/components/resource_control/src/worker.rs @@ -9,16 +9,19 @@ use std::{ }; use file_system::{fetch_io_bytes, IoBytes, IoType}; -use strum::EnumCount; +use prometheus::Histogram; +use strum::{EnumCount, IntoEnumIterator}; use tikv_util::{ + debug, sys::{cpu_time::ProcessStat, SysQuota}, time::Instant, warn, + yatp_pool::metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC, }; use crate::{ metrics::*, - resource_group::ResourceGroupManager, + resource_group::{ResourceGroupManager, TaskPriority}, resource_limiter::{GroupStatistics, ResourceLimiter, 
ResourceType}, }; @@ -39,7 +42,7 @@ pub struct SysQuotaGetter { process_stat: ProcessStat, prev_io_stats: [IoBytes; IoType::COUNT], prev_io_ts: Instant, - io_bandwidth: u64, + io_bandwidth: f64, } impl ResourceStatsProvider for SysQuotaGetter { @@ -55,7 +58,7 @@ impl ResourceStatsProvider for SysQuotaGetter { } ResourceType::Io => { let mut stats = ResourceUsageStats { - total_quota: self.io_bandwidth as f64, + total_quota: self.io_bandwidth, current_used: 0.0, }; let now = Instant::now_coarse(); @@ -97,7 +100,7 @@ impl GroupQuotaAdjustWorker { process_stat: ProcessStat::cur_proc_stat().unwrap(), prev_io_stats: [IoBytes::default(); IoType::COUNT], prev_io_ts: Instant::now_coarse(), - io_bandwidth, + io_bandwidth: io_bandwidth as f64, }; Self::with_quota_getter(resource_ctl, resource_quota_getter) } @@ -295,6 +298,235 @@ struct GroupStats { expect_cost_rate: f64, } +/// PriorityLimiterAdjustWorker automically adjust the quota of each priority +/// limiter based on the statistics data during a certain period of time. +/// In general, caller should call this function in a fixed interval. 
+pub struct PriorityLimiterAdjustWorker { + resource_ctl: Arc, + trackers: [PriorityLimiterStatsTracker; 3], + resource_quota_getter: R, + last_adjust_time: Instant, + is_last_low_cpu: bool, + is_last_single_group: bool, +} + +impl PriorityLimiterAdjustWorker { + pub fn new(resource_ctl: Arc) -> Self { + let resource_quota_getter = SysQuotaGetter { + process_stat: ProcessStat::cur_proc_stat().unwrap(), + prev_io_stats: [IoBytes::default(); IoType::COUNT], + prev_io_ts: Instant::now_coarse(), + io_bandwidth: f64::INFINITY, + }; + Self::with_quota_getter(resource_ctl, resource_quota_getter) + } +} + +impl PriorityLimiterAdjustWorker { + fn with_quota_getter( + resource_ctl: Arc, + resource_quota_getter: R, + ) -> Self { + let priorities: [_; 3] = TaskPriority::iter().collect::>().try_into().unwrap(); + let trackers = resource_ctl + .get_priority_resource_limiters() + .zip(priorities) + .map(|(l, p)| PriorityLimiterStatsTracker::new(l, p.as_str())); + Self { + resource_ctl, + trackers, + resource_quota_getter, + last_adjust_time: Instant::now_coarse(), + is_last_low_cpu: true, + is_last_single_group: true, + } + } + pub fn adjust(&mut self) { + let now = Instant::now_coarse(); + let dur = now.saturating_duration_since(self.last_adjust_time); + if dur < Duration::from_secs(1) { + warn!("adjust duration too small, skip adjustment."; "dur" => ?dur); + return; + } + self.last_adjust_time = now; + + // fast path for only the default resource group which means resource + // control is not used at all. 
+ let group_count = self.resource_ctl.get_group_count(); + if group_count == 1 { + if self.is_last_single_group { + return; + } + self.is_last_single_group = true; + self.trackers.iter().skip(1).for_each(|t| { + t.limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY) + }); + return; + } + self.is_last_single_group = false; + + let stats: [_; 3] = + std::array::from_fn(|i| self.trackers[i].get_and_update_last_stats(dur.as_secs_f64())); + + let process_cpu_stats = match self + .resource_quota_getter + .get_current_stats(ResourceType::Cpu) + { + Ok(s) => s, + Err(e) => { + warn!("get process total cpu failed; skip adjusment."; "err" => ?e); + return; + } + }; + + if process_cpu_stats.current_used < process_cpu_stats.total_quota * 0.5 { + if self.is_last_low_cpu { + return; + } + self.is_last_low_cpu = true; + self.trackers.iter().skip(1).for_each(|t| { + t.limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY); + // 0 represent infinity + PRIORITY_QUOTA_LIMIT_VEC + .get_metric_with_label_values(&[t.priority]) + .unwrap() + .set(0); + }); + return; + } + self.is_last_low_cpu = false; + + let total_reqs: u64 = stats.iter().map(|s| s.req_count).sum(); + let max_reqs = stats.iter().map(|s| s.req_count).max().unwrap(); + // there is only 1 active priority, do not restrict. + if total_reqs * 99 / 100 <= max_reqs { + self.trackers + .iter() + .skip(1) + .for_each(|t: &PriorityLimiterStatsTracker| { + t.limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY) + }); + return; + } + + let real_cpu_total: f64 = stats.iter().map(|s| s.cpu_secs).sum(); + let expect_pool_cpu_total = real_cpu_total * (process_cpu_stats.total_quota * 0.95) + / process_cpu_stats.current_used; + let mut limits = [0.0; 2]; + let level_expected: [_; 3] = + std::array::from_fn(|i| stats[i].cpu_secs + stats[i].wait_secs); + // substract the cpu time usage for priority high. 
+ let mut expect_cpu_time_total = expect_pool_cpu_total - level_expected[0]; + + // still reserve a minimal cpu quota + let minimal_quota = process_cpu_stats.total_quota / MICROS_PER_SEC * 0.05; + for i in 1..self.trackers.len() { + if expect_cpu_time_total < minimal_quota { + expect_cpu_time_total = minimal_quota; + } + let limit = expect_cpu_time_total * MICROS_PER_SEC; + self.trackers[i] + .limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(limit); + PRIORITY_QUOTA_LIMIT_VEC + .get_metric_with_label_values(&[self.trackers[i].priority]) + .unwrap() + .set(limit as i64); + limits[i - 1] = limit; + expect_cpu_time_total -= level_expected[i]; + } + debug!("adjsut cpu limiter by priority"; "cpu_quota" => process_cpu_stats.total_quota, "process_cpu" => process_cpu_stats.current_used, "expected_cpu" => ?level_expected, + "limits" => ?limits, "limit_cpu_total" => expect_pool_cpu_total, "pool_cpu_cost" => real_cpu_total); + } +} + +#[derive(Debug)] +struct LimiterStats { + // QuotaLimiter consumed cpu secs in total + cpu_secs: f64, + // QuotaLimiter waited secs in total. + wait_secs: f64, + // the total number of tasks that are scheduled. + req_count: u64, +} + +struct HistogramTracker { + metrics: Histogram, + last_sum: f64, + last_count: u64, +} + +impl HistogramTracker { + fn new(metrics: Histogram) -> Self { + let last_sum = metrics.get_sample_sum(); + let last_count = metrics.get_sample_count(); + Self { + metrics, + last_sum, + last_count, + } + } + + fn get_and_upate_statistics(&mut self) -> (f64, u64) { + let cur_sum = self.metrics.get_sample_sum(); + let cur_count = self.metrics.get_sample_count(); + let res = (cur_sum - self.last_sum, cur_count - self.last_count); + self.last_sum = cur_sum; + self.last_count = cur_count; + res + } +} + +struct PriorityLimiterStatsTracker { + priority: &'static str, + limiter: Arc, + last_stats: GroupStatistics, + // unified-read-pool and schedule-worker-pool wait duration metrics. 
+ task_wait_dur_trakcers: [HistogramTracker; 2], +} + +impl PriorityLimiterStatsTracker { + fn new(limiter: Arc, priority: &'static str) -> Self { + let task_wait_dur_trakcers = + ["unified-read-pool", "sched-worker-priority"].map(|pool_name| { + HistogramTracker::new( + YATP_POOL_SCHEDULE_WAIT_DURATION_VEC + .get_metric_with_label_values(&[pool_name, priority]) + .unwrap(), + ) + }); + let last_stats = limiter.get_limit_statistics(ResourceType::Cpu); + Self { + priority, + limiter, + last_stats, + task_wait_dur_trakcers, + } + } + + fn get_and_update_last_stats(&mut self, dur_secs: f64) -> LimiterStats { + let cur_stats = self.limiter.get_limit_statistics(ResourceType::Cpu); + let stats_delta = (cur_stats - self.last_stats) / dur_secs; + self.last_stats = cur_stats; + let wait_stats: [_; 2] = + std::array::from_fn(|i| self.task_wait_dur_trakcers[i].get_and_upate_statistics()); + let schedule_wait_dur_secs = wait_stats.iter().map(|s| s.0).sum::() / dur_secs; + LimiterStats { + cpu_secs: stats_delta.total_consumed as f64 / MICROS_PER_SEC, + wait_secs: stats_delta.total_wait_dur_us as f64 / MICROS_PER_SEC + + schedule_wait_dur_secs, + req_count: stats_delta.request_count, + } + } +} + #[cfg(test)] mod tests { use std::time::Duration; @@ -658,4 +890,118 @@ mod tests { }, ); } + + #[test] + fn test_adjust_priority_resource_limiter() { + let resource_ctl = Arc::new(ResourceGroupManager::default()); + let priority_limiters = resource_ctl.get_priority_resource_limiters(); + let test_provider = TestResourceStatsProvider::new(8.0, f64::INFINITY); + let mut worker = + PriorityLimiterAdjustWorker::with_quota_getter(resource_ctl.clone(), test_provider); + + let reset_quota = |worker: &mut PriorityLimiterAdjustWorker, + cpu: f64| { + worker.resource_quota_getter.cpu_used = cpu; + worker.last_adjust_time = Instant::now_coarse() - Duration::from_secs(10); + priority_limiters[1] + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY); + priority_limiters[2] + 
.get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY); + }; + + fn check(val: f64, expected: f64) { + assert!( + (val.is_infinite() && expected.is_infinite()) + || (expected * 0.99 < val && val < expected * 1.01), + "actual: {}, expected: {}", + val, + expected + ); + } + + let check_limiter = |high: f64, medium: f64, low: f64| { + check( + priority_limiters[0] + .get_limiter(ResourceType::Cpu) + .get_rate_limit(), + high * MICROS_PER_SEC, + ); + check( + priority_limiters[1] + .get_limiter(ResourceType::Cpu) + .get_rate_limit(), + medium * MICROS_PER_SEC, + ); + check( + priority_limiters[2] + .get_limiter(ResourceType::Cpu) + .get_rate_limit(), + low * MICROS_PER_SEC, + ); + }; + + // only default group, always return infinity. + reset_quota(&mut worker, 6.4); + priority_limiters[1].consume(Duration::from_secs(50), IoBytes::default()); + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + let rg1 = new_resource_group_ru("test_high".into(), 1000, 16); + resource_ctl.add_resource_group(rg1); + let rg2 = new_resource_group_ru("test_low".into(), 2000, 1); + resource_ctl.add_resource_group(rg2); + + reset_quota(&mut worker, 6.4); + priority_limiters[1].consume(Duration::from_secs(64), IoBytes::default()); + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default()); + priority_limiters[1].consume(Duration::from_millis(400), IoBytes::default()); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 1.2); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(120), IoBytes::default()); + priority_limiters[1].consume(Duration::from_millis(200), IoBytes::default()); + } + worker.adjust(); + check_limiter(f64::INFINITY, 2.6, 0.6); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + 
priority_limiters[2].consume(Duration::from_millis(200), IoBytes::default()); + } + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + reset_quota(&mut worker, 8.0); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default()); + priority_limiters[1].consume(Duration::from_millis(240), IoBytes::default()); + priority_limiters[2].consume(Duration::from_millis(320), IoBytes::default()); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 2.8); + + reset_quota(&mut worker, 6.0); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default()); + priority_limiters[2].consume(Duration::from_millis(360), IoBytes::default()); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 5.2); + + // duration too small, unchanged. + worker.resource_quota_getter.cpu_used = 6.0; + worker.last_adjust_time = Instant::now_coarse() - Duration::from_millis(500); + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 5.2); + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 006750fd518..72e09a9f8d8 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -70,7 +70,7 @@ use raftstore::{ RaftRouterCompactedEventSender, }; use resolved_ts::{LeadershipResolver, Task}; -use resource_control::ResourceGroupManager; +use resource_control::{priority_from_task_meta, ResourceGroupManager}; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use snap_recovery::RecoveryService; @@ -558,6 +558,7 @@ where engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + Some(Arc::new(priority_from_task_meta)), )) } else { None diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index fdbb18b6205..eab384871e6 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ 
-67,7 +67,7 @@ use raftstore_v2::{ StateStorage, }; use resolved_ts::Task; -use resource_control::ResourceGroupManager; +use resource_control::{priority_from_task_meta, ResourceGroupManager}; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use tikv::{ @@ -469,6 +469,7 @@ where engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + Some(Arc::new(priority_from_task_meta)), )) } else { None diff --git a/components/tikv_util/src/yatp_pool/metrics.rs b/components/tikv_util/src/yatp_pool/metrics.rs index 8ae1aa8910e..efb1379dcc7 100644 --- a/components/tikv_util/src/yatp_pool/metrics.rs +++ b/components/tikv_util/src/yatp_pool/metrics.rs @@ -19,8 +19,8 @@ lazy_static! { pub static ref YATP_POOL_SCHEDULE_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!( "tikv_yatp_pool_schedule_wait_duration", "Histogram of yatp pool schedule wait duration.", - &["name"], - exponential_buckets(1e-5, 4.0, 12).unwrap() // 10us ~ 41s + &["name", "priority"], + exponential_buckets(1e-5, 2.0, 18).unwrap() // 10us ~ 2.5s ) .unwrap(); } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index fc80e69cd84..b4a3d3c0825 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -1,14 +1,14 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
mod future_pool; -mod metrics; +pub mod metrics; use std::sync::Arc; use fail::fail_point; pub use future_pool::{Full, FuturePool}; use futures::{compat::Stream01CompatExt, StreamExt}; -use prometheus::{local::LocalHistogram, Histogram}; +use prometheus::{local::LocalHistogram, Histogram, HistogramOpts}; use yatp::{ pool::{CloneRunnerBuilder, Local, Remote, Runner}, queue::{multilevel, priority, Extras, QueueType, TaskCell as _}, @@ -165,7 +165,10 @@ pub struct YatpPoolRunner { before_pause: Option>, // Statistics about the schedule wait duration. - schedule_wait_duration: LocalHistogram, + // local histogram for high,medium,low priority tasks. + schedule_wait_durations: [LocalHistogram; 3], + // return the index of `schedule_wait_durations` from task metadata. + metric_idx_from_task_meta: Arc usize + Send + Sync>, } impl Runner for YatpPoolRunner { @@ -190,12 +193,12 @@ impl Runner for YatpPoolRunner { fn handle(&mut self, local: &mut Local, mut task_cell: Self::TaskCell) -> bool { let extras = task_cell.mut_extras(); if let Some(schedule_time) = extras.schedule_time() { - self.schedule_wait_duration - .observe(schedule_time.elapsed().as_secs_f64()); + let idx = (*self.metric_idx_from_task_meta)(extras.metadata()); + self.schedule_wait_durations[idx].observe(schedule_time.elapsed().as_secs_f64()); } let finished = self.inner.handle(local, task_cell); if self.ticker.try_tick() { - self.schedule_wait_duration.flush(); + self.schedule_wait_durations.iter().for_each(|m| m.flush()); } finished } @@ -229,7 +232,8 @@ impl YatpPoolRunner { after_start: Option>, before_stop: Option>, before_pause: Option>, - schedule_wait_duration: Histogram, + schedule_wait_durations: [Histogram; 3], + metric_idx_from_task_meta: Arc usize + Send + Sync>, ) -> Self { YatpPoolRunner { inner, @@ -238,7 +242,8 @@ impl YatpPoolRunner { after_start, before_stop, before_pause, - schedule_wait_duration: schedule_wait_duration.local(), + schedule_wait_durations: schedule_wait_durations.map(|m| 
m.local()), + metric_idx_from_task_meta, } } } @@ -256,6 +261,10 @@ pub struct YatpPoolBuilder { max_tasks: usize, cleanup_method: CleanupMethod, + // whether to tracker task scheduling wait duration + enable_task_wait_metrics: bool, + metric_idx_from_task_meta: Option usize + Send + Sync>>, + #[cfg(test)] background_cleanup_hook: Option>, } @@ -275,6 +284,9 @@ impl YatpPoolBuilder { max_tasks: std::usize::MAX, cleanup_method: CleanupMethod::InPlace, + enable_task_wait_metrics: false, + metric_idx_from_task_meta: None, + #[cfg(test)] background_cleanup_hook: None, } @@ -344,6 +356,19 @@ impl YatpPoolBuilder { self } + pub fn enable_task_wait_metrics(mut self) -> Self { + self.enable_task_wait_metrics = true; + self + } + + pub fn metric_idx_from_task_meta( + mut self, + f: Arc usize + Send + Sync>, + ) -> Self { + self.metric_idx_from_task_meta = Some(f); + self + } + pub fn build_future_pool(self) -> FuturePool { let name = self .name_prefix @@ -469,15 +494,24 @@ impl YatpPoolBuilder { let after_start = self.after_start.take(); let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); - let schedule_wait_duration = - metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name]); + let schedule_wait_durations = if self.enable_task_wait_metrics { + ["high", "medium", "low"].map(|p| { + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name, p]) + }) + } else { + std::array::from_fn(|_| Histogram::with_opts(HistogramOpts::new("_", "_")).unwrap()) + }; + let metric_idx_from_task_meta = self + .metric_idx_from_task_meta + .unwrap_or_else(|| Arc::new(|_| 0)); let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), after_start, before_stop, before_pause, - schedule_wait_duration, + schedule_wait_durations, + metric_idx_from_task_meta, ); (builder, read_pool_runner) } @@ -500,6 +534,7 @@ mod tests { let name = "test_record_schedule_wait_duration"; let pool = 
YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix(name) + .enable_task_wait_metrics() .build_single_level_pool(); let (tx, rx) = mpsc::channel(); for _ in 0..3 { @@ -518,7 +553,8 @@ mod tests { } // Drop the pool so the local metrics are flushed. drop(pool); - let histogram = metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + let histogram = + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name, "high"]); assert_eq!(histogram.get_sample_count() as u32, 6, "{:?}", histogram); } diff --git a/src/read_pool.rs b/src/read_pool.rs index a5898ea4f63..fb44bcb4cc9 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -429,6 +429,7 @@ pub fn build_yatp_read_pool( engine: E, resource_ctl: Option>, cleanup_method: CleanupMethod, + metric_idx_from_task_meta_fn: Option usize + Send + Sync + 'static>>, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); build_yatp_read_pool_with_name( @@ -438,6 +439,7 @@ pub fn build_yatp_read_pool( resource_ctl, cleanup_method, unified_read_pool_name, + metric_idx_from_task_meta_fn, ) } @@ -448,9 +450,10 @@ pub fn build_yatp_read_pool_with_name( resource_ctl: Option>, cleanup_method: CleanupMethod, unified_read_pool_name: String, + metric_idx_from_task_meta_fn: Option usize + Send + Sync + 'static>>, ) -> ReadPool { let raftkv = Arc::new(Mutex::new(engine)); - let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) + let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) .cleanup_method(cleanup_method) .stack_size(config.stack_size.0 as usize) @@ -473,6 +476,12 @@ pub fn build_yatp_read_pool_with_name( .before_stop(|| unsafe { destroy_tls_engine::(); }); + if let Some(metric_idx_from_task_meta_fn) = metric_idx_from_task_meta_fn { + builder = builder + .enable_task_wait_metrics() + .metric_idx_from_task_meta(metric_idx_from_task_meta_fn); + } + let pool = if let Some(ref r) = resource_ctl { 
builder.build_priority_pool(r.clone()) } else { @@ -796,8 +805,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + None, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -844,8 +859,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + None, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -900,8 +921,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + None, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -1027,6 +1054,7 @@ mod tests { resource_manager, CleanupMethod::InPlace, name.clone(), + None, ); let gen_task = || { diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 77f92d33d95..3be66a47f79 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -192,14 +192,14 @@ macro_rules! 
handle_request { handle_request!($fn_name, $future_name, $req_ty, $resp_ty, no_time_detail); }; ($fn_name: ident, $future_name: ident, $req_ty: ident, $resp_ty: ident, $time_detail: tt) => { - fn $fn_name(&mut self, ctx: RpcContext<'_>, req: $req_ty, sink: UnarySink<$resp_ty>) { + fn $fn_name(&mut self, ctx: RpcContext<'_>, mut req: $req_ty, sink: UnarySink<$resp_ty>) { forward_unary!(self.proxy, $fn_name, ctx, req, sink); let begin_instant = Instant::now(); let source = req.get_context().get_request_source().to_owned(); - let resource_control_ctx = req.get_context().get_resource_control_context(); + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -229,6 +229,20 @@ macro_rules! handle_request { } } +// consume resource group penalty and set explicit group priority +// We override the override_priority here to make handling tasks easier. +fn consume_penalty_and_set_priority( + resource_manager: &Arc, + resource_control_ctx: &mut ResourceControlContext, +) { + resource_manager.consume_penalty(resource_control_ctx); + if resource_control_ctx.get_override_priority() == 0 { + let prioirty = resource_manager + .get_resource_group_priority(resource_control_ctx.get_resource_group_name()); + resource_control_ctx.override_priority = prioirty as u64; + } +} + macro_rules! 
set_total_time { ($resp:ident, $duration:expr,no_time_detail) => {}; ($resp:ident, $duration:expr,has_time_detail) => { @@ -476,12 +490,12 @@ impl Tikv for Service { ctx.spawn(task); } - fn coprocessor(&mut self, ctx: RpcContext<'_>, req: Request, sink: UnarySink) { + fn coprocessor(&mut self, ctx: RpcContext<'_>, mut req: Request, sink: UnarySink) { forward_unary!(self.proxy, coprocessor, ctx, req, sink); let source = req.get_context().get_request_source().to_owned(); - let resource_control_ctx = req.get_context().get_resource_control_context(); + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -513,13 +527,13 @@ impl Tikv for Service { fn raw_coprocessor( &mut self, ctx: RpcContext<'_>, - req: RawCoprocessorRequest, + mut req: RawCoprocessorRequest, sink: UnarySink, ) { let source = req.get_context().get_request_source().to_owned(); - let resource_control_ctx = req.get_context().get_resource_control_context(); + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -602,13 +616,13 @@ impl Tikv for Service { fn coprocessor_stream( &mut self, ctx: RpcContext<'_>, - req: Request, + mut req: Request, mut sink: ServerStreamingSink, ) { let begin_instant = Instant::now(); - let resource_control_ctx = req.get_context().get_resource_control_context(); + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let 
Some(resource_manager) = &self.resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1149,10 +1163,10 @@ fn handle_batch_commands_request( let resp = future::ok(batch_commands_response::Response::default()); response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, - Some(batch_commands_request::request::Cmd::Get(req)) => { - let resource_control_ctx = req.get_context().get_resource_control_context(); + Some(batch_commands_request::request::Cmd::Get(mut req)) => { + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1170,10 +1184,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::kv_get, source); } }, - Some(batch_commands_request::request::Cmd::RawGet(req)) => { - let resource_control_ctx = req.get_context().get_resource_control_context(); + Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1191,10 +1205,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get, source); 
} }, - Some(batch_commands_request::request::Cmd::Coprocessor(req)) => { - let resource_control_ctx = req.get_context().get_resource_control_context(); + Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1225,10 +1239,10 @@ fn handle_batch_commands_request( String::default(), ); } - $(Some(batch_commands_request::request::Cmd::$cmd(req)) => { - let resource_control_ctx = req.get_context().get_resource_control_context(); + $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + let resource_control_ctx = req.mut_context().mut_resource_control_context(); if let Some(resource_manager) = resource_manager { - resource_manager.consume_penalty(resource_control_ctx); + consume_penalty_and_set_priority(resource_manager, resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index c0d6e6fc4a3..c89a767a80b 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3344,7 +3344,8 @@ impl TestStorageBuilder { } else { None }; - + let manager = Arc::new(ResourceGroupManager::default()); + let resource_ctl = manager.derive_controller("test".into(), false); Storage::from_engine( self.engine, &self.config, @@ -3362,11 +3363,8 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), ts_provider, - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - false, - ))), - None, + Some(resource_ctl), + Some(manager), ) } @@ -3379,7 +3377,8 @@ impl TestStorageBuilder { 
&crate::config::StorageReadPoolConfig::default_for_test(), engine.clone(), ); - + let manager = Arc::new(ResourceGroupManager::default()); + let resource_ctl = manager.derive_controller("test".into(), false); Storage::from_engine( engine, &self.config, @@ -3397,16 +3396,14 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), None, - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - false, - ))), - None, + Some(resource_ctl), + Some(manager), ) } pub fn build_for_resource_controller( self, + resource_manager: Arc, resource_controller: Arc, ) -> Result, L, F>> { let engine = TxnTestEngine { @@ -3436,7 +3433,7 @@ impl TestStorageBuilder { latest_feature_gate(), None, Some(resource_controller), - None, + Some(resource_manager), ) } } diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 19736304373..8674a581c72 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -12,7 +12,10 @@ use kvproto::{kvrpcpb::CommandPri, pdpb::QueryKind}; use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; -use resource_control::{ControlledFuture, ResourceController, TaskMetadata}; +use resource_control::{ + priority_from_task_meta, with_resource_limiter, ControlledFuture, ResourceController, + ResourceGroupManager, TaskMetadata, +}; use tikv_util::{ sys::SysQuota, yatp_pool::{Full, FuturePool, PoolTicker, YatpPoolBuilder}, @@ -101,6 +104,7 @@ impl VanillaQueue { struct PriorityQueue { worker_pool: FuturePool, resource_ctl: Arc, + resource_mgr: Arc, } impl PriorityQueue { @@ -118,15 +122,23 @@ impl PriorityQueue { // TODO: maybe use a better way to generate task_id let task_id = rand::random::(); let group_name = metadata.group_name().to_owned(); + let resource_limiter = self.resource_mgr.get_resource_limiter( + unsafe { std::str::from_utf8_unchecked(&group_name) }, + "", + metadata.override_priority() as u64, + ); let mut extras = 
Extras::new_multilevel(task_id, fixed_level); extras.set_metadata(metadata.to_vec()); self.worker_pool.spawn_with_extras( - ControlledFuture::new( - async move { - f.await; - }, - self.resource_ctl.clone(), - group_name, + with_resource_limiter( + ControlledFuture::new( + async move { + f.await; + }, + self.resource_ctl.clone(), + group_name, + ), + resource_limiter, ), extras, ) @@ -155,6 +167,7 @@ impl SchedPool { reporter: R, feature_gate: FeatureGate, resource_ctl: Option>, + resource_mgr: Option>, ) -> Self { let builder = |pool_size: usize, name_prefix: &str| { let engine = Arc::new(Mutex::new(engine.clone())); @@ -181,6 +194,8 @@ impl SchedPool { destroy_tls_engine::(); tls_flush(&reporter); }) + .enable_task_wait_metrics() + .metric_idx_from_task_meta(Arc::new(priority_from_task_meta)) }; let vanilla = VanillaQueue { worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), @@ -191,6 +206,7 @@ impl SchedPool { worker_pool: builder(pool_size, "sched-worker-priority") .build_priority_future_pool(r.clone()), resource_ctl: r.clone(), + resource_mgr: resource_mgr.unwrap(), }); let queue_type = if resource_ctl.is_some() { QueueType::Dynamic diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 995c361e163..6d087d894df 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -472,6 +472,7 @@ impl TxnScheduler { reporter, feature_gate.clone(), resource_ctl, + resource_manager.clone(), ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), lock_mgr, @@ -1300,10 +1301,14 @@ impl TxnScheduler { // TODO: write bytes can be a bit inaccurate due to error requests or in-memory // pessimistic locks. 
sample.add_write_bytes(write_bytes); - // estimate the cpu time for write by the schdule cpu time and write bytes - let expected_dur = (sample.cpu_time() + Duration::from_micros(write_bytes as u64)) - * SCHEDULER_CPU_TIME_FACTOR; if let Some(limiter) = resource_limiter { + let expected_dur = if limiter.is_background() { + // estimate the cpu time for write by the schduling cpu time and write bytes + (sample.cpu_time() + Duration::from_micros(write_bytes as u64)) + * SCHEDULER_CPU_TIME_FACTOR + } else { + sample.cpu_time() + }; limiter .async_consume( expected_dur, @@ -2032,6 +2037,8 @@ mod tests { enable_async_apply_prewrite: false, ..Default::default() }; + let resource_manager = Arc::new(ResourceGroupManager::default()); + let controller = resource_manager.derive_controller("test".into(), false); ( TxnScheduler::new( engine.clone(), @@ -2049,11 +2056,8 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), latest_feature_gate(), - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - true, - ))), - None, + Some(controller), + Some(resource_manager), ), engine, ) @@ -2388,6 +2392,8 @@ mod tests { }; let feature_gate = FeatureGate::default(); feature_gate.set_version("6.0.0").unwrap(); + let resource_manager = Arc::new(ResourceGroupManager::default()); + let controller = resource_manager.derive_controller("test".into(), false); let scheduler = TxnScheduler::new( engine, @@ -2405,11 +2411,8 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), feature_gate.clone(), - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - true, - ))), - None, + Some(controller), + Some(resource_manager), ); // Use sync mode if pipelined_pessimistic_lock is false. 
assert_eq!(scheduler.pessimistic_lock_mode(), PessimisticLockMode::Sync); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 533d8d0abd4..fec1ccc931d 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -352,12 +352,12 @@ fn test_scheduler_pool_auto_switch_for_resource_ctl() { .get(&1) .unwrap() .clone(); - let resource_manager = ResourceGroupManager::default(); + let resource_manager = Arc::new(ResourceGroupManager::default()); let resource_ctl = resource_manager.derive_controller("test".to_string(), true); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .config(cluster.cfg.tikv.storage.clone()) - .build_for_resource_controller(resource_ctl) + .build_for_resource_controller(resource_manager.clone(), resource_ctl) .unwrap(); let region = cluster.get_region(b"k1"); From 10c9d52fecccd8a7c19372e74fb9f0b010f83dc4 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 16 Nov 2023 14:24:48 +0800 Subject: [PATCH 116/220] resource_control: do not force set override priority at handle gRPC request (#16003) (#16004) close tikv/tikv#15994 Signed-off-by: glorv Co-authored-by: glorv --- .../resource_control/src/resource_group.rs | 21 ++++++- src/server/service/kv.rs | 62 +++++++------------ 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index b45a9833bb8..7e6d4279a25 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -77,7 +77,10 @@ impl TaskPriority { impl From for TaskPriority { fn from(value: u32) -> Self { // map the resource group priority value (1,8,16) to (Low,Medium,High) - if value < 6 { + // 0 means the priority is not set, so map it to medium by default. 
+ if value == 0 { + Self::Medium + } else if value < 6 { Self::Low } else if value < 11 { Self::Medium @@ -1430,4 +1433,20 @@ pub(crate) mod tests { &mgr.priority_limiters[1] )); } + + #[test] + fn test_task_priority() { + use TaskPriority::*; + let cases = [ + (0, Medium), + (1, Low), + (7, Medium), + (8, Medium), + (15, High), + (16, High), + ]; + for (value, priority) in cases { + assert_eq!(TaskPriority::from(value), priority); + } + } } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 3be66a47f79..77f92d33d95 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -192,14 +192,14 @@ macro_rules! handle_request { handle_request!($fn_name, $future_name, $req_ty, $resp_ty, no_time_detail); }; ($fn_name: ident, $future_name: ident, $req_ty: ident, $resp_ty: ident, $time_detail: tt) => { - fn $fn_name(&mut self, ctx: RpcContext<'_>, mut req: $req_ty, sink: UnarySink<$resp_ty>) { + fn $fn_name(&mut self, ctx: RpcContext<'_>, req: $req_ty, sink: UnarySink<$resp_ty>) { forward_unary!(self.proxy, $fn_name, ctx, req, sink); let begin_instant = Instant::now(); let source = req.get_context().get_request_source().to_owned(); - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -229,20 +229,6 @@ macro_rules! handle_request { } } -// consume resource group penalty and set explicit group priority -// We override the override_priority here to make handling tasks easier. 
-fn consume_penalty_and_set_priority( - resource_manager: &Arc, - resource_control_ctx: &mut ResourceControlContext, -) { - resource_manager.consume_penalty(resource_control_ctx); - if resource_control_ctx.get_override_priority() == 0 { - let prioirty = resource_manager - .get_resource_group_priority(resource_control_ctx.get_resource_group_name()); - resource_control_ctx.override_priority = prioirty as u64; - } -} - macro_rules! set_total_time { ($resp:ident, $duration:expr,no_time_detail) => {}; ($resp:ident, $duration:expr,has_time_detail) => { @@ -490,12 +476,12 @@ impl Tikv for Service { ctx.spawn(task); } - fn coprocessor(&mut self, ctx: RpcContext<'_>, mut req: Request, sink: UnarySink) { + fn coprocessor(&mut self, ctx: RpcContext<'_>, req: Request, sink: UnarySink) { forward_unary!(self.proxy, coprocessor, ctx, req, sink); let source = req.get_context().get_request_source().to_owned(); - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -527,13 +513,13 @@ impl Tikv for Service { fn raw_coprocessor( &mut self, ctx: RpcContext<'_>, - mut req: RawCoprocessorRequest, + req: RawCoprocessorRequest, sink: UnarySink, ) { let source = req.get_context().get_request_source().to_owned(); - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC 
.with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -616,13 +602,13 @@ impl Tikv for Service { fn coprocessor_stream( &mut self, ctx: RpcContext<'_>, - mut req: Request, + req: Request, mut sink: ServerStreamingSink, ) { let begin_instant = Instant::now(); - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = &self.resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1163,10 +1149,10 @@ fn handle_batch_commands_request( let resp = future::ok(batch_commands_response::Response::default()); response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, - Some(batch_commands_request::request::Cmd::Get(mut req)) => { - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + Some(batch_commands_request::request::Cmd::Get(req)) => { + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1184,10 +1170,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::kv_get, source); } }, - Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + Some(batch_commands_request::request::Cmd::RawGet(req)) => { + let resource_control_ctx = req.get_context().get_resource_control_context(); if 
let Some(resource_manager) = resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1205,10 +1191,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get, source); } }, - Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + Some(batch_commands_request::request::Cmd::Coprocessor(req)) => { + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) @@ -1239,10 +1225,10 @@ fn handle_batch_commands_request( String::default(), ); } - $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { - let resource_control_ctx = req.mut_context().mut_resource_control_context(); + $(Some(batch_commands_request::request::Cmd::$cmd(req)) => { + let resource_control_ctx = req.get_context().get_resource_control_context(); if let Some(resource_manager) = resource_manager { - consume_penalty_and_set_priority(resource_manager, resource_control_ctx); + resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC .with_label_values(&[resource_control_ctx.get_resource_group_name()]) From 4e35738a351f3954c7e2665f2d5f8ec52a9f9255 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 17 Nov 2023 12:23:16 +0800 Subject: [PATCH 117/220] raftstore: fine-tune SlowScore. 
(#15991) (#16002) ref tikv/tikv#15909, close tikv/tikv#16011 Signed-off-by: lucasliang Co-authored-by: lucasliang Co-authored-by: tonyxuqqi --- components/raftstore/src/store/util.rs | 35 ++++++++++++++++++--- components/raftstore/src/store/worker/pd.rs | 4 ++- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index d9076a67d8a..d1ef3fde75a 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1733,13 +1733,38 @@ pub struct RaftstoreDuration { } impl RaftstoreDuration { + #[inline] pub fn sum(&self) -> std::time::Duration { - self.store_wait_duration.unwrap_or_default() - + self.store_process_duration.unwrap_or_default() + self.delays_on_disk_io(true) + self.delays_on_net_io() + } + + #[inline] + /// Returns the delayed duration on Disk I/O. + pub fn delays_on_disk_io(&self, include_wait_duration: bool) -> std::time::Duration { + let duration = self.store_process_duration.unwrap_or_default() + self.store_write_duration.unwrap_or_default() - + self.store_commit_duration.unwrap_or_default() - + self.apply_wait_duration.unwrap_or_default() - + self.apply_process_duration.unwrap_or_default() + + self.apply_process_duration.unwrap_or_default(); + if include_wait_duration { + duration + + self.store_wait_duration.unwrap_or_default() + + self.apply_wait_duration.unwrap_or_default() + } else { + duration + } + } + + #[inline] + /// Returns the delayed duration on Network I/O. + /// + /// Normally, it can be reflected by the duraiton on + /// `store_commit_duraiton`. + pub fn delays_on_net_io(&self) -> std::time::Duration { + // The `store_commit_duration` serves as an indicator for latency + // during the duration of transferring Raft logs to peers and appending + // logs. In most scenarios, instances of latency fluctuations in the + // network are reflected by this duration. 
Hence, it is selected as a + // representative of network latency. + self.store_commit_duration.unwrap_or_default() } } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 606576b22e4..935e93f1ba9 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -2255,7 +2255,9 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => { - self.slow_score.record(id, duration.sum()); + // Fine-tuned, `SlowScore` only takes the I/O jitters on the disk into account. + self.slow_score + .record(id, duration.delays_on_disk_io(false)); self.slow_trend_cause.record( tikv_util::time::duration_to_us(duration.store_wait_duration.unwrap()), Instant::now(), From 05be690efe6b708c6a38e65eb668ade0051ebe0f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 17 Nov 2023 15:06:47 +0800 Subject: [PATCH 118/220] raftstore: fix an OOM issue by paginate scan unapplied config changes (#15806) (#15814) close tikv/tikv#15770 Before start election, raft-rs has to check if there is any unapplied conf change entry. In the current implementation, this needs to scan logs from [unapplied_index, committed_index]. It essentially takes unbounded memory when raft peers that has many unapplied logs. To fix the issue, TiKV can paginate scan raft log which has a fixed memory usage upper bound. 
Signed-off-by: Neil Shen Co-authored-by: Neil Shen Co-authored-by: tonyxuqqi --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 901b768d24d..e2013144cea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4316,7 +4316,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#9d360a3b0cdb691da8e500a4f73c457b605a1d73" +source = "git+https://github.com/tikv/raft-rs?branch=master#f60fb9e143e5b93f7db8917ea376cda04effcbb4" dependencies = [ "bytes", "fxhash", @@ -4375,7 +4375,7 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#9d360a3b0cdb691da8e500a4f73c457b605a1d73" +source = "git+https://github.com/tikv/raft-rs?branch=master#f60fb9e143e5b93f7db8917ea376cda04effcbb4" dependencies = [ "bytes", "protobuf", From 30b578fdf20c1fd0d73a00ca26d57adb0242652f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 17 Nov 2023 15:21:47 +0800 Subject: [PATCH 119/220] raftstore: gc uninitialized stale peer after merge (#15934) (#15939) close tikv/tikv#15919 A "stale peer" refers to a peer that still exists on a TiKV node but has been removed from the raft group, typically through a confchange operation. TiKV performs regular checks and validations on its peers to ensure that no such stale peer exists. However, the current stale peer check is not enough when dealing with uninitialized stale peers that its region has been merged. These uninitialized stale peers are left indefinitely, consuming CPU, memory, and blocking resolved ts. This commit introduces an in-memory state for peers whose raft messages create uninitialized stale peers. The stale peer then sends a MsgCheckStalePeer message to the corresponding peer, validating whether it should be removed. 
Signed-off-by: Neil Shen Co-authored-by: Neil Shen Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/peer.rs | 5 ++ components/raftstore/src/store/fsm/store.rs | 5 ++ components/raftstore/src/store/peer.rs | 13 ++++- components/raftstore/src/store/worker/pd.rs | 10 +++- tests/integrations/raftstore/test_merge.rs | 57 +++++++++++++++++++ .../integrations/raftstore/test_stale_peer.rs | 47 +++++++++++++++ 6 files changed, 134 insertions(+), 3 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 79e02fd8272..98cc9ae16b7 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -286,6 +286,7 @@ where region, meta_peer, wait_data, + None, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -316,6 +317,7 @@ where engines: Engines, region_id: u64, peer: metapb::Peer, + create_by_peer: metapb::Peer, ) -> Result> { // We will remove tombstone key when apply snapshot info!( @@ -323,6 +325,8 @@ where "region_id" => region_id, "peer_id" => peer.get_id(), "store_id" => store_id, + "create_by_peer_id" => create_by_peer.get_id(), + "create_by_peer_store_id" => create_by_peer.get_store_id(), ); let mut region = metapb::Region::default(); @@ -342,6 +346,7 @@ where ®ion, peer, false, + Some(create_by_peer), )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 2efcbf87b09..ae9fd9caa18 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -674,6 +674,8 @@ where "region_id" => region_id, "current_region_epoch" => ?cur_epoch, "msg_type" => ?msg_type, + "to_peer_id" => ?from_peer.get_id(), + "to_peer_store_id" => ?from_peer.get_store_id(), ); 
self.raft_metrics.message_dropped.stale_msg.inc(); @@ -692,6 +694,8 @@ where error!(?e; "send gc message failed"; "region_id" => region_id, + "to_peer_id" => ?from_peer.get_id(), + "to_peer_store_id" => ?from_peer.get_store_id(), ); } } @@ -2376,6 +2380,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.engines.clone(), region_id, target.clone(), + msg.get_from_peer().clone(), )?; // WARNING: The checking code must be above this line. diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 1e78be03be9..57a684f9fe5 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -707,6 +707,8 @@ where pub peer_heartbeats: HashMap, /// Record the waiting data status of each follower or learner peer. pub wait_data_peers: Vec, + /// This peer is created by a raft message from `create_by_peer`. + create_by_peer: Option, proposals: ProposalQueue>, leader_missing_time: Option, @@ -903,6 +905,7 @@ where region: &metapb::Region, peer: metapb::Peer, wait_data: bool, + create_by_peer: Option, ) -> Result> { let peer_id = peer.get_id(); if peer_id == raft::INVALID_ID { @@ -957,6 +960,7 @@ where peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), wait_data_peers: Vec::default(), + create_by_peer, peers_start_pending_time: vec![], down_peer_ids: vec![], split_check_trigger: SplitCheckTrigger::default(), @@ -5432,9 +5436,16 @@ where &mut self, ctx: &mut PollContext, ) { - if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() { + if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() + || self.region().get_region_epoch().get_conf_ver() == 0 + { self.check_stale_conf_ver = self.region().get_region_epoch().get_conf_ver(); self.check_stale_peers = self.region().get_peers().to_vec(); + if let Some(create_by_peer) = self.create_by_peer.as_ref() { + // Push create_by_peer in case the peer 
is removed before + // initialization which has no peer in region. + self.check_stale_peers.push(create_by_peer.clone()); + } } for peer in &self.check_stale_peers { if peer.get_id() == self.peer_id() { diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 935e93f1ba9..5e97adf8d3e 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1567,8 +1567,14 @@ where } } Ok(None) => { - // splitted Region has not yet reported to PD. - // TODO: handle merge + // Splitted region has not yet reported to PD. + // + // Or region has been merged. This case is handled by + // message `MsgCheckStalePeer`, stale peers will be + // removed eventually. + PD_VALIDATE_PEER_COUNTER_VEC + .with_label_values(&["region not found"]) + .inc(); } Err(e) => { error!("get region failed"; "err" => ?e); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 8d93d2c5a5c..7d964c03319 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -407,6 +407,63 @@ fn test_node_check_merged_message() { must_get_none(&engine3, b"v5"); } +/// Test if an uninitialized stale peer will be handled properly after merge. 
+#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_gc_uninitialized_peer_after_merge() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::millis(150); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + cluster.must_split(®ion, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an an uninitialized stale peer. 
+ pd_client.must_add_peer(left.get_id(), new_peer(4, 4)); + cluster.must_region_exist(left.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(left.get_id(), new_peer(4, 4)); + pd_client.must_merge(left.get_id(), right.get_id()); + cluster.clear_send_filters(); + + // Wait for the peer (4,4) to be destroyed. + sleep_ms( + 2 * cluster + .cfg + .raft_store + .max_leader_missing_duration + .as_millis(), + ); + cluster.must_region_not_exist(left.get_id(), 4); +} + // Test if a merge handled properly when there is a unfinished slow split before // merge. // No v2, it requires all peers to be available to check trim status. diff --git a/tests/integrations/raftstore/test_stale_peer.rs b/tests/integrations/raftstore/test_stale_peer.rs index e12584d6c60..f76373756f9 100644 --- a/tests/integrations/raftstore/test_stale_peer.rs +++ b/tests/integrations/raftstore/test_stale_peer.rs @@ -6,8 +6,10 @@ use std::{sync::Arc, thread, time::*}; use engine_traits::{Peekable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RegionLocalState}; +use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, HandyRwLock}; /// A helper function for testing the behaviour of the gc of stale peer @@ -310,3 +312,48 @@ fn test_stale_learner_with_read_index() { let state: RegionLocalState = engine3.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); } + +/// Test if an uninitialized stale peer will be removed after restart. 
+#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_restart_gc_uninitialized_peer_after_merge() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::millis(150); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(region.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an an uninitialized stale peer. + pd_client.must_add_peer(region.get_id(), new_peer(4, 4)); + cluster.must_region_exist(region.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(region.get_id(), new_peer(4, 4)); + + // An uninitialized stale peer is removed automatically after restart. 
+ cluster.stop_node(4); + cluster.run_node(4).unwrap(); + cluster.must_region_not_exist(region.get_id(), 4); +} From e9e9ba13268378c92b254a1ed0eedf98e777c0ac Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 23 Nov 2023 14:17:40 +0800 Subject: [PATCH 120/220] update default value of region_compact_redundant_rows_percent (#16051) (#16052) ref tikv/tikv#15282 Change the default value of raftstore.redundant-rows-percent-threshold to 20 from 100. This would triggers a compaction when a region has 20% entries as stale MVCC versions. Signed-off-by: tonyxuqqi Co-authored-by: tonyxuqqi --- components/raftstore/src/store/config.rs | 11 +---------- etc/config-template.toml | 2 +- src/config/mod.rs | 2 +- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 81009dd5d59..f7e1fc9a0a0 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -431,7 +431,7 @@ impl Default for Config { region_compact_min_tombstones: 10000, region_compact_tombstones_percent: 30, region_compact_min_redundant_rows: 50000, - region_compact_redundant_rows_percent: None, + region_compact_redundant_rows_percent: Some(20), pd_heartbeat_tick_interval: ReadableDuration::minutes(1), pd_store_heartbeat_tick_interval: ReadableDuration::secs(10), notify_capacity: 40960, @@ -616,15 +616,6 @@ impl Config { } } - if self.region_compact_redundant_rows_percent.is_none() { - if raft_kv_v2 { - self.region_compact_redundant_rows_percent = Some(20); - } else { - // Disable redundant rows check in default for v1. - self.region_compact_redundant_rows_percent = Some(100); - } - } - // When use raft kv v2, we can set raft log gc size limit to a smaller value to // avoid too many entry logs in cache. 
// The snapshot support to increment snapshot sst, so the old snapshot files diff --git a/etc/config-template.toml b/etc/config-template.toml index 3e55004feb2..8b4f2cf9ed7 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -456,7 +456,7 @@ ## It should be set between 1 and 100. Manual compaction is only triggered when the number of ## duplicated MVCC keys exceeds `region-compact-min-redundant-rows` and the percentage of duplicated MVCC keys ## exceeds `region-compact-redundant-rows-percent`. -# region-compact-redundant-rows-percent = 100 +# region-compact-redundant-rows-percent = 20 ## Interval to check whether to start a manual compaction for Lock Column Family. ## If written bytes reach `lock-cf-compact-bytes-threshold` for Lock Column Family, TiKV will diff --git a/src/config/mod.rs b/src/config/mod.rs index b192a7ac5f7..8c8cf81b8f1 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -6780,7 +6780,7 @@ mod tests { cfg.raft_store .region_compact_redundant_rows_percent .unwrap(), - 100 + 20 ); let content = r#" From 3a68f777a4abd23e6ac86ae5ab6b84d6a890cfbe Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 27 Nov 2023 14:56:44 +0800 Subject: [PATCH 121/220] raftstore: update apply state even if peer is removed (#16060) (#16083) close tikv/tikv#16069, close pingcap/tidb#48802 When a peer is removed, it is necessary to update its apply state because this peer may be simultaneously taking a snapshot. An outdated apply state will invalidate the coprocessor cache assumption and potentially lead to a violation of linearizability (returning stale cache). 
Signed-off-by: Neil Shen Co-authored-by: Neil Shen --- components/raftstore/src/store/fsm/apply.rs | 146 +++++++++++++++++++- 1 file changed, 143 insertions(+), 3 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 339dff68e76..1639f441e38 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -662,9 +662,7 @@ where results: VecDeque>, ) { if self.host.pre_persist(&delegate.region, true, None) { - if !delegate.pending_remove { - delegate.maybe_write_apply_state(self); - } + delegate.maybe_write_apply_state(self); self.commit_opt(delegate, false); } else { debug!("do not persist when finish_for"; @@ -5532,6 +5530,21 @@ mod tests { ) } + fn cb_conf_change( + idx: u64, + term: u64, + tx: Sender, + ) -> Proposal> { + proposal( + true, + idx, + term, + Callback::write(Box::new(move |resp: WriteResponse| { + tx.send(resp.response).unwrap(); + })), + ) + } + struct EntryBuilder { entry: Entry, req: RaftCmdRequest, @@ -5659,6 +5672,14 @@ mod tests { self } + fn conf_change(mut self, changes: Vec) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeerV2); + req.mut_change_peer_v2().set_changes(changes.into()); + self.req.set_admin_request(req); + self + } + fn build(mut self) -> Entry { self.entry .set_data(self.req.write_to_bytes().unwrap().into()); @@ -7656,6 +7677,125 @@ mod tests { system.shutdown(); } + // When a peer is removed, it is necessary to update its apply state because + // this peer may be simultaneously taking a snapshot. An outdated apply state + // invalidates the coprocessor cache assumption (apply state must match data + // in the snapshot) and potentially lead to a violation of linearizability + // (returning stale cache). 
+ #[test] + fn test_conf_change_remove_node_update_apply_state() { + let (_path, engine) = create_tmp_engine("test-delegate"); + let (_import_dir, importer) = create_tmp_importer("test-delegate"); + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + term: 1, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3), new_peer(4, 5), new_learner_peer(6, 7)]; + reg.region.set_peers(peers.into()); + let (tx, apply_res_rx) = mpsc::channel(); + let sender = Box::new(TestNotifier { tx }); + let coprocessor_host = CoprocessorHost::::default(); + let (region_scheduler, _) = dummy_scheduler(); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + importer, + region_scheduler, + coprocessor_host, + engine: engine.clone(), + router: router.clone(), + store_id: 2, + pending_create_peers, + }; + system.spawn("test-conf-change".to_owned(), builder); + + router.schedule_task(1, Msg::Registration(reg.dup())); + + let mut index_id = 1; + let epoch = reg.region.get_region_epoch().to_owned(); + + // Write some data. 
+ let (capture_tx, capture_rx) = mpsc::channel(); + let put_entry = EntryBuilder::new(index_id, 1) + .put(b"k1", b"v1") + .epoch(epoch.get_conf_ver(), epoch.get_version()) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![put_entry], + vec![cb(index_id, 1, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let initial_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap(); + assert_ne!(initial_state.get_applied_index(), 0); + match apply_res_rx.recv_timeout(Duration::from_secs(3)) { + Ok(PeerMsg::ApplyRes { + res: TaskRes::Apply(apply_res), + }) => assert_eq!(apply_res.apply_state, initial_state), + e => panic!("unexpected result: {:?}", e), + } + index_id += 1; + + // Remove itself. + let (capture_tx, capture_rx) = mpsc::channel(); + let mut remove_node = ChangePeerRequest::default(); + remove_node.set_change_type(ConfChangeType::RemoveNode); + remove_node.set_peer(new_peer(2, 3)); + let conf_change = EntryBuilder::new(index_id, 1) + .conf_change(vec![remove_node]) + .epoch(epoch.get_conf_ver(), epoch.get_version()) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![conf_change], + vec![cb_conf_change(index_id, 1, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let apply_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap(); + match apply_res_rx.recv_timeout(Duration::from_secs(3)) { + Ok(PeerMsg::ApplyRes { + res: TaskRes::Apply(apply_res), + }) => assert_eq!(apply_res.apply_state, apply_state), + e => panic!("unexpected result: {:?}", e), + } + assert!( + apply_state.get_applied_index() > initial_state.get_applied_index(), + "\n{:?}\n{:?}", + apply_state, + initial_state + ); + 
+ system.shutdown(); + } + #[test] fn pending_cmd_leak() { let res = panic_hook::recover_safe(|| { From bd8a0aabd08fd77687f788e0b45858ccd3516e4d Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 28 Nov 2023 20:41:47 +0800 Subject: [PATCH 122/220] titan: update titan to fix compaction filter (#16092) (#16093) close tikv/tikv#16091 update titan to fix compaction filter Signed-off-by: ti-chi-bot Signed-off-by: Connor1996 Co-authored-by: Connor Co-authored-by: Connor1996 --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2013144cea..aa545ed6497 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2998,7 +2998,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#aa41eb102d373f56846be88ffd250c2b581b48d4" +source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3017,7 +3017,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#aa41eb102d373f56846be88ffd250c2b581b48d4" +source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" dependencies = [ "bzip2-sys", "cc", @@ -4936,7 +4936,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#aa41eb102d373f56846be88ffd250c2b581b48d4" +source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" dependencies = [ "libc 0.2.146", "librocksdb_sys", From 9b20517295821bc4972757d5a3aaba585445fe35 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 6 Dec 2023 17:19:21 +0800 Subject: [PATCH 123/220] resource_control: replace limited future's post delay with delay before first poll (#16100) (#16150) ref tikv/tikv#16026 Signed-off-by: glorv Co-authored-by: glorv --- 
components/resource_control/src/future.rs | 63 +++++++++++-------- .../resource_control/src/resource_limiter.rs | 37 ++++++----- components/resource_control/src/service.rs | 2 + components/resource_control/src/worker.rs | 31 +++++---- 4 files changed, 81 insertions(+), 52 deletions(-) diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs index 53bca48b301..0750a21c574 100644 --- a/components/resource_control/src/future.rs +++ b/components/resource_control/src/future.rs @@ -92,7 +92,9 @@ pub struct LimitedFuture { #[pin] post_delay: OptionalFuture>, resource_limiter: Arc, - res: Poll, + // if the future is first polled, we need to let it consume a 0 value + // to compensate the debt of previously finished tasks. + is_first_poll: bool, } impl LimitedFuture { @@ -102,7 +104,7 @@ impl LimitedFuture { pre_delay: None.into(), post_delay: None.into(), resource_limiter, - res: Poll::Pending, + is_first_poll: true, } } } @@ -112,19 +114,32 @@ impl Future for LimitedFuture { fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); - if !this.post_delay.is_done() { - assert!(this.pre_delay.is_done()); + if *this.is_first_poll { + debug_assert!(this.pre_delay.finished && this.post_delay.finished); + *this.is_first_poll = false; + let wait_dur = this + .resource_limiter + .consume(Duration::ZERO, IoBytes::default(), true) + .min(MAX_WAIT_DURATION); + if wait_dur > Duration::ZERO { + *this.pre_delay = Some( + GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + wait_dur) + .compat(), + ) + .into(); + } + } + if !this.post_delay.finished { + assert!(this.pre_delay.finished); std::mem::swap(&mut *this.pre_delay, &mut *this.post_delay); } - if !this.pre_delay.is_done() { + if !this.pre_delay.finished { let res = this.pre_delay.poll(cx); if res.is_pending() { return Poll::Pending; } } - if this.res.is_ready() { - return std::mem::replace(this.res, Poll::Pending); - } // get io stats is very 
expensive, so we only do so if only io control is // enabled. let mut last_io_bytes = None; @@ -157,8 +172,10 @@ impl Future for LimitedFuture { } else { IoBytes::default() }; - let mut wait_dur = this.resource_limiter.consume(dur, io_bytes); - if wait_dur == Duration::ZERO { + let mut wait_dur = this + .resource_limiter + .consume(dur, io_bytes, res.is_pending()); + if wait_dur == Duration::ZERO || res.is_ready() { return res; } if wait_dur > MAX_WAIT_DURATION { @@ -171,31 +188,24 @@ impl Future for LimitedFuture { .compat(), ) .into(); - if this.post_delay.poll(cx).is_ready() { - return res; - } - *this.res = res; + _ = this.post_delay.poll(cx); Poll::Pending } } /// `OptionalFuture` is similar to futures::OptionFuture, but provide an extra -/// `is_done` method. +/// `finished` flag to determine if the future requires poll. #[pin_project] struct OptionalFuture { #[pin] f: Option, - done: bool, + finished: bool, } impl OptionalFuture { fn new(f: Option) -> Self { - let done = f.is_none(); - Self { f, done } - } - - fn is_done(&self) -> bool { - self.done + let finished = f.is_none(); + Self { f, finished } } } @@ -212,7 +222,7 @@ impl Future for OptionalFuture { let this = self.project(); match this.f.as_pin_mut() { Some(x) => x.poll(cx).map(|r| { - *this.done = true; + *this.finished = true; Some(r) }), None => Poll::Ready(None), @@ -312,7 +322,7 @@ mod tests { let delta = new_stats - stats; let dur = start.saturating_elapsed(); assert_eq!(delta.total_consumed, 150); - assert_eq!(delta.total_wait_dur_us, 150_000); + assert!(delta.total_wait_dur_us >= 140_000 && delta.total_wait_dur_us <= 160_000); assert!(dur >= Duration::from_millis(150) && dur <= Duration::from_millis(160)); // fetch io bytes failed, consumed value is 0. 
@@ -320,7 +330,10 @@ mod tests { { fail::cfg("failed_to_get_thread_io_bytes_stats", "1*return").unwrap(); spawn_and_wait(&pool, empty(), resource_limiter.clone()); - assert_eq!(resource_limiter.get_limit_statistics(Io), new_stats); + assert_eq!( + resource_limiter.get_limit_statistics(Io).total_consumed, + new_stats.total_consumed + ); fail::remove("failed_to_get_thread_io_bytes_stats"); } } diff --git a/components/resource_control/src/resource_limiter.rs b/components/resource_control/src/resource_limiter.rs index bce6867ac2e..ab2144f18cc 100644 --- a/components/resource_control/src/resource_limiter.rs +++ b/components/resource_control/src/resource_limiter.rs @@ -71,19 +71,22 @@ impl ResourceLimiter { self.is_background } - pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration { + pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes, wait: bool) -> Duration { let cpu_dur = - self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64); - let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes); + self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64, wait); + let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes, wait); let wait_dur = cpu_dur.max(io_dur); - BACKGROUND_TASKS_WAIT_DURATION - .with_label_values(&[&self.name]) - .inc_by(wait_dur.as_micros() as u64); + if wait_dur > Duration::ZERO { + BACKGROUND_TASKS_WAIT_DURATION + .with_label_values(&[&self.name]) + .inc_by(wait_dur.as_micros() as u64); + } + wait_dur } pub async fn async_consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration { - let dur = self.consume(cpu_time, io_bytes); + let dur = self.consume(cpu_time, io_bytes, true); if !dur.is_zero() { _ = GLOBAL_TIMER_HANDLE .delay(Instant::now() + dur) @@ -154,12 +157,14 @@ impl QuotaLimiter { ) } - fn consume(&self, value: u64) -> Duration { - if value == 0 { + fn consume(&self, value: u64, wait: bool) -> Duration { + if value == 0 && 
self.limiter.speed_limit().is_infinite() { return Duration::ZERO; } - let dur = self.limiter.consume_duration(value as usize); - if dur != Duration::ZERO { + let mut dur = self.limiter.consume_duration(value as usize); + if !wait { + dur = Duration::ZERO; + } else if dur != Duration::ZERO { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } @@ -167,16 +172,18 @@ impl QuotaLimiter { dur } - fn consume_io(&self, value: IoBytes) -> Duration { + fn consume_io(&self, value: IoBytes, wait: bool) -> Duration { self.read_bytes.fetch_add(value.read, Ordering::Relaxed); self.write_bytes.fetch_add(value.write, Ordering::Relaxed); let value = value.read + value.write; - if value == 0 { + if value == 0 && self.limiter.speed_limit().is_infinite() { return Duration::ZERO; } - let dur = self.limiter.consume_duration(value as usize); - if dur != Duration::ZERO { + let mut dur = self.limiter.consume_duration(value as usize); + if !wait { + dur = Duration::ZERO; + } else if dur != Duration::ZERO { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index 2c2bbdc5549..26652cda00e 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -575,6 +575,7 @@ pub mod tests { read: 1000, write: 1000, }, + true, ); // Wait for report ru metrics. std::thread::sleep(Duration::from_millis(100)); @@ -595,6 +596,7 @@ pub mod tests { read: 2000, write: 2000, }, + true, ); // Wait for report ru metrics. 
std::thread::sleep(Duration::from_millis(100)); diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs index 79dea73d0ae..1dbcd9ffaf0 100644 --- a/components/resource_control/src/worker.rs +++ b/components/resource_control/src/worker.rs @@ -690,6 +690,7 @@ mod tests { read: 1000, write: 1000, }, + true, ); worker.adjust_quota(); check_limiter( @@ -719,6 +720,7 @@ mod tests { read: 1000, write: 1000, }, + true, ); worker.adjust_quota(); check_limiter( @@ -737,6 +739,7 @@ mod tests { read: 5000, write: 5000, }, + true, ); worker.adjust_quota(); check_limiter( @@ -788,6 +791,7 @@ mod tests { read: 600, write: 600, }, + true, ); bg_limiter.consume( Duration::from_millis(1800), @@ -795,6 +799,7 @@ mod tests { read: 900, write: 900, }, + true, ); worker.adjust_quota(); check_limiter( @@ -863,6 +868,7 @@ mod tests { read: 600, write: 600, }, + true, ); new_bg_limiter.consume( Duration::from_millis(1800), @@ -870,6 +876,7 @@ mod tests { read: 900, write: 900, }, + true, ); worker.adjust_quota(); @@ -944,7 +951,7 @@ mod tests { // only default group, always return infinity. 
reset_quota(&mut worker, 6.4); - priority_limiters[1].consume(Duration::from_secs(50), IoBytes::default()); + priority_limiters[1].consume(Duration::from_secs(50), IoBytes::default(), true); worker.adjust(); check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); @@ -954,46 +961,46 @@ mod tests { resource_ctl.add_resource_group(rg2); reset_quota(&mut worker, 6.4); - priority_limiters[1].consume(Duration::from_secs(64), IoBytes::default()); + priority_limiters[1].consume(Duration::from_secs(64), IoBytes::default(), true); worker.adjust(); check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); reset_quota(&mut worker, 6.4); for _i in 0..100 { - priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default()); - priority_limiters[1].consume(Duration::from_millis(400), IoBytes::default()); + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(400), IoBytes::default(), true); } worker.adjust(); check_limiter(f64::INFINITY, 5.2, 1.2); reset_quota(&mut worker, 6.4); for _i in 0..100 { - priority_limiters[0].consume(Duration::from_millis(120), IoBytes::default()); - priority_limiters[1].consume(Duration::from_millis(200), IoBytes::default()); + priority_limiters[0].consume(Duration::from_millis(120), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(200), IoBytes::default(), true); } worker.adjust(); check_limiter(f64::INFINITY, 2.6, 0.6); reset_quota(&mut worker, 6.4); for _i in 0..100 { - priority_limiters[2].consume(Duration::from_millis(200), IoBytes::default()); + priority_limiters[2].consume(Duration::from_millis(200), IoBytes::default(), true); } worker.adjust(); check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); reset_quota(&mut worker, 8.0); for _i in 0..100 { - priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default()); - priority_limiters[1].consume(Duration::from_millis(240), IoBytes::default()); - 
priority_limiters[2].consume(Duration::from_millis(320), IoBytes::default()); + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[2].consume(Duration::from_millis(320), IoBytes::default(), true); } worker.adjust(); check_limiter(f64::INFINITY, 5.2, 2.8); reset_quota(&mut worker, 6.0); for _i in 0..100 { - priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default()); - priority_limiters[2].consume(Duration::from_millis(360), IoBytes::default()); + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[2].consume(Duration::from_millis(360), IoBytes::default(), true); } worker.adjust(); check_limiter(f64::INFINITY, 5.2, 5.2); From ec09e54b456c7b638387f6c80628cbb0258f6be4 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Fri, 8 Dec 2023 14:41:50 +0800 Subject: [PATCH 124/220] [cherry-pick-7.5] raftstore: fine-tunes slow score (#16087) (#16153) ref tikv/tikv#15909 Cherry-pick the fine-tuning works on SlowScore from nightly to release-7.5. 
Signed-off-by: lucasliang Co-authored-by: tonyxuqqi --- components/raftstore-v2/src/operation/life.rs | 7 +- .../raftstore-v2/src/operation/ready/mod.rs | 16 ++-- components/raftstore/src/store/config.rs | 10 ++- .../raftstore/src/store/local_metrics.rs | 80 +++++++++++++++---- components/raftstore/src/store/worker/pd.rs | 57 ++++++++++--- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 65 +++------------ 7 files changed, 140 insertions(+), 96 deletions(-) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 00df317f73a..e9fc84643da 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -45,6 +45,7 @@ use raftstore::{ life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, Proposal, }, + local_metrics::IoType as InspectIoType, metrics::RAFT_PEER_PENDING_DURATION, util, DiskFullPeers, Transport, WriteTask, }, @@ -579,9 +580,9 @@ impl Store { { // Record the last statistics of commit-log-duration and store-write-duration. inspector.record_store_wait(start_ts.saturating_elapsed()); - inspector.record_store_commit(ctx.raft_metrics.stat_commit_log.avg()); - // Reset the stat_commit_log and wait it to be refreshed in the next tick. - ctx.raft_metrics.stat_commit_log.reset(); + inspector.record_store_commit(ctx.raft_metrics.health_stats.avg(InspectIoType::Network)); + // Reset the health_stats and wait it to be refreshed in the next tick. 
+ ctx.raft_metrics.health_stats.reset(); ctx.pending_latency_inspect.push(inspector); } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index a2697f29f02..39ce9707359 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -43,6 +43,7 @@ use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ fsm::store::StoreRegionMeta, + local_metrics::IoType, needs_evict_entry_cache, util::{self, is_first_append_entry, is_initial_msg}, worker_metrics::SNAP_COUNTER, @@ -989,7 +990,7 @@ impl Peer { return; } let now = Instant::now(); - let stat_raft_commit_log = &mut ctx.raft_metrics.stat_commit_log; + let health_stats = &mut ctx.raft_metrics.health_stats; for i in old_index + 1..=new_index { if let Some((term, trackers)) = self.proposals().find_trackers(i) { if self.entry_storage().term(i).map_or(false, |t| t == term) { @@ -1002,14 +1003,11 @@ impl Peer { for tracker in trackers { // Collect the metrics related to commit_log // durations. - stat_raft_commit_log.record(Duration::from_nanos(tracker.observe( - now, - hist, - |t| { - t.metrics.commit_not_persisted = !commit_persisted; - &mut t.metrics.wf_commit_log_nanos - }, - ))); + let duration = tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); + health_stats.observe(Duration::from_nanos(duration), IoType::Network); } } } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index f7e1fc9a0a0..92704f69e84 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -343,6 +343,9 @@ pub struct Config { // Interval to inspect the latency of raftstore for slow store detection. pub inspect_interval: ReadableDuration, + /// Threshold of CPU utilization to inspect for slow store detection. 
+ #[doc(hidden)]
+ pub inspect_cpu_util_thd: f64,
 // The unsensitive(increase it to reduce sensitiveness) of the cause-trend detection
 pub slow_trend_unsensitive_cause: f64,
@@ -502,7 +505,12 @@ impl Default for Config {
 region_max_size: ReadableSize(0),
 region_split_size: ReadableSize(0),
 clean_stale_peer_delay: ReadableDuration::minutes(0),
- inspect_interval: ReadableDuration::millis(500),
+ inspect_interval: ReadableDuration::millis(100),
+ // The default value of `inspect_cpu_util_thd` is 0.4, which means
+ // when the cpu utilization is greater than 40%, the store might be
+ // regarded as a slow node if there exists delayed inspected messages.
+ // It's good enough for most cases to reduce the false positive rate.
+ inspect_cpu_util_thd: 0.4,
 // The param `slow_trend_unsensitive_cause == 2.0` can yield good results,
 // make it `10.0` to reduce a bit sensitiveness because SpikeFilter is disabled
 slow_trend_unsensitive_cause: 10.0,
diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs
index aceacdb81ee..c96ce2a9a29 100644
--- a/components/raftstore/src/store/local_metrics.rs
+++ b/components/raftstore/src/store/local_metrics.rs
@@ -68,35 +68,81 @@ impl RaftSendMessageMetrics {
 }
 }
 
+/// Buffered statistics for recording local raftstore message duration.
+///
+/// As it's only used for recording local raftstore message duration,
+/// and it will be manually reset periodically, so it's not necessary
+/// to use `LocalHistogram`. 
#[derive(Default)] -pub struct RaftCommitLogStatistics { - pub last_commit_log_duration_sum: Duration, - pub last_commit_log_count_sum: u64, +struct LocalHealthStatistics { + duration_sum: Duration, + count: u64, } -impl RaftCommitLogStatistics { +impl LocalHealthStatistics { #[inline] - pub fn record(&mut self, dur: Duration) { - self.last_commit_log_count_sum += 1; - self.last_commit_log_duration_sum += dur; + fn observe(&mut self, dur: Duration) { + self.count += 1; + self.duration_sum += dur; } #[inline] - pub fn avg(&self) -> Duration { - if self.last_commit_log_count_sum > 0 { - Duration::from_micros( - self.last_commit_log_duration_sum.as_micros() as u64 - / self.last_commit_log_count_sum, - ) + fn avg(&self) -> Duration { + if self.count > 0 { + Duration::from_micros(self.duration_sum.as_micros() as u64 / self.count) } else { Duration::default() } } #[inline] + fn reset(&mut self) { + self.count = 0; + self.duration_sum = Duration::default(); + } +} + +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IoType { + Disk = 0, + Network = 1, +} + +/// Buffered statistics for recording the health of raftstore. +#[derive(Default)] +pub struct HealthStatistics { + // represents periodic latency on the disk io. + disk_io_dur: LocalHealthStatistics, + // represents the latency of the network io. + network_io_dur: LocalHealthStatistics, +} + +impl HealthStatistics { + #[inline] + pub fn observe(&mut self, dur: Duration, io_type: IoType) { + match io_type { + IoType::Disk => self.disk_io_dur.observe(dur), + IoType::Network => self.network_io_dur.observe(dur), + } + } + + #[inline] + pub fn avg(&self, io_type: IoType) -> Duration { + match io_type { + IoType::Disk => self.disk_io_dur.avg(), + IoType::Network => self.network_io_dur.avg(), + } + } + + #[inline] + /// Reset HealthStatistics. + /// + /// Should be manually reset when the metrics are + /// accepted by slowness inspector. 
pub fn reset(&mut self) { - self.last_commit_log_count_sum = 0; - self.last_commit_log_duration_sum = Duration::default(); + self.disk_io_dur.reset(); + self.network_io_dur.reset(); } } @@ -133,7 +179,7 @@ pub struct RaftMetrics { pub wf_commit_not_persist_log: LocalHistogram, // local statistics for slowness - pub stat_commit_log: RaftCommitLogStatistics, + pub health_stats: HealthStatistics, pub leader_missing: Arc>>, @@ -171,7 +217,7 @@ impl RaftMetrics { wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), - stat_commit_log: RaftCommitLogStatistics::default(), + health_stats: HealthStatistics::default(), leader_missing: Arc::default(), last_flush_time: Instant::now_coarse(), } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 5e97adf8d3e..a5bf52567ca 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -41,7 +41,7 @@ use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, store::QueryStats, - sys::thread::StdThreadBuildWrapper, + sys::{thread::StdThreadBuildWrapper, SysQuota}, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, @@ -225,6 +225,9 @@ pub struct StoreStat { pub store_cpu_usages: RecordPairVec, pub store_read_io_rates: RecordPairVec, pub store_write_io_rates: RecordPairVec, + + store_cpu_quota: f64, // quota of cpu usage + store_cpu_busy_thd: f64, } impl Default for StoreStat { @@ -249,10 +252,33 @@ impl Default for StoreStat { store_cpu_usages: RecordPairVec::default(), store_read_io_rates: RecordPairVec::default(), store_write_io_rates: RecordPairVec::default(), + + store_cpu_quota: 0.0_f64, + store_cpu_busy_thd: 0.8_f64, } } } +impl StoreStat { + fn set_cpu_quota(&mut self, cpu_cores: f64, busy_thd: f64) { + 
self.store_cpu_quota = cpu_cores * 100.0; + self.store_cpu_busy_thd = busy_thd; + } + + fn maybe_busy(&self) -> bool { + if self.store_cpu_quota < 1.0 || self.store_cpu_busy_thd > 1.0 { + return false; + } + + let mut cpu_usage = 0_u64; + for record in self.store_cpu_usages.iter() { + cpu_usage += record.get_value(); + } + + (cpu_usage as f64 / self.store_cpu_quota) >= self.store_cpu_busy_thd + } +} + #[derive(Default)] pub struct PeerStat { pub read_bytes: u64, @@ -864,14 +890,14 @@ impl SlowScore { } } - fn record(&mut self, id: u64, duration: Duration) { + fn record(&mut self, id: u64, duration: Duration, not_busy: bool) { self.last_record_time = Instant::now(); if id != self.last_tick_id { return; } self.last_tick_finished = true; self.total_requests += 1; - if duration >= self.inspect_interval { + if not_busy && duration >= self.inspect_interval { self.timeout_requests += 1; } } @@ -986,6 +1012,8 @@ where causal_ts_provider: Option>, // used for rawkv apiv2 grpc_service_manager: GrpcServiceManager, ) -> Runner { + let mut store_stat = StoreStat::default(); + store_stat.set_cpu_quota(SysQuota::cpu_cores_quota(), cfg.inspect_cpu_util_thd); let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( @@ -1010,7 +1038,7 @@ where is_hb_receiver_scheduled: false, region_peers: HashMap::default(), region_buckets: HashMap::default(), - store_stat: StoreStat::default(), + store_stat, start_ts: UnixSecs::now(), scheduler, store_heartbeat_interval, @@ -2262,11 +2290,10 @@ where Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => { // Fine-tuned, `SlowScore` only takes the I/O jitters on the disk into account. 
- self.slow_score - .record(id, duration.delays_on_disk_io(false)); - self.slow_trend_cause.record( - tikv_util::time::duration_to_us(duration.store_wait_duration.unwrap()), - Instant::now(), + self.slow_score.record( + id, + duration.delays_on_disk_io(false), + !self.store_stat.maybe_busy(), ); } Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), @@ -2306,7 +2333,12 @@ where self.update_health_status(ServingStatus::Serving); } if !self.slow_score.last_tick_finished { - self.slow_score.record_timeout(); + // If the last tick is not finished, it means that the current store might + // be busy on handling requests or delayed on I/O operations. And only when + // the current store is not busy, it should record the last_tick as a timeout. + if !self.store_stat.maybe_busy() { + self.slow_score.record_timeout(); + } // If the last slow_score already reached abnormal state and was delayed for // reporting by `store-heartbeat` to PD, we should report it here manually as // a FAKE `store-heartbeat`. 
@@ -2340,13 +2372,14 @@ where STORE_INSPECT_DURATION_HISTOGRAM .with_label_values(&["store_process"]) .observe(tikv_util::time::duration_to_sec( - duration.store_process_duration.unwrap(), + duration.store_process_duration.unwrap_or_default(), )); STORE_INSPECT_DURATION_HISTOGRAM .with_label_values(&["store_wait"]) .observe(tikv_util::time::duration_to_sec( - duration.store_wait_duration.unwrap(), + duration.store_wait_duration.unwrap_or_default(), )); + STORE_INSPECT_DURATION_HISTOGRAM .with_label_values(&["all"]) .observe(tikv_util::time::duration_to_sec(dur)); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 8a1def7d7e7..26eb599d6ee 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -247,6 +247,7 @@ fn test_serde_custom_tikv_config() { io_reschedule_concurrent_max_count: 1234, io_reschedule_hotpot_duration: ReadableDuration::secs(4321), inspect_interval: ReadableDuration::millis(444), + inspect_cpu_util_thd: 0.666, report_min_resolved_ts_interval: ReadableDuration::millis(233), check_leader_lease_interval: ReadableDuration::millis(123), renew_leader_lease_advance_duration: ReadableDuration::millis(456), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index a9772e285af..80c92b6c8ac 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -3,7 +3,7 @@ slow-log-threshold = "1s" panic-when-unexpected-key-or-data = true abort-on-panic = true memory-usage-limit = "10GB" -memory-usage-high-water= 0.65 +memory-usage-high-water = 0.65 [log] level = "fatal" @@ -134,9 +134,7 @@ export-priority = "high" other-priority = "low" [pd] -endpoints = [ - "example.com:443", -] +endpoints = ["example.com:443"] [metric] job = "tikv_1" @@ -222,6 +220,7 @@ waterfall-metrics = true io-reschedule-concurrent-max-count = 1234 io-reschedule-hotpot-duration = "4321s" inspect-interval = "444ms" +inspect-cpu-util-thd 
= 0.666 check-leader-lease-interval = "123ms" renew-leader-lease-advance-duration = "456ms" reactive-memory-lock-tick-interval = "566ms" @@ -302,15 +301,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] bottommost-level-compression = "disable" bottommost-zstd-compression-dict-size = 1024 bottommost-zstd-compression-sample-size = 1024 @@ -373,15 +364,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -427,15 +410,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" write-buffer-limit = "16MB" max-write-buffer-number = 12 @@ -482,15 +457,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -568,15 +535,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - 
"zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -636,9 +595,7 @@ ca-path = "invalid path" cert-path = "invalid path" key-path = "invalid path" redact-info-log = true -cert-allowed-cn = [ - "example.tikv.com", -] +cert-allowed-cn = ["example.tikv.com"] [security.encryption] data-encryption-method = "aes128-ctr" @@ -686,9 +643,9 @@ enable-compaction-filter = false compaction-filter-skip-version-check = true [pessimistic-txn] -enabled = false # test backward compatibility +enabled = false # test backward compatibility wait-for-lock-timeout = "10ms" -wake-up-delay-duration = 100 # test backward compatibility +wake-up-delay-duration = 100 # test backward compatibility pipelined = false in-memory = false From 337602e6501edaf65e894c9d90dd8afb4368de07 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 13 Dec 2023 07:51:18 +0800 Subject: [PATCH 125/220] config: Fix titan blob-run-mode setting (#15988) (#16014) close tikv/tikv#15978, close tikv/tikv#15987 Fix titan config blob-run-mode's from implementation. 
Signed-off-by: tonyxuqqi Co-authored-by: tonyxuqqi --- components/engine_rocks/src/config.rs | 11 +++++++---- src/config/mod.rs | 23 ++++++++++++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/components/engine_rocks/src/config.rs b/components/engine_rocks/src/config.rs index e121a1cea18..d55c5cb3dfc 100644 --- a/components/engine_rocks/src/config.rs +++ b/components/engine_rocks/src/config.rs @@ -340,9 +340,9 @@ pub enum BlobRunMode { impl From for ConfigValue { fn from(mode: BlobRunMode) -> ConfigValue { let str_value = match mode { - BlobRunMode::Normal => "normal", - BlobRunMode::ReadOnly => "read-only", - BlobRunMode::Fallback => "fallback", + BlobRunMode::Normal => "kNormal", + BlobRunMode::ReadOnly => "kReadOnly", + BlobRunMode::Fallback => "kFallback", }; ConfigValue::String(str_value.into()) } @@ -366,8 +366,11 @@ impl FromStr for BlobRunMode { "normal" => Ok(BlobRunMode::Normal), "read-only" => Ok(BlobRunMode::ReadOnly), "fallback" => Ok(BlobRunMode::Fallback), + "kNormal" => Ok(BlobRunMode::Normal), + "kReadOnly" => Ok(BlobRunMode::ReadOnly), + "kFallback" => Ok(BlobRunMode::Fallback), m => Err(format!( - "expect: normal, read-only or fallback, got: {:?}", + "expect: normal, kNormal, read-only, kReadOnly, kFallback or fallback, got: {:?}", m )), } diff --git a/src/config/mod.rs b/src/config/mod.rs index 8c8cf81b8f1..9f6bc30ae0d 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -5521,7 +5521,28 @@ mod tests { let diff = config_value_to_string(diff.into_iter().collect()); assert_eq!(diff.len(), 1); assert_eq!(diff[0].0.as_str(), "blob_run_mode"); - assert_eq!(diff[0].1.as_str(), "fallback"); + assert_eq!(diff[0].1.as_str(), "kFallback"); + } + + #[test] + fn test_update_titan_blob_run_mode_config() { + let mut cfg = TikvConfig::default(); + cfg.rocksdb.titan.enabled = true; + let (_, cfg_controller, ..) 
= new_engines::(cfg); + for run_mode in [ + "kFallback", + "kNormal", + "kReadOnly", + "fallback", + "normal", + "read-only", + ] { + let change = HashMap::from([( + "rocksdb.defaultcf.titan.blob-run-mode".to_string(), + run_mode.to_string(), + )]); + cfg_controller.update_without_persist(change).unwrap(); + } } #[test] From 7a5a7045295f4c24e65efa619cf84cd4fa9ebd8e Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 25 Dec 2023 17:40:57 +0800 Subject: [PATCH 126/220] expression: `cast_duration_as_time` should not consider time zone (#16212) (#16219) close tikv/tikv#16211 `cast_duration_as_time` should not consider time zone Signed-off-by: gengliqi Co-authored-by: gengliqi --- .../src/codec/mysql/time/mod.rs | 5 +---- components/tidb_query_expr/src/impl_cast.rs | 15 +++++++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 4c6c2f676d7..7b1ad248d2a 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -1349,10 +1349,7 @@ impl Time { ) -> Result { let dur = chrono::Duration::nanoseconds(duration.to_nanos()); - let time = Utc::today() - .and_hms(0, 0, 0) - .checked_add_signed(dur) - .map(|utc| utc.with_timezone(&ctx.cfg.tz)); + let time = Utc::today().and_hms(0, 0, 0).checked_add_signed(dur); let time = time.ok_or::(box_err!("parse from duration {} overflows", duration))?; diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 76e90f79c5b..53750d02d2d 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -1612,7 +1612,7 @@ mod tests { mysql::{ charset::*, decimal::{max_decimal, max_or_min_dec}, - Decimal, Duration, Json, RoundMode, Time, TimeType, MAX_FSP, MIN_FSP, + Decimal, Duration, Json, RoundMode, Time, TimeType, Tz, 
MAX_FSP, MIN_FSP, }, }, expr::{EvalConfig, EvalContext, Flag}, @@ -2933,13 +2933,20 @@ mod tests { fn test_cast_duration_as_time() { use chrono::Datelike; - let cases = vec!["11:30:45.123456", "-35:30:46"]; + let cases = vec!["11:30:45.123456", "-35:30:46", "25:59:59.999999"]; for case in cases { - let mut ctx = EvalContext::default(); - + let mut cfg = EvalConfig::default(); + cfg.tz = Tz::from_tz_name("America/New_York").unwrap(); + let mut ctx = EvalContext::new(Arc::new(cfg)); let duration = Duration::parse(&mut ctx, case, MAX_FSP).unwrap(); + + let mut cfg2 = EvalConfig::default(); + cfg2.tz = Tz::from_tz_name("Asia/Tokyo").unwrap(); + let ctx2 = EvalContext::new(Arc::new(cfg2)); + let now = RpnFnScalarEvaluator::new() + .context(ctx2) .push_param(duration) .return_field_type( FieldTypeBuilder::new() From ce1c0ab5e1a7c09b589bb167eedb11c291df1d16 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 3 Jan 2024 18:03:33 +0800 Subject: [PATCH 127/220] metrics: change resource group label name (#16192) (#16283) close tikv/tikv#16191 change metrics label name for resource-group-name from "name" to "resource_group". To be backward compatible with old grafana panel, we add a new label name and keep the old one. We are going to deprecate the old label in v8.0. Signed-off-by: ti-chi-bot Signed-off-by: nolouch Co-authored-by: glorv Co-authored-by: nolouch --- components/resource_control/src/metrics.rs | 6 ++--- src/server/metrics.rs | 3 ++- src/server/service/kv.rs | 27 ++++++++++++++-------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/components/resource_control/src/metrics.rs b/components/resource_control/src/metrics.rs index c9404092501..45723063492 100644 --- a/components/resource_control/src/metrics.rs +++ b/components/resource_control/src/metrics.rs @@ -7,19 +7,19 @@ lazy_static! 
{ pub static ref BACKGROUND_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_resource_control_background_quota_limiter", "The quota limiter of background resource groups per resource type", - &["name", "type"] + &["resource_group", "type"] ) .unwrap(); pub static ref BACKGROUND_RESOURCE_CONSUMPTION: IntCounterVec = register_int_counter_vec!( "tikv_resource_control_background_resource_consumption", "Total resource consumed of background resource groups per resource type", - &["name", "type"] + &["resource_group", "type"] ) .unwrap(); pub static ref BACKGROUND_TASKS_WAIT_DURATION: IntCounterVec = register_int_counter_vec!( "tikv_resource_control_background_task_wait_duration", "Total wait duration of background tasks per resource group", - &["name"] + &["resource_group"] ) .unwrap(); pub static ref PRIORITY_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 122748cdfa9..e3c9029c12f 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -209,10 +209,11 @@ lazy_static! { &["type"] ) .unwrap(); + // TODO: deprecate the "name" label in v8.0. pub static ref GRPC_RESOURCE_GROUP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_grpc_resource_group_total", "Total number of handle grpc message for each resource group", - &["name"] + &["name", "resource_group"] ) .unwrap(); pub static ref GRPC_PROXY_MSG_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 77f92d33d95..a528e40e484 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -202,7 +202,7 @@ macro_rules! 
handle_request { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let resp = $future_name(&self.storage, req); let task = async move { @@ -484,7 +484,10 @@ impl Tikv for Service { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let begin_instant = Instant::now(); @@ -522,7 +525,10 @@ impl Tikv for Service { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let begin_instant = Instant::now(); @@ -611,7 +617,10 @@ impl Tikv for Service { resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let mut stream = self @@ -1155,7 +1164,7 @@ fn handle_batch_commands_request( resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) @@ -1176,8 +1185,8 @@ fn handle_batch_commands_request( 
resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { @@ -1197,7 +1206,7 @@ fn handle_batch_commands_request( resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); let source = req.get_context().get_request_source().to_owned(); @@ -1231,7 +1240,7 @@ fn handle_batch_commands_request( resource_manager.consume_penalty(resource_control_ctx); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); let source = req.get_context().get_request_source().to_owned(); From 776488aaecdbd306f362ca1020f1380ce5cc8990 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Fri, 12 Jan 2024 01:43:54 +0800 Subject: [PATCH 128/220] [cherry-pick-7.5] raft_log_engine: update to latest version. (#16294) (#16329) close tikv/tikv#16324 Update `raft_log_engine` lib to the latest version, to fix some issues, including: - rewrite: optimize the interval of sync when rewriting memtables #347. - Return error instead of panicking if rewriting fails #343. 
Signed-off-by: lucasliang --- Cargo.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa545ed6497..7514a4cdcc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3177,9 +3177,9 @@ dependencies = [ [[package]] name = "memmap2" -version = "0.7.0" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180d4b35be83d33392d1d1bfbd2ae1eca7ff5de1a94d3fc87faaa99a069e7cbd" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" dependencies = [ "libc 0.2.146", ] @@ -4331,7 +4331,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.4.1" -source = "git+https://github.com/tikv/raft-engine.git#fa56f891fdf0b1cb5b7849b7bee3c5dadbb96103" +source = "git+https://github.com/tikv/raft-engine.git#e505d631c8c6d63f7fc63d83ea6e8fb88cf970a5" dependencies = [ "byteorder", "crc32fast", @@ -4345,7 +4345,7 @@ dependencies = [ "libc 0.2.146", "log", "lz4-sys", - "memmap2 0.7.0", + "memmap2 0.9.3", "nix 0.26.2", "num-derive 0.4.0", "num-traits", @@ -4365,7 +4365,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.4.1" -source = "git+https://github.com/tikv/raft-engine.git#fa56f891fdf0b1cb5b7849b7bee3c5dadbb96103" +source = "git+https://github.com/tikv/raft-engine.git#e505d631c8c6d63f7fc63d83ea6e8fb88cf970a5" dependencies = [ "clap 3.1.6", "env_logger 0.10.0", From 8efe76ef37ea479f88f734f605f32686460b27d5 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 22 Jan 2024 21:43:49 +0800 Subject: [PATCH 129/220] cop: fix the scan panic when checksum is enabled (#16373) (#16379) close tikv/tikv#16371 Fix the scan panic issue when checksum is enabled. 
Signed-off-by: cfzjywxk Co-authored-by: cfzjywxk --- components/test_coprocessor/src/store.rs | 20 +++++++++++++++++++ .../src/codec/row/v2/row_slice.rs | 12 +++++++++-- tests/integrations/coprocessor/test_select.rs | 9 +++++++-- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index 96f405d8f39..876e3a6d1b9 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -299,6 +299,26 @@ impl Store { .collect(); FixtureStore::new(data) } + + pub fn insert_all_null_row( + &mut self, + tbl: &Table, + ctx: Context, + with_checksum: bool, + extra_checksum: Option, + ) { + self.begin(); + let inserts = self + .insert_into(tbl) + .set(&tbl["id"], Datum::Null) + .set(&tbl["name"], Datum::Null) + .set(&tbl["count"], Datum::Null) + .set_v2(&tbl["id"], ScalarValue::Int(None)) + .set_v2(&tbl["name"], ScalarValue::Bytes(None)) + .set_v2(&tbl["count"], ScalarValue::Int(None)); + inserts.execute_with_v2_checksum(ctx, with_checksum, extra_checksum); + self.commit(); + } } /// A trait for a general implementation to convert to a Txn store. diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index da117c96e2c..4a0a171eb60 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -233,7 +233,11 @@ impl RowSlice<'_> { RowSlice::Big { offsets, values, .. } => { - let last_slice_idx = offsets.get(non_null_col_num - 1).unwrap() as usize; + let last_slice_idx = if non_null_col_num == 0 { + 0 + } else { + offsets.get(non_null_col_num - 1).unwrap() as usize + }; let slice = values.slice; *values = LeBytes::new(&slice[..last_slice_idx]); &slice[last_slice_idx..] @@ -241,7 +245,11 @@ impl RowSlice<'_> { RowSlice::Small { offsets, values, .. 
} => { - let last_slice_idx = offsets.get(non_null_col_num - 1).unwrap() as usize; + let last_slice_idx = if non_null_col_num == 0 { + 0 + } else { + offsets.get(non_null_col_num - 1).unwrap() as usize + }; let slice = values.slice; *values = LeBytes::new(&slice[..last_slice_idx]); &slice[last_slice_idx..] diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 9af28b6e3d6..43ef627a2ee 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2086,11 +2086,16 @@ fn test_select_v2_format_with_checksum() { for extra_checksum in [None, Some(132423)] { // The row value encoded with checksum bytes should have no impact on cop task // processing and related result chunk filling. - let (_, endpoint) = + let (mut store, endpoint) = init_data_with_commit_v2_checksum(&product, &data, true, extra_checksum); + store.insert_all_null_row(&product, Context::default(), true, extra_checksum); let req = DagSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); - let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); + let mut spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); + let first_row = spliter.next().unwrap(); + assert_eq!(first_row[0], Datum::I64(0)); + assert_eq!(first_row[1], Datum::Null); + assert_eq!(first_row[2], Datum::Null); for (row, (id, name, cnt)) in spliter.zip(data.clone()) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( From 8358b7c8a02ac5ec876e8d1aa715e385c4fe22b6 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Wed, 24 Jan 2024 02:42:20 +0800 Subject: [PATCH 130/220] [cherry-pick-v7.5] raftstore: address the corner case on WakeUp hibernate regions. (#16408) (#16428) close tikv/tikv#16368 This pull request addresses a corner case where `WakeUp` messages were being ignored during I/O hang scenarios. 
Signed-off-by: lucasliang --- components/raftstore/src/store/fsm/peer.rs | 28 ++++++----- tests/failpoints/cases/test_hibernate.rs | 57 +++++++++++++++++++++- 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 98cc9ae16b7..11e129f8358 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2144,6 +2144,11 @@ where self.fsm.hibernate_state.group_state() == GroupState::Idle, |_| {} ); + fail_point!( + "on_raft_base_tick_chaos", + self.fsm.hibernate_state.group_state() == GroupState::Chaos, + |_| {} + ); if self.fsm.peer.pending_remove { self.fsm.peer.mut_store().flush_entry_cache_metrics(); @@ -2828,18 +2833,19 @@ where fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { - if self.fsm.hibernate_state.group_state() == GroupState::Idle { - if msg.get_extra_msg().forcely_awaken { - // Forcely awaken this region by manually setting this GroupState - // into Chaos to trigger a new voting in this RaftGroup. - self.reset_raft_tick(if !self.fsm.peer.is_leader() { - GroupState::Chaos - } else { - GroupState::Ordered - }); + if msg.get_extra_msg().forcely_awaken { + // Forcely awaken this region by manually setting the GroupState + // into `Chaos` to trigger a new voting in the Raft Group. + // Meanwhile, it avoids the peer entering the `PreChaos` state, + // which would wait for another long tick to enter the `Chaos` state. 
+ self.reset_raft_tick(if !self.fsm.peer.is_leader() { + GroupState::Chaos } else { - self.reset_raft_tick(GroupState::Ordered); - } + GroupState::Ordered + }); + } + if self.fsm.hibernate_state.group_state() == GroupState::Idle { + self.reset_raft_tick(GroupState::Ordered); } if msg.get_extra_msg().get_type() == ExtraMessageType::MsgRegionWakeUp && self.fsm.peer.is_leader() diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index d2eb9aa10dd..4fe7406e372 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -6,7 +6,7 @@ use std::{ time::Duration, }; -use kvproto::raft_serverpb::RaftMessage; +use kvproto::raft_serverpb::{ExtraMessage, ExtraMessageType, RaftMessage}; use raft::eraftpb::MessageType; use raftstore::store::{PeerMsg, PeerTick}; use test_raftstore::*; @@ -178,3 +178,58 @@ fn test_check_long_uncommitted_proposals_while_hibernate() { rx.recv_timeout(2 * cluster.cfg.raft_store.long_uncommitted_base_threshold.0) .unwrap(); } + +#[test] +fn test_forcely_awaken_hibenrate_regions() { + let mut cluster = new_node_cluster(0, 3); + let base_tick_ms = 50; + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + cluster.cfg.raft_store.raft_heartbeat_ticks = 2; + cluster.cfg.raft_store.raft_election_timeout_ticks = 10; + // So the random election timeout will always be 10, which makes the case more + // stable. 
+ cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; + cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; + configure_for_hibernate(&mut cluster.cfg); + cluster.pd_client.disable_default_operator(); + let r = cluster.run_conf_change(); + cluster.pd_client.must_add_peer(r, new_peer(2, 2)); + cluster.pd_client.must_add_peer(r, new_peer(3, 3)); + + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + // Wait until all peers of region 1 hibernate. + thread::sleep(Duration::from_millis(base_tick_ms * 30)); + + // Firstly, send `CheckPeerStaleState` message to trigger the check. + let router = cluster.sim.rl().get_router(3).unwrap(); + router + .send(1, PeerMsg::Tick(PeerTick::CheckPeerStaleState)) + .unwrap(); + + // Secondly, forcely send `MsgRegionWakeUp` message for awakening hibernated + // regions. + let (tx, rx) = mpsc::sync_channel(128); + fail::cfg_callback("on_raft_base_tick_chaos", move || { + tx.send(base_tick_ms).unwrap() + }) + .unwrap(); + let mut message = RaftMessage::default(); + message.region_id = 1; + message.set_from_peer(new_peer(3, 3)); + message.set_to_peer(new_peer(3, 3)); + message.mut_region_epoch().version = 1; + message.mut_region_epoch().conf_ver = 3; + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgRegionWakeUp); + msg.forcely_awaken = true; + message.set_extra_msg(msg); + router.send_raft_message(message).unwrap(); + assert_eq!( + rx.recv_timeout(Duration::from_secs(1)).unwrap(), + base_tick_ms + ); + fail::remove("on_raft_base_tick_chaos"); +} From 940f02b981086e1411b222eb4b5171867b1ec47c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 31 Jan 2024 17:54:25 +0800 Subject: [PATCH 131/220] server: fix panic of gRPC threads due to thread group properties not set (#16258) (#16273) close tikv/tikv#16236 Set thread group properties for gRPC threads to avoid panic when checking "is_shutdown". 
Signed-off-by: Ping Yu Co-authored-by: Ping Yu Co-authored-by: tongjian <1045931706@qq.com> --- components/server/src/server.rs | 5 ++++- components/server/src/server2.rs | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 72e09a9f8d8..132d24b7429 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -293,11 +293,14 @@ where SecurityManager::new(&config.security) .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), ); + let props = tikv_util::thread_group::current_properties(); let env = Arc::new( EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) - .after_start(|| { + .after_start(move || { + tikv_util::thread_group::set_properties(props.clone()); + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. unsafe { add_thread_memory_accessor() }; }) diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index eab384871e6..98341796367 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -286,11 +286,14 @@ where SecurityManager::new(&config.security) .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), ); + let props = tikv_util::thread_group::current_properties(); let env = Arc::new( EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) - .after_start(|| { + .after_start(move || { + tikv_util::thread_group::set_properties(props.clone()); + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. 
unsafe { add_thread_memory_accessor() }; }) From d530b6055e306bb58aaf938eeb5e7411bc0df2b3 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 31 Jan 2024 18:19:54 +0800 Subject: [PATCH 132/220] ctl: backoff load key range in finish flashback when meet `notLeader` or `regionNotFound` (#16058) (#16068) close tikv/tikv#15712 Root: After `PrepareFlashback` the region and the region leader transfer, when executing `FinishFlashback` will meet `notLeader`. Since the tikv ctl retry `FinishFlashback` for the same peer, it just keeps doing useless retries. Solution: neet to support backoff load key range to identify peer in finish flashback when meet `notLeader` or `regionNotFound` Signed-off-by: husharp Co-authored-by: husharp Co-authored-by: Hu# Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- cmd/tikv-ctl/src/executor.rs | 10 +++++----- cmd/tikv-ctl/src/main.rs | 18 ++++++++++++++++-- src/server/debug.rs | 16 ++++++++++------ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index a20d6ce2602..3e4e505a32a 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -715,7 +715,7 @@ pub trait DebugExecutor { _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange>; + ) -> Result<(), (KeyRange, grpcio::Error)>; fn get_region_read_progress(&self, region_id: u64, log: bool, min_start_ts: u64); } @@ -948,7 +948,7 @@ impl DebugExecutor for DebugClient { key_range: KeyRange, start_ts: u64, commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { let mut req = FlashbackToVersionRequest::default(); req.set_version(version); req.set_region_id(region_id); @@ -963,7 +963,7 @@ impl DebugExecutor for DebugClient { "flashback key_range {:?} with start_ts {:?}, commit_ts {:?} need to retry, err is {:?}", key_range, start_ts, commit_ts, err ); - Err(key_range) + Err((key_range, err)) } } } @@ 
-1293,7 +1293,7 @@ where _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { unimplemented!("only available for remote mode"); } @@ -1515,7 +1515,7 @@ impl DebugExecutor for DebuggerImplV2 { _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { unimplemented!("only available for remote mode"); } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index df17e81f1ef..9c52d653a85 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -906,7 +906,7 @@ fn flashback_whole_cluster( .await { Ok(res) => { - if let Err(key_range) = res { + if let Err((key_range, _)) = res { // Retry specific key range to prepare flashback. let stale_key_range = (key_range.start_key.clone(), key_range.end_key.clone()); let mut key_range_to_prepare = key_range_to_prepare.write().unwrap(); @@ -986,7 +986,21 @@ fn flashback_whole_cluster( { Ok(res) => match res { Ok(_) => break, - Err(_) => { + Err((key_range, err)) => { + // Retry `NotLeader` or `RegionNotFound`. + if err.to_string().contains("not leader") || err.to_string().contains("not found") { + // When finished `PrepareFlashback`, the region may change leader in the `flashback in progress` + // Neet to retry specific key range to finish flashback. + let stale_key_range = (key_range.start_key.clone(), key_range.end_key.clone()); + let mut key_range_to_finish = key_range_to_finish.write().unwrap(); + // Remove stale key range. + key_range_to_finish.remove(&stale_key_range); + load_key_range(&pd_client, stale_key_range.0.clone(), stale_key_range.1.clone()) + .into_iter().for_each(|(key_range, region_info)| { + // Need to update `key_range_to_finish` to replace stale key range. 
+ key_range_to_finish.insert(key_range, region_info); + }); + } thread::sleep(Duration::from_micros(WAIT_APPLY_FLASHBACK_STATE)); continue; } diff --git a/src/server/debug.rs b/src/server/debug.rs index 9e01852455c..70e1df855d5 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -1111,9 +1111,11 @@ async fn async_key_range_flashback_to_version ?resp.get_error(), "region_err" => ?resp.get_region_error()); - return Err(Error::FlashbackFailed( - "exec prepare flashback failed.".into(), - )); + return Err(Error::FlashbackFailed(format!( + "exec prepare flashback failed: resp err is: {:?}, region err is: {:?}", + resp.get_error(), + resp.get_region_error() + ))); } } else { let mut req = kvrpcpb::FlashbackToVersionRequest::new(); @@ -1127,9 +1129,11 @@ async fn async_key_range_flashback_to_version ?resp.get_error(), "region_err" => ?resp.get_region_error()); - return Err(Error::FlashbackFailed( - "exec finish flashback failed.".into(), - )); + return Err(Error::FlashbackFailed(format!( + "exec finish flashback failed: resp err is: {:?}, region err is: {:?}", + resp.get_error(), + resp.get_region_error() + ))); } } Ok(()) From 2dba32947f23407a7173f0ca672c1ac3acf7c356 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 31 Jan 2024 18:42:24 +0800 Subject: [PATCH 133/220] log-backup: make initialize failure fatal error, release memory while task stopped. 
(#16071) (#16109) close tikv/tikv#16056, close tikv/tikv#16070 Signed-off-by: hillium Co-authored-by: hillium Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup-stream/src/endpoint.rs | 100 ++++++++++-------- .../backup-stream/src/metadata/client.rs | 14 ++- components/backup-stream/src/router.rs | 34 ++++++ components/backup-stream/src/tempfiles.rs | 5 + .../backup-stream/tests/failpoints/mod.rs | 29 ++++- .../backup-stream/tests/integration/mod.rs | 2 +- components/backup-stream/tests/suite.rs | 5 + 7 files changed, 139 insertions(+), 50 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 6c19edc9f93..f453469768c 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -211,6 +211,53 @@ where self.meta_client.clone() } + fn on_fatal_error_of_task(&self, task: &str, err: &Error) -> future![()] { + metrics::update_task_status(TaskStatus::Error, task); + let meta_cli = self.get_meta_client(); + let pdc = self.pd_client.clone(); + let store_id = self.store_id; + let sched = self.scheduler.clone(); + let safepoint_name = self.pause_guard_id_for_task(task); + let safepoint_ttl = self.pause_guard_duration(); + let code = err.error_code().code.to_owned(); + let msg = err.to_string(); + let task = task.to_owned(); + async move { + let err_fut = async { + let safepoint = meta_cli.global_progress_of_task(&task).await?; + pdc.update_service_safe_point( + safepoint_name, + TimeStamp::new(safepoint.saturating_sub(1)), + safepoint_ttl, + ) + .await?; + meta_cli.pause(&task).await?; + let mut last_error = StreamBackupError::new(); + last_error.set_error_code(code); + last_error.set_error_message(msg.clone()); + last_error.set_store_id(store_id); + last_error.set_happen_at(TimeStamp::physical_now()); + meta_cli.report_last_error(&task, last_error).await?; + Result::Ok(()) + }; + if let Err(err_report) = 
err_fut.await { + err_report.report(format_args!("failed to upload error {}", err_report)); + let name = task.to_owned(); + // Let's retry reporting after 5s. + tokio::task::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + try_send!( + sched, + Task::FatalError( + TaskSelector::ByName(name), + Box::new(annotate!(err_report, "origin error: {}", msg)) + ) + ); + }); + } + } + } + fn on_fatal_error(&self, select: TaskSelector, err: Box) { err.report_fatal(); let tasks = self @@ -220,49 +267,7 @@ where for task in tasks { // Let's pause the task first. self.unload_task(&task); - metrics::update_task_status(TaskStatus::Error, &task); - - let meta_cli = self.get_meta_client(); - let pdc = self.pd_client.clone(); - let store_id = self.store_id; - let sched = self.scheduler.clone(); - let safepoint_name = self.pause_guard_id_for_task(&task); - let safepoint_ttl = self.pause_guard_duration(); - let code = err.error_code().code.to_owned(); - let msg = err.to_string(); - self.pool.block_on(async move { - let err_fut = async { - let safepoint = meta_cli.global_progress_of_task(&task).await?; - pdc.update_service_safe_point( - safepoint_name, - TimeStamp::new(safepoint.saturating_sub(1)), - safepoint_ttl, - ) - .await?; - meta_cli.pause(&task).await?; - let mut last_error = StreamBackupError::new(); - last_error.set_error_code(code); - last_error.set_error_message(msg.clone()); - last_error.set_store_id(store_id); - last_error.set_happen_at(TimeStamp::physical_now()); - meta_cli.report_last_error(&task, last_error).await?; - Result::Ok(()) - }; - if let Err(err_report) = err_fut.await { - err_report.report(format_args!("failed to upload error {}", err_report)); - // Let's retry reporting after 5s. 
- tokio::task::spawn(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - try_send!( - sched, - Task::FatalError( - TaskSelector::ByName(task.to_owned()), - Box::new(annotate!(err_report, "origin error: {}", msg)) - ) - ); - }); - } - }); + self.pool.block_on(self.on_fatal_error_of_task(&task, &err)); } } @@ -637,6 +642,9 @@ where let run = async move { let task_name = task.info.get_name(); let ranges = cli.ranges_of_task(task_name).await?; + fail::fail_point!("load_task::error_when_fetching_ranges", |_| { + Err(Error::Other("what range? no such thing, go away.".into())) + }); info!( "register backup stream ranges"; "task" => ?task, @@ -664,10 +672,8 @@ where Result::Ok(()) }; if let Err(e) = run.await { - e.report(format!( - "failed to register backup stream task {} to router: ranges not found", - task_clone.info.get_name() - )); + self.on_fatal_error_of_task(&task_clone.info.name, &Box::new(e)) + .await; } }); metrics::update_task_status(TaskStatus::Running, &task_name); diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 1fdc1b3b1e8..2232770915f 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -286,7 +286,19 @@ impl MetadataClient { Ok(()) } - pub async fn get_last_error( + pub async fn get_last_error(&self, name: &str) -> Result> { + let key = MetaKey::last_errors_of(name); + + let r = self.meta_store.get_latest(Keys::Prefix(key)).await?.inner; + if r.is_empty() { + return Ok(None); + } + let r = &r[0]; + let err = protobuf::parse_from_bytes(r.value())?; + Ok(Some(err)) + } + + pub async fn get_last_error_of( &self, name: &str, store_id: u64, diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index d43a58d3965..a5e6489f6fc 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -831,6 +831,28 @@ pub struct StreamTaskInfo { 
temp_file_pool: Arc, } +impl Drop for StreamTaskInfo { + fn drop(&mut self) { + let (success, failed): (Vec<_>, Vec<_>) = self + .flushing_files + .get_mut() + .drain(..) + .chain(self.flushing_meta_files.get_mut().drain(..)) + .map(|(_, f, _)| f.inner.path().to_owned()) + .map(|p| self.temp_file_pool.remove(&p)) + .partition(|r| *r); + info!("stream task info dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); + let (success, failed): (Vec<_>, Vec<_>) = self + .files + .get_mut() + .drain() + .map(|(_, f)| f.into_inner().inner.path().to_owned()) + .map(|p| self.temp_file_pool.remove(&p)) + .partition(|r| *r); + info!("stream task info dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); + } +} + impl std::fmt::Debug for StreamTaskInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("StreamTaskInfo") @@ -2090,6 +2112,12 @@ mod tests { let (task, _path) = task("cleanup_test".to_owned()).await?; must_register_table(&router, task, 1).await; write_simple_data(&router).await; + let tempfiles = router + .get_task_info("cleanup_test") + .await + .unwrap() + .temp_file_pool + .clone(); router .get_task_info("cleanup_test") .await? @@ -2098,6 +2126,7 @@ mod tests { write_simple_data(&router).await; let mut w = walkdir::WalkDir::new(&tmp).into_iter(); assert!(w.next().is_some(), "the temp files doesn't created"); + assert!(tempfiles.mem_used() > 0, "the temp files doesn't created."); drop(router); let w = walkdir::WalkDir::new(&tmp) .into_iter() @@ -2115,6 +2144,11 @@ mod tests { "the temp files should be removed, but it is {:?}", w ); + assert_eq!( + tempfiles.mem_used(), + 0, + "the temp files hasn't been cleared." 
+ ); Ok(()) } diff --git a/components/backup-stream/src/tempfiles.rs b/components/backup-stream/src/tempfiles.rs index add1ee67c12..b8f9c9e1120 100644 --- a/components/backup-stream/src/tempfiles.rs +++ b/components/backup-stream/src/tempfiles.rs @@ -259,6 +259,11 @@ impl TempFilePool { &self.cfg } + #[cfg(test)] + pub fn mem_used(&self) -> usize { + self.current.load(Ordering::Acquire) + } + /// Create a file for writting. /// This function is synchronous so we can call it easier in the polling /// context. (Anyway, it is really hard to call an async function in the diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index 8dfc21529e4..ea09e9c7a1f 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -30,6 +30,32 @@ mod all { use super::{ make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, }; + use crate::make_table_key; + + #[test] + fn failed_register_task() { + let suite = SuiteBuilder::new_named("failed_register_task").build(); + fail::cfg("load_task::error_when_fetching_ranges", "return").unwrap(); + let cli = suite.get_meta_cli(); + block_on(cli.insert_task_with_range( + &suite.simple_task("failed_register_task"), + &[(&make_table_key(1, b""), &make_table_key(2, b""))], + )) + .unwrap(); + + for _ in 0..10 { + if block_on(cli.get_last_error_of("failed_register_task", 1)) + .unwrap() + .is_some() + { + return; + } + std::thread::sleep(Duration::from_millis(100)); + } + + suite.dump_slash_etc(); + panic!("No error uploaded when failed to comminate to PD."); + } #[test] fn basic() { @@ -192,7 +218,8 @@ mod all { suite.must_split(&make_split_key_at_record(1, 42)); std::thread::sleep(Duration::from_secs(2)); - let error = run_async_test(suite.get_meta_cli().get_last_error("retry_abort", 1)).unwrap(); + let error = + run_async_test(suite.get_meta_cli().get_last_error_of("retry_abort", 1)).unwrap(); let error 
= error.expect("no error uploaded"); error .get_error_message() diff --git a/components/backup-stream/tests/integration/mod.rs b/components/backup-stream/tests/integration/mod.rs index 395159060c1..04fee6b2c09 100644 --- a/components/backup-stream/tests/integration/mod.rs +++ b/components/backup-stream/tests/integration/mod.rs @@ -160,7 +160,7 @@ mod all { let err = run_async_test( suite .get_meta_cli() - .get_last_error("test_fatal_error", *victim), + .get_last_error_of("test_fatal_error", *victim), ) .unwrap() .unwrap(); diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index 0e4038d07a0..2886bb4f5d7 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -395,6 +395,11 @@ impl Suite { MetadataClient::new(self.meta_store.clone(), 0) } + #[allow(dead_code)] + pub fn dump_slash_etc(&self) { + self.meta_store.inner.blocking_lock().dump(); + } + pub fn must_split(&mut self, key: &[u8]) { let region = self.cluster.get_region(key); self.cluster.must_split(®ion, key); From 80a87309024fccfd14963236cc67e37df63f2df0 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 31 Jan 2024 19:10:54 +0800 Subject: [PATCH 134/220] tidb_query_datatype: fix timezone conversion by upgrading chrono-tz (#16221) (#16227) ref tikv/tikv#16220, ref pingcap/tidb#49586 Brazil no longer observes DST since 2020[1]. Updating chrono-tz from 0.5.1 to 0.5.2 bumps the timezone database from 2018i to 2020a, which includes this change, thus fixes the issue. 
[1]: https://en.wikipedia.org/wiki/Daylight_saving_time_in_Brazil Signed-off-by: Neil Shen Co-authored-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Co-authored-by: Wenxuan --- Cargo.lock | 8 +-- .../src/codec/mysql/time/mod.rs | 54 +++++++++++++++---- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7514a4cdcc7..dc0ebb334a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -972,9 +972,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.5.1" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0e430fad0384e4defc3dc6b1223d1b886087a8bf9b7080e5ae027f73851ea15" +checksum = "2554a3155fec064362507487171dcc4edc3df60cb10f3a1fb10ed8094822b120" dependencies = [ "chrono", "parse-zoneinfo", @@ -3791,9 +3791,9 @@ dependencies = [ [[package]] name = "parse-zoneinfo" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "089a398ccdcdd77b8c38909d5a1e4b67da1bc4c9dbfe6d5b536c828eddb779e5" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" dependencies = [ "regex", ] diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 7b1ad248d2a..41131fc5933 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -2401,15 +2401,19 @@ mod tests { #[test] fn test_parse_time_with_tz() -> Result<()> { - let ctx_with_tz = |tz: &str| { + let ctx_with_tz = |tz: &str, by_offset: bool| { let mut cfg = EvalConfig::default(); - let raw = tz.as_bytes(); - // brutally turn timezone in format +08:00 into offset in minute - let offset = if raw[0] == b'-' { -1 } else { 1 } - * ((raw[1] - b'0') as i64 * 10 + (raw[2] - b'0') as i64) - * 60 - + ((raw[4] - b'0') as i64 * 10 + (raw[5] - b'0') as i64); - 
cfg.set_time_zone_by_offset(offset * 60).unwrap(); + if by_offset { + let raw = tz.as_bytes(); + // brutally turn timezone in format +08:00 into offset in minute + let offset = if raw[0] == b'-' { -1 } else { 1 } + * ((raw[1] - b'0') as i64 * 10 + (raw[2] - b'0') as i64) + * 60 + + ((raw[4] - b'0') as i64 * 10 + (raw[5] - b'0') as i64); + cfg.set_time_zone_by_offset(offset * 60).unwrap(); + } else { + cfg.set_time_zone_by_name(tz).unwrap(); + } let warnings = cfg.new_eval_warnings(); EvalContext { cfg: Arc::new(cfg), @@ -2418,6 +2422,7 @@ mod tests { }; struct Case { tz: &'static str, + by_offset: bool, t: &'static str, r: Option<&'static str>, tp: TimeType, @@ -2425,60 +2430,70 @@ mod tests { let cases = vec![ Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10Z", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10+", r: None, tp: TimeType::DateTime, }, Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10+14:01", r: None, tp: TimeType::DateTime, }, Case { tz: "+00:00", + by_offset: true, t: "2020-10-10T10:10:10-00:00", r: None, tp: TimeType::DateTime, }, Case { tz: "-08:00", + by_offset: true, t: "2020-10-10T10:10:10-08", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+08:00", + by_offset: true, t: "2020-10-10T10:10:10+08:00", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+08:00", + by_offset: true, t: "2020-10-10T10:10:10+08:00", r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::Timestamp, }, Case { tz: "+08:00", + by_offset: true, t: "2022-06-02T10:10:10Z", r: Some("2022-06-02 18:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "-08:00", + by_offset: true, t: "2022-06-02T10:10:10Z", r: Some("2022-06-02 02:10:10.000000"), tp: TimeType::DateTime, }, Case { tz: "+06:30", + by_offset: true, t: "2022-06-02T10:10:10-05:00", r: Some("2022-06-02 21:40:10.000000"), tp: TimeType::DateTime, @@ -2486,26 
+2501,45 @@ mod tests { // Time with fraction Case { tz: "+08:00", + by_offset: true, t: "2022-06-02T10:10:10.123Z", r: Some("2022-06-02 18:10:10.123000"), tp: TimeType::DateTime, }, Case { tz: "-08:00", + by_offset: true, t: "2022-06-02T10:10:10.123Z", r: Some("2022-06-02 02:10:10.123000"), tp: TimeType::DateTime, }, Case { tz: "+06:30", + by_offset: true, t: "2022-06-02T10:10:10.654321-05:00", r: Some("2022-06-02 21:40:10.654321"), tp: TimeType::DateTime, }, + Case { + // Note: this case may fail if Brazil observes DST again. + // See https://github.com/pingcap/tidb/issues/49586 + tz: "Brazil/East", + by_offset: false, + t: "2023-11-30T17:02:00.654321+00:00", + r: Some("2023-11-30 14:02:00.654321"), + tp: TimeType::DateTime, + }, ]; let mut result: Vec> = vec![]; - for Case { tz, t, r: _, tp } in &cases { - let mut ctx = ctx_with_tz(tz); + for Case { + tz, + by_offset, + t, + r: _, + tp, + } in &cases + { + let mut ctx = ctx_with_tz(tz, *by_offset); let parsed = Time::parse(&mut ctx, t, *tp, 6, true); match parsed { Ok(p) => result.push(Some(p.to_string())), From aee4d999c72acd7443808448146e8487d8512783 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 31 Jan 2024 21:07:54 +0800 Subject: [PATCH 135/220] raftstore: check stale peer on leader missing (#16038) (#16046) close tikv/tikv#11847, close tikv/tikv#15520, close pingcap/tidb#39130 Stale peers can impede TiKV store resolved ts and impact RTO for essential functions. Default 2-hour interval for stale peer check is insufficient for stale reads, flashbacks, and ebs backup. To mitigate this, we speed up stale read check by allowing TiKV to check for stale peers every 10 minutes in the event that a leader is missing. 
Signed-off-by: Neil Shen Co-authored-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/peer.rs | 33 +++++++---- .../raftstore/src/store/local_metrics.rs | 5 +- components/raftstore/src/store/metrics.rs | 5 ++ components/raftstore/src/store/peer.rs | 14 +++-- metrics/grafana/tikv_details.json | 9 +++ tests/integrations/raftstore/test_merge.rs | 59 +++++++++++++++++++ 6 files changed, 107 insertions(+), 18 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 11e129f8358..ea573b5ba57 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -6402,19 +6402,26 @@ where fail_point!("peer_check_stale_state", state != StaleState::Valid, |_| {}); match state { StaleState::Valid => (), - StaleState::LeaderMissing => { - warn!( - "leader missing longer than abnormal_leader_missing_duration"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - "expect" => %self.ctx.cfg.abnormal_leader_missing_duration, - ); - self.ctx - .raft_metrics - .leader_missing - .lock() - .unwrap() - .insert(self.region_id()); + StaleState::LeaderMissing | StaleState::MaybeLeaderMissing => { + if state == StaleState::LeaderMissing { + warn!( + "leader missing longer than abnormal_leader_missing_duration"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "expect" => %self.ctx.cfg.abnormal_leader_missing_duration, + ); + self.ctx + .raft_metrics + .leader_missing + .lock() + .unwrap() + .insert(self.region_id()); + } + + // It's very likely that this is a stale peer. To prevent + // resolved ts from being blocked for too long, we check stale + // peer eagerly. 
+ self.fsm.peer.bcast_check_stale_peer_message(self.ctx); } StaleState::ToValidate => { // for peer B in case 1 above diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index c96ce2a9a29..dc94a3afbe7 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -4,7 +4,7 @@ use std::sync::{Arc, Mutex}; use collections::HashSet; -use prometheus::local::LocalHistogram; +use prometheus::local::{LocalHistogram, LocalIntCounter}; use raft::eraftpb::MessageType; use tikv_util::time::{Duration, Instant}; use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; @@ -181,6 +181,7 @@ pub struct RaftMetrics { // local statistics for slowness pub health_stats: HealthStatistics, + pub check_stale_peer: LocalIntCounter, pub leader_missing: Arc>>, last_flush_time: Instant, @@ -218,6 +219,7 @@ impl RaftMetrics { wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), health_stats: HealthStatistics::default(), + check_stale_peer: CHECK_STALE_PEER_COUNTER.local(), leader_missing: Arc::default(), last_flush_time: Instant::now_coarse(), } @@ -257,6 +259,7 @@ impl RaftMetrics { self.wf_commit_not_persist_log.flush(); } + self.check_stale_peer.flush(); let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); missing.clear(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index a4f2b7820cb..de6f654de12 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -658,6 +658,11 @@ lazy_static! { "Total number of leader missed region." ).unwrap(); + pub static ref CHECK_STALE_PEER_COUNTER: IntCounter = register_int_counter!( + "tikv_raftstore_check_stale_peer", + "Total number of checking stale peers." 
+ ).unwrap(); + pub static ref INGEST_SST_DURATION_SECONDS: Histogram = register_histogram!( "tikv_snapshot_ingest_sst_duration_seconds", diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 57a684f9fe5..a3a6b60175a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -127,6 +127,7 @@ pub enum StaleState { Valid, ToValidate, LeaderMissing, + MaybeLeaderMissing, } #[derive(Debug)] @@ -2113,7 +2114,6 @@ where self.leader_missing_time = None; return StaleState::Valid; } - let naive_peer = !self.is_initialized() || !self.raft_group.raft.promotable(); // Updates the `leader_missing_time` according to the current state. // // If we are checking this it means we suspect the leader might be missing. @@ -2133,13 +2133,18 @@ where StaleState::ToValidate } Some(instant) - if instant.saturating_elapsed() >= ctx.cfg.abnormal_leader_missing_duration.0 - && !naive_peer => + if instant.saturating_elapsed() >= ctx.cfg.abnormal_leader_missing_duration.0 => { // A peer is considered as in the leader missing state // if it's initialized but is isolated from its leader or // something bad happens that the raft group can not elect a leader. - StaleState::LeaderMissing + if self.is_initialized() && self.raft_group.raft.promotable() { + StaleState::LeaderMissing + } else { + // Uninitialized peer and learner may not have leader info, + // even if there is a valid leader. 
+ StaleState::MaybeLeaderMissing + } } _ => StaleState::Valid, } @@ -5436,6 +5441,7 @@ where &mut self, ctx: &mut PollContext, ) { + ctx.raft_metrics.check_stale_peer.inc(); if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() || self.region().get_region_epoch().get_conf_ver() == 0 { diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 60eead841d7..0547de621ea 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -40376,6 +40376,15 @@ "legendFormat": "{{instance}}-{{reason}}", "refId": "A", "step": 10 + }, + { + "expr": "sum(delta(tikv_raftstore_check_stale_peer{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-stale-peer", + "refId": "B", + "step": 10 } ], "thresholds": [], diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 7d964c03319..8482feb8481 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -464,6 +464,65 @@ fn test_node_gc_uninitialized_peer_after_merge() { cluster.must_region_not_exist(left.get_id(), 4); } +/// Test leader missing should issue check stale peer requests. 
+#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_gc_uninitialized_peer_after_merge_on_leader_missing() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + // Set a large max_leader_missing_duration so that check stale peer will + // only be triggered by leader missing. + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(1); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + cluster.must_split(®ion, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an an uninitialized stale peer. 
+ pd_client.must_add_peer(left.get_id(), new_peer(4, 4)); + cluster.must_region_exist(left.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(left.get_id(), new_peer(4, 4)); + pd_client.must_merge(left.get_id(), right.get_id()); + cluster.clear_send_filters(); + + // Wait for the peer (4,4) to be destroyed. + sleep_ms( + 3 * cluster + .cfg + .raft_store + .abnormal_leader_missing_duration + .as_millis(), + ); + cluster.must_region_not_exist(left.get_id(), 4); +} + // Test if a merge handled properly when there is a unfinished slow split before // merge. // No v2, it requires all peers to be available to check trim status. From d85e609d975b8ab966fe2b17815bde4428647a97 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 31 Jan 2024 21:28:56 +0800 Subject: [PATCH 136/220] raftstore: fix a panic cause by peer destroy racing (#16112) (#16117) close tikv/tikv#16111, close pingcap/tidb#49012 In case a node is isolated during the merge and the target peer is replaced by a peer with a larger ID, an "atomic_snapshot" is created which covers both the source peer and the snapshot of the target peer. In such cases, the snapshot needs to destroy the source peer too. However, if the source peer is already being destroyed triggered by gc message, it may result in a panic with a "no entry found for key" message. This commit resolves the issue by cleaning up atomic_snap_regions after the destroy, so the target peer is no longer expected to find the source peer. This cleanup is safe because the source region has already cleaned up its data and metadata from disk. 
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/apply.rs | 1 + components/raftstore/src/store/fsm/peer.rs | 16 ++- tests/failpoints/cases/test_merge.rs | 105 ++++++++++++++++++++ 3 files changed, 117 insertions(+), 5 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 1639f441e38..1f2e4c3f5c3 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -4074,6 +4074,7 @@ where /// Handles peer destroy. When a peer is destroyed, the corresponding apply /// delegate should be removed too. fn handle_destroy(&mut self, ctx: &mut ApplyContext, d: Destroy) { + fail_point!("on_apply_handle_destroy"); assert_eq!(d.region_id, self.delegate.region_id()); if d.merge_from_snapshot { assert_eq!(self.delegate.stopped, false); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index ea573b5ba57..3e9d1644e4f 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3848,14 +3848,19 @@ where self.fsm.peer.tag ); } else { + // Remove itself from atomic_snap_regions as it has cleaned both + // data and metadata. 
let target_region_id = *meta.targets_map.get(®ion_id).unwrap(); - let is_ready = meta - .atomic_snap_regions + meta.atomic_snap_regions .get_mut(&target_region_id) .unwrap() - .get_mut(®ion_id) - .unwrap(); - *is_ready = true; + .remove(®ion_id); + meta.destroyed_region_for_snap.remove(®ion_id); + info!("peer has destroyed, clean up for incoming overlapped snapshot"; + "region_id" => region_id, + "peer_id" => self.fsm.peer_id(), + "target_region_id" => target_region_id, + ); } } @@ -4990,6 +4995,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), "region" => ?region, + "destroy_regions" => ?persist_res.destroy_regions, ); let mut state = self.ctx.global_replication_state.lock().unwrap(); diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index eb15c7e16fa..929afeb70f4 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -2068,3 +2068,108 @@ fn test_restart_may_lose_merging_state() { cluster.must_put(b"k400", b"v400"); } + +// If a node is isolated during merge, and the target peer is replaced by a peer +// with a larger ID, then the snapshot of the target peer covers the source +// regions as well. +// In such cases, the snapshot becomes an "atomic_snapshot" which needs to +// destroy the source peer too. +// This test case checks the race between destroying the source peer by atomic +// snapshot and the gc message. The source peer must be successfully destroyed +// in this case. 
+#[test_case(test_raftstore::new_node_cluster)] +fn test_destroy_race_during_atomic_snapshot_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.run(); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + // Allow raft messages to source peer on store 3 before PrepareMerge. + let left_filter_block = Arc::new(atomic::AtomicBool::new(false)); + let left_filter_block_ = left_filter_block.clone(); + let left_blocked_messages = Arc::new(Mutex::new(vec![])); + let left_filter = RegionPacketFilter::new(left.get_id(), 3) + .direction(Direction::Recv) + .when(left_filter_block.clone()) + .reserve_dropped(left_blocked_messages.clone()) + .set_msg_callback(Arc::new(move |msg: &RaftMessage| { + debug!("dbg left msg_callback"; "msg" => ?msg); + if left_filter_block.load(atomic::Ordering::SeqCst) { + return; + } + for e in msg.get_message().get_entries() { + let ctx = raftstore::store::ProposalContext::from_bytes(&e.context); + if ctx.contains(raftstore::store::ProposalContext::PREPARE_MERGE) { + // Block further messages. + left_filter_block.store(true, atomic::Ordering::SeqCst); + } + } + })); + cluster.sim.wl().add_recv_filter(3, Box::new(left_filter)); + // Block messages to target peer on store 3. 
+ let right_filter_block = Arc::new(atomic::AtomicBool::new(true)); + let new_peer_id = 1004; + let (new_peer_id_tx, new_peer_id_rx) = std::sync::mpsc::channel(); + let new_peer_id_tx = Mutex::new(Some(new_peer_id_tx)); + let (new_peer_snap_tx, new_peer_snap_rx) = std::sync::mpsc::channel(); + let new_peer_snap_tx = Mutex::new(new_peer_snap_tx); + let right_filter = RegionPacketFilter::new(right.get_id(), 3) + .direction(Direction::Recv) + .when(right_filter_block.clone()) + .set_msg_callback(Arc::new(move |msg: &RaftMessage| { + debug!("dbg right msg_callback"; "msg" => ?msg); + if msg.get_to_peer().get_id() == new_peer_id { + let _ = new_peer_id_tx.lock().unwrap().take().map(|tx| tx.send(())); + if msg.get_message().get_msg_type() == MessageType::MsgSnapshot { + let _ = new_peer_snap_tx.lock().unwrap().send(()); + } + } + })); + cluster.sim.wl().add_recv_filter(3, Box::new(right_filter)); + pd_client.must_merge(left.get_id(), right.get_id()); + + // Make target peer on store 3 a stale peer. + pd_client.must_remove_peer(right.get_id(), find_peer(&right, 3).unwrap().to_owned()); + pd_client.must_add_peer(right.get_id(), new_peer(3, new_peer_id)); + // Unblock messages to target peer on store 3. + right_filter_block.store(false, atomic::Ordering::SeqCst); + // Wait for receiving new peer id message to destroy stale target peer. + new_peer_id_rx.recv_timeout(Duration::from_secs(5)).unwrap(); + cluster.must_region_not_exist(right.get_id(), 3); + // Let source peer continue prepare merge. It will fails to schedule merge, + // because the target peer is destroyed. + left_filter_block_.store(false, atomic::Ordering::SeqCst); + // Before sending blocked messages, make sure source peer is paused at + // destroy apply delegate, so that the new right peer snapshot can will + // try to destroy source peer before applying snapshot. + fail::cfg("on_apply_handle_destroy", "pause").unwrap(); + // Send blocked messages to source peer. 
Prepare merge must fail to schedule + // CommitMerge because now target peer stale peer is destroyed. + let router = cluster.sim.wl().get_router(3).unwrap(); + for raft_msg in std::mem::take(&mut *left_blocked_messages.lock().unwrap()) { + router.send_raft_message(raft_msg).unwrap(); + } + // Wait the new right peer snapshot. + new_peer_snap_rx + .recv_timeout(Duration::from_secs(5)) + .unwrap(); + // Give it some time to step snapshot message. + sleep_ms(500); + // Let source peer destroy continue, so it races with atomic snapshot destroy. + fail::remove("on_apply_handle_destroy"); + + // New peer applies snapshot eventually. + cluster.must_transfer_leader(right.get_id(), new_peer(3, new_peer_id)); + cluster.must_put(b"k4", b"v4"); +} From 17c34da9388873a4cf0adf616077e742eee69fa2 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 1 Feb 2024 07:03:24 +0800 Subject: [PATCH 137/220] raftstore: check last heartbeat time before doing conf change remove node (#16174) (#16416) close tikv/tikv#15799 Check the last heartbeat time before doing remove node operation. It defines 8*heartbeat interval as the threshold of slow peer. And if the remove node operation will lead to at least half of the peers are slow, then the remove node operation will fail. 
Signed-off-by: Qi Xu Signed-off-by: tonyxuqqi Co-authored-by: Qi Xu Co-authored-by: tonyxuqqi --- .../operation/command/admin/conf_change.rs | 1 + components/raftstore-v2/src/raft/peer.rs | 5 + components/raftstore/src/store/peer.rs | 1 + components/raftstore/src/store/util.rs | 177 ++++++++++++++++++ .../raftstore/test_conf_change.rs | 37 ++++ 5 files changed, 221 insertions(+) diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 55cee490e52..5c7ff96a955 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -106,6 +106,7 @@ impl Peer { changes.as_ref(), &cc, self.is_in_force_leader(), + self.get_peer_heartbeats(), )?; // TODO: check if the new peer is already in history record. diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 9b095b872e7..be04f6d05a0 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -590,6 +590,11 @@ impl Peer { self.peer_heartbeats.remove(&peer_id); } + #[inline] + pub fn get_peer_heartbeats(&self) -> &HashMap { + &self.peer_heartbeats + } + #[inline] pub fn has_peer(&self, peer_id: u64) -> bool { self.region() diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a3a6b60175a..68c81651cc3 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4740,6 +4740,7 @@ where changes.as_ref(), &cc, self.is_in_force_leader(), + &self.peer_heartbeats, )?; ctx.raft_metrics.propose.conf_change.inc(); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index d1ef3fde75a..351f67484ef 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1012,6 +1012,7 @@ pub fn check_conf_change( 
change_peers: &[ChangePeerRequest], cc: &impl ConfChangeI, ignore_safety: bool, + peer_heartbeat: &collections::HashMap, ) -> Result<()> { let current_progress = node.status().progress.unwrap().clone(); let mut after_progress = current_progress.clone(); @@ -1095,6 +1096,7 @@ pub fn check_conf_change( return Err(box_err!("multiple changes that only effect learner")); } + check_remove_or_demote_voter(region, cfg, change_peers, leader.get_id(), peer_heartbeat)?; if !ignore_safety { let promoted_commit_index = after_progress.maximal_committed_index().0; let first_index = node.raft.raft_log.first_index(); @@ -1123,6 +1125,82 @@ pub fn check_conf_change( } } +fn check_remove_or_demote_voter( + region: &metapb::Region, + cfg: &Config, + change_peers: &[ChangePeerRequest], + leader_id: u64, + peer_heartbeat: &collections::HashMap, +) -> Result<()> { + let mut slow_peer_count = 0; + let mut normal_peer_count = 0; + // Here we assume if the last beartbeat is within 2 election timeout, the peer + // is healthy. 
When a region is hibernate, we expect all its peers are *slow* + // and it would still allow the operation + let slow_peer_threshold = + 2 * cfg.raft_base_tick_interval.0 * cfg.raft_max_election_timeout_ticks as u32; + for (id, last_heartbeat) in peer_heartbeat { + // for slow and normal peer calculation, we only count voter role + if region + .get_peers() + .iter() + .find(|p| p.get_id() == *id) + .map_or(false, |p| p.role == PeerRole::Voter) + { + // leader itself is not a slow peer + if *id == leader_id || last_heartbeat.elapsed() <= slow_peer_threshold { + normal_peer_count += 1; + } else { + slow_peer_count += 1; + } + } + } + + let mut normal_peers_to_remove = vec![]; + for cp in change_peers { + let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); + if change_type == ConfChangeType::RemoveNode + || change_type == ConfChangeType::AddLearnerNode + { + let is_voter = region + .get_peers() + .iter() + .find(|p| p.get_id() == peer.get_id()) + .map_or(false, |p| p.role == PeerRole::Voter); + + // If the change_type is AddLearnerNode and the last heartbeat is found, it + // means it's a demote from voter as AddLearnerNode on existing learner node is + // not allowed. + if is_voter && let Some(last_heartbeat) = peer_heartbeat.get(&peer.get_id()) { + // peer itself is *not* slow peer, but current slow peer is >= total peers/2 + if last_heartbeat.elapsed() <= slow_peer_threshold { + normal_peer_count -= 1; + normal_peers_to_remove.push(peer.clone()); + } + } + } + } + + // only block the conf change when there's chance to improve the availability + // For example, if there's no normal peers actually, then we still allow the + // option to finish as there's no choice. 
+ // We only block the operation when normal peers are going to be removed and it + // could lead to slow peers more than normal peers + if !normal_peers_to_remove.is_empty() + && slow_peer_count > 0 + && slow_peer_count >= normal_peer_count + { + return Err(box_err!( + "Ignore conf change command on region {} because RemoveNode or Demote a voter on peers {:?} may lead to unavailability. There're {} slow peers and {} normal peers", + region.get_id(), + &normal_peers_to_remove, + slow_peer_count, + normal_peer_count + )); + } + + Ok(()) +} pub struct MsgType<'a>(pub &'a RaftMessage); impl Display for MsgType<'_> { @@ -2504,4 +2582,103 @@ mod tests { mismatch_err.set_store_peer_id(2); assert_eq!(region_err.get_mismatch_peer_id(), &mismatch_err) } + + #[test] + fn test_check_conf_change_upon_slow_peers() { + // Create a sample configuration + let mut cfg = Config::default(); + cfg.raft_max_election_timeout_ticks = 10; + // Initialize change_peers + let change_peers = vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ]; + + let mut region = Region::default(); + for i in 1..4 { + region.mut_peers().push(metapb::Peer { + id: i, + ..Default::default() + }); + } + for i in 0..change_peers.len() { + // Call the function under test and assert that the function returns failed + let mut cp = vec![change_peers[i].clone()]; + let mut peer_heartbeat = collections::HashMap::default(); + peer_heartbeat.insert( + 1, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 2, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 3, + std::time::Instant::now() - 
std::time::Duration::from_secs(1), + ); + // Call the function under test and assert that the function returns Ok + check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); + + // now make one peer slow + if let Some(peer_heartbeat) = peer_heartbeat.get_mut(&3) { + *peer_heartbeat = std::time::Instant::now() - std::time::Duration::from_secs(100); + } + + // Call the function under test + let result = check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat); + // Assert that the function returns failed + assert!(result.is_err()); + + // remove the slow peer instead + cp[0].peer = Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(); + // Call the function under test + check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); + + // make peer to learner and remove the peer 2 + region.mut_peers()[1].set_role(metapb::PeerRole::Learner); + cp[0].peer = Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(); + // Call the function under test + check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); + // set peer 2 voter again + region.mut_peers()[1].set_role(metapb::PeerRole::Voter); + + // there's no remove node, it's fine with slow peers. 
+ cp[0] = ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }; + // Call the function under test + check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); + } + } } diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index 79b3488d868..08a2ff48d17 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -863,3 +863,40 @@ fn test_conf_change_fast() { must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); assert!(timer.saturating_elapsed() < Duration::from_secs(5)); } + +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_remove_node_on_partition() { + let count = 3; + let mut cluster = new_cluster(0, count); + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. 
+ pd_client.disable_default_operator(); + cluster.cfg.raft_store.raft_heartbeat_ticks = 1; + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_election_timeout_ticks = 3; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(20); + let r1 = cluster.run_conf_change(); + + cluster.must_put(b"k0", b"v0"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + must_get_equal(&cluster.get_engine(2), b"k0", b"v0"); + pd_client.must_add_peer(r1, new_peer(3, 3)); + must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); + + // peer 3 isolation + cluster.add_send_filter(IsolationFilterFactory::new(3)); + // sleep for 13 heartbeat interval (>12 should be ok) + let sleep_time = cluster.cfg.raft_store.raft_base_tick_interval.0 + * (4 * cluster.cfg.raft_store.raft_election_timeout_ticks as u32 + 1); + thread::sleep(sleep_time); + pd_client.remove_peer(r1, new_peer(2, 2)); + cluster.must_put(b"k1", b"v1"); + thread::sleep(Duration::from_millis(500)); + // remove peer 2 should not work + pd_client.must_have_peer(r1, new_peer(2, 2)); + + // remove peer 3 should work + pd_client.must_remove_peer(r1, new_peer(3, 3)); + cluster.must_put(b"k3", b"v3"); +} From 4075fba85cbae708c0402376da2a84070098c67a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 1 Feb 2024 07:25:54 +0800 Subject: [PATCH 138/220] engine_rocks: log SST corruption reason (#16253) (#16310) ref tikv/tikv#16308 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/engine_rocks/src/event_listener.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 03a40d005c8..4ba4061a60f 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -130,6 +130,7 @@ impl rocksdb::EventListener for RocksEventListener { if let Some(path) = 
resolve_sst_filename_from_err(&err) { warn!( "detected rocksdb background error"; + "reason" => r, "sst" => &path, "err" => &err ); From 2ec454f100810a3e9987ab8211ac8c3d8699b22c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Sat, 3 Feb 2024 03:15:26 +0800 Subject: [PATCH 139/220] tikv-ctl: Don't send compact commands to TiFlash stores (#16190) (#16350) close tikv/tikv#16189 tikv-ctl compact-cluster now works with clusters that have TiFlash nodes Signed-off-by: Fred Wulff Co-authored-by: Fred Wulff Co-authored-by: tonyxuqqi --- cmd/tikv-ctl/src/main.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 9c52d653a85..86bfa724608 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -776,12 +776,18 @@ fn compact_whole_cluster( threads: u32, bottommost: BottommostLevelCompaction, ) { - let stores = pd_client + let all_stores = pd_client .get_all_stores(true) // Exclude tombstone stores. .unwrap_or_else(|e| perror_and_exit("Get all cluster stores from PD failed", e)); + let tikv_stores = all_stores.iter().filter(|s| { + !s.get_labels() + .iter() + .any(|l| l.get_key() == "engine" && l.get_value() == "tiflash") + }); + let mut handles = Vec::new(); - for s in stores { + for s in tikv_stores { let cfg = cfg.clone(); let mgr = Arc::clone(&mgr); let addr = s.address.clone(); From db819295028447edc3ad0a677e116274c11f5895 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Sun, 4 Feb 2024 10:36:54 +0800 Subject: [PATCH 140/220] executor: omit truncating error when handling decimal multiplication in arithmetic operators (#16187) (#16277) close tikv/tikv#16268, close pingcap/tidb#48332 omit truncating error when handling decimal multiplication in arithmetic operators Signed-off-by: TONG, Zhigao Co-authored-by: TONG, Zhigao Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../src/codec/mysql/decimal.rs | 69 
++++++++++++++----- .../tidb_query_expr/src/impl_arithmetic.rs | 16 ++++- 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index bc18d7192f9..3a2be14758e 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -806,6 +806,9 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { i32::from(word_cnt!(rhs.int_cnt)), i32::from(word_cnt!(rhs.frac_cnt)), ); + + let old_r_int_word_cnt = r_int_word_cnt; + let (int_word_to, frac_word_to) = ( word_cnt!(lhs.int_cnt + rhs.int_cnt) as usize, l_frac_word_cnt + r_frac_word_cnt, @@ -831,7 +834,7 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { l_frac_word_cnt = 0; r_frac_word_cnt = 0; } else { - old_frac_word_to -= int_word_to as i32; + old_frac_word_to -= frac_word_to as i32; old_int_word_to = old_frac_word_to / 2; if l_frac_word_cnt <= r_frac_word_cnt { l_frac_word_cnt -= old_int_word_to; @@ -843,41 +846,43 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { } } - let mut start_to = int_word_to + frac_word_to; - let (offset_min, offset_max) = (0, i32::from(WORD_BUF_LEN)); - let r_start = num::clamp(r_int_word_cnt + r_frac_word_cnt, offset_min, offset_max) as usize; - let left_stop = num::clamp(l_int_word_cnt + l_frac_word_cnt, offset_min, offset_max) as usize; - for l_idx in (0..left_stop).rev() { - if start_to < r_start { - break; - } + let mut start_to = (int_word_to + frac_word_to - 1) as isize; + let r_start = old_r_int_word_cnt + r_frac_word_cnt - 1; + let r_stop = old_r_int_word_cnt - r_int_word_cnt; + let mut l_idx = l_int_word_cnt + l_frac_word_cnt - 1; + + while l_idx >= 0 { let (mut carry, mut idx_to) = (0, start_to); - start_to -= 1; - for r_idx in (0..r_start).rev() { - idx_to -= 1; - let p = u64::from(lhs.word_buf[l_idx]) * u64::from(rhs.word_buf[r_idx]); + let mut r_idx = r_start; + while r_idx >= r_stop 
{ + let p = + u64::from(lhs.word_buf[l_idx as usize]) * u64::from(rhs.word_buf[r_idx as usize]); let hi = p / u64::from(WORD_BASE); let lo = p - hi * u64::from(WORD_BASE); add( - dec.word_buf[idx_to], + dec.word_buf[idx_to as usize], lo as u32, &mut carry, - &mut dec.word_buf[idx_to], + &mut dec.word_buf[idx_to as usize], ); carry += hi as u32; + r_idx -= 1; + idx_to -= 1; } while carry > 0 { - if idx_to == 0 { + if idx_to < 0 { return Res::Overflow(dec); } - idx_to -= 1; add( - dec.word_buf[idx_to], + dec.word_buf[idx_to as usize], 0, &mut carry, - &mut dec.word_buf[idx_to], + &mut dec.word_buf[idx_to as usize], ); + idx_to -= 1; } + l_idx -= 1; + start_to -= 1; } // Now we have to check for -0.000 case @@ -3356,6 +3361,32 @@ mod tests { } } + #[test] + fn test_mul_truncated() { + let cases = vec![( + "999999999999999999999999999999999.9999", + "766507373740683764182618847769240.9770", + Res::Truncated( + "766507373740683764182618847769239999923349262625931623581738115223.07600000", + ), + Res::Truncated( + "766507373740683764182618847769240210492626259316235817381152230759.02300000", + ), + )]; + + for (lhs_str, rhs_str, exp_str, rev_exp_str) in cases { + let lhs: Decimal = lhs_str.parse().unwrap(); + let rhs: Decimal = rhs_str.parse().unwrap(); + let exp = exp_str.map(|s| s.to_owned()); + let res = (&lhs * &rhs).map(|d| d.to_string()); + assert_eq!(res, exp); + + let exp = rev_exp_str.map(|s| s.to_owned()); + let res = (&rhs * &lhs).map(|d| d.to_string()); + assert_eq!(res, exp); + } + } + #[test] fn test_div_mod() { let cases = vec![ diff --git a/components/tidb_query_expr/src/impl_arithmetic.rs b/components/tidb_query_expr/src/impl_arithmetic.rs index 2f48fec4693..5c34a8431b5 100644 --- a/components/tidb_query_expr/src/impl_arithmetic.rs +++ b/components/tidb_query_expr/src/impl_arithmetic.rs @@ -317,7 +317,12 @@ impl ArithmeticOp for DecimalMultiply { type T = Decimal; fn calc(lhs: &Decimal, rhs: &Decimal) -> Result> { - let res: codec::Result = (lhs * 
rhs).into(); + let res: codec::Result = match lhs * rhs { + codec::mysql::Res::Ok(t) => Ok(t), + codec::mysql::Res::Truncated(t) => Ok(t), + other => other.into(), + }; + Ok(Some(res?)) } } @@ -854,7 +859,14 @@ mod tests { #[test] fn test_multiply_decimal() { - let test_cases = vec![("1.1", "2.2", "2.42")]; + let test_cases = vec![ + ("1.1", "2.2", "2.42"), + ( + "999999999999999999999999999999999.9999", + "766507373740683764182618847769240.9770", + "766507373740683764182618847769239999923349262625931623581738115223.07600000", + ), + ]; for (lhs, rhs, expected) in test_cases { let expected: Option = expected.parse().ok(); let output = RpnFnScalarEvaluator::new() From 4e153fa68a66a83f81c9fa75dd788602c0fec59f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Sun, 4 Feb 2024 11:53:25 +0800 Subject: [PATCH 141/220] raftstore: Fix group commit is mistakenly enabled in sync recover state (#15830) (#15842) close tikv/tikv#15817 When splitting a region, group commit is mistakenly enabled in the sync-recover state. If the region is in joint state and demoting voter is down, the commit condition can't meet. 
Fix group commit is mistakenly enabled in sync recover state Signed-off-by: ti-chi-bot Signed-off-by: tonyxuqqi Co-authored-by: Connor Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/peer.rs | 5 +- components/test_pd_client/src/pd.rs | 1 + .../raftstore/test_replication_mode.rs | 71 +++++++++++++++++-- 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 68c81651cc3..26f475d009f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1063,7 +1063,10 @@ where return; } self.replication_mode_version = state.status().get_dr_auto_sync().state_id; - let enable = state.status().get_dr_auto_sync().get_state() != DrAutoSyncState::Async; + let enable = !matches!( + state.status().get_dr_auto_sync().get_state(), + DrAutoSyncState::Async | DrAutoSyncState::SyncRecover + ); self.raft_group.raft.enable_group_commit(enable); self.dr_auto_sync_state = state.status().get_dr_auto_sync().get_state(); } diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 7f00cf35ccd..3ad5dbdcb8c 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -1450,6 +1450,7 @@ impl TestPdClient { dr.state_id += 1; return; } + status.set_mode(ReplicationMode::DrAutoSync); let mut dr = status.mut_dr_auto_sync(); dr.state_id += 1; dr.set_state(state.unwrap()); diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index 38054c1a995..76059fa8f87 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, thread, time::Duration}; +use std::{iter::FromIterator, sync::Arc, thread, time::Duration}; use kvproto::replication_modepb::*; use pd_client::PdClient; @@ -99,6 +99,67 @@ fn test_dr_auto_sync() { assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); } +// When in sync recover state, and the region is in joint state. The leave joint +// state should be committed successfully. +#[test] +fn test_sync_recover_joint_state() { + let mut cluster = new_server_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + cluster.pd_client.configure_dr_auto_sync("zone"); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_gc_threshold = 1; + cluster.add_label(1, "zone", "ES"); + cluster.add_label(2, "zone", "ES"); + cluster.add_label(3, "zone", "ES"); + cluster.add_label(4, "zone", "WS"); // old dr + cluster.add_label(5, "zone", "WS"); // new dr + + let pd_client = Arc::clone(&cluster.pd_client); + let region_id = cluster.run_conf_change(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 5); + cluster.must_put(b"k1", b"v1"); + + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); + + pd_client.must_add_peer(region_id, new_peer(2, 2)); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + pd_client.must_add_peer(region_id, new_peer(4, 4)); + pd_client.must_add_peer(region_id, new_learner_peer(5, 5)); + + // Make one node down + cluster.stop_node(4); + + // Switch to sync recover + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); + + cluster.must_put(b"k2", b"v2"); + assert_eq!(cluster.must_get(b"k2").unwrap(), b"v2"); + + // Enter joint, now we have C_old(1, 2, 3, 4) and C_new(1, 2, 3, 5) + pd_client.must_joint_confchange( + region_id, + vec![ + (ConfChangeType::AddLearnerNode, new_learner_peer(4, 4)), + (ConfChangeType::AddNode, new_peer(5, 5)), + ], + 
); + + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + assert_ne!(left.get_id(), right.get_id()); + + // Leave joint + pd_client.must_leave_joint(left.get_id()); + pd_client.must_leave_joint(right.get_id()); +} + #[test] fn test_sync_recover_after_apply_snapshot() { let mut cluster = prepare_cluster(); @@ -501,7 +562,7 @@ fn test_migrate_majority_to_drautosync() { assert_eq!(state.state_id, 1); assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); - // 2. swith to marjority mode. + // 2. switch to majority mode. cluster.pd_client.switch_replication_mode(None, vec![]); thread::sleep(Duration::from_millis(150)); @@ -514,9 +575,11 @@ fn test_migrate_majority_to_drautosync() { let region_m = cluster.get_region(b"n4"); let region_k = cluster.get_region(b"k1"); - // 4. switch to dy-auto-sync mode, the new region generated at marjority mode + // 4. switch to dy-auto-sync mode, the new region generated at majority mode // becomes IntegrityOverLabel again. 
- cluster.pd_client.switch_to_drautosync_mode(); + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); let state_m = cluster .pd_client From 58749254331841a0516aabd75f94fe61802a2cd8 Mon Sep 17 00:00:00 2001 From: crazycs Date: Mon, 5 Feb 2024 16:49:13 +0800 Subject: [PATCH 142/220] *: uniform deadline exceeded error in cop response (#16155) (#16502) close tikv/tikv#16154 Signed-off-by: crazycs520 --- components/raftstore/src/errors.rs | 25 +++++++++++++- components/tikv_util/src/deadline.rs | 9 +++++ src/coprocessor/endpoint.rs | 40 +++++++++++++++++++--- src/storage/errors.rs | 8 ++--- tests/failpoints/cases/test_coprocessor.rs | 36 +++++++++++++------ 5 files changed, 97 insertions(+), 21 deletions(-) diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index d1597a77121..49a52de26e1 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -7,7 +7,10 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use kvproto::{errorpb, metapb, raft_serverpb}; use protobuf::ProtobufError; use thiserror::Error; -use tikv_util::{codec, deadline::DeadlineError}; +use tikv_util::{ + codec, + deadline::{set_deadline_exceeded_busy_error, DeadlineError}, +}; use super::{coprocessor::Error as CopError, store::SnapError}; @@ -287,6 +290,9 @@ impl From for errorpb::Error { e.set_store_peer_id(store_peer_id); errorpb.set_mismatch_peer_id(e); } + Error::DeadlineExceeded => { + set_deadline_exceeded_busy_error(&mut errorpb); + } _ => {} }; @@ -350,3 +356,20 @@ impl ErrorCodeExt for Error { } } } + +#[cfg(test)] +mod tests { + use kvproto::errorpb; + + use crate::Error; + + #[test] + fn test_deadline_exceeded_error() { + let err: errorpb::Error = Error::DeadlineExceeded.into(); + assert_eq!( + err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!(err.get_message(), "Deadline is exceeded"); + } +} diff --git 
a/components/tikv_util/src/deadline.rs b/components/tikv_util/src/deadline.rs index 84463f507b9..64416999fe3 100644 --- a/components/tikv_util/src/deadline.rs +++ b/components/tikv_util/src/deadline.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. use fail::fail_point; +use kvproto::errorpb; use super::time::{Duration, Instant}; @@ -58,3 +59,11 @@ impl Deadline { std::time::Instant::now() + self.deadline.duration_since(Instant::now_coarse()) } } + +const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; + +pub fn set_deadline_exceeded_busy_error(e: &mut errorpb::Error) { + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); + e.set_server_is_busy(server_is_busy_err); +} diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 521e5a8e2cd..8504f92e1d1 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -19,7 +19,9 @@ use resource_metering::{FutureExt, ResourceTagFactory, StreamExt}; use tidb_query_common::execute_stats::ExecSummary; use tikv_alloc::trace::MemoryTraceGuard; use tikv_kv::SnapshotExt; -use tikv_util::{quota_limiter::QuotaLimiter, time::Instant}; +use tikv_util::{ + deadline::set_deadline_exceeded_busy_error, quota_limiter::QuotaLimiter, time::Instant, +}; use tipb::{AnalyzeReq, AnalyzeType, ChecksumRequest, ChecksumScanOn, DagRequest, ExecType}; use tokio::sync::Semaphore; use txn_types::Lock; @@ -835,7 +837,10 @@ fn make_error_batch_response(batch_resp: &mut coppb::StoreBatchTaskResponse, e: } Error::DeadlineExceeded => { tag = "deadline_exceeded"; - batch_resp.set_other_error(e.to_string()); + let mut err = errorpb::Error::default(); + set_deadline_exceeded_busy_error(&mut err); + err.set_message(e.to_string()); + batch_resp.set_region_error(err); } Error::MaxPendingTasksExceeded => { tag = "max_pending_tasks_exceeded"; @@ -872,7 +877,10 @@ fn make_error_response(e: Error) -> coppb::Response { } 
Error::DeadlineExceeded => { tag = "deadline_exceeded"; - resp.set_other_error(e.to_string()); + let mut err = errorpb::Error::default(); + set_deadline_exceeded_busy_error(&mut err); + err.set_message(e.to_string()); + resp.set_region_error(err); } Error::MaxPendingTasksExceeded => { tag = "max_pending_tasks_exceeded"; @@ -1953,7 +1961,11 @@ mod tests { let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); assert_eq!(resp.get_data().len(), 0); - assert!(!resp.get_other_error().is_empty()); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); } { @@ -1970,7 +1982,11 @@ mod tests { let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); assert_eq!(resp.get_data().len(), 0); - assert!(!resp.get_other_error().is_empty()); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); } } @@ -2022,4 +2038,18 @@ mod tests { let resp = block_on(copr.parse_and_handle_unary_request(req, None)); assert_eq!(resp.get_locked().get_key(), b"key"); } + + #[test] + fn test_make_error_response() { + let resp = make_error_response(Error::DeadlineExceeded); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" + ); + } } diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 0e7db9ffc96..b603b904708 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -12,7 +12,7 @@ use std::{ use error_code::{self, ErrorCode, ErrorCodeExt}; use kvproto::{errorpb, kvrpcpb, kvrpcpb::ApiVersion}; use thiserror::Error; -use tikv_util::deadline::DeadlineError; +use tikv_util::deadline::{set_deadline_exceeded_busy_error, DeadlineError}; use txn_types::{KvPair, TimeStamp}; use 
crate::storage::{ @@ -222,7 +222,6 @@ impl Display for ErrorHeaderKind { const SCHEDULER_IS_BUSY: &str = "scheduler is busy"; const GC_WORKER_IS_BUSY: &str = "gc worker is busy"; -const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; /// Get the `ErrorHeaderKind` enum that corresponds to the error in the protobuf /// message. Returns `ErrorHeaderKind::Other` if no match found. @@ -319,9 +318,8 @@ pub fn extract_region_error_from_error(e: &Error) -> Option { } Error(box ErrorInner::DeadlineExceeded) => { let mut err = errorpb::Error::default(); - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); - err.set_server_is_busy(server_is_busy_err); + err.set_message(e.to_string()); + set_deadline_exceeded_busy_error(&mut err); Some(err) } _ => None, diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 0710f778aa7..be9d978b23a 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -31,8 +31,15 @@ fn test_deadline() { fail::cfg("deadline_check_fail", "return()").unwrap(); let resp = handle_request(&endpoint, req); - - assert!(resp.get_other_error().contains("exceeding the deadline")); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" + ); } #[test] @@ -46,8 +53,15 @@ fn test_deadline_2() { fail::cfg("rockskv_async_snapshot", "panic").unwrap(); fail::cfg("deadline_check_fail", "return()").unwrap(); let resp = handle_request(&endpoint, req); - - assert!(resp.get_other_error().contains("exceeding the deadline")); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + 
"Coprocessor task terminated due to exceeding the deadline" + ); } /// Test deadline exceeded when request is handling @@ -80,12 +94,14 @@ fn test_deadline_3() { let mut resp = SelectResponse::default(); resp.merge_from_bytes(cop_resp.get_data()).unwrap(); - assert!( - cop_resp.other_error.contains("exceeding the deadline") - || resp - .get_error() - .get_msg() - .contains("exceeding the deadline") + let region_err = cop_resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" ); } From 24fe68e0615bbcdff2e59fa061d20ffbf711886a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 7 Feb 2024 03:44:44 +0800 Subject: [PATCH 143/220] raftstore: polish the availability check on conf change requests (#16486) (#16489) close tikv/tikv#16465 When calculating the impact of conf change, include all operations into considerations. 
Signed-off-by: ti-chi-bot Signed-off-by: tonyxuqqi Co-authored-by: tonyxuqqi --- components/raftstore/src/store/util.rs | 484 +++++++++++++++++++------ 1 file changed, 368 insertions(+), 116 deletions(-) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 351f67484ef..e1320f48712 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1012,7 +1012,7 @@ pub fn check_conf_change( change_peers: &[ChangePeerRequest], cc: &impl ConfChangeI, ignore_safety: bool, - peer_heartbeat: &collections::HashMap, + peer_heartbeats: &collections::HashMap, ) -> Result<()> { let current_progress = node.status().progress.unwrap().clone(); let mut after_progress = current_progress.clone(); @@ -1096,7 +1096,13 @@ pub fn check_conf_change( return Err(box_err!("multiple changes that only effect learner")); } - check_remove_or_demote_voter(region, cfg, change_peers, leader.get_id(), peer_heartbeat)?; + check_availability_by_last_heartbeats( + region, + cfg, + change_peers, + leader.get_id(), + peer_heartbeats, + )?; if !ignore_safety { let promoted_commit_index = after_progress.maximal_committed_index().0; let first_index = node.raft.raft_log.first_index(); @@ -1125,77 +1131,103 @@ pub fn check_conf_change( } } -fn check_remove_or_demote_voter( +/// Check the would-be availability if the operation proceed. +/// If the slow peers count would be equal or larger than normal peers count, +/// then the operations would be rejected +fn check_availability_by_last_heartbeats( region: &metapb::Region, cfg: &Config, change_peers: &[ChangePeerRequest], leader_id: u64, - peer_heartbeat: &collections::HashMap, + peer_heartbeats: &collections::HashMap, ) -> Result<()> { - let mut slow_peer_count = 0; - let mut normal_peer_count = 0; + let mut slow_voters = vec![]; + let mut normal_voters = vec![]; + // Here we assume if the last beartbeat is within 2 election timeout, the peer // is healthy. 
When a region is hibernate, we expect all its peers are *slow* // and it would still allow the operation - let slow_peer_threshold = + let slow_voter_threshold = 2 * cfg.raft_base_tick_interval.0 * cfg.raft_max_election_timeout_ticks as u32; - for (id, last_heartbeat) in peer_heartbeat { + for (id, last_heartbeat) in peer_heartbeats { // for slow and normal peer calculation, we only count voter role if region .get_peers() .iter() .find(|p| p.get_id() == *id) - .map_or(false, |p| p.role == PeerRole::Voter) + .map_or(false, |p| { + p.role == PeerRole::Voter || p.role == PeerRole::IncomingVoter + }) { // leader itself is not a slow peer - if *id == leader_id || last_heartbeat.elapsed() <= slow_peer_threshold { - normal_peer_count += 1; + if *id == leader_id || last_heartbeat.elapsed() <= slow_voter_threshold { + normal_voters.push(*id); } else { - slow_peer_count += 1; + slow_voters.push(*id); } } } - let mut normal_peers_to_remove = vec![]; + let is_healthy = normal_voters.len() > slow_voters.len(); + // if it's already unhealthy, let it go + if !is_healthy { + return Ok(()); + } + + let mut normal_voters_to_remove = vec![]; + let mut slow_voters_to_add = vec![]; for cp in change_peers { let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); - if change_type == ConfChangeType::RemoveNode - || change_type == ConfChangeType::AddLearnerNode - { - let is_voter = region - .get_peers() - .iter() - .find(|p| p.get_id() == peer.get_id()) - .map_or(false, |p| p.role == PeerRole::Voter); + let is_voter = region + .get_peers() + .iter() + .find(|p| p.get_id() == peer.get_id()) + .map_or(false, |p| { + p.role == PeerRole::Voter || p.role == PeerRole::IncomingVoter + }); + if !is_voter && change_type == ConfChangeType::AddNode { + // exiting peers, promoting from learner to voter + if let Some(last_heartbeat) = peer_heartbeats.get(&peer.get_id()) { + if last_heartbeat.elapsed() <= slow_voter_threshold { + normal_voters.push(peer.get_id()); + } else { + 
slow_voters.push(peer.get_id()); + slow_voters_to_add.push(peer.get_id()); + } + } else { + // it's a new peer, assuming it's a normal voter + normal_voters.push(peer.get_id()); + } + } + if is_voter + && (change_type == ConfChangeType::RemoveNode + || change_type == ConfChangeType::AddLearnerNode) + { // If the change_type is AddLearnerNode and the last heartbeat is found, it // means it's a demote from voter as AddLearnerNode on existing learner node is // not allowed. - if is_voter && let Some(last_heartbeat) = peer_heartbeat.get(&peer.get_id()) { - // peer itself is *not* slow peer, but current slow peer is >= total peers/2 - if last_heartbeat.elapsed() <= slow_peer_threshold { - normal_peer_count -= 1; - normal_peers_to_remove.push(peer.clone()); + if let Some(last_heartbeat) = peer_heartbeats.get(&peer.get_id()) { + if last_heartbeat.elapsed() <= slow_voter_threshold { + normal_voters.retain(|id| *id != peer.get_id()); + normal_voters_to_remove.push(peer.clone()); } } } } - // only block the conf change when there's chance to improve the availability - // For example, if there's no normal peers actually, then we still allow the - // option to finish as there's no choice. - // We only block the operation when normal peers are going to be removed and it - // could lead to slow peers more than normal peers - if !normal_peers_to_remove.is_empty() - && slow_peer_count > 0 - && slow_peer_count >= normal_peer_count - { + // Only block the conf change when currently it's healthy, but would be + // unhealthy. If currently it's already unhealthy, let it go. + if slow_voters.len() >= normal_voters.len() { return Err(box_err!( - "Ignore conf change command on region {} because RemoveNode or Demote a voter on peers {:?} may lead to unavailability. 
There're {} slow peers and {} normal peers", + "Ignore conf change command on [region_id={}] because the operations may lead to unavailability.\ + Normal voters to remove {:?}, slow voters to add {:?}.\ + Normal voters would be {:?}, slow voters would be {:?}.", region.get_id(), - &normal_peers_to_remove, - slow_peer_count, - normal_peer_count + &normal_voters_to_remove, + &slow_voters_to_add, + &normal_voters, + &slow_voters )); } @@ -2588,97 +2620,317 @@ mod tests { // Create a sample configuration let mut cfg = Config::default(); cfg.raft_max_election_timeout_ticks = 10; - // Initialize change_peers - let change_peers = vec![ - ChangePeerRequest { - change_type: eraftpb::ConfChangeType::RemoveNode, - peer: Some(metapb::Peer { - id: 2, - ..Default::default() - }) - .into(), - ..Default::default() - }, - ChangePeerRequest { - change_type: eraftpb::ConfChangeType::AddLearnerNode, - peer: Some(metapb::Peer { - id: 2, - ..Default::default() - }) - .into(), - ..Default::default() - }, - ]; + // peer 1, 2, 3 are voters, 4, 5 are learners. 
let mut region = Region::default(); - for i in 1..4 { + for i in 1..3 { + region.mut_peers().push(metapb::Peer { + id: i, + role: PeerRole::Voter, + ..Default::default() + }); + } + region.mut_peers().push(metapb::Peer { + id: 3, + role: PeerRole::IncomingVoter, + ..Default::default() + }); + for i in 4..6 { region.mut_peers().push(metapb::Peer { id: i, + role: PeerRole::Learner, ..Default::default() }); } - for i in 0..change_peers.len() { + + // heartbeats: peer 3, 5 are slow + let mut peer_heartbeat = collections::HashMap::default(); + peer_heartbeat.insert( + 1, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 2, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 3, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + peer_heartbeat.insert( + 4, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 5, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + + // Initialize change_peers + let change_peers_and_expect = vec![ + // promote peer 4 from learner to voter, it should work + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // promote peer 5 from learner to voter, it should be rejected (two slow voters vs two + // normal voters) + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // remove a peer 3, it should work as peer 3 is slow + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // remove a peer 2, it should be rejected as peer 3 is slow + 
( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }], + false, + ), + // demote peer2, it should be rejected + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }], + false, + ), + // demote peer 2, but promote peer 4 as voter, it should work + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + true, + ), + // demote peer 2, but promote peer 5 as voter, it should be rejected because peer 5 is + // slow + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 5, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + false, + ), + // promote peer 4 and 5 as voter, it should be ok + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 5, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + true, + ), + ]; + + for (cp, expect_result) in change_peers_and_expect { // Call the function under test and assert that the function returns failed - let mut cp = 
vec![change_peers[i].clone()]; - let mut peer_heartbeat = collections::HashMap::default(); - peer_heartbeat.insert( - 1, - std::time::Instant::now() - std::time::Duration::from_secs(1), - ); - peer_heartbeat.insert( - 2, - std::time::Instant::now() - std::time::Duration::from_secs(1), - ); - peer_heartbeat.insert( - 3, - std::time::Instant::now() - std::time::Duration::from_secs(1), - ); // Call the function under test and assert that the function returns Ok - check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); - - // now make one peer slow - if let Some(peer_heartbeat) = peer_heartbeat.get_mut(&3) { - *peer_heartbeat = std::time::Instant::now() - std::time::Duration::from_secs(100); + let result = + check_availability_by_last_heartbeats(®ion, &cfg, &cp, 1, &peer_heartbeat); + if expect_result { + assert!(result.is_ok()); + } else { + assert!(result.is_err(), "{:?}", cp); } + } + } - // Call the function under test - let result = check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat); - // Assert that the function returns failed - assert!(result.is_err()); + #[test] + fn test_check_conf_change_on_unhealthy_status() { + // Create a sample configuration + let mut cfg = Config::default(); + cfg.raft_max_election_timeout_ticks = 10; - // remove the slow peer instead - cp[0].peer = Some(metapb::Peer { - id: 3, - ..Default::default() - }) - .into(); - // Call the function under test - check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); - - // make peer to learner and remove the peer 2 - region.mut_peers()[1].set_role(metapb::PeerRole::Learner); - cp[0].peer = Some(metapb::Peer { - id: 2, + // peer 1, 2, 3 are voters, 4 is learner + let mut region = Region::default(); + region.mut_peers().push(metapb::Peer { + id: 1, + role: PeerRole::Voter, + ..Default::default() + }); + for i in 2..4 { + region.mut_peers().push(metapb::Peer { + id: i, + role: PeerRole::IncomingVoter, ..Default::default() - }) - .into(); - // 
Call the function under test - check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); - // set peer 2 voter again - region.mut_peers()[1].set_role(metapb::PeerRole::Voter); - - // there's no remove node, it's fine with slow peers. - cp[0] = ChangePeerRequest { - change_type: eraftpb::ConfChangeType::AddNode, - peer: Some(metapb::Peer { - id: 2, + }); + } + region.mut_peers().push(metapb::Peer { + id: 4, + role: PeerRole::Learner, + ..Default::default() + }); + + // heartbeats: peer 2, 3, 4 are slow, it's already unhealthy now + let mut peer_heartbeat = collections::HashMap::default(); + peer_heartbeat.insert( + 1, + std::time::Instant::now() - std::time::Duration::from_secs(1), + ); + peer_heartbeat.insert( + 2, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + peer_heartbeat.insert( + 3, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + peer_heartbeat.insert( + 4, + std::time::Instant::now() - std::time::Duration::from_secs(100), + ); + + // Initialize change_peers + let change_peers_and_expect = vec![ + // promote peer 4 from learner to voter, it should work + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddNode, + peer: Some(metapb::Peer { + id: 4, + ..Default::default() + }) + .into(), ..Default::default() - }) - .into(), - ..Default::default() - }; - // Call the function under test - check_remove_or_demote_voter(®ion, &cfg, &cp, 1, &peer_heartbeat).unwrap(); + }], + true, + ), + // remove a peer 3, it should work as peer 3 is slow + ( + vec![ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(), + ..Default::default() + }], + true, + ), + // remove a peer 2, 3, it should work + ( + vec![ + ChangePeerRequest { + change_type: eraftpb::ConfChangeType::RemoveNode, + peer: Some(metapb::Peer { + id: 2, + ..Default::default() + }) + .into(), + ..Default::default() + }, + 
ChangePeerRequest { + change_type: eraftpb::ConfChangeType::AddLearnerNode, + peer: Some(metapb::Peer { + id: 3, + ..Default::default() + }) + .into(), + ..Default::default() + }, + ], + true, + ), + ]; + + for (cp, expect_result) in change_peers_and_expect { + // Call the function under test and assert that the function returns failed + // Call the function under test and assert that the function returns Ok + let result = + check_availability_by_last_heartbeats(®ion, &cfg, &cp, 1, &peer_heartbeat); + if expect_result { + assert!(result.is_ok()); + } else { + assert!(result.is_err(), "{:?}", cp); + } } } } From b5295c1d35ba7a9c94d2268545cedc9eab045fa5 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 8 Feb 2024 03:07:15 +0800 Subject: [PATCH 144/220] server: make gc support multi-threads (#16096) (#16175) close tikv/tikv#16101 do parallel region gc and expose the gc thread configuration. The configuration can be dynamically updated. Signed-off-by: nolouch Co-authored-by: tonyxuqqi Co-authored-by: nolouch Co-authored-by: ShuNing --- components/tikv_util/src/worker/pool.rs | 50 +++--- .../tikv_util/src/yatp_pool/future_pool.rs | 5 + src/server/gc_worker/compaction_filter.rs | 2 +- src/server/gc_worker/config.rs | 23 ++- src/server/gc_worker/gc_manager.rs | 103 ++++++++--- src/server/gc_worker/gc_worker.rs | 165 ++++++++++++++---- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 8 files changed, 265 insertions(+), 85 deletions(-) diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index c3919e42619..a22732a7aae 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -7,7 +7,7 @@ use std::{ future::Future, sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; @@ -20,13 +20,13 @@ use futures::{ stream::StreamExt, }; use prometheus::IntGauge; -use yatp::{Remote, ThreadPool}; +use 
yatp::Remote; use super::metrics::*; use crate::{ future::{block_on_timeout, poll_future_notify}, timer::GLOBAL_TIMER_HANDLE, - yatp_pool::{DefaultTicker, YatpPoolBuilder}, + yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, }; #[derive(PartialEq)] @@ -222,7 +222,15 @@ impl LazyWorker { } pub fn remote(&self) -> Remote { - self.worker.remote.clone() + self.worker.remote() + } + + pub fn pool_size(&self) -> usize { + self.worker.pool_size() + } + + pub fn pool(&self) -> FuturePool { + self.worker.pool() } } @@ -301,11 +309,8 @@ impl> Builder { let pool = YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix(self.name) .thread_count(self.thread_count, self.thread_count, self.thread_count) - .build_single_level_pool(); - let remote = pool.remote().clone(); - let pool = Arc::new(Mutex::new(Some(pool))); + .build_future_pool(); Worker { - remote, stop: Arc::new(AtomicBool::new(false)), pool, counter: Arc::new(AtomicUsize::new(0)), @@ -318,8 +323,7 @@ impl> Builder { /// A worker that can schedule time consuming tasks. #[derive(Clone)] pub struct Worker { - pool: Arc>>>, - remote: Remote, + pool: FuturePool, pending_capacity: usize, counter: Arc, stop: Arc, @@ -371,7 +375,7 @@ impl Worker { .interval(std::time::Instant::now(), interval) .compat(); let stop = self.stop.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { while !stop.load(Ordering::Relaxed) && let Some(Ok(_)) = interval.next().await { @@ -389,7 +393,7 @@ impl Worker { .interval(std::time::Instant::now(), interval) .compat(); let stop = self.stop.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { while !stop.load(Ordering::Relaxed) && let Some(Ok(_)) = interval.next().await { @@ -403,7 +407,7 @@ impl Worker { where F: Future + Send + 'static, { - self.remote.spawn(f); + let _ = self.pool.spawn(f); } fn delay_notify(tx: UnboundedSender>, timeout: Duration) { @@ -438,10 +442,8 @@ impl Worker { /// Stops the worker thread. 
pub fn stop(&self) { - if let Some(pool) = self.pool.lock().unwrap().take() { - self.stop.store(true, Ordering::Release); - pool.shutdown(); - } + self.stop.store(true, Ordering::Release); + self.pool.shutdown(); } /// Checks if underlying worker can't handle task immediately. @@ -451,7 +453,15 @@ impl Worker { } pub fn remote(&self) -> Remote { - self.remote.clone() + self.pool.remote().clone() + } + + pub fn pool_size(&self) -> usize { + self.pool.get_pool_size() + } + + pub fn pool(&self) -> FuturePool { + self.pool.clone() } fn start_impl( @@ -461,7 +471,7 @@ impl Worker { metrics_pending_task_count: IntGauge, ) { let counter = self.counter.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { let mut handle = RunnableWrapper { inner: runner }; while let Some(msg) = receiver.next().await { match msg { @@ -488,7 +498,7 @@ impl Worker { let counter = self.counter.clone(); let timeout = runner.get_interval(); Self::delay_notify(tx.clone(), timeout); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { let mut handle = RunnableWrapper { inner: runner }; while let Some(msg) = receiver.next().await { match msg { diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 827ffbbdce2..2deead30580 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -119,6 +119,11 @@ impl FuturePool { pub fn shutdown(&self) { self.inner.pool.shutdown(); } + + // Get a remote queue for spawning tasks without owning the thread pool. 
+ pub fn remote(&self) -> &yatp::Remote { + self.inner.pool.remote() + } } struct PoolInner { diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 665824a1bac..2bea0cf347b 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -888,7 +888,7 @@ pub mod test_utils { cfg.ratio_threshold = ratio_threshold; } cfg.enable_compaction_filter = true; - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg)), None) }; let feature_gate = { let feature_gate = FeatureGate::default(); diff --git a/src/server/gc_worker/config.rs b/src/server/gc_worker/config.rs index 1816dd845e1..809c55e1268 100644 --- a/src/server/gc_worker/config.rs +++ b/src/server/gc_worker/config.rs @@ -3,7 +3,10 @@ use std::sync::Arc; use online_config::{ConfigChange, ConfigManager, OnlineConfig}; -use tikv_util::config::{ReadableSize, VersionTrack}; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + yatp_pool::FuturePool, +}; const DEFAULT_GC_RATIO_THRESHOLD: f64 = 1.1; pub const DEFAULT_GC_BATCH_KEYS: usize = 512; @@ -22,6 +25,8 @@ pub struct GcConfig { /// greater than 5.0.0. Change `compaction_filter_skip_version_check` /// can enable it by force. 
pub compaction_filter_skip_version_check: bool, + /// gc threads count + pub num_threads: usize, } impl Default for GcConfig { @@ -32,6 +37,7 @@ impl Default for GcConfig { max_write_bytes_per_sec: ReadableSize(DEFAULT_GC_MAX_WRITE_BYTES_PER_SEC), enable_compaction_filter: true, compaction_filter_skip_version_check: false, + num_threads: 1, } } } @@ -41,12 +47,15 @@ impl GcConfig { if self.batch_keys == 0 { return Err("gc.batch_keys should not be 0".into()); } + if self.num_threads == 0 { + return Err("gc.thread_count should not be 0".into()); + } Ok(()) } } #[derive(Clone, Default)] -pub struct GcWorkerConfigManager(pub Arc>); +pub struct GcWorkerConfigManager(pub Arc>, pub Option); impl ConfigManager for GcWorkerConfigManager { fn dispatch( @@ -55,6 +64,16 @@ impl ConfigManager for GcWorkerConfigManager { ) -> std::result::Result<(), Box> { { let change = change.clone(); + if let Some(pool) = self.1.as_ref() { + if let Some(v) = change.get("num_threads") { + let pool_size: usize = v.into(); + pool.scale_pool_size(pool_size); + info!( + "GC worker thread count is changed"; + "new_thread_count" => pool_size, + ); + } + } self.0 .update(move |cfg: &mut GcConfig| cfg.update(change))?; } diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index be18f8216d5..d9c5287b67d 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -4,7 +4,7 @@ use std::{ cmp::Ordering, sync::{ atomic::{AtomicU64, Ordering as AtomicOrdering}, - mpsc, Arc, + mpsc, Arc, Condvar, Mutex, }, thread::{self, Builder as ThreadBuilder, JoinHandle}, time::Duration, @@ -20,10 +20,10 @@ use txn_types::{Key, TimeStamp}; use super::{ compaction_filter::is_compaction_filter_allowed, config::GcWorkerConfigManager, - gc_worker::{sync_gc, GcSafePointProvider, GcTask}, + gc_worker::{schedule_gc, GcSafePointProvider, GcTask}, Result, }; -use crate::{server::metrics::*, tikv_util::sys::thread::StdThreadBuildWrapper}; +use 
crate::{server::metrics::*, storage::Callback, tikv_util::sys::thread::StdThreadBuildWrapper}; const POLL_SAFE_POINT_INTERVAL_SECS: u64 = 10; @@ -245,6 +245,8 @@ pub(super) struct GcManager GcManager { @@ -254,6 +256,7 @@ impl GcMan worker_scheduler: Scheduler>, cfg_tracker: GcWorkerConfigManager, feature_gate: FeatureGate, + concurrent_tasks: usize, ) -> GcManager { GcManager { cfg, @@ -263,6 +266,7 @@ impl GcMan gc_manager_ctx: GcManagerContext::new(), cfg_tracker, feature_gate, + max_concurrent_tasks: concurrent_tasks, } } @@ -442,13 +446,27 @@ impl GcMan let mut progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); // Records how many region we have GC-ed. - let mut processed_regions = 0; + let mut scheduled_regions = 0; + let task_controller = Arc::new((Mutex::new(0), Condvar::new())); + // the task_controller is the combination to control the number + // of tasks The mutex is used for protecting the number of current + // tasks, while the condvar is used for notifying/get notified when the + // number of current tasks is changed. + let (lock, cvar) = &*task_controller; + let maybe_wait = |max_tasks| { + let mut current_tasks: std::sync::MutexGuard<'_, usize> = lock.lock().unwrap(); + while *current_tasks > max_tasks { + // Wait until the number of current tasks is below the limit + current_tasks = cvar.wait(current_tasks).unwrap(); + } + }; info!("gc_worker: auto gc starts"; "safe_point" => self.curr_safe_point()); // The following loop iterates all regions whose leader is on this TiKV and does // GC on them. At the same time, check whether safe_point is updated // periodically. If it's updated, rewinding will happen. + loop { self.gc_manager_ctx.check_stopped()?; if is_compaction_filter_allowed(&self.cfg_tracker.value(), &self.feature_gate) { @@ -462,9 +480,9 @@ impl GcMan // We have worked to the end and we need to rewind. Restart from beginning. 
progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); need_rewind = false; - info!("gc_worker: auto gc rewinds"; "processed_regions" => processed_regions); + info!("gc_worker: auto gc rewinds"; "scheduled_regions" => scheduled_regions); - processed_regions = 0; + scheduled_regions = 0; // Set the metric to zero to show that rewinding has happened. AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC .with_label_values(&[PROCESS_TYPE_GC]) @@ -483,19 +501,40 @@ impl GcMan if finished { // We have worked to the end of the TiKV or our progress has reached `end`, and // we don't need to rewind. In this case, the round of GC has finished. - info!("gc_worker: auto gc finishes"; "processed_regions" => processed_regions); - return Ok(()); + info!("gc_worker: all regions task are scheduled"; + "processed_regions" => scheduled_regions, + ); + break; } } - assert!(progress.is_some()); // Before doing GC, check whether safe_point is updated periodically to // determine if rewinding is needed. self.check_if_need_rewind(&progress, &mut need_rewind, &mut end); - progress = self.gc_next_region(progress.unwrap(), &mut processed_regions)?; + let controller: Arc<(Mutex, Condvar)> = Arc::clone(&task_controller); + let cb = Box::new(move |_res| { + let (lock, cvar) = &*controller; + let mut current_tasks = lock.lock().unwrap(); + *current_tasks -= 1; + cvar.notify_one(); + AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC + .with_label_values(&[PROCESS_TYPE_GC]) + .inc(); + }); + maybe_wait(self.max_concurrent_tasks - 1); + let mut current_tasks = lock.lock().unwrap(); + progress = self.async_gc_next_region(progress.unwrap(), cb, &mut current_tasks)?; + scheduled_regions += 1; } + + // wait for all tasks finished + self.gc_manager_ctx.check_stopped()?; + maybe_wait(0); + info!("gc_worker: auto gc finishes"; "processed_regions" => scheduled_regions); + + Ok(()) } /// Checks whether we need to rewind in this round of GC. 
Only used in @@ -536,13 +575,14 @@ impl GcMan } } - /// Does GC on the next region after `from_key`. Returns the end key of the - /// region it processed. If we have processed to the end of all regions, - /// returns `None`. - fn gc_next_region( + /// Does GC on the next region after `from_key` asynchronously. Returns the + /// end key of the region it processed. If we have processed to the end + /// of all regions, returns `None`. + fn async_gc_next_region( &mut self, from_key: Key, - processed_regions: &mut usize, + callback: Callback<()>, + running_tasks: &mut usize, ) -> GcManagerResult> { // Get the information of the next region to do GC. let (region, next_key) = self.get_next_gc_context(from_key); @@ -552,16 +592,16 @@ impl GcMan let hex_end = format!("{:?}", log_wrappers::Value::key(region.get_end_key())); debug!("trying gc"; "region_id" => region.id, "start_key" => &hex_start, "end_key" => &hex_end); - if let Err(e) = sync_gc(&self.worker_scheduler, region, self.curr_safe_point()) { - // Ignore the error and continue, since it's useless to retry this. - // TODO: Find a better way to handle errors. Maybe we should retry. 
- warn!("failed gc"; "start_key" => &hex_start, "end_key" => &hex_end, "err" => ?e); - } - - *processed_regions += 1; - AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC - .with_label_values(&[PROCESS_TYPE_GC]) - .inc(); + let _ = schedule_gc( + &self.worker_scheduler, + region, + self.curr_safe_point(), + callback, + ) + .map(|_| { + *running_tasks += 1; + Ok::<(), GcManagerError>(()) + }); Ok(next_key) } @@ -710,8 +750,16 @@ mod tests { impl GcManagerTestUtil { pub fn new(regions: BTreeMap, RegionInfo>) -> Self { let (gc_task_sender, gc_task_receiver) = channel(); - let worker = WorkerBuilder::new("test-gc-manager").create(); - let scheduler = worker.start("gc-manager", MockGcRunner { tx: gc_task_sender }); + let worker = WorkerBuilder::new("test-gc-manager") + .thread_count(2) + .create(); + let scheduler = worker.start( + "gc-manager", + MockGcRunner { + tx: gc_task_sender.clone(), + }, + ); + worker.start("gc-manager", MockGcRunner { tx: gc_task_sender }); let (safe_point_sender, safe_point_receiver) = channel(); @@ -731,6 +779,7 @@ mod tests { scheduler, GcWorkerConfigManager::default(), Default::default(), + 2, ); Self { gc_manager: Some(gc_manager), diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index c608470ba87..a0537a478d0 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -34,6 +34,7 @@ use tikv_util::{ Either, }; use txn_types::{Key, TimeStamp}; +use yatp::{task::future::TaskCell, Remote}; use super::{ check_need_gc, @@ -178,7 +179,7 @@ where } /// Used to perform GC operations on the engine. 
-pub struct GcRunner { +pub struct GcRunnerCore { store_id: u64, engine: E, @@ -193,6 +194,26 @@ pub struct GcRunner { stats_map: HashMap, } +impl Clone for GcRunnerCore { + fn clone(&self) -> Self { + GcRunnerCore { + store_id: self.store_id, + engine: self.engine.clone(), + flow_info_sender: self.flow_info_sender.clone(), + limiter: self.limiter.clone(), + cfg: self.cfg.clone(), + cfg_tracker: self.cfg_tracker.clone(), + stats_map: HashMap::default(), + } + } +} + +/// Used to perform GC operations on the engine. +pub struct GcRunner { + inner: GcRunnerCore, + pool: Remote, +} + pub const MAX_RAW_WRITE_SIZE: usize = 32 * 1024; pub struct MvccRaw { @@ -282,7 +303,7 @@ fn init_snap_ctx(store_id: u64, region: &Region) -> Context { ctx } -impl GcRunner { +impl GcRunnerCore { pub fn new( store_id: u64, engine: E, @@ -918,18 +939,12 @@ impl GcRunner { error!("failed to flush deletes, will leave garbage"; "err" => ?e); } } -} - -impl Runnable for GcRunner { - type Task = GcTask; #[inline] fn run(&mut self, task: GcTask) { let _io_type_guard = WithIoType::new(IoType::Gc); let enum_label = task.get_enum_label(); - GC_GCTASK_COUNTER_STATIC.get(enum_label).inc(); - let timer = SlowTimer::from_secs(GC_TASK_SLOW_SECONDS); let update_metrics = |is_err| { GC_TASK_DURATION_HISTOGRAM_VEC @@ -941,9 +956,6 @@ impl Runnable for GcRunner { } }; - // Refresh config before handle task - self.refresh_cfg(); - match task { GcTask::Gc { region, @@ -1062,6 +1074,37 @@ impl Runnable for GcRunner { } } +impl GcRunner { + pub fn new( + store_id: u64, + engine: E, + flow_info_sender: Sender, + cfg_tracker: Tracker, + cfg: GcConfig, + pool: Remote, + ) -> Self { + Self { + inner: GcRunnerCore::new(store_id, engine, flow_info_sender, cfg_tracker, cfg), + pool, + } + } +} + +impl Runnable for GcRunner { + type Task = GcTask; + + #[inline] + fn run(&mut self, task: GcTask) { + // Refresh config before handle task + self.inner.refresh_cfg(); + + let mut inner = self.inner.clone(); + 
self.pool.spawn(async move { + inner.run(task); + }); + } +} + /// When we failed to schedule a `GcTask` to `GcRunner`, use this to handle the /// `ScheduleError`. fn handle_gc_task_schedule_error(e: ScheduleError>) -> Result<()> { @@ -1081,7 +1124,7 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res } /// Schedules a `GcTask` to the `GcRunner`. -fn schedule_gc( +pub fn schedule_gc( scheduler: &Scheduler>, region: Region, safe_point: TimeStamp, @@ -1174,13 +1217,18 @@ impl GcWorker { feature_gate: FeatureGate, region_info_provider: Arc, ) -> Self { - let worker_builder = WorkerBuilder::new("gc-worker").pending_capacity(GC_MAX_PENDING_TASKS); + let worker_builder = WorkerBuilder::new("gc-worker") + .pending_capacity(GC_MAX_PENDING_TASKS) + .thread_count(cfg.num_threads); let worker = worker_builder.create().lazy_build("gc-worker"); let worker_scheduler = worker.scheduler(); GcWorker { engine, flow_info_sender: Some(flow_info_sender), - config_manager: GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))), + config_manager: GcWorkerConfigManager( + Arc::new(VersionTrack::new(cfg)), + Some(worker.pool()), + ), refs: Arc::new(AtomicUsize::new(1)), worker: Arc::new(Mutex::new(worker)), worker_scheduler, @@ -1219,6 +1267,7 @@ impl GcWorker { self.scheduler(), self.config_manager.clone(), self.feature_gate.clone(), + self.config_manager.value().num_threads, ) .start()?; *handle = Some(new_handle); @@ -1226,14 +1275,20 @@ impl GcWorker { } pub fn start(&mut self, store_id: u64) -> Result<()> { + let mut worker = self.worker.lock().unwrap(); let runner = GcRunner::new( store_id, self.engine.clone(), self.flow_info_sender.take().unwrap(), - self.config_manager.0.clone().tracker("gc-woker".to_owned()), + self.config_manager + .0 + .clone() + .tracker("gc-worker".to_owned()), self.config_manager.value().clone(), + worker.remote(), ); - self.worker.lock().unwrap().start(runner); + worker.start(runner); + Ok(()) } @@ -1296,6 +1351,10 @@ impl GcWorker { pub fn 
get_config_manager(&self) -> GcWorkerConfigManager { self.config_manager.clone() } + + pub fn get_worker_thread_count(&self) -> usize { + self.worker.lock().unwrap().pool_size() + } } #[cfg(any(test, feature = "testexport"))] @@ -1486,6 +1545,7 @@ mod tests { use engine_traits::Peekable as _; use futures::executor::block_on; use kvproto::{kvrpcpb::ApiVersion, metapb::Peer}; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; use raft::StateRole; use raftstore::coprocessor::{ region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, @@ -1634,10 +1694,12 @@ mod tests { region2.mut_peers().push(new_peer(store_id, 2)); region2.set_start_key(split_key.to_vec()); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( engine, tx, - GcConfig::default(), + gc_config, gate, Arc::new(MockRegionInfoProvider::new(vec![region1, region2])), ); @@ -1810,10 +1872,12 @@ mod tests { let mut host = CoprocessorHost::::default(); let ri_provider = RegionInfoAccessor::new(&mut host); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( prefixed_engine.clone(), tx, - GcConfig::default(), + gc_config, feature_gate, Arc::new(ri_provider.clone()), ); @@ -1902,13 +1966,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( store_id, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -1966,13 +2030,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( store_id, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + 
GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2067,13 +2131,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( 1, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2202,10 +2266,12 @@ mod tests { let mut region = Region::default(); region.mut_peers().push(new_peer(store_id, 1)); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( engine.clone(), tx, - GcConfig::default(), + gc_config, gate, Arc::new(MockRegionInfoProvider::new(vec![region.clone()])), ); @@ -2333,7 +2399,7 @@ mod tests { ) -> ( MultiRocksEngine, Arc, - GcRunner, + GcRunnerCore, Vec, mpsc::Receiver, ) { @@ -2386,13 +2452,13 @@ mod tests { ])); let cfg = GcConfig::default(); - let gc_runner = GcRunner::new( + let gc_runner = GcRunnerCore::new( store_id, engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2564,13 +2630,13 @@ mod tests { let ri_provider = Arc::new(MockRegionInfoProvider::new(vec![r1, r2])); let cfg = GcConfig::default(); - let mut gc_runner = GcRunner::new( + let mut gc_runner = GcRunnerCore::new( store_id, engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2756,4 +2822,33 @@ mod tests { test_destroy_range_for_multi_rocksdb_impl(b"k05", b"k195", 
vec![1, 2]); test_destroy_range_for_multi_rocksdb_impl(b"k099", b"k25", vec![2, 3]); } + + #[test] + fn test_update_gc_thread_count() { + let engine = TestEngineBuilder::new().build().unwrap(); + let (tx, _rx) = mpsc::channel(); + let gate = FeatureGate::default(); + gate.set_version("5.0.0").unwrap(); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 1; + let gc_worker = GcWorker::new( + engine, + tx, + gc_config, + gate, + Arc::new(MockRegionInfoProvider::new(vec![])), + ); + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("num_threads"), ConfigValue::Usize(5)); + let mut cfg_manager = gc_worker.get_config_manager(); + cfg_manager.dispatch(config_change).unwrap(); + + assert_eq!(gc_worker.get_worker_thread_count(), 5); + + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("num_threads"), ConfigValue::Usize(2)); + cfg_manager.dispatch(config_change).unwrap(); + + assert_eq!(gc_worker.get_worker_thread_count(), 2); + } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 26eb599d6ee..f2a47252589 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -831,6 +831,7 @@ fn test_serde_custom_tikv_config() { max_write_bytes_per_sec: ReadableSize::mb(10), enable_compaction_filter: false, compaction_filter_skip_version_check: true, + num_threads: 2, }; value.pessimistic_txn = PessimisticTxnConfig { wait_for_lock_timeout: ReadableDuration::millis(10), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 80c92b6c8ac..54ed8216ec9 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -641,6 +641,7 @@ batch-keys = 256 max-write-bytes-per-sec = "10MB" enable-compaction-filter = false compaction-filter-skip-version-check = true +num-threads = 2 [pessimistic-txn] enabled = false # test backward compatibility From 
02c9891928d51aa7f64d78311a4b856bd9f26677 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 8 Feb 2024 15:56:45 +0800 Subject: [PATCH 145/220] raftstore: Verify checksum right after SST files are generated (#16107) (#16126) close tikv/tikv#15986 Verify checksum right after SST files are generated to avoid corrupted SST being transferred to other TiKVs Signed-off-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Co-authored-by: Connor1996 --- components/engine_test/src/lib.rs | 8 ++- components/raftstore/src/store/snap.rs | 20 ++++++-- components/raftstore/src/store/snap/io.rs | 60 ++++++++++++++++++++--- metrics/alertmanager/tikv.rules.yml | 12 +++++ tests/failpoints/cases/test_snap.rs | 17 +++++++ tests/integrations/storage/test_titan.rs | 2 + 6 files changed, 107 insertions(+), 12 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index dd56d9a5db4..85d9d4c1b78 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -127,7 +127,7 @@ pub mod kv { } fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { - encryption::trash_dir_all(path, self.db_opt.key_manager.as_deref())?; + encryption::trash_dir_all(path, self.db_opt.get_key_manager().as_deref())?; Ok(()) } @@ -202,13 +202,17 @@ pub mod ctor { #[derive(Clone, Default)] pub struct DbOptions { - pub(crate) key_manager: Option>, + key_manager: Option>, rate_limiter: Option>, state_storage: Option>, enable_multi_batch_write: bool, } impl DbOptions { + pub fn get_key_manager(&self) -> Option> { + self.key_manager.clone() + } + pub fn set_key_manager(&mut self, key_manager: Option>) { self.key_manager = key_manager; } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 6fe21fe9750..690c3af1c76 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -92,6 +92,12 @@ impl From for 
Error { } } +impl From for Error { + fn from(e: engine_traits::Error) -> Self { + Error::Other(Box::new(e)) + } +} + pub type Result = result::Result; impl ErrorCodeExt for Error { @@ -873,8 +879,13 @@ impl Snapshot { self.switch_to_cf_file(cf)?; let cf_file = &mut self.cf_files[self.cf_index]; let cf_stat = if plain_file_used(cf_file.cf) { - let key_mgr = self.mgr.encryption_key_manager.as_ref(); - snap_io::build_plain_cf_file::(cf_file, key_mgr, kv_snap, &begin_key, &end_key)? + snap_io::build_plain_cf_file::( + cf_file, + self.mgr.encryption_key_manager.as_ref(), + kv_snap, + &begin_key, + &end_key, + )? } else { snap_io::build_sst_cf_file_list::( cf_file, @@ -885,6 +896,7 @@ impl Snapshot { self.mgr .get_actual_max_per_file_size(allow_multi_files_snapshot), &self.mgr.limiter, + self.mgr.encryption_key_manager.clone(), )? }; SNAPSHOT_LIMIT_GENERATE_BYTES.inc_by(cf_stat.total_size as u64); @@ -1212,7 +1224,7 @@ impl Snapshot { if file_for_recving.written_size != cf_file.size[i] { return Err(io::Error::new( - ErrorKind::Other, + ErrorKind::InvalidData, format!( "snapshot file {} for cf {} size mismatches, \ real size {}, expected size {}", @@ -1227,7 +1239,7 @@ impl Snapshot { let checksum = file_for_recving.write_digest.finalize(); if checksum != cf_file.checksum[i] { return Err(io::Error::new( - ErrorKind::Other, + ErrorKind::InvalidData, format!( "snapshot file {} for cf {} checksum \ mismatches, real checksum {}, expected \ diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 3cdee1e40f1..837ae7176fb 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -13,13 +13,14 @@ use encryption::{ }; use engine_traits::{ CfName, EncryptionKeyManager, Error as EngineError, Iterable, KvEngine, Mutable, - SstCompressionType, SstWriter, SstWriterBuilder, WriteBatch, + SstCompressionType, SstReader, SstWriter, SstWriterBuilder, WriteBatch, }; +use fail::fail_point; use 
kvproto::encryptionpb::EncryptionMethod; use tikv_util::{ box_try, codec::bytes::{BytesEncoder, CompactBytesFromFileDecoder}, - debug, info, + debug, error, info, time::{Instant, Limiter}, }; @@ -116,6 +117,7 @@ pub fn build_sst_cf_file_list( end_key: &[u8], raw_size_per_file: u64, io_limiter: &Limiter, + key_mgr: Option>, ) -> Result where E: KvEngine, @@ -133,6 +135,53 @@ where let sst_writer = RefCell::new(create_sst_file_writer::(engine, cf, &path)?); let mut file_length: usize = 0; + let finish_sst_writer = |sst_writer: E::SstWriter, + path: String, + key_mgr: Option>| + -> Result<(), Error> { + sst_writer.finish()?; + (|| { + fail_point!("inject_sst_file_corruption", |_| { + static CALLED: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(false); + if CALLED + .compare_exchange( + false, + true, + std::sync::atomic::Ordering::SeqCst, + std::sync::atomic::Ordering::SeqCst, + ) + .is_err() + { + return; + } + // overwrite the file to break checksum + let mut f = OpenOptions::new().write(true).open(&path).unwrap(); + f.write_all(b"x").unwrap(); + }); + })(); + + let sst_reader = if let Some(mgr) = key_mgr { + E::SstReader::open_encrypted(&path, mgr)? + } else { + E::SstReader::open(&path)? 
+ }; + + if let Err(e) = sst_reader.verify_checksum() { + // use sst reader to verify block checksum, it would detect corrupted SST due to + // memory bit-flip + fs::remove_file(&path)?; + error!( + "failed to pass block checksum verification"; + "file" => path, + "err" => ?e, + ); + return Err(io::Error::new(io::ErrorKind::InvalidData, e).into()); + } + File::open(&path).and_then(|f| f.sync_all())?; + Ok(()) + }; + let instant = Instant::now(); box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { let entry_len = key.len() + value.len(); @@ -151,8 +200,7 @@ where match result { Ok(new_sst_writer) => { let old_writer = sst_writer.replace(new_sst_writer); - box_try!(old_writer.finish()); - box_try!(File::open(prev_path).and_then(|f| f.sync_all())); + box_try!(finish_sst_writer(old_writer, prev_path, key_mgr.clone())); } Err(e) => { let io_error = io::Error::new(io::ErrorKind::Other, e); @@ -178,9 +226,8 @@ where Ok(true) })); if stats.key_count > 0 { + box_try!(finish_sst_writer(sst_writer.into_inner(), path, key_mgr)); cf_file.add_file(file_id); - box_try!(sst_writer.into_inner().finish()); - box_try!(File::open(path).and_then(|f| f.sync_all())); info!( "build_sst_cf_file_list builds {} files in cf {}. Total keys {}, total size {}. raw_size_per_file {}, total takes {:?}", file_id + 1, @@ -427,6 +474,7 @@ mod tests { &keys::data_key(b"z"), *max_file_size, &limiter, + db_opt.as_ref().and_then(|opt| opt.get_key_manager()), ) .unwrap(); if stats.key_count == 0 { diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index a4de231de7b..94805bc0733 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -1,6 +1,18 @@ groups: - name: alert.rules rules: + - alert: TiKV_critical_error + expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0 + # without the for clause will become active on the first evaluation. 
+ labels: + env: ENV_LABELS_ENV + level: critical + expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0 + annotations: + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' + value: '{{ $value }}' + summary: TiKV encounters critical error + - alert: TiKV_memory_used_too_fast expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024 for: 5m diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 7748b1d2985..ca23b4c5a17 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -992,3 +992,20 @@ fn test_snapshot_send_failed() { sleep_ms(100); assert!(mgr.list_snapshot().unwrap().is_empty()); } + +#[test] +/// Test a corrupted snapshot can be detected and retry to generate a new one. +fn test_retry_corrupted_snapshot() { + let mut cluster = new_node_cluster(0, 3); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + + let r = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + must_get_none(&cluster.get_engine(3), b"k1"); + pd_client.must_add_peer(r, new_peer(2, 2)); + fail::cfg("inject_sst_file_corruption", "return").unwrap(); + pd_client.must_add_peer(r, new_peer(3, 3)); + + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +} diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 4bb8fee4087..752c6aaee1a 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -376,6 +376,7 @@ fn test_delete_files_in_range_for_titan() { b"{", u64::MAX, &limiter, + None, ) .unwrap(); let mut cf_file_write = CfFile::new( @@ -392,6 +393,7 @@ fn test_delete_files_in_range_for_titan() { b"{", u64::MAX, &limiter, + None, ) .unwrap(); From 524c3f4adf532c6e27cac6e9e153c211ac3480dd Mon Sep 17 00:00:00 2001 From: Ti Chi 
Robot Date: Sun, 18 Feb 2024 17:07:54 +0800 Subject: [PATCH 146/220] engine: calculate table properties correctly for Titan (#16320) (#16332) close tikv/tikv#16319 Signed-off-by: qupeng Co-authored-by: qupeng --- components/cdc/src/initializer.rs | 76 +++++++++++++++++-- components/engine_rocks/src/properties.rs | 61 ++++++++------- components/engine_rocks/src/ttl_properties.rs | 1 + src/config/mod.rs | 21 +++++ src/storage/kv/test_engine_builder.rs | 4 +- 5 files changed, 128 insertions(+), 35 deletions(-) diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index ef39a693e3e..f06576941fc 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -559,11 +559,14 @@ mod tests { use std::{ collections::BTreeMap, fmt::Display, - sync::mpsc::{channel, sync_channel, Receiver, RecvTimeoutError, Sender}, + sync::{ + mpsc::{channel, sync_channel, Receiver, RecvTimeoutError, Sender}, + Arc, + }, time::Duration, }; - use engine_rocks::RocksEngine; + use engine_rocks::{BlobRunMode, RocksEngine}; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; use kvproto::{ @@ -573,15 +576,19 @@ mod tests { use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; use resolved_ts::TxnLocks; use test_raftstore::MockRaftStoreRouter; - use tikv::storage::{ - kv::Engine, - txn::tests::{ - must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, must_prewrite_put, - must_prewrite_put_with_txn_soucre, + use tikv::{ + config::DbConfig, + storage::{ + kv::Engine, + txn::tests::{ + must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, + must_prewrite_put, must_prewrite_put_with_txn_soucre, + }, + TestEngineBuilder, }, - TestEngineBuilder, }; use tikv_util::{ + config::ReadableSize, memory::MemoryQuota, sys::thread::ThreadBuildWrapper, worker::{LazyWorker, Runnable}, @@ -1080,4 +1087,57 @@ mod tests { worker.stop(); } + + #[test] + fn 
test_scanner_with_titan() { + let mut cfg = DbConfig::default(); + cfg.titan.enabled = true; + cfg.defaultcf.titan.blob_run_mode = BlobRunMode::Normal; + cfg.defaultcf.titan.min_blob_size = ReadableSize(0); + cfg.writecf.titan.blob_run_mode = BlobRunMode::Normal; + cfg.writecf.titan.min_blob_size = ReadableSize(0); + cfg.lockcf.titan.blob_run_mode = BlobRunMode::Normal; + cfg.lockcf.titan.min_blob_size = ReadableSize(0); + let mut engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap(); + + must_prewrite_put(&mut engine, b"zkey", b"value", b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); + for cf in &[CF_WRITE, CF_DEFAULT] { + engine.kv_engine().unwrap().flush_cf(cf, true).unwrap(); + } + must_prewrite_put(&mut engine, b"zkey", b"value", b"zkey", 150); + must_commit(&mut engine, b"zkey", 150, 160); + for cf in &[CF_WRITE, CF_DEFAULT] { + engine.kv_engine().unwrap().flush_cf(cf, true).unwrap(); + } + + let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + usize::MAX, + usize::MAX, + 1000, + engine.kv_engine(), + ChangeDataRequestKvApi::TiDb, + false, + ); + initializer.checkpoint_ts = 120.into(); + let snap = engine.snapshot(Default::default()).unwrap(); + + let th = pool.spawn(async move { + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + initializer + .async_incremental_scan(snap, Region::default(), memory_quota) + .await + .unwrap(); + }); + + let mut total_entries = 0; + while let Some((event, _)) = block_on(drain.drain().next()) { + if let CdcEvent::Event(e) = event { + total_entries += e.get_entries().get_entries().len(); + } + } + assert_eq!(total_entries, 2); + block_on(th).unwrap(); + worker.stop(); + } } diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 87ccab9e5ab..03d6877a9dd 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -414,7 +414,10 @@ impl TablePropertiesCollector for 
MvccPropertiesCollector { // TsFilter filters sst based on max_ts and min_ts during iterating. // To prevent seeing outdated (GC) records, we should consider // RocksDB delete entry type. - if entry_type != DBEntryType::Put && entry_type != DBEntryType::Delete { + if entry_type != DBEntryType::Put + && entry_type != DBEntryType::Delete + && entry_type != DBEntryType::BlobIndex + { return; } @@ -452,37 +455,43 @@ impl TablePropertiesCollector for MvccPropertiesCollector { self.props.max_row_versions = self.row_versions; } - if self.key_mode == KeyMode::Raw { - let decode_raw_value = ApiV2::decode_raw_value(value); - match decode_raw_value { - Ok(raw_value) => { - if raw_value.is_valid(self.current_ts) { - self.props.num_puts += 1; - } else { - self.props.num_deletes += 1; + if entry_type != DBEntryType::BlobIndex { + if self.key_mode == KeyMode::Raw { + let decode_raw_value = ApiV2::decode_raw_value(value); + match decode_raw_value { + Ok(raw_value) => { + if raw_value.is_valid(self.current_ts) { + self.props.num_puts += 1; + } else { + self.props.num_deletes += 1; + } + if let Some(expire_ts) = raw_value.expire_ts { + self.props.ttl.add(expire_ts); + } } - if let Some(expire_ts) = raw_value.expire_ts { - self.props.ttl.add(expire_ts); + Err(_) => { + self.num_errors += 1; } } - Err(_) => { - self.num_errors += 1; + } else { + let write_type = match Write::parse_type(value) { + Ok(v) => v, + Err(_) => { + self.num_errors += 1; + return; + } + }; + + match write_type { + WriteType::Put => self.props.num_puts += 1, + WriteType::Delete => self.props.num_deletes += 1, + _ => {} } } } else { - let write_type = match Write::parse_type(value) { - Ok(v) => v, - Err(_) => { - self.num_errors += 1; - return; - } - }; - - match write_type { - WriteType::Put => self.props.num_puts += 1, - WriteType::Delete => self.props.num_deletes += 1, - _ => {} - } + // NOTE: if titan is enabled, the entry will always be treated as PUT. + // Be careful if you try to enable Titan on CF_WRITE. 
+ self.props.num_puts += 1; } // Add new row. diff --git a/components/engine_rocks/src/ttl_properties.rs b/components/engine_rocks/src/ttl_properties.rs index 8e6021939bc..62731ac1aa4 100644 --- a/components/engine_rocks/src/ttl_properties.rs +++ b/components/engine_rocks/src/ttl_properties.rs @@ -74,6 +74,7 @@ pub struct TtlPropertiesCollector { impl TablePropertiesCollector for TtlPropertiesCollector { fn add(&mut self, key: &[u8], value: &[u8], entry_type: DBEntryType, _: u64, _: u64) { + // DBEntryType::BlobIndex will be skipped because we can't parse the value. if entry_type != DBEntryType::Put { return; } diff --git a/src/config/mod.rs b/src/config/mod.rs index 9f6bc30ae0d..3f3b39d5f13 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3855,6 +3855,11 @@ impl TikvConfig { self.quota.validate()?; self.causal_ts.validate()?; + // Validate feature TTL with Titan configuration. + if self.rocksdb.titan.enabled && self.storage.enable_ttl { + return Err("Titan is unavailable for feature TTL".to_string().into()); + } + Ok(()) } @@ -4813,6 +4818,7 @@ mod tests { // Check api version. 
{ + tikv_cfg.rocksdb.titan.enabled = false; let cases = [ (ApiVersion::V1, ApiVersion::V1, true), (ApiVersion::V1, ApiVersion::V1ttl, false), @@ -5960,6 +5966,21 @@ mod tests { cfg.validate().unwrap_err(); cfg.rocksdb.writecf.format_version = Some(5); cfg.validate().unwrap(); + + let mut valid_cfg = TikvConfig::default(); + valid_cfg.storage.api_version = 2; + valid_cfg.storage.enable_ttl = true; + valid_cfg.rocksdb.titan.enabled = false; + valid_cfg.validate().unwrap(); + + let mut invalid_cfg = TikvConfig::default(); + invalid_cfg.storage.api_version = 2; + invalid_cfg.storage.enable_ttl = true; + invalid_cfg.rocksdb.titan.enabled = true; + assert_eq!( + invalid_cfg.validate().unwrap_err().to_string(), + "Titan is unavailable for feature TTL" + ); } #[test] diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index 23a0bfcd594..30b14d22274 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -126,7 +126,9 @@ impl TestEngineBuilder { _ => (*cf, RocksCfOptions::default()), }) .collect(); - let engine = RocksEngine::new(&path, None, cfs_opts, self.io_rate_limiter)?; + let resources = cfg_rocksdb.build_resources(Default::default(), EngineType::RaftKv); + let db_opts = cfg_rocksdb.build_opt(&resources, EngineType::RaftKv); + let engine = RocksEngine::new(&path, Some(db_opts), cfs_opts, self.io_rate_limiter)?; Ok(engine) } } From 10b5725b4e489f4eb840bc9f15d262e1cf27fcf3 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 19 Feb 2024 15:43:56 +0800 Subject: [PATCH 147/220] raftstore: add a timeout for sending snapshot (#16466) (#16528) close tikv/tikv#16435 Signed-off-by: glorv Co-authored-by: glorv --- .../raftstore/src/store/peer_storage.rs | 2 + src/server/snap.rs | 63 ++++++++++++++++--- tests/failpoints/cases/test_snap.rs | 38 +++++++++++ 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/components/raftstore/src/store/peer_storage.rs 
b/components/raftstore/src/store/peer_storage.rs index a888929ca98..99897ba551c 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -507,6 +507,8 @@ where *snap_state = SnapState::Relax; *tried_cnt = 0; if self.validate_snap(&s, request_index) { + info!("start sending snapshot"; "region_id" => self.region.get_id(), + "peer_id" => self.peer_id, "request_peer" => to,); return Ok(s); } } diff --git a/src/server/snap.rs b/src/server/snap.rs index 34b32848ad3..4b7540f7fec 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -8,12 +8,14 @@ use std::{ atomic::{AtomicUsize, Ordering}, Arc, }, - time::Duration, + time::{Duration, Instant as StdInstant}, }; use file_system::{IoType, WithIoType}; use futures::{ - future::{Future, TryFutureExt}, + compat::Future01CompatExt, + future::{select, Either, Future, TryFutureExt}, + pin_mut, sink::SinkExt, stream::{Stream, StreamExt, TryStreamExt}, task::{Context, Poll}, @@ -36,8 +38,10 @@ use raftstore::store::{SnapEntry, SnapKey, SnapManager, Snapshot}; use security::SecurityManager; use tikv_kv::RaftExtension; use tikv_util::{ - config::{Tracker, VersionTrack}, + box_err, + config::{Tracker, VersionTrack, MIB}, time::{Instant, UnixSecs}, + timer::GLOBAL_TIMER_HANDLE, worker::Runnable, DeferContext, }; @@ -50,6 +54,25 @@ pub type Callback = Box) + Send>; pub const DEFAULT_POOL_SIZE: usize = 4; +// the default duration before a snapshot sending task is canceled. +const SNAP_SEND_TIMEOUT_DURATION: Duration = Duration::from_secs(600); +// the minimum expected send speed for sending snapshot, this is used to avoid +// timeout too early when the snapshot size is too big. 
+const MIN_SNAP_SEND_SPEED: u64 = MIB; + +#[inline] +fn get_snap_timeout(size: u64) -> Duration { + let timeout = (|| { + fail_point!("snap_send_duration_timeout", |t| -> Duration { + let t = t.unwrap().parse::(); + Duration::from_millis(t.unwrap()) + }); + SNAP_SEND_TIMEOUT_DURATION + })(); + let max_expected_dur = Duration::from_secs(size / MIN_SNAP_SEND_SPEED); + std::cmp::max(timeout, max_expected_dur) +} + /// A task for either receiving Snapshot or sending Snapshot pub enum Task { Recv { @@ -191,10 +214,36 @@ pub fn send_snap( let (sink, receiver) = client.snapshot()?; let send_task = async move { - let mut sink = sink.sink_map_err(Error::from); - sink.send_all(&mut chunks).await?; - sink.close().await?; - let recv_result = receiver.map_err(Error::from).await; + let send_and_recv = async { + let mut sink = sink.sink_map_err(Error::from); + + #[cfg(feature = "failpoints")] + { + let should_delay = (|| { + fail::fail_point!("snap_send_timer_delay", |_| { true }); + false + })(); + if should_delay { + _ = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + Duration::from_secs(1)) + .compat() + .await; + } + } + sink.send_all(&mut chunks).await?; + sink.close().await?; + Ok(receiver.map_err(Error::from).await) + }; + let wait_timeout = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + get_snap_timeout(total_size)) + .compat(); + let recv_result = { + pin_mut!(send_and_recv, wait_timeout); + match select(send_and_recv, wait_timeout).await { + Either::Left((r, _)) => r, + Either::Right((..)) => Err(Error::Other(box_err!("send snapshot timeout"))), + } + }; send_timer.observe_duration(); drop(deregister); drop(client); diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index ca23b4c5a17..8f2ae2f61cc 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -1009,3 +1009,41 @@ fn test_retry_corrupted_snapshot() { must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); } + +#[test] +fn 
 test_send_snapshot_timeout() { + let mut cluster = new_server_cluster(1, 5); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(8); + cluster.cfg.raft_store.merge_max_log_gap = 3; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.stop_node(4); + cluster.stop_node(5); + (0..10).for_each(|_| cluster.must_put(b"k2", b"v2")); + // Sleep for a while to ensure all logs are compacted. + thread::sleep(Duration::from_millis(100)); + + fail::cfg("snap_send_duration_timeout", "return(100)").unwrap(); + + // Let store 4 inform leader to generate a snapshot. + cluster.run_node(4).unwrap(); + must_get_equal(&cluster.get_engine(4), b"k2", b"v2"); + + // add a delay to let send snapshot fail due to timeout. + fail::cfg("snap_send_timer_delay", "return(1000)").unwrap(); + cluster.run_node(5).unwrap(); + thread::sleep(Duration::from_millis(150)); + must_get_none(&cluster.get_engine(5), b"k2"); + + // only delay once, the snapshot should succeed after retry. 
+ fail::cfg("snap_send_timer_delay", "1*return(1000)").unwrap(); + thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(5), b"k2", b"v2"); + + fail::remove("snap_send_timer_delay"); + fail::remove("snap_send_duration_timeout"); +} From 43066fe7369806dbb6a9a12b3b0154b95255f703 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 20 Feb 2024 11:21:26 +0800 Subject: [PATCH 148/220] raftstore: real batch ingest sst files in different region (#16298) (#16316) ref tikv/tikv#16267 raftstore: real batch ingest sst files in different region Signed-off-by: 3pointer Co-authored-by: 3pointer Co-authored-by: tonyxuqqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/apply.rs | 49 ++++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 1f2e4c3f5c3..cafb3660d9f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -555,7 +555,8 @@ where delegate.unfinished_write_seqno.push(seqno); } self.prepare_for(delegate); - delegate.last_flush_applied_index = delegate.apply_state.get_applied_index() + delegate.last_flush_applied_index = delegate.apply_state.get_applied_index(); + delegate.has_pending_ssts = false; } self.kv_wb_last_bytes = self.kv_wb().data_size() as u64; self.kv_wb_last_keys = self.kv_wb().count() as u64; @@ -791,7 +792,7 @@ pub fn notify_stale_req_with_msg(term: u64, msg: String, cb: impl ErrorCallback) } /// Checks if a write is needed to be issued before handling the command. -fn should_write_to_engine(cmd: &RaftCmdRequest) -> bool { +fn should_write_to_engine(has_pending_writes: bool, cmd: &RaftCmdRequest) -> bool { if cmd.has_admin_request() { match cmd.get_admin_request().get_cmd_type() { // ComputeHash require an up to date snapshot. 
@@ -809,7 +810,7 @@ fn should_write_to_engine(cmd: &RaftCmdRequest) -> bool { if req.has_delete_range() { return true; } - if req.has_ingest_sst() { + if req.has_ingest_sst() && has_pending_writes { return true; } } @@ -1043,6 +1044,8 @@ where buckets: Option, unfinished_write_seqno: Vec, + + has_pending_ssts: bool, } impl ApplyDelegate @@ -1077,6 +1080,7 @@ where trace: ApplyMemoryTrace::default(), buckets: None, unfinished_write_seqno: vec![], + has_pending_ssts: false, } } @@ -1227,9 +1231,15 @@ where if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { self.priority = Priority::Low; } + if self.has_pending_ssts { + // we are in low priority handler and to avoid overlapped ssts with same region + // just return Yield + return ApplyResult::Yield; + } let mut has_unflushed_data = self.last_flush_applied_index != self.apply_state.get_applied_index(); - if (has_unflushed_data && should_write_to_engine(&cmd) + if (has_unflushed_data + && should_write_to_engine(!apply_ctx.kv_wb().is_empty(), &cmd) || apply_ctx.kv_wb().should_write_to_engine()) && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) { @@ -1997,6 +2007,7 @@ where match ctx.importer.validate(sst) { Ok(meta_info) => { ctx.pending_ssts.push(meta_info.clone()); + self.has_pending_ssts = true; ssts.push(meta_info) } Err(e) => { @@ -2005,7 +2016,6 @@ where panic!("{} ingest {:?}: {:?}", self.tag, sst, e); } }; - Ok(()) } } @@ -4646,6 +4656,7 @@ where self.apply_ctx.flush(); for fsm in fsms.iter_mut().flatten() { fsm.delegate.last_flush_applied_index = fsm.delegate.apply_state.get_applied_index(); + fsm.delegate.has_pending_ssts = false; fsm.delegate.update_memory_trace(&mut self.trace_event); } MEMTRACE_APPLYS.trace(mem::take(&mut self.trace_event)); @@ -5193,7 +5204,7 @@ mod tests { req.set_ingest_sst(IngestSstRequest::default()); let mut cmd = RaftCmdRequest::default(); cmd.mut_requests().push(req); - assert_eq!(should_write_to_engine(&cmd), true); + 
assert_eq!(should_write_to_engine(true, &cmd), true); assert_eq!(should_sync_log(&cmd), true); // Normal command @@ -5207,7 +5218,17 @@ mod tests { let mut req = RaftCmdRequest::default(); req.mut_admin_request() .set_cmd_type(AdminCmdType::ComputeHash); - assert_eq!(should_write_to_engine(&req), true); + assert_eq!(should_write_to_engine(true, &req), true); + assert_eq!(should_write_to_engine(false, &req), true); + + // DeleteRange command + let mut req = Request::default(); + req.set_cmd_type(CmdType::DeleteRange); + req.set_delete_range(DeleteRangeRequest::default()); + let mut cmd = RaftCmdRequest::default(); + cmd.mut_requests().push(req); + assert_eq!(should_write_to_engine(true, &cmd), true); + assert_eq!(should_write_to_engine(false, &cmd), true); // IngestSst command let mut req = Request::default(); @@ -5215,7 +5236,8 @@ mod tests { req.set_ingest_sst(IngestSstRequest::default()); let mut cmd = RaftCmdRequest::default(); cmd.mut_requests().push(req); - assert_eq!(should_write_to_engine(&cmd), true); + assert_eq!(should_write_to_engine(true, &cmd), true); + assert_eq!(should_write_to_engine(false, &cmd), false); } #[test] @@ -6214,7 +6236,7 @@ mod tests { // nomral put command, so the first apple_res.exec_res should be empty. let apply_res = fetch_apply_res(&rx); assert!(apply_res.exec_res.is_empty()); - // The region was rescheduled low-priority becasuee of ingest command, + // The region was rescheduled low-priority because of ingest command, // only put entry has been applied; let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.applied_term, 3); @@ -6853,9 +6875,12 @@ mod tests { assert!(!resp.get_header().has_error(), "{:?}", resp); } let mut res = fetch_apply_res(&rx); - // There may be one or two ApplyRes which depends on whether these two apply - // msgs are batched together. - if res.apply_state.get_applied_index() == 3 { + // There are five entries [put, ingest, put, ingest, put] in one region. 
+ // so the apply results should be notified at index 2/4. + if res.apply_state.get_applied_index() == 2 { + res = fetch_apply_res(&rx); + } + if res.apply_state.get_applied_index() == 4 { res = fetch_apply_res(&rx); } assert_eq!(res.apply_state.get_applied_index(), 5); From 154c02f3dca41c795c1bb8f019c6373dcdcca62d Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 20 Feb 2024 14:15:27 +0800 Subject: [PATCH 149/220] *: fix issue of modify resolved-ts.advance-ts-interval from 5s to 2s is not work (#15836) (#16232) close tikv/tikv#15835 Signed-off-by: crazycs520 Co-authored-by: crazycs520 Co-authored-by: crazycs --- components/resolved_ts/src/advance.rs | 5 +---- components/resolved_ts/src/endpoint.rs | 8 ++------ 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index dd6e9c2002c..856d042a75d 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -51,7 +51,6 @@ const DEFAULT_GRPC_MIN_MESSAGE_SIZE_TO_COMPRESS: usize = 4096; pub struct AdvanceTsWorker { pd_client: Arc, - advance_ts_interval: Duration, timer: SteadyTimer, worker: Runtime, scheduler: Scheduler, @@ -65,7 +64,6 @@ pub struct AdvanceTsWorker { impl AdvanceTsWorker { pub fn new( - advance_ts_interval: Duration, pd_client: Arc, scheduler: Scheduler, concurrency_manager: ConcurrencyManager, @@ -81,7 +79,6 @@ impl AdvanceTsWorker { scheduler, pd_client, worker, - advance_ts_interval, timer: SteadyTimer::default(), concurrency_manager, last_pd_tso: Arc::new(std::sync::Mutex::new(None)), @@ -104,7 +101,7 @@ impl AdvanceTsWorker { let timeout = self.timer.delay(advance_ts_interval); let min_timeout = self.timer.delay(cmp::min( DEFAULT_CHECK_LEADER_TIMEOUT_DURATION, - self.advance_ts_interval, + advance_ts_interval, )); let last_pd_tso = self.last_pd_tso.clone(); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 
9de21b27d9e..a668d8b0f52 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -658,12 +658,8 @@ where let meta = store_meta.lock().unwrap(); (meta.region_read_progress().clone(), meta.store_id()) }; - let advance_worker = AdvanceTsWorker::new( - cfg.advance_ts_interval.0, - pd_client.clone(), - scheduler.clone(), - concurrency_manager, - ); + let advance_worker = + AdvanceTsWorker::new(pd_client.clone(), scheduler.clone(), concurrency_manager); let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, cdc_handle); let store_resolver_gc_interval = Duration::from_secs(60); let leader_resolver = LeadershipResolver::new( From 3448ddbc8b783adebb27d1647e9b568bf65dd773 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 20 Feb 2024 17:25:57 +0800 Subject: [PATCH 150/220] copr, json: handle u64 json correctly (#16513) (#16538) close tikv/tikv#16512 Signed-off-by: Yang Keao Co-authored-by: Yang Keao Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../tidb_query_datatype/src/codec/mysql/json/binary.rs | 3 ++- .../src/codec/mysql/json/json_contains.rs | 1 + .../src/codec/mysql/json/json_type.rs | 3 ++- .../tidb_query_datatype/src/codec/mysql/json/serde.rs | 10 +++++++--- components/tidb_query_expr/src/impl_json.rs | 2 +- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index c965247b8da..5028967ac59 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -167,7 +167,8 @@ mod tests { (r#"["d1","d2"]"#, JsonType::Array), (r#"-3"#, JsonType::I64), (r#"3"#, JsonType::I64), - (r#"18446744073709551615"#, JsonType::Double), + (r#"18446744073709551615"#, JsonType::U64), + (r#"18446744073709551616"#, JsonType::Double), (r#"3.0"#, JsonType::Double), (r#"null"#, 
JsonType::Literal), (r#"true"#, JsonType::Literal), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs index 46de1af9e0b..db8ec3331eb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs @@ -86,6 +86,7 @@ mod tests { (r#"{"a":{"a":1},"b":2}"#, r#"{"b":3}"#, false), (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[1,{"a":[3]}]"#, true), (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[10,{"a":[3]}]"#, false), + (r#"[9223372036854775807]"#, r#"9223372036854775808"#, false), ]; for (i, (js, value, expected)) in test_cases.drain(..).enumerate() { let j = js.parse(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs index 70321080ef7..bf48791b298 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs @@ -65,7 +65,8 @@ mod tests { (r#"["a", "b"]"#, JSON_TYPE_ARRAY), ("-5", JSON_TYPE_INTEGER), ("5", JSON_TYPE_INTEGER), - ("18446744073709551615", JSON_TYPE_DOUBLE), + ("18446744073709551615", JSON_TYPE_UNSIGNED_INTEGER), + ("18446744073709551616", JSON_TYPE_DOUBLE), ("5.6", JSON_TYPE_DOUBLE), (r#""hello, world""#, JSON_TYPE_STRING), ("true", JSON_TYPE_BOOLEAN), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index 4bf487eefc1..598e27347c8 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -192,10 +192,10 @@ impl<'de> Visitor<'de> for JsonVisitor { where E: de::Error, { - if v > (i64::MAX as u64) { - Ok(Json::from_f64(v as f64).map_err(de::Error::custom)?) 
+ if v < i64::MAX as u64 { + Json::from_i64(v as i64).map_err(de::Error::custom) } else { - Ok(Json::from_i64(v as i64).map_err(de::Error::custom)?) + Json::from_u64(v).map_err(de::Error::custom) } } @@ -286,6 +286,10 @@ mod tests { r#"9223372036854775807"#, Json::from_i64(9223372036854775807), ), + ( + r#"9223372036854775808"#, + Json::from_u64(9223372036854775808), + ), ]; for (json_str, json) in cases { diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 68132ae08e2..f24dea0e5c8 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -493,7 +493,7 @@ mod tests { (Some(r#"null"#), Some("NULL")), (Some(r#"-3"#), Some("INTEGER")), (Some(r#"3"#), Some("INTEGER")), - (Some(r#"9223372036854775808"#), Some("DOUBLE")), + (Some(r#"9223372036854775808"#), Some("UNSIGNED INTEGER")), (Some(r#"3.14"#), Some("DOUBLE")), (Some(r#"[1, 2, 3]"#), Some("ARRAY")), (Some(r#"{"name": 123}"#), Some("OBJECT")), From 67613e3ac3e4f7f5db856da1d8f054ed30c596df Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 20 Feb 2024 20:54:26 +0800 Subject: [PATCH 151/220] snapshot_backup: enhanced prepare stage (#15946) (#16354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#15739 It contains: - A coprocessor that can fully reject all admin and ingest commands. - A new region-leveled wait apply implementation, which allow us to wait all pending commands to be applied. 
Signed-off-by: Yu Juncen Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Yu Juncen --- Cargo.lock | 4 +- Cargo.toml | 4 +- components/backup/src/disk_snap.rs | 372 +++++++++++++++++ components/backup/src/endpoint.rs | 9 +- components/backup/src/lib.rs | 1 + components/backup/src/service.rs | 174 +++++--- .../src/operation/disk_snapshot_backup.rs | 37 ++ components/raftstore-v2/src/operation/mod.rs | 2 + components/raftstore-v2/src/router/mod.rs | 1 + .../raftstore/src/coprocessor/dispatcher.rs | 4 + components/raftstore/src/coprocessor/error.rs | 4 +- components/raftstore/src/coprocessor/mod.rs | 13 +- components/raftstore/src/errors.rs | 9 + components/raftstore/src/store/fsm/peer.rs | 57 ++- components/raftstore/src/store/metrics.rs | 50 +++ components/raftstore/src/store/mod.rs | 8 +- components/raftstore/src/store/msg.rs | 12 +- components/raftstore/src/store/peer.rs | 71 +++- .../raftstore/src/store/snapshot_backup.rs | 391 ++++++++++++++++++ .../raftstore/src/store/unsafe_recovery.rs | 49 +-- components/server/src/server.rs | 31 +- components/server/src/server2.rs | 8 +- components/snap_recovery/src/services.rs | 56 ++- components/sst_importer/src/errors.rs | 1 + components/test_backup/Cargo.toml | 2 + components/test_backup/src/disk_snap.rs | 246 +++++++++++ components/test_backup/src/lib.rs | 2 + .../test_raftstore/src/transport_simulate.rs | 6 + components/test_raftstore/src/util.rs | 1 + components/test_util/src/lib.rs | 19 + components/tikv_util/src/time.rs | 3 +- src/import/sst_service.rs | 11 +- tests/Cargo.toml | 6 +- tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_disk_snap_br.rs | 42 ++ tests/integrations/backup/disk_snap.rs | 206 +++++++++ tests/integrations/backup/mod.rs | 2 + tests/integrations/import/test_sst_service.rs | 22 +- .../raftstore/test_snap_recovery.rs | 36 +- 39 files changed, 1762 insertions(+), 211 deletions(-) create mode 100644 components/backup/src/disk_snap.rs create mode 100644 
components/raftstore-v2/src/operation/disk_snapshot_backup.rs create mode 100644 components/raftstore/src/store/snapshot_backup.rs create mode 100644 components/test_backup/src/disk_snap.rs create mode 100644 tests/failpoints/cases/test_disk_snap_br.rs create mode 100644 tests/integrations/backup/disk_snap.rs diff --git a/Cargo.lock b/Cargo.lock index dc0ebb334a6..e96a72afebb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2933,7 +2933,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#87bebcc0d071a18cbbd94a4fc02de9c4988af815" +source = "git+https://github.com/pingcap/kvproto.git?branch=release-7.5#c4a09794a10c8564d8b4645f45b4092b8ff0b29c" dependencies = [ "futures 0.3.15", "grpcio", @@ -6003,6 +6003,7 @@ dependencies = [ "collections", "concurrency_manager", "crc64fast", + "engine_rocks", "engine_traits", "external_storage_export", "file_system", @@ -6012,6 +6013,7 @@ dependencies = [ "grpcio", "kvproto", "protobuf", + "raftstore", "rand 0.8.5", "tempfile", "test_raftstore", diff --git a/Cargo.toml b/Cargo.toml index 8abff4f9ca8..ff07c91f8d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -336,7 +336,7 @@ server = { path = "components/server" } service = { path = "components/service" } snap_recovery = { path = "components/snap_recovery", default-features = false } sst_importer = { path = "components/sst_importer", default-features = false } -test_backup = { path = "components/test_backup" } +test_backup = { path = "components/test_backup", default-features = false } test_coprocessor = { path = "components/test_coprocessor", default-features = false } example_coprocessor_plugin = { path = "components/test_coprocessor_plugin/example_plugin" } test_pd = { path = "components/test_pd" } @@ -365,7 +365,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } 
grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } tipb = { git = "https://github.com/pingcap/tipb.git" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { git = "https://github.com/pingcap/kvproto.git", branch = "release-7.5" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } tokio-executor = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } diff --git a/components/backup/src/disk_snap.rs b/components/backup/src/disk_snap.rs new file mode 100644 index 00000000000..94d956cc11c --- /dev/null +++ b/components/backup/src/disk_snap.rs @@ -0,0 +1,372 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. +//! This module contains things about disk snapshot. + +use std::{ + future::Pending, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + task::Poll, + time::Duration, +}; + +use futures::future; +use futures_util::{ + future::{BoxFuture, FutureExt}, + sink::SinkExt, + stream::{AbortHandle, Abortable, StreamExt}, +}; +use grpcio::{RpcStatus, RpcStatusCode, WriteFlags}; +use kvproto::{ + brpb::{ + PrepareSnapshotBackupEventType as PEvnT, PrepareSnapshotBackupRequest as PReq, + PrepareSnapshotBackupRequestType as PReqT, PrepareSnapshotBackupResponse as PResp, + }, + errorpb::{self, StaleCommand}, + metapb::Region, +}; +use raftstore::store::{ + snapshot_backup::{ + AbortReason, PrepareDiskSnapObserver, SnapshotBrHandle, SnapshotBrWaitApplyRequest, + }, + SnapshotBrWaitApplySyncer, +}; +use tikv_util::{sys::thread::ThreadBuildWrapper, warn, Either}; +use tokio::{ + runtime::{Handle, Runtime}, + sync::oneshot, +}; +use tokio_stream::Stream; + +const DEFAULT_RT_THREADS: usize = 2; + +type Result = std::result::Result; + +enum Error { + Uninitialized, + LeaseExpired, + /// Wait apply has been aborted. 
+ /// When the `reason` is `None`, implies the request itself has been + /// canceled (seldom) due to message lost or something. + WaitApplyAborted(Option), + RaftStore(raftstore::Error), +} + +enum HandleErr { + AbortStream(RpcStatus), + SendErrResp(errorpb::Error), +} + +pub struct ResultSink(grpcio::DuplexSink); + +impl From> for ResultSink { + fn from(value: grpcio::DuplexSink) -> Self { + Self(value) + } +} + +impl ResultSink { + async fn send( + mut self, + result: Result, + error_extra_info: impl FnOnce(&mut PResp), + ) -> grpcio::Result { + match result { + // Note: should we batch here? + Ok(item) => self.0.send((item, WriteFlags::default())).await?, + Err(err) => match err.into() { + HandleErr::AbortStream(status) => { + self.0.fail(status.clone()).await?; + return Err(grpcio::Error::RpcFinished(Some(status))); + } + HandleErr::SendErrResp(err) => { + let mut resp = PResp::new(); + error_extra_info(&mut resp); + resp.set_error(err); + self.0.send((resp, WriteFlags::default())).await?; + } + }, + } + Ok(self) + } +} + +impl From for HandleErr { + fn from(value: Error) -> Self { + match value { + Error::Uninitialized => HandleErr::AbortStream(RpcStatus::with_message( + grpcio::RpcStatusCode::UNAVAILABLE, + "coprocessor not initialized".to_owned(), + )), + Error::RaftStore(r) => HandleErr::SendErrResp(errorpb::Error::from(r)), + Error::WaitApplyAborted(reason) => HandleErr::SendErrResp({ + let mut err = errorpb::Error::new(); + err.set_message(format!("wait apply has been aborted, perhaps epoch not match or leadership changed, note = {:?}", reason)); + match reason { + Some(AbortReason::EpochNotMatch(enm)) => err.set_epoch_not_match(enm), + Some(AbortReason::StaleCommand { .. 
}) => { + err.set_stale_command(StaleCommand::new()) + } + _ => {} + } + err + }), + Error::LeaseExpired => HandleErr::AbortStream(RpcStatus::with_message( + grpcio::RpcStatusCode::FAILED_PRECONDITION, + "the lease has expired, you may not send `wait_apply` because it is no meaning" + .to_string(), + )), + } + } +} + +#[derive(Clone)] +pub struct Env { + pub(crate) handle: SR, + rejector: Arc, + active_stream: Arc, + // Left: a shared tokio runtime. + // Right: a hosted runtime(usually for test cases). + runtime: Either>, +} + +impl Env { + pub fn new( + handle: SR, + rejector: Arc, + runtime: Option, + ) -> Self { + let runtime = match runtime { + None => Either::Right(Self::default_runtime()), + Some(rt) => Either::Left(rt), + }; + Self { + handle, + rejector, + active_stream: Arc::new(AtomicU64::new(0)), + runtime, + } + } + + pub fn active_stream(&self) -> u64 { + self.active_stream.load(Ordering::SeqCst) + } + + pub fn get_async_runtime(&self) -> &Handle { + match &self.runtime { + Either::Left(h) => h, + Either::Right(rt) => rt.handle(), + } + } + + fn check_initialized(&self) -> Result<()> { + if !self.rejector.initialized() { + return Err(Error::Uninitialized); + } + Ok(()) + } + + fn check_rejected(&self) -> Result<()> { + self.check_initialized()?; + if self.rejector.allowed() { + return Err(Error::LeaseExpired); + } + Ok(()) + } + + fn update_lease(&self, lease_dur: Duration) -> Result { + self.check_initialized()?; + let mut event = PResp::new(); + event.set_ty(PEvnT::UpdateLeaseResult); + event.set_last_lease_is_valid(self.rejector.update_lease(lease_dur)); + Ok(event) + } + + fn reset(&self) -> PResp { + let rejected = !self.rejector.allowed(); + self.rejector.reset(); + let mut event = PResp::new(); + event.set_ty(PEvnT::UpdateLeaseResult); + event.set_last_lease_is_valid(rejected); + event + } + + fn default_runtime() -> Arc { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(DEFAULT_RT_THREADS) + .enable_all() + 
.with_sys_hooks() + .thread_name("snap_br_backup_prepare") + .build() + .unwrap(); + Arc::new(rt) + } +} + +pub struct StreamHandleLoop { + pending_regions: Vec)>>, + env: Env, + aborted: Abortable>, +} + +impl Drop for StreamHandleLoop { + fn drop(&mut self) { + self.env.active_stream.fetch_sub(1, Ordering::SeqCst); + } +} + +enum StreamHandleEvent { + Req(PReq), + WaitApplyDone(Region, Result<()>), + ConnectionGone(Option), + Abort, +} + +impl StreamHandleLoop { + pub fn new(env: Env) -> (Self, AbortHandle) { + let (aborted, handle) = futures_util::future::abortable(std::future::pending()); + env.active_stream.fetch_add(1, Ordering::SeqCst); + let this = Self { + env, + aborted, + pending_regions: vec![], + }; + (this, handle) + } + + fn async_wait_apply(&mut self, region: &Region) -> BoxFuture<'static, (Region, Result<()>)> { + if let Err(err) = self.env.check_rejected() { + return Box::pin(future::ready((region.clone(), Err(err)))); + } + + let (tx, rx) = oneshot::channel(); + let syncer = SnapshotBrWaitApplySyncer::new(region.id, tx); + let handle = self.env.handle.clone(); + let region = region.clone(); + let epoch = region.get_region_epoch().clone(); + let id = region.get_id(); + let send_res = handle + .send_wait_apply(id, SnapshotBrWaitApplyRequest::strict(syncer, epoch)) + .map_err(Error::RaftStore); + Box::pin( + async move { + send_res?; + rx.await + .map_err(|_| Error::WaitApplyAborted(None)) + .and_then(|report| match report.aborted { + Some(reason) => Err(Error::WaitApplyAborted(Some(reason))), + None => Ok(()), + }) + } + .map(move |res| (region, res)), + ) + } + + async fn next_event( + &mut self, + input: &mut (impl Stream> + Unpin), + ) -> StreamHandleEvent { + let pending_regions = &mut self.pending_regions; + let wait_applies = future::poll_fn(|cx| { + let selected = pending_regions.iter_mut().enumerate().find_map(|(i, fut)| { + match fut.poll_unpin(cx) { + Poll::Ready(r) => Some((i, r)), + Poll::Pending => None, + } + }); + match selected { + 
 Some((i, region)) => { + // We have polled the future (and made sure it is ready) before, it is + // safe to drop this future directly. + let _ = pending_regions.swap_remove(i); + region.into() + } + None => Poll::Pending, + } + }); + + tokio::select! { + wres = wait_applies => { + StreamHandleEvent::WaitApplyDone(wres.0, wres.1) + } + req = input.next() => { + match req { + Some(Ok(req)) => StreamHandleEvent::Req(req), + Some(Err(err)) => StreamHandleEvent::ConnectionGone(Some(err)), + None => StreamHandleEvent::ConnectionGone(None) + } + } + _ = &mut self.aborted => { + StreamHandleEvent::Abort + } + } + } + + pub async fn run( + mut self, + mut input: impl Stream> + Unpin, + mut sink: ResultSink, + ) -> grpcio::Result<()> { + loop { + match self.next_event(&mut input).await { + StreamHandleEvent::Req(req) => match req.get_ty() { + PReqT::UpdateLease => { + let lease_dur = Duration::from_secs(req.get_lease_in_seconds()); + sink = sink + .send(self.env.update_lease(lease_dur), |resp| { + resp.set_ty(PEvnT::UpdateLeaseResult); + }) + .await?; + } + PReqT::WaitApply => { + let regions = req.get_regions(); + for region in regions { + let res = self.async_wait_apply(region); + self.pending_regions.push(res); + } + } + PReqT::Finish => { + sink.send(Ok(self.env.reset()), |_| {}) + .await? 
+ .0 + .close() + .await?; + return Ok(()); + } + }, + StreamHandleEvent::WaitApplyDone(region, res) => { + let resp = res.map(|_| { + let mut resp = PResp::new(); + resp.set_region(region.clone()); + resp.set_ty(PEvnT::WaitApplyDone); + resp + }); + sink = sink + .send(resp, |resp| { + resp.set_ty(PEvnT::WaitApplyDone); + resp.set_region(region); + }) + .await?; + } + StreamHandleEvent::ConnectionGone(err) => { + warn!("the client has gone, aborting loop"; "err" => ?err); + return match err { + None => Ok(()), + Some(err) => Err(err), + }; + } + StreamHandleEvent::Abort => { + warn!("Aborted disk snapshot prepare loop by the server."); + return sink + .0 + .fail(RpcStatus::with_message( + RpcStatusCode::CANCELLED, + "the loop has been aborted by server".to_string(), + )) + .await; + } + } + } + } +} diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 956455e523e..8ffd229e813 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -44,7 +44,7 @@ use tikv_util::{ warn, worker::Runnable, }; -use tokio::runtime::Runtime; +use tokio::runtime::{Handle, Runtime}; use txn_types::{Key, Lock, TimeStamp}; use crate::{ @@ -1155,6 +1155,13 @@ impl Endpoint { )); } } + + /// Get the internal handle of the io thread pool used by the backup + /// endpoint. This is mainly shared for disk snapshot backup (so they + /// don't need to spawn on the gRPC pool.) 
+ pub fn io_pool_handle(&self) -> &Handle { + self.io_pool.handle() + } } impl Runnable for Endpoint { diff --git a/components/backup/src/lib.rs b/components/backup/src/lib.rs index bf333424603..30345665369 100644 --- a/components/backup/src/lib.rs +++ b/components/backup/src/lib.rs @@ -5,6 +5,7 @@ #[allow(unused_extern_crates)] extern crate tikv_alloc; +pub mod disk_snap; mod endpoint; mod errors; mod metrics; diff --git a/components/backup/src/service.rs b/components/backup/src/service.rs index 237234c061e..bb419e9702a 100644 --- a/components/backup/src/service.rs +++ b/components/backup/src/service.rs @@ -1,88 +1,85 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::atomic::*; +use std::sync::{atomic::*, Arc, Mutex}; -use engine_traits::{KvEngine, RaftEngine}; use futures::{channel::mpsc, FutureExt, SinkExt, StreamExt, TryFutureExt}; +use futures_util::stream::AbortHandle; use grpcio::{self, *}; use kvproto::brpb::*; -use raftstore::store::{ - fsm::store::RaftRouter, - msg::{PeerMsg, SignificantMsg}, -}; -use tikv_util::{error, info, worker::*}; +use raftstore::store::snapshot_backup::SnapshotBrHandle; +use tikv_util::{error, info, warn, worker::*}; use super::Task; +use crate::disk_snap::{self, StreamHandleLoop}; /// Service handles the RPC messages for the `Backup` service. #[derive(Clone)] -pub struct Service { +pub struct Service { scheduler: Scheduler, - router: Option>, + snap_br_env: disk_snap::Env, + abort_last_req: Arc>>, } -impl Service +impl Service where - EK: KvEngine, - ER: RaftEngine, + H: SnapshotBrHandle, { - // Create a new backup service without router, this used for raftstore v2. - // because we don't have RaftStoreRouter any more. - pub fn new(scheduler: Scheduler) -> Self { + /// Create a new backup service. + pub fn new(scheduler: Scheduler, env: disk_snap::Env) -> Self { Service { scheduler, - router: None, - } - } - - // Create a new backup service with router, this used for raftstore v1. 
- pub fn with_router(scheduler: Scheduler, router: RaftRouter) -> Self { - Service { - scheduler, - router: Some(router), + snap_br_env: env, + abort_last_req: Arc::default(), } } } -impl Backup for Service +impl Backup for Service where - EK: KvEngine, - ER: RaftEngine, + H: SnapshotBrHandle + 'static, { + /// Check a region whether there is pending admin requests(including pending + /// merging). + /// + /// In older versions of disk snapshot backup, this will be called after we + /// paused all scheduler. + /// + /// This is kept for compatibility with previous versions. fn check_pending_admin_op( &mut self, ctx: RpcContext<'_>, _req: CheckAdminRequest, mut sink: ServerStreamingSink, ) { - let (tx, rx) = mpsc::unbounded(); - match &self.router { - Some(router) => { - router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) - }); - let send_task = async move { - let mut s = rx.map(|resp| Ok((resp, WriteFlags::default()))); - sink.send_all(&mut s).await?; - sink.close().await?; - Ok(()) - } - .map(|res: Result<()>| match res { - Ok(_) => { - info!("check admin closed"); - } - Err(e) => { - error!("check admin canceled"; "error" => ?e); - } - }); - ctx.spawn(send_task); + let handle = self.snap_br_env.handle.clone(); + let tokio_handle = self.snap_br_env.get_async_runtime().clone(); + let peer = ctx.peer(); + let task = async move { + let (tx, rx) = mpsc::unbounded(); + if let Err(err) = handle.broadcast_check_pending_admin(tx) { + return sink + .fail(RpcStatus::with_message( + RpcStatusCode::INTERNAL, + format!("{err}"), + )) + .await; } - None => { - // check pending admin reqeust is used for EBS Backup. - // for raftstore v2. we don't need it for now. 
so just return unimplemented - unimplemented_call!(ctx, sink) + sink.send_all(&mut rx.map(|resp| Ok((resp, WriteFlags::default())))) + .await?; + sink.close().await?; + Ok(()) + }; + + tokio_handle.spawn(async move { + match task.await { + Err(err) => { + warn!("check admin canceled"; "peer" => %peer, "err" => %err); + } + Ok(()) => { + info!("check admin closed"; "peer" => %peer); + } } - } + }); } fn backup( @@ -137,25 +134,90 @@ where ctx.spawn(send_task); } + + /// The new method for preparing a disk snapshot backup. + /// Generally there will be some steps for the client to do: + /// 1. Establish a `prepare_snapshot_backup` connection. + /// 2. Send a initial `UpdateLease`. And we should update the lease + /// periodically. + /// 3. Send `WaitApply` to each leader peer in this store. + /// 4. Once `WaitApply` for all regions have done, we can take disk + /// snapshot. + /// 5. Once all snapshots have been taken, send `Finalize` to stop. + fn prepare_snapshot_backup( + &mut self, + ctx: grpcio::RpcContext<'_>, + stream: grpcio::RequestStream, + sink: grpcio::DuplexSink, + ) { + let (l, new_cancel) = StreamHandleLoop::new(self.snap_br_env.clone()); + let peer = ctx.peer(); + // Note: should we disconnect here once there are more than one stream...? 
+ // Generally once two streams enter here, one may exit + info!("A new prepare snapshot backup stream created!"; + "peer" => %peer, + "stream_count" => %self.snap_br_env.active_stream(), + ); + let abort_last_req = self.abort_last_req.clone(); + self.snap_br_env.get_async_runtime().spawn(async move { + { + let mut lock = abort_last_req.lock().unwrap(); + if let Some(cancel) = &*lock { + cancel.abort(); + } + *lock = Some(new_cancel); + } + let res = l.run(stream, sink.into()).await; + info!("stream closed; probably everything is done or a problem cannot be retried happens"; + "result" => ?res, "peer" => %peer); + }); + } } #[cfg(test)] mod tests { use std::{sync::Arc, time::Duration}; - use engine_rocks::RocksEngine; use external_storage_export::make_local_backend; use tikv::storage::txn::tests::{must_commit, must_prewrite_put}; use tikv_util::worker::{dummy_scheduler, ReceiverWrapper}; use txn_types::TimeStamp; use super::*; - use crate::endpoint::tests::*; + use crate::{disk_snap::Env, endpoint::tests::*}; + + #[derive(Clone)] + struct PanicHandle; + + impl SnapshotBrHandle for PanicHandle { + fn send_wait_apply( + &self, + _region: u64, + _req: raftstore::store::snapshot_backup::SnapshotBrWaitApplyRequest, + ) -> raftstore::Result<()> { + panic!("this case shouldn't call this!") + } + + fn broadcast_wait_apply( + &self, + _req: raftstore::store::snapshot_backup::SnapshotBrWaitApplyRequest, + ) -> raftstore::Result<()> { + panic!("this case shouldn't call this!") + } + + fn broadcast_check_pending_admin( + &self, + _tx: mpsc::UnboundedSender, + ) -> raftstore::Result<()> { + panic!("this case shouldn't call this!") + } + } fn new_rpc_suite() -> (Server, BackupClient, ReceiverWrapper) { let env = Arc::new(EnvBuilder::new().build()); let (scheduler, rx) = dummy_scheduler(); - let backup_service = super::Service::::new(scheduler); + let backup_service = + super::Service::new(scheduler, Env::new(PanicHandle, Default::default(), None)); let builder = 
ServerBuilder::new(env.clone()).register_service(create_backup(backup_service)); let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); diff --git a/components/raftstore-v2/src/operation/disk_snapshot_backup.rs b/components/raftstore-v2/src/operation/disk_snapshot_backup.rs new file mode 100644 index 00000000000..1e033248b23 --- /dev/null +++ b/components/raftstore-v2/src/operation/disk_snapshot_backup.rs @@ -0,0 +1,37 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use futures::channel::mpsc::UnboundedSender; +use kvproto::brpb::CheckAdminResponse; +use raftstore::store::snapshot_backup::{SnapshotBrHandle, SnapshotBrWaitApplyRequest}; +use tikv_util::box_err; + +const REASON: &str = "Raftstore V2 doesn't support snapshot backup yet."; + +#[derive(Clone, Copy)] +pub struct UnimplementedHandle; + +impl SnapshotBrHandle for UnimplementedHandle { + fn send_wait_apply(&self, _region: u64, _req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + Err(crate::Error::Other(box_err!( + "send_wait_apply not implemented; note: {}", + REASON + ))) + } + + fn broadcast_wait_apply(&self, _req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + Err(crate::Error::Other(box_err!( + "broadcast_wait_apply not implemented; note: {}", + REASON + ))) + } + + fn broadcast_check_pending_admin( + &self, + _tx: UnboundedSender, + ) -> crate::Result<()> { + Err(crate::Error::Other(box_err!( + "broadcast_check_pending_admin not implemented; note: {}", + REASON + ))) + } +} diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 9ccf08d6d54..6c43fcdaa3b 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -2,6 +2,7 @@ mod bucket; mod command; +mod disk_snapshot_backup; mod life; mod misc; mod pd; @@ -17,6 +18,7 @@ pub use command::{ SplitFlowControl, SplitPendingAppend, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX, }; +pub use 
disk_snapshot_backup::UnimplementedHandle as DiskSnapBackupHandle; pub use life::{AbnormalPeerContext, DestroyProgress, GcPeerContext}; pub use ready::{ write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, ReplayWatch, SnapState, diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 83a2497b331..d63e1abc733 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -20,3 +20,4 @@ pub use self::{ DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, }, }; +pub use super::operation::DiskSnapBackupHandle; diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index c7d6731d3e9..5b06d92d8e0 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -672,6 +672,10 @@ impl CoprocessorHost { ); } + pub fn pre_transfer_leader(&self, r: &Region, tr: &TransferLeaderRequest) -> Result<()> { + try_loop_ob!(r, &self.registry.admin_observers, pre_transfer_leader, tr) + } + pub fn post_apply_snapshot( &self, region: &Region, diff --git a/components/raftstore/src/coprocessor/error.rs b/components/raftstore/src/coprocessor/error.rs index 233c7c4197a..d979cac98dd 100644 --- a/components/raftstore/src/coprocessor/error.rs +++ b/components/raftstore/src/coprocessor/error.rs @@ -1,12 +1,14 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{error::Error as StdError, result::Result as StdResult}; +use std::{error::Error as StdError, result::Result as StdResult, time::Duration}; use error_code::{self, ErrorCode, ErrorCodeExt}; use thiserror::Error; #[derive(Debug, Error)] pub enum Error { + #[error("required retry after {after:?}, hint: {reason:?}")] + RequireDelay { after: Duration, reason: String }, #[error("{0}")] Other(#[from] Box), } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 0592e23200b..2e05d01f905 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -13,7 +13,10 @@ use engine_traits::{CfName, SstMetaInfo}; use kvproto::{ metapb::Region, pdpb::CheckPolicy, - raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request}, + raft_cmdpb::{ + AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request, + TransferLeaderRequest, + }, raft_serverpb::RaftApplyState, }; use raft::{eraftpb, StateRole}; @@ -130,6 +133,14 @@ pub trait AdminObserver: Coprocessor { ) -> bool { false } + + fn pre_transfer_leader( + &self, + _ctx: &mut ObserverContext<'_>, + _tr: &TransferLeaderRequest, + ) -> Result<()> { + Ok(()) + } } pub trait QueryObserver: Coprocessor { diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 49a52de26e1..6d512e5bf80 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -290,6 +290,15 @@ impl From for errorpb::Error { e.set_store_peer_id(store_peer_id); errorpb.set_mismatch_peer_id(e); } + Error::Coprocessor(CopError::RequireDelay { + after, + reason: hint, + }) => { + let mut e = errorpb::ServerIsBusy::new(); + e.set_backoff_ms(after.as_millis() as _); + e.set_reason(hint); + errorpb.set_server_is_busy(e); + } Error::DeadlineExceeded => { set_deadline_exceeded_busy_error(&mut errorpb); } diff --git a/components/raftstore/src/store/fsm/peer.rs 
b/components/raftstore/src/store/fsm/peer.rs index 3e9d1644e4f..e4bff52ec5d 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -89,15 +89,14 @@ use crate::{ TRANSFER_LEADER_COMMAND_REPLY_CTX, }, region_meta::RegionMeta, + snapshot_backup::{AbortReason, SnapshotBrState, SnapshotBrWaitApplyRequest}, transport::Transport, unsafe_recovery::{ - exit_joint_request, ForceLeaderState, SnapshotRecoveryState, - SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, + exit_joint_request, ForceLeaderState, UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, }, - util, - util::{KeysInfoFormatter, LeaseState}, + util::{self, compare_region_epoch, KeysInfoFormatter, LeaseState}, worker::{ Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, @@ -949,7 +948,7 @@ where // func be invoked firstly after assigned leader by BR, wait all leader apply to // last log index func be invoked secondly wait follower apply to last // index, however the second call is broadcast, it may improve in future - fn on_snapshot_recovery_wait_apply(&mut self, syncer: SnapshotRecoveryWaitApplySyncer) { + fn on_snapshot_br_wait_apply(&mut self, req: SnapshotBrWaitApplyRequest) { if let Some(state) = &self.fsm.peer.snapshot_recovery_state { warn!( "can't wait apply, another recovery in progress"; @@ -957,20 +956,47 @@ where "peer_id" => self.fsm.peer_id(), "state" => ?state, ); - syncer.abort(); + req.syncer.abort(AbortReason::Duplicated); return; } let target_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + let applied_index = self.fsm.peer.raft_group.raft.raft_log.applied; + let term = self.fsm.peer.raft_group.raft.term; + if let Some(e) = &req.expected_epoch { + if let Err(err) = compare_region_epoch(e, self.region(), true, true, true) { 
+ warn!("epoch not match for wait apply, aborting."; "err" => %err, + "peer" => self.fsm.peer.peer_id(), + "region" => self.fsm.peer.region().get_id()); + let mut pberr = errorpb::Error::from(err); + req.syncer + .abort(AbortReason::EpochNotMatch(pberr.take_epoch_not_match())); + return; + } + } + + // trivial case: no need to wait apply -- already the latest. + // Return directly for avoiding to print tons of logs. + if target_index == applied_index { + debug!( + "skip trivial case of waiting apply."; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied_index" => applied_index, + ); + SNAP_BR_WAIT_APPLY_EVENT.trivial.inc(); + return; + } // during the snapshot recovery, broadcast waitapply, some peer may stale if !self.fsm.peer.is_leader() { info!( - "snapshot follower recovery started"; + "snapshot follower wait apply started"; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), "target_index" => target_index, - "applied_index" => self.fsm.peer.raft_group.raft.raft_log.applied, + "applied_index" => applied_index, "pending_remove" => self.fsm.peer.pending_remove, "voter" => self.fsm.peer.raft_group.raft.vote, ); @@ -980,7 +1006,8 @@ where // case#2 if peer is suppose to remove if self.fsm.peer.raft_group.raft.vote == 0 || self.fsm.peer.pending_remove { info!( - "this peer is never vote before or pending remove, it should be skip to wait apply" + "this peer is never vote before or pending remove, it should be skip to wait apply"; + "region" => %self.region_id(), ); return; } @@ -990,13 +1017,15 @@ where "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), "target_index" => target_index, - "applied_index" => self.fsm.peer.raft_group.raft.raft_log.applied, + "applied_index" => applied_index, ); } + SNAP_BR_WAIT_APPLY_EVENT.accepted.inc(); - self.fsm.peer.snapshot_recovery_state = Some(SnapshotRecoveryState::WaitLogApplyToLast { + self.fsm.peer.snapshot_recovery_state = 
Some(SnapshotBrState::WaitLogApplyToLast { target_index, - syncer, + valid_for_term: req.abort_when_term_change.then_some(term), + syncer: req.syncer, }); self.fsm .peer @@ -1503,9 +1532,7 @@ where self.on_unsafe_recovery_fill_out_report(syncer) } // for snapshot recovery (safe recovery) - SignificantMsg::SnapshotRecoveryWaitApply(syncer) => { - self.on_snapshot_recovery_wait_apply(syncer) - } + SignificantMsg::SnapshotBrWaitApply(syncer) => self.on_snapshot_br_wait_apply(syncer), SignificantMsg::CheckPendingAdmin(ch) => self.on_check_pending_admin(ch), } } diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index de6f654de12..269c4aca23f 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -257,6 +257,31 @@ make_static_metric! { unable_to_split_cpu_top, } + pub label_enum SnapshotBrWaitApplyEventType { + sent, + trivial, + accepted, + term_not_match, + epoch_not_match, + duplicated, + finished, + } + + pub struct SnapshotBrWaitApplyEvent : IntCounter { + "event" => SnapshotBrWaitApplyEventType + } + + pub label_enum SnapshotBrLeaseEventType { + create, + renew, + expired, + reset, + } + + pub struct SnapshotBrLeaseEvent : IntCounter { + "event" => SnapshotBrLeaseEventType + } + pub struct HibernatedPeerStateGauge: IntGauge { "state" => { awaken, @@ -893,4 +918,29 @@ lazy_static! 
{ "tikv_raftstore_peer_in_flashback_state", "Total number of peers in the flashback state" ).unwrap(); + + pub static ref SNAP_BR_SUSPEND_COMMAND_TYPE: IntCounterVec = register_int_counter_vec!( + "tikv_raftstore_snap_br_suspend_command_type", + "The statistic of rejecting some admin commands being proposed.", + &["type"] + ).unwrap(); + + pub static ref SNAP_BR_WAIT_APPLY_EVENT: SnapshotBrWaitApplyEvent = register_static_int_counter_vec!( + SnapshotBrWaitApplyEvent, + "tikv_raftstore_snap_br_wait_apply_event", + "The events of wait apply issued by snapshot br.", + &["event"] + ).unwrap(); + + pub static ref SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL: IntGauge = register_int_gauge!( + "tikv_raftstore_snap_br_suspend_command_lease_until", + "The lease that snapshot br holds of rejecting some type of commands. (In unix timestamp.)" + ).unwrap(); + + pub static ref SNAP_BR_LEASE_EVENT: SnapshotBrLeaseEvent = register_static_int_counter_vec!( + SnapshotBrLeaseEvent, + "tikv_raftstore_snap_br_lease_event", + "The events of the lease to denying new admin commands being proposed by snapshot br.", + &["event"] + ).unwrap(); } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 4cae84d1d25..cccab6f72b0 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -11,6 +11,7 @@ pub mod msg; mod peer; mod read_queue; pub mod region_meta; +pub mod snapshot_backup; pub mod transport; #[macro_use] pub mod util; @@ -74,13 +75,14 @@ pub use self::{ ApplyOptions, CfFile, Error as SnapError, SnapEntry, SnapKey, SnapManager, SnapManagerBuilder, Snapshot, SnapshotStatistics, TabletSnapKey, TabletSnapManager, }, + snapshot_backup::SnapshotBrWaitApplySyncer, transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter, Transport}, txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, unsafe_recovery::{ demote_failed_voters_request, exit_joint_request, ForceLeaderState, - 
SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, - UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryHandle, - UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryHandle, UnsafeRecoveryState, + UnsafeRecoveryWaitApplySyncer, }, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 0dca9793d35..2f05a068ddb 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -26,14 +26,16 @@ use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, TrackerToken}; -use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; +use super::{ + local_metrics::TimeTracker, region_meta::RegionMeta, + snapshot_backup::SnapshotBrWaitApplyRequest, FetchedLogs, RegionSnapshot, +}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, unsafe_recovery::{ - SnapshotRecoveryWaitApplySyncer, UnsafeRecoveryExecutePlanSyncer, - UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, - UnsafeRecoveryWaitApplySyncer, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, }, util::{KeysInfoFormatter, LatencyInspector}, worker::{Bucket, BucketRange}, @@ -528,7 +530,7 @@ where UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), - SnapshotRecoveryWaitApply(SnapshotRecoveryWaitApplySyncer), + SnapshotBrWaitApply(SnapshotBrWaitApplyRequest), CheckPendingAdmin(UnboundedSender), } diff --git 
a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 26f475d009f..76affa90b93 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -101,8 +101,9 @@ use crate::{ memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}, msg::{CasualMessage, ErrorCallback, RaftCommand}, peer_storage::HandleSnapshotResult, + snapshot_backup::{AbortReason, SnapshotBrState}, txn_ext::LocksStatus, - unsafe_recovery::{ForceLeaderState, SnapshotRecoveryState, UnsafeRecoveryState}, + unsafe_recovery::{ForceLeaderState, UnsafeRecoveryState}, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ CleanupTask, CompactTask, HeartbeatTask, RaftlogGcTask, ReadDelegate, ReadExecutor, @@ -887,7 +888,7 @@ where /// lead_transferee if this peer(leader) is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, - pub snapshot_recovery_state: Option, + pub snapshot_recovery_state: Option, last_record_safe_point: u64, } @@ -3628,7 +3629,7 @@ where self.check_normal_proposal_with_disk_full_opt(ctx, disk_full_opt) .and_then(|_| self.propose_normal(ctx, req)) } - Ok(RequestPolicy::ProposeConfChange) => self.propose_conf_change(ctx, &req), + Ok(RequestPolicy::ProposeConfChange) => self.propose_conf_change(ctx, req), Err(e) => Err(e), }; fail_point!("after_propose"); @@ -4616,9 +4617,23 @@ where req: RaftCmdRequest, cb: Callback, ) -> bool { + let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); + if let Err(err) = ctx + .coprocessor_host + .pre_transfer_leader(self.region(), transfer_leader) + { + warn!("Coprocessor rejected transfer leader."; "err" => ?err, + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + "transferee" => transfer_leader.get_peer().get_id()); + let mut resp = RaftCmdResponse::new(); + *resp.mut_header().mut_error() = Error::from(err).into(); + cb.invoke_with_response(resp); + return false; + } + 
ctx.raft_metrics.propose.transfer_leader.inc(); - let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); let prs = self.raft_group.raft.prs(); let (_, peers) = transfer_leader @@ -4671,7 +4686,7 @@ where fn propose_conf_change( &mut self, ctx: &mut PollContext, - req: &RaftCmdRequest, + mut req: RaftCmdRequest, ) -> Result> { if self.pending_merge_state.is_some() { return Err(Error::ProposalInMergingMode(self.region_id)); @@ -4699,7 +4714,24 @@ where self.term() )); } - if let Some(index) = self.cmd_epoch_checker.propose_check_epoch(req, self.term()) { + + if let Err(err) = ctx.coprocessor_host.pre_propose(self.region(), &mut req) { + warn!("Coprocessor rejected proposing conf change."; + "err" => ?err, + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + ); + return Err(box_err!( + "{} rejected by coprocessor(reason = {})", + self.tag, + err + )); + } + + if let Some(index) = self + .cmd_epoch_checker + .propose_check_epoch(&req, self.term()) + { return Ok(Either::Right(index)); } @@ -5136,10 +5168,31 @@ where } pub fn snapshot_recovery_maybe_finish_wait_apply(&mut self, force: bool) { - if let Some(SnapshotRecoveryState::WaitLogApplyToLast { target_index, .. }) = - &self.snapshot_recovery_state + if let Some(SnapshotBrState::WaitLogApplyToLast { + target_index, + valid_for_term, + .. + }) = &self.snapshot_recovery_state { - if self.raft_group.raft.term != self.raft_group.raft.raft_log.last_term() { + if valid_for_term + .map(|vt| vt != self.raft_group.raft.term) + .unwrap_or(false) + { + info!("leadership changed, aborting syncer because required."; "region_id" => self.region().id); + match self.snapshot_recovery_state.take() { + Some(SnapshotBrState::WaitLogApplyToLast { + syncer, + valid_for_term, + .. 
+ }) => { + syncer.abort(AbortReason::StaleCommand { + region_id: self.region().get_id(), + expected_term: valid_for_term.unwrap_or_default(), + current_term: self.raft_group.raft.term, + }); + } + _ => unreachable!(), + }; return; } diff --git a/components/raftstore/src/store/snapshot_backup.rs b/components/raftstore/src/store/snapshot_backup.rs new file mode 100644 index 00000000000..9168e974fc2 --- /dev/null +++ b/components/raftstore/src/store/snapshot_backup.rs @@ -0,0 +1,391 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use engine_traits::{KvEngine, RaftEngine}; +use futures::channel::mpsc::UnboundedSender; +use kvproto::{brpb::CheckAdminResponse, metapb::RegionEpoch, raft_cmdpb::AdminCmdType}; +use tikv_util::{info, warn}; +use tokio::sync::oneshot; + +use super::{metrics, PeerMsg, RaftRouter, SignificantMsg, SignificantRouter}; +use crate::coprocessor::{ + AdminObserver, BoxAdminObserver, BoxQueryObserver, Coprocessor, CoprocessorHost, + Error as CopError, QueryObserver, +}; + +fn epoch_second_coarse() -> u64 { + let spec = tikv_util::time::monotonic_coarse_now(); + spec.sec as u64 +} + +#[derive(Debug, Clone)] +pub struct SnapshotBrWaitApplyRequest { + pub syncer: SnapshotBrWaitApplySyncer, + pub expected_epoch: Option, + pub abort_when_term_change: bool, +} + +impl SnapshotBrWaitApplyRequest { + /// Create a "relax" request for waiting apply. + /// This only waits to the last index, without checking the region epoch or + /// leadership migrating. + pub fn relaxed(syncer: SnapshotBrWaitApplySyncer) -> Self { + Self { + syncer, + expected_epoch: None, + abort_when_term_change: false, + } + } + + /// Create a "strict" request for waiting apply. + /// This will wait to last applied index, and aborts if the region epoch not + /// match or the last index may not be committed. 
+ pub fn strict(syncer: SnapshotBrWaitApplySyncer, epoch: RegionEpoch) -> Self { + Self { + syncer, + expected_epoch: Some(epoch), + abort_when_term_change: true, + } + } +} + +pub trait SnapshotBrHandle: Sync + Send + Clone { + fn send_wait_apply(&self, region: u64, req: SnapshotBrWaitApplyRequest) -> crate::Result<()>; + fn broadcast_wait_apply(&self, req: SnapshotBrWaitApplyRequest) -> crate::Result<()>; + fn broadcast_check_pending_admin( + &self, + tx: UnboundedSender, + ) -> crate::Result<()>; +} + +impl SnapshotBrHandle for Arc>> { + fn send_wait_apply(&self, region: u64, req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + let msg = SignificantMsg::SnapshotBrWaitApply(req); + metrics::SNAP_BR_WAIT_APPLY_EVENT.sent.inc(); + self.lock().unwrap().significant_send(region, msg) + } + + fn broadcast_wait_apply(&self, req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { + let msg_gen = || { + metrics::SNAP_BR_WAIT_APPLY_EVENT.sent.inc(); + PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply(req.clone())) + }; + self.lock().unwrap().broadcast_normal(msg_gen); + Ok(()) + } + + fn broadcast_check_pending_admin( + &self, + tx: UnboundedSender, + ) -> crate::Result<()> { + self.lock().unwrap().broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + }); + Ok(()) + } +} + +#[derive(Default)] +pub struct PrepareDiskSnapObserver { + before: AtomicU64, + initialized: AtomicBool, +} + +impl PrepareDiskSnapObserver { + pub fn register_to(self: &Arc, coprocessor_host: &mut CoprocessorHost) { + let reg = &mut coprocessor_host.registry; + reg.register_query_observer(0, BoxQueryObserver::new(Arc::clone(self))); + reg.register_admin_observer(0, BoxAdminObserver::new(Arc::clone(self))); + info!("registered reject ingest and admin coprocessor to TiKV."); + } + + pub fn remained_secs(&self) -> u64 { + self.before + .load(Ordering::Acquire) + .saturating_sub(epoch_second_coarse()) + } + + fn reject(&self) -> CopError { + 
CopError::RequireDelay { + after: Duration::from_secs(self.remained_secs()), + reason: + "[Suspended] Preparing disk snapshot backup, ingests and some of admin commands are suspended." + .to_owned(), + } + } + + pub fn allowed(&self) -> bool { + let mut v = self.before.load(Ordering::Acquire); + if v == 0 { + return true; + } + let mut expired = v < epoch_second_coarse(); + while expired { + match self + .before + .compare_exchange(v, 0, Ordering::SeqCst, Ordering::SeqCst) + { + Ok(_) => { + metrics::SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL.set(0); + metrics::SNAP_BR_LEASE_EVENT.expired.inc(); + break; + } + Err(new_val) => { + v = new_val; + expired = v < epoch_second_coarse(); + } + } + } + + expired + } + + pub fn initialized(&self) -> bool { + self.initialized.load(Ordering::Acquire) + } + + /// Extend the lease. + /// + /// # Returns + /// + /// Whether previously there is a lease. + pub fn update_lease(&self, lease: Duration) -> bool { + let mut v = self.before.load(Ordering::SeqCst); + let now = epoch_second_coarse(); + let new_lease = now + lease.as_secs(); + let last_lease_valid = v > now; + while v < new_lease { + let res = self + .before + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |v| { + if v > new_lease { None } else { Some(new_lease) } + }); + match res { + Ok(_) => { + metrics::SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL.set(new_lease as _); + break; + } + Err(prev) => v = prev, + } + } + if last_lease_valid { + metrics::SNAP_BR_LEASE_EVENT.renew.inc(); + } else { + metrics::SNAP_BR_LEASE_EVENT.create.inc(); + } + last_lease_valid + } + + pub fn reset(&self) { + self.before.store(0, Ordering::SeqCst); + metrics::SNAP_BR_SUSPEND_COMMAND_LEASE_UNTIL.set(0); + metrics::SNAP_BR_LEASE_EVENT.reset.inc(); + } +} + +impl Coprocessor for Arc { + fn start(&self) { + self.initialized.store(true, Ordering::Release) + } + + fn stop(&self) { + self.initialized.store(false, Ordering::Release) + } +} + +impl QueryObserver for Arc { + fn pre_propose_query( + &self, + cx: 
&mut crate::coprocessor::ObserverContext<'_>, + reqs: &mut Vec, + ) -> crate::coprocessor::Result<()> { + if self.allowed() { + return Ok(()); + } + for req in reqs { + if req.has_ingest_sst() { + // Note: this will reject the batch of commands, which isn't so effective. + // But we cannot reject proposing a subset of command for now... + cx.bypass = true; + metrics::SNAP_BR_SUSPEND_COMMAND_TYPE + .with_label_values(&["Ingest"]) + .inc(); + return Err(self.reject()); + } + } + Ok(()) + } +} + +impl AdminObserver for Arc { + fn pre_propose_admin( + &self, + _: &mut crate::coprocessor::ObserverContext<'_>, + admin: &mut kvproto::raft_cmdpb::AdminRequest, + ) -> crate::coprocessor::Result<()> { + if self.allowed() { + return Ok(()); + } + // NOTE: We have disabled `CompactLog` here because if the log get truncated, + // we may take a long time to send snapshots during restoring. + // Also note it may impact the TP workload if we are preparing for a long time. + let should_reject = matches!( + admin.get_cmd_type(), + AdminCmdType::Split | + AdminCmdType::BatchSplit | + // We will allow `Commit/RollbackMerge` here because the + // `wait_pending_admin` will wait until the merge get finished. + // If we reject them, they won't be able to see the merge get finished. + // And will finally time out. 
+ AdminCmdType::PrepareMerge | + AdminCmdType::ChangePeer | + AdminCmdType::ChangePeerV2 | + AdminCmdType::BatchSwitchWitness | + AdminCmdType::CompactLog + ); + if should_reject { + metrics::SNAP_BR_SUSPEND_COMMAND_TYPE + .with_label_values(&[&format!("{:?}", admin.get_cmd_type())]) + .inc(); + return Err(self.reject()); + } + Ok(()) + } + + fn pre_transfer_leader( + &self, + _ctx: &mut crate::coprocessor::ObserverContext<'_>, + _tr: &kvproto::raft_cmdpb::TransferLeaderRequest, + ) -> crate::coprocessor::Result<()> { + if self.allowed() { + return Ok(()); + } + metrics::SNAP_BR_SUSPEND_COMMAND_TYPE + .with_label_values(&["TransferLeader"]) + .inc(); + Err(self.reject()) + } +} + +#[derive(Debug)] +struct SyncerCore { + report_id: u64, + feedback: Option>, +} + +#[derive(Debug, PartialEq)] +pub struct SyncReport { + pub report_id: u64, + pub aborted: Option, +} + +impl SyncerCore { + fn new(report_id: u64, feedback: oneshot::Sender) -> Self { + Self { + report_id, + feedback: Some(feedback), + } + } + + fn is_aborted(&self) -> bool { + self.feedback.is_none() + } + + /// Abort this syncer. + /// This will fire a message right now. + /// And disable all clones of this syncer. + /// If already aborted, this will do nothing. 
+ fn abort(&mut self, reason: AbortReason) { + if let Some(ch) = self.feedback.take() { + let report = SyncReport { + report_id: self.report_id, + aborted: Some(reason), + }; + if let Err(report) = ch.send(report) { + warn!("reply waitapply states failure."; "report" => ?report); + } + } + } + + fn make_success_result(&self) -> SyncReport { + SyncReport { + report_id: self.report_id, + aborted: None, + } + } +} + +impl Drop for SyncerCore { + fn drop(&mut self) { + if let Some(ch) = self.feedback.take() { + let report = self.make_success_result(); + if let Err(report) = ch.send(report) { + warn!("reply waitapply states failure."; "report" => ?report); + } + metrics::SNAP_BR_WAIT_APPLY_EVENT.finished.inc() + } else { + warn!("wait apply aborted."; "report" => self.report_id); + } + } +} + +/// A syncer for wait apply. +/// The sender used for constructing this structure will: +/// Be closed, if the `abort` has been called. +/// Send the report id to the caller, if all replicas of this Syncer has been +/// dropped. +#[derive(Debug, Clone)] +pub struct SnapshotBrWaitApplySyncer(Arc>); + +impl SnapshotBrWaitApplySyncer { + pub fn new(report_id: u64, sender: oneshot::Sender) -> Self { + let core = SyncerCore::new(report_id, sender); + Self(Arc::new(Mutex::new(core))) + } + + pub fn abort(self, reason: AbortReason) { + let mut core = self.0.lock().unwrap(); + warn!("aborting wait apply."; "reason" => ?reason, "id" => %core.report_id, "already_aborted" => %core.is_aborted()); + match reason { + AbortReason::EpochNotMatch(_) => { + metrics::SNAP_BR_WAIT_APPLY_EVENT.epoch_not_match.inc() + } + AbortReason::StaleCommand { .. 
} => { + metrics::SNAP_BR_WAIT_APPLY_EVENT.term_not_match.inc() + } + AbortReason::Duplicated => metrics::SNAP_BR_WAIT_APPLY_EVENT.duplicated.inc(), + } + core.abort(reason); + } +} + +#[derive(Debug, PartialEq)] +pub enum AbortReason { + EpochNotMatch(kvproto::errorpb::EpochNotMatch), + StaleCommand { + expected_term: u64, + current_term: u64, + region_id: u64, + }, + Duplicated, +} + +#[derive(Debug)] +pub enum SnapshotBrState { + // This state is set by the leader peer fsm. Once set, it syncs and checks the leader commit + // index and forces forward to the last index once followers have appended; it is also checked + // every time this peer applies to the last index. If the last index is met, this state is + // reset / dropped. The syncer is dropped and sends the response to the invoker. + WaitLogApplyToLast { + target_index: u64, + valid_for_term: Option, + syncer: SnapshotBrWaitApplySyncer, + }, +} diff --git a/components/raftstore/src/store/unsafe_recovery.rs b/components/raftstore/src/store/unsafe_recovery.rs index 28943ae7339..4bc84ebe2a7 100644 --- a/components/raftstore/src/store/unsafe_recovery.rs +++ b/components/raftstore/src/store/unsafe_recovery.rs @@ -2,7 +2,7 @@ use std::{ fmt, mem, - sync::{mpsc::SyncSender, Arc, Mutex}, + sync::{Arc, Mutex}, time::Duration, }; @@ -278,40 +278,6 @@ impl UnsafeRecoveryExecutePlanSyncer { *self.abort.lock().unwrap() = true; } } -// Syncer only send to leader in 2nd BR restore -#[derive(Clone, Debug)] -pub struct SnapshotRecoveryWaitApplySyncer { - _closure: Arc, - abort: Arc>, -} - -impl SnapshotRecoveryWaitApplySyncer { - pub fn new(region_id: u64, sender: SyncSender) -> Self { - let thread_safe_router = Mutex::new(sender); - let abort = Arc::new(Mutex::new(false)); - let abort_clone = abort.clone(); - let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("region {} wait apply finished", region_id); - if *abort_clone.lock().unwrap() { - warn!("wait apply aborted"); - return; - } - let router_ptr = 
thread_safe_router.lock().unwrap(); - - _ = router_ptr.send(region_id).map_err(|_| { - warn!("reply waitapply states failure."); - }); - }))); - SnapshotRecoveryWaitApplySyncer { - _closure: Arc::new(closure), - abort, - } - } - - pub fn abort(&self) { - *self.abort.lock().unwrap() = true; - } -} #[derive(Clone, Debug)] pub struct UnsafeRecoveryWaitApplySyncer { @@ -386,19 +352,6 @@ impl UnsafeRecoveryFillOutReportSyncer { } } -#[derive(Debug)] -pub enum SnapshotRecoveryState { - // This state is set by the leader peer fsm. Once set, it sync and check leader commit index - // and force forward to last index once follower appended and then it also is checked - // every time this peer applies a the last index, if the last index is met, this state is - // reset / droppeds. The syncer is droped and send the response to the invoker, triggers - // the next step of recovery process. - WaitLogApplyToLast { - target_index: u64, - syncer: SnapshotRecoveryWaitApplySyncer, - }, -} - #[derive(Debug)] pub enum UnsafeRecoveryState { // Stores the state that is necessary for the wait apply stage of unsafe recovery process. 
diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 132d24b7429..976a4add68d 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -64,6 +64,7 @@ use raftstore::{ RaftBatchSystem, RaftRouter, StoreMeta, MULTI_FILES_SNAPSHOT_FEATURE, PENDING_MSG_CAP, }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, + snapshot_backup::PrepareDiskSnapObserver, AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, @@ -256,6 +257,7 @@ struct TikvServer { br_snap_recovery_mode: bool, // use for br snapshot recovery resolved_ts_scheduler: Option>, grpc_service_mgr: GrpcServiceManager, + snap_br_rejector: Option>, } struct TikvEngines { @@ -445,6 +447,7 @@ where br_snap_recovery_mode: is_recovering_marked, resolved_ts_scheduler: None, grpc_service_mgr: GrpcServiceManager::new(tx), + snap_br_rejector: None, } } @@ -829,6 +832,10 @@ where )), ); + let rejector = Arc::new(PrepareDiskSnapObserver::default()); + rejector.register_to(self.coprocessor_host.as_mut().unwrap()); + self.snap_br_rejector = Some(rejector); + // Start backup stream let backup_stream_scheduler = if self.core.config.log_backup.enable { // Create backup stream. @@ -1174,16 +1181,6 @@ where // Backup service. 
let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); - let backup_service = - backup::Service::::with_router(backup_scheduler, self.router.clone()); - if servers - .server - .register_service(create_backup(backup_service)) - .is_some() - { - fatal!("failed to register backup service"); - } - let backup_endpoint = backup::Endpoint::new( servers.node.id(), engines.engine.clone(), @@ -1195,6 +1192,20 @@ where self.causal_ts_provider.clone(), self.resource_manager.clone(), ); + let env = backup::disk_snap::Env::new( + Arc::new(Mutex::new(self.router.clone())), + self.snap_br_rejector.take().unwrap(), + Some(backup_endpoint.io_pool_handle().clone()), + ); + let backup_service = backup::Service::new(backup_scheduler, env); + if servers + .server + .register_service(create_backup(backup_service)) + .is_some() + { + fatal!("failed to register backup service"); + } + self.cfg_controller.as_mut().unwrap().register( tikv::config::Module::Backup, Box::new(backup_endpoint.get_config_manager()), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 98341796367..ffa96e4e770 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -27,6 +27,7 @@ use std::{ }; use api_version::{dispatch_api_version, KvFormat}; +use backup::disk_snap::Env; use backup_stream::{ config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, BackupStreamResolver, @@ -63,7 +64,7 @@ use raftstore::{ RegionInfoAccessor, }; use raftstore_v2::{ - router::{PeerMsg, RaftRouter}, + router::{DiskSnapBackupHandle, PeerMsg, RaftRouter}, StateStorage, }; use resolved_ts::Task; @@ -926,7 +927,10 @@ where // Backup service. 
let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); - let backup_service = backup::Service::::new(backup_scheduler); + let backup_service = backup::Service::new( + backup_scheduler, + Env::new(DiskSnapBackupHandle, Default::default(), None), + ); if servers .server .register_service(create_backup(backup_service)) diff --git a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs index daf6e7ed30f..d72f10e4f43 100644 --- a/components/snap_recovery/src/services.rs +++ b/components/snap_recovery/src/services.rs @@ -7,7 +7,6 @@ use std::{ result, sync::{ atomic::{AtomicBool, Ordering}, - mpsc::{sync_channel, SyncSender}, Arc, Mutex, }, thread::Builder, @@ -36,12 +35,14 @@ use raftstore::{ store::{ fsm::RaftRouter, msg::{PeerMsg, SignificantMsg}, + snapshot_backup::{SnapshotBrWaitApplyRequest, SyncReport}, transport::SignificantRouter, - SnapshotRecoveryWaitApplySyncer, + SnapshotBrWaitApplySyncer, }, }; use thiserror::Error; use tikv_util::sys::thread::{StdThreadBuildWrapper, ThreadBuildWrapper}; +use tokio::sync::oneshot::{self, Sender}; use crate::{ data_resolver::DataResolverManager, @@ -218,11 +219,11 @@ impl RecoveryService { // a new wait apply syncer share with all regions, // when all region reached the target index, share reference decreased to 0, // trigger closure to send finish info back. 
- pub fn wait_apply_last(router: RaftRouter, sender: SyncSender) { - let wait_apply = SnapshotRecoveryWaitApplySyncer::new(0, sender); + pub fn wait_apply_last(router: RaftRouter, sender: Sender) { + let wait_apply = SnapshotBrWaitApplySyncer::new(0, sender); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( - wait_apply.clone(), + PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( + SnapshotBrWaitApplyRequest::relaxed(wait_apply.clone()), )) }); } @@ -335,12 +336,14 @@ impl RecoverData for RecoveryService { // wait apply to the last log let mut rx_apply = Vec::with_capacity(leaders.len()); for ®ion_id in &leaders { - let (tx, rx) = sync_channel(1); + let (tx, rx) = oneshot::channel(); REGION_EVENT_COUNTER.start_wait_leader_apply.inc(); - let wait_apply = SnapshotRecoveryWaitApplySyncer::new(region_id, tx.clone()); + let wait_apply = SnapshotBrWaitApplySyncer::new(region_id, tx); if let Err(e) = raft_router.get_mut().unwrap().significant_send( region_id, - SignificantMsg::SnapshotRecoveryWaitApply(wait_apply.clone()), + SignificantMsg::SnapshotBrWaitApply(SnapshotBrWaitApplyRequest::relaxed( + wait_apply.clone(), + )), ) { error!( "failed to send wait apply"; @@ -348,27 +351,21 @@ impl RecoverData for RecoveryService { "err" => ?e, ); } - rx_apply.push(Some(rx)); + rx_apply.push(rx); } // leader apply to last log for (rid, rx) in leaders.iter().zip(rx_apply) { - if let Some(rx) = rx { - CURRENT_WAIT_APPLY_LEADER.set(*rid as _); - // FIXME: we cannot the former RPC when we get stuck at here. - // Perhaps we need to make `SnapshotRecoveryWaitApplySyncer` be able to support - // asynchronous channels. But for now, waiting seems won't cause live lock, so - // we are keeping it unchanged. 
- match rx.recv() { - Ok(region_id) => { - debug!("leader apply to last log"; "region_id" => region_id); - } - Err(e) => { - error!("leader failed to apply to last log"; "error" => ?e); - } + CURRENT_WAIT_APPLY_LEADER.set(*rid as _); + match rx.await { + Ok(_) => { + debug!("leader apply to last log"; "region_id" => rid); + } + Err(e) => { + error!("leader failed to apply to last log"; "error" => ?e); } - REGION_EVENT_COUNTER.finish_wait_leader_apply.inc(); } + REGION_EVENT_COUNTER.finish_wait_leader_apply.inc(); } CURRENT_WAIT_APPLY_LEADER.set(0); @@ -410,14 +407,11 @@ impl RecoverData for RecoveryService { info!("wait_apply start"); let task = async move { let now = Instant::now(); - // FIXME: this function will exit once the first region finished apply. - // BUT for the flashback resolve KV implementation, that is fine because the - // raft log stats is consistent. - let (tx, rx) = sync_channel(1); - RecoveryService::wait_apply_last(router, tx.clone()); - match rx.recv() { + let (tx, rx) = oneshot::channel(); + RecoveryService::wait_apply_last(router, tx); + match rx.await { Ok(id) => { - info!("follower apply to last log"; "error" => id); + info!("follower apply to last log"; "report" => ?id); } Err(e) => { error!("follower failed to apply to last log"; "error" => ?e); diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index e03288bb3e1..e5e235e9761 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -178,6 +178,7 @@ impl From for import_sstpb::Error { let mut server_is_busy = errorpb::ServerIsBusy::default(); server_is_busy.set_backoff_ms(time_to_lease_expire.as_millis() as _); store_err.set_server_is_busy(server_is_busy); + store_err.set_message(format!("{}", e)); err.set_store_error(store_err); err.set_message(format!("{}", e)); } diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index 59300f993e3..9d773b5244b 100644 --- 
a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -16,6 +16,7 @@ backup = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } crc64fast = "0.1" +engine_rocks = { workspace = true } engine_traits = { workspace = true } external_storage_export = { workspace = true } file_system = { workspace = true } @@ -25,6 +26,7 @@ futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } kvproto = { workspace = true } protobuf = "2" +raftstore = { workspace = true } rand = "0.8" tempfile = "3.0" test_raftstore = { workspace = true } diff --git a/components/test_backup/src/disk_snap.rs b/components/test_backup/src/disk_snap.rs new file mode 100644 index 00000000000..5f6b4cd9236 --- /dev/null +++ b/components/test_backup/src/disk_snap.rs @@ -0,0 +1,246 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::{HashMap, HashSet}, + sync::{Arc, Mutex}, + time::Duration, +}; + +use backup::disk_snap::Env as BEnv; +use futures_executor::block_on; +use futures_util::{ + sink::SinkExt, + stream::{Fuse, StreamExt}, +}; +use grpcio::{ + ChannelBuilder, ClientDuplexReceiver, Environment, Server, ServerBuilder, StreamingCallSink, + WriteFlags, +}; +use kvproto::{ + brpb::{ + self, PrepareSnapshotBackupEventType, PrepareSnapshotBackupRequest, + PrepareSnapshotBackupRequestType, PrepareSnapshotBackupResponse, + }, + metapb::Region, + raft_cmdpb::RaftCmdResponse, +}; +use raftstore::store::{snapshot_backup::PrepareDiskSnapObserver, Callback, WriteResponse}; +use test_raftstore::*; +use tikv_util::{ + future::{block_on_timeout, paired_future_callback}, + worker::dummy_scheduler, + HandyRwLock, +}; + +pub struct Node { + service: Option, + pub rejector: Arc, + pub backup_client: Option, +} + +pub struct Suite { + pub cluster: Cluster, + pub nodes: HashMap, + grpc_env: Arc, +} + +impl Suite { + fn crate_node(&mut self, id: u64) { 
+ let rej = Arc::new(PrepareDiskSnapObserver::default()); + let rej2 = rej.clone(); + let mut w = self.cluster.sim.wl(); + w.coprocessor_hooks + .entry(id) + .or_default() + .push(Box::new(move |host| { + rej2.register_to(host); + })); + self.nodes.insert( + id, + Node { + service: None, + rejector: rej, + backup_client: None, + }, + ); + } + + fn start_backup(&mut self, id: u64) { + let (sched, _) = dummy_scheduler(); + let w = self.cluster.sim.wl(); + let router = Arc::new(Mutex::new(w.get_router(id).unwrap())); + let env = BEnv::new(router, self.nodes[&id].rejector.clone(), None); + let service = backup::Service::new(sched, env); + let builder = ServerBuilder::new(Arc::clone(&self.grpc_env)) + .register_service(brpb::create_backup(service)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(self.grpc_env.clone()).connect(&addr); + println!("connecting channel to {} for store {}", addr, id); + let client = brpb::BackupClient::new(channel); + let node = self.nodes.get_mut(&id).unwrap(); + node.service = Some(server); + node.backup_client = Some(client); + } + + pub fn try_split(&mut self, split_key: &[u8]) -> WriteResponse { + let region = self.cluster.get_region(split_key); + let (tx, rx) = paired_future_callback(); + self.cluster + .split_region(®ion, split_key, Callback::write(tx)); + block_on(rx).unwrap() + } + + pub fn split(&mut self, split_key: &[u8]) { + let region = self.cluster.get_region(split_key); + self.try_split(split_key); + self.cluster.wait_region_split(®ion); + } + + fn backup(&self, id: u64) -> &brpb::BackupClient { + self.nodes[&id].backup_client.as_ref().unwrap() + } + + pub fn prepare_backup(&self, node: u64) -> PrepareBackup { + let cli = self.backup(node); + let (tx, rx) = cli.prepare_snapshot_backup().unwrap(); + PrepareBackup { + store_id: node, + tx, + rx: rx.fuse(), + } 
+ } + + pub fn new(node_count: u64) -> Self { + Self::new_with_cfg(node_count, |_| {}) + } + + pub fn new_with_cfg(node_count: u64, cfg: impl FnOnce(&mut Config)) -> Self { + let cluster = new_server_cluster(42, node_count as usize); + let grpc_env = Arc::new(Environment::new(1)); + let mut suite = Suite { + cluster, + nodes: HashMap::default(), + grpc_env, + }; + for id in 1..=node_count { + suite.crate_node(id); + } + cfg(&mut suite.cluster.cfg); + suite.cluster.run(); + for id in 1..=node_count { + suite.start_backup(id); + } + suite + } +} + +pub struct PrepareBackup { + tx: StreamingCallSink, + rx: Fuse>, + + pub store_id: u64, +} + +impl PrepareBackup { + pub fn prepare(&mut self, lease_sec: u64) { + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::UpdateLease); + req.set_lease_in_seconds(lease_sec); + block_on(async { + self.tx.send((req, WriteFlags::default())).await.unwrap(); + self.rx.next().await.unwrap().unwrap(); + }); + } + + pub fn wait_apply(&mut self, r: impl IntoIterator) { + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::WaitApply); + req.set_regions(r.into_iter().collect()); + let mut regions = req + .get_regions() + .iter() + .map(|x| x.id) + .collect::>(); + block_on(async { + self.tx.send((req, WriteFlags::default())).await.unwrap(); + while !regions.is_empty() { + let resp = self.rx.next().await.unwrap().unwrap(); + assert_eq!(resp.ty, PrepareSnapshotBackupEventType::WaitApplyDone); + assert!(!resp.has_error(), "{resp:?}"); + assert!(regions.remove(&resp.get_region().id), "{regions:?}"); + } + }); + } + + pub fn send_wait_apply(&mut self, r: impl IntoIterator) { + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::WaitApply); + req.set_regions(r.into_iter().collect()); + block_on(async { + self.tx.send((req, WriteFlags::default())).await.unwrap(); + }) + } + + pub fn send_finalize(mut self) -> 
bool { + block_on(self.tx.send({ + let mut req = PrepareSnapshotBackupRequest::new(); + req.set_ty(PrepareSnapshotBackupRequestType::Finish); + (req, WriteFlags::default()) + })) + .unwrap(); + block_on_timeout( + async { + while let Some(item) = self.rx.next().await { + let item = item.unwrap(); + if item.ty == PrepareSnapshotBackupEventType::UpdateLeaseResult { + return item.last_lease_is_valid; + } + } + false + }, + Duration::from_secs(2), + ) + .expect("take too long to finalize the stream") + } + + pub fn next(&mut self) -> PrepareSnapshotBackupResponse { + self.try_next().unwrap() + } + + pub fn try_next(&mut self) -> grpcio::Result { + block_on(self.rx.next()).unwrap() + } +} + +#[track_caller] +pub fn must_wait_apply_success(res: &PrepareSnapshotBackupResponse) -> u64 { + assert!(!res.has_error(), "{res:?}"); + assert_eq!(res.ty, PrepareSnapshotBackupEventType::WaitApplyDone); + res.get_region().id +} + +#[track_caller] +pub fn assert_success(resp: &RaftCmdResponse) { + assert!(!resp.get_header().has_error(), "{:?}", resp); +} + +#[track_caller] +pub fn assert_failure(resp: &RaftCmdResponse) { + assert!(resp.get_header().has_error(), "{:?}", resp); +} + +#[track_caller] +pub fn assert_failure_because(resp: &RaftCmdResponse, reason_contains: &str) { + assert!(resp.get_header().has_error(), "{:?}", resp); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains(reason_contains), + "{:?}", + resp + ); +} diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index 3a5800e989b..b952d71f5a6 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -38,6 +38,8 @@ use tikv_util::{ }; use txn_types::TimeStamp; +pub mod disk_snap; + pub struct TestSuite { pub cluster: Cluster, pub endpoints: HashMap>, diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index ef569e3987a..66568b0e2ba 100644 --- 
a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -268,6 +268,12 @@ pub trait FilterFactory { fn generate(&self, node_id: u64) -> Vec>; } +impl Fl, Fl: Filter + 'static> FilterFactory for F { + fn generate(&self, node_id: u64) -> Vec> { + vec![Box::new(self(node_id)) as _] + } +} + #[derive(Default)] pub struct DefaultFilterFactory(PhantomData); diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 5eb7d97796e..a34bd614995 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -609,6 +609,7 @@ pub fn must_error_read_on_peer( } } +#[track_caller] pub fn must_contains_error(resp: &RaftCmdResponse, msg: &str) { let header = resp.get_header(); assert!(header.has_error()); diff --git a/components/test_util/src/lib.rs b/components/test_util/src/lib.rs index 453ed7fb7f1..d4de9fdc58e 100644 --- a/components/test_util/src/lib.rs +++ b/components/test_util/src/lib.rs @@ -18,6 +18,7 @@ use std::{ fmt::Debug, sync::atomic::{AtomicU16, Ordering}, thread, + time::Duration, }; use rand::Rng; @@ -154,3 +155,21 @@ pub fn assert_eq_debug(lhs: &C, rhs: &C) { lhs_diff, rhs_diff ); } + +#[track_caller] +pub fn eventually(tick: Duration, total: Duration, mut check: impl FnMut() -> bool) { + let start = std::time::Instant::now(); + loop { + if check() { + return; + } + if start.elapsed() < total { + std::thread::sleep(tick); + continue; + } + panic!( + "failed to pass the check after {:?} elapsed", + start.elapsed() + ); + } +} diff --git a/components/tikv_util/src/time.rs b/components/tikv_util/src/time.rs index 8594379a9bd..f329247c563 100644 --- a/components/tikv_util/src/time.rs +++ b/components/tikv_util/src/time.rs @@ -200,10 +200,9 @@ impl Drop for Monitor { } } -use self::inner::monotonic_coarse_now; -pub use self::inner::monotonic_now; /// Returns the monotonic raw time since some unspecified starting point. 
pub use self::inner::monotonic_raw_now; +pub use self::inner::{monotonic_coarse_now, monotonic_now}; use crate::sys::thread::StdThreadBuildWrapper; const NANOSECONDS_PER_SECOND: u64 = 1_000_000_000; diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 2dc4f76b194..4d87f249492 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1139,12 +1139,14 @@ impl ImportSst for ImportSstService { ) { let label = "ingest"; let timer = Instant::now_coarse(); + let mut resp = IngestResponse::default(); + if let Err(err) = self.check_suspend() { - ctx.spawn(async move { crate::send_rpc_response!(Err(err), sink, label, timer) }); + resp.set_error(ImportPbError::from(err).take_store_error()); + ctx.spawn(async move { crate::send_rpc_response!(Ok(resp), sink, label, timer) }); return; } - let mut resp = IngestResponse::default(); let region_id = req.get_context().get_region_id(); if let Some(errorpb) = self.check_write_stall(region_id) { resp.set_error(errorpb); @@ -1186,12 +1188,13 @@ impl ImportSst for ImportSstService { ) { let label = "multi-ingest"; let timer = Instant::now_coarse(); + let mut resp = IngestResponse::default(); if let Err(err) = self.check_suspend() { - ctx.spawn(async move { crate::send_rpc_response!(Err(err), sink, label, timer) }); + resp.set_error(ImportPbError::from(err).take_store_error()); + ctx.spawn(async move { crate::send_rpc_response!(Ok(resp), sink, label, timer) }); return; } - let mut resp = IngestResponse::default(); if let Some(errorpb) = self.check_write_stall(req.get_context().get_region_id()) { resp.set_error(errorpb); ctx.spawn( diff --git a/tests/Cargo.toml b/tests/Cargo.toml index aa0c2c29dec..dd851c95822 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -55,10 +55,10 @@ test-engine-raft-raft-engine = [ "raftstore/test-engine-raft-raft-engine" ] test-engines-rocksdb = [ - "raftstore/test-engines-rocksdb", + "raftstore/test-engines-rocksdb" ] test-engines-panic = [ - 
"raftstore/test-engines-panic", + "raftstore/test-engines-panic" ] jemalloc = ["tikv/jemalloc"] mimalloc = ["tikv/mimalloc"] @@ -141,7 +141,7 @@ resource_metering = { workspace = true } security = { workspace = true } serde_json = "1.0" sst_importer = { workspace = true } -test_backup = { workspace = true } +test_backup = { workspace = true, default-features = false } test_coprocessor = { workspace = true } test_pd = { workspace = true } test_pd_client = { workspace = true } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index ed2b8d79f9c..caf994fc1cd 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -9,6 +9,7 @@ mod test_conf_change; mod test_coprocessor; mod test_debugger; mod test_disk_full; +mod test_disk_snap_br; mod test_early_apply; mod test_encryption; mod test_engine; diff --git a/tests/failpoints/cases/test_disk_snap_br.rs b/tests/failpoints/cases/test_disk_snap_br.rs new file mode 100644 index 00000000000..83956aa9367 --- /dev/null +++ b/tests/failpoints/cases/test_disk_snap_br.rs @@ -0,0 +1,42 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +// FIXME: Now, for making sure there isn't a regression after the advanced +// prepare patch (anyway it is just a patch...), we won't reject the +// `CommitMerge` command, or the client may fall into an eternal wait over it +// while waiting pending admin command finish. +// +// Omitting rejecting the command won't break the consistency (at least won't +// make things worse), but will break the case: this case itself wants to prove +// that the `CommitMerge` won't be proposed. 
+#[test] +#[ignore = "See the comment of `test_merge`"] +fn test_merge() { + use std::time::Duration; + + use test_backup::disk_snap::{assert_success, Suite}; + + let mut suite = Suite::new(1); + suite.split(b"k"); + let mut source = suite.cluster.get_region(b"a"); + let target = suite.cluster.get_region(b"z"); + assert_ne!(source.id, target.id); + fail::cfg("on_schedule_merge", "pause").unwrap(); + let resp = suite.cluster.try_merge(source.id, target.id); + assert_success(&resp); + let mut call = suite.prepare_backup(1); + call.prepare(60); + fail::remove("on_schedule_merge"); + // Manually "apply" the prepare merge on region epoch. + source.mut_region_epoch().set_conf_ver(2); + source.mut_region_epoch().set_version(3); + call.wait_apply([&source, &target].into_iter().cloned()); + let source = suite.cluster.get_region(b"a"); + let target = suite.cluster.get_region(b"z"); + assert_ne!(source.id, target.id); + suite.nodes[&1].rejector.reset(); + test_util::eventually(Duration::from_secs(1), Duration::from_secs(10), || { + let source = suite.cluster.get_region(b"a"); + let target = suite.cluster.get_region(b"z"); + source.id == target.id + }) +} diff --git a/tests/integrations/backup/disk_snap.rs b/tests/integrations/backup/disk_snap.rs new file mode 100644 index 00000000000..23a61a937e9 --- /dev/null +++ b/tests/integrations/backup/disk_snap.rs @@ -0,0 +1,206 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{collections::HashSet, time::Duration}; + +use futures::executor::block_on; +use kvproto::raft_cmdpb::{CmdType, PutRequest, RaftCmdRequest, Request}; +use raft::prelude::MessageType; +use raftstore::store::Callback; +use test_backup::disk_snap::{ + assert_failure, assert_failure_because, assert_success, must_wait_apply_success, Suite, +}; +use test_raftstore::{must_contains_error, Direction, RegionPacketFilter, Simulator}; +use test_util::eventually; +use tikv_util::HandyRwLock; + +#[test] +fn test_basic() { + let mut suite = Suite::new(1); + let mut call = suite.prepare_backup(1); + call.prepare(60); + let resp = suite.try_split(b"k"); + debug!("Failed to split"; "err" => ?resp.response.get_header().get_error()); + must_contains_error(&resp.response, "[Suspended] Preparing disk snapshot backup"); +} + +#[test] +fn test_conf_change() { + let mut suite = Suite::new(4); + let the_region = suite.cluster.get_region(b""); + let last_peer = the_region.peers.last().unwrap(); + let res = block_on( + suite + .cluster + .async_remove_peer(the_region.get_id(), last_peer.clone()) + .unwrap(), + ); + assert_success(&res); + eventually(Duration::from_millis(100), Duration::from_secs(2), || { + let r = suite.cluster.get_region(b""); + !r.peers.iter().any(|p| p.id == last_peer.id) + }); + let mut calls = vec![]; + for i in 1..=4 { + let mut call = suite.prepare_backup(i); + call.prepare(60); + calls.push(call); + } + + // Make sure the change has been synchronized to all stores. 
+ std::thread::sleep(Duration::from_millis(500)); + let the_region = suite.cluster.get_region(b""); + let res2 = block_on( + suite + .cluster + .async_remove_peer(the_region.get_id(), last_peer.clone()) + .unwrap(), + ); + assert_failure_because(&res2, "rejected by coprocessor"); + let last_peer = the_region.peers.last().unwrap(); + calls.into_iter().for_each(|c| assert!(c.send_finalize())); + let res3 = block_on( + suite + .cluster + .async_remove_peer(the_region.get_id(), last_peer.clone()) + .unwrap(), + ); + assert_success(&res3); + eventually(Duration::from_millis(100), Duration::from_secs(2), || { + let r = suite.cluster.get_region(b""); + !r.peers.iter().any(|p| p.id == last_peer.id) + }); +} + +#[test] +fn test_transfer_leader() { + let mut suite = Suite::new(3); + let mut calls = vec![]; + for i in 1..=3 { + let mut call = suite.prepare_backup(i); + call.prepare(60); + calls.push(call); + } + let region = suite.cluster.get_region(b""); + let leader = suite.cluster.leader_of_region(region.get_id()).unwrap(); + let new_leader = region.peers.iter().find(|r| r.id != leader.id).unwrap(); + let res = suite + .cluster + .try_transfer_leader(region.id, new_leader.clone()); + assert_failure_because(&res, "[Suspended] Preparing disk snapshot backup"); + calls.into_iter().for_each(|c| assert!(c.send_finalize())); + let res = suite + .cluster + .try_transfer_leader(region.id, new_leader.clone()); + assert_success(&res); +} + +#[test] +fn test_prepare_merge() { + let mut suite = Suite::new(1); + suite.split(b"k"); + let source = suite.cluster.get_region(b"a"); + let target = suite.cluster.get_region(b"z"); + assert_ne!(source.id, target.id); + let mut call = suite.prepare_backup(1); + call.prepare(60); + let resp = suite.cluster.try_merge(source.id, target.id); + assert_failure(&resp); +} + +#[test] +fn test_abort_last_one() { + let suite = Suite::new(1); + let mut call = suite.prepare_backup(1); + call.prepare(10); + let mut call2 = suite.prepare_backup(1); + 
call2.prepare(10); + let should_err = call.try_next(); + assert!(should_err.is_err(), "{:?}", should_err); + assert!(call2.send_finalize()); +} + +#[test] +fn test_wait_apply() { + let mut suite = Suite::new(3); + for key in 'a'..'k' { + suite.split(&[key as u8]); + } + let rc = suite.cluster.get_region(b"ca"); + suite.cluster.add_send_filter(|i| { + RegionPacketFilter::new(rc.id, i) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Send) + }); + let (tx, rx) = std::sync::mpsc::channel::<()>(); + let mut ld_sid = None; + // Propose a simple write command to each region. + for c in 'a'..'k' { + let region = suite.cluster.get_region(&[c as u8]); + let mut cmd = RaftCmdRequest::new(); + let mut put = PutRequest::new(); + put.set_key(vec![c as u8, b'a']); + put.set_value(b"meow?".to_vec()); + let mut req = Request::new(); + req.set_put(put); + req.set_cmd_type(CmdType::Put); + cmd.mut_requests().push(req); + cmd.mut_header().set_region_id(region.id); + cmd.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + let ld = suite.cluster.leader_of_region(region.id).unwrap(); + if let Some(lid) = ld_sid { + assert_eq!( + lid, ld.store_id, + "not all leader are in the same store, this case cannot run" + ); + } + ld_sid = Some(ld.store_id); + cmd.mut_header().set_peer(ld); + let r = suite.cluster.sim.rl(); + r.async_command_on_node( + ld_sid.unwrap(), + cmd, + Callback::write_ext( + Box::new(|resp| assert_success(&resp.response)), + Some(Box::new({ + let tx = tx.clone(); + move || drop(tx) + })), + None, + ), + ) + .unwrap(); + } + let mut call = suite.prepare_backup(ld_sid.unwrap()); + call.prepare(60); + + drop(tx); + rx.recv_timeout(Duration::from_secs(5)).unwrap_err(); + + let v = ('a'..'k') + .map(|c| suite.cluster.get_region(&[c as u8])) + .collect::>(); + let mut regions_ok = v + .iter() + .map(|r| r.id) + .filter(|id| *id != rc.id) + .collect::>(); + call.send_wait_apply(v); + + // The regions w/o network isolation must success to wait apply. 
+ while !regions_ok.is_empty() { + let res = call.next(); + let removed = regions_ok.remove(&must_wait_apply_success(&res)); + let mut k = res.get_region().start_key.clone(); + k.push(b'a'); + let v = suite.cluster.must_get(&k); + // Since we have waited until it is applied, this write result must be observable. + assert_eq!(v.as_deref(), Some(b"meow?".as_slice()), "{res:?}"); + assert!(removed, "{regions_ok:?} {res:?}"); + } + + suite.cluster.clear_send_filters(); + // After the network partition is restored, the item must be restored. + let res = call.next(); + assert_eq!(must_wait_apply_success(&res), rc.id); +} diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 4cfd4be07be..56074811772 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -17,6 +17,8 @@ use tikv::coprocessor::checksum_crc64_xor; use tikv_util::HandyRwLock; use txn_types::TimeStamp; +mod disk_snap; + fn assert_same_file_name(s1: String, s2: String) { let tokens1: Vec<&str> = s1.split('_').collect(); let tokens2: Vec<&str> = s2.split('_').collect(); diff --git a/tests/integrations/import/test_sst_service.rs b/tests/integrations/import/test_sst_service.rs index 6c56ab0018b..1ed4b116937 100644 --- a/tests/integrations/import/test_sst_service.rs +++ b/tests/integrations/import/test_sst_service.rs @@ -609,10 +609,18 @@ fn test_suspend_import() { ); let write_res = write(sst_range); write_res.unwrap(); - let ingest_res = ingest(&sst); - assert_to_string_contains!(ingest_res.unwrap_err(), "Suspended"); - let multi_ingest_res = multi_ingest(&[sst.clone()]); - assert_to_string_contains!(multi_ingest_res.unwrap_err(), "Suspended"); + let ingest_res = ingest(&sst).unwrap(); + assert!( + ingest_res.get_error().has_server_is_busy(), + "{:?}", + ingest_res + ); + let multi_ingest_res = multi_ingest(&[sst.clone()]).unwrap(); + assert!( + multi_ingest_res.get_error().has_server_is_busy(), + "{:?}", + multi_ingest_res + ); assert!( import @@ -637,7 
+645,11 @@ fn test_suspend_import() { let write_res = write(sst_range); let sst = write_res.unwrap().metas; let res = multi_ingest(&sst); - assert_to_string_contains!(res.unwrap_err(), "Suspended"); + assert!( + res.as_ref().unwrap().get_error().has_server_is_busy(), + "{:?}", + res + ); std::thread::sleep(Duration::from_secs(1)); multi_ingest(&sst).unwrap(); diff --git a/tests/integrations/raftstore/test_snap_recovery.rs b/tests/integrations/raftstore/test_snap_recovery.rs index 70f9ae8d97c..38a7206399f 100644 --- a/tests/integrations/raftstore/test_snap_recovery.rs +++ b/tests/integrations/raftstore/test_snap_recovery.rs @@ -2,11 +2,15 @@ use std::time::Duration; -use futures::StreamExt; +use futures::{executor::block_on, StreamExt}; use raft::eraftpb::MessageType; -use raftstore::store::{PeerMsg, SignificantMsg, SnapshotRecoveryWaitApplySyncer}; +use raftstore::store::{ + snapshot_backup::{SnapshotBrWaitApplyRequest, SyncReport}, + PeerMsg, SignificantMsg, SnapshotBrWaitApplySyncer, +}; use test_raftstore::*; -use tikv_util::HandyRwLock; +use tikv_util::{future::block_on_timeout, HandyRwLock}; +use tokio::sync::oneshot; #[test] fn test_check_pending_admin() { @@ -94,17 +98,17 @@ fn test_snap_wait_apply() { let router = cluster.sim.wl().get_router(1).unwrap(); - let (tx, rx) = std::sync::mpsc::sync_channel(1); - + let (tx, rx) = oneshot::channel(); + let syncer = SnapshotBrWaitApplySyncer::new(1, tx); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( - SnapshotRecoveryWaitApplySyncer::new(1, tx.clone()), + PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( + SnapshotBrWaitApplyRequest::relaxed(syncer.clone()), )) }); // we expect recv timeout because the leader peer on store 1 cannot finished the // apply. so the wait apply will timeout. - rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + block_on_timeout(rx, Duration::from_secs(1)).unwrap_err(); // clear filter so we can make wait apply finished. 
cluster.clear_send_filters(); @@ -112,13 +116,21 @@ fn test_snap_wait_apply() { // after clear the filter the leader peer on store 1 can finsihed the wait // apply. - let (tx, rx) = std::sync::mpsc::sync_channel(1); + let (tx, rx) = oneshot::channel(); + let syncer = SnapshotBrWaitApplySyncer::new(1, tx); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( - SnapshotRecoveryWaitApplySyncer::new(1, tx.clone()), + PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( + SnapshotBrWaitApplyRequest::relaxed(syncer.clone()), )) }); + drop(syncer); // we expect recv the region id from rx. - assert_eq!(rx.recv(), Ok(1)); + assert_eq!( + block_on(rx), + Ok(SyncReport { + report_id: 1, + aborted: None + }) + ); } From b963d7666d771ea60e6682942cd47a2b2da646e6 Mon Sep 17 00:00:00 2001 From: lijie Date: Wed, 21 Feb 2024 14:33:58 +0800 Subject: [PATCH 152/220] chore: bump version to v7.5.1 (#16548) Signed-off-by: lijie --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e96a72afebb..c2751fce415 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6528,7 +6528,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.5.0" +version = "7.5.1" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index ff07c91f8d9..463de210105 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.5.0" +version = "7.5.1" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 7bbc6d9222f5b36e126a1c7380179012c27e5312 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 3 Apr 2024 08:10:47 +0800 Subject: [PATCH 153/220] rocksdb: Fix partial synced inactive WAL (#16706) (#16721) close tikv/tikv#16705 Fix partial synced inactive WAL Signed-off-by: Qi Xu Co-authored-by: Qi Xu --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 
src/config/mod.rs | 7 +++++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2751fce415..8a229bfb1b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2998,7 +2998,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" +source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3017,7 +3017,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" +source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" dependencies = [ "bzip2-sys", "cc", @@ -4936,7 +4936,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" +source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" dependencies = [ "libc 0.2.146", "librocksdb_sys", @@ -5635,7 +5635,7 @@ dependencies = [ [[package]] name = "snappy-sys" version = "0.1.0" -source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" +source = "git+https://github.com/tikv/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", "libc 0.2.146", diff --git a/Cargo.toml b/Cargo.toml index 463de210105..8c94b3d8195 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -186,7 +186,7 @@ rusoto_mock = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr rusoto_s3 = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_sts = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } -snappy-sys = { git = 
"https://github.com/busyjay/rust-snappy.git", branch = "static-link" } +snappy-sys = { git = "https://github.com/tikv/rust-snappy.git", branch = "static-link" } # remove this when https://github.com/danburkert/fs2-rs/pull/42 is merged. fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } diff --git a/src/config/mod.rs b/src/config/mod.rs index 3f3b39d5f13..1a83cc25d3d 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -31,7 +31,7 @@ use engine_rocks::{ raw::{ BlockBasedOptions, Cache, ChecksumType, CompactionPriority, ConcurrentTaskLimiter, DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, - PrepopulateBlockCache, RateLimiter, WriteBufferManager, + LRUCacheOptions, PrepopulateBlockCache, RateLimiter, WriteBufferManager, }, util::{ FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform, @@ -190,7 +190,10 @@ impl TitanCfConfig { let mut opts = RocksTitanDbOptions::new(); opts.set_min_blob_size(self.min_blob_size.0); opts.set_blob_file_compression(self.blob_file_compression.into()); - opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); + let mut cache_opts = LRUCacheOptions::new(); + cache_opts.set_capacity(self.blob_cache_size.0 as usize); + let cache = Cache::new_lru_cache(cache_opts); + opts.set_blob_cache(&cache); opts.set_min_gc_batch_size(self.min_gc_batch_size.0); opts.set_max_gc_batch_size(self.max_gc_batch_size.0); opts.set_discardable_ratio(self.discardable_ratio); From 50c6b3d9dd82e359d2048e5324453fd233ccc327 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 4 Apr 2024 06:16:47 +0800 Subject: [PATCH 154/220] Raftstore: destroy peer after applying snapshot (or aborted) if necessary (#16579) (#16695) (#16746) close tikv/tikv#16561 Destroy peer after applying snapshot (or aborted) if it has destroy peer cmd during applying snapshot. 
Before this change, it would require extra destroy message to trigger the destroy, which may not happen in short time if the region is hibernated. And it would block the resolve-ts forward. Signed-off-by: ti-chi-bot Signed-off-by: Qi Xu Co-authored-by: Qi Xu Co-authored-by: tonyxuqqi --- components/raftstore/src/store/fsm/peer.rs | 19 +++++++++++++------ components/raftstore/src/store/peer.rs | 20 +++++++++++++++++++- tests/failpoints/cases/test_stale_peer.rs | 3 ++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e4bff52ec5d..4d7d752dd90 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -699,12 +699,7 @@ where PeerMsg::UpdateReplicationMode => self.on_update_replication_mode(), PeerMsg::Destroy(peer_id) => { if self.fsm.peer.peer_id() == peer_id { - match self.fsm.peer.maybe_destroy(self.ctx) { - None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), - Some(job) => { - self.handle_destroy_peer(job); - } - } + self.maybe_destroy(); } } } @@ -1237,6 +1232,9 @@ where } CasualMessage::SnapshotApplied => { self.fsm.has_ready = true; + if self.fsm.peer.should_destroy_after_apply_snapshot() { + self.maybe_destroy(); + } } CasualMessage::Campaign => { let _ = self.fsm.peer.raft_group.campaign(); @@ -3625,6 +3623,15 @@ where } } + fn maybe_destroy(&mut self) { + match self.fsm.peer.maybe_destroy(self.ctx) { + None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), + Some(job) => { + self.handle_destroy_peer(job); + } + } + } + /// Check if destroy can be executed immediately. If it can't, the reason is /// returned. 
fn maybe_delay_destroy(&mut self) -> Option { diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 76affa90b93..5bf45971ff0 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -468,6 +468,12 @@ pub struct ApplySnapshotContext { /// The message should be sent after snapshot is applied. pub msgs: Vec, pub persist_res: Option, + /// Destroy the peer after apply task finished or aborted + /// This flag is set to true when the peer destroy is skipped because of + /// running snapshot task. + /// This is to accelerate peer destroy without waiting for extra destroy + /// peer message. + pub destroy_peer_after_apply: bool, } #[derive(PartialEq, Debug)] @@ -1234,13 +1240,14 @@ where } } - if let Some(snap_ctx) = self.apply_snap_ctx.as_ref() { + if let Some(snap_ctx) = self.apply_snap_ctx.as_mut() { if !snap_ctx.scheduled { info!( "stale peer is persisting snapshot, will destroy next time"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), ); + snap_ctx.destroy_peer_after_apply = true; return None; } } @@ -1251,6 +1258,9 @@ where "region_id" => self.region_id, "peer_id" => self.peer.get_id(), ); + if let Some(snap_ctx) = self.apply_snap_ctx.as_mut() { + snap_ctx.destroy_peer_after_apply = true; + } return None; } @@ -1624,6 +1634,13 @@ where self.apply_snap_ctx.is_some() || self.get_store().is_applying_snapshot() } + #[inline] + pub fn should_destroy_after_apply_snapshot(&self) -> bool { + self.apply_snap_ctx + .as_ref() + .map_or(false, |ctx| ctx.destroy_peer_after_apply) + } + /// Returns `true` if the raft group has replicated a snapshot but not /// committed it yet. 
#[inline] @@ -2841,6 +2858,7 @@ where destroy_regions, for_witness, }), + destroy_peer_after_apply: false, }); if self.last_compacted_idx == 0 && last_first_index >= RAFT_INIT_LOG_INDEX { // There may be stale logs in raft engine, so schedule a task to clean it diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 80c73f03a16..df714ff7b09 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -210,8 +210,9 @@ fn test_stale_peer_destroy_when_apply_snapshot() { fail::remove(region_apply_snap_fp); // Wait for peer 3 changing `SnapState` sleep_ms(100); - cluster.sim.wl().send_raft_msg(tombstone_msg).unwrap(); + // we expect the peer would be destroyed after applying the snapshot without + // another message trigger must_get_none(&cluster.get_engine(3), b"k1"); } From ca7d7fc0721da54d90d62ec51cc25cf5915831c6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 8 Apr 2024 18:03:20 +0800 Subject: [PATCH 155/220] build: bump tikv pkg version (#16781) Signed-off-by: ti-chi-bot --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a229bfb1b5..902b492c715 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6528,7 +6528,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.5.1" +version = "7.5.2" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 8c94b3d8195..606f6f868e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.5.1" +version = "7.5.2" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 16636b943b1df65e294977c2b74bf8fc756bce11 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Thu, 11 Apr 2024 17:30:23 +0800 Subject: [PATCH 156/220] [cherry-pick-7.5] server: stop manual compaction jobs in engines before shutdown (#16761) 
close tikv/tikv#16680 Stop background manual compaction before shutdown server. Signed-off-by: lucasliang --- components/engine_panic/src/misc.rs | 8 +++ components/engine_rocks/src/misc.rs | 14 ++++- components/engine_traits/src/misc.rs | 6 ++ .../raftstore/src/store/worker/compact.rs | 59 ++++++++++++++++++- components/server/src/server.rs | 10 ++++ 5 files changed, 93 insertions(+), 4 deletions(-) diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 8da5c48d3e6..ab7201275df 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -75,6 +75,14 @@ impl MiscExt for PanicEngine { panic!() } + fn disable_manual_compaction(&self) -> Result<()> { + panic!() + } + + fn enable_manual_compaction(&self) -> Result<()> { + panic!() + } + fn pause_background_work(&self) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index b1406cacdb8..c043e9b2050 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -334,16 +334,26 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn disable_manual_compaction(&self) -> Result<()> { + self.as_inner().disable_manual_compaction(); + Ok(()) + } + + fn enable_manual_compaction(&self) -> Result<()> { + self.as_inner().enable_manual_compaction(); + Ok(()) + } + fn pause_background_work(&self) -> Result<()> { // This will make manual compaction return error instead of waiting. In practice // we might want to identify this case by parsing error message. 
- self.as_inner().disable_manual_compaction(); + self.disable_manual_compaction()?; self.as_inner().pause_bg_work(); Ok(()) } fn continue_background_work(&self) -> Result<()> { - self.as_inner().enable_manual_compaction(); + self.enable_manual_compaction()?; self.as_inner().continue_bg_work(); Ok(()) } diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 7871b3b8ecc..228e2cd501e 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -121,6 +121,12 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt + WriteBatchExt { fn sync_wal(&self) -> Result<()>; + /// Disable manual compactions, some on-going manual compactions may be + /// aborted. + fn disable_manual_compaction(&self) -> Result<()>; + + fn enable_manual_compaction(&self) -> Result<()>; + /// Depending on the implementation, some on-going manual compactions may be /// aborted. fn pause_background_work(&self) -> Result<()>; diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 3b2a2ec0404..96199884db6 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -277,8 +277,8 @@ mod tests { kv::{new_engine, new_engine_opt, KvTestEngine}, }; use engine_traits::{ - MiscExt, Mutable, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, + CompactExt, MiscExt, Mutable, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, + CF_RAFT, CF_WRITE, }; use keys::data_key; use tempfile::Builder; @@ -286,6 +286,61 @@ mod tests { use super::*; + #[test] + fn test_disable_manual_compaction() { + let path = Builder::new() + .prefix("test_disable_manual_compaction") + .tempdir() + .unwrap(); + let db = new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); + + // Generate the first SST file. 
+ let mut wb = db.write_batch(); + for i in 0..1000 { + let k = format!("key_{}", i); + wb.put_cf(CF_DEFAULT, k.as_bytes(), b"whatever content") + .unwrap(); + } + wb.write().unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + + // Generate another SST file that has the same content as the first SST file. + let mut wb = db.write_batch(); + for i in 0..1000 { + let k = format!("key_{}", i); + wb.put_cf(CF_DEFAULT, k.as_bytes(), b"whatever content") + .unwrap(); + } + wb.write().unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + + // Get the total SST files size. + let old_sst_files_size = db.get_total_sst_files_size_cf(CF_DEFAULT).unwrap().unwrap(); + + // Stop the assistant. + { + let _ = db.disable_manual_compaction(); + + // Manually compact range. + let _ = db.compact_range_cf(CF_DEFAULT, None, None, false, 1); + + // Get the total SST files size after compact range. + let new_sst_files_size = db.get_total_sst_files_size_cf(CF_DEFAULT).unwrap().unwrap(); + assert_eq!(old_sst_files_size, new_sst_files_size); + } + // Restart the assistant. + { + let _ = db.enable_manual_compaction(); + + // Manually compact range. + let _ = db.compact_range_cf(CF_DEFAULT, None, None, false, 1); + + // Get the total SST files size after compact range. + let new_sst_files_size = db.get_total_sst_files_size_cf(CF_DEFAULT).unwrap().unwrap(); + assert!(old_sst_files_size > new_sst_files_size); + } + } + #[test] fn test_compact_range() { let path = Builder::new() diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 976a4add68d..ee5e4dba72b 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1492,8 +1492,18 @@ where } } + fn prepare_stop(&self) { + if let Some(engines) = self.engines.as_ref() { + // Disable manual compaction jobs before shutting down the engines. And it + // will stop the compaction thread in advance, so it won't block the + // cleanup thread when exiting. 
+ let _ = engines.engines.kv.disable_manual_compaction(); + } + } + fn stop(self) { tikv_util::thread_group::mark_shutdown(); + self.prepare_stop(); let mut servers = self.servers.unwrap(); servers .server From c00d501be887a6f6293e7ccb08119cddd829a142 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 18 Apr 2024 18:48:07 +0800 Subject: [PATCH 157/220] test: use a temp dir to store the test cluster configuration (#16874) (#16878) close tikv/tikv#16871 Always create a temp dir as the test cluster's config path. This can avoid the online configs change the "common-test.toml" file which can impact other test cases. Signed-off-by: glorv Co-authored-by: glorv --- components/test_raftstore-v2/src/cluster.rs | 12 +++++---- components/test_raftstore-v2/src/util.rs | 5 ++-- components/test_raftstore/src/cluster.rs | 12 +++++---- components/test_raftstore/src/config.rs | 27 ++++++++++++++++++++- tests/integrations/import/util.rs | 10 ++------ 5 files changed, 44 insertions(+), 22 deletions(-) diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 53ff2c0f0b6..d7e369e3a4f 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -56,7 +56,7 @@ use test_raftstore::{ new_tikv_config_with_api_ver, new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, RawEngine, }; -use tikv::{server::Result as ServerResult, storage::config::EngineType}; +use tikv::{config::TikvConfig, server::Result as ServerResult, storage::config::EngineType}; use tikv_util::{ box_err, box_try, debug, error, future::block_on_timeout, @@ -405,10 +405,7 @@ impl, EK: KvEngine> Cluster { let mut tikv_cfg = new_tikv_config_with_api_ver(id, api_version); tikv_cfg.storage.engine = EngineType::RaftKv2; Cluster { - cfg: Config { - tikv: tikv_cfg, - prefer_mem: true, - }, + cfg: Config::new(tikv_cfg, true), count, tablet_registries: HashMap::default(), key_managers_map: 
HashMap::default(), @@ -431,6 +428,11 @@ impl, EK: KvEngine> Cluster { } } + pub fn set_cfg(&mut self, mut cfg: TikvConfig) { + cfg.cfg_path = self.cfg.tikv.cfg_path.clone(); + self.cfg.tikv = cfg; + } + pub fn id(&self) -> u64 { self.cfg.server.cluster_id } diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 315150e29c2..0efad0505e8 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -2,7 +2,6 @@ use std::{ fmt::Write, - path::Path, sync::Arc, thread, time::{Duration, Instant}, @@ -141,12 +140,12 @@ pub fn put_cf_till_size, EK: KvEngine>( } pub fn configure_for_encryption(config: &mut Config) { - let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let master_key = test_util::new_test_file_master_key(config.cfg_dir.as_ref().unwrap().path()); let cfg = &mut config.security.encryption; cfg.data_encryption_method = EncryptionMethod::Aes128Ctr; cfg.data_key_rotation_period = ReadableDuration(Duration::from_millis(100)); - cfg.master_key = test_util::new_test_file_master_key(manifest_dir); + cfg.master_key = master_key; } pub fn configure_for_snapshot(config: &mut Config) { diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index a08f858c031..1fbcb8c01b3 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -52,7 +52,7 @@ use raftstore::{ use resource_control::ResourceGroupManager; use tempfile::TempDir; use test_pd_client::TestPdClient; -use tikv::server::Result as ServerResult; +use tikv::{config::TikvConfig, server::Result as ServerResult}; use tikv_util::{ thread_group::GroupProperties, time::{Instant, ThreadReadId}, @@ -194,10 +194,7 @@ impl Cluster { // TODO: In the future, maybe it's better to test both case where // `use_delete_range` is true and false Cluster { - cfg: Config { - tikv: new_tikv_config_with_api_ver(id, api_version), - prefer_mem: true, - }, + 
cfg: Config::new(new_tikv_config_with_api_ver(id, api_version), true), leaders: HashMap::default(), count, paths: vec![], @@ -219,6 +216,11 @@ impl Cluster { } } + pub fn set_cfg(&mut self, mut cfg: TikvConfig) { + cfg.cfg_path = self.cfg.tikv.cfg_path.clone(); + self.cfg.tikv = cfg; + } + // To destroy temp dir later. pub fn take_path(&mut self) -> Vec { std::mem::take(&mut self.paths) diff --git a/components/test_raftstore/src/config.rs b/components/test_raftstore/src/config.rs index a86b8eb1bf0..001e304ece8 100644 --- a/components/test_raftstore/src/config.rs +++ b/components/test_raftstore/src/config.rs @@ -4,12 +4,37 @@ use std::ops::{Deref, DerefMut}; use tikv::config::TikvConfig; -#[derive(Clone)] pub struct Config { + // temp dir to store the persisted configuration. + // We use a temp dir to ensure the original `common-test.toml` won't be + // changed by online config. + pub cfg_dir: Option, pub tikv: TikvConfig, pub prefer_mem: bool, } +impl Config { + pub fn new(mut tikv: TikvConfig, prefer_mem: bool) -> Self { + let cfg_dir = test_util::temp_dir("test-cfg", prefer_mem); + tikv.cfg_path = cfg_dir.path().join("tikv.toml").display().to_string(); + Self { + cfg_dir: Some(cfg_dir), + tikv, + prefer_mem, + } + } +} + +impl Clone for Config { + fn clone(&self) -> Self { + Self { + cfg_dir: None, + tikv: self.tikv.clone(), + prefer_mem: self.prefer_mem, + } + } +} + impl Deref for Config { type Target = TikvConfig; #[inline] diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs index d8a11d50746..4f747feeb19 100644 --- a/tests/integrations/import/util.rs +++ b/tests/integrations/import/util.rs @@ -33,10 +33,7 @@ const CLEANUP_SST_MILLIS: u64 = 10; pub fn new_cluster(cfg: TikvConfig) -> (Cluster, Context) { let count = 1; let mut cluster = new_server_cluster(0, count); - cluster.cfg = Config { - tikv: cfg, - prefer_mem: true, - }; + cluster.set_cfg(cfg); cluster.run(); let region_id = 1; @@ -58,10 +55,7 @@ pub fn new_cluster_v2( ) { 
let count = 1; let mut cluster = test_raftstore_v2::new_server_cluster(0, count); - cluster.cfg = Config { - tikv: cfg, - prefer_mem: true, - }; + cluster.set_cfg(cfg); cluster.run(); let region_id = 1; From 516f190f36e6e1e487c3763f259f8f14e2d18fde Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 19 Apr 2024 15:03:08 +0800 Subject: [PATCH 158/220] cdc: add more metrics about output events queue time (#16281) (#16287) close tikv/tikv#16282 Signed-off-by: qupeng Co-authored-by: qupeng --- components/cdc/src/channel.rs | 20 ++++++++++++-------- components/cdc/src/initializer.rs | 8 ++++++-- components/cdc/src/metrics.rs | 12 ++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index b386c3561bb..af9caadd394 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -235,7 +235,7 @@ macro_rules! impl_from_future_send_error { impl_from_future_send_error! { FuturesSendError, - TrySendError<(CdcEvent, usize)>, + TrySendError<(Instant, CdcEvent, usize)>, } impl From for SendError { @@ -246,8 +246,8 @@ impl From for SendError { #[derive(Clone)] pub struct Sink { - unbounded_sender: UnboundedSender<(CdcEvent, usize)>, - bounded_sender: Sender<(CdcEvent, usize)>, + unbounded_sender: UnboundedSender<(Instant, CdcEvent, usize)>, + bounded_sender: Sender<(Instant, CdcEvent, usize)>, memory_quota: Arc, } @@ -258,7 +258,8 @@ impl Sink { if bytes != 0 { self.memory_quota.alloc(bytes)?; } - match self.unbounded_sender.unbounded_send((event, bytes)) { + let now = Instant::now_coarse(); + match self.unbounded_sender.unbounded_send((now, event, bytes)) { Ok(_) => Ok(()), Err(e) => { // Free quota if send fails. 
@@ -276,9 +277,11 @@ impl Sink { total_bytes += bytes; } self.memory_quota.alloc(total_bytes as _)?; + + let now = Instant::now_coarse(); for event in events { let bytes = event.size() as usize; - if let Err(e) = self.bounded_sender.feed((event, bytes)).await { + if let Err(e) = self.bounded_sender.feed((now, event, bytes)).await { // Free quota if send fails. self.memory_quota.free(total_bytes as _); return Err(SendError::from(e)); @@ -294,15 +297,16 @@ impl Sink { } pub struct Drain { - unbounded_receiver: UnboundedReceiver<(CdcEvent, usize)>, - bounded_receiver: Receiver<(CdcEvent, usize)>, + unbounded_receiver: UnboundedReceiver<(Instant, CdcEvent, usize)>, + bounded_receiver: Receiver<(Instant, CdcEvent, usize)>, memory_quota: Arc, } impl<'a> Drain { pub fn drain(&'a mut self) -> impl Stream + 'a { stream::select(&mut self.bounded_receiver, &mut self.unbounded_receiver).map( - |(mut event, size)| { + |(start, mut event, size)| { + CDC_EVENTS_PENDING_DURATION.observe(start.saturating_elapsed_secs() * 1000.0); if let CdcEvent::Barrier(ref mut barrier) = event { if let Some(barrier) = barrier.take() { // Unset barrier when it is received. diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index f06576941fc..53d4eadb332 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -1,5 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use api_version::ApiV2; use crossbeam::atomic::AtomicCell; @@ -38,7 +38,7 @@ use tikv_util::{ debug, defer, error, info, memory::MemoryQuota, sys::inspector::{self_thread_inspector, ThreadInspector}, - time::{Instant, Limiter}, + time::{duration_to_sec, Instant, Limiter}, warn, worker::Scheduler, Either, @@ -260,6 +260,7 @@ impl Initializer { fail_point!("cdc_incremental_scan_start"); let mut done = false; let start = Instant::now_coarse(); + let mut sink_time = Duration::default(); let curr_state = self.downstream_state.load(); assert!(matches!( @@ -282,7 +283,9 @@ impl Initializer { } debug!("cdc scan entries"; "len" => entries.len(), "region_id" => region_id); fail_point!("before_schedule_incremental_scan"); + let start_sink = Instant::now_coarse(); self.sink_scan_events(entries, done).await?; + sink_time += start_sink.saturating_elapsed(); } fail_point!("before_post_incremental_scan"); @@ -302,6 +305,7 @@ impl Initializer { } CDC_SCAN_DURATION_HISTOGRAM.observe(takes.as_secs_f64()); + CDC_SCAN_SINK_DURATION_HISTOGRAM.observe(duration_to_sec(sink_time)); Ok(()) } diff --git a/components/cdc/src/metrics.rs b/components/cdc/src/metrics.rs index 5db91572112..6bef4313959 100644 --- a/components/cdc/src/metrics.rs +++ b/components/cdc/src/metrics.rs @@ -88,6 +88,11 @@ lazy_static! { exponential_buckets(0.005, 2.0, 20).unwrap() ) .unwrap(); + pub static ref CDC_SCAN_SINK_DURATION_HISTOGRAM: Histogram = register_histogram!( + "tikv_cdc_scan_sink_duration_seconds", + "Bucketed histogram of cdc async scan sink time duration", + ) + .unwrap(); pub static ref CDC_SCAN_BYTES: IntCounter = register_int_counter!( "tikv_cdc_scan_bytes_total", "Total fetched bytes of CDC incremental scan" @@ -214,6 +219,13 @@ lazy_static! 
{ pub static ref CDC_ROCKSDB_PERF_COUNTER_STATIC: PerfCounter = auto_flush_from!(CDC_ROCKSDB_PERF_COUNTER, PerfCounter); + + pub static ref CDC_EVENTS_PENDING_DURATION: Histogram = register_histogram!( + "tikv_cdc_events_pending_duration", + "Pending duration for all events, in milliseconds", + exponential_buckets(0.01, 2.0, 17).unwrap(), + ) + .unwrap(); } thread_local! { From 46c2b1e4fa7bc9a71e9294770766c2f2b8bb8990 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 25 Apr 2024 17:27:11 +0800 Subject: [PATCH 159/220] expr: fix the wrong decimal's result frac in some cases (#16914) (#16919) close tikv/tikv#16913 Fix the wrong decimal's result frac in some cases. Signed-off-by: gengliqi Co-authored-by: gengliqi --- .../tidb_query_datatype/src/codec/convert.rs | 14 ++++++++++ .../src/codec/mysql/decimal.rs | 28 ++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 418841547ca..e4e2a3d2e8f 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -2354,6 +2354,13 @@ mod tests { ft.set_flen(flen); ft.set_decimal(decimal); let nd = produce_dec_with_specified_tp(&mut ctx, dec, &ft).unwrap(); + assert_eq!( + nd.frac_cnt(), + nd.result_frac_cnt(), + "frac_cnt {} is not equal to result_frac_cnt {}", + nd.frac_cnt(), + nd.result_frac_cnt() + ); assert_eq!(nd, want, "{}, {}, {}, {}, {}", dec, nd, want, flen, decimal); } } @@ -2765,6 +2772,13 @@ mod tests { match &expect { Ok(d) => { assert!(r.is_ok(), "{}", log); + assert_eq!( + d.frac_cnt(), + d.result_frac_cnt(), + "frac_cnt {} is not equal to result_frac_cnt {}", + d.frac_cnt(), + d.result_frac_cnt() + ); assert_eq!(&r.unwrap(), d, "{}", log); } Err(Error::Eval(..)) => { diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 
3a2be14758e..4aea3843604 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -979,7 +979,7 @@ impl Decimal { Decimal { int_cnt, frac_cnt, - result_frac_cnt: 0, + result_frac_cnt: frac_cnt, negative, word_buf: [0; 9], } @@ -1196,10 +1196,12 @@ impl Decimal { res.word_buf[idx as usize] = 0; } res.frac_cnt = frac as u8; + res.result_frac_cnt = res.frac_cnt; return res; } if frac >= res.frac_cnt as i8 { res.frac_cnt = frac as u8; + res.result_frac_cnt = res.frac_cnt; return res; } @@ -1342,6 +1344,7 @@ impl Decimal { dec.int_cnt = 1; dec.negative = false; dec.frac_cnt = cmp::max(0, frac) as u8; + dec.result_frac_cnt = dec.frac_cnt; for i in 0..idx { dec.word_buf[i as usize] = 0; } @@ -1355,6 +1358,7 @@ impl Decimal { dec.int_cnt += 1; } dec.frac_cnt = cmp::max(0, frac) as u8; + dec.result_frac_cnt = dec.frac_cnt; dec } @@ -1728,6 +1732,16 @@ impl Decimal { let len = word_cnt!(self.int_cnt) + word_cnt!(self.frac_cnt); self.word_buf[0..len as usize].iter().all(|&x| x == 0) } + + #[cfg(test)] + pub fn result_frac_cnt(&self) -> u8 { + self.result_frac_cnt + } + + #[cfg(test)] + pub fn frac_cnt(&self) -> u8 { + self.frac_cnt + } } macro_rules! 
enable_conv_for_int { @@ -2962,11 +2976,17 @@ mod tests { for (dec_str, scale, half_exp, trunc_exp, ceil_exp) in cases { let dec = dec_str.parse::().unwrap(); - let res = dec.round(scale, RoundMode::HalfEven).map(|d| d.to_string()); + let round_dec = dec.round(scale, RoundMode::HalfEven); + assert_eq!(round_dec.frac_cnt, round_dec.result_frac_cnt); + let res = round_dec.map(|d| d.to_string()); assert_eq!(res, half_exp.map(|s| s.to_owned())); - let res = dec.round(scale, RoundMode::Truncate).map(|d| d.to_string()); + let round_dec = dec.round(scale, RoundMode::Truncate); + assert_eq!(round_dec.frac_cnt, round_dec.result_frac_cnt); + let res = round_dec.map(|d| d.to_string()); assert_eq!(res, trunc_exp.map(|s| s.to_owned())); - let res = dec.round(scale, RoundMode::Ceiling).map(|d| d.to_string()); + let round_dec = dec.round(scale, RoundMode::Ceiling); + assert_eq!(round_dec.frac_cnt, round_dec.result_frac_cnt); + let res = round_dec.map(|d| d.to_string()); assert_eq!(res, ceil_exp.map(|s| s.to_owned())); } } From 3478895c2a700e4824bb41940260b6b28013275e Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Sun, 28 Apr 2024 16:13:27 +0800 Subject: [PATCH 160/220] resolved_ts: use smaller timeout when do check_leader (#16000) (#16928) close tikv/tikv#15999 Signed-off-by: ti-chi-bot Signed-off-by: crazycs520 Co-authored-by: crazycs Co-authored-by: crazycs520 --- components/backup-stream/src/endpoint.rs | 9 ++- .../backup-stream/src/subscription_manager.rs | 4 +- components/cdc/src/endpoint.rs | 2 +- components/resolved_ts/src/advance.rs | 66 ++++++++++++++++--- 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index f453469768c..770fafaaaaf 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1098,9 +1098,14 @@ where RT: CdcHandle + 'static, EK: KvEngine, { - pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> 
Vec { + pub async fn resolve( + &mut self, + regions: Vec, + min_ts: TimeStamp, + timeout: Option, + ) -> Vec { match self { - BackupStreamResolver::V1(x) => x.resolve(regions, min_ts).await, + BackupStreamResolver::V1(x) => x.resolve(regions, min_ts, timeout).await, BackupStreamResolver::V2(x, _) => { let x = x.clone(); resolve_by_raft(regions, min_ts, x).await diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 7aeecb775cc..df04521bbcb 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -471,7 +471,9 @@ where warn!("waiting for initial scanning done timed out, forcing progress!"; "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } - let regions = resolver.resolve(self.subs.current_regions(), min_ts).await; + let regions = resolver + .resolve(self.subs.current_regions(), min_ts, None) + .await; let cps = self.subs.resolve_with(min_ts, regions); let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); // If there isn't any region observed, the `min_ts` can be used as resolved ts diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index e62650c77c6..e583c97645e 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1172,7 +1172,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, min_ts: TimeStamp) -> Vec { + pub async fn resolve( + &mut self, + regions: Vec, + min_ts: TimeStamp, + timeout: Option, + ) -> Vec { if regions.is_empty() { return regions; } @@ -309,6 +316,8 @@ impl LeadershipResolver { .find(|req| !req.regions.is_empty()) .map_or(0, |req| req.regions[0].compute_size()); let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); + let timeout = get_min_timeout(timeout, DEFAULT_CHECK_LEADER_TIMEOUT_DURATION); + for (store_id, req) in store_req_map { if req.regions.is_empty() { continue; @@ -323,9 +332,16 @@ impl 
LeadershipResolver { let rpc = async move { PENDING_CHECK_LEADER_REQ_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); - let client = get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) - .await - .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; + let client = get_tikv_client( + to_store, + pd_client, + security_mgr, + env, + tikv_clients, + timeout, + ) + .await + .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; // Set min_ts in the request. req.set_ts(min_ts.into_inner()); @@ -356,7 +372,6 @@ impl LeadershipResolver { PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let resp = tokio::time::timeout(timeout, rpc) .map_err(|e| (to_store, true, format!("[timeout] {}", e))) .await? @@ -451,6 +466,11 @@ where resps.into_iter().flatten().collect::>() } +#[inline] +fn get_min_timeout(timeout: Option, default: Duration) -> Duration { + timeout.unwrap_or(default).min(default) +} + fn region_has_quorum(peers: &[Peer], stores: &[u64]) -> bool { let mut voters = 0; let mut incoming_voters = 0; @@ -507,6 +527,7 @@ async fn get_tikv_client( security_mgr: &SecurityManager, env: Arc, tikv_clients: &Mutex>, + timeout: Duration, ) -> pd_client::Result { { let clients = tikv_clients.lock().await; @@ -514,7 +535,6 @@ async fn get_tikv_client( return Ok(client); } } - let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let store = tokio::time::timeout(timeout, pd_client.get_store_async(store_id)) .await .map_err(|e| pd_client::Error::Other(Box::new(e))) @@ -653,19 +673,45 @@ mod tests { .region_read_progress .insert(2, Arc::new(progress2)); - leader_resolver.resolve(vec![1, 2], TimeStamp::new(1)).await; + leader_resolver + .resolve(vec![1, 2], TimeStamp::new(1), None) + .await; let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(req.regions.len(), 2); // Checking one region only send 
1 region in request. - leader_resolver.resolve(vec![1], TimeStamp::new(1)).await; + leader_resolver + .resolve(vec![1], TimeStamp::new(1), None) + .await; let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(req.regions.len(), 1); // Checking zero region does not send request. - leader_resolver.resolve(vec![], TimeStamp::new(1)).await; + leader_resolver + .resolve(vec![], TimeStamp::new(1), None) + .await; rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); let _ = server.shutdown().await; } + + #[test] + fn test_get_min_timeout() { + assert_eq!( + get_min_timeout(None, Duration::from_secs(5)), + Duration::from_secs(5) + ); + assert_eq!( + get_min_timeout(None, Duration::from_secs(2)), + Duration::from_secs(2) + ); + assert_eq!( + get_min_timeout(Some(Duration::from_secs(1)), Duration::from_secs(5)), + Duration::from_secs(1) + ); + assert_eq!( + get_min_timeout(Some(Duration::from_secs(20)), Duration::from_secs(5)), + Duration::from_secs(5) + ); + } } From ee5bd74cfec316736bf6abc03f22955f88d53e24 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 1 May 2024 23:13:03 +0800 Subject: [PATCH 161/220] server: change the log level to debug for cop error response (#15882) (#16927) ref tikv/tikv#15881 Change the coprocessor error response log level to DEBUG Signed-off-by: ti-chi-bot Signed-off-by: cfzjywxk Co-authored-by: cfzjywxk Co-authored-by: cfzjywxk Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/coprocessor/endpoint.rs | 114 +++++++++++++++--------------------- src/read_pool.rs | 4 ++ 2 files changed, 51 insertions(+), 67 deletions(-) diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 8504f92e1d1..001d1e94ca0 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -550,8 +550,9 @@ impl Endpoint { if let Err(busy_err) = self.read_pool.check_busy_threshold(Duration::from_millis( req.get_context().get_busy_threshold_ms() as u64, )) { - let mut resp = 
coppb::Response::default(); - resp.mut_region_error().set_server_is_busy(busy_err); + let mut pb_error = errorpb::Error::new(); + pb_error.set_server_is_busy(busy_err); + let resp = make_error_response(Error::Region(pb_error)); return Either::Left(async move { resp.into() }); } @@ -820,83 +821,62 @@ impl Endpoint { } } +macro_rules! make_error_response_common { + ($resp:expr, $tag:expr, $e:expr) => {{ + match $e { + Error::Region(e) => { + $tag = storage::get_tag_from_header(&e); + $resp.set_region_error(e); + } + Error::Locked(info) => { + $tag = "meet_lock"; + $resp.set_locked(info); + } + Error::DeadlineExceeded => { + $tag = "deadline_exceeded"; + let mut err = errorpb::Error::default(); + set_deadline_exceeded_busy_error(&mut err); + err.set_message($e.to_string()); + $resp.set_region_error(err); + } + Error::MaxPendingTasksExceeded => { + $tag = "max_pending_tasks_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason($e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message($e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + $resp.set_region_error(errorpb); + } + Error::Other(_) => { + $tag = "other"; + warn!("unexpected other error encountered processing coprocessor task"; + "error" => ?&$e, + ); + $resp.set_other_error($e.to_string()); + } + }; + COPR_REQ_ERROR.with_label_values(&[$tag]).inc(); + }}; +} + fn make_error_batch_response(batch_resp: &mut coppb::StoreBatchTaskResponse, e: Error) { - warn!( + debug!( "batch cop task error-response"; "err" => %e ); let tag; - match e { - Error::Region(e) => { - tag = storage::get_tag_from_header(&e); - batch_resp.set_region_error(e); - } - Error::Locked(info) => { - tag = "meet_lock"; - batch_resp.set_locked(info); - } - Error::DeadlineExceeded => { - tag = "deadline_exceeded"; - let mut err = errorpb::Error::default(); - set_deadline_exceeded_busy_error(&mut err); - err.set_message(e.to_string()); - 
batch_resp.set_region_error(err); - } - Error::MaxPendingTasksExceeded => { - tag = "max_pending_tasks_exceeded"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(e.to_string()); - let mut errorpb = errorpb::Error::default(); - errorpb.set_message(e.to_string()); - errorpb.set_server_is_busy(server_is_busy_err); - batch_resp.set_region_error(errorpb); - } - Error::Other(_) => { - tag = "other"; - batch_resp.set_other_error(e.to_string()); - } - }; - COPR_REQ_ERROR.with_label_values(&[tag]).inc(); + make_error_response_common!(batch_resp, tag, e); } fn make_error_response(e: Error) -> coppb::Response { - warn!( + debug!( "error-response"; "err" => %e ); - let mut resp = coppb::Response::default(); let tag; - match e { - Error::Region(e) => { - tag = storage::get_tag_from_header(&e); - resp.set_region_error(e); - } - Error::Locked(info) => { - tag = "meet_lock"; - resp.set_locked(info); - } - Error::DeadlineExceeded => { - tag = "deadline_exceeded"; - let mut err = errorpb::Error::default(); - set_deadline_exceeded_busy_error(&mut err); - err.set_message(e.to_string()); - resp.set_region_error(err); - } - Error::MaxPendingTasksExceeded => { - tag = "max_pending_tasks_exceeded"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(e.to_string()); - let mut errorpb = errorpb::Error::default(); - errorpb.set_message(e.to_string()); - errorpb.set_server_is_busy(server_is_busy_err); - resp.set_region_error(errorpb); - } - Error::Other(_) => { - tag = "other"; - resp.set_other_error(e.to_string()); - } - }; - COPR_REQ_ERROR.with_label_values(&[tag]).inc(); + let mut resp = coppb::Response::default(); + make_error_response_common!(resp, tag, e); resp } diff --git a/src/read_pool.rs b/src/read_pool.rs index fb44bcb4cc9..301ea648274 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -312,6 +312,10 @@ impl ReadPoolHandle { let mut busy_err = errorpb::ServerIsBusy::default(); 
busy_err.set_reason("estimated wait time exceeds threshold".to_owned()); busy_err.estimated_wait_ms = u32::try_from(estimated_wait.as_millis()).unwrap_or(u32::MAX); + warn!("Already many pending tasks in the read queue, task is rejected"; + "busy_threshold" => ?&busy_threshold, + "busy_err" => ?&busy_err, + ); Err(busy_err) } } From a0b1254aafa44634f4c96d339fb95f0180d1353f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 9 May 2024 23:25:08 +0800 Subject: [PATCH 162/220] expression: fix a `cast_string_to_decimal` truncate bug (#16963) (#16985) close tikv/tikv#16962 fix a `cast_string_to_decimal` truncate bug Signed-off-by: gengliqi Co-authored-by: gengliqi --- .../src/codec/mysql/decimal.rs | 106 ++++++++++++------ 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 4aea3843604..077c5f50365 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -1672,35 +1672,39 @@ impl Decimal { if inner_idx != 0 { d.word_buf[word_idx] = word * TEN_POW[DIGITS_PER_WORD as usize - inner_idx]; } - if end_idx < bs.len() && (bs[end_idx] == b'e' || bs[end_idx] == b'E') { - let exp = convert::bytes_to_int_without_context(&bs[end_idx + 1..])?; - if exp > i64::from(i32::MAX) / 2 { - return Ok(Res::Overflow(max_or_min_dec( - d.negative, - WORD_BUF_LEN * DIGITS_PER_WORD, - 0, - ))); - } - if exp < i64::from(i32::MIN) / 2 && !d.is_overflow() { - return Ok(Res::Truncated(Self::zero())); - } - if !d.is_overflow() { - let is_truncated = d.is_truncated(); - d = match d.unwrap().shift(exp as isize) { - Res::Overflow(v) => Res::Overflow(max_or_min_dec( - v.negative, + if end_idx < bs.len() { + if bs[end_idx] == b'e' || bs[end_idx] == b'E' { + let exp = convert::bytes_to_int_without_context(&bs[end_idx + 1..])?; + if exp > i64::from(i32::MAX) / 2 { + d = 
Res::Overflow(max_or_min_dec( + d.negative, WORD_BUF_LEN * DIGITS_PER_WORD, 0, - )), - Res::Ok(v) => { - if is_truncated { - Res::Truncated(v) - } else { - Res::Ok(v) + )); + } + if exp < i64::from(i32::MIN) / 2 && !d.is_overflow() { + d = Res::Truncated(Self::zero()); + } + if !d.is_overflow() { + let is_truncated = d.is_truncated(); + d = match d.unwrap().shift(exp as isize) { + Res::Overflow(v) => Res::Overflow(max_or_min_dec( + v.negative, + WORD_BUF_LEN * DIGITS_PER_WORD, + 0, + )), + Res::Ok(v) => { + if is_truncated { + Res::Truncated(v) + } else { + Res::Ok(v) + } } - } - res => res, - }; + res => res, + }; + } + } else if bs[end_idx..].iter().any(|c| !c.is_ascii_whitespace()) { + d = Res::Truncated(d.unwrap()); } } if d.word_buf.iter().all(|c| *c == 0) { @@ -2440,7 +2444,7 @@ mod tests { use super::{DEFAULT_DIV_FRAC_INCR, WORD_BUF_LEN, *}; use crate::{ - codec::error::ERR_DATA_OUT_OF_RANGE, + codec::error::*, expr::{EvalConfig, Flag}, }; @@ -2997,8 +3001,8 @@ mod tests { let cases = vec![ (WORD_BUF_LEN, b"12345" as &'static [u8], Res::Ok("12345")), (WORD_BUF_LEN, b"12345.", Res::Ok("12345")), - (WORD_BUF_LEN, b"123.45.", Res::Ok("123.45")), - (WORD_BUF_LEN, b"-123.45.", Res::Ok("-123.45")), + (WORD_BUF_LEN, b"123.45.", Res::Truncated("123.45")), + (WORD_BUF_LEN, b"-123.45.", Res::Truncated("-123.45")), ( WORD_BUF_LEN, b".00012345000098765", @@ -3046,8 +3050,11 @@ mod tests { (WORD_BUF_LEN, b"2.2E-1", Res::Ok("0.22")), (WORD_BUF_LEN, b"2.23E2", Res::Ok("223")), (WORD_BUF_LEN, b"2.23E2abc", Res::Ok("223")), - (WORD_BUF_LEN, b"2.23a2", Res::Ok("2.23")), - (WORD_BUF_LEN, b"223\xE0\x80\x80", Res::Ok("223")), + (WORD_BUF_LEN, b"2.23a2", Res::Truncated("2.23")), + (WORD_BUF_LEN, b"223\xE0\x80\x80", Res::Truncated("223")), + (WORD_BUF_LEN, b"223 ", Res::Ok("223")), + (WORD_BUF_LEN, b"223.2 ", Res::Ok("223.2")), + (WORD_BUF_LEN, b"223.2 .", Res::Truncated("223.2")), (WORD_BUF_LEN, b"1e -1", Res::Ok("0.1")), (WORD_BUF_LEN, b"1e001", Res::Ok("10")), (WORD_BUF_LEN, 
b"1e00", Res::Ok("1")), @@ -3748,19 +3755,20 @@ mod tests { #[test] fn test_bytes_to_decimal() { + let mut ctx = EvalContext::default(); let cases: Vec<(&[u8], Decimal)> = vec![ ( b"123456.1", - ConvertTo::::convert(&123456.1, &mut EvalContext::default()).unwrap(), + ConvertTo::::convert(&123456.1, &mut ctx).unwrap(), ), ( b"-123456.1", - ConvertTo::::convert(&-123456.1, &mut EvalContext::default()).unwrap(), + ConvertTo::::convert(&-123456.1, &mut ctx).unwrap(), ), (b"123456", Decimal::from(123456)), (b"-123456", Decimal::from(-123456)), + (b"1 ", Decimal::from(1)), ]; - let mut ctx = EvalContext::default(); for (s, expect) in cases { let got: Decimal = s.convert(&mut ctx).unwrap(); assert_eq!(got, expect, "from {:?}, expect: {} got: {}", s, expect, got); @@ -3779,6 +3787,36 @@ mod tests { assert_eq!(val, max, "expect: {}, got: {}", val, max); assert_eq!(ctx.warnings.warning_cnt, 1); assert_eq!(ctx.warnings.warnings[0].get_code(), ERR_DATA_OUT_OF_RANGE); + + // Truncate cases + let truncate_cases: Vec<(&[u8], Decimal)> = vec![ + ( + b"123.45.", + ConvertTo::::convert(&123.45, &mut ctx).unwrap(), + ), + ( + b"-123.45.", + ConvertTo::::convert(&-123.45, &mut ctx).unwrap(), + ), + ( + b"1.1.1.1.1", + ConvertTo::::convert(&1.1, &mut ctx).unwrap(), + ), + (b"1asf", Decimal::from(1)), + (b"1 1", Decimal::from(1)), + ]; + for (s, expect) in truncate_cases { + let val: Result = s.convert(&mut ctx); + assert!(val.is_err(), "expected error, but got {:?}", val); + assert_eq!(val.unwrap_err().code(), WARN_DATA_TRUNCATED); + + let mut truncate_as_warning_ctx = EvalContext::new(std::sync::Arc::new( + EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING), + )); + let got: Decimal = s.convert(&mut truncate_as_warning_ctx).unwrap(); + assert_eq!(got, expect, "from {:?}, expect: {} got: {}", s, expect, got); + assert_eq!(truncate_as_warning_ctx.warnings.warning_cnt, 1); + } } #[test] From 353aa6865c5d77e5b22fb759e17055edb92df548 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 10 
May 2024 14:08:39 +0800 Subject: [PATCH 163/220] expression: fix overflow panic in `conv` (#16970) (#16980) close tikv/tikv#16969 fix overflow panic in `conv` Signed-off-by: gengliqi Co-authored-by: gengliqi Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/tidb_query_expr/src/impl_math.rs | 74 ++++++++++++++++----- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index beeeef288b4..f9f790d7d17 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -387,16 +387,20 @@ pub fn conv(n: BytesRef, from_base: &Int, to_base: &Int) -> Result let s = s.trim(); let from_base = IntWithSign::from_int(*from_base); let to_base = IntWithSign::from_int(*to_base); - Ok(if is_valid_base(from_base) && is_valid_base(to_base) { + if is_valid_base(from_base) && is_valid_base(to_base) { if let Some((num_str, is_neg)) = extract_num_str(s, from_base) { - let num = extract_num(num_str.as_ref(), is_neg, from_base); - Some(num.format_to_base(to_base).into_bytes()) + match extract_num(num_str.as_ref(), is_neg, from_base) { + Some(num) => Ok(Some(num.format_to_base(to_base).into_bytes())), + None => { + Err(Error::overflow("BIGINT UNSIGNED", format!("conv({})", num_str)).into()) + } + } } else { - Some(b"0".to_vec()) + Ok(Some(b"0".to_vec())) } } else { - None - }) + Ok(None) + } } #[inline] @@ -566,7 +570,9 @@ impl IntWithSign { // Shrink num to fit the boundary of i64. fn shrink_from_signed_uint(num: u64, is_neg: bool) -> IntWithSign { let value = if is_neg { - num.min(-Int::min_value() as u64) + // Avoid int64 overflow error. 
+ // -int64_min = int64_max + 1 + num.min(Int::max_value() as u64 + 1) } else { num.min(Int::max_value() as u64) }; @@ -594,7 +600,8 @@ impl IntWithSign { let IntWithSign(value, is_neg) = self; let IntWithSign(to_base, should_ignore_sign) = to_base; let mut real_val = value as i64; - if is_neg && !should_ignore_sign { + // real_val > 0 is to avoid overflow issue when value is -int64_min. + if is_neg && !should_ignore_sign && real_val > 0 { real_val = -real_val; } let mut ret = IntWithSign::format_radix(real_val as u64, to_base as u32); @@ -629,14 +636,17 @@ fn extract_num_str(s: &str, from_base: IntWithSign) -> Option<(String, bool)> { } } -fn extract_num(num_s: &str, is_neg: bool, from_base: IntWithSign) -> IntWithSign { +fn extract_num(num_s: &str, is_neg: bool, from_base: IntWithSign) -> Option { let IntWithSign(from_base, signed) = from_base; - let value = u64::from_str_radix(num_s, from_base as u32).unwrap(); - if signed { + let value = match u64::from_str_radix(num_s, from_base as u32) { + Ok(v) => v, + Err(_) => return None, + }; + Some(if signed { IntWithSign::shrink_from_signed_uint(value, is_neg) } else { IntWithSign::from_signed_uint(value, is_neg) - } + }) } // Returns (isize, is_positive): convert an i64 to usize, and whether the input @@ -1605,6 +1615,18 @@ mod tests { ("+", 10, 8, "0"), ("-", 10, 8, "0"), ("", 2, 16, "0"), + ( + "18446744073709551615", + 10, + 2, + "1111111111111111111111111111111111111111111111111111111111111111", + ), + ( + "-18446744073709551615", + -10, + 2, + "1000000000000000000000000000000000000000000000000000000000000000", + ), ]; for (n, f, t, e) in tests { let n = Some(n.as_bytes().to_vec()); @@ -1621,17 +1643,37 @@ mod tests { } let invalid_tests = vec![ - (None, Some(10), Some(10), None), - (Some(b"a6a".to_vec()), Some(1), Some(8), None), + (None, Some(10), Some(10)), + (Some(b"111".to_vec()), None, Some(7)), + (Some(b"112".to_vec()), Some(10), None), + (None, None, None), + (Some(b"222".to_vec()), Some(2), Some(100)), + 
(Some(b"333".to_vec()), Some(37), Some(2)), + (Some(b"a6a".to_vec()), Some(1), Some(8)), ]; - for (n, f, t, e) in invalid_tests { + for (n, f, t) in invalid_tests { let got = RpnFnScalarEvaluator::new() .push_param(n) .push_param(f) .push_param(t) .evaluate::(ScalarFuncSig::Conv) .unwrap(); - assert_eq!(got, e); + assert_eq!(got, None); + } + + let error_tests = vec![ + ("18446744073709551616", Some(10), Some(10)), + ("100000000000000000001", Some(10), Some(8)), + ("-18446744073709551616", Some(-10), Some(4)), + ]; + for (n, f, t) in error_tests { + let n = Some(n.as_bytes().to_vec()); + let got = RpnFnScalarEvaluator::new() + .push_param(n) + .push_param(f) + .push_param(t) + .evaluate::(ScalarFuncSig::Conv); + got.unwrap_err(); } } From f2be3c0b9f0e60b619dade22410979ca91f4d85a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 14 May 2024 19:02:42 +0800 Subject: [PATCH 164/220] *: fix issue of stale peer block resolve-ts cause by ignore gc message (#16505) (#16932) close tikv/tikv#16504 Fix issue of stale peer block resolve-ts cause by ignore gc message. 
Signed-off-by: crazycs520 Signed-off-by: cfzjywxk Co-authored-by: crazycs520 Co-authored-by: cfzjywxk Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/peer.rs | 3 +- components/test_raftstore/src/cluster.rs | 28 ++++++++ tests/integrations/raftstore/test_life.rs | 84 +++++++++++++++++++++- 3 files changed, 112 insertions(+), 3 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 4d7d752dd90..d4a130ba01d 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3150,11 +3150,12 @@ where return; } - if self.fsm.peer.peer != *msg.get_to_peer() { + if self.fsm.peer.peer.get_id() != msg.get_to_peer().get_id() { info!( "receive stale gc message, ignore."; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "to_peer_id" => msg.get_to_peer().get_id(), ); self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return; diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 1fbcb8c01b3..912b61e041a 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1303,6 +1303,34 @@ impl Cluster { ); } + pub fn wait_peer_role(&self, region_id: u64, store_id: u64, peer_id: u64, role: PeerRole) { + for _ in 0..100 { + if let Some(state) = self + .get_engine(store_id) + .get_msg_cf::( + engine_traits::CF_RAFT, + &keys::region_state_key(region_id), + ) + .unwrap() + { + let peer = state + .get_region() + .get_peers() + .iter() + .find(|p| p.get_id() == peer_id) + .unwrap(); + if peer.role == role { + return; + } + } + sleep_ms(10); + } + panic!( + "[region {}] peer state still not reach {:?}", + region_id, role + ); + } + pub fn wait_last_index( &mut self, region_id: u64, diff --git a/tests/integrations/raftstore/test_life.rs b/tests/integrations/raftstore/test_life.rs index 
809904c7f46..0d01799f534 100644 --- a/tests/integrations/raftstore/test_life.rs +++ b/tests/integrations/raftstore/test_life.rs @@ -5,9 +5,16 @@ use std::{ time::Duration, }; -use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}; +use kvproto::{ + metapb::PeerRole::Learner, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}, +}; +use raft::{eraftpb::ConfChangeType, prelude::MessageType}; use raftstore::errors::Result; -use test_raftstore::{new_learner_peer, new_peer, Filter, FilterFactory, Simulator as S1}; +use test_raftstore::{ + new_admin_request, new_change_peer_request, new_learner_peer, new_peer, Direction, Filter, + FilterFactory, RegionPacketFilter, Simulator as S1, +}; use test_raftstore_v2::Simulator as S2; use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; @@ -205,3 +212,76 @@ fn test_gc_removed_peer() { Duration::from_millis(200) )); } + +#[test] +fn test_gc_peer_with_conf_change() { + let mut cluster = test_raftstore::new_node_cluster(0, 5); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + cluster.must_transfer_leader(region_id, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + let mut region_epoch = cluster.get_region_epoch(region_id); + + // Create a learner peer 4 on store 4. + let extra_store_id = 4; + let extra_peer_id = 4; + let cc = new_change_peer_request( + ConfChangeType::AddLearnerNode, + new_learner_peer(extra_store_id, extra_peer_id), + ); + let req = new_admin_request(region_id, ®ion_epoch, cc); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + region_epoch.conf_ver += 1; + cluster.wait_peer_state(region_id, 4, PeerState::Normal); + + // Isolate peer 4 from other region peers. 
+ let left_filter = RegionPacketFilter::new(region_id, extra_store_id) + .direction(Direction::Recv) + .skip(MessageType::MsgHup); + cluster + .sim + .wl() + .add_recv_filter(extra_store_id, Box::new(left_filter)); + + // Change peer 4 to voter. + let cc = new_change_peer_request( + ConfChangeType::AddNode, + new_peer(extra_store_id, extra_peer_id), + ); + let req = new_admin_request(region_id, ®ion_epoch, cc); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + region_epoch.conf_ver += 1; + + // Remove peer 4 from region 1. + let cc = new_change_peer_request( + ConfChangeType::RemoveNode, + new_peer(extra_store_id, extra_peer_id), + ); + let req = new_admin_request(region_id, ®ion_epoch, cc); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + region_epoch.conf_ver += 1; + + // GC peer 4 using Voter peer state, peer 4 is learner because it's isolated. + cluster.wait_peer_role(region_id, extra_store_id, extra_peer_id, Learner); + let mut gc_msg = RaftMessage::default(); + gc_msg.set_region_id(region_id); + gc_msg.set_from_peer(new_peer(1, 1)); + gc_msg.set_to_peer(new_peer(4, 4)); + gc_msg.set_region_epoch(region_epoch); + gc_msg.set_is_tombstone(true); + cluster.send_raft_msg(gc_msg).unwrap(); + cluster.wait_peer_state(region_id, 4, PeerState::Tombstone); +} From 31c050d3b90893ff5ad87c7c95b730a3eae317b7 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Mon, 20 May 2024 11:46:15 -0700 Subject: [PATCH 165/220] rocksdb: Expose track-and-verify-wals-in-manifest config (#16546) (#16784) ref tikv/tikv#16549 Expose track-and-verify-wals-in-manifest config. For further investigating corrupted WAL issue happened during EBS restore process. 
Signed-off-by: v01dstar Signed-off-by: Yang Zhang Co-authored-by: Jinpeng Zhang Co-authored-by: tonyxuqqi --- components/engine_panic/src/db_options.rs | 4 ++++ components/engine_rocks/src/db_options.rs | 4 ++++ components/engine_traits/src/db_options.rs | 1 + src/config/mod.rs | 4 ++++ 4 files changed, 13 insertions(+) diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index 05147ca06fb..0753bb7e0fc 100644 --- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -59,6 +59,10 @@ impl DbOptions for PanicDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { panic!() } + + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool) { + panic!() + } } pub struct PanicTitanDbOptions; diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index 38587663084..c95f81f8297 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -120,6 +120,10 @@ impl DbOptions for RocksDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } + + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool) { + self.0.set_track_and_verify_wals_in_manifest(v) + } } pub struct RocksTitanDbOptions(RawTitanDBOptions); diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index 9713c406978..60cacb1f76f 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -24,6 +24,7 @@ pub trait DbOptions { fn get_flush_size(&self) -> Result; fn set_flush_oldest_first(&mut self, f: bool) -> Result<()>; fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool); } /// Titan-specefic options diff --git a/src/config/mod.rs b/src/config/mod.rs index 1a83cc25d3d..786c73cf6cc 100644 --- 
a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1296,6 +1296,8 @@ pub struct DbConfig { #[doc(hidden)] #[serde(skip_serializing)] pub write_buffer_flush_oldest_first: bool, + #[online_config(skip)] + pub track_and_verify_wals_in_manifest: bool, // Dangerous option only for programming use. #[online_config(skip)] #[serde(skip)] @@ -1360,6 +1362,7 @@ impl Default for DbConfig { write_buffer_limit: None, write_buffer_stall_ratio: 0.0, write_buffer_flush_oldest_first: true, + track_and_verify_wals_in_manifest: false, paranoid_checks: None, defaultcf: DefaultCfConfig::default(), writecf: WriteCfConfig::default(), @@ -1536,6 +1539,7 @@ impl DbConfig { // Historical stats are not used. opts.set_stats_persist_period_sec(0); } + opts.set_track_and_verify_wals_in_manifest(self.track_and_verify_wals_in_manifest); opts } From a6b4f248a873e7851f3bca2f485b4477bb92fe48 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 21 May 2024 10:26:45 +0800 Subject: [PATCH 166/220] txn: Fix the issue that CheckTxnStatus didn't make rollback on optimistic transaction's primary protected, which may break transaction atomicity (#16621) (#16954) close tikv/tikv#16620 Fix the issue that CheckTxnStatus didn't make rollback on optimistic transaction's primary protected, which may break transaction atomicity Signed-off-by: ti-chi-bot Signed-off-by: MyonKeminta Co-authored-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Co-authored-by: MyonKeminta Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/resolved_ts/src/cmd.rs | 31 ++++++++- src/storage/mvcc/txn.rs | 16 +++-- src/storage/txn/actions/check_txn_status.rs | 4 +- src/storage/txn/actions/cleanup.rs | 5 +- src/storage/txn/commands/check_txn_status.rs | 73 ++++++++++++++++++-- 5 files changed, 114 insertions(+), 15 deletions(-) diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 47d14304112..ec88bf45149 100644 --- 
a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -143,7 +143,8 @@ pub(crate) fn decode_write(key: &[u8], value: &[u8], is_apply: bool) -> Option = engine .take_last_modifies() @@ -398,6 +405,26 @@ mod tests { commit_ts: None, write_type: WriteType::Rollback, }, + ChangeRow::Prewrite { + key: k1.clone(), + start_ts: 6.into(), + value: Some(b"v4".to_vec()), + lock_type: LockType::Put, + }, + ChangeRow::Commit { + key: k1.clone(), + start_ts: Some(6.into()), + commit_ts: Some(7.into()), + write_type: WriteType::Put, + }, + ChangeRow::Prewrite { + key: k1.clone(), + start_ts: 7.into(), + value: Some(b"v5".to_vec()), + lock_type: LockType::Put, + }, + // Rollback of the txn@start_ts=7 will be missing as overlapped rollback is not + // hanlded. ]; assert_eq!(rows, expected); diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index a446ef64d22..08f72bf777a 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -543,8 +543,10 @@ pub(crate) mod tests { // Rollback lock must_rollback(&mut engine, k, 15, false); - // Rollbacks of optimistic transactions needn't be protected - must_get_rollback_protected(&mut engine, k, 15, false); + // Rollbacks of optimistic transactions need to be protected + // TODO: Re-check how the test can be better written after refinement of + // `must_rollback`'s semantics. + must_get_rollback_protected(&mut engine, k, 15, true); } #[test] @@ -896,16 +898,20 @@ pub(crate) mod tests { #[test] fn test_collapse_prev_rollback() { let mut engine = TestEngineBuilder::new().build().unwrap(); - let (key, value) = (b"key", b"value"); + let (key, pk, value) = (b"key", b"pk", b"value"); + + // Worked around the problem that `must_rollback` always protects primary lock + // by setting different PK. + // TODO: Cover primary when working on https://github.com/tikv/tikv/issues/16625 // Add a Rollback whose start ts is 1. 
- must_prewrite_put(&mut engine, key, value, key, 1); + must_prewrite_put(&mut engine, key, value, pk, 1); must_rollback(&mut engine, key, 1, false); must_get_rollback_ts(&mut engine, key, 1); // Add a Rollback whose start ts is 2, the previous Rollback whose // start ts is 1 will be collapsed. - must_prewrite_put(&mut engine, key, value, key, 2); + must_prewrite_put(&mut engine, key, value, pk, 2); must_rollback(&mut engine, key, 2, false); must_get_none(&mut engine, key, 2); must_get_rollback_ts(&mut engine, key, 2); diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 6e786aec5fa..8ec8f438f21 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -322,8 +322,8 @@ pub fn rollback_lock( txn.delete_value(key.clone(), lock.ts); } - // Only the primary key of a pessimistic transaction needs to be protected. - let protected: bool = is_pessimistic_txn && key.is_encoded_from(&lock.primary); + // The primary key of a transaction needs to be protected. + let protected: bool = key.is_encoded_from(&lock.primary); if let Some(write) = make_rollback(reader.start_ts, protected, overlapped_write) { txn.put_write(key.clone(), reader.start_ts, write.as_ref().to_bytes()); } diff --git a/src/storage/txn/actions/cleanup.rs b/src/storage/txn/actions/cleanup.rs index 5ed77d4fab3..d28368aa1bf 100644 --- a/src/storage/txn/actions/cleanup.rs +++ b/src/storage/txn/actions/cleanup.rs @@ -223,8 +223,9 @@ pub mod tests { // TTL expired. The lock should be removed. 
must_succeed(&mut engine, k, ts(10, 0), ts(120, 0)); must_unlocked(&mut engine, k); - // Rollbacks of optimistic transactions needn't be protected - must_get_rollback_protected(&mut engine, k, ts(10, 0), false); + // Rollbacks of optimistic transactions need to be protected + // See: https://github.com/tikv/tikv/issues/16620 + must_get_rollback_protected(&mut engine, k, ts(10, 0), true); must_get_rollback_ts(&mut engine, k, ts(10, 0)); // Rollbacks of primary keys in pessimistic transactions should be protected diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 9e9a6cc0895..ce5774c0db6 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -159,7 +159,9 @@ impl WriteCommand for CheckTxnStatus { #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::{self, Context, LockInfo, PrewriteRequestPessimisticAction::*}; + use kvproto::kvrpcpb::{ + self, Context, LockInfo, PrewriteRequestPessimisticAction::*, WriteConflictReason, + }; use tikv_util::deadline::Deadline; use txn_types::{Key, LastChange, WriteType}; @@ -168,7 +170,7 @@ pub mod tests { kv::Engine, lock_manager::MockLockManager, mvcc, - mvcc::tests::*, + mvcc::{tests::*, ErrorInner}, txn::{ self, actions::acquire_pessimistic_lock::tests::acquire_pessimistic_lock_allow_lock_with_conflict, @@ -224,7 +226,12 @@ pub mod tests { ) .unwrap(); if let ProcessResult::TxnStatus { txn_status } = result.pr { - assert!(status_pred(txn_status)); + let formatted_txn_status = format!("{:?}", txn_status); + assert!( + status_pred(txn_status), + "txn_status returned by check_txn_status ({}) doesn't pass the check", + formatted_txn_status + ); } else { unreachable!(); } @@ -414,7 +421,7 @@ pub mod tests { |s| s == TtlExpire, ); must_unlocked(&mut engine, b"k1"); - must_get_rollback_protected(&mut engine, b"k1", 1, false); + must_get_rollback_protected(&mut engine, b"k1", 1, 
true); // case 2: primary is prewritten (pessimistic) must_acquire_pessimistic_lock(&mut engine, b"k2", b"k2", 15, 15); @@ -829,6 +836,7 @@ pub mod tests { ts(20, 0), WriteType::Rollback, ); + must_get_rollback_protected(&mut engine, k, ts(20, 0), true); // Push the min_commit_ts of pessimistic locks. must_acquire_pessimistic_lock_for_large_txn(&mut engine, k, k, ts(4, 0), ts(130, 0), 200); @@ -1437,4 +1445,61 @@ pub mod tests { ) .unwrap_err(); } + + #[test] + fn test_check_txn_status_rollback_optimistic() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let k = b"k1"; + let (v1, v2) = (b"v1", b"v2"); + + let ts = TimeStamp::compose; + + must_prewrite_put_async_commit(&mut engine, k, v1, k, &Some(vec![]), ts(1, 0), ts(1, 1)); + must_commit(&mut engine, k, ts(1, 0), ts(2, 0)); + + must_prewrite_put(&mut engine, k, v2, k, ts(2, 0)); + assert!(!must_have_write(&mut engine, k, ts(2, 0)).has_overlapped_rollback); + + must_success( + &mut engine, + k, + ts(2, 0), + ts(3, 0), + ts(3, 0), + true, + false, + false, + |s| s == TtlExpire, + ); + must_get_overlapped_rollback( + &mut engine, + k, + ts(2, 0), + ts(1, 0), + WriteType::Put, + Some(0.into()), + ); + + let e = must_prewrite_put_err(&mut engine, k, v2, k, ts(2, 0)); + match &*e.0 { + ErrorInner::WriteConflict { + start_ts, + conflict_start_ts, + conflict_commit_ts, + key, + primary, + reason, + } => { + assert_eq!(*start_ts, ts(2, 0)); + assert_eq!(*conflict_start_ts, ts(1, 0)); + assert_eq!(*conflict_commit_ts, ts(2, 0)); + assert_eq!(key.as_slice(), k); + assert_eq!(primary.as_slice(), k); + assert_eq!(*reason, WriteConflictReason::SelfRolledBack); + } + e => { + panic!("unexpected error: {:?}", e); + } + } + } } From 45eb612973b86b397260aded22f7e1d110fcb2ec Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 22 May 2024 11:22:16 +0800 Subject: [PATCH 167/220] logger: fix thread_id in log (#16399) (#16401) close tikv/tikv#16398 The current "thread_id" in the log is always 0x5. 
This is because: 1)TiKV logs asynchronously by sending all log records to a dedicated thread called "slogger", which is the fifth thread spawned by TiKV; and 2) "thread_id" is evaluated lazily by the "slogger" thread. To fix this issue, this commit obtains the "thread_id" before sending it to the "slogger" thread. Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Co-authored-by: tonyxuqqi --- components/tikv_util/src/logger/mod.rs | 281 +++++++++++-------------- 1 file changed, 123 insertions(+), 158 deletions(-) diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index c321f56a1b5..1eb82ced3e2 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -6,7 +6,6 @@ mod formatter; use std::{ env, fmt, io::{self, BufWriter}, - num::NonZeroU64, path::{Path, PathBuf}, sync::{ atomic::{AtomicUsize, Ordering}, @@ -16,10 +15,7 @@ use std::{ }; use log::{self, SetLoggerError}; -use slog::{ - self, slog_o, Drain, FnValue, Key, OwnedKV, OwnedKVList, PushFnValue, Record, - SendSyncRefUnwindSafeKV, KV, -}; +use slog::{self, slog_o, Drain, FnValue, Key, OwnedKVList, PushFnValue, Record, KV}; pub use slog::{FilterFn, Level}; use slog_async::{Async, AsyncGuard, OverflowStrategy}; use slog_term::{Decorator, PlainDecorator, RecordDecorator}; @@ -76,6 +72,24 @@ where } }; + fn build_log_drain( + drain: I, + threshold: u64, + filter: impl FilterFn, + ) -> impl Drain + where + I: Drain, + { + let drain = SlowLogFilter { + threshold, + inner: drain, + }; + let drain = ThreadIDrain(drain); + // Let GlobalLevelFilter wrap ThreadIDrain, so that it saves getting + // thread id for flittered logs. 
+ GlobalLevelFilter::new(drain.filter(filter).fuse()) + } + let (logger, guard) = if use_async { let (async_log, guard) = Async::new(LogAndFuse(drain)) .chan_size(SLOG_CHANNEL_SIZE) @@ -83,21 +97,12 @@ where .thread_name(thd_name!("slogger")) .build_with_guard(); let drain = async_log.fuse(); - let drain = SlowLogFilter { - threshold: slow_threshold, - inner: drain, - }; - let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - - (slog::Logger::root(filtered, get_values()), Some(guard)) + let drain = build_log_drain(drain, slow_threshold, filter); + (slog::Logger::root(drain, slog_o!()), Some(guard)) } else { let drain = LogAndFuse(Mutex::new(drain)); - let drain = SlowLogFilter { - threshold: slow_threshold, - inner: drain, - }; - let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - (slog::Logger::root(filtered, get_values()), None) + let drain = build_log_drain(drain, slow_threshold, filter); + (slog::Logger::root(drain, slog_o!()), None) }; set_global_logger(level, init_stdlog, logger, guard) @@ -632,16 +637,22 @@ fn write_log_fields( Ok(()) } -fn format_thread_id(thread_id: NonZeroU64) -> String { - format!("{:#0x}", thread_id) -} +struct ThreadIDrain(pub D); -fn get_values() -> OwnedKV { - slog_o!( - "thread_id" => FnValue(|_| { - format_thread_id(std::thread::current().id().as_u64()) - }) - ) +impl Drain for ThreadIDrain +where + D: Drain, +{ + type Ok = D::Ok; + type Err = D::Err; + fn log(&self, record: &Record<'_>, values: &OwnedKVList) -> Result { + let values = slog::o!( + "thread_id" => std::thread::current().id().as_u64().get(), + // OwnedKVList is essentially an Arc, clone is cheap. 
+ values.clone(), + ); + self.0.log(record, &OwnedKVList::from(values)) + } } struct Serializer<'a> { @@ -695,7 +706,7 @@ impl<'a> slog::Serializer for Serializer<'a> { #[cfg(test)] mod tests { - use std::{cell::RefCell, io, io::Write, str::from_utf8, sync::RwLock, time::Duration}; + use std::{cell::RefCell, io, io::Write, str::from_utf8, sync::Arc, time::Duration}; use chrono::DateTime; use regex::Regex; @@ -704,19 +715,13 @@ mod tests { use super::*; - // Due to the requirements of `Logger::root*` on a writer with a 'static - // lifetime we need to make a Thread Local, - // and implement a custom writer. - thread_local! { - static BUFFER: RefCell> = RefCell::new(Vec::new()); - } - struct TestWriter; + struct TestWriter(Arc>>); impl Write for TestWriter { fn write(&mut self, buf: &[u8]) -> io::Result { - BUFFER.with(|buffer| buffer.borrow_mut().write(buf)) + self.0.lock().unwrap().write(buf) } fn flush(&mut self) -> io::Result<()> { - BUFFER.with(|buffer| buffer.borrow_mut().flush()) + self.0.lock().unwrap().flush() } } @@ -775,121 +780,125 @@ mod tests { #[test] fn test_log_format_text() { - let decorator = PlainSyncDecorator::new(TestWriter); + let buffer: Arc>> = Arc::default(); + let decorator = PlainSyncDecorator::new(TestWriter(buffer.clone())); let drain = TikvFormat::new(decorator, true).fuse(); - let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); + let drain = ThreadIDrain(drain); + let drain = slog::Logger::root_typed(drain, slog_o!("raft_id" => 1)).into_erased(); + let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); log_format_cases(logger); - let thread_id = format_thread_id(std::thread::current().id().as_u64()); + let thread_id = std::thread::current().id().as_u64(); let expect = format!( - r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome 
TiKV"] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] [thread_id={0}] -[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] [thread_id={0}] -[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] [thread_id={0}] -[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] [thread_id={0}] + r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome TiKV"] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] 
[DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] [raft_id=1] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] [raft_id=1] [thread_id={0}] +[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] [raft_id=1] [thread_id={0}] "#, thread_id ); - BUFFER.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&buffer).unwrap(); - assert_eq!(output.lines().count(), expect.lines().count()); - - let re = Regex::new(r"(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); - - for (output_line, expect_line) in output.lines().zip(expect.lines()) { - let expect_segments = re.captures(expect_line).unwrap(); - let output_segments = re.captures(output_line).unwrap(); + let buffer = buffer.lock().unwrap(); + let output = from_utf8(&buffer).unwrap(); + assert_eq!( + output.lines().count(), + expect.lines().count(), + "{}\n===\n{}", + output, + expect + ); - validate_log_datetime(peel(&output_segments["datetime"])); + let re = Regex::new(r"(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); - assert!(validate_log_source_file( - peel(&expect_segments["source_file"]), - peel(&output_segments["source_file"]) - )); - assert_eq!(expect_segments["level"], output_segments["level"]); - assert_eq!(expect_segments["msg"], output_segments["msg"]); - assert_eq!( - expect_segments.name("kvs").map(|s| s.as_str()), - output_segments.name("kvs").map(|s| s.as_str()) - ); - } - buffer.clear(); - 
}); + for (output_line, expect_line) in output.lines().zip(expect.lines()) { + let expect_segments = re.captures(expect_line).unwrap(); + let output_segments = re.captures(output_line).unwrap(); + + validate_log_datetime(peel(&output_segments["datetime"])); + + assert!(validate_log_source_file( + peel(&expect_segments["source_file"]), + peel(&output_segments["source_file"]) + )); + assert_eq!(expect_segments["level"], output_segments["level"]); + assert_eq!(expect_segments["msg"], output_segments["msg"]); + assert_eq!( + expect_segments.name("kvs").map(|s| s.as_str()), + output_segments.name("kvs").map(|s| s.as_str()) + ); + } } #[test] fn test_log_format_json() { use serde_json::{from_str, Value}; - let drain = Mutex::new(json_format(TestWriter, true)).map(slog::Fuse); - let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); + let buffer: Arc>> = Arc::default(); + let drain = Mutex::new(json_format(TestWriter(buffer.clone()), true)).map(slog::Fuse); + let drain = ThreadIDrain(drain); + let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); log_format_cases(logger); - let thread_id = format_thread_id(std::thread::current().id().as_u64()); + let thread_id = std::thread::current().id().as_u64(); let expect = format!( - r#"{{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":"","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch 
URL","backoff":"3s","attempt":3,"url":"http://example.com","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\"","thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null,"thread_id":"{0}"}} -{{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote","thread_id":"{0}"}} + r#"{{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":"","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to 
\"fetch\" [URL]: http://example.com","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\"","thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null,"thread_id":{0}}} +{{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote","thread_id":{0}}} "#, thread_id ); - BUFFER.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&buffer).unwrap(); - assert_eq!(output.lines().count(), expect.lines().count()); + let buffer = buffer.lock().unwrap(); + let output = from_utf8(&buffer).unwrap(); + assert_eq!(output.lines().count(), expect.lines().count()); - for (output_line, expect_line) in output.lines().zip(expect.lines()) { - let mut expect_json = from_str::(expect_line).unwrap(); - let mut output_json = from_str::(output_line).unwrap(); + for (output_line, expect_line) in output.lines().zip(expect.lines()) { + let mut expect_json = from_str::(expect_line).unwrap(); + let mut output_json = from_str::(output_line).unwrap(); - validate_log_datetime(output_json["time"].take().as_str().unwrap()); - // Remove time field to bypass timestamp mismatch. - let _ = expect_json["time"].take(); + validate_log_datetime(output_json["time"].take().as_str().unwrap()); + // Remove time field to bypass timestamp mismatch. 
+ let _ = expect_json["time"].take(); - validate_log_source_file( - output_json["caller"].take().as_str().unwrap(), - expect_json["caller"].take().as_str().unwrap(), - ); + validate_log_source_file( + output_json["caller"].take().as_str().unwrap(), + expect_json["caller"].take().as_str().unwrap(), + ); - assert_eq!(expect_json, output_json); - } - buffer.clear(); - }); + assert_eq!(expect_json, output_json); + } } #[test] fn test_global_level_filter() { - let decorator = PlainSyncDecorator::new(TestWriter); + let buffer: Arc>> = Arc::default(); + let decorator = PlainSyncDecorator::new(TestWriter(buffer.clone())); let drain = TikvFormat::new(decorator, true).fuse(); let logger = slog::Logger::root_typed(GlobalLevelFilter::new(drain), slog_o!()).into_erased(); let expected = "[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:871] [Welcome]\n"; let check_log = |log: &str| { - BUFFER.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&buffer).unwrap(); - // only check the log len here as some field like timestamp, location may - // change. - assert_eq!(output.len(), log.len()); - buffer.clear(); - }); + let mut buffer = buffer.lock().unwrap(); + let output = from_utf8(&buffer).unwrap(); + // only check the log len here as some field like timestamp, location may + // change. 
+ assert_eq!(output.len(), log.len()); + buffer.clear(); }; set_log_level(Level::Info); @@ -1096,48 +1105,4 @@ mod tests { } }); } - - static THREAD_SAFE_BUFFER: RwLock> = RwLock::new(Vec::new()); - - struct ThreadSafeWriter; - impl Write for ThreadSafeWriter { - fn write(&mut self, data: &[u8]) -> io::Result { - let mut buffer = THREAD_SAFE_BUFFER.write().unwrap(); - buffer.write(data) - } - - fn flush(&mut self) -> io::Result<()> { - let mut buffer = THREAD_SAFE_BUFFER.write().unwrap(); - buffer.flush() - } - } - - #[test] - fn test_threadid() { - let drain = TikvFormat::new(PlainSyncDecorator::new(ThreadSafeWriter), true).fuse(); - let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); - - slog_info!(logger, "Hello from the first thread"); - let this_threadid = thread::current().id().as_u64(); - let this_threadid = format_thread_id(this_threadid); - - let handle = thread::spawn(move || { - slog_info!(logger, "Hello from the second thread"); - }); - let other_threadid = handle.thread().id().as_u64(); - let other_threadid = format_thread_id(other_threadid); - handle.join().unwrap(); - - let expected = vec![this_threadid, other_threadid]; - - let re = Regex::new(r"\[thread_id=(.*?)\]").unwrap(); - let buffer = THREAD_SAFE_BUFFER.read().unwrap(); - let output = from_utf8(&buffer).unwrap(); - let actual: Vec<&str> = output - .lines() - .map(|line| re.captures(line).unwrap()) - .map(|captures| captures.get(1).unwrap().as_str()) - .collect(); - assert_eq!(expected, actual); - } } From 0df42b777bc0113cf2d9fe0732a5d642965840d5 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 22 May 2024 14:07:17 +0800 Subject: [PATCH 168/220] *: remove unnecessary async blocks to save memory (#16541) (#16669) close tikv/tikv#16540 *: enable linters about async and futures We should be pedantic about writing async code, as it's easy to write suboptimal or even bloat code. 
See: https://github.com/rust-lang/rust/issues/69826 *: remove unnecessary async blocks to save memory This commit favors FutureExt::map over async blocks to mitigate the issue of async block doubled memory usage. Through the sysbench oltp_read_only test, it was observed that this adjustment resulted in approximately 26% reduction in memory usage. See: https://github.com/rust-lang/rust/issues/59087 Signed-off-by: Neil Shen Co-authored-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../tikv_util/src/yatp_pool/future_pool.rs | 17 ++++++++++------ src/read_pool.rs | 20 +++++++++---------- src/storage/txn/sched_pool.rs | 8 +------- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 2deead30580..c65219ffbb1 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -13,6 +13,7 @@ use std::{ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; +use futures_util::future::FutureExt; use prometheus::{IntCounter, IntGauge}; use tracker::TrackedFuture; use yatp::{queue::Extras, task::future}; @@ -179,11 +180,13 @@ impl PoolInner { metrics_running_task_count.inc(); - let f = async move { - let _ = future.await; + // NB: Prefer FutureExt::map to async block, because an async block + // doubles memory usage. + // See https://github.com/rust-lang/rust/issues/59087 + let f = future.map(move |_| { metrics_handled_task_count.inc(); metrics_running_task_count.dec(); - }; + }); if let Some(extras) = extras { self.pool.spawn(future::TaskCell::new(f, extras)); @@ -208,12 +211,14 @@ impl PoolInner { let (tx, rx) = oneshot::channel(); metrics_running_task_count.inc(); - self.pool.spawn(async move { - let res = future.await; + // NB: Prefer FutureExt::map to async block, because an async block + // doubles memory usage. 
+ // See https://github.com/rust-lang/rust/issues/59087 + self.pool.spawn(future.map(move |res| { metrics_handled_task_count.inc(); metrics_running_task_count.dec(); let _ = tx.send(res); - }); + })); Ok(rx) } } diff --git a/src/read_pool.rs b/src/read_pool.rs index 301ea648274..da251c78aa0 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -12,7 +12,10 @@ use std::{ }; use file_system::{set_io_type, IoType}; -use futures::{channel::oneshot, future::TryFutureExt}; +use futures::{ + channel::oneshot, + future::{FutureExt, TryFutureExt}, +}; use kvproto::{errorpb, kvrpcpb::CommandPri}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use prometheus::{core::Metric, Histogram, IntCounter, IntGauge}; @@ -171,10 +174,9 @@ impl ReadPoolHandle { TaskCell::new( TrackedFuture::new(with_resource_limiter( ControlledFuture::new( - async move { - f.await; + f.map(move |_| { running_tasks.dec(); - }, + }), resource_ctl.clone(), group_name, ), @@ -184,10 +186,9 @@ impl ReadPoolHandle { ) } else { TaskCell::new( - TrackedFuture::new(async move { - f.await; + TrackedFuture::new(f.map(move |_| { running_tasks.dec(); - }), + })), extras, ) }; @@ -211,10 +212,9 @@ impl ReadPoolHandle { { let (tx, rx) = oneshot::channel::(); let res = self.spawn( - async move { - let res = f.await; + f.map(move |res| { let _ = tx.send(res); - }, + }), priority, task_id, metadata, diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 8674a581c72..70d54c24cfa 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -131,13 +131,7 @@ impl PriorityQueue { extras.set_metadata(metadata.to_vec()); self.worker_pool.spawn_with_extras( with_resource_limiter( - ControlledFuture::new( - async move { - f.await; - }, - self.resource_ctl.clone(), - group_name, - ), + ControlledFuture::new(f, self.resource_ctl.clone(), group_name), resource_limiter, ), extras, From a222e72215b8a9d44ff73beaac9c81272fc458fa Mon Sep 17 00:00:00 
2001 From: Ti Chi Robot Date: Wed, 22 May 2024 17:43:17 +0800 Subject: [PATCH 169/220] log_backup: make a more rusty `CallbackWaitGroup` (#16740) (#16757) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#16739 This make `CallbackWaitGroup` returns an equivalent future of the `BoxFuture` returned by `wait`. Also this fixed where a stale notify may also resolve the future. Signed-off-by: ti-chi-bot Signed-off-by: Yu Juncen Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Yu Juncen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup-stream/src/endpoint.rs | 4 +- .../backup-stream/src/subscription_manager.rs | 18 ++-- components/backup-stream/src/utils.rs | 94 +++++++++++-------- 3 files changed, 66 insertions(+), 50 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 770fafaaaaf..ed27b09d5c5 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -61,7 +61,7 @@ use crate::{ subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, subscription_track::{Ref, RefMut, ResolveResult, SubscriptionTracer}, try_send, - utils::{self, CallbackWaitGroup, StopWatch, Work}, + utils::{self, FutureWaitGroup, StopWatch, Work}, }; const SLOW_EVENT_THRESHOLD: f64 = 120.0; @@ -1060,7 +1060,7 @@ where } pub fn do_backup(&self, events: Vec) { - let wg = CallbackWaitGroup::new(); + let wg = FutureWaitGroup::new(); for batch in events { self.backup_batch(batch, wg.clone().work()); } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index df04521bbcb..a959029f8a8 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -32,7 +32,7 @@ use crate::{ router::{Router, TaskSelector}, 
subscription_track::{CheckpointType, ResolveResult, SubscriptionTracer}, try_send, - utils::{self, CallbackWaitGroup, Work}, + utils::{self, FutureWaitGroup, Work}, Task, }; @@ -295,7 +295,7 @@ pub struct RegionSubscriptionManager { messenger: Sender, scan_pool_handle: Arc, - scans: Arc, + scans: Arc, } impl Clone for RegionSubscriptionManager @@ -316,7 +316,7 @@ where subs: self.subs.clone(), messenger: self.messenger.clone(), scan_pool_handle: self.scan_pool_handle.clone(), - scans: CallbackWaitGroup::new(), + scans: FutureWaitGroup::new(), } } } @@ -375,7 +375,7 @@ where subs: initial_loader.tracing, messenger: tx, scan_pool_handle: Arc::new(scan_pool_handle), - scans: CallbackWaitGroup::new(), + scans: FutureWaitGroup::new(), }; let fut = op.clone().region_operator_loop(rx, resolver); (op, fut) @@ -392,8 +392,10 @@ where } /// wait initial scanning get finished. - pub fn wait(&self, timeout: Duration) -> future![bool] { - tokio::time::timeout(timeout, self.scans.wait()).map(|result| result.is_err()) + pub async fn wait(&self, timeout: Duration) -> bool { + tokio::time::timeout(timeout, self.scans.wait()) + .map(move |result| result.is_err()) + .await } /// the handler loop. 
@@ -759,7 +761,7 @@ mod test { use std::time::Duration; use super::ScanCmd; - use crate::{subscription_manager::spawn_executors, utils::CallbackWaitGroup}; + use crate::{subscription_manager::spawn_executors, utils::FutureWaitGroup}; fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { let (tx, rx) = futures::channel::oneshot::channel(); @@ -776,7 +778,7 @@ mod test { } let pool = spawn_executors(NoopInitialScan, 1); - let wg = CallbackWaitGroup::new(); + let wg = FutureWaitGroup::new(); fail::cfg("execute_scan_command_sleep_100", "return").unwrap(); for _ in 0..100 { let wg = wg.clone(); diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 974b1762cf2..6cf5fd80d9b 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -5,20 +5,21 @@ use std::{ borrow::Borrow, cell::RefCell, collections::{hash_map::RandomState, BTreeMap, HashMap}, + future::Future, ops::{Bound, RangeBounds}, path::Path, sync::{ atomic::{AtomicUsize, Ordering}, Arc, }, - task::Context, + task::{Context, Waker}, time::Duration, }; use async_compression::{tokio::write::ZstdEncoder, Level}; use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{ready, task::Poll, FutureExt}; +use futures::{ready, task::Poll}; use kvproto::{ brpb::CompressionType, metapb::Region, @@ -37,13 +38,12 @@ use tikv_util::{ use tokio::{ fs::File, io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}, - sync::{oneshot, Mutex, RwLock}, + sync::{Mutex, RwLock}, }; use txn_types::{Key, Lock, LockType}; use crate::{ errors::{Error, Result}, - metadata::store::BoxFuture, router::TaskSelector, Task, }; @@ -378,47 +378,65 @@ pub fn should_track_lock(l: &Lock) -> bool { } } -pub struct CallbackWaitGroup { +pub struct FutureWaitGroup { running: AtomicUsize, - on_finish_all: std::sync::Mutex>>, + wakers: std::sync::Mutex>, } -impl CallbackWaitGroup { +pub struct 
Work(Arc); + +impl Drop for Work { + fn drop(&mut self) { + self.0.work_done(); + } +} + +pub struct WaitAll<'a>(&'a FutureWaitGroup); + +impl<'a> Future for WaitAll<'a> { + type Output = (); + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + // Fast path: nothing to wait. + let running = self.0.running.load(Ordering::SeqCst); + if running == 0 { + return Poll::Ready(()); + } + + // <1> + let mut callbacks = self.0.wakers.lock().unwrap(); + callbacks.push(cx.waker().clone()); + let running = self.0.running.load(Ordering::SeqCst); + // Unlikely path: if all background tasks finish at <1>, there will be a long + // period that nobody will wake the `wakers` even the condition is ready. + // We need to help ourselves here. + if running == 0 { + callbacks.drain(..).for_each(|w| w.wake()); + } + Poll::Pending + } +} + +impl FutureWaitGroup { pub fn new() -> Arc { Arc::new(Self { running: AtomicUsize::new(0), - on_finish_all: std::sync::Mutex::default(), + wakers: Default::default(), }) } fn work_done(&self) { let last = self.running.fetch_sub(1, Ordering::SeqCst); if last == 1 { - self.on_finish_all - .lock() - .unwrap() - .drain(..) - .for_each(|x| x()) + self.wakers.lock().unwrap().drain(..).for_each(|x| { + x.wake(); + }) } } /// wait until all running tasks done. - pub fn wait(&self) -> BoxFuture<()> { - // Fast path: no uploading. - if self.running.load(Ordering::SeqCst) == 0 { - return Box::pin(futures::future::ready(())); - } - - let (tx, rx) = oneshot::channel(); - self.on_finish_all.lock().unwrap().push(Box::new(move || { - // The waiter may timed out. - let _ = tx.send(()); - })); - // try to acquire the lock again. 
- if self.running.load(Ordering::SeqCst) == 0 { - return Box::pin(futures::future::ready(())); - } - Box::pin(rx.map(|_| ())) + pub fn wait(&self) -> WaitAll<'_> { + WaitAll(self) } /// make a work, as long as the return value held, mark a work in the group @@ -429,14 +447,6 @@ impl CallbackWaitGroup { } } -pub struct Work(Arc); - -impl Drop for Work { - fn drop(&mut self) { - self.0.work_done(); - } -} - struct ReadThroughputRecorder { // The system tool set. ins: Option, @@ -812,7 +822,7 @@ mod test { use kvproto::metapb::{Region, RegionEpoch}; use tokio::io::{AsyncWriteExt, BufReader}; - use crate::utils::{is_in_range, CallbackWaitGroup, SegmentMap}; + use crate::utils::{is_in_range, FutureWaitGroup, SegmentMap}; #[test] fn test_redact() { @@ -921,8 +931,8 @@ mod test { } fn run_case(c: Case) { + let wg = FutureWaitGroup::new(); for i in 0..c.repeat { - let wg = CallbackWaitGroup::new(); let cnt = Arc::new(AtomicUsize::new(c.bg_task)); for _ in 0..c.bg_task { let cnt = cnt.clone(); @@ -933,7 +943,7 @@ mod test { }); } block_on(tokio::time::timeout(Duration::from_secs(20), wg.wait())).unwrap(); - assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i); + assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i,); } } @@ -950,6 +960,10 @@ mod test { bg_task: 512, repeat: 1, }, + Case { + bg_task: 16, + repeat: 10000, + }, Case { bg_task: 2, repeat: 100000, From 85bf8b1e52a94b5e2e133cd26b3b4dacfb31f6e7 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 23 May 2024 16:13:17 +0800 Subject: [PATCH 170/220] tikv-ctl: add region state to `raft region` command output (#17038) (#17052) close tikv/tikv#17037 add region state to `raft region` command output Signed-off-by: glorv Co-authored-by: glorv Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- cmd/tikv-ctl/src/executor.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 3e4e505a32a..80b1898032d 100644 --- 
a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -240,6 +240,7 @@ pub trait DebugExecutor { json!({ "region": json!({ "id": r.get_id(), + "state": format!("{:?}", s.get_state()), "start_key": hex::encode_upper(r.get_start_key()), "end_key": hex::encode_upper(r.get_end_key()), "region_epoch": json!({ From 429bf11b0561fc50d0d9fa7e2c874f91a6d53f44 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 23 May 2024 16:46:17 +0800 Subject: [PATCH 171/220] log-backup: Fix flush invalid ts (#16832) (#16861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#16809 Added more sync before flushing. Signed-off-by: Yu Juncen Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Yu Juncen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../backup-stream/src/checkpoint_manager.rs | 125 +++++++++++++++--- components/backup-stream/src/endpoint.rs | 76 ++++++----- .../backup-stream/src/subscription_manager.rs | 1 + .../backup-stream/tests/failpoints/mod.rs | 91 ++++++++++++- components/backup-stream/tests/suite.rs | 13 +- 5 files changed, 246 insertions(+), 60 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index d32c2ea7c00..fabd8695c19 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -14,7 +14,7 @@ use kvproto::{ metapb::Region, }; use pd_client::PdClient; -use tikv_util::{box_err, defer, info, time::Instant, warn, worker::Scheduler}; +use tikv_util::{box_err, defer, info, warn, worker::Scheduler}; use txn_types::TimeStamp; use uuid::Uuid; @@ -35,6 +35,7 @@ use crate::{ #[derive(Default)] pub struct CheckpointManager { checkpoint_ts: HashMap, + frozen_resolved_ts: HashMap, resolved_ts: HashMap, manager_handle: Option>, } @@ -185,15 +186,29 @@ impl CheckpointManager { sub.main_loop() } + 
/// update the "dynamic" part, which is `resolved_ts`. + /// We call it "dynamic" because the data corresponding to the incoming data + /// part (in contrast of the flushing data part which is about to be write + /// to external storage and cannot be appended.) pub fn resolve_regions(&mut self, region_and_checkpoint: Vec) { for res in region_and_checkpoint { self.do_update(res.region, res.checkpoint); } } - pub fn flush(&mut self) { - info!("log backup checkpoint manager flushing."; "resolved_ts_len" => %self.resolved_ts.len(), "resolved_ts" => ?self.get_resolved_ts()); - self.checkpoint_ts = std::mem::take(&mut self.resolved_ts); + /// flush the `frozen_resolved_ts` to `checkpoint_ts`, and notify the + /// subscribers, with a possible final update to the checkpoint ts. + /// You may provide some extra resolve result from the `last_dive` argument. + /// They will be applied directly to the final checkpoint ts. It is the + /// caller's duty to make sure the resolve result is safe (i.e. All events + /// are surely flushed.) + pub fn flush_and_notify(&mut self, last_dive: Vec) { + info!("Notifying the flush result."; "last_dive_len" => last_dive.len()); + for rr in last_dive { + Self::update_ts(&mut self.frozen_resolved_ts, rr.region, rr.checkpoint); + } + // Replace the storage directly with the content of this run. + self.checkpoint_ts = std::mem::take(&mut self.frozen_resolved_ts); // Clippy doesn't know this iterator borrows `self.checkpoint_ts` :( #[allow(clippy::needless_collect)] let items = self @@ -205,6 +220,38 @@ impl CheckpointManager { self.notify(items.into_iter()); } + /// "freeze" the current resolved ts to the checkpoint ts. + /// This is usually called before we are going to flush and after freezing + /// the current batch of mutations. + /// + /// When a flush of the data collector triggered: + /// + /// ```text + /// ----------------------|-----------------> + /// ^^^ + /// Flushing data-+|+- Incoming data. 
+ /// | + /// Flush Freeze Tempfiles + /// ``` + /// + /// Resolving over incoming data shouldn't advance the checkpoint of the + /// flushing data. So the current progress should be "freezed" when we are + /// about to flush. + pub fn freeze(&mut self) { + info!("log backup checkpoint manager freezing."; + "resolved_ts_len" => %self.resolved_ts.len(), + "resolved_ts" => ?self.get_resolved_ts(), + "frozen" => self.checkpoint_ts.len(), + ); + self.frozen_resolved_ts = std::mem::take(&mut self.resolved_ts); + } + + #[cfg(test)] + fn freeze_and_flush(&mut self) { + self.freeze(); + self.flush_and_notify(vec![]); + } + /// update a region checkpoint in need. #[cfg(test)] fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { @@ -224,6 +271,15 @@ impl CheckpointManager { e.and_modify(|old_cp| { let old_ver = old_cp.region.get_region_epoch().get_version(); let checkpoint_is_newer = old_cp.checkpoint < checkpoint; + if !checkpoint_is_newer { + warn!("received older checkpoint, maybe region merge."; + "region_id" => old_cp.region.get_id(), + "old_ver" => old_ver, + "new_ver" => ver, + "old_checkpoint" => old_cp.checkpoint.into_inner(), + "new_checkpoint" => checkpoint.into_inner() + ); + } if old_ver < ver || (old_ver == ver && checkpoint_is_newer) { *old_cp = LastFlushTsOfRegion { checkpoint, @@ -474,7 +530,6 @@ pub struct CheckpointV3FlushObserver { checkpoints: Vec, global_checkpoint_cache: HashMap, - start_time: Instant, } impl CheckpointV3FlushObserver { @@ -486,7 +541,6 @@ impl CheckpointV3FlushObserver { // We almost always have only one entry. 
global_checkpoint_cache: HashMap::with_capacity(1), baseline, - start_time: Instant::now(), } } } @@ -521,12 +575,9 @@ where } async fn after(&mut self, task: &str, _rts: u64) -> Result<()> { - let resolve_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolved { - checkpoints: std::mem::take(&mut self.checkpoints), - start_time: self.start_time, - }); - let flush_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Flush); - try_send!(self.sched, resolve_task); + let flush_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::FlushWith( + std::mem::take(&mut self.checkpoints), + )); try_send!(self.sched, flush_task); let global_checkpoint = self.get_checkpoint(task).await?; @@ -685,7 +736,7 @@ pub mod tests { .unwrap(); mgr.resolve_regions(vec![simple_resolve_result()]); - mgr.flush(); + mgr.freeze_and_flush(); mgr.sync_with_subs_mgr(|_| {}); assert_eq!(trivial_sink.0.lock().unwrap().items.len(), 1); } @@ -703,7 +754,7 @@ pub mod tests { rt.block_on(mgr.add_subscriber(error_sink.clone())).unwrap(); mgr.resolve_regions(vec![simple_resolve_result()]); - mgr.flush(); + mgr.freeze_and_flush(); assert_eq!(mgr.sync_with_subs_mgr(|item| { item.subscribers.len() }), 0); let sink = error_sink.0.lock().unwrap(); assert_eq!(sink.items.len(), 0); @@ -721,12 +772,12 @@ pub mod tests { let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); - mgr.flush(); + mgr.freeze_and_flush(); let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 8); let r = mgr.get_from_region(RegionIdWithVersion::new(2, 35)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. 
} if checkpoint.into_inner() == 16); - mgr.flush(); + mgr.freeze_and_flush(); let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); } @@ -758,6 +809,48 @@ pub mod tests { assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); } + #[test] + fn test_mgr_freeze() { + let mut mgr = super::CheckpointManager::default(); + mgr.resolve_regions(vec![ + ResolveResult { + region: region(1, 32, 8), + checkpoint: TimeStamp::new(8), + checkpoint_type: CheckpointType::MinTs, + }, + ResolveResult { + region: region(2, 34, 8), + checkpoint: TimeStamp::new(15), + checkpoint_type: CheckpointType::MinTs, + }, + ]); + + // Freezed + mgr.freeze(); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 34)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + // Shouldn't be recorded to resolved ts. + mgr.resolve_regions(vec![ResolveResult { + region: region(1, 32, 8), + checkpoint: TimeStamp::new(16), + checkpoint_type: CheckpointType::MinTs, + }]); + + // Flush done, should be able to be queried. 
+ mgr.flush_and_notify(vec![ResolveResult { + region: region(2, 34, 8), + checkpoint: TimeStamp::new(17), + checkpoint_type: CheckpointType::MinTs, + }]); + + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 34)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 17); + } + pub struct MockPdClient { safepoint: RwLock>, } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index ed27b09d5c5..ae47741ffa2 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -39,7 +39,7 @@ use tikv_util::{ use tokio::{ io::Result as TokioResult, runtime::{Handle, Runtime}, - sync::{oneshot, Semaphore}, + sync::Semaphore, }; use tokio_stream::StreamExt; use txn_types::TimeStamp; @@ -91,7 +91,7 @@ pub struct Endpoint { // We holds the config before, even it is useless for now, // however probably it would be useful in the future. config: BackupStreamConfig, - checkpoint_mgr: CheckpointManager, + pub checkpoint_mgr: CheckpointManager, // Runtime status: /// The handle to abort last save storage safe point. 
@@ -493,6 +493,11 @@ where let total_size = kvs.size(); metrics::HEAP_MEMORY .add(total_size as _); + #[cfg(feature = "failpoints")] + tokio::time::sleep(Duration::from_millis((|| { + fail::fail_point!("log_backup_batch_delay", |val| val.and_then( |x| x.parse::().ok()).unwrap_or(0)); + 0 + })())).await; utils::handle_on_event_result(&sched, router.on_events(kvs).await); metrics::HEAP_MEMORY .sub(total_size as _); @@ -758,29 +763,11 @@ where } } - fn get_resolved_regions(&self, min_ts: TimeStamp) -> future![Result] { - let (tx, rx) = oneshot::channel(); - let op = self.region_operator.clone(); - async move { - let req = ObserveOp::ResolveRegions { - callback: Box::new(move |rs| { - let _ = tx.send(rs); - }), - min_ts, - }; - op.request(req).await; - rx.await - .map_err(|err| annotate!(err, "failed to send request for resolve regions")) - } - } - - fn do_flush(&self, task: String, min_ts: TimeStamp) -> future![Result<()>] { - let get_rts = self.get_resolved_regions(min_ts); + fn do_flush(&self, task: String, mut resolved: ResolvedRegions) -> future![Result<()>] { let router = self.range_router.clone(); let store_id = self.store_id; let mut flush_ob = self.flush_observer(); async move { - let mut resolved = get_rts.await?; let mut new_rts = resolved.global_checkpoint(); fail::fail_point!("delay_on_flush"); flush_ob.before(resolved.take_resolve_result()).await; @@ -809,20 +796,37 @@ where // This should only happen in testing, it would be to unwrap... 
let _ = info.unwrap().set_flushing_status_cas(false, true); let mts = self.prepare_min_ts().await; - try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); + let sched = self.scheduler.clone(); + self.region_operator + .request(ObserveOp::ResolveRegions { + callback: Box::new(move |res| { + try_send!(sched, Task::ExecFlush(task, res)); + }), + min_ts: mts, + }) + .await; }); } pub fn on_flush(&self, task: String) { self.pool.block_on(async move { let mts = self.prepare_min_ts().await; + let sched = self.scheduler.clone(); info!("min_ts prepared for flushing"; "min_ts" => %mts); - try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); + self.region_operator + .request(ObserveOp::ResolveRegions { + callback: Box::new(move |res| { + try_send!(sched, Task::ExecFlush(task, res)); + }), + min_ts: mts, + }) + .await }) } - fn on_flush_with_min_ts(&self, task: String, min_ts: TimeStamp) { - self.pool.spawn(self.do_flush(task, min_ts).map(|r| { + fn on_exec_flush(&mut self, task: String, resolved: ResolvedRegions) { + self.checkpoint_mgr.freeze(); + self.pool.spawn(self.do_flush(task, resolved).map(|r| { if let Err(err) = r { err.report("during updating flush status") } @@ -963,7 +967,7 @@ where } } Task::MarkFailover(t) => self.failover_time = Some(t), - Task::FlushWithMinTs(task, min_ts) => self.on_flush_with_min_ts(task, min_ts), + Task::ExecFlush(task, min_ts) => self.on_exec_flush(task, min_ts), Task::RegionCheckpointsOp(s) => self.handle_region_checkpoints_op(s), Task::UpdateGlobalCheckpoint(task) => self.on_update_global_checkpoint(task), } @@ -992,8 +996,8 @@ where self.checkpoint_mgr.resolve_regions(checkpoints); metrics::MIN_TS_RESOLVE_DURATION.observe(start_time.saturating_elapsed_secs()); } - RegionCheckpointOperation::Flush => { - self.checkpoint_mgr.flush(); + RegionCheckpointOperation::FlushWith(checkpoints) => { + self.checkpoint_mgr.flush_and_notify(checkpoints); } RegionCheckpointOperation::Get(g, cb) => { let _guard = self.pool.handle().enter(); 
@@ -1123,7 +1127,7 @@ pub enum RegionSet { } pub enum RegionCheckpointOperation { - Flush, + FlushWith(Vec), PrepareMinTsForResolve, Resolve { min_ts: TimeStamp, @@ -1140,7 +1144,7 @@ pub enum RegionCheckpointOperation { impl fmt::Debug for RegionCheckpointOperation { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Flush => f.debug_tuple("Flush").finish(), + Self::FlushWith(checkpoints) => f.debug_tuple("FlushWith").field(checkpoints).finish(), Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), Self::Subscribe(_) => f.debug_tuple("Subscription").finish(), @@ -1185,9 +1189,9 @@ pub enum Task { MarkFailover(Instant), /// Flush the task with name. Flush(String), - /// Execute the flush with the calculated `min_ts`. + /// Execute the flush with the calculated resolved result. /// This is an internal command only issued by the `Flush` task. - FlushWithMinTs(String, TimeStamp), + ExecFlush(String, ResolvedRegions), /// The command for getting region checkpoints. RegionCheckpointsOp(RegionCheckpointOperation), /// update global-checkpoint-ts to storage. @@ -1294,10 +1298,10 @@ impl fmt::Debug for Task { .debug_tuple("MarkFailover") .field(&format_args!("{:?} ago", t.saturating_elapsed())) .finish(), - Self::FlushWithMinTs(arg0, arg1) => f - .debug_tuple("FlushWithMinTs") + Self::ExecFlush(arg0, arg1) => f + .debug_tuple("ExecFlush") .field(arg0) - .field(arg1) + .field(&arg1.global_checkpoint()) .finish(), Self::RegionCheckpointsOp(s) => f.debug_tuple("GetRegionCheckpoints").field(s).finish(), Self::UpdateGlobalCheckpoint(task) => { @@ -1337,7 +1341,7 @@ impl Task { Task::FatalError(..) => "fatal_error", Task::Sync(..) => "sync", Task::MarkFailover(_) => "mark_failover", - Task::FlushWithMinTs(..) => "flush_with_min_ts", + Task::ExecFlush(..) => "flush_with_min_ts", Task::RegionCheckpointsOp(..) => "get_checkpoints", Task::UpdateGlobalCheckpoint(..) 
=> "update_global_checkpoint", } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index a959029f8a8..bc4b8335c8d 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -467,6 +467,7 @@ where } } ObserveOp::ResolveRegions { callback, min_ts } => { + fail::fail_point!("subscription_manager_resolve_regions"); let now = Instant::now(); let timedout = self.wait(Duration::from_secs(5)).await; if timedout { diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index ea09e9c7a1f..fff47d1f20a 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -25,12 +25,16 @@ mod all { GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; use futures::executor::block_on; - use tikv_util::{config::ReadableSize, defer}; + use tikv_util::{ + config::{ReadableDuration, ReadableSize}, + defer, + }; + use txn_types::Key; use super::{ make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, }; - use crate::make_table_key; + use crate::{make_table_key, Suite}; #[test] fn failed_register_task() { @@ -306,4 +310,87 @@ mod all { ); assert!(!failed.load(Ordering::SeqCst)); } + + #[test] + fn resolve_during_flushing() { + let mut suite = SuiteBuilder::new_named("resolve_during_flushing") + .cfg(|cfg| { + cfg.min_ts_interval = ReadableDuration::days(1); + cfg.initial_scan_concurrency = 1; + }) + .nodes(2) + .build(); + suite.must_register_task(1, "resolve_during_flushing"); + let key = make_record_key(1, 1); + + let start_ts = suite.tso(); + suite.must_kv_prewrite( + 1, + vec![mutation( + key.clone(), + Suite::PROMISED_SHORT_VALUE.to_owned(), + )], + key.clone(), + start_ts, + ); + fail::cfg("after_moving_to_flushing_files", "pause").unwrap(); + suite.force_flush_files("resolve_during_flushing"); 
+ let commit_ts = suite.tso(); + suite.just_commit_a_key(key.clone(), start_ts, commit_ts); + suite.run(|| Task::RegionCheckpointsOp(RegionCheckpointOperation::PrepareMinTsForResolve)); + // Wait until the resolve done. Sadly for now we don't have good solutions :( + std::thread::sleep(Duration::from_secs(2)); + fail::remove("after_moving_to_flushing_files"); + suite.wait_for_flush(); + assert_eq!(suite.global_checkpoint(), start_ts.into_inner()); + // transfer the leader, make sure everything has been flushed. + suite.must_shuffle_leader(1); + suite.wait_with(|cfg| cfg.initial_scan_semaphore.available_permits() > 0); + suite.force_flush_files("resolve_during_flushing"); + suite.wait_for_flush(); + let enc_key = Key::from_raw(&key).append_ts(commit_ts); + suite.check_for_write_records( + suite.flushed_files.path(), + std::iter::once(enc_key.as_encoded().as_slice()), + ); + } + + #[test] + fn commit_during_flushing() { + let mut suite = SuiteBuilder::new_named("commit_during_flushing") + .nodes(1) + .build(); + suite.must_register_task(1, "commit_during_flushing"); + let key = make_record_key(1, 1); + let start_ts = suite.tso(); + suite.must_kv_prewrite( + 1, + vec![mutation( + key.clone(), + Suite::PROMISED_SHORT_VALUE.to_owned(), + )], + key.clone(), + start_ts, + ); + fail::cfg("subscription_manager_resolve_regions", "pause").unwrap(); + let commit_ts = suite.tso(); + suite.force_flush_files("commit_during_flushing"); + suite.sync(); + suite.sync(); + fail::cfg("log_backup_batch_delay", "return(2000)").unwrap(); + suite.just_commit_a_key(key.clone(), start_ts, commit_ts); + fail::remove("subscription_manager_resolve_regions"); + suite.wait_for_flush(); + let enc_key = Key::from_raw(&key).append_ts(commit_ts); + assert!( + suite.global_checkpoint() > commit_ts.into_inner(), + "{} {:?}", + suite.global_checkpoint(), + commit_ts + ); + suite.check_for_write_records( + suite.flushed_files.path(), + std::iter::once(enc_key.as_encoded().as_slice()), + ) + } } diff 
--git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index 2886bb4f5d7..3034bc26e4c 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -263,6 +263,9 @@ pub struct Suite { } impl Suite { + pub const PROMISED_SHORT_VALUE: &'static [u8] = b"hello, world"; + pub const PROMISED_LONG_VALUE: &'static [u8] = &[0xbb; 4096]; + pub fn simple_task(&self, name: &str) -> StreamTask { let mut task = StreamTask::default(); task.info.set_name(name.to_owned()); @@ -347,7 +350,6 @@ impl Suite { let (_, port) = server.bind_addrs().next().unwrap(); let addr = format!("127.0.0.1:{}", port); let channel = ChannelBuilder::new(self.env.clone()).connect(&addr); - println!("connecting channel to {} for store {}", addr, id); let client = LogBackupClient::new(channel); self.servers.push(server); client @@ -471,9 +473,9 @@ impl Suite { let ts = ts as u64; let key = make_record_key(for_table, ts); let value = if ts % 4 == 0 { - b"hello, world".to_vec() + Self::PROMISED_SHORT_VALUE.to_vec() } else { - [0xdd; 4096].to_vec() + Self::PROMISED_LONG_VALUE.to_vec() }; let muts = vec![mutation(key.clone(), value)]; let enc_key = Key::from_raw(&key).into_encoded(); @@ -536,7 +538,6 @@ impl Suite { let mut res = LogFiles::default(); for entry in WalkDir::new(path.join("v1/backupmeta")) { let entry = entry?; - println!("reading {}", entry.path().display()); if entry.file_name().to_str().unwrap().ends_with(".meta") { let content = std::fs::read(entry.path())?; let meta = protobuf::parse_from_bytes::(&content)?; @@ -624,7 +625,7 @@ impl Suite { default_keys.insert(key.into_encoded()); } else { - assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); + assert_eq!(wf.short_value, Some(Self::PROMISED_SHORT_VALUE)); } } } @@ -648,7 +649,7 @@ impl Suite { } let value = iter.value(); - assert_eq!(value, &[0xdd; 4096]); + assert_eq!(value, Self::PROMISED_LONG_VALUE); } } } From 46020807fc7a87b1d54bb2c21ea4f15e8ded0a14 Mon 
Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 23 May 2024 18:22:17 +0800 Subject: [PATCH 172/220] sst_importer: remove the SST file where no kvs written into (#16146) (#16934) close tikv/tikv#16005 This PR has remove the empty SST file in the db/import. * remove the sst file where no kvs written into Signed-off-by: Leavrth Signed-off-by: Jianjun Liao Co-authored-by: Leavrth Co-authored-by: Jianjun Liao Co-authored-by: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> --- components/sst_importer/src/sst_importer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 54a41cea15b..23ff6f26d91 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -1380,6 +1380,9 @@ impl SstImporter { Ok(Some(final_range)) } else { // nothing is written: prevents finishing the SST at all. + // also delete the empty sst file that is created when creating sst_writer + drop(sst_writer); + let _ = file_system::remove_file(&path.save); Ok(None) } } @@ -2775,6 +2778,9 @@ mod tests { db, ); + let path = importer.dir.join_for_write(&meta).unwrap(); + assert!(!file_system::file_exists(path.save)); + match result { Ok(None) => {} _ => panic!("unexpected download result: {:?}", result), From 212c51dc443505f2cc9b2de9bcbae66b31c2c0ac Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 24 May 2024 11:46:17 +0800 Subject: [PATCH 173/220] set the unified read pool size when bootstrapped (#16633) (#16953) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#16629 Register the size of unified read pool at starting TiKV. 
Signed-off-by: ti-chi-bot Signed-off-by: Yu Juncen Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Yu Juncen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- src/read_pool.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/read_pool.rs b/src/read_pool.rs index da251c78aa0..18e9a11cbd5 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -496,8 +496,12 @@ pub fn build_yatp_read_pool_with_name( pool, running_tasks: UNIFIED_READ_POOL_RUNNING_TASKS .with_label_values(&[&unified_read_pool_name]), - running_threads: UNIFIED_READ_POOL_RUNNING_THREADS - .with_label_values(&[&unified_read_pool_name]), + running_threads: { + let running_threads = + UNIFIED_READ_POOL_RUNNING_THREADS.with_label_values(&[&unified_read_pool_name]); + running_threads.set(config.max_thread_count as _); + running_threads + }, max_tasks: config .max_tasks_per_worker .saturating_mul(config.max_thread_count), From 6d682903156cb3d8c4449127fec7ecbdc5f64e6d Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 24 May 2024 12:02:47 +0800 Subject: [PATCH 174/220] backup: handle the error when peer not found (#16581) (#16724) close tikv/tikv#16394 This PR fixes the panic when peer not found in the local region. When peer not found in the local region, it would skip backing up the region, and retry to backup it in finegrained step. 
Signed-off-by: Leavrth Co-authored-by: Leavrth Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup/src/endpoint.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 8ffd229e813..71a5c9e215c 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -810,7 +810,13 @@ impl Progress { break; } } - let peer = find_peer(region, store_id).unwrap().to_owned(); + let peer = if let Some(peer) = find_peer(region, store_id) { + peer.to_owned() + } else { + // skip the region at this time, and would retry to backup the region in + // finegrained step. + continue; + }; // Raft peer role has to match the replica read flag. if replica_read || info.role == StateRole::Leader { let ekey = get_min_end_key(end_key.as_ref(), region); From 97b4b5dda19fbc331e55413c147b8a31ec511ee8 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 24 May 2024 12:23:47 +0800 Subject: [PATCH 175/220] log_backup: fix panic when encountered error during resuming (#17021) (#17025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#17020 Spawn thread from the thread pool directly. (Instead of the thread local runtime handle.) 
Signed-off-by: ti-chi-bot Signed-off-by: Yu Juncen Signed-off-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Yu Juncen --- components/backup-stream/src/endpoint.rs | 2 +- components/backup-stream/src/metadata/client.rs | 14 +++++++++++++- components/backup-stream/tests/failpoints/mod.rs | 14 ++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index ae47741ffa2..b8e0ec3139e 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -708,7 +708,7 @@ where Err(err) => { err.report(format!("failed to resume backup stream task {}", task_name)); let sched = self.scheduler.clone(); - tokio::task::spawn(async move { + self.pool.spawn(async move { tokio::time::sleep(Duration::from_secs(5)).await; sched .schedule(Task::WatchTask(TaskOp::ResumeTask(task_name))) diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 2232770915f..59f685b9f23 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -331,6 +331,13 @@ impl MetadataClient { .await } + /// resume a task. + pub async fn resume(&self, name: &str) -> Result<()> { + self.meta_store + .delete(Keys::Key(MetaKey::pause_of(name))) + .await + } + pub async fn get_tasks_pause_status(&self) -> Result, bool>> { let kvs = self .meta_store @@ -354,6 +361,11 @@ impl MetadataClient { defer! 
{ super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_get"]).observe(now.saturating_elapsed().as_secs_f64()) } + fail::fail_point!("failed_to_get_task", |_| { + Err(Error::MalformedMetadata( + "failed to connect etcd client".to_string(), + )) + }); let items = self .meta_store .get_latest(Keys::Key(MetaKey::task_of(name))) @@ -376,7 +388,7 @@ impl MetadataClient { } fail::fail_point!("failed_to_get_tasks", |_| { Err(Error::MalformedMetadata( - "faild to connect etcd client".to_string(), + "failed to connect etcd client".to_string(), )) }); let kvs = self diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index fff47d1f20a..53f5d5b0ddc 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -311,6 +311,20 @@ mod all { assert!(!failed.load(Ordering::SeqCst)); } + #[test] + fn failed_to_get_task_when_pausing() { + let suite = SuiteBuilder::new_named("resume_error").nodes(1).build(); + suite.must_register_task(1, "resume_error"); + let mcli = suite.get_meta_cli(); + run_async_test(mcli.pause("resume_error")).unwrap(); + suite.sync(); + fail::cfg("failed_to_get_task", "1*return").unwrap(); + run_async_test(mcli.resume("resume_error")).unwrap(); + suite.sync(); + // Make sure our suite doesn't panic. 
+ suite.sync(); + } + #[test] fn resolve_during_flushing() { let mut suite = SuiteBuilder::new_named("resolve_during_flushing") From 00f71e02cd4f460d2154eba1cbef9cbcf0fd0f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 27 May 2024 17:02:50 +0800 Subject: [PATCH 176/220] log_backup: make the retry rule unify with master (#16608) (#17067) ref tikv/tikv#16554 Signed-off-by: Yu Juncen --- components/backup-stream/src/subscription_manager.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index bc4b8335c8d..5690483c0de 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -38,8 +38,6 @@ use crate::{ type ScanPool = tokio::runtime::Runtime; -const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; - // The retry parameters for failed to get last checkpoint ts. // When PD is temporarily disconnected, we may need this retry. // The total duration of retrying is about 345s ( 20 * 16 + 15 ), @@ -196,11 +194,14 @@ impl ScanCmd { /// execute the command, when meeting error, retrying. 
async fn exec_by_with_retry(self, init: impl InitialScan) { - let mut retry_time = INITIAL_SCAN_FAILURE_MAX_RETRY_TIME; + let mut retry_time = TRY_START_OBSERVE_MAX_RETRY_TIME; loop { match self.exec_by(init.clone()).await { Err(err) if should_retry(&err) && retry_time > 0 => { - tokio::time::sleep(Duration::from_millis(500)).await; + tokio::time::sleep(backoff_for_start_observe( + TRY_START_OBSERVE_MAX_RETRY_TIME - retry_time, + )) + .await; warn!("meet retryable error"; "err" => %err, "retry_time" => retry_time); retry_time -= 1; continue; From fb9f2c8d69ed2d1cfc7e764b9787eb577a3bb93f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 28 May 2024 13:41:49 +0800 Subject: [PATCH 177/220] raftstore: Avoid snapshot IO in raftstore thread (#16682) (#16952) close tikv/tikv#16564 Avoid snapshot IO in raftstore thread Signed-off-by: Connor1996 Co-authored-by: Connor1996 Co-authored-by: glorv --- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/snap.rs | 74 ++++++---------------- 2 files changed, 22 insertions(+), 54 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d4a130ba01d..40fd58290e1 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3198,7 +3198,7 @@ where // No need to get snapshot for witness, as witness's empty snapshot bypass // snapshot manager. 
let key = SnapKey::from_region_snap(region_id, snap); - self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + self.ctx.snap_mgr.meta_file_exist(&key)?; Some(key) } else { None diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 690c3af1c76..6ec7111e09a 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -718,21 +718,11 @@ impl Snapshot { )); } if meta.get_size() != 0 { - let file_path = self.cf_files[cf_idx].add_file_with_size_checksum( + let _ = self.cf_files[cf_idx].add_file_with_size_checksum( file_idx, meta.get_size(), meta.get_checksum(), ); - if file_exists(&file_path) { - let mgr = self.mgr.encryption_key_manager.as_ref(); - let file_path = Path::new(&file_path); - let (_, size) = calc_checksum_and_size(file_path, mgr)?; - check_file_size( - size, - *(self.cf_files[cf_idx].size.last().unwrap()), - file_path, - )?; - } } file_idx += 1; if file_idx >= cf_file_count_from_meta[cf_idx] { @@ -1710,6 +1700,20 @@ impl SnapManager { Ok(Box::new(s)) } + pub fn meta_file_exist(&self, key: &SnapKey) -> RaftStoreResult<()> { + let _lock = self.core.registry.rl(); + let base = &self.core.base; + // Use CheckPolicy::None to avoid reading meta file + let s = Snapshot::new(base, key, false, CheckPolicy::None, &self.core)?; + if !file_exists(s.meta_file.path.as_path()) { + return Err(RaftStoreError::Other(From::from(format!( + "snapshot of {:?} not exists.", + key + )))); + } + Ok(()) + } + /// Get the approximate size of snap file exists in snap directory. /// /// Return value is not guaranteed to be accurate. @@ -2772,26 +2776,6 @@ pub mod tests { assert!(s2.exists()); } - // Make all the snapshot in the specified dir corrupted to have incorrect size. 
- fn corrupt_snapshot_size_in>(dir: T) { - let dir_path = dir.into(); - let read_dir = file_system::read_dir(dir_path).unwrap(); - for p in read_dir { - if p.is_ok() { - let e = p.as_ref().unwrap(); - if !e - .file_name() - .into_string() - .unwrap() - .ends_with(META_FILE_SUFFIX) - { - let mut f = OpenOptions::new().append(true).open(e.path()).unwrap(); - f.write_all(b"xxxxx").unwrap(); - } - } - } - } - // Make all the snapshot in the specified dir corrupted to have incorrect // checksum. fn corrupt_snapshot_checksum_in>(dir: T) -> Vec { @@ -2892,7 +2876,7 @@ pub mod tests { } #[test] - fn test_snap_corruption_on_size_or_checksum() { + fn test_snap_corruption_on_checksum() { let region_id = 1; let region = gen_test_region(region_id, 1, 1); let db_dir = Builder::new() @@ -2911,22 +2895,11 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let _ = s1 + let snap_data = s1 .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) .unwrap(); assert!(s1.exists()); - corrupt_snapshot_size_in(dir.path()); - - Snapshot::new_for_sending(dir.path(), &key, &mgr_core).unwrap_err(); - - let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); - assert!(!s2.exists()); - let snap_data = s2 - .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) - .unwrap(); - assert!(s2.exists()); - let dst_dir = Builder::new() .prefix("test-snap-corruption-dst") .tempdir() @@ -2939,12 +2912,11 @@ pub mod tests { snap_data.get_meta().clone(), ); - let mut metas = corrupt_snapshot_checksum_in(dst_dir.path()); + let metas = corrupt_snapshot_checksum_in(dst_dir.path()); assert_eq!(1, metas.len()); - let snap_meta = metas.pop().unwrap(); - let mut s5 = Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap(); - assert!(s5.exists()); + let mut s2 = Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap(); + assert!(s2.exists()); let dst_db_dir = Builder::new() 
.prefix("test-snap-corruption-dst-db") @@ -2959,11 +2931,7 @@ pub mod tests { coprocessor_host: CoprocessorHost::::default(), ingest_copy_symlink: false, }; - s5.apply(options).unwrap_err(); - - corrupt_snapshot_size_in(dst_dir.path()); - Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_meta).unwrap_err(); - Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap_err(); + s2.apply(options).unwrap_err(); } #[test] From 8fbb6d9574ca39cd876ba912dcd7621188de586f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 28 May 2024 13:56:50 +0800 Subject: [PATCH 178/220] copr: don't resize binary opaque when the type flen is unspecified (#16617) (#16713) close tikv/tikv#16616 Signed-off-by: Yang Keao Co-authored-by: Yang Keao Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/tidb_query_expr/src/impl_cast.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 53750d02d2d..6ce43695213 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -1385,8 +1385,9 @@ fn cast_string_as_json( let mut vec; if typ.tp() == FieldTypeTp::String { vec = (*val).to_owned(); - // the `flen` of string is always greater than zero - vec.resize(typ.flen().try_into().unwrap(), 0); + if typ.flen() > 0 { + vec.resize(typ.flen().try_into().unwrap(), 0); + } buf = &vec; } @@ -7021,6 +7022,17 @@ mod tests { Json::from_opaque(FieldTypeTp::String, &[97]).unwrap(), true, ), + ( + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarChar) + .flen(UNSPECIFIED_LENGTH) + .charset(CHARSET_BIN) + .collation(Collation::Binary) + .build(), + "a".to_string(), + Json::from_opaque(FieldTypeTp::String, &[97]).unwrap(), + true, + ), ]; for (arg_type, input, expect, parse_to_json) in cs { let arg_value = ScalarValue::Bytes(Some(input.clone().into_bytes())); From 
1a2c88dcc22c62e1f138d440956214ce367e838b Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 28 May 2024 17:55:21 +0800 Subject: [PATCH 179/220] raftstore: Add slow log for peer and store msg (#16605) (#17035) ref tikv/tikv#16600 Add slow log for peer and store msg Signed-off-by: ti-chi-bot Signed-off-by: Connor1996 Co-authored-by: Connor Co-authored-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 15 ++++++ components/raftstore/Cargo.toml | 2 + components/raftstore/src/store/fsm/peer.rs | 19 +++++-- components/raftstore/src/store/fsm/store.rs | 21 ++++++-- components/raftstore/src/store/msg.rs | 56 ++++++++++++++++++--- 5 files changed, 99 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 902b492c715..833bc7e302d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4465,6 +4465,8 @@ dependencies = [ "slog-global", "smallvec", "sst_importer", + "strum 0.20.0", + "strum_macros 0.24.3", "tempfile", "test_sst_importer", "thiserror", @@ -5808,6 +5810,19 @@ dependencies = [ "syn 1.0.103", ] +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 1.0.103", +] + [[package]] name = "strum_macros" version = "0.25.0" diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 27380a52882..d086b040f66 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -82,6 +82,8 @@ slog = { workspace = true } slog-global = { workspace = true } smallvec = "1.4" sst_importer = { workspace = true } +strum = { version = "0.20", features = ["derive"] } +strum_macros = "0.24" tempfile = "3.0" thiserror = "1.0" tidb_query_datatype = { workspace = true } diff --git a/components/raftstore/src/store/fsm/peer.rs 
b/components/raftstore/src/store/fsm/peer.rs index 40fd58290e1..e0f9381ca61 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -22,6 +22,7 @@ use engine_traits::{Engines, KvEngine, RaftEngine, SstMetaInfo, WriteBatchExt, C use error_code::ErrorCodeExt; use fail::fail_point; use futures::channel::mpsc::UnboundedSender; +use itertools::Itertools; use keys::{self, enc_end_key, enc_start_key}; use kvproto::{ brpb::CheckAdminResponse, @@ -49,13 +50,15 @@ use raft::{ GetEntriesContext, Progress, ReadState, SnapshotStatus, StateRole, INVALID_INDEX, NO_LIMIT, }; use smallvec::SmallVec; +use strum::{EnumCount, VariantNames}; use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_err, debug, defer, error, escape, info, info_or_debug, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, + slow_log, store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, - time::{monotonic_raw_now, Instant as TiInstant}, + time::{monotonic_raw_now, Instant as TiInstant, SlowTimer}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -617,9 +620,12 @@ where } pub fn handle_msgs(&mut self, msgs: &mut Vec>) { - let timer = TiInstant::now_coarse(); + let timer = SlowTimer::from_millis(100); let count = msgs.len(); + #[allow(const_evaluatable_unchecked)] + let mut distribution = [0; PeerMsg::::COUNT]; for m in msgs.drain(..) 
{ + distribution[m.discriminant()] += 1; match m { PeerMsg::RaftMessage(msg, sent_time) => { if let Some(sent_time) = sent_time { @@ -705,12 +711,19 @@ where } } self.on_loop_finished(); + slow_log!( + T timer, + "{} handle {} peer messages {:?}", + self.fsm.peer.tag, + count, + PeerMsg::::VARIANTS.iter().zip(distribution).filter(|(_, c)| *c > 0).format(", "), + ); self.ctx.raft_metrics.peer_msg_len.observe(count as f64); self.ctx .raft_metrics .event_time .peer_msg - .observe(timer.saturating_elapsed_secs()); + .observe(timer.saturating_elapsed().as_secs_f64()); } #[inline] diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index ae9fd9caa18..010cbbefb23 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -34,6 +34,7 @@ use fail::fail_point; use file_system::{IoType, WithIoType}; use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; +use itertools::Itertools; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use kvproto::{ metapb::{self, Region, RegionEpoch}, @@ -49,6 +50,7 @@ use resource_control::{channel::unbounded, ResourceGroupManager}; use resource_metering::CollectorRegHandle; use service::service_manager::GrpcServiceManager; use sst_importer::SstImporter; +use strum::{EnumCount, VariantNames}; use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_try, @@ -61,7 +63,7 @@ use tikv_util::{ store::{find_peer, region_on_stores}, sys as sys_util, sys::disk::{get_disk_status, DiskUsage}, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, + time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, SlowTimer}, timer::SteadyTimer, warn, worker::{LazyWorker, Scheduler, Worker}, @@ -783,15 +785,19 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .observe(duration_to_sec(elapsed)); slow_log!( elapsed, - "[store {}] handle timeout {:?}", + "[store {}] handle 
tick {:?}", self.fsm.store.id, tick ); } fn handle_msgs(&mut self, msgs: &mut Vec>) { - let timer = TiInstant::now_coarse(); + let timer = SlowTimer::from_millis(100); + let count = msgs.len(); + #[allow(const_evaluatable_unchecked)] + let mut distribution = [0; StoreMsg::::COUNT]; for m in msgs.drain(..) { + distribution[m.discriminant()] += 1; match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { @@ -844,11 +850,18 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } } + slow_log!( + T timer, + "[store {}] handle {} store messages {:?}", + self.fsm.store.id, + count, + StoreMsg::::VARIANTS.iter().zip(distribution).filter(|(_, c)| *c > 0).format(", "), + ); self.ctx .raft_metrics .event_time .store_msg - .observe(timer.saturating_elapsed_secs()); + .observe(timer.saturating_elapsed().as_secs_f64()); } fn start(&mut self, store: metapb::Store) { diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 2f05a068ddb..35a17903919 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -23,6 +23,7 @@ use pd_client::BucketMeta; use raft::SnapshotStatus; use resource_control::ResourceMetered; use smallvec::{smallvec, SmallVec}; +use strum::{EnumCount, EnumVariantNames}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, TrackerToken}; @@ -748,11 +749,13 @@ pub struct InspectedRaftMessage { /// Message that can be sent to a peer. #[allow(clippy::large_enum_variant)] +#[derive(EnumCount, EnumVariantNames)] +#[repr(u8)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. 
- RaftMessage(InspectedRaftMessage, Option), + RaftMessage(InspectedRaftMessage, Option) = 0, /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. @@ -818,6 +821,23 @@ impl fmt::Debug for PeerMsg { } impl PeerMsg { + pub fn discriminant(&self) -> usize { + match self { + PeerMsg::RaftMessage(..) => 0, + PeerMsg::RaftCommand(_) => 1, + PeerMsg::Tick(_) => 2, + PeerMsg::SignificantMsg(_) => 3, + PeerMsg::ApplyRes { .. } => 4, + PeerMsg::Start => 5, + PeerMsg::Noop => 6, + PeerMsg::Persisted { .. } => 7, + PeerMsg::CasualMessage(_) => 8, + PeerMsg::HeartbeatPd => 9, + PeerMsg::UpdateReplicationMode => 10, + PeerMsg::Destroy(_) => 11, + } + } + /// For some specific kind of messages, it's actually acceptable if failed /// to send it by `significant_send`. This function determine if the /// current message is acceptable to fail. @@ -829,6 +849,7 @@ impl PeerMsg { } } +#[derive(EnumCount, EnumVariantNames)] pub enum StoreMsg where EK: KvEngine, @@ -861,10 +882,6 @@ where inspector: LatencyInspector, }, - /// Message only used for test. - #[cfg(any(test, feature = "testexport"))] - Validate(Box), - UnsafeRecoveryReport(pdpb::StoreReport), UnsafeRecoveryCreatePeer { syncer: UnsafeRecoveryExecutePlanSyncer, @@ -876,6 +893,10 @@ where AwakenRegions { abnormal_stores: Vec, }, + + /// Message only used for test. + #[cfg(any(test, feature = "testexport"))] + Validate(Box), } impl ResourceMetered for StoreMsg {} @@ -901,8 +922,6 @@ where ), StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), StoreMsg::Start { ref store } => write!(fmt, "Start store {:?}", store), - #[cfg(any(test, feature = "testexport"))] - StoreMsg::Validate(_) => write!(fmt, "Validate config"), StoreMsg::UpdateReplicationMode(_) => write!(fmt, "UpdateReplicationMode"), StoreMsg::LatencyInspect { .. 
} => write!(fmt, "LatencyInspect"), StoreMsg::UnsafeRecoveryReport(..) => write!(fmt, "UnsafeRecoveryReport"), @@ -911,6 +930,29 @@ where } StoreMsg::GcSnapshotFinish => write!(fmt, "GcSnapshotFinish"), StoreMsg::AwakenRegions { .. } => write!(fmt, "AwakenRegions"), + #[cfg(any(test, feature = "testexport"))] + StoreMsg::Validate(_) => write!(fmt, "Validate config"), + } + } +} + +impl StoreMsg { + pub fn discriminant(&self) -> usize { + match self { + StoreMsg::RaftMessage(_) => 0, + StoreMsg::StoreUnreachable { .. } => 1, + StoreMsg::CompactedEvent(_) => 2, + StoreMsg::ClearRegionSizeInRange { .. } => 3, + StoreMsg::Tick(_) => 4, + StoreMsg::Start { .. } => 5, + StoreMsg::UpdateReplicationMode(_) => 6, + StoreMsg::LatencyInspect { .. } => 7, + StoreMsg::UnsafeRecoveryReport(_) => 8, + StoreMsg::UnsafeRecoveryCreatePeer { .. } => 9, + StoreMsg::GcSnapshotFinish => 10, + StoreMsg::AwakenRegions { .. } => 11, + #[cfg(any(test, feature = "testexport"))] + StoreMsg::Validate(_) => 12, } } } From 5d8600afc99588af4eade70140767cee32680eea Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 5 Jun 2024 10:59:56 +0800 Subject: [PATCH 180/220] cdc: handles region registers correctly after connection deregistered (#17076) (#17096) close tikv/tikv#16444 cdc: handles region registers correctly after connection deregistered Signed-off-by: qupeng Co-authored-by: qupeng --- components/cdc/src/endpoint.rs | 55 ++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index e583c97645e..e70358ccf2a 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -691,8 +691,20 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint conn, + None => { + info!("cdc register region on an deregistered connection, ignore"; + "region_id" => region_id, + "conn_id" => ?conn_id, + "req_id" => request_id, + "downstream_id" => ?downstream_id); + return; + } + }; 
downstream.set_sink(conn.get_sink().clone()); // Check if the cluster id matches if supported. @@ -2991,4 +3003,43 @@ mod tests { assert!(check); } } + + #[test] + fn test_register_after_connection_deregistered() { + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(Duration::from_secs(60)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); + suite.add_region(1, 100); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, _rx) = channel::channel(1, quota); + + let conn = Conn::new(tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + + suite.run(Task::Deregister(Deregister::Conn(conn_id))); + + let mut req = ChangeDataRequest::default(); + + req.set_region_id(1); + req.set_request_id(1); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch, + 1, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req, + downstream, + conn_id, + }); + assert!(suite.connections.is_empty()); + } } From 769817eedf2279fcbef420d3ce4a24635376556e Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Sat, 8 Jun 2024 21:56:57 +0800 Subject: [PATCH 181/220] chore: add prow OWNERS files for critial configuration files (#17071) (#17109) close tikv/tikv#17004 Signed-off-by: wuhuizuo Co-authored-by: wuhuizuo --- OWNERS_ALIASES | 13 +++++++++++++ components/batch-system/src/OWNERS | 7 +++++++ components/cdc/src/OWNERS | 7 +++++++ components/encryption/src/OWNERS | 7 +++++++ components/pd_client/src/OWNERS | 7 +++++++ components/raftstore/src/coprocessor/OWNERS | 7 +++++++ components/raftstore/src/store/worker/OWNERS | 7 +++++++ components/sst_importer/src/OWNERS | 7 +++++++ etc/OWNERS | 7 +++++++ src/config/OWNERS | 7 +++++++ src/coprocessor_v2/OWNERS | 7 +++++++ src/server/OWNERS | 7 +++++++ src/server/gc_worker/OWNERS | 7 +++++++ src/server/lock_manager/OWNERS | 7 
+++++++ src/storage/OWNERS | 7 +++++++ 15 files changed, 111 insertions(+) create mode 100644 OWNERS_ALIASES create mode 100644 components/batch-system/src/OWNERS create mode 100644 components/cdc/src/OWNERS create mode 100644 components/encryption/src/OWNERS create mode 100644 components/pd_client/src/OWNERS create mode 100644 components/raftstore/src/coprocessor/OWNERS create mode 100644 components/raftstore/src/store/worker/OWNERS create mode 100644 components/sst_importer/src/OWNERS create mode 100644 etc/OWNERS create mode 100644 src/config/OWNERS create mode 100644 src/coprocessor_v2/OWNERS create mode 100644 src/server/OWNERS create mode 100644 src/server/gc_worker/OWNERS create mode 100644 src/server/lock_manager/OWNERS create mode 100644 src/storage/OWNERS diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES new file mode 100644 index 00000000000..ef0d19adae4 --- /dev/null +++ b/OWNERS_ALIASES @@ -0,0 +1,13 @@ +# Sort the member alphabetically. +aliases: + sig-critical-approvers-config-components: + - easonn7 + - kevin-xianliu + - zhangjinpeng87 + sig-critical-approvers-config-src: + - easonn7 + - kevin-xianliu + - cfzjywxk + sig-critical-approvers-config-cdc: + - BenMeadowcroft + - kevin-xianliu diff --git a/components/batch-system/src/OWNERS b/components/batch-system/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/batch-system/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/cdc/src/OWNERS b/components/cdc/src/OWNERS new file mode 100644 index 00000000000..774ac39e9f3 --- /dev/null +++ b/components/cdc/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-cdc diff --git 
a/components/encryption/src/OWNERS b/components/encryption/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/encryption/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/pd_client/src/OWNERS b/components/pd_client/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/pd_client/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/raftstore/src/coprocessor/OWNERS b/components/raftstore/src/coprocessor/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/raftstore/src/coprocessor/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/raftstore/src/store/worker/OWNERS b/components/raftstore/src/store/worker/OWNERS new file mode 100644 index 00000000000..eef90ab7017 --- /dev/null +++ b/components/raftstore/src/store/worker/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|split_config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git a/components/sst_importer/src/OWNERS b/components/sst_importer/src/OWNERS new file mode 100644 index 00000000000..70df3de925d --- /dev/null +++ b/components/sst_importer/src/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-components diff --git 
a/etc/OWNERS b/etc/OWNERS new file mode 100644 index 00000000000..e8de98f61cb --- /dev/null +++ b/etc/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config-template\\.toml)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/config/OWNERS b/src/config/OWNERS new file mode 100644 index 00000000000..8bddc663cb4 --- /dev/null +++ b/src/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|mod\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/coprocessor_v2/OWNERS b/src/coprocessor_v2/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/coprocessor_v2/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/server/OWNERS b/src/server/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/server/gc_worker/OWNERS b/src/server/gc_worker/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/server/gc_worker/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/server/lock_manager/OWNERS b/src/server/lock_manager/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/server/lock_manager/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + 
"(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src diff --git a/src/storage/OWNERS b/src/storage/OWNERS new file mode 100644 index 00000000000..0ce3dbff3a6 --- /dev/null +++ b/src/storage/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.rs)$": + approvers: + - sig-critical-approvers-config-src From 568b414e99bebf118eedd9b50f24f299efbcab79 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 11 Jun 2024 18:25:30 +0800 Subject: [PATCH 182/220] OWNERS: Auto Sync OWNERS files from community membership (#16973) (#17119) Signed-off-by: Ti Chi Robot Signed-off-by: Jinpeng Zhang Co-authored-by: Jinpeng Zhang Co-authored-by: wuhuizuo --- OWNERS | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 OWNERS diff --git a/OWNERS b/OWNERS new file mode 100644 index 00000000000..b0e73247005 --- /dev/null +++ b/OWNERS @@ -0,0 +1,76 @@ +# See the OWNERS docs at https://go.k8s.io/owners +approvers: + - 5kbpers + - AndreMouche + - andylokandy + - breezewish + - brson + - bufferflies + - BusyJay + - cfzjywxk + - Connor1996 + - coocood + - disksing + - ekexium + - gengliqi + - glorv + - hicqu + - hunterlxt + - imtbkcat + - innerr + - iosmanthus + - jackysp + - kennytm + - Little-Wallace + - liuzix + - lonng + - LykxSassinator + - lysu + - marsishandsome + - MyonKeminta + - niedhui + - NingLin-P + - nrc + - overvenus + - pingyu + - skyzh + - SpadeA-Tang + - sticnarf + - sunxiaoguang + - tabokie + - TennyZhuang + - tonyxuqqi + - yiwu-arbug + - you06 + - youjiali1995 + - YuJuncen + - zhangjinpeng87 + - zhongzc + - zhouqiang-cl +reviewers: + - 3pointer + - CalvinNeo + - ethercflow + - fredchenbj + - Fullstop000 + - gozssky + - haojinming + - hi-rustin + - HuSharp + - jayzhan211 + - Jibbow + - JmPotato + - Leavrth + - lhy1024 + - longfangsong + - lzmhhh123 + - Mossaka + - MrCroxx + - nolouch + - rleungx + - tier-cap 
+ - v01dstar + - wjhuang2016 + - wshwsh12 + - Xuanwo + - zyguan From 6a8601d0c9d3e60bc6d3306edbbde0b7404889d7 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 14 Jun 2024 02:16:55 +0800 Subject: [PATCH 183/220] build: bump tikv pkg version (#17136) Signed-off-by: ti-chi-bot --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 833bc7e302d..c53479e0017 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6543,7 +6543,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.5.2" +version = "7.5.3" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 606f6f868e6..7fd93076d6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.5.2" +version = "7.5.3" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 9fb4aaac9bfe07db066e99397b56384d7a92b3e0 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 26 Jun 2024 18:04:23 +0800 Subject: [PATCH 184/220] Update OWNERS_ALIASES (#17193) (#17205) ref tikv/tikv#17004 add `zhangjinpeng87` into alias: `sig-critical-approvers-config-src` to approval tikv storage parts. Signed-off-by: wuhuizuo Co-authored-by: wuhuizuo --- OWNERS_ALIASES | 1 + 1 file changed, 1 insertion(+) diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES index ef0d19adae4..cd4a74373a9 100644 --- a/OWNERS_ALIASES +++ b/OWNERS_ALIASES @@ -8,6 +8,7 @@ aliases: - easonn7 - kevin-xianliu - cfzjywxk + - zhangjinpeng87 sig-critical-approvers-config-cdc: - BenMeadowcroft - kevin-xianliu From bd8245e4d5abac92fb55ca253955a77aa55e9812 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 11 Jul 2024 18:33:06 +0800 Subject: [PATCH 185/220] grpc: set compression args for TiKV service (#17180) (#17263) close tikv/tikv#17176 Set compression arguments for TiKV service. The compression arguments are loaded from TiKV config. 
It will affect TiKV's response to TiDB. Signed-off-by: ekexium Co-authored-by: ekexium --- src/server/server.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/server/server.rs b/src/server/server.rs index 09782be4e16..3d6e5659705 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -101,6 +101,8 @@ where .http2_max_ping_strikes(i32::MAX) // For pings without data from clients. .keepalive_time(self.cfg.value().grpc_keepalive_time.into()) .keepalive_timeout(self.cfg.value().grpc_keepalive_timeout.into()) + .default_compression_algorithm(self.cfg.value().grpc_compression_algorithm()) + .default_gzip_compression_level(self.cfg.value().grpc_gzip_compression_level) .build_args(); let sb = ServerBuilder::new(Arc::clone(&env)) From cb972bf0020c506b73c4a3fc6b90d6dd2f153508 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 12 Jul 2024 14:49:34 +0800 Subject: [PATCH 186/220] raftstore: gc abnormal snapshots and destroy peer if failed to apply snapshots. (#16992) (#17267) close tikv/tikv#15292 Replace `SnapshotApplied` with `SnapshotApplied { peer_id: u64, tombstone: bool}`. And if `tombstone` == true, the relative peer will be automatically GCed. 
Signed-off-by: ti-chi-bot Signed-off-by: lucasliang Co-authored-by: lucasliang --- components/engine_rocks/src/import.rs | 18 +------- components/raftstore/src/store/fsm/peer.rs | 14 +++++- components/raftstore/src/store/fsm/store.rs | 12 +++++ components/raftstore/src/store/msg.rs | 12 ++++- components/raftstore/src/store/peer.rs | 8 +++- .../raftstore/src/store/peer_storage.rs | 13 +++--- .../raftstore/src/store/worker/region.rs | 27 +++++++++--- tests/failpoints/cases/test_pending_peers.rs | 44 ++++++++++++++++++- 8 files changed, 115 insertions(+), 33 deletions(-) diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index 1aa65ec07fa..e0f5461d2a3 100644 --- a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -1,11 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::fs::File; - use engine_traits::{ImportExt, IngestExternalFileOptions, Result}; -use rocksdb::{ - set_external_sst_file_global_seq_no, IngestExternalFileOptions as RawIngestExternalFileOptions, -}; +use rocksdb::IngestExternalFileOptions as RawIngestExternalFileOptions; use crate::{engine::RocksEngine, r2e, util}; @@ -17,17 +13,7 @@ impl ImportExt for RocksEngine { let mut opts = RocksIngestExternalFileOptions::new(); opts.move_files(true); opts.set_write_global_seqno(false); - files.iter().try_for_each(|file| -> Result<()> { - let f = File::open(file)?; - // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For - // backward compatibility, in case TiKV is retrying an ingestion job - // generated by older version, it needs to reset the global seqno to - // 0. - set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0).map_err(r2e)?; - f.sync_all() - .map_err(|e| format!("sync {}: {:?}", file, e)) - .map_err(r2e) - })?; + // Note: no need reset the global seqno to 0 for compatibility as #16992 // This is calling a specially optimized version of // ingest_external_file_cf. 
In cases where the memtable needs to be // flushed it avoids blocking writers while doing the flush. The unused diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e0f9381ca61..d3bc49a6169 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1243,8 +1243,18 @@ where let raft_msg = self.fsm.peer.build_raft_messages(self.ctx, vec![msg]); self.fsm.peer.send_raft_messages(self.ctx, raft_msg); } - CasualMessage::SnapshotApplied => { + CasualMessage::SnapshotApplied { peer_id, tombstone } => { self.fsm.has_ready = true; + // If failed on applying snapshot, it should record the peer as an invalid peer. + if tombstone && self.fsm.peer.peer_id() == peer_id && !self.fsm.peer.is_leader() { + info!( + "mark the region damaged on applying snapshot"; + "region_id" => self.region_id(), + "peer_id" => peer_id, + ); + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.damaged_regions.insert(self.region_id()); + } if self.fsm.peer.should_destroy_after_apply_snapshot() { self.maybe_destroy(); } @@ -3793,6 +3803,8 @@ where ); })(); let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.damaged_regions.remove(&self.fsm.region_id()); + meta.damaged_regions.shrink_to_fit(); let is_latest_initialized = { if let Some(latest_region_info) = meta.regions.get(®ion_id) { util::is_region_initialized(latest_region_info) diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 010cbbefb23..e15d7608ff2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -171,6 +171,10 @@ pub struct StoreMeta { pub region_read_progress: RegionReadProgressRegistry, /// record sst_file_name -> (sst_smallest_key, sst_largest_key) pub damaged_ranges: HashMap, Vec)>, + /// Record regions are damaged on some corner cases, the relative peer must + /// be safely removed from the store, such as 
applying snapshot or + /// compacting raft logs. + pub damaged_regions: HashSet, } impl StoreRegionMeta for StoreMeta { @@ -221,6 +225,7 @@ impl StoreMeta { destroyed_region_for_snap: HashMap::default(), region_read_progress: RegionReadProgressRegistry::new(), damaged_ranges: HashMap::default(), + damaged_regions: HashSet::default(), } } @@ -2561,6 +2566,13 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let damaged_regions_id = meta.get_all_damaged_region_ids().into_iter().collect(); stats.set_damaged_regions_id(damaged_regions_id); } + if !meta.damaged_regions.is_empty() { + // Note: no need to filter overlapped regions, since the regions in + // `damaged_ranges` are already non-overlapping. + stats + .mut_damaged_regions_id() + .extend(meta.damaged_regions.iter()); + } } let snap_stats = self.ctx.snap_mgr.stats(); diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 35a17903919..09c33fe1616 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -622,7 +622,11 @@ pub enum CasualMessage { RenewLease, // Snapshot is applied - SnapshotApplied, + SnapshotApplied { + peer_id: u64, + /// Whether the peer is destroyed after applying the snapshot + tombstone: bool, + }, // Trigger raft to campaign which is used after exiting force leader Campaign, @@ -691,7 +695,11 @@ impl fmt::Debug for CasualMessage { } CasualMessage::RefreshRegionBuckets { .. 
} => write!(fmt, "RefreshRegionBuckets"), CasualMessage::RenewLease => write!(fmt, "RenewLease"), - CasualMessage::SnapshotApplied => write!(fmt, "SnapshotApplied"), + CasualMessage::SnapshotApplied { peer_id, tombstone } => write!( + fmt, + "SnapshotApplied, peer_id={}, tombstone={}", + peer_id, tombstone + ), CasualMessage::Campaign => write!(fmt, "Campaign"), } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 5bf45971ff0..c985af40830 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2842,7 +2842,13 @@ where if for_witness { // inform next round to check apply status ctx.router - .send_casual_msg(snap_region.get_id(), CasualMessage::SnapshotApplied) + .send_casual_msg( + snap_region.get_id(), + CasualMessage::SnapshotApplied { + peer_id: self.peer.get_id(), + tombstone: false, + }, + ) .unwrap(); } // When applying snapshot, there is no log applied and not compacted yet. diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 99897ba551c..dd261ebb13a 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -795,8 +795,9 @@ where } else if s == JOB_STATUS_CANCELLED { SnapState::ApplyAborted } else if s == JOB_STATUS_FAILED { - // TODO: cleanup region and treat it as tombstone. - panic!("{} applying snapshot failed", self.tag,); + // Cleanup region and treat it as tombstone. 
+ warn!("{} applying snapshot failed", self.tag); + SnapState::ApplyAborted } else { return CheckApplyingSnapStatus::Applying; } @@ -2029,8 +2030,8 @@ pub mod tests { s.snap_state = RefCell::new(SnapState::Applying(Arc::new(AtomicUsize::new( JOB_STATUS_FAILED, )))); - let res = panic_hook::recover_safe(|| s.cancel_applying_snap()); - res.unwrap_err(); + assert!(s.cancel_applying_snap()); + assert_eq!(*s.snap_state.borrow(), SnapState::ApplyAborted); } #[test] @@ -2079,8 +2080,8 @@ pub mod tests { s.snap_state = RefCell::new(SnapState::Applying(Arc::new(AtomicUsize::new( JOB_STATUS_FAILED, )))); - let res = panic_hook::recover_safe(|| s.check_applying_snap()); - res.unwrap_err(); + assert!(s.cancel_applying_snap()); + assert_eq!(*s.snap_state.borrow(), SnapState::ApplyAborted); } #[test] diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 068904b2a67..fe5ef64b9cd 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -46,7 +46,7 @@ use crate::{ }, snap::{plain_file_used, Error, Result, SNAPSHOT_CFS}, transport::CasualRouter, - ApplyOptions, CasualMessage, Config, SnapEntry, SnapKey, SnapManager, + ApplyOptions, CasualMessage, Config, SnapEntry, SnapError, SnapKey, SnapManager, }, }; @@ -454,6 +454,9 @@ where fn apply_snap(&mut self, region_id: u64, peer_id: u64, abort: Arc) -> Result<()> { info!("begin apply snap data"; "region_id" => region_id, "peer_id" => peer_id); fail_point!("region_apply_snap", |_| { Ok(()) }); + fail_point!("region_apply_snap_io_err", |_| { + Err(SnapError::Other(box_err!("io error"))) + }); check_abort(&abort)?; let mut region_state = self.region_state(region_id)?; @@ -521,10 +524,11 @@ where let start = Instant::now(); - match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { + let tombstone = match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { Ok(()) => { status.swap(JOB_STATUS_FINISHED, 
Ordering::SeqCst); SNAP_COUNTER.apply.success.inc(); + false } Err(Error::Abort) => { warn!("applying snapshot is aborted"; "region_id" => region_id); @@ -535,18 +539,29 @@ where JOB_STATUS_CANCELLING ); SNAP_COUNTER.apply.abort.inc(); + // The snapshot is applied abort, it's not necessary to tombstone the peer. + false } Err(e) => { - error!(%e; "failed to apply snap!!!"); + warn!("failed to apply snap!!!"; "region_id" => region_id, "err" => %e); + self.coprocessor_host + .cancel_apply_snapshot(region_id, peer_id); status.swap(JOB_STATUS_FAILED, Ordering::SeqCst); SNAP_COUNTER.apply.fail.inc(); + // As the snapshot failed, the related peer should be marked tombstone. + // And as for the abnormal snapshot, it will be automatically cleaned up by + // the CleanupWorker later. + true } - } + }; SNAP_HISTOGRAM .apply .observe(start.saturating_elapsed_secs()); - let _ = self.router.send(region_id, CasualMessage::SnapshotApplied); + let _ = self.router.send( + region_id, + CasualMessage::SnapshotApplied { peer_id, tombstone }, + ); } /// Tries to clean up files in pending ranges overlapping with the given @@ -1275,7 +1290,7 @@ pub(crate) mod tests { let wait_apply_finish = |ids: &[u64]| { for id in ids { match receiver.recv_timeout(Duration::from_secs(5)) { - Ok((region_id, CasualMessage::SnapshotApplied)) => { + Ok((region_id, CasualMessage::SnapshotApplied { .. })) => { assert_eq!(region_id, *id); } msg => panic!("expected {} SnapshotApplied, but got {:?}", id, msg), diff --git a/tests/failpoints/cases/test_pending_peers.rs b/tests/failpoints/cases/test_pending_peers.rs index c41c97034b4..76bf56ae698 100644 --- a/tests/failpoints/cases/test_pending_peers.rs +++ b/tests/failpoints/cases/test_pending_peers.rs @@ -1,6 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use test_raftstore::*; use tikv_util::{config::*, time::Instant}; @@ -109,3 +109,45 @@ fn test_pending_snapshot() { state2 ); } + +#[test] +fn test_on_apply_snap_failed() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(5); + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(100); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + + // To ensure peer 2 is not pending. + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + + // Mock applying snapshot failed on peer 3. + fail::cfg("region_apply_snap_io_err", "return").unwrap(); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + // Region worker is failed on applying snapshot. + test_util::eventually(Duration::from_millis(100), Duration::from_secs(1), || { + let pending_peers = pd_client.get_pending_peers(); + pending_peers[&3] == new_peer(3, 3) + }); + must_get_none(&cluster.get_engine(3), b"k1"); + cluster.must_send_store_heartbeat(3); + // Check that the region is marked as damaged. 
+ test_util::eventually(Duration::from_millis(100), Duration::from_secs(1), || { + if let Some(stats) = pd_client.get_store_stats(3) { + !stats.damaged_regions_id.is_empty() + } else { + false + } + }); + let stats = pd_client.get_store_stats(3).unwrap(); + assert!(stats.damaged_regions_id.contains(®ion_id)); + fail::remove("region_apply_snap_io_err"); +} From 0cb14016e4ea83c1ba8642a2fc06d08a9b96e336 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 15 Jul 2024 17:11:59 +0800 Subject: [PATCH 187/220] raftstore: optimize AutoSplitController memory usage (#16678) (#17044) ref tikv/tikv#16653, close tikv/tikv#16716 raftstore: optimize AutoSplitController memory usage * Replaced unbounded channels with bounded channels to prevent unexpected memory buildup when AutoSplitController runs slowly. * Implemented reusability of temporary vectors and maps during CPU stats handling to reduce memory allocation and deallocation overhead, saving about 10% CPU. raftstore: fix the bug when the channel is full in AutoSplitController. (#16726) close tikv/tikv#16716 raftstore: supply extra ut for testing non-blocking channel. (#16729) Add an extra ut to test the non-blocking channel in pd_worker for sending collected ReadStats and CPU statistics. 
Signed-off-by: lucasliang Signed-off-by: Neil Shen Co-authored-by: lucasliang Co-authored-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/worker/pd.rs | 115 +++++-- .../src/store/worker/split_controller.rs | 315 ++++++++++++++++-- 2 files changed, 380 insertions(+), 50 deletions(-) diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index a5bf52567ca..e728ab12502 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -7,7 +7,7 @@ use std::{ io, mem, sync::{ atomic::Ordering, - mpsc::{self, Receiver, Sender}, + mpsc::{self, Receiver, Sender, SyncSender}, Arc, Mutex, }, thread::{Builder, JoinHandle}, @@ -53,6 +53,7 @@ use tikv_util::{ use txn_types::TimeStamp; use yatp::Remote; +use super::split_controller::AutoSplitControllerContext; use crate::{ coprocessor::CoprocessorHost, router::RaftStoreRouter, @@ -73,6 +74,10 @@ use crate::{ }; pub const NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT: u32 = 2; +/// The upper bound of buffered stats messages. +/// It prevents unexpected memory buildup when AutoSplitController +/// runs slowly. 
+const STATS_CHANNEL_CAPACITY_LIMIT: usize = 128; type RecordPairVec = Vec; @@ -583,8 +588,8 @@ where reporter: T, handle: Option>, timer: Option>, - read_stats_sender: Option>, - cpu_stats_sender: Option>>, + read_stats_sender: Option>, + cpu_stats_sender: Option>>, collect_store_infos_interval: Duration, load_base_split_check_interval: Duration, collect_tick_interval: Duration, @@ -661,10 +666,12 @@ where let (timer_tx, timer_rx) = mpsc::channel(); self.timer = Some(timer_tx); - let (read_stats_sender, read_stats_receiver) = mpsc::channel(); + let (read_stats_sender, read_stats_receiver) = + mpsc::sync_channel(STATS_CHANNEL_CAPACITY_LIMIT); self.read_stats_sender = Some(read_stats_sender); - let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); + let (cpu_stats_sender, cpu_stats_receiver) = + mpsc::sync_channel(STATS_CHANNEL_CAPACITY_LIMIT); self.cpu_stats_sender = Some(cpu_stats_sender); let reporter = self.reporter.clone(); @@ -683,6 +690,8 @@ where let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); let mut region_cpu_records_collector = None; + let mut auto_split_controller_ctx = + AutoSplitControllerContext::new(STATS_CHANNEL_CAPACITY_LIMIT); // Register the region CPU records collector. 
if auto_split_controller .cfg @@ -704,6 +713,7 @@ where if is_enable_tick(timer_cnt, load_base_split_check_interval) { StatsMonitor::load_base_split( &mut auto_split_controller, + &mut auto_split_controller_ctx, &read_stats_receiver, &cpu_stats_receiver, &mut load_base_split_thread_stats, @@ -740,6 +750,7 @@ where pub fn load_base_split( auto_split_controller: &mut AutoSplitController, + auto_split_controller_ctx: &mut AutoSplitControllerContext, read_stats_receiver: &Receiver, cpu_stats_receiver: &Receiver>, thread_stats: &mut ThreadInfoStatistics, @@ -761,18 +772,14 @@ where } SplitConfigChange::Noop => {} } - let mut read_stats_vec = vec![]; - while let Ok(read_stats) = read_stats_receiver.try_recv() { - read_stats_vec.push(read_stats); - } - let mut cpu_stats_vec = vec![]; - while let Ok(cpu_stats) = cpu_stats_receiver.try_recv() { - cpu_stats_vec.push(cpu_stats); - } - thread_stats.record(); - let (top_qps, split_infos) = - auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); + let (top_qps, split_infos) = auto_split_controller.flush( + auto_split_controller_ctx, + read_stats_receiver, + cpu_stats_receiver, + thread_stats, + ); auto_split_controller.clear(); + auto_split_controller_ctx.maybe_gc(); reporter.auto_split(split_infos); for i in 0..TOP_N { if i < top_qps.len() { @@ -800,8 +807,8 @@ where #[inline] pub fn maybe_send_read_stats(&self, read_stats: ReadStats) { if let Some(sender) = &self.read_stats_sender { - if sender.send(read_stats).is_err() { - warn!("send read_stats failed, are we shutting down?") + if sender.try_send(read_stats).is_err() { + debug!("send read_stats failed, are we shutting down or channel is full?") } } } @@ -809,8 +816,8 @@ where #[inline] pub fn maybe_send_cpu_stats(&self, cpu_stats: &Arc) { if let Some(sender) = &self.cpu_stats_sender { - if sender.send(cpu_stats.clone()).is_err() { - warn!("send region cpu info failed, are we shutting down?") + if sender.try_send(cpu_stats.clone()).is_err() { + 
debug!("send region cpu info failed, are we shutting down or channel is full?") } } } @@ -2663,8 +2670,10 @@ mod tests { use kvproto::{kvrpcpb, pdpb::QueryKind}; use pd_client::{new_bucket_stats, BucketMeta}; + use tikv_util::worker::LazyWorker; use super::*; + use crate::store::{fsm::StoreMeta, util::build_key_range}; const DEFAULT_TEST_STORE_ID: u64 = 1; @@ -2674,7 +2683,6 @@ mod tests { use std::{sync::Mutex, time::Instant}; use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; - use tikv_util::worker::LazyWorker; use crate::store::fsm::StoreMeta; @@ -2988,4 +2996,69 @@ mod tests { assert_eq!(used, 111); assert_eq!(avail, 333); } + + #[test] + fn test_pd_worker_send_stats_on_read_and_cpu() { + let mut pd_worker: LazyWorker> = + LazyWorker::new("test-pd-worker-collect-stats"); + // Set the interval long enough for mocking the channel full state. + let interval = 600_u64; + let mut stats_monitor = StatsMonitor::new( + Duration::from_secs(interval), + Duration::from_secs(0), + Duration::from_secs(interval), + WrappedScheduler(pd_worker.scheduler()), + ); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); + stats_monitor + .start( + AutoSplitController::default(), + region_read_progress, + CollectorRegHandle::new_for_test(), + 0, + ) + .unwrap(); + // Add some read stats and cpu stats to the stats monitor. 
+ { + for _ in 0..=STATS_CHANNEL_CAPACITY_LIMIT + 10 { + let mut read_stats = ReadStats::with_sample_num(1); + read_stats.add_query_num( + 1, + &Peer::default(), + build_key_range(b"a", b"b", false), + QueryKind::Get, + ); + stats_monitor.maybe_send_read_stats(read_stats); + } + + let raw_records = Arc::new(RawRecords { + begin_unix_time_secs: UnixSecs::now().into_inner(), + duration: Duration::default(), + records: { + let mut records = HashMap::default(); + records.insert( + Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![], + extra_attachment: b"a".to_vec(), + }), + RawRecord { + cpu_time: 111, + read_keys: 1, + write_keys: 0, + }, + ); + records + }, + }); + for _ in 0..=STATS_CHANNEL_CAPACITY_LIMIT + 10 { + stats_monitor.maybe_send_cpu_stats(&raw_records); + } + } + + pd_worker.stop(); + } } diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 4bbcc773763..5fef6fd76a0 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -2,12 +2,13 @@ use std::{ cmp::{min, Ordering}, - collections::{BinaryHeap, HashMap, HashSet}, + collections::{BinaryHeap, HashSet}, slice::{Iter, IterMut}, sync::{mpsc::Receiver, Arc}, time::{Duration, SystemTime}, }; +use collections::HashMap; use kvproto::{ kvrpcpb::KeyRange, metapb::{self, Peer}, @@ -21,6 +22,7 @@ use tikv_util::{ debug, info, metrics::ThreadInfoStatistics, store::{is_read_query, QueryStats}, + time::Instant, warn, }; @@ -648,11 +650,15 @@ impl AutoSplitController { // collect the read stats from read_stats_vec and dispatch them to a Region // HashMap. 
- fn collect_read_stats(read_stats_vec: Vec) -> HashMap> { + fn collect_read_stats( + ctx: &mut AutoSplitControllerContext, + read_stats_receiver: &Receiver, + ) -> HashMap> { + let read_stats_vec = ctx.batch_recv_read_stats(read_stats_receiver); // RegionID -> Vec, collect the RegionInfo from different threads. let mut region_infos_map = HashMap::default(); let capacity = read_stats_vec.len(); - for read_stats in read_stats_vec { + for read_stats in read_stats_vec.drain(..) { for (region_id, region_info) in read_stats.region_infos { let region_infos = region_infos_map .entry(region_id) @@ -665,19 +671,27 @@ impl AutoSplitController { // collect the CPU stats from cpu_stats_vec and dispatch them to a Region // HashMap. - fn collect_cpu_stats( - &self, - cpu_stats_vec: Vec>, - ) -> HashMap)> { + fn collect_cpu_stats<'c>( + &mut self, + ctx: &'c mut AutoSplitControllerContext, + cpu_stats_receiver: &Receiver>, + ) -> &'c HashMap)> { // RegionID -> (CPU usage, Hottest Key Range), calculate the CPU usage and its // hottest key range. - let mut region_cpu_map = HashMap::default(); if !self.should_check_region_cpu() { - return region_cpu_map; + return ctx.empty_region_cpu_map(); } + + let ( + cpu_stats_vec, + CpuStatsCache { + region_cpu_map, + hottest_key_range_cpu_time_map, + }, + ) = ctx.batch_recv_cpu_stats(cpu_stats_receiver); // Calculate the Region CPU usage. let mut collect_interval_ms = 0; - let mut region_key_range_cpu_time_map = HashMap::new(); + let mut region_key_range_cpu_time_map = HashMap::default(); cpu_stats_vec.iter().for_each(|cpu_stats| { cpu_stats.records.iter().for_each(|(tag, record)| { // Calculate the Region ID -> CPU Time. @@ -704,7 +718,6 @@ impl AutoSplitController { } }); // Choose the hottest key range for each Region. 
- let mut hottest_key_range_cpu_time_map = HashMap::with_capacity(region_cpu_map.len()); region_key_range_cpu_time_map .iter() .for_each(|((region_id, key_range), cpu_time)| { @@ -740,15 +753,17 @@ impl AutoSplitController { // be split according to all the stats info the recorder has collected before. pub fn flush( &mut self, - read_stats_vec: Vec, - cpu_stats_vec: Vec>, - thread_stats: &ThreadInfoStatistics, + ctx: &mut AutoSplitControllerContext, + read_stats_receiver: &Receiver, + cpu_stats_receiver: &Receiver>, + thread_stats: &mut ThreadInfoStatistics, ) -> (Vec, Vec) { let mut top_cpu_usage = vec![]; let mut top_qps = BinaryHeap::with_capacity(TOP_N); - let region_infos_map = Self::collect_read_stats(read_stats_vec); - let region_cpu_map = self.collect_cpu_stats(cpu_stats_vec); + let region_infos_map = Self::collect_read_stats(ctx, read_stats_receiver); + let region_cpu_map = self.collect_cpu_stats(ctx, cpu_stats_receiver); // Prepare some diagnostic info. + thread_stats.record(); let (grpc_thread_usage, unified_read_pool_thread_usage) = ( Self::collect_thread_usage(thread_stats, "grpc-server"), Self::collect_thread_usage(thread_stats, "unified-read-po"), @@ -939,8 +954,89 @@ impl AutoSplitController { } } +#[derive(Default)] +pub struct CpuStatsCache { + region_cpu_map: HashMap)>, + hottest_key_range_cpu_time_map: HashMap, +} + +pub struct AutoSplitControllerContext { + read_stats_vec: Vec, + cpu_stats_vec: Vec>, + cpu_stats_cache: CpuStatsCache, + batch_recv_len: usize, + + last_gc_time: Instant, + gc_duration: Duration, +} + +impl AutoSplitControllerContext { + pub fn new(batch_recv_len: usize) -> Self { + AutoSplitControllerContext { + read_stats_vec: Vec::default(), + cpu_stats_vec: Vec::default(), + cpu_stats_cache: CpuStatsCache::default(), + batch_recv_len, + last_gc_time: Instant::now_coarse(), + // 30 seconds is a balance between efficient memory usage and + // maintaining performance under load. 
+ gc_duration: Duration::from_secs(30), + } + } + + pub fn batch_recv_read_stats( + &mut self, + read_stats_receiver: &Receiver, + ) -> &mut Vec { + self.read_stats_vec.clear(); + + while let Ok(read_stats) = read_stats_receiver.try_recv() { + self.read_stats_vec.push(read_stats); + if self.read_stats_vec.len() == self.batch_recv_len { + break; + } + } + &mut self.read_stats_vec + } + + pub fn batch_recv_cpu_stats( + &mut self, + cpu_stats_receiver: &Receiver>, + ) -> (&mut Vec>, &mut CpuStatsCache) { + self.cpu_stats_vec.clear(); + self.cpu_stats_cache.region_cpu_map.clear(); + self.cpu_stats_cache.hottest_key_range_cpu_time_map.clear(); + + while let Ok(cpu_stats) = cpu_stats_receiver.try_recv() { + self.cpu_stats_vec.push(cpu_stats); + if self.cpu_stats_vec.len() == self.batch_recv_len { + break; + } + } + (&mut self.cpu_stats_vec, &mut self.cpu_stats_cache) + } + + pub fn empty_region_cpu_map(&mut self) -> &HashMap)> { + self.cpu_stats_cache.region_cpu_map.clear(); + &self.cpu_stats_cache.region_cpu_map + } + + pub fn maybe_gc(&mut self) { + let now = Instant::now_coarse(); + if now.saturating_duration_since(self.last_gc_time) > self.gc_duration { + self.read_stats_vec = Vec::default(); + self.cpu_stats_vec = Vec::default(); + self.cpu_stats_cache = CpuStatsCache::default(); + + self.last_gc_time = now; + } + } +} + #[cfg(test)] mod tests { + use std::sync::mpsc::{self, TryRecvError}; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; use resource_metering::{RawRecord, TagInfos}; use tikv_util::config::{ReadableSize, VersionTrack}; @@ -1190,6 +1286,30 @@ mod tests { fail::remove("mock_region_is_busy"); } + fn new_auto_split_controller_ctx( + read_stats: Vec, + cpu_stats: Vec>, + ) -> ( + AutoSplitControllerContext, + Receiver, + Receiver>, + ) { + let len = std::cmp::max(read_stats.len(), cpu_stats.len()); + let (read_stats_sender, read_stats_receiver) = mpsc::sync_channel(len); + let (cpu_stats_sender, cpu_stats_receiver) = 
mpsc::sync_channel(len); + for s in cpu_stats { + cpu_stats_sender.try_send(s).unwrap(); + } + for s in read_stats { + read_stats_sender.try_send(s).unwrap(); + } + ( + AutoSplitControllerContext::new(len), + read_stats_receiver, + cpu_stats_receiver, + ) + } + fn check_split_key(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); @@ -1197,8 +1317,14 @@ mod tests { hub.cfg.sample_threshold = 0; for i in 0..10 { - let (_, split_infos) = - hub.flush(qps_stats.clone(), vec![], &ThreadInfoStatistics::default()); + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats.clone(), vec![]); + let (_, split_infos) = hub.flush( + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), + ); if (i + 1) % hub.cfg.detect_times != 0 { continue; } @@ -1230,10 +1356,13 @@ mod tests { hub.cfg.sample_threshold = 0; for i in 0..10 { + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats.clone(), cpu_stats.clone()); let (_, split_infos) = hub.flush( - qps_stats.clone(), - cpu_stats.clone(), - &ThreadInfoStatistics::default(), + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), ); if (i + 1) % hub.cfg.detect_times != 0 { continue; @@ -1318,7 +1447,15 @@ mod tests { ); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); + + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats_vec.clone(), vec![]); + hub.flush( + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), + ); } // Test the empty key ranges. 
@@ -1331,7 +1468,15 @@ mod tests { qps_stats.add_query_num(1, &Peer::default(), KeyRange::default(), QueryKind::Get); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); + + let (mut ctx, read_stats_receiver, cpu_stats_receiver) = + new_auto_split_controller_ctx(qps_stats_vec, vec![]); + hub.flush( + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut ThreadInfoStatistics::default(), + ); } fn check_sample_length(key_ranges: Vec>) { @@ -1678,8 +1823,10 @@ mod tests { #[test] fn test_collect_cpu_stats() { - let auto_split_controller = AutoSplitController::default(); - let region_cpu_map = auto_split_controller.collect_cpu_stats(vec![]); + let mut auto_split_controller = AutoSplitController::default(); + + let (mut ctx, _, cpu_stats_receiver) = new_auto_split_controller_ctx(vec![], vec![]); + let region_cpu_map = auto_split_controller.collect_cpu_stats(&mut ctx, &cpu_stats_receiver); assert!(region_cpu_map.is_empty()); let ab_key_range_tag = Arc::new(TagInfos { @@ -1767,8 +1914,11 @@ mod tests { write_keys: 0, }, ); + + let (mut ctx, _, cpu_stats_receiver) = + new_auto_split_controller_ctx(vec![], vec![Arc::new(raw_records)]); let region_cpu_map = - auto_split_controller.collect_cpu_stats(vec![Arc::new(raw_records)]); + auto_split_controller.collect_cpu_stats(&mut ctx, &cpu_stats_receiver); assert_eq!( region_cpu_map.len(), 1, @@ -1869,12 +2019,21 @@ mod tests { for _i in 0..10 { other_qps_stats.push(default_qps_stats()); } + let (read_stats_sender, read_stats_receiver) = mpsc::sync_channel(other_qps_stats.len()); + let (_, cpu_stats_receiver) = mpsc::sync_channel(other_qps_stats.len()); + let mut ctx = AutoSplitControllerContext::new(other_qps_stats.len()); + let mut threads = ThreadInfoStatistics::default(); + b.iter(|| { let mut hub = AutoSplitController::default(); + for s in other_qps_stats.clone() { + read_stats_sender.send(s).unwrap(); + } hub.flush( - other_qps_stats.clone(), - vec![], - 
&ThreadInfoStatistics::default(), + &mut ctx, + &read_stats_receiver, + &cpu_stats_receiver, + &mut threads, ); }); } @@ -1915,4 +2074,102 @@ mod tests { ); }); } + + #[test] + fn test_auto_split_controller_ctx_batch_recv() { + let batch_limit = 3; + let mut ctx = AutoSplitControllerContext::new(batch_limit); + for len in [0, 2, 3, 5, 6] { + let (read_stats_sender, read_stats_receiver) = mpsc::sync_channel(len); + let (cpu_stats_sender, cpu_stats_receiver) = mpsc::sync_channel(len); + + let read_stats = ReadStats::default(); + let cpu_stats = Arc::new(RawRecords::default()); + for _ in 0..len { + read_stats_sender.try_send(read_stats.clone()).unwrap(); + cpu_stats_sender.try_send(cpu_stats.clone()).unwrap(); + } + // If channel is full, should return error. + assert!(read_stats_sender.try_send(read_stats.clone()).is_err()); + assert!(cpu_stats_sender.try_send(cpu_stats.clone()).is_err()); + loop { + let batch = ctx.batch_recv_read_stats(&read_stats_receiver); + if batch.is_empty() { + break; + } + assert!( + batch.len() == batch_limit || batch.len() == len % batch_limit, + "{:?}", + (len, batch.len()) + ); + } + assert_eq!( + read_stats_receiver.try_recv().unwrap_err(), + TryRecvError::Empty + ); + + loop { + let (batch, cache) = ctx.batch_recv_cpu_stats(&cpu_stats_receiver); + if batch.is_empty() { + break; + } + assert!( + batch.len() == batch_limit || batch.len() == len % batch_limit, + "{:?}", + (len, batch.len()) + ); + assert!(cache.region_cpu_map.is_empty()); + assert!(cache.hottest_key_range_cpu_time_map.is_empty()); + // The cache should be empty after the batch_recv_cpu_stats. 
+ cache.region_cpu_map.insert(1, (0.0, None)); + cache.hottest_key_range_cpu_time_map.insert(1, 1); + } + assert_eq!( + read_stats_receiver.try_recv().unwrap_err(), + TryRecvError::Empty + ); + } + } + + #[test] + fn test_auto_split_controller_empty_region_cpu_map() { + let mut ctx = AutoSplitControllerContext::new(1); + ctx.cpu_stats_cache.region_cpu_map.insert(1, (0.0, None)); + assert!(ctx.empty_region_cpu_map().is_empty()); + } + + #[test] + fn test_auto_split_controller_empty_gc() { + let mut ctx = AutoSplitControllerContext::new(1); + ctx.cpu_stats_cache.region_cpu_map.insert(1, (0.0, None)); + ctx.cpu_stats_cache + .hottest_key_range_cpu_time_map + .insert(1, 1); + ctx.cpu_stats_vec.push(Arc::new(RawRecords::default())); + ctx.read_stats_vec.push(ReadStats::default()); + + ctx.last_gc_time = Instant::now_coarse(); + ctx.maybe_gc(); + + assert!(!ctx.cpu_stats_cache.region_cpu_map.is_empty()); + assert!( + !ctx.cpu_stats_cache + .hottest_key_range_cpu_time_map + .is_empty() + ); + assert!(!ctx.cpu_stats_vec.is_empty()); + assert!(!ctx.read_stats_vec.is_empty()); + + ctx.last_gc_time = Instant::now_coarse() - 2 * ctx.gc_duration; + ctx.maybe_gc(); + + assert!(ctx.cpu_stats_cache.region_cpu_map.is_empty()); + assert!( + ctx.cpu_stats_cache + .hottest_key_range_cpu_time_map + .is_empty() + ); + assert!(ctx.cpu_stats_vec.is_empty()); + assert!(ctx.read_stats_vec.is_empty()); + } } From ef78cb3581e091dda1b920aee769c431d275c35c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 16 Jul 2024 20:32:59 +0800 Subject: [PATCH 188/220] Fix the problem that CDC and log-backup didn't use advance-ts-interval to limit the timeout of check_leader (#17113) (#17262) ref tikv/tikv#16698, close tikv/tikv#17107 Fix the problem that CDC and log-backup didn't use advance-ts-interval to limit the timeout of check_leader Signed-off-by: ti-chi-bot Signed-off-by: MyonKeminta Co-authored-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Co-authored-by: MyonKeminta 
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup-stream/src/endpoint.rs | 4 +++- components/backup-stream/src/subscription_manager.rs | 11 ++++++++++- components/backup-stream/tests/suite.rs | 3 ++- components/cdc/src/endpoint.rs | 11 +++++++++-- components/cdc/tests/mod.rs | 7 ++++++- components/server/src/server.rs | 2 ++ components/server/src/server2.rs | 2 ++ 7 files changed, 34 insertions(+), 6 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index b8e0ec3139e..1df518094bc 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -24,7 +24,7 @@ use raftstore::{ router::CdcHandle, }; use resolved_ts::{resolve_by_raft, LeadershipResolver}; -use tikv::config::BackupStreamConfig; +use tikv::config::{BackupStreamConfig, ResolvedTsConfig}; use tikv_util::{ box_err, config::ReadableDuration, @@ -112,6 +112,7 @@ where store_id: u64, store: S, config: BackupStreamConfig, + resolved_ts_config: ResolvedTsConfig, scheduler: Scheduler, observer: BackupStreamObserver, accessor: R, @@ -172,6 +173,7 @@ where pd_client.clone(), ((config.num_threads + 1) / 2).max(1), resolver, + resolved_ts_config.advance_ts_interval.0, ); pool.spawn(op_loop); let mut checkpoint_mgr = CheckpointManager::default(); diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 5690483c0de..1c090ceda4d 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -297,6 +297,8 @@ pub struct RegionSubscriptionManager { messenger: Sender, scan_pool_handle: Arc, scans: Arc, + + advance_ts_interval: Duration, } impl Clone for RegionSubscriptionManager @@ -318,6 +320,7 @@ where messenger: self.messenger.clone(), scan_pool_handle: self.scan_pool_handle.clone(), scans: FutureWaitGroup::new(), + 
advance_ts_interval: self.advance_ts_interval, } } } @@ -358,6 +361,7 @@ where pd_client: Arc, scan_pool_size: usize, resolver: BackupStreamResolver, + advance_ts_interval: Duration, ) -> (Self, future![()]) where E: KvEngine, @@ -377,6 +381,7 @@ where messenger: tx, scan_pool_handle: Arc::new(scan_pool_handle), scans: FutureWaitGroup::new(), + advance_ts_interval, }; let fut = op.clone().region_operator_loop(rx, resolver); (op, fut) @@ -476,7 +481,11 @@ where "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } let regions = resolver - .resolve(self.subs.current_regions(), min_ts, None) + .resolve( + self.subs.current_regions(), + min_ts, + Some(self.advance_ts_interval), + ) .await; let cps = self.subs.resolve_with(min_ts, regions); let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index 3034bc26e4c..af2ca08a92f 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -37,7 +37,7 @@ use tempdir::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use test_util::retry; -use tikv::config::BackupStreamConfig; +use tikv::config::{BackupStreamConfig, ResolvedTsConfig}; use tikv_util::{ codec::{ number::NumberEncoder, @@ -382,6 +382,7 @@ impl Suite { id, self.meta_store.clone(), cfg, + ResolvedTsConfig::default(), worker.scheduler(), ob, regions, diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index e70358ccf2a..0eeae5d8638 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -36,7 +36,7 @@ use raftstore::{ use resolved_ts::{resolve_by_raft, LeadershipResolver, Resolver}; use security::SecurityManager; use tikv::{ - config::CdcConfig, + config::{CdcConfig, ResolvedTsConfig}, storage::{kv::LocalTablets, Statistics}, }; use tikv_util::{ @@ -378,6 +378,7 @@ pub struct Endpoint { raftstore_v2: bool, 
config: CdcConfig, + resolved_ts_config: ResolvedTsConfig, api_version: ApiVersion, // Incremental scan @@ -407,6 +408,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, @@ -480,6 +482,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint Date: Tue, 16 Jul 2024 20:48:00 +0800 Subject: [PATCH 189/220] coprocessor: limit concurrent requests by memory quota (#16662) (#17049) close tikv/tikv#16653 Fix an issue that a large number of concurrent coprocessor requests may cause OOM. This issue is resolved by implementing a memory quota for coprocessors, which rejects incoming requests when the memory quota is full. Signed-off-by: Neil Shen Co-authored-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/cdc/src/delegate.rs | 12 +- components/error_code/src/coprocessor.rs | 1 + components/pd_client/src/lib.rs | 11 +- components/raftstore/src/store/fsm/apply.rs | 16 +- components/raftstore/src/store/peer.rs | 10 +- components/raftstore/src/store/read_queue.rs | 12 +- components/resolved_ts/src/endpoint.rs | 20 +- components/resolved_ts/src/resolver.rs | 4 +- components/resource_metering/src/lib.rs | 13 ++ components/server/src/server.rs | 22 ++- components/server/src/server2.rs | 21 +- components/tikv_util/src/memory.rs | 96 ++++++++- metrics/grafana/tikv_details.json | 111 +++++++++++ src/config/mod.rs | 68 +++++-- src/coprocessor/config_manager.rs | 30 +++ src/coprocessor/endpoint.rs | 194 +++++++++++++++---- src/coprocessor/error.rs | 11 ++ src/coprocessor/metrics.rs | 35 ++++ src/coprocessor/mod.rs | 13 +- src/coprocessor/tracker.rs | 15 +- src/server/config.rs | 66 +++++-- tests/Cargo.toml | 5 + tests/benches/memory/mod.rs | 122 ++++++++++++ tests/integrations/config/dynamic/snap.rs | 10 + 24 files changed, 792 insertions(+), 126 deletions(-) create mode 100644 src/coprocessor/config_manager.rs create mode 100644 
tests/benches/memory/mod.rs diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 780cfe8dea6..7afb36899b6 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -248,7 +248,7 @@ impl Pending { } fn push_pending_lock(&mut self, lock: PendingLock) -> Result<()> { - let bytes = lock.heap_size(); + let bytes = lock.approximate_heap_size(); self.memory_quota.alloc(bytes)?; self.locks.push(lock); self.pending_bytes += bytes; @@ -262,7 +262,7 @@ impl Pending { )); // Must take locks, otherwise it may double free memory quota on drop. for lock in mem::take(&mut self.locks) { - self.memory_quota.free(lock.heap_size()); + self.memory_quota.free(lock.approximate_heap_size()); match lock { PendingLock::Track { key, start_ts } => { resolver.track_lock(start_ts, key, None)?; @@ -286,7 +286,7 @@ impl Drop for Pending { let mut bytes = 0; let num_locks = locks.len(); for lock in locks { - bytes += lock.heap_size(); + bytes += lock.approximate_heap_size(); } if bytes > ON_DROP_WARN_HEAP_SIZE { warn!("cdc drop huge Pending"; @@ -306,9 +306,11 @@ enum PendingLock { } impl HeapSize for PendingLock { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { match self { - PendingLock::Track { key, .. } | PendingLock::Untrack { key } => key.heap_size(), + PendingLock::Track { key, .. 
} | PendingLock::Untrack { key } => { + key.approximate_heap_size() + } } } } diff --git a/components/error_code/src/coprocessor.rs b/components/error_code/src/coprocessor.rs index d98c85162bf..31f0ed4224a 100644 --- a/components/error_code/src/coprocessor.rs +++ b/components/error_code/src/coprocessor.rs @@ -6,6 +6,7 @@ define_error_codes!( LOCKED => ("Locked", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), MAX_PENDING_TASKS_EXCEEDED => ("MaxPendingTasksExceeded", "", ""), + MEMORY_QUOTA_EXCEEDED => ("MemoryQuotaExceeded", "", ""), INVALID_DATA_TYPE => ("InvalidDataType", "", ""), ENCODING => ("Encoding", "", ""), diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 21ae61ccd61..ad2881a0b8c 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -24,7 +24,10 @@ use kvproto::{ resource_manager::TokenBucketsRequest, }; use pdpb::QueryStats; -use tikv_util::time::{Instant, UnixSecs}; +use tikv_util::{ + memory::HeapSize, + time::{Instant, UnixSecs}, +}; use txn_types::TimeStamp; pub use self::{ @@ -133,6 +136,12 @@ impl BucketMeta { } } +impl HeapSize for BucketMeta { + fn approximate_heap_size(&self) -> usize { + self.keys.approximate_heap_size() + self.sizes.approximate_heap_size() + } +} + #[derive(Debug, Clone)] pub struct BucketStat { pub meta: Arc, diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index cafb3660d9f..8c99d0b19df 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3373,10 +3373,10 @@ where } fn update_memory_trace(&mut self, event: &mut TraceEvent) { - let pending_cmds = self.pending_cmds.heap_size(); + let pending_cmds = self.pending_cmds.approximate_heap_size(); let merge_yield = if let Some(ref mut state) = self.yield_state { if state.heap_size.is_none() { - state.heap_size = Some(state.heap_size()); + state.heap_size = Some(state.approximate_heap_size()); } 
state.heap_size.unwrap() } else { @@ -4954,7 +4954,7 @@ mod memtrace { } impl HeapSize for PendingCmdQueue { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { // Some fields of `PendingCmd` are on stack, but ignore them because they are // just some small boxed closures. self.normals.capacity() * mem::size_of::>() @@ -4965,7 +4965,7 @@ mod memtrace { where EK: KvEngine, { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.pending_entries.capacity() * mem::size_of::(); for e in &self.pending_entries { size += bytes_capacity(&e.data) + bytes_capacity(&e.context); @@ -4973,7 +4973,7 @@ mod memtrace { size += self.pending_msgs.capacity() * mem::size_of::>(); for msg in &self.pending_msgs { - size += msg.heap_size(); + size += msg.approximate_heap_size(); } size @@ -4985,9 +4985,9 @@ mod memtrace { EK: KvEngine, { /// Only consider large fields in `Msg`. - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { match self { - Msg::LogsUpToDate(l) => l.heap_size(), + Msg::LogsUpToDate(l) => l.approximate_heap_size(), // For entries in `Msg::Apply`, heap size is already updated when fetching them // from `raft::Storage`. So use `0` here. Msg::Apply { .. 
} => 0, @@ -5005,7 +5005,7 @@ mod memtrace { } impl HeapSize for CatchUpLogs { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size: usize = 0; for e in &self.merge.entries { size += bytes_capacity(&e.data) + bytes_capacity(&e.context); diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index c985af40830..020f3e00b9c 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4646,9 +4646,9 @@ where .coprocessor_host .pre_transfer_leader(self.region(), transfer_leader) { - warn!("Coprocessor rejected transfer leader."; "err" => ?err, - "region_id" => self.region_id, - "peer_id" => self.peer.get_id(), + warn!("Coprocessor rejected transfer leader."; "err" => ?err, + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), "transferee" => transfer_leader.get_peer().get_id()); let mut resp = RaftCmdResponse::new(); *resp.mut_header().mut_error() = Error::from(err).into(); @@ -5888,9 +5888,9 @@ mod memtrace { ER: RaftEngine, { pub fn proposal_size(&self) -> usize { - let mut heap_size = self.pending_reads.heap_size(); + let mut heap_size = self.pending_reads.approximate_heap_size(); for prop in &self.proposals.queue { - heap_size += prop.heap_size(); + heap_size += prop.approximate_heap_size(); } heap_size } diff --git a/components/raftstore/src/store/read_queue.rs b/components/raftstore/src/store/read_queue.rs index 376f168c26d..bde49b4ed30 100644 --- a/components/raftstore/src/store/read_queue.rs +++ b/components/raftstore/src/store/read_queue.rs @@ -46,7 +46,7 @@ impl ReadIndexRequest { pub fn push_command(&mut self, req: RaftCmdRequest, cb: C, read_index: u64) { RAFT_READ_INDEX_PENDING_COUNT.inc(); - self.cmds_heap_size += req.heap_size(); + self.cmds_heap_size += req.approximate_heap_size(); self.cmds.push((req, cb, Some(read_index))); } @@ -54,7 +54,7 @@ impl ReadIndexRequest { RAFT_READ_INDEX_PENDING_COUNT.inc(); // Ignore heap 
allocations for `Callback`. - let cmds_heap_size = req.heap_size(); + let cmds_heap_size = req.approximate_heap_size(); let mut cmds = MustConsumeVec::with_capacity("callback of index read", 1); cmds.push((req, cb, None)); @@ -434,10 +434,10 @@ mod memtrace { use super::*; impl HeapSize for ReadIndexRequest { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.cmds_heap_size + Self::CMD_SIZE * self.cmds.capacity(); if let Some(ref add) = self.addition_request { - size += add.heap_size(); + size += add.approximate_heap_size(); } size } @@ -445,12 +445,12 @@ mod memtrace { impl HeapSize for ReadIndexQueue { #[inline] - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.reads.capacity() * mem::size_of::>() // For one Uuid and one usize. + 24 * self.contexts.len(); for read in &self.reads { - size += read.heap_size(); + size += read.approximate_heap_size(); } size } diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index a668d8b0f52..6a227716a3d 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -76,7 +76,7 @@ impl Drop for ResolverStatus { let mut bytes = 0; let num_locks = locks.len(); for lock in locks { - bytes += lock.heap_size(); + bytes += lock.approximate_heap_size(); } if bytes > ON_DROP_WARN_HEAP_SIZE { warn!("drop huge ResolverStatus"; @@ -101,10 +101,12 @@ impl ResolverStatus { }; // Check if adding a new lock or unlock will exceed the memory // quota. 
- memory_quota.alloc(lock.heap_size()).map_err(|e| { - fail::fail_point!("resolved_ts_on_pending_locks_memory_quota_exceeded"); - Error::MemoryQuotaExceeded(e) - })?; + memory_quota + .alloc(lock.approximate_heap_size()) + .map_err(|e| { + fail::fail_point!("resolved_ts_on_pending_locks_memory_quota_exceeded"); + Error::MemoryQuotaExceeded(e) + })?; locks.push(lock); Ok(()) } @@ -143,7 +145,7 @@ impl ResolverStatus { ( *tracked_index, locks.into_iter().map(|lock| { - memory_quota.free(lock.heap_size()); + memory_quota.free(lock.approximate_heap_size()); lock }), ) @@ -164,10 +166,10 @@ enum PendingLock { } impl HeapSize for PendingLock { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { match self { PendingLock::Track { key, .. } | PendingLock::Untrack { key, .. } => { - key.as_encoded().heap_size() + key.as_encoded().approximate_heap_size() } } } @@ -440,7 +442,7 @@ where match &observed_region.resolver_status { ResolverStatus::Pending { locks, .. } => { for l in locks { - stats.heap_size += l.heap_size() as i64; + stats.heap_size += l.approximate_heap_size() as i64; } stats.unresolved_count += 1; } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 239ef566605..2aec9c336cd 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -5,7 +5,7 @@ use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration}; use collections::{HashMap, HashMapEntry}; use raftstore::store::RegionReadProgress; use tikv_util::{ - memory::{HeapSize, MemoryQuota, MemoryQuotaExceeded}, + memory::{MemoryQuota, MemoryQuotaExceeded}, time::Instant, }; use txn_types::{Key, TimeStamp}; @@ -257,7 +257,7 @@ impl Resolver { // the same Arc<[u8]>, so lock_ts_heap is negligible. Also, it's hard to // track accurate memory usage of lock_ts_heap as a timestamp may have // many keys. 
- key.heap_size() + std::mem::size_of::() + std::mem::size_of_val(key) + std::mem::size_of::() } fn shrink_ratio(&mut self, ratio: usize) { diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index ba8e2174e19..2c699998d3d 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -33,6 +33,7 @@ pub use reporter::{ ConfigChangeNotifier as ReporterConfigChangeNotifier, Reporter, Task, }; use tikv_util::{ + memory::HeapSize, sys::thread, warn, worker::{Scheduler, Worker}, @@ -98,6 +99,12 @@ impl ResourceMeteringTag { } } +impl HeapSize for ResourceMeteringTag { + fn approximate_heap_size(&self) -> usize { + self.infos.approximate_mem_size() + } +} + /// An RAII implementation of a [ResourceMeteringTag]. When this structure is /// dropped (falls out of scope), the tag will be removed. You can also clean /// up other data here if necessary. @@ -319,6 +326,12 @@ impl TagInfos { } } +impl HeapSize for TagInfos { + fn approximate_heap_size(&self) -> usize { + self.key_ranges.approximate_heap_size() + self.extra_attachment.approximate_heap_size() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 5139e28335d..cc9f8ad3cf2 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -797,20 +797,25 @@ where .unwrap_or_else(|e| fatal!("failed to bootstrap node id: {}", e)); self.snap_mgr = Some(snap_mgr.clone()); + + // Create coprocessor endpoint. 
+ let copr = coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + self.quota_limiter.clone(), + self.resource_manager.clone(), + ); + let copr_config_manager = copr.config_manager(); + // Create server let server = Server::new( node.id(), &server_config, &self.security_mgr, storage.clone(), - coprocessor::Endpoint::new( - &server_config.value(), - cop_read_pool_handle, - self.concurrency_manager.clone(), - resource_tag_factory, - self.quota_limiter.clone(), - self.resource_manager.clone(), - ), + copr, coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), Either::Left(snap_mgr.clone()), @@ -829,6 +834,7 @@ where server.get_snap_worker_scheduler(), server_config.clone(), server.get_grpc_mem_quota().clone(), + copr_config_manager, )), ); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index eb7c7e4d979..fbc7db4b4ce 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -764,6 +764,17 @@ where let node = self.node.as_ref().unwrap(); + // Create coprocessor endpoint. 
+ let copr = coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + self.quota_limiter.clone(), + self.resource_manager.clone(), + ); + let copr_config_manager = copr.config_manager(); + self.snap_mgr = Some(snap_mgr.clone()); // Create server let server = Server::new( @@ -771,14 +782,7 @@ where &server_config, &self.security_mgr, storage, - coprocessor::Endpoint::new( - &server_config.value(), - cop_read_pool_handle, - self.concurrency_manager.clone(), - resource_tag_factory, - self.quota_limiter.clone(), - self.resource_manager.clone(), - ), + copr, coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), Either::Right(snap_mgr.clone()), @@ -797,6 +801,7 @@ where server.get_snap_worker_scheduler(), server_config.clone(), server.get_grpc_mem_quota().clone(), + copr_config_manager, )), ); diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 15ffece4425..c216fa84b8e 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -8,9 +8,11 @@ use std::{ }, }; +use collections::HashMap; use kvproto::{ + coprocessor as coppb, encryptionpb::EncryptionMeta, - kvrpcpb::LockInfo, + kvrpcpb::{self, LockInfo}, metapb::{Peer, Region, RegionEpoch}, raft_cmdpb::{self, RaftCmdRequest, ReadIndexRequest}, }; @@ -29,19 +31,74 @@ pub unsafe fn vec_transmute(from: Vec) -> Vec { } pub trait HeapSize { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { 0 } + + fn approximate_mem_size(&self) -> usize + where + Self: Sized, + { + mem::size_of::() + self.approximate_heap_size() + } +} + +macro_rules! impl_zero_heap_size{ + ( $($typ: ty,)+ ) => { + $( + impl HeapSize for $typ { + fn approximate_heap_size(&self) -> usize { 0 } + } + )+ + } +} +impl_zero_heap_size! { + bool, u8, u64, +} +// Do not impl HeapSize for [T], because type coercions make it error-prone. 
+// E.g., Vec[u8] may be casted to &[u8] which does not own any byte in heap. +impl HeapSize for Vec { + fn approximate_heap_size(&self) -> usize { + let cap_bytes = self.capacity() * std::mem::size_of::(); + if self.is_empty() { + cap_bytes + } else { + // Prefer an approximation of its actually heap size, because we + // want the time complexity to be O(1). + self.len() * self[0].approximate_heap_size() + cap_bytes + } + } +} +impl HeapSize for (A, B) { + fn approximate_heap_size(&self) -> usize { + self.0.approximate_heap_size() + self.1.approximate_heap_size() + } +} +impl HeapSize for Option { + fn approximate_heap_size(&self) -> usize { + match self { + Some(t) => t.approximate_heap_size(), + None => 0, + } + } } -impl HeapSize for [u8] { - fn heap_size(&self) -> usize { - self.len() * mem::size_of::() +impl HeapSize for HashMap { + fn approximate_heap_size(&self) -> usize { + let cap_bytes = self.capacity() * (mem::size_of::() + mem::size_of::()); + if self.is_empty() { + cap_bytes + } else { + let kv = self.iter().next().unwrap(); + // Prefer an approximation of its actually heap size, because we + // want the time complexity to be O(1). 
+ cap_bytes + self.len() * (kv.0.approximate_heap_size() + kv.1.approximate_heap_size()) + } } } impl HeapSize for Region { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { let mut size = self.start_key.capacity() + self.end_key.capacity(); size += mem::size_of::(); size += self.peers.capacity() * mem::size_of::(); @@ -53,7 +110,7 @@ impl HeapSize for Region { } impl HeapSize for ReadIndexRequest { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { self.key_ranges .iter() .map(|r| r.start_key.capacity() + r.end_key.capacity()) @@ -62,7 +119,7 @@ impl HeapSize for ReadIndexRequest { } impl HeapSize for LockInfo { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { self.primary_lock.capacity() + self.key.capacity() + self.secondaries.iter().map(|k| k.len()).sum::() @@ -70,7 +127,7 @@ impl HeapSize for LockInfo { } impl HeapSize for RaftCmdRequest { - fn heap_size(&self) -> usize { + fn approximate_heap_size(&self) -> usize { mem::size_of::() + self.requests.capacity() * mem::size_of::() + mem::size_of_val(&self.admin_request) @@ -78,6 +135,27 @@ impl HeapSize for RaftCmdRequest { } } +impl HeapSize for coppb::KeyRange { + fn approximate_heap_size(&self) -> usize { + self.start.capacity() + self.end.capacity() + } +} + +impl HeapSize for kvrpcpb::Context { + fn approximate_heap_size(&self) -> usize { + self.resolved_locks.capacity() * mem::size_of::() + + self.committed_locks.capacity() * mem::size_of::() + + self.resource_group_tag.capacity() + + self.request_source.as_bytes().len() + + self + .get_resource_control_context() + .resource_group_name + .as_bytes() + .len() + + self.get_source_stmt().session_alias.as_bytes().len() + } +} + #[derive(Debug)] pub struct MemoryQuotaExceeded; diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 0547de621ea..25684f2638d 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ 
-25969,6 +25969,117 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total bytes of memory used by coprocessor requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 71 + }, + "hiddenSeries": false, + "hideTimeOverride": false, + "id": 24763573286, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.27", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum((\n tikv_coprocessor_memory_quota\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum((\n tikv_coprocessor_memory_quota\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=~\"$tidb_cluster.*\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory Quota", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } } ], "repeat": null, diff --git a/src/config/mod.rs b/src/config/mod.rs index 786c73cf6cc..cf57b3960fb 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -4701,7 +4701,10 @@ impl ConfigController { #[cfg(test)] mod tests { - use std::{sync::Arc, time::Duration}; + use std::{ + sync::{mpsc::channel, Arc}, + time::Duration, + }; use api_version::{ApiV1, KvFormat}; use case_macros::*; @@ -5249,22 +5252,27 @@ mod tests { assert_eq!(flow_controller.enabled(), true); } - #[test] - fn test_change_resolved_ts_config() { - use crossbeam::channel; + struct MockCfgManager(Box); - pub struct TestConfigManager(channel::Sender); - impl ConfigManager for TestConfigManager { - fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { - self.0.send(change).unwrap(); - Ok(()) - } + impl ConfigManager for MockCfgManager { + fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { + (self.0)(change); + Ok(()) } + } + #[test] + fn test_change_resolved_ts_config() { let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); - let (tx, rx) = channel::unbounded(); - cfg_controller.register(Module::ResolvedTs, Box::new(TestConfigManager(tx))); + let (tx, rx) = channel(); + let tx = std::sync::Mutex::new(tx); + cfg_controller.register( + Module::ResolvedTs, + Box::new(MockCfgManager(Box::new(move |c| { + tx.lock().unwrap().send(c).unwrap(); + }))), + ); // Return error if try to update not support config or unknow config cfg_controller @@ -5732,12 +5740,14 @@ mod tests { let cfg_controller = ConfigController::new(cfg.clone()); let (scheduler, _receiver) = dummy_scheduler(); let 
version_tracker = Arc::new(VersionTrack::new(cfg.server.clone())); + let cop_manager = MockCfgManager(Box::new(|_| {})); cfg_controller.register( Module::Server, Box::new(ServerConfigManager::new( scheduler, version_tracker.clone(), ResourceQuota::new(None), + Box::new(cop_manager), )), ); @@ -5789,6 +5799,40 @@ mod tests { ); } + #[test] + fn test_change_coprocessor_endpoint_config() { + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); + cfg.validate().unwrap(); + let cfg_controller = ConfigController::new(cfg.clone()); + let (scheduler, _receiver) = dummy_scheduler(); + let version_tracker = Arc::new(VersionTrack::new(cfg.server.clone())); + + let (cop_tx, cop_rx) = channel(); + let cop_tx = std::sync::Mutex::new(cop_tx); + let cop_manager = MockCfgManager(Box::new(move |c| { + cop_tx.lock().unwrap().send(c).unwrap(); + })); + cfg_controller.register( + Module::Server, + Box::new(ServerConfigManager::new( + scheduler, + version_tracker, + ResourceQuota::new(None), + Box::new(cop_manager), + )), + ); + + cfg_controller + .update_config("server.end-point-memory-quota", "32MB") + .unwrap(); + let mut change = cop_rx.try_recv().unwrap(); + let quota = change.remove("end_point_memory_quota").unwrap(); + let cap: ReadableSize = quota.into(); + assert_eq!(cap, ReadableSize::mb(32)); + cfg.server.end_point_memory_quota = ReadableSize::mb(32); + assert_eq_debug(&cfg_controller.get_current(), &cfg); + } + #[test] fn test_compatible_adjust_validate_equal() { // After calling many time of `compatible_adjust` and `validate` should has diff --git a/src/coprocessor/config_manager.rs b/src/coprocessor/config_manager.rs new file mode 100644 index 00000000000..4afd030a692 --- /dev/null +++ b/src/coprocessor/config_manager.rs @@ -0,0 +1,30 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +//! Coprocessor online config manager. 
+ +use std::sync::Arc; + +use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; +use tikv_util::{config::ReadableSize, memory::MemoryQuota}; + +pub(super) struct CopConfigManager { + memory_quota: Arc, +} + +impl CopConfigManager { + pub fn new(memory_quota: Arc) -> Self { + Self { memory_quota } + } +} + +impl ConfigManager for CopConfigManager { + fn dispatch(&mut self, mut change: ConfigChange) -> CfgResult<()> { + if let Some(quota) = change.remove("end_point_memory_quota") { + if quota != ConfigValue::None { + let cap: ReadableSize = quota.into(); + self.memory_quota.set_capacity(cap.0 as _); + } + } + Ok(()) + } +} diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 001d1e94ca0..f9a1e28dd19 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -1,7 +1,8 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - borrow::Cow, future::Future, iter::FromIterator, marker::PhantomData, sync::Arc, time::Duration, + borrow::Cow, future::Future, iter::FromIterator, marker::PhantomData, mem, sync::Arc, + time::Duration, }; use ::tracker::{ @@ -11,21 +12,30 @@ use api_version::{dispatch_api_version, KvFormat}; use async_stream::try_stream; use concurrency_manager::ConcurrencyManager; use engine_traits::PerfLevel; -use futures::{channel::mpsc, future::Either, prelude::*}; -use kvproto::{coprocessor as coppb, errorpb, kvrpcpb}; +use futures::{ + channel::{mpsc, oneshot}, + future::Either, + prelude::*, +}; +use kvproto::{coprocessor as coppb, errorpb, kvrpcpb, kvrpcpb::CommandPri}; +use online_config::ConfigManager; use protobuf::{CodedInputStream, Message}; -use resource_control::{ResourceGroupManager, TaskMetadata}; +use resource_control::{ResourceGroupManager, ResourceLimiter, TaskMetadata}; use resource_metering::{FutureExt, ResourceTagFactory, StreamExt}; use tidb_query_common::execute_stats::ExecSummary; use tikv_alloc::trace::MemoryTraceGuard; use 
tikv_kv::SnapshotExt; use tikv_util::{ - deadline::set_deadline_exceeded_busy_error, quota_limiter::QuotaLimiter, time::Instant, + deadline::set_deadline_exceeded_busy_error, + memory::{MemoryQuota, OwnedAllocated}, + quota_limiter::QuotaLimiter, + time::Instant, }; use tipb::{AnalyzeReq, AnalyzeType, ChecksumRequest, ChecksumScanOn, DagRequest, ExecType}; use tokio::sync::Semaphore; use txn_types::Lock; +use super::config_manager::CopConfigManager; use crate::{ coprocessor::{cache::CachedRequestHandler, interceptors::*, metrics::*, tracker::Tracker, *}, read_pool::ReadPoolHandle, @@ -51,6 +61,8 @@ pub struct Endpoint { /// The concurrency limiter of the coprocessor. semaphore: Option>, + /// The memory quota for coprocessor requests. + memory_quota: Arc, concurrency_manager: ConcurrencyManager, @@ -88,18 +100,18 @@ impl Endpoint { quota_limiter: Arc, resource_ctl: Option>, ) -> Self { - // FIXME: When yatp is used, we need to limit coprocessor requests in progress - // to avoid using too much memory. However, if there are a number of large - // requests, small requests will still be blocked. This needs to be improved. let semaphore = match &read_pool { ReadPoolHandle::Yatp { .. 
} => { Some(Arc::new(Semaphore::new(cfg.end_point_max_concurrency))) } _ => None, }; + let memory_quota = Arc::new(MemoryQuota::new(cfg.end_point_memory_quota.0 as _)); + register_coprocessor_memory_quota_metrics(memory_quota.clone()); Self { read_pool, semaphore, + memory_quota, concurrency_manager, perf_level: cfg.end_point_perf_level, resource_tag_factory, @@ -115,6 +127,10 @@ impl Endpoint { } } + pub fn config_manager(&self) -> Box { + Box::new(CopConfigManager::new(self.memory_quota.clone())) + } + fn check_memory_locks(&self, req_ctx: &ReqContext) -> Result<()> { let start_ts = req_ctx.txn_start_ts; if !req_ctx.context.get_stale_read() { @@ -497,7 +513,7 @@ impl Endpoint { ) -> impl Future>> { let priority = req_ctx.context.get_priority(); let task_id = req_ctx.build_task_id(); - let key_ranges = req_ctx + let key_ranges: Vec<_> = req_ctx .ranges .iter() .map(|key_range| (key_range.get_start().to_vec(), key_range.get_end().to_vec())) @@ -505,6 +521,8 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let mut allocated_bytes = resource_tag.approximate_heap_size(); + let metadata = TaskMetadata::from_ctx(req_ctx.context.get_resource_control_context()); let resource_limiter = self.resource_ctl.as_ref().and_then(|r| { r.get_resource_limiter( @@ -521,19 +539,27 @@ impl Endpoint { }); // box the tracker so that moving it is cheap. let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); - - let res = self - .read_pool - .spawn_handle( - Self::handle_unary_request_impl(self.semaphore.clone(), tracker, handler_builder) - .in_resource_metering_tag(resource_tag), - priority, - task_id, - metadata, - resource_limiter, - ) - .map_err(|_| Error::MaxPendingTasksExceeded); - async move { res.await? 
} + allocated_bytes += tracker.approximate_mem_size(); + + let (tx, rx) = oneshot::channel(); + let future = + Self::handle_unary_request_impl(self.semaphore.clone(), tracker, handler_builder) + .in_resource_metering_tag(resource_tag) + .map(|res| { + let _ = tx.send(res); + }); + let res = self.read_pool_spawn_with_memory_quota_check( + allocated_bytes, + future, + priority, + task_id, + metadata, + resource_limiter, + ); + async move { + res?; + rx.map_err(|_| Error::MaxPendingTasksExceeded).await? + } } /// Parses and handles a unary request. Returns a future that will never @@ -777,24 +803,29 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let mut allocated_bytes = resource_tag.approximate_heap_size(); + let task_id = req_ctx.build_task_id(); let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); + allocated_bytes += tracker.approximate_mem_size(); + + let future = + Self::handle_stream_request_impl(self.semaphore.clone(), tracker, handler_builder) + .in_resource_metering_tag(resource_tag) + .then(futures::future::ok::<_, mpsc::SendError>) + .forward(tx) + .unwrap_or_else(|e| { + warn!("coprocessor stream send error"; "error" => %e); + }); - self.read_pool - .spawn( - Self::handle_stream_request_impl(self.semaphore.clone(), tracker, handler_builder) - .in_resource_metering_tag(resource_tag) - .then(futures::future::ok::<_, mpsc::SendError>) - .forward(tx) - .unwrap_or_else(|e| { - warn!("coprocessor stream send error"; "error" => %e); - }), - priority, - task_id, - metadata, - resource_limiter, - ) - .map_err(|_| Error::MaxPendingTasksExceeded)?; + self.read_pool_spawn_with_memory_quota_check( + allocated_bytes, + future, + priority, + task_id, + metadata, + resource_limiter, + )?; Ok(rx) } @@ -819,6 +850,30 @@ impl Endpoint { .or_else(|e| futures::future::ok(make_error_response(e))) // Stream .map(|item: std::result::Result<_, ()>| item.unwrap()) } + + fn 
read_pool_spawn_with_memory_quota_check( + &self, + mut allocated_bytes: usize, + future: F, + priority: CommandPri, + task_id: u64, + metadata: TaskMetadata<'_>, + resource_limiter: Option>, + ) -> Result<()> + where + F: Future + Send + 'static, + { + allocated_bytes += mem::size_of_val(&future); + let mut owned_quota = OwnedAllocated::new(self.memory_quota.clone()); + owned_quota.alloc(allocated_bytes)?; + let fut = future.map(move |_| { + // Release quota after handle completed. + drop(owned_quota); + }); + self.read_pool + .spawn(fut, priority, task_id, metadata, resource_limiter) + .map_err(|_| Error::MaxPendingTasksExceeded) + } } macro_rules! make_error_response_common { @@ -848,6 +903,15 @@ macro_rules! make_error_response_common { errorpb.set_server_is_busy(server_is_busy_err); $resp.set_region_error(errorpb); } + Error::MemoryQuotaExceeded => { + $tag = "memory_quota_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason($e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message($e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + $resp.set_region_error(errorpb); + } Error::Other(_) => { $tag = "other"; warn!("unexpected other error encountered processing coprocessor task"; @@ -2032,4 +2096,60 @@ mod tests { "Coprocessor task terminated due to exceeding the deadline" ); } + + #[test] + fn test_memory_quota() { + let engine = TestEngineBuilder::new().build().unwrap(); + let read_pool = ReadPool::from(build_read_pool_for_test( + &CoprReadPoolConfig::default_for_test(), + engine, + )); + let cm = ConcurrencyManager::new(1.into()); + let copr = Endpoint::::new( + &Config::default(), + read_pool.handle(), + cm, + ResourceTagFactory::new_for_test(), + Arc::new(QuotaLimiter::default()), + None, + ); + + // By default, coprocessor does not return memory quota exceeded error. 
+ { + let handler_builder = Box::new(|_, _: &_| { + Ok(UnaryFixture::new(Ok(coppb::Response::default())).into_boxed()) + }); + + let mut config = ReqContext::default_for_test(); + config.deadline = Deadline::from_now(Duration::from_millis(500)); + + let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp); + } + + // Trigger memory quota exceeded error. + copr.memory_quota.set_capacity(1); + { + let handler_builder = Box::new(|_, _: &_| { + Ok(UnaryFixture::new(Ok(coppb::Response::default())).into_boxed()) + }); + + let mut config = ReqContext::default_for_test(); + config.deadline = Deadline::from_now(Duration::from_millis(500)); + + let res = block_on(copr.handle_unary_request(config, handler_builder)); + assert!(res.is_err(), "{:?}", res); + let resp = make_error_response(res.unwrap_err()); + assert_eq!(resp.get_data().len(), 0); + let region_err = resp.get_region_error(); + assert!( + region_err + .get_server_is_busy() + .reason + .contains("exceeding memory quota"), + "{:?}", + region_err.get_server_is_busy().reason + ); + } + } } diff --git a/src/coprocessor/error.rs b/src/coprocessor/error.rs index 25751553c28..5c3ce554cc2 100644 --- a/src/coprocessor/error.rs +++ b/src/coprocessor/error.rs @@ -2,6 +2,7 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use thiserror::Error; +use tikv_util::memory::MemoryQuotaExceeded; use crate::{ storage, @@ -26,6 +27,9 @@ pub enum Error { #[error("Coprocessor task canceled due to exceeding max pending tasks")] MaxPendingTasksExceeded, + #[error("Coprocessor task canceled due to exceeding memory quota")] + MemoryQuotaExceeded, + #[error("{0}")] Other(String), } @@ -117,6 +121,12 @@ impl From for Error { } } +impl From for Error { + fn from(_: MemoryQuotaExceeded) -> Self { + Error::MemoryQuotaExceeded + } +} + pub type Result = std::result::Result; impl ErrorCodeExt for Error { @@ -126,6 +136,7 @@ impl ErrorCodeExt for Error { Error::Locked(_) => 
error_code::coprocessor::LOCKED, Error::DeadlineExceeded => error_code::coprocessor::DEADLINE_EXCEEDED, Error::MaxPendingTasksExceeded => error_code::coprocessor::MAX_PENDING_TASKS_EXCEEDED, + Error::MemoryQuotaExceeded => error_code::coprocessor::MEMORY_QUOTA_EXCEEDED, Error::Other(_) => error_code::UNKNOWN, } } diff --git a/src/coprocessor/metrics.rs b/src/coprocessor/metrics.rs index 02f45d35311..d41f3666f2f 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -8,6 +8,7 @@ use pd_client::BucketMeta; use prometheus::*; use prometheus_static_metric::*; use raftstore::store::{util::build_key_range, ReadStats}; +use tikv_util::memory::MemoryQuota; use crate::{ server::metrics::{GcKeysCF, GcKeysDetail}, @@ -324,3 +325,37 @@ pub fn tls_collect_query( .add_query_num(region_id, peer, key_range, QueryKind::Coprocessor); }); } + +pub fn register_coprocessor_memory_quota_metrics(source: Arc) { + struct MemoryQuotaCollector { + gauges: IntGaugeVec, + source: Arc, + } + impl prometheus::core::Collector for MemoryQuotaCollector { + fn desc(&self) -> Vec<&prometheus::core::Desc> { + self.gauges.desc() + } + fn collect(&self) -> Vec { + self.gauges + .with_label_values(&["capacity"]) + .set(self.source.capacity() as _); + self.gauges + .with_label_values(&["in_use"]) + .set(self.source.in_use() as _); + self.gauges.collect() + } + } + let gauges = IntGaugeVec::new( + Opts::new( + "tikv_coprocessor_memory_quota", + "Statistics of in_use and capacity of coprocessor memory quota", + ), + &["type"], + ) + .unwrap(); + if let Err(e) = + prometheus::default_registry().register(Box::new(MemoryQuotaCollector { gauges, source })) + { + warn!("register memory quota metrics failed"; "error" => ?e); + } +} diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index fcd16f9b947..28f40d8b9e3 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -23,6 +23,7 @@ mod cache; mod checksum; +mod config_manager; pub mod dag; mod endpoint; mod error; @@ 
-43,7 +44,7 @@ use metrics::ReqTag; use rand::prelude::*; use tidb_query_common::execute_stats::ExecSummary; use tikv_alloc::{mem_trace, Id, MemoryTrace, MemoryTraceGuard}; -use tikv_util::{deadline::Deadline, time::Duration}; +use tikv_util::{deadline::Deadline, memory::HeapSize, time::Duration}; use txn_types::TsSet; pub use self::{ @@ -147,6 +148,16 @@ pub struct ReqContext { pub allowed_in_flashback: bool, } +impl HeapSize for ReqContext { + fn approximate_heap_size(&self) -> usize { + self.context.approximate_heap_size() + + self.ranges.approximate_heap_size() + + self.peer.as_ref().map_or(0, |p| p.as_bytes().len()) + + self.lower_bound.approximate_heap_size() + + self.upper_bound.approximate_heap_size() + } +} + impl ReqContext { pub fn new( tag: ReqTag, diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index f6502c2459e..94d6b56d2ef 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -7,7 +7,10 @@ use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; use tikv_kv::Engine; -use tikv_util::time::{self, Duration, Instant}; +use tikv_util::{ + memory::HeapSize, + time::{self, Duration, Instant}, +}; use txn_types::Key; use super::metrics::*; @@ -467,6 +470,16 @@ impl Drop for Tracker { } } +impl HeapSize for Tracker { + fn approximate_heap_size(&self) -> usize { + self.req_ctx.approximate_heap_size() + + self + .buckets + .as_ref() + .map_or(0, |b| b.approximate_heap_size()) + } +} + #[cfg(test)] mod tests { use std::{sync::Arc, time::Duration, vec}; diff --git a/src/server/config.rs b/src/server/config.rs index 4e66e5802c0..4b3d47f7338 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -40,6 +40,30 @@ const DEFAULT_ENDPOINT_REQUEST_MAX_HANDLE_SECS: u64 = 60; // Number of rows in each chunk for streaming coprocessor. 
const DEFAULT_ENDPOINT_STREAM_BATCH_ROW_LIMIT: usize = 128; +// By default, endpoint memory quota will be set to 12.5% of system memory. +// +// TPCC check test shows that: +// * The actual endpoint memory usage is about 3 times to memory quota. +// * Setting memory quota too low can lead to ServerIsBusy errors, which slow +// down performance. +// * With 1000 warehouses and 1000 threads, the peak memory usage of the TPCC +// check is 11.5 GiB, which is too large for common scenario 16GiB memory, +// because default block cache takes about 45% memory (7.2GiB). +// +// The 12.5% default quota is a balance between efficient memory usage and +// maintaining performance under load. +const DEFAULT_ENDPOINT_MEMORY_QUOTA_RATIO: f64 = 0.125; + +lazy_static! { + static ref DEFAULT_ENDPOINT_MEMORY_QUOTA: ReadableSize = { + let total_mem = SysQuota::memory_limit_in_bytes(); + let quota = (total_mem as f64) * DEFAULT_ENDPOINT_MEMORY_QUOTA_RATIO; + // In order to ensure that coprocessor can function properly under low + // memory conditions, we use 500MB as the minimum default value. + ReadableSize(cmp::max(ReadableSize::mb(500).0, quota as _)) + }; +} + // At least 4 long coprocessor requests are allowed to run concurrently. 
const MIN_ENDPOINT_MAX_CONCURRENCY: usize = 4; @@ -145,6 +169,7 @@ pub struct Config { #[serde(with = "perf_level_serde")] #[online_config(skip)] pub end_point_perf_level: PerfLevel, + pub end_point_memory_quota: ReadableSize, #[serde(alias = "snap-max-write-bytes-per-sec")] pub snap_io_max_bytes_per_sec: ReadableSize, pub snap_max_total_size: ReadableSize, @@ -250,6 +275,7 @@ impl Default for Config { end_point_request_max_handle_duration: None, end_point_max_concurrency: cmp::max(cpu_num as usize, MIN_ENDPOINT_MAX_CONCURRENCY), end_point_perf_level: PerfLevel::Uninitialized, + end_point_memory_quota: *DEFAULT_ENDPOINT_MEMORY_QUOTA, snap_io_max_bytes_per_sec: ReadableSize(DEFAULT_SNAP_MAX_BYTES_PER_SEC), snap_max_total_size: ReadableSize(0), stats_concurrency: 1, @@ -358,6 +384,11 @@ impl Config { )); } + if self.end_point_memory_quota == *DEFAULT_ENDPOINT_MEMORY_QUOTA { + info!("using default coprocessor quota"; + "quota" => ?*DEFAULT_ENDPOINT_MEMORY_QUOTA); + } + if self.max_grpc_send_msg_len <= 0 { return Err(box_err!( "server.max-grpc-send-msg-len must be bigger than 0." @@ -436,6 +467,7 @@ pub struct ServerConfigManager { tx: Scheduler, config: Arc>, grpc_mem_quota: ResourceQuota, + copr_config_manager: Box, } unsafe impl Send for ServerConfigManager {} @@ -446,32 +478,38 @@ impl ServerConfigManager { tx: Scheduler, config: Arc>, grpc_mem_quota: ResourceQuota, + copr_config_manager: Box, ) -> ServerConfigManager { ServerConfigManager { tx, config, grpc_mem_quota, + copr_config_manager, } } } impl ConfigManager for ServerConfigManager { fn dispatch(&mut self, c: ConfigChange) -> std::result::Result<(), Box> { - { - let change = c.clone(); - self.config.update(move |cfg| cfg.update(change))?; - if let Some(value) = c.get("grpc_memory_pool_quota") { - let mem_quota: ReadableSize = value.clone().into(); - // the resize is done inplace indeed, but grpc-rs's api need self, so we just - // clone it here, but this no extra side effect here. 
- self.grpc_mem_quota - .clone() - .resize_memory(mem_quota.0 as usize); - } - if let Err(e) = self.tx.schedule(SnapTask::RefreshConfigEvent) { - error!("server configuration manager schedule refresh snapshot work task failed"; "err"=> ?e); - } + let change = c.clone(); + self.config.update(move |cfg| cfg.update(change))?; + if let Some(value) = c.get("grpc_memory_pool_quota") { + let mem_quota: ReadableSize = value.clone().into(); + // the resize is done inplace indeed, but grpc-rs's api need self, so we just + // clone it here, but this no extra side effect here. + self.grpc_mem_quota + .clone() + .resize_memory(mem_quota.0 as usize); + } + if let Err(e) = self.tx.schedule(SnapTask::RefreshConfigEvent) { + error!("server configuration manager schedule refresh snapshot work task failed"; "err"=> ?e); } + + // Dispatch coprocessor config. + if let Err(e) = self.copr_config_manager.dispatch(c.clone()) { + error!("server configuration manager fails to update coprocessor config"; "err"=> ?e); + } + info!("server configuration changed"; "change" => ?c); Ok(()) } diff --git a/tests/Cargo.toml b/tests/Cargo.toml index dd851c95822..fcf228de924 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -39,6 +39,11 @@ name = "deadlock_detector" harness = false path = "benches/deadlock_detector/mod.rs" +[[bench]] +name = "memory" +harness = false +path = "benches/memory/mod.rs" + [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] diff --git a/tests/benches/memory/mod.rs b/tests/benches/memory/mod.rs new file mode 100644 index 00000000000..33b7b66f80b --- /dev/null +++ b/tests/benches/memory/mod.rs @@ -0,0 +1,122 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + thread, + time::Duration, +}; + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use tikv_util::memory::{MemoryQuota, OwnedAllocated}; + +fn bench_memory_quota_alloc(c: &mut Criterion) { + let mut group = c.benchmark_group("Alloc Only"); + + let bytes = 0b1010100; + let quota = Arc::new(MemoryQuota::new(bytes - 1)); + let max_quota = Arc::new(MemoryQuota::new(usize::MAX)); + + group.bench_function(BenchmarkId::new("Alloc", "ok"), |b| { + b.iter(|| { + let _ = black_box(max_quota.alloc(bytes)); + }) + }); + group.bench_function(BenchmarkId::new("Alloc", "fail"), |b| { + b.iter(|| { + let _ = black_box(quota.alloc(bytes)); + }) + }); + + group.finish(); +} + +fn bench_memory_quota_alloc_free(c: &mut Criterion) { + let mut group = c.benchmark_group("Alloc Free"); + + let bytes = 0b1010100; + let quota = Arc::new(MemoryQuota::new(10 * bytes)); + let quota_ = quota.clone(); + + group.bench_function(BenchmarkId::new("MemoryQuota", "alloc free"), |b| { + b.iter(|| { + let _ = black_box(quota.alloc(bytes)); + quota.free(bytes); + }) + }); + group.bench_function(BenchmarkId::new("OwnedAllocated", "alloc free"), |b| { + b.iter(|| { + let mut owned_quota = OwnedAllocated::new(quota_.clone()); + let _ = black_box(owned_quota.alloc(bytes)); + drop(owned_quota); + }) + }); + + group.finish(); +} + +fn bench_memory_quota_multi_threads(c: &mut Criterion) { + memory_quota_multi_threads(c, 32); + memory_quota_multi_threads(c, 64); +} + +fn memory_quota_multi_threads(c: &mut Criterion, total_threads: usize) { + let threads = total_threads - 1; + let mut group = c.benchmark_group(format!("{} Threads", total_threads)); + + let bytes = 0b1010100; + let quota = Arc::new(MemoryQuota::new(2 * threads * bytes)); + + // Alloc and free by multiple thread. 
+ let mut handles = Vec::with_capacity(threads); + let done = Arc::new(AtomicBool::default()); + // Alloc and free take about 20ns on Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz. + let duration = Duration::from_nanos(20); + let check_interval = Duration::from_millis(500); + let batch_work_count = check_interval.as_nanos() / duration.as_nanos(); + for _ in 0..threads { + let quota_ = quota.clone(); + let done_ = done.clone(); + handles.push(thread::spawn(move || { + loop { + if done_.load(Ordering::Relaxed) { + return; + } + for _ in 0..batch_work_count { + let _ = black_box(quota_.alloc(bytes)); + quota_.free(bytes); + } + } + })); + } + + let quota_ = quota.clone(); + group.bench_function(BenchmarkId::new("MemoryQuota", "alloc free"), |b| { + b.iter(|| { + let _ = black_box(quota.alloc(bytes)); + quota.free(bytes); + }) + }); + group.bench_function(BenchmarkId::new("OwnedAllocated", "alloc free"), |b| { + b.iter(|| { + let mut owned_quota = OwnedAllocated::new(quota_.clone()); + let _ = black_box(owned_quota.alloc(bytes)); + drop(owned_quota); + }) + }); + + done.store(true, Ordering::Relaxed); + let _ = handles.into_iter().map(|h| h.join().unwrap()); + group.finish(); +} + +criterion_group!( + benches, + bench_memory_quota_alloc, + bench_memory_quota_alloc_free, + bench_memory_quota_multi_threads, +); + +criterion_main!(benches); diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index bb91d0d62eb..3cb7f6e43a3 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -7,6 +7,7 @@ use std::{ use engine_rocks::RocksEngine; use grpcio::{EnvBuilder, ResourceQuota}; +use online_config::ConfigManager; use raft_log_engine::RaftLogEngine; use raftstore::store::{fsm::create_raft_batch_system, SnapManager}; use security::SecurityManager; @@ -24,6 +25,14 @@ use tikv_util::{ worker::{LazyWorker, Scheduler, Worker}, }; +struct MockCfgManager; + +impl ConfigManager for MockCfgManager { 
+ fn dispatch(&mut self, _: online_config::ConfigChange) -> online_config::Result<()> { + Ok(()) + } +} + fn start_server( cfg: TikvConfig, dir: &TempDir, @@ -57,6 +66,7 @@ fn start_server( snap_worker_scheduler, server_config.clone(), ResourceQuota::new(None), + Box::new(MockCfgManager), )), ); let snap_runner = SnapHandler::new( From 397222927b4844e2de13dca8e64be647562bfe5c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 17 Jul 2024 11:24:00 +0800 Subject: [PATCH 190/220] cdc: skip incremental scaned events after region fails (#17248) (#17254) close tikv/tikv#17233 cdc: skip incremental scaned events after region fails Signed-off-by: qupeng Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Co-authored-by: qupeng --- components/cdc/src/channel.rs | 119 ++++++++++++++---- components/cdc/src/delegate.rs | 27 ++-- components/cdc/src/endpoint.rs | 6 +- components/cdc/src/initializer.rs | 13 +- components/cdc/src/service.rs | 9 +- .../cdc/tests/failpoints/test_endpoint.rs | 63 +++++++++- 6 files changed, 197 insertions(+), 40 deletions(-) diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index af9caadd394..c49bec00547 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -1,6 +1,13 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fmt, sync::Arc, time::Duration}; +use std::{ + fmt, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; use futures::{ channel::mpsc::{ @@ -235,7 +242,8 @@ macro_rules! impl_from_future_send_error { impl_from_future_send_error! 
{ FuturesSendError, - TrySendError<(Instant, CdcEvent, usize)>, + TrySendError, + TrySendError, } impl From for SendError { @@ -244,22 +252,63 @@ impl From for SendError { } } +pub struct ObservedEvent { + pub created: Instant, + pub event: CdcEvent, + pub size: usize, +} + +pub struct ScanedEvent { + pub created: Instant, + pub event: CdcEvent, + pub size: usize, + // Incremental scan can be canceled by region errors. We must check it when draing + // an event instead of emit it to `Sink`. + pub truncated: Arc, +} + +impl ObservedEvent { + fn new(created: Instant, event: CdcEvent, size: usize) -> Self { + ObservedEvent { + created, + event, + size, + } + } +} + +impl ScanedEvent { + fn new(created: Instant, event: CdcEvent, size: usize, truncated: Arc) -> Self { + ScanedEvent { + created, + event, + size, + truncated, + } + } +} + #[derive(Clone)] pub struct Sink { - unbounded_sender: UnboundedSender<(Instant, CdcEvent, usize)>, - bounded_sender: Sender<(Instant, CdcEvent, usize)>, + unbounded_sender: UnboundedSender, + bounded_sender: Sender, memory_quota: Arc, } impl Sink { - pub fn unbounded_send(&self, event: CdcEvent, force: bool) -> Result<(), SendError> { + /// Only observed events can be sent by `unbounded_send`. + pub fn unbounded_send(&self, observed_event: CdcEvent, force: bool) -> Result<(), SendError> { // Try it's best to send error events. - let bytes = if !force { event.size() as usize } else { 0 }; + let bytes = if !force { + observed_event.size() as usize + } else { + 0 + }; if bytes != 0 { self.memory_quota.alloc(bytes)?; } - let now = Instant::now_coarse(); - match self.unbounded_sender.unbounded_send((now, event, bytes)) { + let ob_event = ObservedEvent::new(Instant::now_coarse(), observed_event, bytes); + match self.unbounded_sender.unbounded_send(ob_event) { Ok(_) => Ok(()), Err(e) => { // Free quota if send fails. 
@@ -269,19 +318,25 @@ impl Sink { } } - pub async fn send_all(&mut self, events: Vec) -> Result<(), SendError> { + /// Only scaned events can be sent by `send_all`. + pub async fn send_all( + &mut self, + scaned_events: Vec, + truncated: Arc, + ) -> Result<(), SendError> { // Allocate quota in advance. let mut total_bytes = 0; - for event in &events { + for event in &scaned_events { let bytes = event.size(); total_bytes += bytes; } self.memory_quota.alloc(total_bytes as _)?; let now = Instant::now_coarse(); - for event in events { + for event in scaned_events { let bytes = event.size() as usize; - if let Err(e) = self.bounded_sender.feed((now, event, bytes)).await { + let sc_event = ScanedEvent::new(now, event, bytes, truncated.clone()); + if let Err(e) = self.bounded_sender.feed(sc_event).await { // Free quota if send fails. self.memory_quota.free(total_bytes as _); return Err(SendError::from(e)); @@ -297,25 +352,31 @@ impl Sink { } pub struct Drain { - unbounded_receiver: UnboundedReceiver<(Instant, CdcEvent, usize)>, - bounded_receiver: Receiver<(Instant, CdcEvent, usize)>, + unbounded_receiver: UnboundedReceiver, + bounded_receiver: Receiver, memory_quota: Arc, } impl<'a> Drain { pub fn drain(&'a mut self) -> impl Stream + 'a { - stream::select(&mut self.bounded_receiver, &mut self.unbounded_receiver).map( - |(start, mut event, size)| { - CDC_EVENTS_PENDING_DURATION.observe(start.saturating_elapsed_secs() * 1000.0); - if let CdcEvent::Barrier(ref mut barrier) = event { - if let Some(barrier) = barrier.take() { - // Unset barrier when it is received. 
- barrier(()); - } + let observed = (&mut self.unbounded_receiver).map(|x| (x.created, x.event, x.size)); + let scaned = (&mut self.bounded_receiver).filter_map(|x| { + if x.truncated.load(Ordering::Acquire) { + return futures::future::ready(None); + } + futures::future::ready(Some((x.created, x.event, x.size))) + }); + + stream::select(scaned, observed).map(|(start, mut event, size)| { + CDC_EVENTS_PENDING_DURATION.observe(start.saturating_elapsed_secs() * 1000.0); + if let CdcEvent::Barrier(ref mut barrier) = event { + if let Some(barrier) = barrier.take() { + // Unset barrier when it is received. + barrier(()); } - (event, size) - }, - ) + } + (event, size) + }) } // Forwards contents to the sink, simulates StreamExt::forward. @@ -386,7 +447,11 @@ where #[cfg(test)] mod tests { - use std::{assert_matches::assert_matches, sync::mpsc, time::Duration}; + use std::{ + assert_matches::assert_matches, + sync::{mpsc, Arc}, + time::Duration, + }; use futures::executor::block_on; use kvproto::cdcpb::{ @@ -405,7 +470,7 @@ mod tests { if flag { tx.unbounded_send(event, force_send) } else { - block_on(tx.send_all(vec![event])) + block_on(tx.send_all(vec![event], Arc::new(Default::default()))) } }; (Box::new(send), rx) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 7afb36899b6..050e9419cb0 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -4,7 +4,7 @@ use std::{ mem, string::String, sync::{ - atomic::{AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicUsize, Ordering}, Arc, }, }; @@ -136,6 +136,10 @@ pub struct Downstream { kv_api: ChangeDataRequestKvApi, filter_loop: bool, pub(crate) observed_range: ObservedRange, + + // When meet region errors like split or merge, we can cancel incremental scan draining + // by `scan_truncated`. 
+ pub(crate) scan_truncated: Arc, } impl Downstream { @@ -163,10 +167,14 @@ impl Downstream { kv_api, filter_loop, observed_range, + + scan_truncated: Arc::new(AtomicBool::new(false)), } } - /// Sink events to the downstream. + // NOTE: it's not allowed to sink `EventError` directly by this function, + // because the sink can be also used by an incremental scan. We must ensure + // no more events can be pushed to the sink after an `EventError` is sent. pub fn sink_event(&self, mut event: Event, force: bool) -> Result<()> { event.set_request_id(self.req_id); if self.sink.is_none() { @@ -191,7 +199,14 @@ impl Downstream { } } + /// EventErrors must be sent by this function. And we must ensure no more + /// events or ResolvedTs will be sent to the downstream after + /// `sink_error_event` is called. pub fn sink_error_event(&self, region_id: u64, err_event: EventError) -> Result<()> { + info!("cdc downstream meets region error"; + "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => self.req_id); + + self.scan_truncated.store(true, Ordering::Release); let mut change_data_event = Event::default(); change_data_event.event = Some(Event_oneof_event::Error(err_event)); change_data_event.region_id = region_id; @@ -200,12 +215,6 @@ impl Downstream { self.sink_event(change_data_event, force_send) } - pub fn sink_region_not_found(&self, region_id: u64) -> Result<()> { - let mut err_event = EventError::default(); - err_event.mut_region_not_found().region_id = region_id; - self.sink_error_event(region_id, err_event) - } - pub fn set_sink(&mut self, sink: Sink) { self.sink = Some(sink); } @@ -1555,6 +1564,7 @@ mod tests { region_epoch: RegionEpoch::default(), sink: Some(sink), state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + scan_truncated: Arc::new(Default::default()), kv_api: ChangeDataRequestKvApi::TiDb, filter_loop: false, observed_range, @@ -1629,6 +1639,7 @@ mod tests { region_epoch: RegionEpoch::default(), sink: Some(sink), state: 
Arc::new(AtomicCell::new(DownstreamState::Normal)), + scan_truncated: Arc::new(Default::default()), kv_api: ChangeDataRequestKvApi::TiDb, filter_loop, observed_range, diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 0eeae5d8638..f09cc160d34 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -740,7 +740,9 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint reader.txn_extra_op.clone(), None => { error!("cdc register for a not found region"; "region_id" => region_id); - let _ = downstream.sink_region_not_found(region_id); + let mut err_event = EventError::default(); + err_event.mut_region_not_found().region_id = region_id; + let _ = downstream.sink_error_event(region_id, err_event); return; } }; @@ -813,6 +815,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { pub(crate) observe_id: ObserveId, pub(crate) downstream_id: DownstreamId, pub(crate) downstream_state: Arc>, + pub(crate) scan_truncated: Arc, pub(crate) conn_id: ConnId, pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, @@ -440,7 +444,11 @@ impl Initializer { events.push(CdcEvent::Barrier(Some(cb))); barrier = Some(fut); } - if let Err(e) = self.sink.send_all(events).await { + if let Err(e) = self + .sink + .send_all(events, self.scan_truncated.clone()) + .await + { error!("cdc send scan event failed"; "req_id" => ?self.request_id); return Err(Error::Sink(e)); } @@ -662,6 +670,7 @@ mod tests { observe_id: ObserveId::new(), downstream_id: DownstreamId::new(), downstream_state, + scan_truncated: Arc::new(Default::default()), conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index 7cbf268f2b7..e5c21d22217 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -575,7 +575,14 @@ mod tests { let send = || { let rts_ = rts.clone(); let mut sink_ = sink.clone(); - Box::pin(async 
move { sink_.send_all(vec![CdcEvent::ResolvedTs(rts_)]).await }) + Box::pin(async move { + sink_ + .send_all( + vec![CdcEvent::ResolvedTs(rts_)], + Arc::new(Default::default()), + ) + .await + }) }; let must_fill_window = || { let mut window_size = 0; diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 42977cc3856..2ca7c18a22e 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -15,7 +15,7 @@ use kvproto::{cdcpb::*, kvrpcpb::*, tikvpb_grpc::TikvClient}; use pd_client::PdClient; use test_raftstore::*; use tikv_util::{debug, worker::Scheduler, HandyRwLock}; -use txn_types::TimeStamp; +use txn_types::{Key, TimeStamp}; use crate::{new_event_feed, new_event_feed_v2, ClientReceiver, TestSuite, TestSuiteBuilder}; @@ -595,3 +595,64 @@ fn test_cdc_notify_pending_regions() { ); fail::remove("cdc_before_initialize"); } + +// The case check whether https://github.com/tikv/tikv/issues/17233 is fixed or not. 
+#[test] +fn test_delegate_fail_during_incremental_scan() { + let mut cluster = new_server_cluster(0, 1); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let cf_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + + let start_tso = cf_tso.next(); + let pk = format!("key_{:03}", 0).into_bytes(); + let mut mutations = Vec::with_capacity(10); + for i in 0..10 { + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = format!("key_{:03}", i).into_bytes(); + mutation.value = vec![b'x'; 16]; + mutations.push(mutation); + } + suite.must_kv_prewrite(rid, mutations, pk, start_tso); + + fail::cfg("before_schedule_incremental_scan", "1*pause").unwrap(); + + let (mut req_tx, recv, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + let mut req = suite.new_changedata_request(rid); + req.request_id = 100; + req.checkpoint_ts = cf_tso.into_inner(); + req.set_start_key(Key::from_raw(b"a").into_encoded()); + req.set_end_key(Key::from_raw(b"z").into_encoded()); + block_on(req_tx.send((req.clone(), WriteFlags::default()))).unwrap(); + std::thread::sleep(Duration::from_millis(500)); + + suite.cluster.must_split(®ion, b"f"); + + // After the incremental scan is canceled, we can get the epoch_not_match error. + // And after the error is retrieved, no more entries can be received. 
+ let mut get_epoch_not_match = false; + while !get_epoch_not_match { + for event in receive_event(false).events.to_vec() { + match event.event { + Some(Event_oneof_event::Error(err)) => { + assert!(err.has_epoch_not_match(), "{:?}", err); + get_epoch_not_match = true; + } + Some(Event_oneof_event::Entries(..)) => { + assert!(!get_epoch_not_match); + } + _ => unreachable!(), + } + } + } + + fail::remove("before_schedule_incremental_scan"); + + let mut recver = recv.replace(None).unwrap(); + recv_timeout(&mut recver, Duration::from_secs(1)).unwrap_err(); + recv.replace(Some(recver)); +} From 98673006969d86287b83eaca946b14368ade6ef5 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 18 Jul 2024 16:10:30 +0800 Subject: [PATCH 191/220] tests,storage: Fix flaky test_rawkv::test_leader_transfer (#16827) (#16846) close tikv/tikv#16789 Add RawKvMaxTimestampNotSynced error and set message to errorpb.Error.max_ts_not_synced to provide more information. Retry on max_ts_not_synced error for must_raw_put. 
Signed-off-by: ti-chi-bot Signed-off-by: Ping Yu Co-authored-by: Ping Yu Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/test_raftstore/src/util.rs | 45 ++++++++++++++++++++------- src/storage/errors.rs | 8 +++++ src/storage/mod.rs | 3 +- src/storage/txn/mod.rs | 9 ++++++ src/storage/txn/scheduler.rs | 3 +- 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index a34bd614995..21d90097b22 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -56,7 +56,12 @@ use tikv::{ }; pub use tikv_util::store::{find_peer, new_learner_peer, new_peer}; use tikv_util::{ - config::*, escape, mpsc::future, time::ThreadReadId, worker::LazyWorker, HandyRwLock, + config::*, + escape, + mpsc::future, + time::{Instant, ThreadReadId}, + worker::LazyWorker, + HandyRwLock, }; use txn_types::Key; @@ -1418,17 +1423,33 @@ pub fn must_raw_put(client: &TikvClient, ctx: Context, key: Vec, value: Vec< put_req.set_context(ctx); put_req.key = key; put_req.value = value; - let put_resp = client.raw_put(&put_req).unwrap(); - assert!( - !put_resp.has_region_error(), - "{:?}", - put_resp.get_region_error() - ); - assert!( - put_resp.get_error().is_empty(), - "{:?}", - put_resp.get_error() - ); + + let retryable = |err: &kvproto::errorpb::Error| -> bool { err.has_max_timestamp_not_synced() }; + let start = Instant::now_coarse(); + loop { + let put_resp = client.raw_put(&put_req).unwrap(); + if put_resp.has_region_error() { + let err = put_resp.get_region_error(); + if retryable(err) && start.saturating_elapsed() < Duration::from_secs(5) { + debug!("must_raw_put meet region error"; "err" => ?err); + sleep_ms(100); + continue; + } + panic!( + "must_raw_put meet region error: {:?}, ctx: {:?}, key: {}, value {}", + err, + put_req.get_context(), + tikv_util::escape(&put_req.key), + tikv_util::escape(&put_req.value), + ); + } + 
assert!( + put_resp.get_error().is_empty(), + "must_raw_put meet error: {:?}", + put_resp.get_error() + ); + return; + } } pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option> { diff --git a/src/storage/errors.rs b/src/storage/errors.rs index b603b904708..0cd14d7ff8a 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -286,6 +286,14 @@ pub fn extract_region_error_from_error(e: &Error) -> Option { err.set_max_timestamp_not_synced(Default::default()); Some(err) } + Error(box ErrorInner::Txn( + e @ TxnError(box TxnErrorInner::RawKvMaxTimestampNotSynced { .. }), + )) => { + let mut err = errorpb::Error::default(); + err.set_max_timestamp_not_synced(Default::default()); + err.set_message(format!("{}", e)); + Some(err) + } Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::FlashbackNotPrepared( region_id, )))) => { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index c89a767a80b..60202d5ff84 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2010,9 +2010,8 @@ impl Storage { SCHED_STAGE_COUNTER_VEC.get(tag).snapshot_ok.inc(); if !snapshot.ext().is_max_ts_synced() { return Err(Error::from(txn::Error::from( - TxnError::MaxTimestampNotSynced { + TxnError::RawKvMaxTimestampNotSynced { region_id: ctx.get_region_id(), - start_ts: TimeStamp::zero(), }, ))); } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 8c30ae0a068..ca11fd6db30 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -143,6 +143,9 @@ pub enum ErrorInner { )] MaxTimestampNotSynced { region_id: u64, start_ts: TimeStamp }, + #[error("RawKV write fails due to potentially stale max timestamp, region_id: {region_id}")] + RawKvMaxTimestampNotSynced { region_id: u64 }, + #[error("region {0} not prepared the flashback")] FlashbackNotPrepared(u64), } @@ -178,6 +181,9 @@ impl ErrorInner { region_id, start_ts, }), + ErrorInner::RawKvMaxTimestampNotSynced { region_id } => { + Some(ErrorInner::RawKvMaxTimestampNotSynced { region_id }) + } 
ErrorInner::FlashbackNotPrepared(region_id) => { Some(ErrorInner::FlashbackNotPrepared(region_id)) } @@ -231,6 +237,9 @@ impl ErrorCodeExt for Error { ErrorInner::MaxTimestampNotSynced { .. } => { error_code::storage::MAX_TIMESTAMP_NOT_SYNCED } + ErrorInner::RawKvMaxTimestampNotSynced { .. } => { + error_code::storage::MAX_TIMESTAMP_NOT_SYNCED + } ErrorInner::FlashbackNotPrepared(_) => error_code::storage::FLASHBACK_NOT_PREPARED, } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 6d087d894df..e5a1afbd918 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1926,9 +1926,8 @@ pub async fn get_raw_ext( match cmd { Command::RawCompareAndSwap(_) | Command::RawAtomicStore(_) => { if !max_ts_synced { - return Err(ErrorInner::MaxTimestampNotSynced { + return Err(ErrorInner::RawKvMaxTimestampNotSynced { region_id: cmd.ctx().get_region_id(), - start_ts: TimeStamp::zero(), } .into()); } From 65ef9fd4d3ccad80f45ee85ba9b90f1741c37e9b Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 18 Jul 2024 16:33:02 +0800 Subject: [PATCH 192/220] tests: Fix flaky test_raw_put_key_guard (#16826) (#16838) close tikv/tikv#16825 Fix flaky test_raw_put_key_guard. 
Signed-off-by: Ping Yu Co-authored-by: Ping Yu Co-authored-by: glorv --- tests/failpoints/cases/test_rawkv.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index a795422c120..1716068321e 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -289,7 +289,8 @@ fn test_raw_put_key_guard() { let region_id = region.get_id(); let client = suite.get_client(region_id); let ctx = suite.get_context(region_id); - let node_id = region.get_peers()[0].get_id(); + let leader = suite.cluster.leader_of_region(region_id).unwrap(); + let node_id = leader.get_id(); let leader_cm = suite.cluster.sim.rl().get_concurrency_manager(node_id); let ts_provider = suite.get_causal_ts_provider(node_id).unwrap(); let ts = block_on(ts_provider.async_get_ts()).unwrap(); @@ -304,9 +305,10 @@ fn test_raw_put_key_guard() { // Wait for global_min_lock_ts. sleep_ms(500); let start = Instant::now(); - while leader_cm.global_min_lock_ts().is_none() - && start.saturating_elapsed() < Duration::from_secs(5) - { + while leader_cm.global_min_lock_ts().is_none() { + if start.saturating_elapsed() > Duration::from_secs(5) { + panic!("wait for global_min_lock_ts timeout"); + } sleep_ms(200); } From 7b111327c4e6f5bb1ebcde84299343a4c13b4e09 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 19 Jul 2024 14:06:00 +0800 Subject: [PATCH 193/220] backup: continue to seek regions if one range has no located leader region (#17169) (#17196) close tikv/tikv#17168 backup: continue to seek regions if one range has no located leader region Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/backup/src/endpoint.rs | 106 ++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 20 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 71a5c9e215c..4ceb538bb56 100644 --- 
a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -780,9 +780,13 @@ impl Progress { /// Forward the progress by `ranges` BackupRanges /// /// The size of the returned BackupRanges should <= `ranges` - fn forward(&mut self, limit: usize, replica_read: bool) -> Vec { + /// + /// Notice: Returning an empty BackupRanges means that no leader region + /// corresponding to the current range is sought. The caller should + /// call `forward` again to seek regions for the next range. + fn forward(&mut self, limit: usize, replica_read: bool) -> Option> { if self.finished { - return Vec::new(); + return None; } let store_id = self.store_id; let (tx, rx) = mpsc::channel(); @@ -858,7 +862,7 @@ impl Progress { } else { self.try_next(); } - branges + Some(branges) } } @@ -964,11 +968,10 @@ impl Endpoint { // (See https://tokio.rs/tokio/tutorial/shared-state) // Use &mut and mark the type for making rust-analyzer happy. let progress: &mut Progress<_> = &mut prs.lock().unwrap(); - let batch = progress.forward(batch_size, request.replica_read); - if batch.is_empty() { - return; + match progress.forward(batch_size, request.replica_read) { + Some(batch) => (batch, progress.codec.is_raw_kv, progress.cf), + None => return, } - (batch, progress.codec.is_raw_kv, progress.cf) }; for brange in batch { @@ -1560,7 +1563,7 @@ pub mod tests { let mut ranges = Vec::with_capacity(expect.len()); while ranges.len() != expect.len() { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n, false); + let mut r = prs.forward(n, false).unwrap(); // The returned backup ranges should <= n assert!(r.len() <= n); @@ -1805,23 +1808,18 @@ pub mod tests { ); let mut ranges = Vec::with_capacity(expect.len()); - while ranges.len() != expect.len() { + loop { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n, false); + let mut r = match prs.forward(n, false) { + None => break, + Some(r) => r, + }; // The returned backup ranges should <= n assert!(r.len() <= n); - 
if r.is_empty() { - // if return a empty vec then the progress is finished - assert_eq!( - ranges.len(), - expect.len(), - "got {:?}, expect {:?}", - ranges, - expect - ); + if !r.is_empty() { + ranges.append(&mut r); } - ranges.append(&mut r); } for (a, b) in ranges.into_iter().zip(expect) { @@ -1962,6 +1960,74 @@ pub mod tests { } } + fn fake_empty_marker() -> Vec { + vec![super::BackupRange { + start_key: None, + end_key: None, + region: Region::new(), + peer: Peer::new(), + codec: KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + cf: "", + uses_replica_read: false, + }] + } + + #[test] + fn test_seek_ranges_2() { + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.set_regions(vec![ + (b"2".to_vec(), b"4".to_vec(), 1), + (b"6".to_vec(), b"8".to_vec(), 2), + ]); + let sub_ranges: Vec<(&[u8], &[u8])> = vec![(b"1", b"11"), (b"3", b"7"), (b"8", b"9")]; + let expect: Vec<(&[u8], &[u8])> = vec![(b"", b""), (b"3", b"4"), (b"6", b"7"), (b"", b"")]; + + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + ranges.push((start_key, end_key)); + } + let mut prs = Progress::new_with_ranges( + endpoint.store_id, + ranges, + endpoint.region_info, + KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + engine_traits::CF_DEFAULT, + ); + + let mut ranges = Vec::with_capacity(expect.len()); + loop { + let n = (rand::random::() % 2) + 1; + let mut r = match prs.forward(n, false) { + None => break, + Some(r) => r, + }; + // The returned backup ranges should <= n + assert!(r.len() <= n); + + if !r.is_empty() { + ranges.append(&mut r); + } else { + // append the empty marker + ranges.append(&mut fake_empty_marker()); + } + } + + assert!(ranges.len() == expect.len()); + for (a, b) in ranges.into_iter().zip(expect) { + assert_eq!( + 
a.start_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.0 + ); + assert_eq!( + a.end_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.1 + ); + } + } + #[test] fn test_handle_backup_task() { let limiter = Arc::new(IoRateLimiter::new_for_test()); From 740fdc649178f1e0c40d38769be236fc911f74b4 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Mon, 5 Aug 2024 14:09:41 +0800 Subject: [PATCH 194/220] raftstore: cherry-pick #16239 & #16494 & #16738 to v7.5. (#17319) ref tikv/tikv#15874 This pr is used to cp #16239 & #16494 & #16738 to v7.1, used to inspect the gap of each peer's `applied_log_index` and `commit_log_index` when restarting. Signed-off-by: lucasliang --- Cargo.toml | 69 ++++++-- .../raftstore-v2/src/operation/command/mod.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 2 +- components/raftstore/src/store/config.rs | 14 ++ components/raftstore/src/store/fsm/peer.rs | 155 +++++++++++++++++- components/raftstore/src/store/fsm/store.rs | 125 +++++++++++++- components/raftstore/src/store/metrics.rs | 30 ++++ components/raftstore/src/store/peer.rs | 48 +++++- metrics/grafana/tikv_details.json | 25 ++- tests/failpoints/cases/test_pending_peers.rs | 116 +++++++++++++ 11 files changed, 555 insertions(+), 33 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7fd93076d6f..9a2ae00f435 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,13 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = [ + "test-engine-kv-rocksdb", + "test-engine-raft-raft-engine", + "cloud-aws", + "cloud-gcp", + "cloud-azure", +] trace-tablet-lifetime = ["engine_rocks/trace-lifetime"] tcmalloc = ["tikv_alloc/tcmalloc"] jemalloc = ["tikv_alloc/jemalloc", "engine_rocks/jemalloc"] @@ -21,11 +27,24 @@ snmalloc = ["tikv_alloc/snmalloc"] portable = ["engine_rocks/portable"] sse = ["engine_rocks/sse"] mem-profiling 
= ["tikv_alloc/mem-profiling"] -failpoints = ["fail/failpoints", "raftstore/failpoints", "tikv_util/failpoints", "engine_rocks/failpoints", "raft_log_engine/failpoints"] +failpoints = [ + "fail/failpoints", + "raftstore/failpoints", + "tikv_util/failpoints", + "engine_rocks/failpoints", + "raft_log_engine/failpoints", +] cloud-aws = ["encryption_export/cloud-aws", "sst_importer/cloud-aws"] cloud-gcp = ["encryption_export/cloud-gcp", "sst_importer/cloud-gcp"] cloud-azure = ["encryption_export/cloud-azure", "sst_importer/cloud-azure"] -testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport"] +testexport = [ + "raftstore/testexport", + "api_version/testexport", + "causal_ts/testexport", + "engine_traits/testexport", + "engine_rocks/testexport", + "engine_panic/testexport", +] test-engine-kv-rocksdb = ["engine_test/test-engine-kv-rocksdb"] test-engine-raft-raft-engine = ["engine_test/test-engine-raft-raft-engine"] test-engines-rocksdb = ["engine_test/test-engines-rocksdb"] @@ -73,7 +92,10 @@ flate2 = { version = "1.0", default-features = false, features = ["zlib"] } futures = { version = "0.3", features = ["thread-pool", "compat"] } futures-executor = "0.3.1" futures-timer = "3.0" -futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } +futures-util = { version = "0.3.1", default-features = false, features = [ + "io", + "async-await", +] } fxhash = "0.2.1" getset = "0.1" grpcio = { workspace = true } @@ -90,7 +112,10 @@ kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" libloading = "0.7" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } match-template = "0.0.1" memory_trace_macros = { workspace = true } @@ -108,7 +133,10 @@ paste = "1.0" 
pd_client = { workspace = true } pin-project = "1.0" pnet_datalink = "0.23" -pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec"] } +pprof = { version = "0.11", default-features = false, features = [ + "flamegraph", + "protobuf-codec", +] } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } @@ -161,7 +189,7 @@ yatp = { workspace = true } [dev-dependencies] api_version = { workspace = true, features = ["testexport"] } -example_coprocessor_plugin = { workspace = true } # should be a binary dependency +example_coprocessor_plugin = { workspace = true } # should be a binary dependency hyper-openssl = "0.9" panic_hook = { workspace = true } raftstore = { workspace = true, features = ["testexport"] } @@ -194,7 +222,7 @@ fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } # Remove this when a new version is release. We need to solve rust-lang/cmake-rs#143. cmake = { git = "https://github.com/rust-lang/cmake-rs" } -sysinfo ={ git = "https://github.com/tikv/sysinfo", branch = "0.26-fix-cpu" } +sysinfo = { git = "https://github.com/tikv/sysinfo", branch = "0.26-fix-cpu" } [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } @@ -234,9 +262,9 @@ members = [ "components/encryption", "components/encryption/export", "components/engine_rocks_helper", -# Only enable tirocks in local development, otherwise it can slow down compilation. -# TODO: always enable tirocks and remove engine_rocks. -# "components/engine_tirocks", + # Only enable tirocks in local development, otherwise it can slow down compilation. + # TODO: always enable tirocks and remove engine_rocks. 
+ # "components/engine_tirocks", "components/error_code", "components/external_storage", "components/external_storage/export", @@ -361,15 +389,26 @@ tipb_helper = { path = "components/tipb_helper" } tracker = { path = "components/tracker" } txn_types = { path = "components/txn_types" } # External libs -raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } -grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } +raft = { version = "0.7.0", default-features = false, features = [ + "protobuf-codec", +] } +grpcio = { version = "0.10.4", default-features = false, features = [ + "openssl-vendored", + "protobuf-codec", + "nightly", +] } +grpcio-health = { version = "0.10.4", default-features = false, features = [ + "protobuf-codec", +] } tipb = { git = "https://github.com/pingcap/tipb.git" } kvproto = { git = "https://github.com/pingcap/kvproto.git", branch = "release-7.5" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } tokio-executor = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { version = "2.3", features = [ + "max_level_trace", + "release_max_level_debug", +] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } [profile.dev.package.grpcio-sys] diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 70cdbfda237..51384c6eaf2 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -467,7 +467,7 @@ impl Peer { 
apply_res.applied_index, progress_to_be_updated, ); - self.try_compelete_recovery(); + self.try_complete_recovery(); if !self.pause_for_replay() && self.storage_mut().apply_trace_mut().should_flush() { if let Some(scheduler) = self.apply_scheduler() { scheduler.send(ApplyTask::ManualFlush); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 39ce9707359..95eee272a80 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -819,7 +819,7 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.try_compelete_recovery(); + self.try_complete_recovery(); self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index be04f6d05a0..f4c3e1d3d6e 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -507,7 +507,7 @@ impl Peer { // we may have skipped scheduling raft tick when start due to noticable gap // between commit index and apply index. We should scheduling it when raft log // apply catches up. 
- pub fn try_compelete_recovery(&mut self) { + pub fn try_complete_recovery(&mut self) { if self.pause_for_replay() && self.storage().entry_storage().commit_index() <= self.storage().entry_storage().applied_index() diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 92704f69e84..7c13446c185 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -398,6 +398,13 @@ pub struct Config { #[online_config(hidden)] #[serde(alias = "enable-partitioned-raft-kv-compatible-learner")] pub enable_v2_compatible_learner: bool, + + /// The minimal count of region pending on applying raft logs. + /// Only when the count of regions which not pending on applying logs is + /// less than the threshold, can the raftstore supply service. + #[doc(hidden)] + #[online_config(hidden)] + pub min_pending_apply_region_count: u64, } impl Default for Config { @@ -529,6 +536,7 @@ impl Default for Config { check_request_snapshot_interval: ReadableDuration::minutes(1), enable_v2_compatible_learner: false, unsafe_disable_check_quorum: false, + min_pending_apply_region_count: 10, } } } @@ -927,6 +935,12 @@ impl Config { )); } + if self.min_pending_apply_region_count == 0 { + return Err(box_err!( + "min_pending_apply_region_count must be greater than 0" + )); + } + Ok(()) } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d3bc49a6169..07fdc6c0e2c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -105,8 +105,9 @@ use crate::{ ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, - ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, ReadTask, - SignificantMsg, SnapKey, StoreMsg, WriteCallback, + ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, + 
ReadIndexContext, ReadTask, SignificantMsg, SnapKey, StoreMsg, WriteCallback, + RAFT_INIT_LOG_INDEX, }, Error, Result, }; @@ -2202,6 +2203,17 @@ where self.fsm.peer.mut_store().flush_entry_cache_metrics(); return; } + + // Update the state whether the peer is pending on applying raft + // logs if necesssary. + self.on_check_peer_complete_apply_logs(); + + // If the peer is busy on apply and missing the last leader committed index, + // it should propose a read index to check whether its lag is behind the leader. + // It won't generate flooding fetching messages. This proposal will only be sent + // out before it gets response and updates the `last_leader_committed_index`. + self.try_to_fetch_committed_index(); + // When having pending snapshot, if election timeout is met, it can't pass // the pending conf change check because first index has been updated to // a value that is larger than last index. @@ -2618,6 +2630,22 @@ where return Ok(()); } + // If this peer is restarting, it may lose some logs, so it should update + // the `last_leader_committed_idx` with the commited index of the first + // `MsgAppend`` message or the committed index in `MsgReadIndexResp` it received + // from leader. + if self.fsm.peer.needs_update_last_leader_committed_idx() + && (MessageType::MsgAppend == msg_type || MessageType::MsgReadIndexResp == msg_type) + { + let committed_index = cmp::max( + msg.get_message().get_commit(), // from MsgAppend + msg.get_message().get_index(), // from MsgReadIndexResp + ); + self.fsm + .peer + .update_last_leader_committed_idx(committed_index); + } + if msg.has_extra_msg() { self.on_extra_message(msg); return Ok(()); @@ -2659,7 +2687,7 @@ where } else { // This can be a message that sent when it's still a follower. Nevertheleast, // it's meaningless to continue to handle the request as callbacks are cleared. 
- if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.fsm.peer.is_leader() && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.fsm.peer_id()) @@ -3831,6 +3859,12 @@ where "is_latest_initialized" => is_latest_initialized, ); + // Ensure this peer is removed in the pending apply list. + meta.busy_apply_peers.remove(&self.fsm.peer_id()); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + if meta.atomic_snap_regions.contains_key(&self.region_id()) { drop(meta); panic!( @@ -6582,6 +6616,121 @@ where fn register_report_region_buckets_tick(&mut self) { self.schedule_tick(PeerTick::ReportBuckets) } + + /// Check whether the peer should send a request to fetch the committed + /// index from the leader. + fn try_to_fetch_committed_index(&mut self) { + // Already completed, skip. + if !self.fsm.peer.needs_update_last_leader_committed_idx() || self.fsm.peer.is_leader() { + return; + } + // Construct a MsgReadIndex message and send it to the leader to + // fetch the latest committed index of this raft group. + let leader_id = self.fsm.peer.leader_id(); + if leader_id == raft::INVALID_ID { + // The leader is unknown, so we can't fetch the committed index. + return; + } + let rctx = ReadIndexContext { + id: uuid::Uuid::new_v4(), + request: None, + locked: None, + }; + self.fsm.peer.raft_group.read_index(rctx.to_bytes()); + debug!( + "try to fetch committed index from leader"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); + } + + /// Check whether the peer is pending on applying raft logs. + /// + /// If busy, the peer will be recorded, until the pending logs are + /// applied. And after it completes applying, it will be removed from + /// the recording list. + fn on_check_peer_complete_apply_logs(&mut self) { + // Already completed, skip. 
+ if self.fsm.peer.busy_on_apply.is_none() { + return; + } + + let peer_id = self.fsm.peer.peer_id(); + // No need to check the applying state if the peer is leader. + if self.fsm.peer.is_leader() { + self.fsm.peer.busy_on_apply = None; + // Clear it from recoding list and update the counter, to avoid + // missing it when the peer is changed to leader. + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.remove(&peer_id); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + return; + } + + let applied_idx = self.fsm.peer.get_store().applied_index(); + let mut last_idx = self.fsm.peer.get_store().last_index(); + // If the peer is newly added or created, no need to check the apply status. + if last_idx <= RAFT_INIT_LOG_INDEX { + self.fsm.peer.busy_on_apply = None; + // And it should be recorded in the `completed_apply_peers_count`. + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.remove(&peer_id); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + debug!( + "no need to check initialized peer"; + "last_commit_idx" => last_idx, + "last_applied_idx" => applied_idx, + "region_id" => self.fsm.region_id(), + "peer_id" => peer_id, + ); + return; + } + assert!(self.fsm.peer.busy_on_apply.is_some()); + + // This peer is restarted and the last leader commit index is not set, so + // it use `u64::MAX` as the last commit index to make it wait for the update + // of the `last_leader_committed_idx` until the `last_leader_committed_idx` has + // been updated. + last_idx = self.fsm.peer.last_leader_committed_idx.unwrap_or(u64::MAX); + + // If the peer has large unapplied logs, this peer should be recorded until + // the lag is less than the given threshold. 
+ if last_idx >= applied_idx + self.ctx.cfg.leader_transfer_max_log_lag { + if !self.fsm.peer.busy_on_apply.unwrap() { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.insert(peer_id); + } + self.fsm.peer.busy_on_apply = Some(true); + debug!( + "peer is busy on applying logs"; + "last_commit_idx" => last_idx, + "last_applied_idx" => applied_idx, + "region_id" => self.fsm.region_id(), + "peer_id" => peer_id, + ); + } else { + // Already finish apply, remove it from recording list. + { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.busy_apply_peers.remove(&peer_id); + if let Some(count) = meta.completed_apply_peers_count.as_mut() { + *count += 1; + } + } + debug!( + "peer completes applying logs"; + "last_commit_idx" => last_idx, + "last_applied_idx" => applied_idx, + "region_id" => self.fsm.region_id(), + "peer_id" => peer_id, + ); + self.fsm.peer.busy_on_apply = None; + } + } } impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index e15d7608ff2..3d94ac164b1 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -119,6 +119,15 @@ pub const PENDING_MSG_CAP: usize = 100; pub const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = Feature::require(6, 1, 0); // it only makes sense for large region +// When the store is started, it will take some time for applying pending +// snapshots and delayed raft logs. Before the store is ready, it will report +// `is_busy` to PD, so PD will not schedule operators to the store. +const STORE_CHECK_PENDING_APPLY_DURATION: Duration = Duration::from_secs(5 * 60); +// The minimal percent of region finishing applying pending logs. +// Only when the count of regions which finish applying logs exceed +// the threshold, can the raftstore supply service. 
+const STORE_CHECK_COMPLETE_APPLY_REGIONS_PERCENT: u64 = 99; + pub struct StoreInfo { pub kv_engine: EK, pub raft_engine: ER, @@ -175,6 +184,18 @@ pub struct StoreMeta { /// be safely removed from the store, such as applying snapshot or /// compacting raft logs. pub damaged_regions: HashSet, + /// Record peers are busy with applying logs + /// (applied_index <= last_idx - leader_transfer_max_log_lag). + /// `busy_apply_peers` and `completed_apply_peers_count` are used + /// to record the accurate count of busy apply peers and peers complete + /// applying logs + pub busy_apply_peers: HashSet, + /// Record the number of peers done for applying logs. + /// Without `completed_apply_peers_count`, it's hard to know whether all + /// peers are ready for applying logs. + /// If None, it means the store is start from empty, no need to check and + /// update it anymore. + pub completed_apply_peers_count: Option, } impl StoreRegionMeta for StoreMeta { @@ -226,6 +247,8 @@ impl StoreMeta { region_read_progress: RegionReadProgressRegistry::new(), damaged_ranges: HashMap::default(), damaged_regions: HashSet::default(), + busy_apply_peers: HashSet::default(), + completed_apply_peers_count: Some(0), } } @@ -2554,10 +2577,75 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } + fn check_store_is_busy_on_apply( + &self, + start_ts_sec: u32, + region_count: u64, + busy_apply_peers_count: u64, + completed_apply_peers_count: Option, + ) -> bool { + STORE_BUSY_ON_APPLY_REGIONS_GAUGE_VEC + .busy_apply_peers + .set(busy_apply_peers_count as i64); + STORE_BUSY_ON_APPLY_REGIONS_GAUGE_VEC + .completed_apply_peers + .set(completed_apply_peers_count.unwrap_or_default() as i64); + // No need to check busy status if there are no regions. 
+        if completed_apply_peers_count.is_none() || region_count == 0 {
+            return false;
+        }
+
+        let completed_apply_peers_count = completed_apply_peers_count.unwrap();
+        let during_starting_stage = {
+            (time::get_time().sec as u32).saturating_sub(start_ts_sec)
+                <= STORE_CHECK_PENDING_APPLY_DURATION.as_secs() as u32
+        };
+        // If the store is busy in handling applying logs when starting, it should not
+        // be treated as a normal store for balance. Only when the store is
+        // almost idle (no more pending regions on applying logs), it can be
+        // regarded as the candidate for balancing leaders.
+        if during_starting_stage {
+            let completed_target_count = (|| {
+                fail_point!("on_mock_store_completed_target_count", |_| 0);
+                std::cmp::max(
+                    1,
+                    STORE_CHECK_COMPLETE_APPLY_REGIONS_PERCENT * region_count / 100,
+                )
+            })();
+            // If the number of regions on completing applying logs does not occupy the
+            // majority of regions, the store is regarded as busy.
+            if completed_apply_peers_count < completed_target_count {
+                debug!("check store is busy on apply";
+                    "region_count" => region_count,
+                    "completed_apply_peers_count" => completed_apply_peers_count,
+                    "completed_target_count" => completed_target_count);
+                true
+            } else {
+                let pending_target_count = std::cmp::min(
+                    self.ctx.cfg.min_pending_apply_region_count,
+                    region_count.saturating_sub(completed_target_count),
+                );
+                debug!("check store is busy on apply, has pending peers";
+                    "region_count" => region_count,
+                    "completed_apply_peers_count" => completed_apply_peers_count,
+                    "completed_target_count" => completed_target_count,
+                    "pending_target_count" => pending_target_count,
+                    "busy_apply_peers_count" => busy_apply_peers_count);
+                pending_target_count > 0 && busy_apply_peers_count >= pending_target_count
+            }
+        } else {
+            // Already started for a fairly long time.
+ false + } + } + fn store_heartbeat_pd(&mut self, report: Option) { let mut stats = StoreStats::default(); stats.set_store_id(self.ctx.store_id()); + + let completed_apply_peers_count: Option; + let busy_apply_peers_count: u64; { let meta = self.ctx.store_meta.lock().unwrap(); stats.set_region_count(meta.regions.len() as u32); @@ -2566,6 +2654,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let damaged_regions_id = meta.get_all_damaged_region_ids().into_iter().collect(); stats.set_damaged_regions_id(damaged_regions_id); } + if !meta.damaged_regions.is_empty() { // Note: no need to filter overlapped regions, since the regions in // `damaged_ranges` are already non-overlapping. @@ -2573,6 +2662,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .mut_damaged_regions_id() .extend(meta.damaged_regions.iter()); } + completed_apply_peers_count = meta.completed_apply_peers_count; + busy_apply_peers_count = meta.busy_apply_peers.len() as u64; } let snap_stats = self.ctx.snap_mgr.stats(); @@ -2587,7 +2678,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .with_label_values(&["receiving"]) .set(snap_stats.receiving_count as i64); - stats.set_start_time(self.fsm.store.start_time.unwrap().sec as u32); + let start_time = self.fsm.store.start_time.unwrap().sec as u32; + stats.set_start_time(start_time); // report store write flow to pd stats.set_bytes_written( @@ -2605,13 +2697,32 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .swap(0, Ordering::Relaxed), ); - stats.set_is_busy( - self.ctx - .global_stat - .stat - .is_busy - .swap(false, Ordering::Relaxed), + let busy_on_apply = self.check_store_is_busy_on_apply( + start_time, + stats.get_region_count() as u64, + busy_apply_peers_count, + completed_apply_peers_count, ); + // If the store already pass the check, it should clear the + // `completed_apply_peers_count` to skip the check 
next time. + if !busy_on_apply { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.completed_apply_peers_count = None; + meta.busy_apply_peers.clear(); + } + let store_is_busy = self + .ctx + .global_stat + .stat + .is_busy + .swap(false, Ordering::Relaxed); + stats.set_is_busy(store_is_busy || busy_on_apply); + STORE_PROCESS_BUSY_GAUGE_VEC + .applystore_busy + .set(busy_on_apply as i64); + STORE_PROCESS_BUSY_GAUGE_VEC + .raftstore_busy + .set(store_is_busy as i64); let mut query_stats = QueryStats::default(); query_stats.set_put( diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 269c4aca23f..7973d9ad042 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -321,6 +321,20 @@ make_static_metric! { pub struct LoadBaseSplitEventCounterVec: IntCounter { "type" => LoadBaseSplitEventType, } + + pub struct StoreBusyOnApplyRegionsGaugeVec: IntGauge { + "type" => { + busy_apply_peers, + completed_apply_peers, + }, + } + + pub struct StoreBusyStateGaugeVec: IntGauge { + "type" => { + raftstore_busy, + applystore_busy, + }, + } } lazy_static! { @@ -943,4 +957,20 @@ lazy_static! 
{ "The events of the lease to denying new admin commands being proposed by snapshot br.", &["event"] ).unwrap(); + + pub static ref STORE_BUSY_ON_APPLY_REGIONS_GAUGE_VEC: StoreBusyOnApplyRegionsGaugeVec = + register_static_int_gauge_vec!( + StoreBusyOnApplyRegionsGaugeVec, + "tikv_raftstore_busy_on_apply_region_total", + "Total number of regions busy on apply or complete apply.", + &["type"] + ).unwrap(); + + pub static ref STORE_PROCESS_BUSY_GAUGE_VEC: StoreBusyStateGaugeVec = + register_static_int_gauge_vec!( + StoreBusyStateGaugeVec, + "tikv_raftstore_process_busy", + "Is raft process busy or not", + &["type"] + ).unwrap(); } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 020f3e00b9c..087b255ec7e 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -897,6 +897,16 @@ where pub snapshot_recovery_state: Option, last_record_safe_point: u64, + /// Used for checking whether the peer is busy on apply. + /// * `None` => the peer has no pending logs for apply or already finishes + /// applying. + /// * `Some(false)` => initial state, not be recorded. + /// * `Some(true)` => busy on apply, and already recorded. + pub busy_on_apply: Option, + /// The index of last commited idx in the leader. It's used to check whether + /// this peer has raft log gaps and whether should be marked busy on + /// apply. + pub last_leader_committed_idx: Option, } impl Peer @@ -1041,6 +1051,8 @@ where lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, snapshot_recovery_state: None, + busy_on_apply: Some(false), + last_leader_committed_idx: None, }; // If this region has only one peer and I am the one, campaign directly. 
@@ -2682,9 +2694,10 @@ where if let Some(hs) = ready.hs() { let pre_commit_index = self.get_store().commit_index(); - assert!(hs.get_commit() >= pre_commit_index); + let cur_commit_index = hs.get_commit(); + assert!(cur_commit_index >= pre_commit_index); if self.is_leader() { - self.on_leader_commit_idx_changed(pre_commit_index, hs.get_commit()); + self.on_leader_commit_idx_changed(pre_commit_index, cur_commit_index); } } @@ -5235,6 +5248,37 @@ where } } } + + pub fn update_last_leader_committed_idx(&mut self, committed_index: u64) { + if self.is_leader() { + // Ignore. + return; + } + + let local_committed_index = self.get_store().commit_index(); + if committed_index < local_committed_index { + warn!( + "stale committed index"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "last_committed_index" => committed_index, + "local_index" => local_committed_index, + ); + } else { + self.last_leader_committed_idx = Some(committed_index); + debug!( + "update last committed index from leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "last_committed_index" => committed_index, + "local_index" => local_committed_index, + ); + } + } + + pub fn needs_update_last_leader_committed_idx(&self) -> bool { + self.busy_on_apply.is_some() && self.last_leader_committed_idx.is_none() + } } #[derive(Default, Debug)] diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 25684f2638d..b729113a03a 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -1989,7 +1989,26 @@ "hide": false, "interval": "", "legendFormat": "store-write-channelfull-{{instance}}", - "refId": "E" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_msg_block_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + 
"datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_process_busy\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum((\n tikv_raftstore_process_busy\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], @@ -14520,7 +14539,7 @@ "defaults": {}, "overrides": [] }, - "gridPos": { + "gridPos": { "h": 7, "w": 12, "x": 0, @@ -48688,4 +48707,4 @@ "title": "Test-Cluster-TiKV-Details", "uid": "RDVQiEzZz", "version": 1 -} +} \ No newline at end of file diff --git a/tests/failpoints/cases/test_pending_peers.rs b/tests/failpoints/cases/test_pending_peers.rs index 76bf56ae698..b584f24c83c 100644 --- a/tests/failpoints/cases/test_pending_peers.rs +++ b/tests/failpoints/cases/test_pending_peers.rs @@ -2,6 +2,9 @@ use std::{sync::Arc, time::Duration}; +use crossbeam::channel; +use kvproto::raft_serverpb::RaftMessage; +use raft::eraftpb::MessageType; use test_raftstore::*; use tikv_util::{config::*, time::Instant}; @@ -110,6 +113,119 @@ fn test_pending_snapshot() { ); } +// Tests if store is marked with busy when there exists peers on +// busy on applying raft logs. 
+#[test] +fn test_on_check_busy_on_apply_peers() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(5); + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(100); + cluster.cfg.raft_store.leader_transfer_max_log_lag = 10; + cluster.cfg.raft_store.check_long_uncommitted_interval = ReadableDuration::millis(10); // short check interval for recovery + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(50); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let r1 = cluster.run_conf_change(); + pd_client.must_add_peer(r1, new_peer(2, 1002)); + pd_client.must_add_peer(r1, new_peer(3, 1003)); + + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + // Check the start status for peer 1003. + cluster.must_send_store_heartbeat(3); + sleep_ms(100); + let stats = cluster.pd_client.get_store_stats(3).unwrap(); + assert!(!stats.is_busy); + + // Pause peer 1003 on applying logs to make it pending. + let before_apply_stat = cluster.apply_state(r1, 3); + cluster.stop_node(3); + for i in 0..=cluster.cfg.raft_store.leader_transfer_max_log_lag { + let bytes = format!("k{:03}", i).into_bytes(); + cluster.must_put(&bytes, &bytes); + } + cluster.must_put(b"k2", b"v2"); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); + must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); + + // Restart peer 1003 and make it busy for applying pending logs. + fail::cfg("on_handle_apply_1003", "pause").unwrap(); + // Case 1: check the leader committed index comes from MsgAppend and + // MsgReadIndexResp is valid. 
+ let (read_tx, read_rx) = channel::unbounded::(); + let (append_tx, append_rx) = channel::unbounded::(); + cluster.add_send_filter_on_node( + 1, + Box::new( + RegionPacketFilter::new(r1, 1) + .direction(Direction::Send) + .msg_type(MessageType::MsgReadIndexResp) + .set_msg_callback(Arc::new(move |msg: &RaftMessage| { + read_tx.send(msg.clone()).unwrap(); + })), + ), + ); + cluster.add_send_filter_on_node( + 1, + Box::new( + RegionPacketFilter::new(r1, 1) + .direction(Direction::Send) + .msg_type(MessageType::MsgAppend) + .set_msg_callback(Arc::new(move |msg: &RaftMessage| { + append_tx.send(msg.clone()).unwrap(); + })), + ), + ); + let leader_apply_state = cluster.apply_state(r1, 1); + cluster.run_node(3).unwrap(); + let append_msg = append_rx.recv_timeout(Duration::from_secs(2)).unwrap(); + assert_eq!( + append_msg.get_message().get_commit(), + leader_apply_state.applied_index + ); + let read_msg = read_rx.recv_timeout(Duration::from_secs(2)).unwrap(); + assert_eq!( + read_msg.get_message().get_index(), + leader_apply_state.applied_index + ); + cluster.clear_send_filter_on_node(1); + + // Case 2: completed regions < target count. + let after_apply_stat = cluster.apply_state(r1, 3); + assert!(after_apply_stat.applied_index == before_apply_stat.applied_index); + sleep_ms(100); + cluster.must_send_store_heartbeat(3); + sleep_ms(100); + let stats = cluster.pd_client.get_store_stats(3).unwrap(); + assert!(stats.is_busy); + sleep_ms(100); + + // Case 3: completed_apply_peers_count > completed_target_count but + // there exists busy peers. + fail::cfg("on_mock_store_completed_target_count", "return").unwrap(); + cluster.must_send_store_heartbeat(3); + sleep_ms(100); + let stats = cluster.pd_client.get_store_stats(3).unwrap(); + assert!(stats.is_busy); + fail::remove("on_mock_store_completed_target_count"); + // After peer 1003 is recovered, store also should not be marked with busy. 
+    fail::remove("on_handle_apply_1003");
+    sleep_ms(100);
+    must_get_equal(&cluster.get_engine(3), b"k2", b"v2");
+    sleep_ms(100);
+    let after_apply_stat = cluster.apply_state(r1, 3);
+    assert!(after_apply_stat.applied_index > before_apply_stat.applied_index);
+    cluster.must_send_store_heartbeat(3);
+    sleep_ms(100);
+    let stats = cluster.pd_client.get_store_stats(3).unwrap();
+    assert!(!stats.is_busy);
+}
+
 #[test]
 fn test_on_apply_snap_failed() {
     let mut cluster = new_node_cluster(0, 3);

From 40291ea71bc08667d619ed818b78a24a5bdaf4e2 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot
Date: Fri, 23 Aug 2024 18:18:14 +0800
Subject: [PATCH 195/220] encryption: fix unexpected error code for wrong
 master-key (#17413) (#17417)

close tikv/tikv#17410

This PR fixes a bug encountered in the GCP KMS master-key rotation
scenario. When decryption fails using the current master-key, the
`KmsBackend` is expected to return a `WrongMasterKey` error rather than
any other error type. The higher-level caller will only retry with the
previous master-key if a `WrongMasterKey` error is received.
Signed-off-by: hhwyt Co-authored-by: hhwyt --- components/encryption/src/master_key/kms.rs | 85 ++++++++++++++++----- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index 643cb08a0c6..56cedc00243 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -141,7 +141,11 @@ impl KmsBackend { self.kms_provider.decrypt_data_key(&ciphertext_key), ) })) - .map_err(cloud_convert_error("decrypt encrypted key failed".into()))?; + .map_err(|e| { + Error::WrongMasterKey(box_err!(cloud_convert_error( + "decrypt encrypted key failed".into(), + )(e))) + })?; let data_key = DataKeyPair { encrypted: ciphertext_key, plaintext: PlainKey::new(plaintext, CryptographyType::AesGcm256) @@ -154,6 +158,12 @@ impl KmsBackend { } } } + + #[cfg(test)] + fn clear_state(&mut self) { + let mut opt_state = self.state.lock().unwrap(); + *opt_state = None; + } } impl Backend for KmsBackend { @@ -173,7 +183,10 @@ impl Backend for KmsBackend { #[cfg(test)] mod fake { use async_trait::async_trait; - use cloud::{error::Result, kms::KmsProvider}; + use cloud::{ + error::{Error as CloudError, KmsError, Result}, + kms::KmsProvider, + }; use super::*; @@ -183,12 +196,14 @@ mod fake { #[derive(Debug)] pub struct FakeKms { plaintext_key: PlainKey, + should_decrypt_data_key_fail: bool, } impl FakeKms { - pub fn new(plaintext_key: Vec) -> Self { + pub fn new(plaintext_key: Vec, should_decrypt_data_key_fail: bool) -> Self { Self { plaintext_key: PlainKey::new(plaintext_key, CryptographyType::AesGcm256).unwrap(), + should_decrypt_data_key_fail, } } } @@ -204,7 +219,13 @@ mod fake { } async fn decrypt_data_key(&self, _ciphertext: &EncryptedKey) -> Result> { - Ok(vec![1u8, 32]) + if self.should_decrypt_data_key_fail { + Err(CloudError::KmsError(KmsError::WrongMasterKey(box_err!( + "wrong master key" + )))) + } else { + Ok(vec![1u8, 32]) + } } fn name(&self) -> 
&str { @@ -241,21 +262,36 @@ mod tests { assert_eq!(state2.cached(&encrypted2), true); } + const PLAIN_TEXT_HEX: &str = "25431587e9ecffc7c37f8d6d52a9bc3310651d46fb0e3bad2726c8f2db653749"; + const CIPHER_TEXT_HEX: &str = + "84e5f23f95648fa247cb28eef53abec947dbf05ac953734618111583840bd980"; + const PLAINKEY_HEX: &str = "c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139"; + const IV_HEX: &str = "cafabd9672ca6c79a2fbdc22"; + + #[cfg(test)] + fn prepare_data_for_encrypt() -> (Iv, Vec, Vec, Vec) { + let iv = Vec::from_hex(IV_HEX).unwrap(); + let iv = Iv::from_slice(iv.as_slice()).unwrap(); + let pt = Vec::from_hex(PLAIN_TEXT_HEX).unwrap(); + let plainkey = Vec::from_hex(PLAINKEY_HEX).unwrap(); + let ct = Vec::from_hex(CIPHER_TEXT_HEX).unwrap(); + (iv, pt, plainkey, ct) + } + + #[cfg(test)] + fn prepare_kms_backend(plainkey: Vec, should_decrypt_data_key_fail: bool) -> KmsBackend { + KmsBackend::new(Box::new(FakeKms::new( + plainkey, + should_decrypt_data_key_fail, + ))) + .unwrap() + } + #[test] fn test_kms_backend() { - // See more http://csrc.nist.gov/groups/STM/cavp/documents/mac/gcmtestvectors.zip - let pt = Vec::from_hex("25431587e9ecffc7c37f8d6d52a9bc3310651d46fb0e3bad2726c8f2db653749") - .unwrap(); - let ct = Vec::from_hex("84e5f23f95648fa247cb28eef53abec947dbf05ac953734618111583840bd980") - .unwrap(); - let plainkey = - Vec::from_hex("c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139") - .unwrap(); - - let iv = Vec::from_hex("cafabd9672ca6c79a2fbdc22").unwrap(); - - let backend = KmsBackend::new(Box::new(FakeKms::new(plainkey))).unwrap(); - let iv = Iv::from_slice(iv.as_slice()).unwrap(); + let (iv, pt, plainkey, ct) = prepare_data_for_encrypt(); + let backend = prepare_kms_backend(plainkey, false); + let encrypted_content = backend.encrypt_content(&pt, iv).unwrap(); assert_eq!(encrypted_content.get_content(), ct.as_slice()); let plaintext = backend.decrypt_content(&encrypted_content).unwrap(); @@ -293,4 +329,19 @@ mod tests { 
Error::Other(_) ); } + + #[test] + fn test_kms_backend_wrong_key() { + let (iv, pt, plainkey, ..) = prepare_data_for_encrypt(); + let mut backend = prepare_kms_backend(plainkey, true); + + let encrypted_content = backend.encrypt_content(&pt, iv).unwrap(); + // Clear the cached state to ensure that the subsequent + // backend.decrypt_content() invocation bypasses the cache and triggers the + // mocked FakeKMS::decrypt_data_key() function. + backend.clear_state(); + + let err = backend.decrypt_content(&encrypted_content).unwrap_err(); + assert_matches!(err, Error::WrongMasterKey(_)); + } } From 9342e0c9260bdc6d9b88536f4525b492457cf39b Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 30 Aug 2024 07:58:22 +0800 Subject: [PATCH 196/220] compaction-filter: consider mvcc-delete as redundant key to trigger rocksdb compaction (#17431) (#17435) close tikv/tikv#17269 compaction-filter: consider mvcc.delete as redundant key to trigger Rocksdb compaction Signed-off-by: Shirly Co-authored-by: Shirly --- components/engine_rocks/src/properties.rs | 1 + components/engine_traits/src/misc.rs | 15 ++++++++++++++- components/raftstore/src/store/worker/compact.rs | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 03d6877a9dd..1739fb1036e 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -573,6 +573,7 @@ pub fn get_range_stats( num_entries, num_versions: props.num_versions, num_rows: props.num_rows, + num_deletes: props.num_deletes, }) } diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 228e2cd501e..b4cb4c9a233 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -57,12 +57,25 @@ pub trait StatisticsReporter { #[derive(Default)] pub struct RangeStats { - // The number of entries + // The number of entries in write cf. 
     pub num_entries: u64,
     // The number of MVCC versions of all rows (num_entries - tombstones).
     pub num_versions: u64,
     // The number of rows.
     pub num_rows: u64,
+    // The number of MVCC deletes of all rows.
+    pub num_deletes: u64,
+}
+
+impl RangeStats {
+    /// The number of redundant keys in the range.
+    /// It's calculated by `num_entries - num_rows + num_deletes`.
+    pub fn redundant_keys(&self) -> u64 {
+        // Consider the number of `mvcc_deletes` as the number of redundant keys.
+        self.num_entries
+            .saturating_sub(self.num_rows)
+            .saturating_add(self.num_deletes)
+    }
 }

 pub trait MiscExt: CfNamesExt + FlowControlFactorsExt + WriteBatchExt {
diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs
index 96199884db6..8e29b6ed5a5 100644
--- a/components/raftstore/src/store/worker/compact.rs
+++ b/components/raftstore/src/store/worker/compact.rs
@@ -209,7 +209,7 @@ pub fn need_compact(range_stats: &RangeStats, compact_threshold: &CompactThresho
     // We trigger region compaction when their are to many tombstones as well as
     // redundant keys, both of which can severly impact scan operation:
     let estimate_num_del = range_stats.num_entries - range_stats.num_versions;
-    let redundant_keys = range_stats.num_entries - range_stats.num_rows;
+    let redundant_keys = range_stats.redundant_keys();
     (redundant_keys >= compact_threshold.redundant_rows_threshold
         && redundant_keys * 100
             >= compact_threshold.redundant_rows_percent_threshold * range_stats.num_entries)

From 141029a737220430ae7bfbdb797b9c139422232f Mon Sep 17 00:00:00 2001
From: Ti Chi Robot
Date: Thu, 5 Sep 2024 18:26:25 +0800
Subject: [PATCH 197/220] test_util: renew tests certs (#17472) (#17474)

close tikv/tikv#17471

Add a script to renew certificates and fix the flaky test
`test_security_status_service_without_cn` .
Signed-off-by: Neil Shen Co-authored-by: Neil Shen --- components/test_util/data/.gitignore | 2 + components/test_util/data/ca.pem | 37 +++++++------- components/test_util/data/generate_certs.sh | 40 +++++++++++++++ components/test_util/data/key.pem | 55 +++++++++++---------- components/test_util/data/server.pem | 38 +++++++------- 5 files changed, 105 insertions(+), 67 deletions(-) create mode 100644 components/test_util/data/.gitignore create mode 100755 components/test_util/data/generate_certs.sh diff --git a/components/test_util/data/.gitignore b/components/test_util/data/.gitignore new file mode 100644 index 00000000000..0773e460402 --- /dev/null +++ b/components/test_util/data/.gitignore @@ -0,0 +1,2 @@ +ca.key +server.csr diff --git a/components/test_util/data/ca.pem b/components/test_util/data/ca.pem index e130a8eece9..05015192501 100644 --- a/components/test_util/data/ca.pem +++ b/components/test_util/data/ca.pem @@ -1,22 +1,19 @@ -----BEGIN CERTIFICATE----- -MIIDojCCAoqgAwIBAgIUdZFW8VQoZZzek8cA+5GGu6ZInjowDQYJKoZIhvcNAQEL -BQAwVzELMAkGA1UEBhMCQ04xEDAOBgNVBAgTB0JlaWppbmcxEDAOBgNVBAcTB0Jl -aWppbmcxEDAOBgNVBAoTB1BpbmdDQVAxEjAQBgNVBAMTCU15IG93biBDQTAeFw0x -OTA5MDIwNjEyMDBaFw0yNDA4MzEwNjEyMDBaMFcxCzAJBgNVBAYTAkNOMRAwDgYD -VQQIEwdCZWlqaW5nMRAwDgYDVQQHEwdCZWlqaW5nMRAwDgYDVQQKEwdQaW5nQ0FQ -MRIwEAYDVQQDEwlNeSBvd24gQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK -AoIBAQDcDtQ7UX+xlVY0vpklp1uUmPoFsN0U6fqRzHU+LvYS5AM5RPJMVLiKBiSi -zGsB+XPmXZ8H7rZZ+osZsEmDIF3HdyiSNpPNzRJKxsz4KVRzfoKZXL9D41TpuE27 -+7tN6qGytYrnAy8cHMA0S1TnQ0biOFTcXZrwh5lvlIcx7ceUamGuEl94tblxSSJl -2SkpHkKIDv0kcgoGmmh4y8SzAtmnwcCjkCSoITvvwKklp5830pFKOnpN9uZJzkXa -tuUSpSji/JG79nQfH91LtL7xMprORVtg9YAa3aJm0Uf33WFvaCTSrt//7CVK8nqK -xayS3u7dNH3GV9b81OGtlR76leFlAgMBAAGjZjBkMA4GA1UdDwEB/wQEAwIBBjAS -BgNVHRMBAf8ECDAGAQH/AgECMB0GA1UdDgQWBBS3hxTaN9B7eF8xr0DKLZ3b5vFn -rDAfBgNVHSMEGDAWgBS3hxTaN9B7eF8xr0DKLZ3b5vFnrDANBgkqhkiG9w0BAQsF -AAOCAQEAi9WiEvTQQjmb7ekXHf1tKwdLNu5akQXIwTKeZSWRSeMgqVQcoyTZMPBX 
-ythl6K3175RUIMtCwO4uZTOpRU1mTl0pIjoEcJGHYX91zyA5BjWahXZttvt7/hyX -UwJN9clBXLfZTCp1ysLCtarLcip4WxWNsxEwXFUisE2gbu3F9ELHAbRSVUe/CwC6 -8BkY+G+fovazjGoTV4NadJVFRzTR/zsWkBNllBOBTrop8FH23ePVh3hXafzJlcip -bDbRxNqSzNtLr88mwswklgiIHXF6PY2TkyscsXVkHPAswZnrv4lLov7M3VjL8ITA -uYm4Me5Tmj+6pb+Foky15+ehmicQbA== +MIIDITCCAgmgAwIBAgIUVe4Q3uw8yW0seqG9yQMfXrSXLHswDQYJKoZIhvcNAQEL +BQAwFzEVMBMGA1UEAwwMdGlrdl90ZXN0X2NhMCAXDTI0MDMyNzAwMDAwMFoYDzIx +MjQwMzI3MDAwMDAwWjAXMRUwEwYDVQQDDAx0aWt2X3Rlc3RfY2EwggEiMA0GCSqG +SIb3DQEBAQUAA4IBDwAwggEKAoIBAQC+cR9jZ0LtX4ztcupCEyrR8CNmw1TkIsOQ +rPhP43FkdggQN2vqkM9ZtKxlcODuNul748saEFoK1AGv4MgxgKcbt6sTucdz4oC4 +O1rM31eicU630PblPNU8Bstvlta2jCZAbERBhiAm1C3zQncodyVr9Oa2Ff9SRXcW +7icpv4CTsesPi19nF+EhBAuCifeI3Vj1Uvd5wvsK/m0D0gpp3Vp7CNYwHLv9gfPu +Jui0Q8NM5ENBcIfUBK8zOvr8a5glqV36KUA4m7yDXyYHIR2SrD/y3XDc6cbRgxKS +qbQMbc67H2XJHWjHgp7gv9rdU9HGxfv49j+TnxwYNPb3aflBgk1JAgMBAAGjYzBh +MB0GA1UdDgQWBBS+Qw4MGLTrjFTO2xlGwlj+yy1o7zAfBgNVHSMEGDAWgBS+Qw4M +GLTrjFTO2xlGwlj+yy1o7zASBgNVHRMBAf8ECDAGAQH/AgEAMAsGA1UdDwQEAwIB +BjANBgkqhkiG9w0BAQsFAAOCAQEAezNpYWbkxvIv/MfWxpbF+TBA1ssWT1xyuwUu +P9EdXj72XKnpkmZXxysyCSZR1ZH9XwuqHgQkegxQMKMeiv5UQLouTFEa5LUJxlQw +A3O1Ky1r6dv6p/JkOAbMxh+VoWAFCW5Ioo81rwZLGu3DS3+gsauDBpevIqLlL29H +FAQ+JO33pzuAP+PBHnO0Zi3ddvgDNULpHQhC8BUR9fI/NsxKuS4QwPGK1fnd/Qvg +w5aP0PZ8CNheIvVy4qkeUsh2kS7vghMpwa/KkCGurmwg2C6sgCTDKJVgkTOEutx4 +5LfVfuwAmWWnrqWwLtAyJI5SAUhXLtSu2DeyC5ppP5DFRBP3qw== -----END CERTIFICATE----- diff --git a/components/test_util/data/generate_certs.sh b/components/test_util/data/generate_certs.sh new file mode 100755 index 00000000000..08156aa5e68 --- /dev/null +++ b/components/test_util/data/generate_certs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
+ +set -euo pipefail + +SCRIPT_PATH="$(realpath "$0")" +CERT_DIR="$(dirname "$SCRIPT_PATH")" +CA_KEY="$CERT_DIR/ca.key" +CA_CERT="$CERT_DIR/ca.pem" +SERVER_KEY="$CERT_DIR/key.pem" +SERVER_CSR="$CERT_DIR/server.csr" +SERVER_CERT="$CERT_DIR/server.pem" +VALID_DAYS=3650 +RSA_KEY_SIZE=2048 + +# CA certs. +openssl genrsa -out "$CA_KEY" "$RSA_KEY_SIZE" +openssl req -new -x509 -days "$VALID_DAYS" -key "$CA_KEY" -out "$CA_CERT" \ + -subj "/CN=tikv_test_ca" \ + -addext "basicConstraints = critical,CA:TRUE,pathlen:0" \ + -addext "keyUsage = cRLSign, keyCertSign" +echo "CA certificate:" +openssl x509 -text -in "$CA_CERT" -noout + +# Server certs. +openssl genrsa -out "$SERVER_KEY" "$RSA_KEY_SIZE" +openssl req -new -key "$SERVER_KEY" -out "$SERVER_CSR" \ + -extensions v3_ca \ + -subj "/CN=tikv-server" \ + -addext "basicConstraints = critical, CA:FALSE" \ + -addext "keyUsage = critical, digitalSignature, keyEncipherment" \ + -addext "extendedKeyUsage = serverAuth, clientAuth" \ + -addext "subjectAltName = IP.1:172.16.5.40, IP.2:127.0.0.1" +openssl x509 -req -days "$VALID_DAYS" \ + -CA "$CA_CERT" -CAkey "$CA_KEY" -CAcreateserial \ + -copy_extensions copyall \ + -in "$SERVER_CSR" -out "$SERVER_CERT" +echo "Server certificate:" +openssl x509 -text -in "$SERVER_CERT" -noout diff --git a/components/test_util/data/key.pem b/components/test_util/data/key.pem index c7f9fa8c340..61ab0c3f029 100644 --- a/components/test_util/data/key.pem +++ b/components/test_util/data/key.pem @@ -1,27 +1,28 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEogIBAAKCAQEAsRpq/E/VC82YxsC5LlKFvI9HJuchMtKskn53anW4rNE3sfN0 -WDS6qCyxNumUVBqO98J18xxbz/XkV7aP6TcXZrNgEqw07PZWTDoyZVi+n9HXyWwl -BeiE2WWrCESqsar+cXV5UE3oE7Y4CT56tMN+awKqnf1zLyRl9DlqSg1/GabheVzz -fGhdqddqdpAZcaOHH8UMEWdnZ4qTFaaGNRlrRy3W0VjzgIocQorpvvtZkQM5iCxx -z9wuF9/6gGdopTA0J2SvZKa+oI/867NLpN5Hx+cn/ThHhCTh1N34Ulloa0aiou72 -mGgyMIdQxYAsRnG62EHn+9aPtegIjQd13Be9/wIDAQABAoIBAHJ8v3iIKxNMP10M -rSlS032HqdluRLnUExdIhe3eWBnvze9NkIKM47Vf3te+u9J6sL1dil40kO2o6YoC 
-TJnYsVoEzzCC/lvJCxSP8pAthF1QjAx7yps9KtRWsu/PZAEipwW1iUzub/5+J09i -gnRkhE6tFJq5g0KQZxAwJZPlkaqEcZIOObfh9zD9hutvCPmXBtB600EbQU4XzyjP -KaU08LtNZVm4mhKMuhXuFt8LBkjjfuw6zNcjsvgMkyflFTLc/SgWWIpq1ALHQCsq -OiFfTPyuLy+8tGTbawvRIqiHHRd23XttPcfkdfWbNVTSBfodTOhXGFaVYbJ6EVA4 -OzVzftECgYEAz/D99wpWbjU8vye5cjKjZCY/+QnY0t76YsUmfD9+fQNBDSQnKCKj -6nO6oYFQ9RI/vPMfrNX0sqo5hKfufNBCr/MILDXR6vtcEuaqd84DgaPVPRjHef6v -paYUi0Enb3gF3LXYggTN1mz9leEW8BablTN/DLP5AAvMfM/XSkVzlIsCgYEA2gjc -mcUDL1smAvriFVmpD4IrPzaZ9kINOfFNqkp/+y7S0BZGeS5ESSodrs0CIojttp3o -9GL7QLhZ9DehJWfh2qfA5mvzKGzUeM2oapR2Ts/m3voS4ErPTm+cTBOjRe3gGSSN -4sAJ5LA071RfNjEZBSktow//WX/oWrhIyovnxt0CgYBxyge/4xlO77URSdSySEGf -MUs6pYfQRRKxb/9SaJB4KoqzfUAsN2CJkNDlRlWd9mGIrWZ89wwTpREapabdCD4l -+JFVWBJKS0ikUzOfoc3LaHLtHx0xhgxqUkrVtU62MfDLSXt0Etrs5vGRzf32Xfi/ -mdGBiw7MVqiM+FNwojbQZwKBgDly5E1P78rmhVl7qV5exYDkl2iMhnywYrPFtOUN -xDL2320csWz0l+F/S1rngYx/78KSUPMzsWgYKvuCPN+SQ5xNXzJXdzZLlqBN7/ZF -L/cMKJTP53FZxM2x8sjI09h1GPsG+quoVfL/yrLU1FF/FkyZ0QCKEooOfbaJoARe -YK+xAoGAfT0P200WsLKRl73XYJZNYQl5+h5s7Sk9J8QuPwFWqm/mGwYKTLI042jg -lsAym4krAR0c1CHTW3aHRimYpYbi7/kztZU1zUQgcGL+79afer3ZuFF7mGzR+I/r -yOQ2dEfmVASfl/fMh1qyExpcCaMuejaODWyILlxOwvnywHWMSCU= ------END RSA PRIVATE KEY----- +-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCzMRjAbG2MVTMs +x7Rr+eeIi4jNyhvaQ8LcTB08BdgY8618blS037dB/85GxKTfZMdJZkCygbSsyiVJ +owqyucsT6IKYnZ/kDxfXXYEBIQcOYLnAEU1NNnQLPYSTth7xjnSKvg78gH+wY2LP +4z6QD64XboKa/fmVuByO/QlnZntkr4kiH6O5rJyt1Hm+OzVRYs5RumGU0Mpn3Bjd +xmGqB7+Wldbu2ECfOFFDaC/uizWVr1OD5lsdVau20ZEwZN37udRXsBc0qTp+CaA0 +sTFNCfqq9/Z3SVOGmWldvVOfF33+n9N+n1yyeZ8TlZ3JB2daKoDHDxLY0KKZtu6D +M9RlcMdJAgMBAAECggEAExOZ//pLG1KCtTS5h3k+ZfH86XOnzW/DUfKkdhxlKhrf +EE0qpWrftp4GFtCegM3YzX6eSqK2WmLW32UFIYvYC9GH/bliKURWFG526mjauHQh +sknlQeAXMI8nCxaraqnwzEh5jfYcIeoiveECaxHQRdxU+S0qof7Mw4g6tRBa+Jft +TUW1aa/m6p8s19xTKebtpCj+p1zwUdU8t4fw8eq5qFn4t1jlWaaCLgJv4OAE52IB +81LEJjG0vevVIvifsm+2t58IOqYJhEo0qMw2X0AMQDEOJZPeW1puFg4cdvh1bar6 
+RDxVhX0IkDKz7w62VoGb0ZptZkO3vrG8OXN2TI+L0QKBgQC30ASVENK2l7Jo+n5m +Qxf7s4ZoJ9jjWjO8N+Mkd5/Cu+xePlv5EmcBZCDONSmAdtaZXDs9DVO8E4yFXecJ +fidQnvRhhRxrG/LEEwHNzR8lMlm5tc4wx7g2y844Qjan71O8gawUd6eZyRmVDnmk +st6DLUwyWTkwaa7VkDaGFFqjVQKBgQD5kIS0fiq1hhOzrPt6U2FCHybrNnWCyFN5 +ISYJpl1vn7YqFV2wgXwn+A9bcDi7DMK8hx1b49Xx2lwo9n7Cb5Pd0ykhdjo12hUQ +WBqiFEjInsQ84RvivyTzlrvBduVMRtWA8lxp4gFjXFf/avHzoRkM21IfU46Q9QNn +Y8rKTFJ8JQKBgGRgv6/b0QYPj0lOyen0Z8t3PUXxxLpbTvdRGcSXYvJIB4tryHQa +/Y8/675QP8q8mvKC8IKlnX2Ou2U1Y27GqpeXRmNe+qbvS0KSEqEdjA2XEnKc+u2e +k1WxNHt6hThuNK8zrRI8SZVswYCpt/oeB+9gtESmftmWTPipWW0c/mZFAoGBAIbK +pLJr9ptmmFuSUZ1nTX+OHdIq7ts9YcBpxAawQYUZqSUftyOvPCTGWV0Uxr9YjBGR +lKzd6N8hBmflgt93rlDATVXSamxNptTWEUR7WjhpcCpFl28nuEiMoEpE8mH5XDWy +MXHK7N8CsFC3LYld+I62Iqvi0HzAqR79ijkrcd21AoGBAIc+y9eKyMxFfHkz/pZH +cupJiF21Y3+cw7h2TRNvqSLvHTp3enCDo7WG0R0K74BHIvwTDcZxsxThm6KwB+y9 +WIuKQC064e5ASjdF1dfwFFlNpwphL2kebWuzIkpEVtCcGJPUuJ704R7tD3y8q4BN +aSrpjjRGIVr6mLcxXGgHJa5R +-----END PRIVATE KEY----- diff --git a/components/test_util/data/server.pem b/components/test_util/data/server.pem index 09200bd82f6..42deadba2db 100644 --- a/components/test_util/data/server.pem +++ b/components/test_util/data/server.pem @@ -1,22 +1,20 @@ -----BEGIN CERTIFICATE----- -MIIDlTCCAn2gAwIBAgIUGKdjy/Uqp64ZiwqMwpTMGP5tKT0wDQYJKoZIhvcNAQEL -BQAwVzELMAkGA1UEBhMCQ04xEDAOBgNVBAgTB0JlaWppbmcxEDAOBgNVBAcTB0Jl -aWppbmcxEDAOBgNVBAoTB1BpbmdDQVAxEjAQBgNVBAMTCU15IG93biBDQTAgFw0x -OTA5MDIwNjEzMDBaGA8yMTE5MDgwOTA2MTMwMFowFjEUMBIGA1UEAxMLdGlrdi1z -ZXJ2ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCxGmr8T9ULzZjG -wLkuUoW8j0cm5yEy0qySfndqdbis0Tex83RYNLqoLLE26ZRUGo73wnXzHFvP9eRX -to/pNxdms2ASrDTs9lZMOjJlWL6f0dfJbCUF6ITZZasIRKqxqv5xdXlQTegTtjgJ -Pnq0w35rAqqd/XMvJGX0OWpKDX8ZpuF5XPN8aF2p12p2kBlxo4cfxQwRZ2dnipMV -poY1GWtHLdbRWPOAihxCium++1mRAzmILHHP3C4X3/qAZ2ilMDQnZK9kpr6gj/zr -s0uk3kfH5yf9OEeEJOHU3fhSWWhrRqKi7vaYaDIwh1DFgCxGcbrYQef71o+16AiN -B3XcF73/AgMBAAGjgZcwgZQwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQGCCsG 
-AQUFBwMBBggrBgEFBQcDAjAMBgNVHRMBAf8EAjAAMB0GA1UdDgQWBBTw7yUYqbAv -BJw3zZctLUfUi0vyqzAfBgNVHSMEGDAWgBS3hxTaN9B7eF8xr0DKLZ3b5vFnrDAV -BgNVHREEDjAMhwSsEAUohwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQCBljfge2fC -5X+tt1v7AkWoH5xpymEVvuIWWJmT/6FNTn1rdnIaxWCQzJbBCXjZS/75lKnwfrTB -ZK7iMv1GQaBevT/qm+7GcApsr5nFrI/MvzrvY+XRqvU8gsRhUjHYI+JPLGWxhzZD -pQdJTAGvsDLHu1VVdHR2KsE4M8ceGq58f7zPSq/suf+8SYEOFP8zfuXX1HfUrFVe -69ZQw8PZh4EYL0PYtE5BYfe9iJyFNNtZiejiribMQz/NtNkKM3M+Hm40ULGuwHXq -bKDjDq1PvmpVb/kKO/xADTIAbqproXETZ4W2keI3hwm6NxysvEbYV9+puQBXQqwT -KOt9Lo4ofSAF +MIIDVTCCAj2gAwIBAgIUUCvVn7LZjm7FD+xeAd5g1oKFSrIwDQYJKoZIhvcNAQEL +BQAwFzEVMBMGA1UEAwwMdGlrdl90ZXN0X2NhMCAXDTI0MDMyNzAwMDAwMFoYDzIx +MjQwMzI3MDAwMDAwWjAWMRQwEgYDVQQDDAt0aWt2LXNlcnZlcjCCASIwDQYJKoZI +hvcNAQEBBQADggEPADCCAQoCggEBALMxGMBsbYxVMyzHtGv554iLiM3KG9pDwtxM +HTwF2BjzrXxuVLTft0H/zkbEpN9kx0lmQLKBtKzKJUmjCrK5yxPogpidn+QPF9dd +gQEhBw5gucARTU02dAs9hJO2HvGOdIq+DvyAf7BjYs/jPpAPrhdugpr9+ZW4HI79 +CWdme2SviSIfo7msnK3Ueb47NVFizlG6YZTQymfcGN3GYaoHv5aV1u7YQJ84UUNo +L+6LNZWvU4PmWx1Vq7bRkTBk3fu51FewFzSpOn4JoDSxMU0J+qr39ndJU4aZaV29 +U58Xff6f036fXLJ5nxOVnckHZ1oqgMcPEtjQopm27oMz1GVwx0kCAwEAAaOBlzCB +lDAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEF +BQcDAQYIKwYBBQUHAwIwFQYDVR0RBA4wDIcErBAFKIcEfwAAATAdBgNVHQ4EFgQU +Z4ACSrFMAFHP3iQAlZihuxlTk64wHwYDVR0jBBgwFoAUvkMODBi064xUztsZRsJY +/sstaO8wDQYJKoZIhvcNAQELBQADggEBALxhZMiXDQvUJCtSGPaCJhvs51O7Sb+8 +xrByuQrtXhcNVsOcq+0OjT/roUzD0x5mf75cTcJm6XZuzg2BPgN7wQU5GPuhOcJv +XFx4uoRDNBzN5FlxZu+ln4Qqw/M/4zsRwD3qkp/J50RpbCOmf1x/b1M6+s1uQcT5 +6sMErUXnOzvY5ey4vCJFiveYu5Z7GIHPB8xlhJtiu3T8FN1o3Us75evFk7hHfJjf +zU1Efd6W9RU/bEPAPvqnLBkSHdx7Urw0hNHlW2IDjaX1zIV5Ibeiw61olyQAiXjy +N3VJrMbuSTRv5BZxp4sKwnan4dAtfXvSSle36pYhN5UTjD72NNlLe8A= -----END CERTIFICATE----- From 3da1ff4cd394c61287243ef24347a9f1117db551 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 6 Sep 2024 11:22:49 +0800 Subject: [PATCH 198/220] cdc: print log to indicate memory free, and adjust finish_scan_lock method (#17357) (#17481) close 
tikv/tikv#17368 * add one log to indicate the memory quota is freed when drop the `Drain` * free the truncated scanned event memory quota. * refactor `finish_scan_lock` method, to remove the else branch. * row size calculation should also consider old value * remove some outdate todo Signed-off-by: 3AceShowHand Co-authored-by: 3AceShowHand Co-authored-by: Ling Jin <7138436+3AceShowHand@users.noreply.github.com> --- components/cdc/src/channel.rs | 52 +++++++++++++++--- components/cdc/src/delegate.rs | 16 +++--- components/cdc/src/endpoint.rs | 88 ++++++++++++++++--------------- components/cdc/src/initializer.rs | 2 +- components/cdc/src/observer.rs | 2 - components/cdc/src/service.rs | 35 ++++++------ 6 files changed, 118 insertions(+), 77 deletions(-) diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index c49bec00547..ad9ebdd0d44 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -22,13 +22,13 @@ use kvproto::cdcpb::{ChangeDataEvent, Event, ResolvedTs}; use protobuf::Message; use tikv_util::{ future::block_on_timeout, - impl_display_as_debug, + impl_display_as_debug, info, memory::{MemoryQuota, MemoryQuotaExceeded}, time::Instant, warn, }; -use crate::metrics::*; +use crate::{metrics::*, service::ConnId}; /// The maximum bytes of events can be batched into one `CdcEvent::Event`, 32KB. 
pub const CDC_EVENT_MAX_BYTES: usize = 32 * 1024; @@ -194,7 +194,7 @@ impl EventBatcher { } } -pub fn channel(buffer: usize, memory_quota: Arc) -> (Sink, Drain) { +pub fn channel(conn_id: ConnId, buffer: usize, memory_quota: Arc) -> (Sink, Drain) { let (unbounded_sender, unbounded_receiver) = unbounded(); let (bounded_sender, bounded_receiver) = bounded(buffer); ( @@ -207,6 +207,7 @@ pub fn channel(buffer: usize, memory_quota: Arc) -> (Sink, Drain) { unbounded_receiver, bounded_receiver, memory_quota, + conn_id, }, ) } @@ -355,6 +356,7 @@ pub struct Drain { unbounded_receiver: UnboundedReceiver, bounded_receiver: Receiver, memory_quota: Arc, + conn_id: ConnId, } impl<'a> Drain { @@ -362,6 +364,7 @@ impl<'a> Drain { let observed = (&mut self.unbounded_receiver).map(|x| (x.created, x.event, x.size)); let scaned = (&mut self.bounded_receiver).filter_map(|x| { if x.truncated.load(Ordering::Acquire) { + self.memory_quota.free(x.size as _); return futures::future::ready(None); } futures::future::ready(Some((x.created, x.event, x.size))) @@ -420,14 +423,17 @@ impl Drop for Drain { self.bounded_receiver.close(); self.unbounded_receiver.close(); let start = Instant::now(); - let mut drain = Box::pin(async { + let mut total_bytes = 0; + let mut drain = Box::pin(async move { + let conn_id = self.conn_id; let memory_quota = self.memory_quota.clone(); - let mut total_bytes = 0; let mut drain = self.drain(); while let Some((_, bytes)) = drain.next().await { total_bytes += bytes; } memory_quota.free(total_bytes); + info!("drop Drain finished, free memory"; "conn_id" => ?conn_id, + "freed_bytes" => total_bytes, "inuse_bytes" => memory_quota.in_use()); }); block_on(&mut drain); let takes = start.saturating_elapsed(); @@ -463,7 +469,7 @@ mod tests { type Send = Box Result<(), SendError>>; fn new_test_channel(buffer: usize, capacity: usize, force_send: bool) -> (Send, Drain) { let memory_quota = Arc::new(MemoryQuota::new(capacity)); - let (mut tx, rx) = channel(buffer, memory_quota); 
+ let (mut tx, rx) = channel(ConnId::default(), buffer, memory_quota); let mut flag = true; let send = move |event| { flag = !flag; @@ -476,6 +482,38 @@ mod tests { (Box::new(send), rx) } + #[test] + fn test_scanned_event() { + let mut e = Event::default(); + e.region_id = 233; + { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let (mut tx, mut rx) = channel(ConnId::default(), 10, memory_quota); + + let truncated = Arc::new(AtomicBool::new(false)); + let event = CdcEvent::Event(e.clone()); + let size = event.size() as usize; + let _ = block_on(tx.send_all(vec![event], truncated)); + + let memory_quota = rx.memory_quota.clone(); + let mut drain = rx.drain(); + assert_matches!(block_on(drain.next()), Some((CdcEvent::Event(_), _))); + assert_eq!(memory_quota.in_use(), size); + } + { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let (mut tx, mut rx) = channel(ConnId::default(), 10, memory_quota); + + let truncated = Arc::new(AtomicBool::new(true)); + let _ = block_on(tx.send_all(vec![CdcEvent::Event(e)], truncated)); + + let memory_quota = rx.memory_quota.clone(); + let mut drain = rx.drain(); + recv_timeout(&mut drain, Duration::from_millis(100)).unwrap_err(); + assert_eq!(memory_quota.in_use(), 0); + } + } + #[test] fn test_barrier() { let force_send = false; @@ -611,7 +649,7 @@ mod tests { let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size(); let memory_quota = Arc::new(MemoryQuota::new(max_pending_bytes as _)); - let (tx, _rx) = channel(buffer as _, memory_quota); + let (tx, _rx) = channel(ConnId::default(), buffer as _, memory_quota); for _ in 0..buffer { tx.unbounded_send(CdcEvent::Event(e.clone()), false) .unwrap(); diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 050e9419cb0..60f3ccde938 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -605,7 +605,7 @@ impl Delegate { } decode_default(default.1, &mut row, &mut _has_value); row.old_value = 
old_value.finalized().unwrap_or_default(); - row_size = row.key.len() + row.value.len(); + row_size = row.key.len() + row.value.len() + row.old_value.len(); } Some(KvEntry::TxnEntry(TxnEntry::Commit { default, @@ -633,7 +633,7 @@ impl Delegate { } set_event_row_type(&mut row, EventLogType::Committed); row.old_value = old_value.finalized().unwrap_or_default(); - row_size = row.key.len() + row.value.len(); + row_size = row.key.len() + row.value.len() + row.old_value.len(); } None => { // This type means scan has finished. @@ -1253,7 +1253,7 @@ mod tests { let region_epoch = region.get_region_epoch().clone(); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (sink, mut drain) = crate::channel::channel(1, quota); + let (sink, mut drain) = channel(ConnId::default(), 1, quota); let rx = drain.drain(); let request_id = 123; let mut downstream = Downstream::new( @@ -1555,11 +1555,12 @@ mod tests { } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); + let conn_id = ConnId::default(); + let (sink, mut drain) = channel(conn_id, 1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, - conn_id: ConnId::new(), + conn_id, peer: String::new(), region_epoch: RegionEpoch::default(), sink: Some(sink), @@ -1630,11 +1631,12 @@ mod tests { } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); + let conn_id = ConnId::default(); + let (sink, mut drain) = channel(conn_id, 1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, - conn_id: ConnId::new(), + conn_id, peer: String::new(), region_epoch: RegionEpoch::default(), sink: Some(sink), diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index f09cc160d34..319153b8c3d 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -899,11 +899,22 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { 
+ debug!("cdc region not found on region ready (finish building resolver)"; + "region_id" => region.get_id()); + } + Some(delegate) => { + if delegate.handle.id != observe_id { + debug!("cdc stale region ready"; + "region_id" => region.get_id(), + "observe_id" => ?observe_id, + "current_id" => ?delegate.handle.id); + return; + } match delegate.on_region_ready(resolver, region) { Ok(fails) => { + let mut deregisters = Vec::new(); for (downstream, e) in fails { deregisters.push(Deregister::Downstream { conn_id: downstream.get_conn_id(), @@ -913,27 +924,18 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint deregisters.push(Deregister::Delegate { + Err(e) => self.on_deregister(Deregister::Delegate { region_id, observe_id, err: e, }), } - } else { - debug!("cdc stale region ready"; - "region_id" => region.get_id(), - "observe_id" => ?observe_id, - "current_id" => ?delegate.handle.id); } - } else { - debug!("cdc region not found on region ready (finish building resolver)"; - "region_id" => region.get_id()); - } - - // Deregister downstreams if there is any downstream fails to subscribe. - for deregister in deregisters { - self.on_deregister(deregister); } } @@ -1177,7 +1179,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint (), // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not // advance normally. 
- Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), + Err(err) => panic!("failed to register min ts event, error: {:?}", err), } } else { // During shutdown, tso runtime drops future immediately, @@ -1558,10 +1560,10 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task( @@ -1841,14 +1843,14 @@ mod tests { #[test] fn test_raftstore_is_busy() { let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, _rx) = channel::channel(1, quota); + let (tx, _rx) = channel::channel(ConnId::default(), 1, quota); let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); // Fill the channel. 
suite.add_region(1 /* region id */, 1 /* cap */); suite.fill_raft_rx(1); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task(conn_id, semver::Version::new(0, 0, 0))); @@ -1894,10 +1896,10 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -2075,11 +2077,11 @@ mod tests { suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); let mut region = Region::default(); region.set_id(1); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -2167,11 +2169,11 @@ mod tests { // Register region 3 to another conn which is not support batch resolved ts. 
let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx2) = channel::channel(1, quota); + let (tx, mut rx2) = channel::channel(ConnId::default(), 1, quota); let mut rx2 = rx2.drain(); let mut region = Region::default(); region.set_id(3); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task(conn_id, semver::Version::new(4, 0, 5))); @@ -2239,10 +2241,10 @@ mod tests { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_verion_task(conn_id, semver::Version::new(0, 0, 0))); @@ -2392,10 +2394,10 @@ mod tests { let mut conn_rxs = vec![]; let quota = Arc::new(MemoryQuota::new(usize::MAX)); for region_ids in vec![vec![1, 2], vec![3]] { - let (tx, rx) = channel::channel(1, quota.clone()); + let conn_id = ConnId::default(); + let (tx, rx) = channel::channel(conn_id, 1, quota.clone()); conn_rxs.push(rx); - let conn = Conn::new(tx, String::new()); - let conn_id = conn.get_id(); + let conn = Conn::new(conn_id, tx, String::new()); suite.run(Task::OpenConn { conn }); let version = FeatureGate::batch_resolved_ts(); suite.run(set_conn_verion_task(conn_id, version)); @@ -2507,8 +2509,8 @@ mod tests { let quota = Arc::new(MemoryQuota::new(usize::MAX)); // Open conn a - let (tx1, _rx1) = channel::channel(1, quota.clone()); - let conn_a = Conn::new(tx1, String::new()); + let (tx1, _rx1) = channel::channel(ConnId::default(), 1, quota.clone()); + let conn_a = Conn::new(ConnId::default(), tx1, 
String::new()); let conn_id_a = conn_a.get_id(); suite.run(Task::OpenConn { conn: conn_a }); suite.run(set_conn_verion_task( @@ -2517,9 +2519,9 @@ mod tests { )); // Open conn b - let (tx2, mut rx2) = channel::channel(1, quota); + let (tx2, mut rx2) = channel::channel(ConnId::default(), 1, quota); let mut rx2 = rx2.drain(); - let conn_b = Conn::new(tx2, String::new()); + let conn_b = Conn::new(ConnId::default(), tx2, String::new()); let conn_id_b = conn_b.get_id(); suite.run(Task::OpenConn { conn: conn_b }); suite.run(set_conn_verion_task( @@ -2671,10 +2673,10 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); // Enable batch resolved ts in the test. 
@@ -2764,10 +2766,10 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -3024,9 +3026,9 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, _rx) = channel::channel(1, quota); + let (tx, _rx) = channel::channel(ConnId::default(), 1, quota); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index d1930b1fc7b..f0b385c6009 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -646,7 +646,7 @@ mod tests { ) { let (receiver_worker, rx) = new_receiver_worker(); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (sink, drain) = crate::channel::channel(buffer, quota); + let (sink, drain) = crate::channel::channel(ConnId::default(), buffer, quota); let pool = Builder::new_multi_thread() .thread_name("test-initializer-worker") diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index cfcedfeb59d..a6586f60765 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -119,8 +119,6 @@ impl CmdObserver for CdcObserver { let mut region = Region::default(); region.mut_peers().push(Peer::default()); // Create a snapshot here for preventing the old value was GC-ed. - // TODO: only need it after enabling old value, may add a flag to indicate - // whether to get it. 
let snapshot = RegionSnapshot::from_snapshot(Arc::new(engine.snapshot()), Arc::new(region)); let get_old_value = move |key, query_ts, diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index e5c21d22217..8a2f644de3a 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -100,9 +100,9 @@ struct DownstreamValue { } impl Conn { - pub fn new(sink: Sink, peer: String) -> Conn { + pub fn new(conn_id: ConnId, sink: Sink, peer: String) -> Conn { Conn { - id: ConnId::new(), + id: conn_id, sink, downstreams: HashMap::default(), peer, @@ -334,18 +334,19 @@ impl Service { request: ChangeDataRequest, conn_id: ConnId, ) -> Result<(), String> { - let observed_range = - match ObservedRange::new(request.start_key.clone(), request.end_key.clone()) { - Ok(observed_range) => observed_range, - Err(e) => { - warn!( - "cdc invalid observed start key or end key version"; - "downstream" => ?peer, "region_id" => request.region_id, - "error" => ?e, - ); - ObservedRange::default() - } - }; + let observed_range = ObservedRange::new(request.start_key.clone(), request.end_key.clone()) + .unwrap_or_else(|e| { + warn!( + "cdc invalid observed start key or end key version"; + "downstream" => ?peer, + "region_id" => request.region_id, + "request_id" => request.region_id, + "error" => ?e, + "start_key" => log_wrappers::Value::key(&request.start_key), + "end_key" => log_wrappers::Value::key(&request.end_key), + ); + ObservedRange::default() + }); let downstream = Downstream::new( peer.to_owned(), request.get_region_epoch().clone(), @@ -405,10 +406,10 @@ impl Service { event_feed_v2: bool, ) { sink.enhance_batch(true); + let conn_id = ConnId::new(); let (event_sink, mut event_drain) = - channel(CDC_CHANNLE_CAPACITY, self.memory_quota.clone()); - let conn = Conn::new(event_sink, ctx.peer()); - let conn_id = conn.get_id(); + channel(conn_id, CDC_CHANNLE_CAPACITY, self.memory_quota.clone()); + let conn = Conn::new(conn_id, event_sink, ctx.peer()); let mut 
explicit_features = vec![]; if event_feed_v2 { From 18789f4dbabd36e7f2f58f66d4f5de28e04c27c3 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 10 Sep 2024 19:00:51 +0800 Subject: [PATCH 199/220] lock_manager: Skip updating lock wait info for non-fair-locking requests (#17500) (#17517) close tikv/tikv#17394 lock_manager: Skip updating lock wait info for non-fair-locking requests This is a simpler and lower-risky fix of the OOM issue #17394 for released branches, as an alternative solution to #17451 . In this way, for acquire_pessimistic_lock requests without enabling fair locking, the behavior of update_wait_for will be a noop. So that if fair locking is globally disabled, the behavior will be equivalent to versions before 7.0. Signed-off-by: MyonKeminta Co-authored-by: MyonKeminta --- src/server/lock_manager/waiter_manager.rs | 7 ++++++- src/storage/lock_manager/lock_waiting_queue.rs | 1 + src/storage/lock_manager/mod.rs | 1 + src/storage/txn/scheduler.rs | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index c0e97e25e3a..f1b3c9b3dbe 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -554,7 +554,9 @@ impl WaiterManager { continue; } - if let Some((previous_wait_info, diag_ctx)) = previous_wait_info { + if let Some((previous_wait_info, diag_ctx)) = previous_wait_info + && previous_wait_info.allow_lock_with_conflict + { self.detector_scheduler .clean_up_wait_for(event.start_ts, previous_wait_info); self.detector_scheduler @@ -678,6 +680,7 @@ pub mod tests { key: Key::from_raw(b""), lock_digest: LockDigest { ts: lock_ts, hash }, lock_info: Default::default(), + allow_lock_with_conflict: false, }, cancel_callback: Box::new(|_| ()), diag_ctx: DiagnosticContext::default(), @@ -798,6 +801,7 @@ pub mod tests { key: Key::from_raw(&raw_key), lock_digest: lock, lock_info: info.clone(), + allow_lock_with_conflict: 
false, }, cb, Instant::now() + Duration::from_millis(3000), @@ -1202,6 +1206,7 @@ pub mod tests { key: key.to_raw().unwrap(), ..Default::default() }, + allow_lock_with_conflict: false, }, }; scheduler.update_wait_for(vec![event]); diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index a81248fe9e2..dbe71f6267b 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -618,6 +618,7 @@ impl LockWaitQueues { hash: entry.lock_hash, }, lock_info: key_state.current_lock.clone(), + allow_lock_with_conflict: entry.parameters.allow_lock_with_conflict, }, }; update_wait_for_events.push(event); diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 5c103f40f82..4c2b4b0ccca 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -97,6 +97,7 @@ pub struct KeyLockWaitInfo { pub key: Key, pub lock_digest: LockDigest, pub lock_info: LockInfo, + pub allow_lock_with_conflict: bool, } /// Uniquely identifies a lock-waiting request in a `LockManager`. 
diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index e5a1afbd918..bb2c5194f75 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -940,6 +940,7 @@ impl TxnScheduler { let start_ts = lock_info.parameters.start_ts; let is_first_lock = lock_info.parameters.is_first_lock; let wait_timeout = lock_info.parameters.wait_timeout; + let allow_lock_with_conflict = lock_info.parameters.allow_lock_with_conflict; let diag_ctx = DiagnosticContext { key: lock_info.key.to_raw().unwrap(), @@ -967,6 +968,7 @@ impl TxnScheduler { key, lock_digest, lock_info: lock_info_pb, + allow_lock_with_conflict, }; self.inner.lock_mgr.wait_for( wait_token, From 271d446e345477db4478056f11ea29d22663f497 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Tue, 10 Sep 2024 19:18:07 +0800 Subject: [PATCH 200/220] *: fix compatibility to raft-engine.spill-dir (#17358) (#17416) close tikv/tikv#17356 Make the diskfull check mechanism compatible to the configuration `raft-engine.spill-dir`. 
Signed-off-by: lucasliang Co-authored-by: glorv Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 2 +- .../raftstore-v2/src/worker/pd/store.rs | 9 +- components/raftstore/src/store/worker/pd.rs | 9 +- components/server/Cargo.toml | 22 +- components/server/src/common.rs | 363 +++++++++++++++++- components/server/src/server.rs | 115 ++---- components/server/src/server2.rs | 115 ++---- components/tikv_util/Cargo.toml | 6 +- components/tikv_util/src/sys/disk.rs | 18 +- components/tikv_util/src/sys/mod.rs | 13 +- 10 files changed, 476 insertions(+), 196 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c53479e0017..a027321d2a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5403,7 +5403,6 @@ dependencies = [ "error_code", "fail", "file_system", - "fs2", "futures 0.3.15", "grpcio", "grpcio-health", @@ -6845,6 +6844,7 @@ dependencies = [ "derive_more", "error_code", "fail", + "fs2", "futures 0.3.15", "futures-util", "gag", diff --git a/components/raftstore-v2/src/worker/pd/store.rs b/components/raftstore-v2/src/worker/pd/store.rs index b3fd3245be6..75e20a06abd 100644 --- a/components/raftstore-v2/src/worker/pd/store.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -22,6 +22,7 @@ use slog::{error, info, warn}; use tikv_util::{ metrics::RecordPairVec, store::QueryStats, + sys::disk::get_disk_space_stats, time::{Duration, Instant as TiInstant, UnixSecs}, topn::TopN, }; @@ -441,7 +442,8 @@ where /// Returns (capacity, used, available). 
fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { - let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { + let (disk_cap, disk_avail) = match get_disk_space_stats(self.tablet_registry.tablet_root()) + { Err(e) => { error!( self.logger, @@ -451,9 +453,8 @@ where ); return None; } - Ok(stats) => stats, + Ok((total_size, available_size)) => (total_size, available_size), }; - let disk_cap = disk_stats.total_space(); let capacity = if self.cfg.value().capacity.0 == 0 { disk_cap } else { @@ -480,7 +481,7 @@ where let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. - available = cmp::min(available, disk_stats.available_space()); + available = cmp::min(available, disk_avail); Some((capacity, used_size, available)) } } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index e728ab12502..5f54eb8740c 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -41,7 +41,7 @@ use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, store::QueryStats, - sys::{thread::StdThreadBuildWrapper, SysQuota}, + sys::{disk::get_disk_space_stats, thread::StdThreadBuildWrapper, SysQuota}, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, @@ -2621,7 +2621,7 @@ fn collect_engine_size( return Some((engine_size.capacity, engine_size.used, engine_size.avail)); } let store_info = store_info.unwrap(); - let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { + let (disk_cap, disk_avail) = match get_disk_space_stats(store_info.kv_engine.path()) { Err(e) => { error!( "get disk stat for rocksdb failed"; @@ -2630,9 +2630,8 @@ fn collect_engine_size( ); return None; } - Ok(stats) => stats, + Ok((total_size, available_size)) => (total_size, available_size), }; - let disk_cap = disk_stats.total_space(); let 
capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { disk_cap } else { @@ -2656,7 +2655,7 @@ fn collect_engine_size( let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. - available = cmp::min(available, disk_stats.available_space()); + available = cmp::min(available, disk_avail); Some((capacity, used_size, available)) } diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 55da894c6e8..a1f08d92d4c 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -17,18 +17,10 @@ failpoints = ["tikv/failpoints"] cloud-aws = ["encryption_export/cloud-aws"] cloud-gcp = ["encryption_export/cloud-gcp"] cloud-azure = ["encryption_export/cloud-azure"] -test-engine-kv-rocksdb = [ - "tikv/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "tikv/test-engine-raft-raft-engine" -] -test-engines-rocksdb = [ - "tikv/test-engines-rocksdb", -] -test-engines-panic = [ - "tikv/test-engines-panic", -] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] @@ -51,7 +43,6 @@ engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } -fs2 = "0.4" futures = "0.3" grpcio = { workspace = true } grpcio-health = { workspace = true } @@ -59,7 +50,10 @@ hex = "0.4" keys = { workspace = true } kvproto = { workspace = true } libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } pd_client = { workspace = true } 
prometheus = { version = "0.13", features = ["nightly"] } diff --git a/components/server/src/common.rs b/components/server/src/common.rs index c8cf879d905..32dfb925366 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -214,8 +214,9 @@ impl TikvServerCore { } } - let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); - let mut capacity = disk_stats.total_space(); + let (disk_cap, disk_avail) = + disk::get_disk_space_stats(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_cap; if self.config.raft_store.capacity.0 > 0 { capacity = cmp::min(capacity, self.config.raft_store.capacity.0); } @@ -223,11 +224,7 @@ impl TikvServerCore { let kv_reserved_size = calculate_reserved_space(capacity, self.config.storage.reserve_space.0); disk::set_disk_reserved_space(kv_reserved_size); - reserve_physical_space( - &self.config.storage.data_dir, - disk_stats.available_space(), - kv_reserved_size, - ); + reserve_physical_space(&self.config.storage.data_dir, disk_avail, kv_reserved_size); let raft_data_dir = if self.config.raft_engine.enable { self.config.raft_engine.config().dir @@ -238,18 +235,13 @@ impl TikvServerCore { let separated_raft_mount_path = path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); if separated_raft_mount_path { - let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + let (raft_disk_cap, raft_disk_avail) = + disk::get_disk_space_stats(&raft_data_dir).unwrap(); // reserve space for raft engine if raft engine is deployed separately - let raft_reserved_size = calculate_reserved_space( - raft_disk_stats.total_space(), - self.config.storage.reserve_raft_space.0, - ); + let raft_reserved_size = + calculate_reserved_space(raft_disk_cap, self.config.storage.reserve_raft_space.0); disk::set_raft_disk_reserved_space(raft_reserved_size); - reserve_physical_space( - &raft_data_dir, - raft_disk_stats.available_space(), - raft_reserved_size, - ); + 
reserve_physical_space(&raft_data_dir, raft_disk_avail, raft_reserved_size); } } @@ -866,3 +858,340 @@ impl EngineMetricsManager { } } } + +fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } +} + +/// A checker to inspect the disk usage of kv engine and raft engine. +/// The caller should call `inspect` periodically to get the disk usage status +/// manually. +#[derive(Clone)] +pub struct DiskUsageChecker { + /// The path of kv engine. + kvdb_path: String, + /// The path of raft engine. + raft_path: String, + /// The path of auxiliary directory of raft engine if specified. + raft_auxiliary_path: Option, + /// Whether the main directory of raft engine is separated from kv engine. + separated_raft_mount_path: bool, + /// Whether the auxiliary directory of raft engine is separated from kv + /// engine. + separated_raft_auxiliary_mount_path: bool, + /// Whether the auxiliary directory of raft engine is both separated from + /// the main directory of raft engine and kv engine. + separated_raft_auxiliary_and_kvdb_mount_path: bool, + /// The threshold of disk usage of kv engine to trigger the almost full + /// status. + kvdb_almost_full_thd: u64, + /// The threshold of disk usage of raft engine to trigger the almost full + /// status. + raft_almost_full_thd: u64, + /// The specified disk capacity for the whole disk. 
+ config_disk_capacity: u64, +} + +impl DiskUsageChecker { + pub fn new( + kvdb_path: String, + raft_path: String, + raft_auxiliary_path: Option, + separated_raft_mount_path: bool, + separated_raft_auxiliary_mount_path: bool, + separated_raft_auxiliary_and_kvdb_mount_path: bool, + kvdb_almost_full_thd: u64, + raft_almost_full_thd: u64, + config_disk_capacity: u64, + ) -> Self { + DiskUsageChecker { + kvdb_path, + raft_path, + raft_auxiliary_path, + separated_raft_mount_path, + separated_raft_auxiliary_mount_path, + separated_raft_auxiliary_and_kvdb_mount_path, + kvdb_almost_full_thd, + raft_almost_full_thd, + config_disk_capacity, + } + } + + /// Inspect the disk usage of kv engine and raft engine. + /// The `kvdb_used_size` is the used size of kv engine, and the + /// `raft_used_size` is the used size of raft engine. + /// + /// Returns the disk usage status of the whole disk, kv engine and raft + /// engine, the whole disk capacity and available size. + pub fn inspect( + &self, + kvdb_used_size: u64, + raft_used_size: u64, + ) -> ( + disk::DiskUsage, // whole disk status + disk::DiskUsage, // kvdb disk status + disk::DiskUsage, // raft disk status + u64, // whole capacity + u64, // whole available + ) { + // By default, the almost full threshold of kv engine is half of the + // configured value. + let kvdb_already_full_thd = self.kvdb_almost_full_thd / 2; + let raft_already_full_thd = self.raft_almost_full_thd / 2; + // Check the disk space of raft engine. 
+ let raft_disk_status = { + if !self.separated_raft_mount_path || self.raft_almost_full_thd == 0 { + disk::DiskUsage::Normal + } else { + let (raft_disk_cap, raft_disk_avail) = match disk::get_disk_space_stats( + &self.raft_path, + ) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft_engine_path" => &self.raft_path, + "err" => ?e + ); + return ( + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + 0, + 0, + ); + } + Ok((cap, avail)) => { + if !self.separated_raft_auxiliary_mount_path { + // If the auxiliary directory of raft engine is not separated from + // kv engine, returns u64::MAX to indicate that the disk space of + // the raft engine should not be checked. + (std::u64::MAX, std::u64::MAX) + } else if self.separated_raft_auxiliary_and_kvdb_mount_path { + // If the auxiliary directory of raft engine is separated from kv + // engine and the main directory of + // raft engine, the disk space of + // the auxiliary directory should be + // checked. + assert!(self.raft_auxiliary_path.is_some()); + let (auxiliary_disk_cap, auxiliary_disk_avail) = + match disk::get_disk_space_stats( + self.raft_auxiliary_path.as_ref().unwrap(), + ) { + Err(e) => { + error!( + "get auxiliary disk stat for raft engine failed"; + "raft_engine_path" => self.raft_auxiliary_path.as_ref().unwrap(), + "err" => ?e + ); + (0_u64, 0_u64) + } + Ok((total, avail)) => (total, avail), + }; + (cap + auxiliary_disk_cap, avail + auxiliary_disk_avail) + } else { + (cap, avail) + } + } + }; + let raft_disk_available = cmp::min( + raft_disk_cap + .checked_sub(raft_used_size) + .unwrap_or_default(), + raft_disk_avail, + ); + if raft_disk_available <= raft_already_full_thd { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= self.raft_almost_full_thd { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + } + } + }; + // Check the disk space of kv engine. 
+ let (disk_cap, disk_avail) = match disk::get_disk_space_stats(&self.kvdb_path) { + Err(e) => { + error!( + "get disk stat for kv store failed"; + "kv_path" => &self.kvdb_path, + "err" => ?e + ); + return ( + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + 0, + 0, + ); + } + Ok((total, avail)) => (total, avail), + }; + let capacity = if self.config_disk_capacity == 0 || disk_cap < self.config_disk_capacity { + disk_cap + } else { + self.config_disk_capacity + }; + let available = cmp::min( + capacity.checked_sub(kvdb_used_size).unwrap_or_default(), + disk_avail, + ); + let cur_kv_disk_status = if available <= kvdb_already_full_thd { + disk::DiskUsage::AlreadyFull + } else if available <= self.kvdb_almost_full_thd { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); + ( + cur_disk_status, + cur_kv_disk_status, + raft_disk_status, + capacity, + available, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_disk_usage_checker() { + let kvdb_path = "/tmp/tikv-kvdb".to_owned(); + let raft_path = "/tmp/tikv-raft".to_owned(); + let raft_spill_path = "/tmp/tikv-raft/spill".to_owned(); + + // Case 1: mock the kvdb and raft engine are not separated. + fail::cfg("mock_disk_space_stats", "return(10000,5000)").unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + false, + true, + false, + 100, + 100, + 1000, + ); + let (disk_status, kvdb_status, raft_status, ..) 
= disk_usage_checker.inspect(4000, 1000); + assert_eq!(disk_status, disk::DiskUsage::AlreadyFull); + assert_eq!(kvdb_status, disk::DiskUsage::AlreadyFull); + assert_eq!(raft_status, disk::DiskUsage::Normal); + + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + false, + true, + false, + 100, + 100, + 4100, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 1000); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::AlmostFull); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(3999, 1000); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + + // Case 2: mock the kvdb and raft engine are separated. + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + true, + false, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::AlreadyFull); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::AlreadyFull); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 400); + assert_eq!(raft_status, disk::DiskUsage::AlmostFull); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + let (disk_status, kvdb_status, raft_status, ..) 
= disk_usage_checker.inspect(4000, 399); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + false, + false, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 500); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4900, 500); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::AlmostFull); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + fail::remove("mock_disk_space_stats"); + + // Case 3: mock the kvdb and raft engine are separated and the auxiliary + // directory of raft engine is separated from the main directory of + // raft. + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(100,20)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path, + raft_path, + Some(raft_spill_path), + true, + true, + true, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) 
= disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + } +} diff --git a/components/server/src/server.rs b/components/server/src/server.rs index cc9f8ad3cf2..7a1108e54c2 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -12,7 +12,6 @@ //! explicitly stopped. We keep these components in the `TikvServer` struct. use std::{ - cmp, collections::HashMap, convert::TryFrom, path::{Path, PathBuf}, @@ -125,7 +124,10 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + common::{ + ConfiguredRaftEngine, DiskUsageChecker, EngineMetricsManager, EnginesResourceInfo, + TikvServerCore, + }, memory::*, setup::*, signal_handler, @@ -1327,77 +1329,53 @@ where let raft_path = engines.raft.get_engine_path().to_string(); let separated_raft_mount_path = path_in_diff_mount_point(raft_path.as_str(), engines.kv.path()); - let raft_almost_full_threshold = reserve_raft_space; - let raft_already_full_threshold = reserve_raft_space / 2; - - let almost_full_threshold = reserve_space; - let already_full_threshold = reserve_space / 2; - fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { - match (a, b) { - (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, - (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, - (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, - (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, - (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, - } - } + // If the auxiliary directory of raft engine is specified, it's needed to be + // checked. Otherwise, it's not needed to be checked. 
And as the configuration + // is static, it's safe to check it only once. + let raft_auxiliay_path = if self.core.config.raft_engine.enable { + self.core.config.raft_engine.config().spill_dir + } else { + None + }; + let (separated_raft_auxillay_mount_path, separated_raft_auxiliary_with_kvdb) = + raft_auxiliay_path + .as_ref() + .map(|path| { + let seperated_with_kvdb = + path_in_diff_mount_point(path.as_str(), engines.kv.path()); + let seperated_with_raft = + path_in_diff_mount_point(path.as_str(), raft_path.as_str()); + ( + seperated_with_kvdb && seperated_with_raft, + seperated_with_kvdb, + ) + }) + .unwrap_or((false, false)); + let disk_usage_checker = DiskUsageChecker::new( + store_path.as_path().to_str().unwrap().to_string(), + raft_path, + raft_auxiliay_path, + separated_raft_mount_path, + separated_raft_auxillay_mount_path, + separated_raft_auxiliary_with_kvdb, + reserve_space, + reserve_raft_space, + config_disk_capacity, + ); self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { - let disk_stats = match fs2::statvfs(&store_path) { - Err(e) => { - error!( - "get disk stat for kv store failed"; - "kv_path" => store_path.to_str(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let disk_cap = disk_stats.total_space(); let snap_size = snap_mgr.get_total_snap_size().unwrap(); - let kv_size = engines .kv .get_engine_used_size() .expect("get kv engine size"); - let raft_size = engines .raft .get_engine_size() .expect("get raft engine size"); - - let mut raft_disk_status = disk::DiskUsage::Normal; - if separated_raft_mount_path && reserve_raft_space != 0 { - let raft_disk_stats = match fs2::statvfs(&raft_path) { - Err(e) => { - error!( - "get disk stat for raft engine failed"; - "raft_engine_path" => raft_path.clone(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let raft_disk_cap = raft_disk_stats.total_space(); - let mut raft_disk_available = - 
raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); - raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); - raft_disk_status = if raft_disk_available <= raft_already_full_threshold - { - disk::DiskUsage::AlreadyFull - } else if raft_disk_available <= raft_almost_full_threshold - { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); - let placeholder_size: u64 = file_system::get_file_size(placeholer_file_path).unwrap_or(0); @@ -1406,24 +1384,9 @@ where } else { snap_size + kv_size + placeholder_size }; - let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { - disk_cap - } else { - config_disk_capacity - }; - - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - available = cmp::min(available, disk_stats.available_space()); - + // Check the disk usage and update the disk usage status. + let (cur_disk_status, cur_kv_disk_status, raft_disk_status, capacity, available) = disk_usage_checker.inspect(used_size, raft_size); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. - let cur_kv_disk_status = if available <= already_full_threshold { - disk::DiskUsage::AlreadyFull - } else if available <= almost_full_threshold { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index fbc7db4b4ce..c6becff29c9 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -12,7 +12,6 @@ //! explicitly stopped. 
We keep these components in the `TikvServer` struct. use std::{ - cmp, collections::HashMap, marker::PhantomData, path::{Path, PathBuf}, @@ -121,7 +120,10 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + common::{ + ConfiguredRaftEngine, DiskUsageChecker, EngineMetricsManager, EnginesResourceInfo, + TikvServerCore, + }, memory::*, setup::*, signal_handler, @@ -1159,36 +1161,42 @@ where let raft_path = raft_engine.get_engine_path().to_string(); let separated_raft_mount_path = path_in_diff_mount_point(raft_path.as_str(), tablet_registry.tablet_root()); - let raft_almost_full_threshold = reserve_raft_space; - let raft_already_full_threshold = reserve_raft_space / 2; - - let almost_full_threshold = reserve_space; - let already_full_threshold = reserve_space / 2; - fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { - match (a, b) { - (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, - (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, - (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, - (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, - (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, - } - } + // If the auxiliary directory of raft engine is specified, it's needed to be + // checked. Otherwise, it's not needed to be checked. And as the configuration + // is static, it's safe to check it only once. 
+ let raft_auxiliay_path = if self.core.config.raft_engine.enable { + self.core.config.raft_engine.config().spill_dir + } else { + None + }; + let (separated_raft_auxillay_mount_path, separated_raft_auxiliary_with_kvdb) = + raft_auxiliay_path + .as_ref() + .map(|path| { + let seperated_with_kvdb = + path_in_diff_mount_point(path.as_str(), tablet_registry.tablet_root()); + let seperated_with_raft = + path_in_diff_mount_point(path.as_str(), raft_path.as_str()); + ( + seperated_with_kvdb && seperated_with_raft, + seperated_with_kvdb, + ) + }) + .unwrap_or((false, false)); + let disk_usage_checker = DiskUsageChecker::new( + store_path.as_path().to_str().unwrap().to_string(), + raft_path, + raft_auxiliay_path, + separated_raft_mount_path, + separated_raft_auxillay_mount_path, + separated_raft_auxiliary_with_kvdb, + reserve_space, + reserve_raft_space, + config_disk_capacity, + ); self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { - let disk_stats = match fs2::statvfs(&store_path) { - Err(e) => { - error!( - "get disk stat for kv store failed"; - "kv_path" => store_path.to_str(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let disk_cap = disk_stats.total_space(); let snap_size = snap_mgr.total_snap_size().unwrap(); - let mut kv_size = 0; tablet_registry.for_each_opened_tablet(|_, cached| { if let Some(tablet) = cached.latest() { @@ -1196,42 +1204,12 @@ where } true }); - let raft_size = raft_engine .get_engine_size() .expect("get raft engine size"); - - let mut raft_disk_status = disk::DiskUsage::Normal; - if separated_raft_mount_path && reserve_raft_space != 0 { - let raft_disk_stats = match fs2::statvfs(&raft_path) { - Err(e) => { - error!( - "get disk stat for raft engine failed"; - "raft_engine_path" => raft_path.clone(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let raft_disk_cap = raft_disk_stats.total_space(); - let mut raft_disk_available = - 
raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); - raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); - raft_disk_status = if raft_disk_available <= raft_already_full_threshold - { - disk::DiskUsage::AlreadyFull - } else if raft_disk_available <= raft_almost_full_threshold - { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); - let placeholder_size: u64 = file_system::get_file_size(placeholer_file_path).unwrap_or(0); @@ -1240,24 +1218,9 @@ where } else { snap_size + kv_size + placeholder_size }; - let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { - disk_cap - } else { - config_disk_capacity - }; - - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - available = cmp::min(available, disk_stats.available_space()); - + // Check the disk usage and update the disk usage status. + let (cur_disk_status, cur_kv_disk_status, raft_disk_status, capacity, available) = disk_usage_checker.inspect(used_size, raft_size); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. 
- let cur_kv_disk_status = if available <= already_full_threshold { - disk::DiskUsage::AlreadyFull - } else if available <= almost_full_threshold { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index b502a701136..6279dd8f5f5 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -23,6 +23,7 @@ crossbeam-skiplist = "0.1" derive_more = "0.99.3" error_code = { workspace = true } fail = "0.5" +fs2 = "0.4" futures = { version = "0.3", features = ["compat", "thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } @@ -30,7 +31,10 @@ http = "0.2.0" kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } mnt = "0.3.1" nix = "0.24" diff --git a/components/tikv_util/src/sys/disk.rs b/components/tikv_util/src/sys/disk.rs index c8fe87a56b0..5918bdd8e3b 100644 --- a/components/tikv_util/src/sys/disk.rs +++ b/components/tikv_util/src/sys/disk.rs @@ -1,5 +1,8 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::atomic::{AtomicI32, AtomicU64, Ordering}; +use std::{ + path::Path, + sync::atomic::{AtomicI32, AtomicU64, Ordering}, +}; use fail::fail_point; pub use kvproto::disk_usage::DiskUsage; @@ -78,3 +81,16 @@ pub fn get_disk_status(_store_id: u64) -> DiskUsage { _ => panic!("Disk Status Value not meet expectations"), } } + +pub fn get_disk_space_stats>(path: P) -> std::io::Result<(u64, u64)> { + fail_point!("mock_disk_space_stats", |stats| { + let stats = stats.unwrap(); + let values = stats.split(',').collect::>(); + Ok(( + values[0].parse::().unwrap(), + values[1].parse::().unwrap(), + )) + }); + let disk_stats = fs2::statvfs(path)?; + Ok((disk_stats.total_space(), disk_stats.available_space())) +} diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 797da2aea54..3afa8cd724c 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -195,10 +195,11 @@ pub fn path_in_diff_mount_point(_path1: impl AsRef, _path2: impl AsRef 0); + assert!(available > 0); + assert!(capacity >= available); + + disk::get_disk_space_stats("/non-exist-path").unwrap_err(); + } } From 8dc6cdb9c147463075e43b72ca94fb2272d0a21d Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 13 Sep 2024 14:22:00 +0800 Subject: [PATCH 201/220] RocksDB: Fix bloom filter incompatible issue (#17361) (#17372) close tikv/tikv#17272 TiKV no longer names bloom filter blocks with suffix like "FullBloom" or "Ribbon". 
Signed-off-by: Yang Zhang Co-authored-by: Yang Zhang Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a027321d2a5..fb4ae399a30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2998,7 +2998,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" +source = "git+https://github.com/tikv/rust-rocksdb.git#c92c467a3ab0b60484a0db83fcf89366791716cd" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3017,7 +3017,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" +source = "git+https://github.com/tikv/rust-rocksdb.git#c92c467a3ab0b60484a0db83fcf89366791716cd" dependencies = [ "bzip2-sys", "cc", @@ -4938,7 +4938,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" +source = "git+https://github.com/tikv/rust-rocksdb.git#c92c467a3ab0b60484a0db83fcf89366791716cd" dependencies = [ "libc 0.2.146", "librocksdb_sys", From fc1d8fb3fcac5ee2cc8b5dd62e60f54c19d0711a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 24 Sep 2024 17:03:42 +0800 Subject: [PATCH 202/220] raftstore: fix panic due to stale peer handling snapshot (#17535) (#17566) close tikv/tikv#17469 The commit fixes a panic in TiKV that occurs in a rare scenario that involves region splits and immediate removal of the new peer. When a region splits, the new peer on a follower can be created in two ways: (1) By receiving a Raft message from the new region (`fn maybe_create_peer`) (2) By applying the split operation locally (`fn on_ready_split_region`). 
Depending on timing, a new peer might first be created by a Raft message and then again when the split is applied. This is a known situation. When it happens, the second peer replaces the first, and the first peer is discarded. However, the discarded peer may continue processing existing messages, leading to unexpected states. The panic can be reproduced with the following sequence of events: 1. The first peer is created by a Raft message and is waiting for a Raft snapshot. 2. The second peer (of the same region) is created by `on_ready_split_region` when the split operation is applied, replacing the first peer and closing its mailbox (as expected). 3. The second peer is immediately removed. This removes the region metadata. 4. The first peer continues processing the Raft snapshot message, expecting the metadata of the region to exist, causing the panic. Signed-off-by: Bisheng Huang Co-authored-by: Bisheng Huang --- components/raftstore/src/store/fsm/peer.rs | 50 ++++++++- tests/failpoints/cases/test_split_region.rs | 109 ++++++++++++++++++++ 2 files changed, 156 insertions(+), 3 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 07fdc6c0e2c..59cb3d1ab40 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2579,13 +2579,40 @@ where "is_initialized_peer" => is_initialized_peer, ); + let msg_type = msg.get_message().get_msg_type(); + let fp_enable = |target_msg_type: MessageType| -> bool { + self.fsm.region_id() == 1000 + && self.store_id() == 2 + && !is_initialized_peer + && msg_type == target_msg_type + }; + fail_point!( + "on_snap_msg_1000_2", + fp_enable(MessageType::MsgSnapshot), + |_| Ok(()) + ); + fail_point!( + "on_vote_msg_1000_2", + fp_enable(MessageType::MsgRequestVote), + |_| Ok(()) + ); + fail_point!( + "on_append_msg_1000_2", + fp_enable(MessageType::MsgAppend), + |_| Ok(()) + ); + fail_point!( + "on_heartbeat_msg_1000_2", + 
fp_enable(MessageType::MsgHeartbeat), + |_| Ok(()) + ); + if self.fsm.peer.pending_remove || self.fsm.stopped { return Ok(()); } self.handle_reported_disk_usage(&msg); - let msg_type = msg.get_message().get_msg_type(); if matches!(self.ctx.self_disk_usage, DiskUsage::AlreadyFull) && MessageType::MsgTimeoutNow == msg_type { @@ -3322,7 +3349,24 @@ where } let mut meta = self.ctx.store_meta.lock().unwrap(); - if meta.regions[&self.region_id()] != *self.region() { + // Check if the region matches the metadata. A mismatch means another + // peer has replaced the current peer, which can happen during a split: a + // peer is first created via raft message, then replaced by another peer + // (of the same region) when the split is applied. + let region_mismatch = match meta.regions.get(&self.region_id()) { + Some(region) => *region != *self.region(), + None => { + // If the region doesn't exist, treat it as a mismatch. This can + // happen in rare situations (e.g. #17469). + warn!( + "region not found in meta"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + true + } + }; + if region_mismatch { if !self.fsm.peer.is_initialized() { info!( "stale delegate detected, skip"; @@ -3335,7 +3379,7 @@ where panic!( "{} meta corrupted: {:?} != {:?}", self.fsm.peer.tag, - meta.regions[&self.region_id()], + meta.regions.get(&self.region_id()), self.region() ); } diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 2ef3d499d22..35148f20db2 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -552,6 +552,115 @@ fn test_split_not_to_split_existing_tombstone_region() { must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); } +#[test] +fn test_stale_peer_handle_snap() { + test_stale_peer_handle_raft_msg("on_snap_msg_1000_2"); +} + +#[test] +fn test_stale_peer_handle_vote() { + test_stale_peer_handle_raft_msg("on_vote_msg_1000_2"); +} + +#[test] +fn 
test_stale_peer_handle_append() { + test_stale_peer_handle_raft_msg("on_append_msg_1000_2"); +} + +#[test] +fn test_stale_peer_handle_heartbeat() { + test_stale_peer_handle_raft_msg("on_heartbeat_msg_1000_2"); +} + +fn test_stale_peer_handle_raft_msg(on_handle_raft_msg_1000_2_fp: &str) { + // The following diagram represents the final state of the test: + // + // ┌───────────┐ ┌───────────┐ ┌───────────┐ + // │ │ │ │ │ │ + // Region 1 │ Peer 1 │ │ Peer 2 │ │ Peer 3 │ + // [k2, +∞) │ │ │ │ │ │ + // ───────────────────┼───────────┼──┼───────────┼──┼───────────┼── + // │ │ │ │ │ │ + // Region 1000 │ Peer 1001 │ │ Peer 1003 │ │ Peer 1002 │ + // (-∞, k2) │ │ │ │ │ │ + // └───────────┘ └───────────┘ └───────────┘ + // Store 1 Store 2 Store 3 + // + // In this test, there is a split operation and Peer 1003 will be created + // twice (by raft message and by split). The new Peer 1003 will replace the + // old Peer 1003 and but it will be immediately removed. This test verifies + // that TiKV would not panic if the old Peer 1003 continues to process a + // remaining raft message (which may be a snapshot/vote/heartbeat/append + // message). + + let mut cluster = new_node_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.cfg.raft_store.apply_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.apply_batch_system.pool_size = 2; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + fail::cfg("on_raft_gc_log_tick", "return()").unwrap(); + let r1 = cluster.run_conf_change(); + // Add Peer 3 + pd_client.must_add_peer(r1, new_peer(3, 3)); + assert_eq!(r1, 1); + + // Pause the snapshot apply of Peer 2. 
+ let before_check_snapshot_1_2_fp = "before_check_snapshot_1_2"; + fail::cfg(before_check_snapshot_1_2_fp, "pause").unwrap(); + + // Add Peer 2. The peer will be created but stuck at applying snapshot due + // to the failpoint above. + pd_client.must_add_peer(r1, new_peer(2, 2)); + cluster.must_put(b"k1", b"v1"); + + // Before the split, pause Peer 1003 when processing a certain raft message. + // The message type depends on the failpoint name input. + fail::cfg(on_handle_raft_msg_1000_2_fp, "pause").unwrap(); + + // Split the region into Region 1 and Region 1000. Peer 1003 will be created + // for the first time when it receives a raft message from Peer 1001, but it + // will remain uninitialized because it's paused due to the failpoint above. + let region = pd_client.get_region(b"k1").unwrap(); + + cluster.must_split(®ion, b"k2"); + cluster.must_put(b"k22", b"v22"); + + // Check that Store 2 doesn't have any data yet. + must_get_none(&cluster.get_engine(2), b"k1"); + must_get_none(&cluster.get_engine(2), b"k22"); + + // Unblock Peer 2. It will proceed to apply the split operation, which + // creates Peer 1003 for the second time and replaces the old Peer 1003. + fail::remove(before_check_snapshot_1_2_fp); + + // Verify that data can be accessed from Peer 2 and the new Peer 1003. + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k22", b"v22"); + + // Immediately remove the new Peer 1003. This removes the region metadata. + let left = pd_client.get_region(b"k1").unwrap(); + let left_peer_2 = find_peer(&left, 2).cloned().unwrap(); + pd_client.must_remove_peer(left.get_id(), left_peer_2); + must_get_none(&cluster.get_engine(2), b"k1"); + must_get_equal(&cluster.get_engine(2), b"k22", b"v22"); + + // Unblock the old Peer 1003 so that it can continue to process its raft + // message. It would lead to a panic when it processes a snapshot message if + // #17469 is not fixed. 
+ fail::remove(on_handle_raft_msg_1000_2_fp); + + // Waiting for the stale peer to handle its raft message. + sleep_ms(300); + + must_get_none(&cluster.get_engine(2), b"k1"); + must_get_equal(&cluster.get_engine(2), b"k22", b"v22"); +} + // TiKV uses memory lock to control the order between spliting and creating // new peer. This case test if tikv continues split if the peer is destroyed // after memory lock check. From 667e19c93125eb9ad89d1bc8e37ea435e78f1842 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 24 Sep 2024 17:32:21 +0800 Subject: [PATCH 203/220] storage: Fix unexpected flow control after unsafe destroy range (#17458) (#17565) close tikv/tikv#17304 Fix unexpected flow control after unsafe destroy range Flow controller detects pending compaction bytes jump before and after unsafe destroy range. If there is a jump, the controller enters a state that would ignore the high pending compaction bytes until it falls back to normal. Previously, the controller may not enter the state if the pending compaction bytes is lower than the threshold while long term average pending bytes is still high. Then it would trigger flow control mistakenly. 
Signed-off-by: Connor1996 Co-authored-by: Connor1996 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../singleton_flow_controller.rs | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 5d52c272db6..c5ac88c9f22 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -598,14 +598,24 @@ impl FlowChecker { if !enabled { return; } + if self.wait_for_destroy_range_finish { + // Concurrent unsafe destroy range, ignore the second one + info!("concurrent unsafe destroy range, ignore"); + return; + } self.wait_for_destroy_range_finish = true; let soft = (self.soft_pending_compaction_bytes_limit as f64).log2(); - for cf_checker in self.cf_checkers.values_mut() { + for (cf, cf_checker) in &mut self.cf_checkers { if let Some(long_term_pending_bytes) = cf_checker.long_term_pending_bytes.as_ref() { let v = long_term_pending_bytes.get_avg(); if v <= soft { + info!( + "before unsafe destroy range"; + "cf" => cf, + "pending_bytes" => v + ); cf_checker.pending_bytes_before_unsafe_destroy_range = Some(v); } } @@ -629,9 +639,13 @@ impl FlowChecker { SCHED_THROTTLE_ACTION_COUNTER .with_label_values(&[cf, "pending_bytes_jump"]) .inc(); - } else { - cf_checker.pending_bytes_before_unsafe_destroy_range = None; } + info!( + "after unsafe destroy range"; + "cf" => cf, + "before" => before, + "after" => after + ); } } } @@ -778,7 +792,17 @@ impl FlowChecker { let pending_compaction_bytes = long_term_pending_bytes.get_avg(); let ignore = if let Some(before) = checker.pending_bytes_before_unsafe_destroy_range { + // It assumes that the long term average will eventually come down below the + // soft limit. 
If the general traffic flow increases during destroy, the long + // term average may never come down and the flow control will be turned off for + // a long time, which would be a rather rare case, so just ignore it. if pending_compaction_bytes <= before && !self.wait_for_destroy_range_finish { + info!( + "pending compaction bytes is back to normal"; + "cf" => &cf, + "pending_compaction_bytes" => pending_compaction_bytes, + "before" => before + ); checker.pending_bytes_before_unsafe_destroy_range = None; } true @@ -1299,6 +1323,14 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(10000000 * 1024 * 1024 * 1024, Ordering::Relaxed); + send_flow_info(tx, region_id); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + + // after unsafe destroy range, pending compaction bytes may jump back to a lower + // value + stub.0 + .pending_compaction_bytes + .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); tx.send(FlowInfo::Compaction("default".to_string(), region_id)) .unwrap(); tx.send(FlowInfo::AfterUnsafeDestroyRange(region_id)) @@ -1311,13 +1343,23 @@ pub(super) mod tests { flow_controller.discard_ratio(region_id) ); - // unfreeze the control + // the long term average pending compaction bytes is still high, shouldn't + // unfreeze the jump control + stub.0 + .pending_compaction_bytes + .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); + send_flow_info(tx, region_id); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + + // the long term average pending compaction bytes falls below the threshold, + // should unfreeze the jump control stub.0 .pending_compaction_bytes .store(1024 * 1024, Ordering::Relaxed); send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + // exceeds the threshold, should perform throttle stub.0 .pending_compaction_bytes .store(1000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); From 14ab849c520c450fafbd52db37c6a10a8161eba0 Mon Sep 17 00:00:00 2001 
From: Ti Chi Robot Date: Tue, 24 Sep 2024 19:22:40 +0800 Subject: [PATCH 204/220] batch-system: Reduce the memory usage of peers' message channel (#17326) (#17567) close tikv/tikv#16229 Reduce the memory usage of peers' message channel Signed-off-by: lucasliang Co-authored-by: lucasliang Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/router.rs | 5 +- components/raftstore/src/store/fsm/apply.rs | 93 +++++++++---------- components/raftstore/src/store/fsm/peer.rs | 54 +++++------ components/raftstore/src/store/fsm/store.rs | 55 +++++------ components/raftstore/src/store/msg.rs | 35 ++++--- .../raftstore/src/store/snapshot_backup.rs | 4 +- components/raftstore/src/store/transport.rs | 7 +- .../raftstore/src/store/unsafe_recovery.rs | 12 ++- .../src/store/worker/cleanup_snapshot.rs | 4 +- components/raftstore/src/store/worker/pd.rs | 8 +- components/snap_recovery/src/services.rs | 4 +- components/test_raftstore/src/router.rs | 5 +- src/server/server.rs | 10 +- .../raftstore/test_snap_recovery.rs | 12 +-- 14 files changed, 168 insertions(+), 140 deletions(-) diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index fd50357fa38..f8094de764e 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -92,7 +92,10 @@ where /// Report a `StoreResolved` event to all Raft groups. 
fn report_resolved(&self, store_id: u64, group_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreResolved { store_id, group_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreResolved { + store_id, + group_id, + })) }) } } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 8c99d0b19df..c03efc15ad2 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -918,7 +918,8 @@ where /// All of messages that need to continue to be handled after /// the source peer has applied its logs and pending entries /// are all handled. - pending_msgs: Vec>, + #[allow(clippy::vec_box)] + pending_msgs: Vec>>, /// Cache heap size for itself. heap_size: Option, @@ -2864,11 +2865,11 @@ where fail_point!("before_handle_catch_up_logs_for_merge"); // Sends message to the source peer fsm and pause `exec_commit_merge` process let logs_up_to_date = Arc::new(AtomicU64::new(0)); - let msg = SignificantMsg::CatchUpLogs(CatchUpLogs { + let msg = Box::new(SignificantMsg::CatchUpLogs(CatchUpLogs { target_region_id: self.region_id(), merge: merge.to_owned(), logs_up_to_date: logs_up_to_date.clone(), - }); + })); ctx.notifier .notify_one(source_region_id, PeerMsg::SignificantMsg(msg)); return Ok(( @@ -3771,10 +3772,13 @@ where }, } -impl ResourceMetered for Msg { +impl ResourceMetered for Box> { fn consume_resource(&self, resource_ctl: &Arc) -> Option { - match self { - Msg::Apply { apply, .. } => { + if !resource_ctl.is_customized() { + return None; + } + match **self { + Msg::Apply { ref apply, .. 
} => { let mut dominant_group = "".to_owned(); let mut max_write_bytes = 0; for cached_entries in &apply.entries { @@ -3907,7 +3911,7 @@ where EK: KvEngine, { delegate: ApplyDelegate, - receiver: Receiver>, + receiver: Receiver>>, mailbox: Option>>, } @@ -3917,12 +3921,14 @@ where { fn from_peer( peer: &Peer, - ) -> (LooseBoundedSender>, Box>) { + ) -> (LooseBoundedSender>>, Box>) { let reg = Registration::new(peer); ApplyFsm::from_registration(reg) } - fn from_registration(reg: Registration) -> (LooseBoundedSender>, Box>) { + fn from_registration( + reg: Registration, + ) -> (LooseBoundedSender>>, Box>) { let (tx, rx) = loose_bounded(usize::MAX); let delegate = ApplyDelegate::from_registration(reg); ( @@ -4093,13 +4099,11 @@ where self.destroy(ctx); ctx.notifier.notify_one( self.delegate.region_id(), - PeerMsg::ApplyRes { - res: TaskRes::Destroy { - region_id: self.delegate.region_id(), - peer_id: self.delegate.id(), - merge_from_snapshot: d.merge_from_snapshot, - }, - }, + PeerMsg::ApplyRes(Box::new(TaskRes::Destroy { + region_id: self.delegate.region_id(), + peer_id: self.delegate.id(), + merge_from_snapshot: d.merge_from_snapshot, + })), ); } } @@ -4163,7 +4167,7 @@ where .store(region_id, Ordering::SeqCst); // To trigger the target apply fsm if let Some(mailbox) = ctx.router.mailbox(catch_up_logs.target_region_id) { - let _ = mailbox.force_send(Msg::Noop); + let _ = mailbox.force_send(Box::new(Msg::Noop)); } else { error!( "failed to get mailbox, are we shutting down?"; @@ -4350,7 +4354,8 @@ where } } - fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { + #[allow(clippy::vec_box)] + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; loop { @@ -4365,7 +4370,7 @@ where }; if batch_apply.is_some() { - match &msg { + match *msg { Msg::Apply { .. 
} => (), _ => { self.handle_apply(apply_ctx, batch_apply.take().unwrap()); @@ -4378,7 +4383,7 @@ where } } - match msg { + match *msg { Msg::Apply { start, mut apply } => { let apply_wait = start.saturating_elapsed(); apply_ctx.apply_wait.observe(apply_wait.as_secs_f64()); @@ -4399,7 +4404,9 @@ where } else { self.handle_apply(apply_ctx, batch_apply.take().unwrap()); if let Some(ref mut state) = self.delegate.yield_state { - state.pending_msgs.push(Msg::Apply { start, apply }); + state + .pending_msgs + .push(Box::new(Msg::Apply { start, apply })); state.pending_msgs.extend(drainer); break; } @@ -4445,7 +4452,7 @@ impl Fsm for ApplyFsm where EK: KvEngine, { - type Message = Msg; + type Message = Box>; #[inline] fn is_stopped(&self) -> bool { @@ -4552,7 +4559,8 @@ pub struct ApplyPoller where EK: KvEngine, { - msg_buf: Vec>, + #[allow(clippy::vec_box)] + msg_buf: Vec>>, apply_ctx: ApplyContext, messages_per_tick: usize, cfg_tracker: Tracker, @@ -4784,9 +4792,9 @@ where EK: KvEngine, { pub fn schedule_task(&self, region_id: u64, msg: Msg) { - let reg = match self.try_send(region_id, msg) { + let reg = match self.try_send(region_id, Box::new(msg)) { Either::Left(Ok(())) => return, - Either::Left(Err(TrySendError::Disconnected(msg))) | Either::Right(msg) => match msg { + Either::Left(Err(TrySendError::Disconnected(msg))) | Either::Right(msg) => match *msg { Msg::Registration(reg) => reg, Msg::Apply { mut apply, .. } => { info!( @@ -5112,8 +5120,8 @@ mod tests { impl Notifier for TestNotifier { fn notify(&self, apply_res: Vec>) { for r in apply_res { - let res = TaskRes::Apply(r); - let _ = self.tx.send(PeerMsg::ApplyRes { res }); + let res = Box::new(TaskRes::Apply(r)); + let _ = self.tx.send(PeerMsg::ApplyRes(res)); } } fn notify_one(&self, _: u64, msg: PeerMsg) { @@ -5320,10 +5328,7 @@ mod tests { E: KvEngine, { match receiver.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(res), - .. 
- }) => res, + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(res))) => res, e => panic!("unexpected res {:?}", e), } } @@ -5471,10 +5476,7 @@ mod tests { ], ); let apply_res = match rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(res), - .. - }) => res, + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(res))) => res, e => panic!("unexpected apply result: {:?}", e), }; let apply_state_key = keys::apply_state_key(2); @@ -5505,12 +5507,9 @@ mod tests { router.schedule_task(2, Msg::destroy(2, false)); let (region_id, peer_id) = match rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Destroy { - region_id, peer_id, .. - }, - .. - }) => (region_id, peer_id), + Ok(PeerMsg::ApplyRes(box TaskRes::Destroy { + region_id, peer_id, .. + })) => (region_id, peer_id), e => panic!("expected destroy result, but got {:?}", e), }; assert_eq!(peer_id, 1); @@ -7773,9 +7772,9 @@ mod tests { .unwrap(); assert_ne!(initial_state.get_applied_index(), 0); match apply_res_rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(apply_res), - }) => assert_eq!(apply_res.apply_state, initial_state), + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(apply_res))) => { + assert_eq!(apply_res.apply_state, initial_state) + } e => panic!("unexpected result: {:?}", e), } index_id += 1; @@ -7807,9 +7806,9 @@ mod tests { .unwrap() .unwrap(); match apply_res_rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(apply_res), - }) => assert_eq!(apply_res.apply_state, apply_state), + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(apply_res))) => { + assert_eq!(apply_res.apply_state, apply_state) + } e => panic!("unexpected result: {:?}", e), } assert!( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 59cb3d1ab40..89fb55a5ec8 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -208,7 
+208,7 @@ where while let Ok(msg) = self.receiver.try_recv() { let callback = match msg { PeerMsg::RaftCommand(cmd) => cmd.callback, - PeerMsg::CasualMessage(CasualMessage::SplitRegion { callback, .. }) => callback, + PeerMsg::CasualMessage(box CasualMessage::SplitRegion { callback, .. }) => callback, PeerMsg::RaftMessage(im, _) => { raft_messages_size += im.heap_size; continue; @@ -674,7 +674,7 @@ where && !self.fsm.peer.disk_full_peers.majority()) || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) { - self.fsm.batch_req_builder.add(cmd, req_size); + self.fsm.batch_req_builder.add(*cmd, req_size); if self.fsm.batch_req_builder.should_finish(&self.ctx.cfg) { self.propose_pending_batch_raft_command(); } @@ -687,7 +687,7 @@ where } } PeerMsg::Tick(tick) => self.on_tick(tick), - PeerMsg::ApplyRes { res } => { + PeerMsg::ApplyRes(res) => { self.on_apply_res(res); } PeerMsg::SignificantMsg(msg) => self.on_significant_msg(msg), @@ -1104,8 +1104,8 @@ where } } - fn on_casual_msg(&mut self, msg: CasualMessage) { - match msg { + fn on_casual_msg(&mut self, msg: Box>) { + match *msg { CasualMessage::SplitRegion { region_epoch, split_keys, @@ -1466,8 +1466,8 @@ where ); } - fn on_significant_msg(&mut self, msg: SignificantMsg) { - match msg { + fn on_significant_msg(&mut self, msg: Box>) { + match *msg { SignificantMsg::SnapshotStatus { to_peer_id, status, .. 
} => { @@ -1811,7 +1811,7 @@ where // follower state let _ = self.ctx.router.send( self.region_id(), - PeerMsg::CasualMessage(CasualMessage::Campaign), + PeerMsg::CasualMessage(Box::new(CasualMessage::Campaign)), ); } self.fsm.has_ready = true; @@ -2377,9 +2377,9 @@ where } } - fn on_apply_res(&mut self, res: ApplyTaskRes) { + fn on_apply_res(&mut self, res: Box>) { fail_point!("on_apply_res", |_| {}); - match res { + match *res { ApplyTaskRes::Apply(mut res) => { debug!( "async apply finish"; @@ -2547,8 +2547,8 @@ where } } - fn on_raft_message(&mut self, msg: InspectedRaftMessage) -> Result<()> { - let InspectedRaftMessage { heap_size, mut msg } = msg; + fn on_raft_message(&mut self, m: Box) -> Result<()> { + let InspectedRaftMessage { heap_size, mut msg } = *m; let peer_disk_usage = msg.disk_usage; let stepped = Cell::new(false); let memtrace_raft_entries = &mut self.fsm.peer.memtrace_raft_entries as *mut usize; @@ -3115,10 +3115,10 @@ where ); if self.handle_destroy_peer(job) { // It's not frequent, so use 0 as `heap_size` is ok. - let store_msg = StoreMsg::RaftMessage(InspectedRaftMessage { + let store_msg = StoreMsg::RaftMessage(Box::new(InspectedRaftMessage { heap_size: 0, msg: msg.clone(), - }); + })); if let Err(e) = self.ctx.router.send_control(store_msg) { info!( "failed to send back store message, are we shutting down?"; @@ -3465,7 +3465,7 @@ where // may has been merged/splitted already. 
let _ = self.ctx.router.force_send( exist_region.get_id(), - PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), + PeerMsg::CasualMessage(Box::new(CasualMessage::RegionOverlapped)), ); } } @@ -3543,11 +3543,11 @@ where .router .force_send( source_region_id, - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result, - }), + })), ) .unwrap(); } @@ -3775,9 +3775,9 @@ where ) .flush() .when_done(move || { - if let Err(e) = - mb.force_send(PeerMsg::SignificantMsg(SignificantMsg::RaftLogGcFlushed)) - { + if let Err(e) = mb.force_send(PeerMsg::SignificantMsg(Box::new( + SignificantMsg::RaftLogGcFlushed, + ))) { if tikv_util::thread_group::is_shutdown(!cfg!(test)) { return; } @@ -4459,7 +4459,7 @@ where .swap_remove_front(|m| m.get_to_peer() == &meta_peer) { let peer_msg = PeerMsg::RaftMessage( - InspectedRaftMessage { heap_size: 0, msg }, + Box::new(InspectedRaftMessage { heap_size: 0, msg }), Some(TiInstant::now()), ); if let Err(e) = self.ctx.router.force_send(new_region_id, peer_msg) { @@ -4690,14 +4690,14 @@ where .router .force_send( target_id, - PeerMsg::RaftCommand(RaftCommand::new_ext( + PeerMsg::RaftCommand(Box::new(RaftCommand::new_ext( request, Callback::None, RaftCmdExtraOpts { deadline: None, disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, }, - )), + ))), ) .map_err(|_| Error::RegionNotFound(target_id)) } @@ -4954,11 +4954,11 @@ where } if let Err(e) = self.ctx.router.force_send( source.get_id(), - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result: MergeResultKind::FromTargetLog, - }), + })), ) { panic!( "{} failed to send merge result(FromTargetLog) to source region {}, err {}", @@ -5232,11 +5232,11 @@ where for r in &persist_res.destroy_regions { if let 
Err(e) = self.ctx.router.force_send( r.get_id(), - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result: MergeResultKind::FromTargetSnapshotStep2, - }), + })), ) { panic!( "{} failed to send merge result(FromTargetSnapshotStep2) to source region {}, err {}", diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3d94ac164b1..ad21cc64fec 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -374,9 +374,7 @@ where let region_id = r.region_id; if let Err(e) = self.router.force_send( region_id, - PeerMsg::ApplyRes { - res: ApplyTaskRes::Apply(r), - }, + PeerMsg::ApplyRes(Box::new(ApplyTaskRes::Apply(r))), ) { error!("failed to send apply result"; "region_id" => region_id, "err" => ?e); } @@ -413,7 +411,7 @@ where heap_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } let peer_msg = PeerMsg::RaftMessage( - InspectedRaftMessage { heap_size, msg }, + Box::new(InspectedRaftMessage { heap_size, msg }), Some(TiInstant::now()), ); let event = TraceEvent::Add(heap_size); @@ -458,10 +456,10 @@ where cmd: RaftCommand, ) -> std::result::Result<(), TrySendError>> { let region_id = cmd.request.get_header().get_region_id(); - match self.send(region_id, PeerMsg::RaftCommand(cmd)) { + match self.send(region_id, PeerMsg::RaftCommand(Box::new(cmd))) { Ok(()) => Ok(()), - Err(TrySendError::Full(PeerMsg::RaftCommand(cmd))) => Err(TrySendError::Full(cmd)), - Err(TrySendError::Disconnected(PeerMsg::RaftCommand(cmd))) => { + Err(TrySendError::Full(PeerMsg::RaftCommand(box cmd))) => Err(TrySendError::Full(cmd)), + Err(TrySendError::Disconnected(PeerMsg::RaftCommand(box cmd))) => { Err(TrySendError::Disconnected(cmd)) } _ => unreachable!(), @@ -470,7 +468,7 @@ where fn report_unreachable(&self, store_id: u64) { 
self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreUnreachable { store_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreUnreachable { store_id })) }); } @@ -481,7 +479,10 @@ where /// Broadcasts resolved result to all regions. pub fn report_resolved(&self, store_id: u64, group_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreResolved { store_id, group_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreResolved { + store_id, + group_id, + })) }) } @@ -1040,12 +1041,7 @@ impl PollHandler, St fail_point!( "pause_on_peer_destroy_res", peer.peer_id() == 1 - && matches!( - msg, - PeerMsg::ApplyRes { - res: ApplyTaskRes::Destroy { .. }, - } - ), + && matches!(msg, PeerMsg::ApplyRes(box ApplyTaskRes::Destroy { .. })), |_| unreachable!() ); self.peer_msg_buf.push(msg); @@ -1644,7 +1640,9 @@ impl RaftBatchSystem { for region_id in regions { let _ = router_clone.send( region_id, - PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), + PeerMsg::CasualMessage(Box::new( + CasualMessage::ForceCompactRaftLogs, + )), ); } } @@ -2108,7 +2106,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER Ok(CheckMsgStatus::NewPeer) } - fn on_raft_message(&mut self, msg: InspectedRaftMessage) -> Result<()> { + fn on_raft_message(&mut self, msg: Box) -> Result<()> { let (heap_size, forwarded) = (msg.heap_size, Cell::new(false)); defer!(if !forwarded.get() { MEMTRACE_RAFT_MESSAGES.trace(TraceEvent::Sub(heap_size)); @@ -2198,8 +2196,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER check_msg_status == CheckMsgStatus::NewPeerFirst, )? { // Peer created, send the message again. 
- let peer_msg = - PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); + let peer_msg = PeerMsg::RaftMessage( + Box::new(InspectedRaftMessage { heap_size, msg }), + None, + ); if self.ctx.router.send(region_id, peer_msg).is_ok() { forwarded.set(true); } @@ -2222,7 +2222,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER store_meta.pending_msgs.push(msg); } else { drop(store_meta); - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); + let peer_msg = + PeerMsg::RaftMessage(Box::new(InspectedRaftMessage { heap_size, msg }), None); if let Err(e) = self.ctx.router.force_send(region_id, peer_msg) { warn!("handle first request failed"; "region_id" => region_id, "error" => ?e); } else { @@ -2388,7 +2389,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER // region may has been merged/splitted already. let _ = self.ctx.router.force_send( exist_region.get_id(), - PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), + PeerMsg::CasualMessage(Box::new(CasualMessage::RegionOverlapped)), ); } } @@ -2403,11 +2404,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .router .force_send( id, - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: region_id, target: target.clone(), result: MergeResultKind::Stale, - }), + })), ) .unwrap(); } @@ -2475,9 +2476,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER for (region_id, declined_bytes) in region_declined_bytes.drain(..) 
{ let _ = self.ctx.router.send( region_id, - PeerMsg::CasualMessage(CasualMessage::CompactionDeclinedBytes { + PeerMsg::CasualMessage(Box::new(CasualMessage::CompactionDeclinedBytes { bytes: declined_bytes, - }), + })), ); } } @@ -3013,7 +3014,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let _ = self.ctx.router.send( target_region_id, - PeerMsg::RaftCommand(RaftCommand::new(request, Callback::None)), + PeerMsg::RaftCommand(Box::new(RaftCommand::new(request, Callback::None))), ); } @@ -3051,7 +3052,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER for region_id in regions { let _ = self.ctx.router.send( region_id, - PeerMsg::CasualMessage(CasualMessage::ClearRegionSize), + PeerMsg::CasualMessage(Box::new(CasualMessage::ClearRegionSize)), ); } } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 09c33fe1616..1d01caa1c76 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -756,29 +756,25 @@ pub struct InspectedRaftMessage { } /// Message that can be sent to a peer. -#[allow(clippy::large_enum_variant)] #[derive(EnumCount, EnumVariantNames)] -#[repr(u8)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(InspectedRaftMessage, Option) = 0, + RaftMessage(Box, Option), /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. - RaftCommand(RaftCommand), + RaftCommand(Box>), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. 
- ApplyRes { - res: ApplyTaskRes, - }, + ApplyRes(Box>), /// Message that can't be lost but rarely created. If they are lost, real /// bad things happen like some peers will be considered dead in the /// group. - SignificantMsg(SignificantMsg), + SignificantMsg(Box>), /// Start the FSM. Start, /// A message only used to notify a peer. @@ -788,7 +784,7 @@ pub enum PeerMsg { ready_number: u64, }, /// Message that is not important and can be dropped occasionally. - CasualMessage(CasualMessage), + CasualMessage(Box>), /// Ask region to report a heartbeat to PD. HeartbeatPd, /// Asks region to change replication mode. @@ -809,7 +805,7 @@ impl fmt::Debug for PeerMsg { tick }, PeerMsg::SignificantMsg(msg) => write!(fmt, "{:?}", msg), - PeerMsg::ApplyRes { res } => write!(fmt, "ApplyRes {:?}", res), + PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), PeerMsg::Start => write!(fmt, "Startup"), PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { @@ -852,7 +848,7 @@ impl PeerMsg { pub fn is_send_failure_ignorable(&self) -> bool { matches!( self, - PeerMsg::SignificantMsg(SignificantMsg::CaptureChange { .. }) + PeerMsg::SignificantMsg(box SignificantMsg::CaptureChange { .. }) ) } } @@ -862,7 +858,7 @@ pub enum StoreMsg where EK: KvEngine, { - RaftMessage(InspectedRaftMessage), + RaftMessage(Box), // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. 
@@ -964,3 +960,18 @@ impl StoreMsg { } } } + +#[cfg(test)] +mod tests { + #[test] + fn test_msg_size() { + use std::mem; + + use engine_rocks::RocksEngine; + + use super::*; + + // make sure the msg is small enough + assert_eq!(mem::size_of::>(), 32); + } +} diff --git a/components/raftstore/src/store/snapshot_backup.rs b/components/raftstore/src/store/snapshot_backup.rs index 9168e974fc2..710ac281e8a 100644 --- a/components/raftstore/src/store/snapshot_backup.rs +++ b/components/raftstore/src/store/snapshot_backup.rs @@ -75,7 +75,7 @@ impl SnapshotBrHandle for Arc crate::Result<()> { let msg_gen = || { metrics::SNAP_BR_WAIT_APPLY_EVENT.sent.inc(); - PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply(req.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply(req.clone()))) }; self.lock().unwrap().broadcast_normal(msg_gen); Ok(()) @@ -86,7 +86,7 @@ impl SnapshotBrHandle for Arc, ) -> crate::Result<()> { self.lock().unwrap().broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) }); Ok(()) } diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 2ca19fbe5fe..35761aa5d18 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -78,7 +78,10 @@ where { #[inline] fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { - match self.router.send(region_id, PeerMsg::CasualMessage(msg)) { + match self + .router + .send(region_id, PeerMsg::CasualMessage(Box::new(msg))) + { Ok(()) => Ok(()), Err(TrySendError::Full(_)) => Err(Error::Transport(DiscardReason::Full)), Err(TrySendError::Disconnected(_)) => Err(Error::RegionNotFound(region_id)), @@ -102,7 +105,7 @@ where fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { if let Err(SendError(msg)) = self .router - 
.force_send(region_id, PeerMsg::SignificantMsg(msg)) + .force_send(region_id, PeerMsg::SignificantMsg(Box::new(msg))) { // TODO: panic here once we can detect system is shutting down reliably. diff --git a/components/raftstore/src/store/unsafe_recovery.rs b/components/raftstore/src/store/unsafe_recovery.rs index 4bc84ebe2a7..ab344a26239 100644 --- a/components/raftstore/src/store/unsafe_recovery.rs +++ b/components/raftstore/src/store/unsafe_recovery.rs @@ -80,7 +80,9 @@ impl UnsafeRecoveryHandle for Mutex UnsafeRecoveryHandle for Mutex region_id, ); - let gc_snap = PeerMsg::CasualMessage(CasualMessage::GcSnap { snaps }); + let gc_snap = PeerMsg::CasualMessage(Box::new(CasualMessage::GcSnap { snaps })); match (*self.router).send(region_id, gc_snap) { Ok(()) => Ok(()), Err(TrySendError::Disconnected(_)) if self.router.is_shutdown() => Ok(()), Err(TrySendError::Disconnected(PeerMsg::CasualMessage( - CasualMessage::GcSnap { snaps }, + box CasualMessage::GcSnap { snaps }, ))) => { // The snapshot exists because MsgAppend has been rejected. So the // peer must have been exist. 
But now it's disconnected, so the peer diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 5f54eb8740c..62ccc0418cb 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1699,7 +1699,7 @@ where cb: Callback::None, } }; - if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(Box::new(msg))) { error!("send halfsplit request failed"; "region_id" => region_id, "err" => ?e); } } else if resp.has_merge() { @@ -1895,7 +1895,7 @@ where match resp.await { Ok(Some((region, leader))) => { if leader.get_store_id() != 0 { - let msg = CasualMessage::QueryRegionLeaderResp { region, leader }; + let msg = Box::new(CasualMessage::QueryRegionLeaderResp { region, leader }); if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { error!("send region info message failed"; "region_id" => region_id, "err" => ?e); } @@ -2153,14 +2153,14 @@ where let start_key = split_info.start_key.unwrap(); let end_key = split_info.end_key.unwrap(); let region_id = region.get_id(); - let msg = CasualMessage::HalfSplitRegion { + let msg = Box::new(CasualMessage::HalfSplitRegion { region_epoch: region.get_region_epoch().clone(), start_key: Some(start_key.clone()), end_key: Some(end_key.clone()), policy: pdpb::CheckPolicy::Scan, source: "auto_split", cb: Callback::None, - }; + }); if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { error!("send auto half split request failed"; "region_id" => region_id, diff --git a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs index d72f10e4f43..57716ec0272 100644 --- a/components/snap_recovery/src/services.rs +++ b/components/snap_recovery/src/services.rs @@ -222,9 +222,9 @@ impl RecoveryService { pub fn wait_apply_last(router: RaftRouter, sender: Sender) { let wait_apply = SnapshotBrWaitApplySyncer::new(0, 
sender); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( SnapshotBrWaitApplyRequest::relaxed(wait_apply.clone()), - )) + ))) }); } } diff --git a/components/test_raftstore/src/router.rs b/components/test_raftstore/src/router.rs index 3b6b1e962c3..d6a135c9f9a 100644 --- a/components/test_raftstore/src/router.rs +++ b/components/test_raftstore/src/router.rs @@ -60,7 +60,7 @@ impl CasualRouter for MockRaftStoreRouter { fn send(&self, region_id: u64, msg: CasualMessage) -> RaftStoreResult<()> { let mut senders = self.senders.lock().unwrap(); if let Some(tx) = senders.get_mut(®ion_id) { - tx.try_send(PeerMsg::CasualMessage(msg)) + tx.try_send(PeerMsg::CasualMessage(Box::new(msg))) .map_err(|e| handle_send_error(region_id, e)) } else { Err(RaftStoreError::RegionNotFound(region_id)) @@ -76,7 +76,8 @@ impl SignificantRouter for MockRaftStoreRouter { ) -> RaftStoreResult<()> { let mut senders = self.senders.lock().unwrap(); if let Some(tx) = senders.get_mut(®ion_id) { - tx.force_send(PeerMsg::SignificantMsg(msg)).unwrap(); + tx.force_send(PeerMsg::SignificantMsg(Box::new(msg))) + .unwrap(); Ok(()) } else { error!("failed to send significant msg"; "msg" => ?msg); diff --git a/src/server/server.rs b/src/server/server.rs index 3d6e5659705..9f62fe583cf 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -474,14 +474,18 @@ pub mod test_router { cmd: RaftCommand, ) -> std::result::Result<(), crossbeam::channel::TrySendError>> { - let _ = self.tx.send(Either::Left(PeerMsg::RaftCommand(cmd))); + let _ = self + .tx + .send(Either::Left(PeerMsg::RaftCommand(Box::new(cmd)))); Ok(()) } } impl CasualRouter for TestRaftStoreRouter { fn send(&self, _: u64, msg: CasualMessage) -> RaftStoreResult<()> { - let _ = self.tx.send(Either::Left(PeerMsg::CasualMessage(msg))); + let _ = self + .tx + .send(Either::Left(PeerMsg::CasualMessage(Box::new(msg)))); Ok(()) } } @@ 
-500,7 +504,7 @@ pub mod test_router { impl RaftStoreRouter for TestRaftStoreRouter { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { let _ = self.tx.send(Either::Left(PeerMsg::RaftMessage( - InspectedRaftMessage { heap_size: 0, msg }, + Box::new(InspectedRaftMessage { heap_size: 0, msg }), Some(TiInstant::now()), ))); Ok(()) diff --git a/tests/integrations/raftstore/test_snap_recovery.rs b/tests/integrations/raftstore/test_snap_recovery.rs index 38a7206399f..f3fcec4d8a9 100644 --- a/tests/integrations/raftstore/test_snap_recovery.rs +++ b/tests/integrations/raftstore/test_snap_recovery.rs @@ -45,7 +45,7 @@ fn test_check_pending_admin() { let (tx, mut rx) = futures::channel::mpsc::unbounded(); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) }); futures::executor::block_on(async { let r = rx.next().await; @@ -61,7 +61,7 @@ fn test_check_pending_admin() { let (tx, mut rx) = futures::channel::mpsc::unbounded(); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) }); futures::executor::block_on(async { let r = rx.next().await; @@ -101,9 +101,9 @@ fn test_snap_wait_apply() { let (tx, rx) = oneshot::channel(); let syncer = SnapshotBrWaitApplySyncer::new(1, tx); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( SnapshotBrWaitApplyRequest::relaxed(syncer.clone()), - )) + ))) }); // we expect recv timeout because the leader peer on store 1 cannot finished the @@ -119,9 +119,9 @@ fn test_snap_wait_apply() { let (tx, rx) = oneshot::channel(); let syncer = SnapshotBrWaitApplySyncer::new(1, tx); router.broadcast_normal(|| { - 
PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( SnapshotBrWaitApplyRequest::relaxed(syncer.clone()), - )) + ))) }); drop(syncer); From 66ecd0129a6d68660ad6c464a69e33c87d1f337f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 25 Sep 2024 17:27:24 +0800 Subject: [PATCH 205/220] backup-stream: clean the `pause-guard-gc-safepoint` when unregister the log task (#17317) (#17570) close tikv/tikv#17316 clean `pause-guard-gc-safepoint` when unregister the log task Signed-off-by: ti-chi-bot Signed-off-by: Jianjun Liao Co-authored-by: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Co-authored-by: Jianjun Liao --- components/backup-stream/src/endpoint.rs | 35 ++++++++++++++---------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 1df518094bc..45ad4305abb 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -632,18 +632,7 @@ where ); let task_name = task.info.get_name().to_owned(); - // clean the safepoint created at pause(if there is) - self.pool.spawn( - self.pd_client - .update_service_safe_point( - self.pause_guard_id_for_task(task.info.get_name()), - TimeStamp::zero(), - Duration::new(0, 0), - ) - .map(|r| { - r.map_err(|err| Error::from(err).report("removing safe point for pausing")) - }), - ); + self.clean_pause_guard_id_for_task(&task_name); self.pool.block_on(async move { let task_clone = task.clone(); let run = async move { @@ -686,6 +675,21 @@ where metrics::update_task_status(TaskStatus::Running, &task_name); } + // clean the safepoint created at pause(if there is) + fn clean_pause_guard_id_for_task(&self, task_name: &str) { + self.pool.spawn( + self.pd_client + .update_service_safe_point( + self.pause_guard_id_for_task(task_name), + TimeStamp::zero(), + Duration::new(0, 0), + ) + .map(|r| { + r.map_err(|err| 
Error::from(err).report("removing safe point for pausing")) + }), + ); + } + fn pause_guard_id_for_task(&self, task: &str) -> String { format!("{}-{}-pause-guard", task, self.store_id) } @@ -720,9 +724,10 @@ where } } - pub fn on_unregister(&self, task: &str) -> Option { - let info = self.unload_task(task); - self.remove_metrics_after_unregister(task); + pub fn on_unregister(&self, task_name: &str) -> Option { + let info = self.unload_task(task_name); + self.clean_pause_guard_id_for_task(task_name); + self.remove_metrics_after_unregister(task_name); info } From 50b4b98ddc0d48162f448de09173548c439790f6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 26 Sep 2024 18:51:29 +0800 Subject: [PATCH 206/220] raftkv: fix inaccurate async write duration metric (#17581) (#17591) close tikv/tikv#17579 Fix inaccurate storage async write duration metric, which mistakenly included task wait time in the scheduler worker pool. This occurs because the metric is observed in a future running on the scheduler worker pool, leading to inflated values, especially under load. This can be misleading and cause confusion during troubleshooting. This commit corrects the metric by observing it in the async write callback. 
Signed-off-by: ti-chi-bot Signed-off-by: lucasliang Co-authored-by: Neil Shen Co-authored-by: lucasliang --- src/server/raftkv/mod.rs | 86 +++++++++++++++++++-------------------- src/server/raftkv2/mod.rs | 41 +++++++++---------- 2 files changed, 61 insertions(+), 66 deletions(-) diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 82563666f04..9e6cc711558 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -45,7 +45,7 @@ use raftstore::{ router::{LocalReadRouter, RaftStoreRouter}, store::{ self, util::encode_start_ts_into_flag_data, Callback as StoreCallback, RaftCmdExtraOpts, - ReadCallback, ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, + ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; @@ -55,7 +55,7 @@ use tikv_util::{ future::{paired_future_callback, paired_must_called_future_callback}, time::Instant, }; -use tracker::GLOBAL_TRACKERS; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::metrics::*; @@ -548,6 +548,10 @@ where }); let mut res = match on_write_result::(resp) { Ok(CmdRes::Resp(_)) => { + ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); + ASYNC_REQUESTS_DURATIONS_VEC + .write + .observe(begin_instant.saturating_elapsed_secs()); fail_point!("raftkv_async_write_finish"); Ok(()) } @@ -581,18 +585,9 @@ where tx.notify(res); } rx.inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { return }; - match res { - Ok(()) => { - ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); - ASYNC_REQUESTS_DURATIONS_VEC - .write - .observe(begin_instant.saturating_elapsed_secs()); - } - Err(e) => { - let status_kind = get_status_kind_from_engine_error(e); - ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - } + if let WriteEvent::Finished(Err(e)) = ev { + let status_kind = get_status_kind_from_engine_error(e); + 
ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); } }) } @@ -639,10 +634,39 @@ where let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); cmd.set_requests(vec![req].into()); + let tracker = get_tls_tracker_token(); let store_cb = StoreCallback::read(Box::new(move |resp| { - cb(on_read_result(resp).map_err(Error::into)); + let res = on_read_result(resp).map_err(Error::into); + if res.is_ok() { + let elapse = begin_instant.saturating_elapsed_secs(); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + if tracker.metrics.read_index_propose_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_propose_wait + .observe( + tracker.metrics.read_index_propose_wait_nanos as f64 + / 1_000_000_000.0, + ); + // snapshot may be handled by lease read in raftstore + if tracker.metrics.read_index_confirm_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_confirm + .observe( + tracker.metrics.read_index_confirm_wait_nanos as f64 + / 1_000_000_000.0, + ); + } + } else if tracker.metrics.local_read { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_local_read + .observe(elapse); + } + }); + ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); + ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); + } + cb(res); })); - let tracker = store_cb.read_tracker().unwrap(); if res.is_ok() { res = self @@ -673,35 +697,7 @@ where }; Err(e) } - Ok(CmdRes::Snap(s)) => { - let elapse = begin_instant.saturating_elapsed_secs(); - GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - if tracker.metrics.read_index_propose_wait_nanos > 0 { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot_read_index_propose_wait - .observe( - tracker.metrics.read_index_propose_wait_nanos as f64 - / 1_000_000_000.0, - ); - // snapshot may be handled by lease read in raftstore - if tracker.metrics.read_index_confirm_wait_nanos > 0 { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot_read_index_confirm - .observe( - tracker.metrics.read_index_confirm_wait_nanos as f64 - / 1_000_000_000.0, 
- ); - } - } else if tracker.metrics.local_read { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot_local_read - .observe(elapse); - } - }); - ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); - ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); - Ok(s) - } + Ok(CmdRes::Snap(s)) => Ok(s), Err(e) => { let status_kind = get_status_kind_from_engine_error(&e); ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index a9f7eb7586e..64c5a1a3ca1 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -232,6 +232,10 @@ impl tikv_kv::Engine for RaftKv2 { .observe(elapse); } }); + // The observed snapshot duration is larger than the actual + // snapshot duration, because it includes the waiting time + // of this future. + // TODO: Fix the inaccuracy, see #17581. ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); Ok(snap) @@ -305,16 +309,20 @@ impl tikv_kv::Engine for RaftKv2 { if WriteEvent::subscribed_committed(subscribed) { builder.subscribe_committed(); } - if let Some(cb) = on_applied { - builder.before_set(move |resp| { - let mut res = if !resp.get_header().has_error() { - Ok(()) - } else { - Err(tikv_kv::Error::from(resp.get_header().get_error().clone())) - }; + builder.before_set(move |resp| { + let mut res = if !resp.get_header().has_error() { + ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); + ASYNC_REQUESTS_DURATIONS_VEC + .write + .observe(begin_instant.saturating_elapsed_secs()); + Ok(()) + } else { + Err(tikv_kv::Error::from(resp.get_header().get_error().clone())) + }; + if let Some(cb) = on_applied { cb(&mut res); - }); - } + } + }); let (ch, sub) = builder.build(); let res = if inject_region_not_found { ch.report_error(cmd_resp::new_error(Error::RegionNotFound(region_id))); @@ -340,18 +348,9 @@ impl tikv_kv::Engine for RaftKv2 { early_err: res.err(), }) .inspect(move |ev| { - let WriteEvent::Finished(res) = ev else { 
return }; - match res { - Ok(()) => { - ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); - ASYNC_REQUESTS_DURATIONS_VEC - .write - .observe(begin_instant.saturating_elapsed_secs()); - } - Err(e) => { - let status_kind = get_status_kind_from_engine_error(e); - ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - } + if let WriteEvent::Finished(Err(e)) = ev { + let status_kind = get_status_kind_from_engine_error(e); + ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); } }) } From 816595fb8d7310fd297a56bb9dca3ea5fddeb574 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 27 Sep 2024 14:57:33 +0800 Subject: [PATCH 207/220] br: pre-check TiKV disk space before download (#17238) (#17569) close tikv/tikv#17224 Add a disk usage check when execute `download` and `apply` RPC from br. When the disk is not `Normal`, the request would be rejected. Signed-off-by: ti-chi-bot Signed-off-by: hillium Co-authored-by: ris <79858083+RidRisR@users.noreply.github.com> Co-authored-by: hillium --- components/error_code/src/sst_importer.rs | 3 +- components/sst_importer/src/errors.rs | 4 ++ src/import/sst_service.rs | 14 ++++++- tests/failpoints/cases/test_import_service.rs | 41 ++++++++++++++++++- tests/integrations/import/test_apply_log.rs | 29 +++++++++++++ 5 files changed, 87 insertions(+), 4 deletions(-) diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 9e568ee00c1..b092796d467 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -27,5 +27,6 @@ define_error_codes!( "this request has been suspended.", "Probably there are some export tools don't support exporting data inserted by `ingest`(say, snapshot backup). 
Check the user manual and stop them."), REQUEST_TOO_NEW => ("RequestTooNew", "", ""), - REQUEST_TOO_OLD => ("RequestTooOld", "", "") + REQUEST_TOO_OLD => ("RequestTooOld", "", ""), + DISK_SPACE_NOT_ENOUGH => ("DiskSpaceNotEnough", "", "") ); diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index e5e235e9761..c79bd2db9f8 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -136,6 +136,9 @@ pub enum Error { #[error("imports are suspended for {time_to_lease_expire:?}")] Suspended { time_to_lease_expire: Duration }, + + #[error("TiKV disk space is not enough.")] + DiskSpaceNotEnough, } impl Error { @@ -222,6 +225,7 @@ impl ErrorCodeExt for Error { Error::Suspended { .. } => error_code::sst_importer::SUSPENDED, Error::RequestTooNew(_) => error_code::sst_importer::REQUEST_TOO_NEW, Error::RequestTooOld(_) => error_code::sst_importer::REQUEST_TOO_OLD, + Error::DiskSpaceNotEnough => error_code::sst_importer::DISK_SPACE_NOT_ENOUGH, } } } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 4d87f249492..81c947847d4 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -46,7 +46,10 @@ use tikv_kv::{ use tikv_util::{ config::ReadableSize, future::{create_stream_with_buffer, paired_future_callback}, - sys::thread::ThreadBuildWrapper, + sys::{ + disk::{get_disk_status, DiskUsage}, + thread::ThreadBuildWrapper, + }, time::{Instant, Limiter}, HandyRwLock, }; @@ -1032,6 +1035,10 @@ impl ImportSst for ImportSstService { .observe(start.saturating_elapsed().as_secs_f64()); let mut resp = ApplyResponse::default(); + if get_disk_status(0) != DiskUsage::Normal { + resp.set_error(Error::DiskSpaceNotEnough.into()); + return crate::send_rpc_response!(Ok(resp), sink, label, start); + } match Self::apply_imp(req, importer, applier, limiter, max_raft_size).await { Ok(Some(r)) => resp.set_range(r), @@ -1073,6 +1080,11 @@ impl ImportSst for ImportSstService { 
sst_importer::metrics::IMPORTER_DOWNLOAD_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); + if get_disk_status(0) != DiskUsage::Normal { + let mut resp = DownloadResponse::default(); + resp.set_error(Error::DiskSpaceNotEnough.into()); + return crate::send_rpc_response!(Ok(resp), sink, label, timer); + } // FIXME: download() should be an async fn, to allow BR to cancel // a download task. diff --git a/tests/failpoints/cases/test_import_service.rs b/tests/failpoints/cases/test_import_service.rs index e51c9862e47..9aa0ad5af20 100644 --- a/tests/failpoints/cases/test_import_service.rs +++ b/tests/failpoints/cases/test_import_service.rs @@ -8,12 +8,12 @@ use std::{ use file_system::calc_crc32; use futures::{executor::block_on, stream, SinkExt}; use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; -use kvproto::{import_sstpb::*, tikvpb_grpc::TikvClient}; +use kvproto::{disk_usage::DiskUsage, import_sstpb::*, tikvpb_grpc::TikvClient}; use tempfile::{Builder, TempDir}; use test_raftstore::{must_raw_put, Simulator}; use test_sst_importer::*; use tikv::config::TikvConfig; -use tikv_util::{config::ReadableSize, HandyRwLock}; +use tikv_util::{config::ReadableSize, sys::disk, HandyRwLock}; #[allow(dead_code)] #[path = "../../integrations/import/util.rs"] @@ -90,6 +90,43 @@ fn upload_sst(import: &ImportSstClient, meta: &SstMeta, data: &[u8]) -> Result Date: Sun, 29 Sep 2024 10:51:10 +0800 Subject: [PATCH 208/220] resource_control: add metrics for priority resource limiter (#17590) (#17598) close tikv/tikv#17589 Add some metrics for resource control priority resource limiter. Also adjust the build parameters of QuotaLimiter in resource control module to avoid triggering wait too frequently. 
Signed-off-by: ti-chi-bot Signed-off-by: glorv Co-authored-by: glorv --- Cargo.lock | 9 ++-- components/resource_control/src/lib.rs | 1 + components/resource_control/src/metrics.rs | 20 +++++++++ .../resource_control/src/resource_limiter.rs | 39 ++++++++++++---- components/resource_control/src/worker.rs | 45 ++++++++++++------- components/tikv_util/Cargo.toml | 3 +- 6 files changed, 86 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb4ae399a30..4b36f38d4b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,9 +168,8 @@ dependencies = [ [[package]] name = "async-speed-limit" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "481ce9cb6a828f4679495f7376cb6779978d925dd9790b99b48d1bbde6d0f00b" +version = "0.4.1" +source = "git+https://github.com/tikv/async-speed-limit?branch=master#a113aef3cc24bf7fa5faf2b7025abaf02fc53fe3" dependencies = [ "futures-core", "futures-io", @@ -2187,9 +2186,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.15" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-lite" diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index a7b4cf03192..53db62c96e1 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -2,6 +2,7 @@ #![feature(test)] #![feature(local_key_cell_methods)] #![feature(array_zip)] +#![feature(let_chains)] use std::sync::Arc; diff --git a/components/resource_control/src/metrics.rs b/components/resource_control/src/metrics.rs index 45723063492..594c6af486a 100644 --- a/components/resource_control/src/metrics.rs +++ b/components/resource_control/src/metrics.rs @@ -28,6 +28,26 @@ lazy_static! 
{ &["priority"] ) .unwrap(); + pub static ref PRIORITY_CPU_TIME_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_resource_control_priority_task_exec_duration", + "Total execution duration of tasks per-priority", + &["priority"] + ) + .unwrap(); + pub static ref PRIORITY_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!( + "tikv_resource_control_priority_wait_duration", + "Histogram of wait duration cause by priority quota limiter", + &["priority"], + exponential_buckets(1e-5, 2.0, 18).unwrap() // 10us ~ 2.5s + ) + .unwrap(); + + pub static ref BACKGROUND_TASK_RESOURCE_UTILIZATION_VEC: IntGaugeVec = register_int_gauge_vec!( + "tikv_resource_control_bg_resource_utilization", + "The total resource utilization percentage of background tasks", + &["type"] + ) + .unwrap(); } pub fn deregister_metrics(name: &str) { diff --git a/components/resource_control/src/resource_limiter.rs b/components/resource_control/src/resource_limiter.rs index ab2144f18cc..4cc139152dd 100644 --- a/components/resource_control/src/resource_limiter.rs +++ b/components/resource_control/src/resource_limiter.rs @@ -8,10 +8,11 @@ use std::{ use file_system::IoBytes; use futures::compat::Future01CompatExt; +use prometheus::Histogram; use strum::EnumCount; use tikv_util::{time::Limiter, timer::GLOBAL_TIMER_HANDLE}; -use crate::metrics::BACKGROUND_TASKS_WAIT_DURATION; +use crate::{metrics::PRIORITY_WAIT_DURATION_VEC, resource_group::TaskPriority}; #[derive(Clone, Copy, Eq, PartialEq, EnumCount)] #[repr(usize)] @@ -36,11 +37,13 @@ impl fmt::Debug for ResourceType { } pub struct ResourceLimiter { - name: String, + _name: String, version: u64, limiters: [QuotaLimiter; ResourceType::COUNT], // whether the resource limiter is a background limiter or priority limiter. is_background: bool, + // the wait duration histogram for prioitry limiter. 
+ wait_histogram: Option, } impl std::fmt::Debug for ResourceLimiter { @@ -59,11 +62,23 @@ impl ResourceLimiter { ) -> Self { let cpu_limiter = QuotaLimiter::new(cpu_limit); let io_limiter = QuotaLimiter::new(io_limit); + // high priority tasks does not triggers wait, so no need to generate an empty + // metrics. + let wait_histogram = if !is_background && name != TaskPriority::High.as_str() { + Some( + PRIORITY_WAIT_DURATION_VEC + .get_metric_with_label_values(&[&name]) + .unwrap(), + ) + } else { + None + }; Self { - name, + _name: name, version, limiters: [cpu_limiter, io_limiter], is_background, + wait_histogram, } } @@ -76,12 +91,11 @@ impl ResourceLimiter { self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64, wait); let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes, wait); let wait_dur = cpu_dur.max(io_dur); - if wait_dur > Duration::ZERO { - BACKGROUND_TASKS_WAIT_DURATION - .with_label_values(&[&self.name]) - .inc_by(wait_dur.as_micros() as u64); + if !wait_dur.is_zero() + && let Some(h) = &self.wait_histogram + { + h.observe(wait_dur.as_secs_f64()); } - wait_dur } @@ -127,7 +141,14 @@ pub(crate) struct QuotaLimiter { impl QuotaLimiter { fn new(limit: f64) -> Self { Self { - limiter: Limiter::new(limit), + // we use 1s refill and 1ms min_wait duration to avoid trigger + // wait too frequently or waiting too long. + // NOTE: the parameter `refill` mainly impact the capacity + // of token bucket but not refill interval. 
+ limiter: Limiter::builder(limit) + .refill(Duration::from_millis(1000)) + .min_wait(Duration::from_millis(1)) + .build(), total_wait_dur_us: AtomicU64::new(0), read_bytes: AtomicU64::new(0), write_bytes: AtomicU64::new(0), diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs index 1dbcd9ffaf0..007b2e27c47 100644 --- a/components/resource_control/src/worker.rs +++ b/components/resource_control/src/worker.rs @@ -206,6 +206,12 @@ impl GroupQuotaAdjustWorker { BACKGROUND_RESOURCE_CONSUMPTION .with_label_values(&[&g.name, resource_type.as_str()]) .inc_by(stats_delta.total_consumed); + if resource_type == ResourceType::Cpu { + BACKGROUND_TASKS_WAIT_DURATION + .with_label_values(&[&g.name]) + .inc_by(stats_delta.total_wait_dur_us); + } + let stats_per_sec = stats_delta / dur_secs; background_consumed_total += stats_per_sec.total_consumed as f64; g.stats_per_sec = stats_per_sec; @@ -513,16 +519,21 @@ impl PriorityLimiterStatsTracker { fn get_and_update_last_stats(&mut self, dur_secs: f64) -> LimiterStats { let cur_stats = self.limiter.get_limit_statistics(ResourceType::Cpu); - let stats_delta = (cur_stats - self.last_stats) / dur_secs; + let stats_delta = cur_stats - self.last_stats; self.last_stats = cur_stats; + PRIORITY_CPU_TIME_VEC + .with_label_values(&[self.priority]) + .inc_by(stats_delta.total_consumed); + let stats_per_sec = stats_delta / dur_secs; + let wait_stats: [_; 2] = std::array::from_fn(|i| self.task_wait_dur_trakcers[i].get_and_upate_statistics()); let schedule_wait_dur_secs = wait_stats.iter().map(|s| s.0).sum::() / dur_secs; LimiterStats { - cpu_secs: stats_delta.total_consumed as f64 / MICROS_PER_SEC, - wait_secs: stats_delta.total_wait_dur_us as f64 / MICROS_PER_SEC + cpu_secs: stats_per_sec.total_consumed as f64 / MICROS_PER_SEC, + wait_secs: stats_per_sec.total_wait_dur_us as f64 / MICROS_PER_SEC + schedule_wait_dur_secs, - req_count: stats_delta.request_count, + req_count: stats_per_sec.request_count, 
} } } @@ -629,6 +640,7 @@ mod tests { worker.last_adjust_time = now - dur; }; + #[track_caller] fn check(val: f64, expected: f64) { assert!( expected * 0.99 < val && val < expected * 1.01, @@ -638,6 +650,7 @@ mod tests { ); } + #[track_caller] fn check_limiter(limiter: &Arc, cpu: f64, io: IoBytes) { check( limiter.get_limiter(ResourceType::Cpu).get_rate_limit(), @@ -804,18 +817,18 @@ mod tests { worker.adjust_quota(); check_limiter( &limiter, - 2.4, + 1.2, IoBytes { - read: 1400, - write: 1400, + read: 1800, + write: 1800, }, ); check_limiter( &bg_limiter, - 1.6, + 2.8, IoBytes { - read: 1800, - write: 1800, + read: 1400, + write: 1400, }, ); @@ -882,18 +895,18 @@ mod tests { worker.adjust_quota(); check_limiter( &limiter, - 2.4, + 2.2, IoBytes { - read: 1400, - write: 1400, + read: 2133, + write: 2133, }, ); check_limiter( &new_bg_limiter, - 1.6, + 1.8, IoBytes { - read: 1800, - write: 1800, + read: 1066, + write: 1066, }, ); } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 6279dd8f5f5..82764047d0c 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -9,7 +9,8 @@ failpoints = ["fail/failpoints"] test-cgroup = [] [dependencies] -async-speed-limit = "0.4.0" +# TODO: use `async-speed-limit` in crates.io after new version(v0.4.2) is released. 
+async-speed-limit = { git = "https://github.com/tikv/async-speed-limit", branch = "master" } backtrace = "0.3.9" byteorder = "1.2" bytes = "1.0" From b4bddeeb995e7bedc1973ce9e856eeb2d856ce9b Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 15 Oct 2024 14:24:40 +0800 Subject: [PATCH 209/220] build: bump tikv pkg version (#17653) Signed-off-by: ti-chi-bot --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b36f38d4b4..e674f80d331 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6541,7 +6541,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.5.3" +version = "7.5.4" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 9a2ae00f435..c4db583fb8e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.5.3" +version = "7.5.4" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 5130f1ac1d8437f538232799fc3ac4e8eaabc286 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 18 Oct 2024 16:12:58 +0800 Subject: [PATCH 210/220] OWNERS: Auto Sync OWNERS files from community membership (#17659) Signed-off-by: Ti Chi Robot --- OWNERS | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/OWNERS b/OWNERS index b0e73247005..f70f2cbf0ea 100644 --- a/OWNERS +++ b/OWNERS @@ -10,6 +10,7 @@ approvers: - cfzjywxk - Connor1996 - coocood + - crazycs520 - disksing - ekexium - gengliqi @@ -47,7 +48,9 @@ approvers: - zhangjinpeng87 - zhongzc - zhouqiang-cl + - zyguan reviewers: + - 3AceShowHand - 3pointer - CalvinNeo - ethercflow @@ -55,7 +58,7 @@ reviewers: - Fullstop000 - gozssky - haojinming - - hi-rustin + - hbisheng - HuSharp - jayzhan211 - Jibbow @@ -68,9 +71,9 @@ reviewers: - MrCroxx - nolouch - rleungx + - Rustin170506 - tier-cap - v01dstar - wjhuang2016 - wshwsh12 - Xuanwo - - zyguan From 
e36cdcf039172a8587118694982a5618211ccc27 Mon Sep 17 00:00:00 2001 From: qupeng Date: Wed, 30 Oct 2024 11:40:42 +0800 Subject: [PATCH 211/220] cdc: filter events with the observed range before load old values (#17656) close tikv/tikv#16601, close tikv/tikv#17620 cdc: filter events with the observed range before load old values Signed-off-by: qupeng --- components/cdc/src/delegate.rs | 7 -- components/cdc/src/endpoint.rs | 18 ++- components/cdc/src/initializer.rs | 119 +++++++++++------- .../cdc/tests/failpoints/test_endpoint.rs | 95 +++++++++++++- 4 files changed, 184 insertions(+), 55 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 60f3ccde938..70d6835bbc3 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -578,7 +578,6 @@ impl Delegate { request_id: u64, entries: Vec>, filter_loop: bool, - observed_range: &ObservedRange, ) -> Result> { let entries_len = entries.len(); let mut rows = vec![Vec::with_capacity(entries_len)]; @@ -596,9 +595,6 @@ impl Delegate { lock, old_value, })) => { - if !observed_range.contains_encoded_key(&lock.0) { - continue; - } let l = Lock::parse(&lock.1).unwrap(); if decode_lock(lock.0, l, &mut row, &mut _has_value) { continue; @@ -612,9 +608,6 @@ impl Delegate { write, old_value, })) => { - if !observed_range.contains_encoded_key(&write.0) { - continue; - } if decode_write(write.0, &write.1, &mut row, &mut _has_value, false) { continue; } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 319153b8c3d..1b9f5bb1a52 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -59,7 +59,7 @@ use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ channel::{CdcEvent, SendError}, delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, - initializer::Initializer, + initializer::{InitializeStats, Initializer}, metrics::*, old_value::{OldValueCache, OldValueCallback}, 
service::{validate_kv_api, Conn, ConnId, FeatureGate}, @@ -160,6 +160,7 @@ type InitCallback = Box; pub enum Validate { Region(u64, Box) + Send>), OldValueCache(Box), + InitializeStats(Box), } pub enum Task { @@ -287,6 +288,7 @@ impl fmt::Debug for Task { Task::Validate(validate) => match validate { Validate::Region(region_id, _) => de.field("region_id", ®ion_id).finish(), Validate::OldValueCache(_) => de.finish(), + Validate::InitializeStats(_) => de.finish(), }, Task::ChangeConfig(change) => de .field("type", &"change_config") @@ -402,6 +404,9 @@ pub struct Endpoint { resolved_region_count: usize, unresolved_region_count: usize, warn_resolved_ts_repeat_count: usize, + + // Validate statistics of the next incremental scan. Only for tests. + validate_next_initialize_stats: Option>, } impl, E: KvEngine, S: StoreRegionMeta> Endpoint { @@ -505,6 +510,8 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { + Ok(stats) => { CDC_SCAN_TASKS.with_label_values(&["finish"]).inc(); + if let Some(validate) = validate_initialize_stats { + validate(stats); + } } Err(e) => { CDC_SCAN_TASKS.with_label_values(&["abort"]).inc(); @@ -1319,6 +1330,9 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable Validate::OldValueCache(validate) => { validate(&self.old_value_cache); } + Validate::InitializeStats(validate) => { + self.validate_next_initialize_stats = Some(validate); + } }, Task::ChangeConfig(change) => self.on_change_cfg(change), } diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index f0b385c6009..7222f51ecf1 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -59,15 +59,6 @@ use crate::{ Error, Result, Task, }; -struct ScanStat { - // Fetched bytes to the scanner. - emit: usize, - // Bytes from the device, `None` if not possible to get it. - disk_read: Option, - // Perf delta for RocksDB. 
- perf_delta: ReadPerfContext, -} - pub(crate) enum KvEntry { TxnEntry(TxnEntry), RawKvEntry(KvPair), @@ -115,7 +106,7 @@ impl Initializer { cdc_handle: T, concurrency_semaphore: Arc, memory_quota: Arc, - ) -> Result<()> { + ) -> Result { fail_point!("cdc_before_initialize"); let _permit = concurrency_semaphore.acquire().await; @@ -170,7 +161,7 @@ impl Initializer { &mut self, mut resp: ReadResponse, memory_quota: Arc, - ) -> Result<()> { + ) -> Result { if let Some(region_snapshot) = resp.snapshot { let region = region_snapshot.get_region().clone(); assert_eq!(self.region_id, region.get_id()); @@ -192,7 +183,7 @@ impl Initializer { snap: S, region: Region, memory_quota: Arc, - ) -> Result<()> { + ) -> Result { CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); defer!(CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec()); @@ -201,7 +192,7 @@ impl Initializer { let observe_id = self.observe_id; let conn_id = self.conn_id; let kv_api = self.kv_api; - let on_cancel = || -> Result<()> { + let on_cancel = || -> Result { info!("cdc async incremental scan canceled"; "region_id" => region_id, "downstream_id" => ?downstream_id, @@ -272,6 +263,7 @@ impl Initializer { DownstreamState::Initializing | DownstreamState::Stopped )); + let mut stats = InitializeStats::default(); while !done { // When downstream_state is Stopped, it means the corresponding // delegate is stopped. The initialization can be safely canceled. @@ -280,7 +272,9 @@ impl Initializer { } let cursors = old_value_cursors.as_mut(); let resolver = resolver.as_mut(); - let entries = self.scan_batch(&mut scanner, cursors, resolver).await?; + let entries = self + .scan_batch(&mut scanner, cursors, resolver, &mut stats) + .await?; if let Some(None) = entries.last() { // If the last element is None, it means scanning is finished. 
done = true; @@ -310,7 +304,7 @@ impl Initializer { CDC_SCAN_DURATION_HISTOGRAM.observe(takes.as_secs_f64()); CDC_SCAN_SINK_DURATION_HISTOGRAM.observe(duration_to_sec(sink_time)); - Ok(()) + Ok(stats) } // It's extracted from `Initializer::scan_batch` to avoid becoming an @@ -321,7 +315,7 @@ impl Initializer { scanner: &mut Scanner, mut old_value_cursors: Option<&mut OldValueCursors>, entries: &mut Vec>, - ) -> Result { + ) -> Result { let mut read_old_value = |v: &mut OldValue, stats: &mut Statistics| -> Result<()> { let (wc, dc) = match old_value_cursors { Some(ref mut x) => (&mut x.write, &mut x.default), @@ -336,21 +330,28 @@ impl Initializer { Ok(()) }; + let mut stats = InitializeStats::default(); + // This code block shouldn't be switched to other threads. let mut total_bytes = 0; let mut total_size = 0; let perf_instant = ReadPerfInstant::new(); let inspector = self_thread_inspector().ok(); let old_io_stat = inspector.as_ref().and_then(|x| x.io_stat().unwrap_or(None)); - let mut stats = Statistics::default(); while total_bytes <= self.max_scan_batch_bytes && total_size < self.max_scan_batch_size { total_size += 1; match scanner { Scanner::TxnKvScanner(scanner) => match scanner.next_entry()? { Some(mut entry) => { - read_old_value(entry.old_value(), &mut stats)?; - total_bytes += entry.size(); - entries.push(Some(KvEntry::TxnEntry(entry))); + let key = match entry { + TxnEntry::Prewrite { ref lock, .. } => &lock.0, + TxnEntry::Commit { ref write, .. 
} => &write.0, + }; + if self.observed_range.contains_encoded_key(key) { + read_old_value(entry.old_value(), &mut stats.old_value)?; + total_bytes += entry.size(); + entries.push(Some(KvEntry::TxnEntry(entry))); + } } None => { entries.push(None); @@ -374,19 +375,17 @@ impl Initializer { } } } - flush_oldvalue_stats(&stats, TAG_INCREMENTAL_SCAN); + flush_oldvalue_stats(&stats.old_value, TAG_INCREMENTAL_SCAN); let new_io_stat = inspector.as_ref().and_then(|x| x.io_stat().unwrap_or(None)); - let disk_read = match (old_io_stat, new_io_stat) { + + stats.scan.emit = total_bytes; + stats.scan.disk_read = match (old_io_stat, new_io_stat) { (Some(s1), Some(s2)) => Some((s2.read - s1.read) as usize), _ => None, }; - let perf_delta = perf_instant.delta(); - let emit = total_bytes; - Ok(ScanStat { - emit, - disk_read, - perf_delta, - }) + stats.scan.perf_delta = perf_instant.delta(); + + Ok(stats) } async fn scan_batch( @@ -394,22 +393,22 @@ impl Initializer { scanner: &mut Scanner, old_value_cursors: Option<&mut OldValueCursors>, resolver: Option<&mut Resolver>, + stats: &mut InitializeStats, ) -> Result>> { let mut entries = Vec::with_capacity(self.max_scan_batch_size); - let ScanStat { - emit, - disk_read, - perf_delta, - } = self.do_scan(scanner, old_value_cursors, &mut entries)?; + let delta_stats = self.do_scan(scanner, old_value_cursors, &mut entries)?; + stats.add(&delta_stats); - TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += perf_delta); + TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += delta_stats.scan.perf_delta); tls_flush_perf_stats(); - if let Some(bytes) = disk_read { + if let Some(bytes) = delta_stats.scan.disk_read { CDC_SCAN_DISK_READ_BYTES.inc_by(bytes as _); self.scan_speed_limiter.consume(bytes).await; } - CDC_SCAN_BYTES.inc_by(emit as _); - self.fetch_speed_limiter.consume(emit as _).await; + CDC_SCAN_BYTES.inc_by(delta_stats.scan.emit as _); + self.fetch_speed_limiter + .consume(delta_stats.scan.emit as _) + .await; if let Some(resolver) = resolver { // 
Track the locks. @@ -437,7 +436,6 @@ impl Initializer { self.request_id, entries, self.filter_loop, - &self.observed_range, )?; if done { let (cb, fut) = tikv_util::future::paired_future_callback(); @@ -515,6 +513,8 @@ impl Initializer { } fn ts_filter_is_helpful(&self, snap: &S) -> bool { + fail_point!("ts_filter_is_helpful_always_true", |_| true); + if self.ts_filter_ratio < f64::EPSILON { return false; } @@ -566,6 +566,33 @@ impl Initializer { } } +#[derive(Default, Debug)] +pub struct InitializeStats { + pub old_value: Statistics, + pub scan: ScanStats, +} + +#[derive(Default, Debug)] +pub struct ScanStats { + // Fetched bytes to the scanner. + emit: usize, + // Bytes from the device, `None` if not possible to get it. + disk_read: Option, + // Perf delta for RocksDB. + perf_delta: ReadPerfContext, +} + +impl InitializeStats { + fn add(&mut self, other: &InitializeStats) { + self.old_value.add(&other.old_value); + self.scan.emit += other.scan.emit; + if let Some(x) = self.scan.disk_read.as_mut() { + *x += other.scan.disk_read.unwrap_or_default(); + } + self.scan.perf_delta += other.scan.perf_delta; + } +} + #[cfg(test)] mod tests { use std::{ @@ -714,12 +741,14 @@ mod tests { total_bytes += v.len(); let ts = TimeStamp::new(i as _); must_prewrite_put(&mut engine, k, v, k, ts); - let txn_locks = expected_locks.entry(ts).or_insert_with(|| { - let mut txn_locks = TxnLocks::default(); - txn_locks.sample_lock = Some(k.to_vec().into()); - txn_locks - }); - txn_locks.lock_count += 1; + if i < 90 { + let txn_locks = expected_locks.entry(ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(k.to_vec().into()); + txn_locks + }); + txn_locks.lock_count += 1; + } } let region = Region::default(); diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 2ca7c18a22e..fae542a45d0 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ 
b/components/cdc/tests/failpoints/test_endpoint.rs @@ -9,12 +9,16 @@ use std::{ use api_version::{test_kv_format_impl, KvFormat}; use causal_ts::CausalTsProvider; use cdc::{recv_timeout, Delegate, OldValueCache, Task, Validate}; +use engine_traits::{ + IterOptions, Iterable, Iterator, MiscExt, Mutable, WriteBatch, WriteBatchExt, WriteOptions, + CF_DEFAULT, CF_WRITE, +}; use futures::{executor::block_on, sink::SinkExt}; use grpcio::{ChannelBuilder, Environment, WriteFlags}; use kvproto::{cdcpb::*, kvrpcpb::*, tikvpb_grpc::TikvClient}; use pd_client::PdClient; use test_raftstore::*; -use tikv_util::{debug, worker::Scheduler, HandyRwLock}; +use tikv_util::{debug, keybuilder::KeyBuilder, worker::Scheduler, HandyRwLock}; use txn_types::{Key, TimeStamp}; use crate::{new_event_feed, new_event_feed_v2, ClientReceiver, TestSuite, TestSuiteBuilder}; @@ -656,3 +660,92 @@ fn test_delegate_fail_during_incremental_scan() { recv_timeout(&mut recver, Duration::from_secs(1)).unwrap_err(); recv.replace(Some(recver)); } + +#[test] +fn test_cdc_load_unnecessary_old_value() { + let mut suite = TestSuite::new(1, ApiVersion::V1); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let engine = suite.cluster.get_engine(1); + + let start_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let pk = format!("key_{:05}", 0).into_bytes(); + let mut mutations = Vec::with_capacity(1000); + let mut keys = Vec::with_capacity(1000); + for i in 0..1000 { + let key = format!("key_{:05}", i).into_bytes(); + keys.push(key.clone()); + + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = key; + mutation.value = vec![b'x'; 16]; + mutations.push(mutation); + } + suite.must_kv_prewrite(rid, mutations, pk, start_tso); + + let commit_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(rid, keys, start_tso, commit_tso); + engine.flush_cf(CF_WRITE, true).unwrap(); + + for cf in &[CF_WRITE, CF_DEFAULT] { + let mut wb = 
suite.cluster.get_engine(1).write_batch(); + let mut count = 0; + + let start = KeyBuilder::from_vec(vec![b'z'], 0, 0); + let end = KeyBuilder::from_vec(vec![b'z' + 1], 0, 0); + let iter_opts = IterOptions::new(Some(start), Some(end), false); + let mut iter = engine.iterator_opt(cf, iter_opts).unwrap(); + let mut valid = iter.seek_to_first().unwrap(); + + // skip some keys. + while valid && count < 2 { + count += 1; + valid = iter.next().unwrap(); + } + while valid { + count += 1; + let key = iter.key(); + wb.delete_cf(cf, key).unwrap(); + valid = iter.next().unwrap(); + } + assert!(count == 0 || count == 1000); + wb.write_opt(&WriteOptions::default()).unwrap(); + engine.flush_cf(cf, true).unwrap(); + } + + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = std::sync::mpsc::sync_channel(1); + scheduler + .schedule(Task::Validate(Validate::InitializeStats(Box::new( + move |stats| tx.send(stats).unwrap(), + )))) + .unwrap(); + + fail::cfg("ts_filter_is_helpful_always_true", "return(0)").unwrap(); + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + let mut req = suite.new_changedata_request(rid); + req.request_id = 100; + req.checkpoint_ts = commit_tso.into_inner() - 1; + req.set_start_key(Key::from_raw(b"aa").into_encoded()); + req.set_end_key(Key::from_raw(b"ab").into_encoded()); + block_on(req_tx.send((req.clone(), WriteFlags::default()))).unwrap(); + + let events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events[0].event.as_ref().unwrap() { + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1); + assert_eq!(es.entries[0].get_type(), EventLogType::Initialized); + } + _ => unreachable!(), + } + + let stats = rx.recv().unwrap().old_value.write; + assert_eq!(stats.seek_tombstone, 0); + assert_eq!(stats.next_tombstone, 0); + assert_eq!(stats.prev_tombstone, 0); + + fail::remove("ts_filter_is_helpful_always_true"); + 
suite.stop(); +} From e3951c7d314584535ba7649ffa61b4079dd32c1f Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Wed, 13 Nov 2024 23:30:38 -0800 Subject: [PATCH 212/220] RocksDB: Use rust-rocksdb 7.5 for TiKV 7.5 (#17779) close tikv/tikv#17808 Use rust-rocksdb tikv-7.5 for 7.5 release Signed-off-by: Yang Zhang --- Cargo.lock | 6 +++--- components/engine_rocks/Cargo.toml | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e674f80d331..61d98a761b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2997,7 +2997,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c92c467a3ab0b60484a0db83fcf89366791716cd" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-7.5#268d20d61b8bf097f064a87a9af3cb91725ff179" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3016,7 +3016,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#c92c467a3ab0b60484a0db83fcf89366791716cd" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-7.5#268d20d61b8bf097f064a87a9af3cb91725ff179" dependencies = [ "bzip2-sys", "cc", @@ -4937,7 +4937,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c92c467a3ab0b60484a0db83fcf89366791716cd" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-7.5#268d20d61b8bf097f064a87a9af3cb91725ff179" dependencies = [ "libc 0.2.146", "librocksdb_sys", diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 1d275b788c2..8fcdf6eb023 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -60,6 +60,7 @@ txn_types = { workspace = true } git = "https://github.com/tikv/rust-rocksdb.git" package = "rocksdb" features = ["encryption"] +branch = "tikv-7.5" [dev-dependencies] rand = "0.8" From 
c375646f3ba1f16cd56146c7dbfc0726dcae53dd Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 15 Nov 2024 10:26:38 +0800 Subject: [PATCH 213/220] chore: Fix yanked futures-util 0.3.15 (#17611) (#17826) close tikv/tikv#17689 Fixing yanked futures-util 0.3.15 Signed-off-by: ti-chi-bot Signed-off-by: glorv Co-authored-by: Yang Zhang Co-authored-by: glorv --- Cargo.lock | 79 +++++++++-------------- components/resource_control/src/future.rs | 2 +- 2 files changed, 32 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 61d98a761b5..3422fe244d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -221,13 +221,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.58" +version = "0.1.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c" +checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] @@ -705,7 +705,7 @@ dependencies = [ "regex", "rustc-hash", "shlex 1.1.0", - "syn 2.0.18", + "syn 2.0.79", ] [[package]] @@ -2158,9 +2158,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2168,9 +2168,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" @@ -2186,9 +2186,9 @@ dependencies = [ [[package]] name = "futures-io" -version = 
"0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" @@ -2207,28 +2207,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ - "autocfg", - "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] name = "futures-sink" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -2238,11 +2236,10 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ - "autocfg", "futures 0.1.31", "futures-channel", "futures-core", @@ -2253,8 +2250,6 @@ dependencies = [ "memchr", "pin-project-lite", "pin-utils", - "proc-macro-hack", - "proc-macro-nested", "slab", ] @@ -3504,13 
+3499,13 @@ dependencies = [ [[package]] name = "num-derive" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.79", ] [[package]] @@ -4065,7 +4060,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.18", + "syn 2.0.79", ] [[package]] @@ -4092,23 +4087,11 @@ dependencies = [ "version_check 0.9.4", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "369a6ed065f249a159e06c45752c780bda2fb53c995718f9e484d08daa9eb42e" - [[package]] name = "proc-macro2" -version = "1.0.60" +version = "1.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" dependencies = [ "unicode-ident", ] @@ -4305,9 +4288,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -4346,7 +4329,7 @@ dependencies = [ "lz4-sys", "memmap2 0.9.3", "nix 0.26.2", - "num-derive 0.4.0", + "num-derive 0.4.2", "num-traits", "parking_lot 0.12.1", 
"prometheus", @@ -5831,7 +5814,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.18", + "syn 2.0.79", ] [[package]] @@ -5875,9 +5858,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs index 0750a21c574..3a45e086ba2 100644 --- a/components/resource_control/src/future.rs +++ b/components/resource_control/src/future.rs @@ -323,7 +323,7 @@ mod tests { let dur = start.saturating_elapsed(); assert_eq!(delta.total_consumed, 150); assert!(delta.total_wait_dur_us >= 140_000 && delta.total_wait_dur_us <= 160_000); - assert!(dur >= Duration::from_millis(150) && dur <= Duration::from_millis(160)); + assert!(dur >= Duration::from_millis(140) && dur <= Duration::from_millis(160)); // fetch io bytes failed, consumed value is 0. 
#[cfg(feature = "failpoints")] From 20f75d03bb6b6e3f59ecf288cf4f6cb9b8ddc35f Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 19 Nov 2024 23:28:59 +0800 Subject: [PATCH 214/220] expr: fix panic when using radians and degree (#17853) (#17857) close tikv/tikv#17852 expr: fix panic when using radians and degree Signed-off-by: gengliqi Co-authored-by: gengliqi --- components/tidb_query_expr/src/impl_math.rs | 38 +++++++++++++++------ 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index f9f790d7d17..8817249b628 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -280,7 +280,7 @@ fn sqrt(arg: &Real) -> Result> { #[inline] #[rpn_fn] fn radians(arg: &Real) -> Result> { - Ok(Real::new(**arg * std::f64::consts::PI / 180_f64).ok()) + Ok(Real::new(**arg * (std::f64::consts::PI / 180_f64)).ok()) } #[inline] @@ -353,7 +353,12 @@ fn rand_with_seed_first_gen(seed: Option<&i64>) -> Result> { #[inline] #[rpn_fn] fn degrees(arg: &Real) -> Result> { - Ok(Real::new(arg.to_degrees()).ok()) + let ret = arg.to_degrees(); + if ret.is_infinite() { + Err(Error::overflow("DOUBLE", format!("degrees({})", arg)).into()) + } else { + Ok(Real::new(ret).ok()) + } } #[inline] @@ -1182,6 +1187,10 @@ mod tests { ), (Some(f64::NAN), None), (Some(f64::INFINITY), Some(Real::new(f64::INFINITY).unwrap())), + ( + Some(1.0E308), + Some(Real::new(1.0E308 * (std::f64::consts::PI / 180_f64)).unwrap()), + ), ]; for (input, expect) in test_cases { let output = RpnFnScalarEvaluator::new() @@ -1221,25 +1230,34 @@ mod tests { #[test] fn test_degrees() { let tests_cases = vec![ - (None, None), - (Some(f64::NAN), None), - (Some(0f64), Some(Real::new(0f64).unwrap())), - (Some(1f64), Some(Real::new(57.29577951308232_f64).unwrap())), + (None, None, false), + (Some(f64::NAN), None, false), + (Some(0f64), Some(Real::new(0f64).unwrap()), false), + ( + 
Some(1f64), + Some(Real::new(57.29577951308232_f64).unwrap()), + false, + ), ( Some(std::f64::consts::PI), Some(Real::new(180.0_f64).unwrap()), + false, ), ( Some(-std::f64::consts::PI / 2.0_f64), Some(Real::new(-90.0_f64).unwrap()), + false, ), + (Some(1.0E307), None, true), ]; - for (input, expect) in tests_cases { + for (input, expect, is_err) in tests_cases { let output = RpnFnScalarEvaluator::new() .push_param(input) - .evaluate(ScalarFuncSig::Degrees) - .unwrap(); - assert_eq!(expect, output, "{:?}", input); + .evaluate(ScalarFuncSig::Degrees); + assert_eq!(is_err, output.is_err()); + if let Ok(out) = output { + assert_eq!(expect, out, "{:?}", input); + } } } From 8e8dae3e78222b17c81f842f1358fb5971194a95 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 20 Nov 2024 15:14:14 +0800 Subject: [PATCH 215/220] raftstore: skip handle remaining messages if peer is destroyed (#17841) (#17848) close tikv/tikv#17840 Skip handling remain raft messages after peer fsm is stopped. This can avoid potential panic if the raft message need to read raft log from raft engine. Signed-off-by: glorv Co-authored-by: glorv Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/fsm/peer.rs | 10 ++++- tests/failpoints/cases/test_merge.rs | 49 +++++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 89fb55a5ec8..145d5779b7f 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -626,6 +626,12 @@ where #[allow(const_evaluatable_unchecked)] let mut distribution = [0; PeerMsg::::COUNT]; for m in msgs.drain(..) { + // skip handling remain messages if fsm is destroyed. This can aviod handling + // arbitary messages(e.g. CasualMessage::ForceCompactRaftLogs) that may need + // to read raft logs which maybe lead to panic. 
+ if self.fsm.stopped { + break; + } distribution[m.discriminant()] += 1; match m { PeerMsg::RaftMessage(msg, sent_time) => { @@ -3775,6 +3781,7 @@ where ) .flush() .when_done(move || { + fail_point!("destroy_region_before_gc_flush"); if let Err(e) = mb.force_send(PeerMsg::SignificantMsg(Box::new( SignificantMsg::RaftLogGcFlushed, ))) { @@ -3786,6 +3793,7 @@ where region_id, peer_id, e ); } + fail_point!("destroy_region_after_gc_flush"); }); if let Err(e) = self.ctx.raftlog_gc_scheduler.schedule(task) { if tikv_util::thread_group::is_shutdown(!cfg!(test)) { @@ -5744,7 +5752,7 @@ where } fail_point!("on_raft_log_gc_tick_1", self.fsm.peer_id() == 1, |_| {}); fail_point!("on_raft_gc_log_tick", |_| {}); - debug_assert!(!self.fsm.stopped); + assert!(!self.fsm.stopped); // As leader, we would not keep caches for the peers that didn't response // heartbeat in the last few seconds. That happens probably because diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 929afeb70f4..652bb479b87 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -19,7 +19,7 @@ use kvproto::{ }; use pd_client::PdClient; use raft::eraftpb::MessageType; -use raftstore::store::*; +use raftstore::{router::RaftStoreRouter, store::*}; use raftstore_v2::router::{PeerMsg, PeerTick}; use test_raftstore::*; use test_raftstore_macro::test_case; @@ -2173,3 +2173,50 @@ fn test_destroy_race_during_atomic_snapshot_after_merge() { cluster.must_transfer_leader(right.get_id(), new_peer(3, new_peer_id)); cluster.must_put(b"k4", b"v4"); } + +// `test_raft_log_gc_after_merge` tests when a region is destoryed, e.g. due to +// region merge, PeerFsm can still handle pending raft messages correctly. 
+#[test] +fn test_raft_log_gc_after_merge() { + let mut cluster = new_node_cluster(0, 1); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.run(); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + fail::cfg_callback("destroy_region_before_gc_flush", move || { + fail::cfg("pause_on_peer_collect_message", "pause").unwrap(); + }) + .unwrap(); + + let (tx, rx) = channel(); + let tx = Arc::new(Mutex::new(tx)); + fail::cfg_callback("destroy_region_after_gc_flush", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + // the right peer's id is 1. + pd_client.must_merge(right.get_id(), left.get_id()); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + + let raft_router = cluster.get_router(1).unwrap(); + raft_router + .send_casual_msg(1, CasualMessage::ForceCompactRaftLogs) + .unwrap(); + + fail::remove("pause_on_peer_collect_message"); + + // wait some time for merge finish. + std::thread::sleep(Duration::from_secs(1)); + must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); +} From 8b006a5bdfae691c7286933ed8ab3b7063a2b76d Mon Sep 17 00:00:00 2001 From: lucasliang Date: Mon, 2 Dec 2024 16:50:38 +0800 Subject: [PATCH 216/220] [Cherry-pick-7.5] raftstore: calculate the slow score by considering individual disk performance factors.(#17801) (#17901) close tikv/tikv#17884 This pr introduces an extra and individual inspector to detect whether there exists I/O hung issues on kvdb disk, if the kvdb is deployed with a separate mount path. 
Signed-off-by: lucasliang --- Cargo.lock | 1 + components/raftstore-v2/src/worker/pd/mod.rs | 4 +- components/raftstore/src/store/config.rs | 60 ++- components/raftstore/src/store/fsm/store.rs | 45 +- components/raftstore/src/store/metrics.rs | 7 +- components/raftstore/src/store/mod.rs | 6 +- components/raftstore/src/store/msg.rs | 3 +- .../raftstore/src/store/worker/disk_check.rs | 179 +++++++ components/raftstore/src/store/worker/mod.rs | 2 + components/raftstore/src/store/worker/pd.rs | 471 +++++++++--------- components/server/src/server.rs | 14 +- components/test_raftstore/src/node.rs | 1 + components/test_raftstore/src/server.rs | 6 +- components/tikv_util/Cargo.toml | 1 + components/tikv_util/src/lib.rs | 18 + components/tikv_util/src/slow_score.rs | 287 +++++++++++ metrics/grafana/tikv_details.json | 4 +- src/server/node.rs | 8 +- .../integrations/config/dynamic/raftstore.rs | 3 +- .../integrations/raftstore/test_bootstrap.rs | 6 +- .../raftstore/test_status_command.rs | 65 ++- tests/integrations/server/kv_service.rs | 3 +- 22 files changed, 892 insertions(+), 302 deletions(-) create mode 100644 components/raftstore/src/store/worker/disk_check.rs create mode 100644 components/tikv_util/src/slow_score.rs diff --git a/Cargo.lock b/Cargo.lock index 3422fe244d6..7a1b014fe04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6843,6 +6843,7 @@ dependencies = [ "num_cpus", "online_config", "openssl", + "ordered-float", "page_size", "panic_hook", "parking_lot_core 0.9.1", diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 77915dd0378..520403b6ce3 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -25,6 +25,7 @@ use tikv_util::{ config::VersionTrack, time::{Instant as TiInstant, UnixSecs}, worker::{Runnable, Scheduler}, + InspectFactor, }; use yatp::{task::future::TaskCell, Remote}; @@ -257,6 +258,7 @@ where store_heartbeat_interval / 
NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, cfg.value().report_min_resolved_ts_interval.0, cfg.value().inspect_interval.0, + std::time::Duration::default(), PdReporter::new(pd_scheduler, logger.clone()), ); stats_monitor.start( @@ -436,7 +438,7 @@ impl StoreStatsReporter for PdReporter { } } - fn update_latency_stats(&self, timer_tick: u64) { + fn update_latency_stats(&self, timer_tick: u64, _factor: InspectFactor) { // Tick slowness statistics. { if let Err(e) = self.scheduler.schedule(Task::TickSlownessStats) { diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 7c13446c185..febad57331d 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -341,14 +341,25 @@ pub struct Config { #[deprecated = "The configuration has been removed. The time to clean stale peer safely can be decided based on RocksDB snapshot sequence number."] pub clean_stale_peer_delay: ReadableDuration, - // Interval to inspect the latency of raftstore for slow store detection. + #[online_config(hidden)] + // Interval to inspect the latency of flushing raft logs for slow store detection. pub inspect_interval: ReadableDuration, + // Interval to inspect the latency of flushes on kvdb for slow store detection. + // If the kvdb uses the same mount path with raftdb, the default value will be + // optimized to `0` to avoid duplicated inspection. + #[doc(hidden)] + #[online_config(hidden)] + pub inspect_kvdb_interval: ReadableDuration, /// Threshold of CPU utilization to inspect for slow store detection. 
#[doc(hidden)] + #[online_config(hidden)] pub inspect_cpu_util_thd: f64, - + #[doc(hidden)] + #[online_config(hidden)] // The unsensitive(increase it to reduce sensitiveness) of the cause-trend detection pub slow_trend_unsensitive_cause: f64, + #[doc(hidden)] + #[online_config(hidden)] // The unsensitive(increase it to reduce sensitiveness) of the result-trend detection pub slow_trend_unsensitive_result: f64, @@ -513,6 +524,7 @@ impl Default for Config { region_split_size: ReadableSize(0), clean_stale_peer_delay: ReadableDuration::minutes(0), inspect_interval: ReadableDuration::millis(100), + inspect_kvdb_interval: ReadableDuration::secs(2), // The default value of `inspect_cpu_util_thd` is 0.4, which means // when the cpu utilization is greater than 40%, the store might be // regarded as a slow node if there exists delayed inspected messages. @@ -645,6 +657,29 @@ impl Config { } } + /// Optimize the interval of different inspectors according to the + /// configuration. + pub fn optimize_inspector(&mut self, separated_raft_mount_path: bool) { + // If the kvdb uses the same mount path with raftdb, the health status + // of kvdb will be inspected by raftstore automatically. So it's not necessary + // to inspect kvdb. + if !separated_raft_mount_path { + self.inspect_kvdb_interval = ReadableDuration::ZERO; + } else { + // If the inspect_kvdb_interval is less than inspect_interval, it should + // use `inspect_interval` * 10 as an empirical inspect interval for KvDB Disk + // I/O. 
+ let inspect_kvdb_interval = if self.inspect_kvdb_interval < self.inspect_interval + && self.inspect_kvdb_interval != ReadableDuration::ZERO + { + self.inspect_interval * 10 + } else { + self.inspect_kvdb_interval + }; + self.inspect_kvdb_interval = inspect_kvdb_interval; + } + } + pub fn validate( &mut self, region_split_size: ReadableSize, @@ -1561,5 +1596,26 @@ mod tests { cfg.raft_log_gc_count_limit(), split_size * 3 / 4 / ReadableSize::kb(1) ); + + cfg = Config::new(); + cfg.optimize_inspector(false); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::ZERO); + + cfg = Config::new(); + cfg.inspect_kvdb_interval = ReadableDuration::secs(1); + cfg.optimize_inspector(false); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::ZERO); + cfg.optimize_inspector(true); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::ZERO); + + cfg.inspect_kvdb_interval = ReadableDuration::secs(1); + cfg.optimize_inspector(true); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::secs(1)); + + cfg = Config::new(); + cfg.inspect_kvdb_interval = ReadableDuration::millis(1); + cfg.inspect_interval = ReadableDuration::millis(100); + cfg.optimize_inspector(true); + assert_eq!(cfg.inspect_kvdb_interval, ReadableDuration::secs(1)); } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index ad21cc64fec..deecbdd0e02 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -68,7 +68,7 @@ use tikv_util::{ warn, worker::{LazyWorker, Scheduler, Worker}, yatp_pool::FuturePool, - Either, RingQueue, + Either, InspectFactor, RingQueue, }; use time::{self, Timespec}; @@ -102,9 +102,9 @@ use crate::{ worker::{ AutoSplitController, CleanupRunner, CleanupSstRunner, CleanupSstTask, CleanupTask, CompactRunner, CompactTask, ConsistencyCheckRunner, ConsistencyCheckTask, - GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogGcRunner, RaftlogGcTask, - ReadDelegate, 
RefreshConfigRunner, RefreshConfigTask, RegionRunner, RegionTask, - SplitCheckTask, + DiskCheckRunner, DiskCheckTask, GcSnapshotRunner, GcSnapshotTask, PdRunner, + RaftlogGcRunner, RaftlogGcTask, ReadDelegate, RefreshConfigRunner, RefreshConfigTask, + RegionRunner, RegionTask, SplitCheckTask, }, Callback, CasualMessage, CompactThreshold, GlobalReplicationState, InspectedRaftMessage, MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, SignificantMsg, SnapManager, @@ -554,6 +554,7 @@ where pub raftlog_gc_scheduler: Scheduler, pub raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, + pub disk_check_scheduler: Scheduler, pub apply_router: ApplyRouter, pub router: RaftRouter, pub importer: Arc, @@ -862,11 +863,30 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> #[cfg(any(test, feature = "testexport"))] StoreMsg::Validate(f) => f(&self.ctx.cfg), StoreMsg::LatencyInspect { + factor, send_time, mut inspector, } => { - inspector.record_store_wait(send_time.saturating_elapsed()); - self.ctx.pending_latency_inspect.push(inspector); + match factor { + InspectFactor::RaftDisk => { + inspector.record_store_wait(send_time.saturating_elapsed()); + self.ctx.pending_latency_inspect.push(inspector); + } + InspectFactor::KvDisk => { + // Send LatencyInspector to disk_check_scheduler to inspect latency. 
+ if let Err(e) = self + .ctx + .disk_check_scheduler + .schedule(DiskCheckTask::InspectLatency { inspector }) + { + warn!( + "Failed to schedule disk check task"; + "error" => ?e, + "store_id" => self.fsm.store.id + ); + } + } + } } StoreMsg::UnsafeRecoveryReport(report) => self.store_heartbeat_pd(Some(report)), StoreMsg::UnsafeRecoveryCreatePeer { syncer, create } => { @@ -1221,6 +1241,7 @@ pub struct RaftPollerBuilder { cleanup_scheduler: Scheduler, raftlog_gc_scheduler: Scheduler, raftlog_fetch_scheduler: Scheduler>, + disk_check_scheduler: Scheduler, pub region_scheduler: Scheduler>, apply_router: ApplyRouter, pub router: RaftRouter, @@ -1457,6 +1478,7 @@ where pd_scheduler: self.pd_scheduler.clone(), consistency_check_scheduler: self.consistency_check_scheduler.clone(), split_check_scheduler: self.split_check_scheduler.clone(), + disk_check_scheduler: self.disk_check_scheduler.clone(), region_scheduler: self.region_scheduler.clone(), apply_router: self.apply_router.clone(), router: self.router.clone(), @@ -1533,6 +1555,7 @@ where cleanup_scheduler: self.cleanup_scheduler.clone(), raftlog_gc_scheduler: self.raftlog_gc_scheduler.clone(), raftlog_fetch_scheduler: self.raftlog_fetch_scheduler.clone(), + disk_check_scheduler: self.disk_check_scheduler.clone(), region_scheduler: self.region_scheduler.clone(), apply_router: self.apply_router.clone(), router: self.router.clone(), @@ -1622,6 +1645,7 @@ impl RaftBatchSystem { collector_reg_handle: CollectorRegHandle, health_service: Option, causal_ts_provider: Option>, // used for rawkv apiv2 + mut disk_check_runner: DiskCheckRunner, grpc_service_mgr: GrpcServiceManager, safe_point: Arc, ) -> Result<()> { @@ -1710,6 +1734,12 @@ impl RaftBatchSystem { let consistency_check_scheduler = workers .background_worker .start("consistency-check", consistency_check_runner); + // The scheduler dedicated to health checking the KvEngine disk when it's using + // a separate disk from RaftEngine. 
+ disk_check_runner.bind_background_worker(workers.background_worker.clone()); + let disk_check_scheduler = workers + .background_worker + .start("disk-check-worker", disk_check_runner); self.store_writers.spawn( meta.get_id(), @@ -1728,6 +1758,7 @@ impl RaftBatchSystem { router: self.router.clone(), split_check_scheduler, region_scheduler, + disk_check_scheduler, pd_scheduler: workers.pd_worker.scheduler(), consistency_check_scheduler, cleanup_scheduler, @@ -1874,7 +1905,7 @@ impl RaftBatchSystem { causal_ts_provider, grpc_service_mgr, ); - assert!(workers.pd_worker.start_with_timer(pd_runner)); + assert!(workers.pd_worker.start(pd_runner)); if let Err(e) = sys_util::thread::set_priority(sys_util::HIGH_PRI) { warn!("set thread priority for raftstore failed"; "error" => ?e); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 7973d9ad042..399e2cb9401 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -829,8 +829,11 @@ lazy_static! 
{ exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); - pub static ref STORE_SLOW_SCORE_GAUGE: Gauge = - register_gauge!("tikv_raftstore_slow_score", "Slow score of the store.").unwrap(); + pub static ref STORE_SLOW_SCORE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "tikv_raftstore_slow_score", + "Slow score of the store.", + &["type"] + ).unwrap(); pub static ref STORE_SLOW_TREND_GAUGE: Gauge = register_gauge!("tikv_raftstore_slow_trend", "Slow trend changing rate.").unwrap(); diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index cccab6f72b0..3d9d698743f 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -88,9 +88,9 @@ pub use self::{ worker::{ metrics as worker_metrics, need_compact, AutoSplitController, BatchComponent, Bucket, BucketRange, BucketStatsInfo, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, - CompactThreshold, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, - LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, - ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + CompactThreshold, DiskCheckRunner, FlowStatistics, FlowStatsReporter, KeyEntry, + LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, WriterContoller, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 1d01caa1c76..1624c449a9a 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -24,7 +24,7 @@ use raft::SnapshotStatus; use resource_control::ResourceMetered; use smallvec::{smallvec, 
SmallVec}; use strum::{EnumCount, EnumVariantNames}; -use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; +use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant, InspectFactor}; use tracker::{get_tls_tracker_token, TrackerToken}; use super::{ @@ -882,6 +882,7 @@ where /// Inspect the latency of raftstore. LatencyInspect { + factor: InspectFactor, send_time: Instant, inspector: LatencyInspector, }, diff --git a/components/raftstore/src/store/worker/disk_check.rs b/components/raftstore/src/store/worker/disk_check.rs new file mode 100644 index 00000000000..145a911adb9 --- /dev/null +++ b/components/raftstore/src/store/worker/disk_check.rs @@ -0,0 +1,179 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Display, Formatter}, + io::Write, + path::PathBuf, + time::Duration, +}; + +use crossbeam::channel::{bounded, Receiver, Sender}; +use tikv_util::{ + time::Instant, + warn, + worker::{Runnable, Worker}, +}; + +use crate::store::util::LatencyInspector; + +#[derive(Debug)] +pub enum Task { + InspectLatency { inspector: LatencyInspector }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::InspectLatency { .. } => write!(f, "InspectLatency"), + } + } +} + +#[derive(Clone)] +/// A simple inspector to measure the latency of disk IO. +/// +/// This is used to measure the latency of disk IO, which is used to determine +/// the health status of the TiKV server. +/// The inspector writes a file to the disk and measures the time it takes to +/// complete the write operation. +pub struct Runner { + target: PathBuf, + notifier: Sender, + receiver: Receiver, + bg_worker: Option, +} + +impl Runner { + /// The filename to write to the disk to measure the latency. + const DISK_IO_LATENCY_INSPECT_FILENAME: &'static str = ".disk_latency_inspector.tmp"; + /// The content to write to the file to measure the latency. 
+ const DISK_IO_LATENCY_INSPECT_FLUSH_STR: &'static [u8] = b"inspect disk io latency"; + + #[inline] + fn build(target: PathBuf) -> Self { + // The disk check mechanism only cares about the latency of the most + // recent request; older requests become stale and irrelevant. To avoid + // unnecessary accumulation of multiple requests, we set a small + // `capacity` for the disk check worker. + let (notifier, receiver) = bounded(3); + Self { + target, + notifier, + receiver, + bg_worker: None, + } + } + + #[inline] + pub fn new(inspect_dir: PathBuf) -> Self { + Self::build(inspect_dir.join(Self::DISK_IO_LATENCY_INSPECT_FILENAME)) + } + + #[inline] + /// Only for test. + /// Generate a dummy Runner. + pub fn dummy() -> Self { + Self::build(PathBuf::from("./").join(Self::DISK_IO_LATENCY_INSPECT_FILENAME)) + } + + #[inline] + pub fn bind_background_worker(&mut self, bg_worker: Worker) { + self.bg_worker = Some(bg_worker); + } + + fn inspect(&self) -> Option { + let mut file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&self.target) + .ok()?; + + let start = Instant::now(); + // Ignore the error + file.write_all(Self::DISK_IO_LATENCY_INSPECT_FLUSH_STR) + .ok()?; + file.sync_all().ok()?; + Some(start.saturating_elapsed()) + } + + fn execute(&self) { + if let Ok(task) = self.receiver.try_recv() { + match task { + Task::InspectLatency { mut inspector } => { + if let Some(latency) = self.inspect() { + inspector.record_apply_process(latency); + inspector.finish(); + } else { + warn!("failed to inspect disk io latency"); + } + } + } + } + } +} + +impl Runnable for Runner { + type Task = Task; + + fn run(&mut self, task: Task) { + // Send the task to the limited capacity channel. 
+ if let Err(e) = self.notifier.try_send(task) { + warn!("failed to send task to disk check bg_worker: {:?}", e); + } else { + let runner = self.clone(); + if let Some(bg_worker) = self.bg_worker.as_ref() { + bg_worker.spawn_async_task(async move { + runner.execute(); + }); + } + } + } +} + +#[cfg(test)] +mod tests { + use tikv_util::worker::Builder; + + use super::*; + + #[test] + fn test_disk_check_runner() { + let background_worker = Builder::new("disk-check-worker") + .pending_capacity(256) + .create(); + let (tx, rx) = std::sync::mpsc::sync_channel(1); + let mut runner = Runner::dummy(); + runner.bind_background_worker(background_worker); + // Validate the disk check runner. + { + let tx_1 = tx.clone(); + let inspector = LatencyInspector::new( + 1, + Box::new(move |_, duration| { + let dur = duration.sum(); + tx_1.send(dur).unwrap(); + }), + ); + runner.run(Task::InspectLatency { inspector }); + let latency = rx.recv().unwrap(); + assert!(latency > Duration::from_secs(0)); + } + // Invalid bg_worker and out of capacity + { + runner.bg_worker = None; + for i in 2..=10 { + let tx_2 = tx.clone(); + let inspector = LatencyInspector::new( + i as u64, + Box::new(move |_, duration| { + let dur = duration.sum(); + tx_2.send(dur).unwrap(); + }), + ); + runner.run(Task::InspectLatency { inspector }); + rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + } + } + } +} diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index c6783238520..865326e1c4a 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -6,6 +6,7 @@ mod cleanup_snapshot; mod cleanup_sst; mod compact; mod consistency_check; +mod disk_check; pub mod metrics; mod pd; mod raftlog_gc; @@ -25,6 +26,7 @@ pub use self::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{need_compact, CompactThreshold, Runner as CompactRunner, Task as CompactTask}, consistency_check::{Runner 
as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, + disk_check::{Runner as DiskCheckRunner, Task as DiskCheckTask}, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, StatsMonitor as PdStatsMonitor, StoreStatsReporter, Task as PdTask, diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 62ccc0418cb..964687bb35f 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -31,7 +31,6 @@ use kvproto::{ raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, }; -use ordered_float::OrderedFloat; use pd_client::{metrics::*, BucketStat, Error, PdClient, RegionStat}; use prometheus::local::LocalHistogram; use raft::eraftpb::ConfChangeType; @@ -40,6 +39,7 @@ use service::service_manager::GrpcServiceManager; use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, + slow_score::SlowScore, store::QueryStats, sys::{disk::get_disk_space_stats, thread::StdThreadBuildWrapper, SysQuota}, thd_name, @@ -48,7 +48,8 @@ use tikv_util::{ topn::TopN, trend::{RequestPerSecRecorder, Trend}, warn, - worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, + worker::{Runnable, ScheduleError, Scheduler}, + InspectFactor, }; use txn_types::TimeStamp; use yatp::Remote; @@ -199,6 +200,7 @@ where }, UpdateSlowScore { id: u64, + factor: InspectFactor, duration: RaftstoreDuration, }, RegionCpuRecords(Arc), @@ -208,6 +210,9 @@ where }, ReportBuckets(BucketStat), ControlGrpcServer(pdpb::ControlGrpcEvent), + InspectLatency { + factor: InspectFactor, + }, } pub struct StoreStat { @@ -445,8 +450,16 @@ where Task::QueryRegionLeader { region_id } => { write!(f, "query the leader of region {}", region_id) } - Task::UpdateSlowScore { id, ref duration } => { - write!(f, "compute slow score: id {}, duration {:?}", id, duration) + Task::UpdateSlowScore { + id, + factor, + ref 
duration, + } => { + write!( + f, + "compute slow score: id {}, factor: {:?}, duration {:?}", + id, factor, duration + ) } Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) @@ -467,6 +480,9 @@ where Task::ControlGrpcServer(ref event) => { write!(f, "control grpc server: {:?}", event) } + Task::InspectLatency { factor } => { + write!(f, "inspect raftstore latency: {:?}", factor) + } } } } @@ -525,7 +541,7 @@ pub trait StoreStatsReporter: Send + Clone + Sync + 'static + Collector { ); fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64); fn auto_split(&self, split_infos: Vec); - fn update_latency_stats(&self, timer_tick: u64); + fn update_latency_stats(&self, timer_tick: u64, factor: InspectFactor); } impl StoreStatsReporter for WrappedScheduler @@ -575,9 +591,16 @@ where } } - fn update_latency_stats(&self, timer_tick: u64) { - debug!("update latency statistics not implemented for raftstore-v1"; + fn update_latency_stats(&self, timer_tick: u64, factor: InspectFactor) { + debug!("update latency statistics for raftstore-v1"; "tick" => timer_tick); + let task = Task::InspectLatency { factor }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send inspect raftstore latency task to pd worker"; + "err" => ?e, + ); + } } } @@ -595,6 +618,7 @@ where collect_tick_interval: Duration, report_min_resolved_ts_interval: Duration, inspect_latency_interval: Duration, + inspect_kvdb_latency_interval: Duration, } impl StatsMonitor @@ -605,6 +629,7 @@ where interval: Duration, report_min_resolved_ts_interval: Duration, inspect_latency_interval: Duration, + inspect_kvdb_latency_interval: Duration, reporter: T, ) -> Self { StatsMonitor { @@ -625,6 +650,7 @@ where cmp::min(default_collect_tick_interval(), interval), ), inspect_latency_interval, + inspect_kvdb_latency_interval, } } @@ -659,9 +685,12 @@ where let report_min_resolved_ts_interval = self .report_min_resolved_ts_interval 
.div_duration_f64(tick_interval) as u64; - let update_latency_stats_interval = self - .inspect_latency_interval - .div_duration_f64(tick_interval) as u64; + let update_raftdisk_latency_stats_interval = + self.inspect_latency_interval + .div_duration_f64(tick_interval) as u64; + let update_kvdisk_latency_stats_interval = + self.inspect_kvdb_latency_interval + .div_duration_f64(tick_interval) as u64; let (timer_tx, timer_rx) = mpsc::channel(); self.timer = Some(timer_tx); @@ -728,8 +757,11 @@ where region_read_progress.get_min_resolved_ts(), ); } - if is_enable_tick(timer_cnt, update_latency_stats_interval) { - reporter.update_latency_stats(timer_cnt); + if is_enable_tick(timer_cnt, update_raftdisk_latency_stats_interval) { + reporter.update_latency_stats(timer_cnt, InspectFactor::RaftDisk); + } + if is_enable_tick(timer_cnt, update_kvdisk_latency_stats_interval) { + reporter.update_latency_stats(timer_cnt, InspectFactor::KvDisk); } timer_cnt += 1; } @@ -850,105 +882,66 @@ fn hotspot_query_num_report_threshold() -> u64 { /// Max limitation of delayed store_heartbeat. const STORE_HEARTBEAT_DELAY_LIMIT: u64 = 5 * 60; -// Slow score is a value that represents the speed of a store and ranges in [1, -// 100]. It is maintained in the AIMD way. -// If there are some inspecting requests timeout during a round, by default the -// score will be increased at most 1x when above 10% inspecting requests -// timeout. If there is not any timeout inspecting requests, the score will go -// back to 1 in at least 5min. -struct SlowScore { - value: OrderedFloat, - last_record_time: Instant, - last_update_time: Instant, - - timeout_requests: usize, - total_requests: usize, - - inspect_interval: Duration, - // The maximal tolerated timeout ratio. - ratio_thresh: OrderedFloat, - // Minimal time that the score could be decreased from 100 to 1. - min_ttr: Duration, - - // After how many ticks the value need to be updated. - round_ticks: u64, - // Identify every ticks. 
- last_tick_id: u64, - // If the last tick does not finished, it would be recorded as a timeout. - last_tick_finished: bool, +/// A unified slow score that combines multiple slow scores. +/// +/// It calculates the final slow score of a store by picking the maximum +/// score among multiple factors. Each factor represents a different aspect of +/// the store's performance. Typically, we have two factors: Raft Disk I/O and +/// KvDB Disk I/O. If there are more factors in the future, we can add them +/// here. +#[derive(Default)] +pub struct UnifiedSlowScore { + factors: Vec, } -impl SlowScore { - fn new(inspect_interval: Duration) -> SlowScore { - SlowScore { - value: OrderedFloat(1.0), - - timeout_requests: 0, - total_requests: 0, - - inspect_interval, - ratio_thresh: OrderedFloat(0.1), - min_ttr: Duration::from_secs(5 * 60), - last_record_time: Instant::now(), - last_update_time: Instant::now(), - round_ticks: 30, - last_tick_id: 0, - last_tick_finished: true, - } +impl UnifiedSlowScore { + pub fn new(cfg: &Config) -> Self { + let mut unified_slow_score = UnifiedSlowScore::default(); + // The first factor is for Raft Disk I/O. + unified_slow_score + .factors + .push(SlowScore::new(cfg.inspect_interval.0)); + // The second factor is for KvDB Disk I/O. 
+ unified_slow_score + .factors + .push(SlowScore::new_with_extra_config( + cfg.inspect_kvdb_interval.0, + 0.6, + )); + unified_slow_score } - fn record(&mut self, id: u64, duration: Duration, not_busy: bool) { - self.last_record_time = Instant::now(); - if id != self.last_tick_id { - return; - } - self.last_tick_finished = true; - self.total_requests += 1; - if not_busy && duration >= self.inspect_interval { - self.timeout_requests += 1; - } - } - - fn record_timeout(&mut self) { - self.last_tick_finished = true; - self.total_requests += 1; - self.timeout_requests += 1; + #[inline] + pub fn record( + &mut self, + id: u64, + factor: InspectFactor, + duration: &RaftstoreDuration, + not_busy: bool, + ) { + self.factors[factor as usize].record(id, duration.delays_on_disk_io(false), not_busy); } - fn update(&mut self) -> f64 { - let elapsed = self.last_update_time.elapsed(); - self.update_impl(elapsed).into() + #[inline] + pub fn get(&self, factor: InspectFactor) -> &SlowScore { + &self.factors[factor as usize] } - fn get(&self) -> f64 { - self.value.into() + #[inline] + pub fn get_mut(&mut self, factor: InspectFactor) -> &mut SlowScore { + &mut self.factors[factor as usize] } - // Update the score in a AIMD way. 
- fn update_impl(&mut self, elapsed: Duration) -> OrderedFloat { - if self.timeout_requests == 0 { - let desc = 100.0 * (elapsed.as_millis() as f64 / self.min_ttr.as_millis() as f64); - if OrderedFloat(desc) > self.value - OrderedFloat(1.0) { - self.value = 1.0.into(); - } else { - self.value -= desc; - } - } else { - let timeout_ratio = self.timeout_requests as f64 / self.total_requests as f64; - let near_thresh = - cmp::min(OrderedFloat(timeout_ratio), self.ratio_thresh) / self.ratio_thresh; - let value = self.value * (OrderedFloat(1.0) + near_thresh); - self.value = cmp::min(OrderedFloat(100.0), value); - } - - self.total_requests = 0; - self.timeout_requests = 0; - self.last_update_time = Instant::now(); - self.value + // Returns the maximum score of all factors. + pub fn get_score(&self) -> f64 { + self.factors + .iter() + .map(|factor| factor.get()) + .fold(1.0, f64::max) } - fn should_force_report_slow_store(&self) -> bool { - self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) + pub fn last_tick_finished(&self) -> bool { + self.factors.iter().all(SlowScore::last_tick_finished) } } @@ -981,7 +974,7 @@ where concurrency_manager: ConcurrencyManager, snap_mgr: SnapManager, remote: Remote, - slow_score: SlowScore, + slow_score: UnifiedSlowScore, slow_trend_cause: Trend, slow_trend_result: Trend, slow_trend_result_recorder: RequestPerSecRecorder, @@ -1027,6 +1020,7 @@ where interval, cfg.report_min_resolved_ts_interval.0, cfg.inspect_interval.0, + cfg.inspect_kvdb_interval.0, WrappedScheduler(scheduler.clone()), ); if let Err(e) = stats_monitor.start( @@ -1054,7 +1048,7 @@ where concurrency_manager, snap_mgr, remote, - slow_score: SlowScore::new(cfg.inspect_interval.0), + slow_score: UnifiedSlowScore::new(cfg), slow_trend_cause: Trend::new( // Disable SpikeFilter for now Duration::from_secs(0), @@ -1398,7 +1392,7 @@ where STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); - 
let slow_score = self.slow_score.get(); + let slow_score = self.slow_score.get_score(); stats.set_slow_score(slow_score as u64); self.set_slow_trend_to_store_stats(&mut stats, total_query_num); @@ -2052,6 +2046,121 @@ where } } } + + fn handle_inspect_latency(&mut self, factor: InspectFactor) { + // all_ticks_finished: The last tick of all factors is finished. + // factor_tick_finished: The last tick of the current factor is finished. + let (all_ticks_finished, factor_tick_finished) = ( + self.slow_score.last_tick_finished(), + self.slow_score.get(factor).last_tick_finished(), + ); + // The health status is recovered to serving as long as any tick + // does not timeout. + if self.curr_health_status == ServingStatus::ServiceUnknown && all_ticks_finished { + self.update_health_status(ServingStatus::Serving); + } + if !all_ticks_finished { + // If the last tick is not finished, it means that the current store might + // be busy on handling requests or delayed on I/O operations. And only when + // the current store is not busy, it should record the last_tick as a timeout. 
+ if !self.store_stat.maybe_busy() && !factor_tick_finished { + self.slow_score.get_mut(factor).record_timeout(); + } + } + + let slow_score_tick_result = self.slow_score.get_mut(factor).tick(); + if slow_score_tick_result.updated_score.is_some() && !slow_score_tick_result.has_new_record + { + self.update_health_status(ServingStatus::ServiceUnknown); + } + if let Some(score) = slow_score_tick_result.updated_score { + STORE_SLOW_SCORE_GAUGE + .with_label_values(&[factor.as_str()]) + .set(score as i64); + } + + let id = slow_score_tick_result.tick_id; + let scheduler = self.scheduler.clone(); + let inspector = { + match factor { + InspectFactor::RaftDisk => { + // Record a fairly great value when timeout + self.slow_trend_cause.record(500_000, Instant::now()); + + // If the last slow_score already reached abnormal state and was delayed for + // reporting by `store-heartbeat` to PD, we should report it here manually as + // a FAKE `store-heartbeat`. + if slow_score_tick_result.should_force_report_slow_store + && self.is_store_heartbeat_delayed() + { + self.handle_fake_store_heartbeat(); + } + LatencyInspector::new( + id, + Box::new(move |id, duration| { + // TODO: use sub metric to record different durations. 
+ STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_process"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_process_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_wait"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_wait_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_commit"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_commit_duration.unwrap_or_default(), + )); + + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["all"]) + .observe(tikv_util::time::duration_to_sec(duration.sum())); + if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { + id, + factor, + duration, + }) { + warn!("schedule pd task failed"; "err" => ?e); + } + }), + ) + } + InspectFactor::KvDisk => LatencyInspector::new( + id, + Box::new(move |id, duration| { + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["apply_wait"]) + .observe(tikv_util::time::duration_to_sec( + duration.apply_wait_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["apply_process"]) + .observe(tikv_util::time::duration_to_sec( + duration.apply_process_duration.unwrap_or_default(), + )); + if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { + id, + factor, + duration, + }) { + warn!("schedule pd task failed"; "err" => ?e); + } + }), + ), + } + }; + let msg = StoreMsg::LatencyInspect { + factor, + send_time: TiInstant::now(), + inspector, + }; + if let Err(e) = self.router.send_control(msg) { + warn!("pd worker send latency inspecter failed"; "err" => ?e); + } + } } fn calculate_region_cpu_records( @@ -2295,13 +2404,14 @@ where txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), - Task::UpdateSlowScore { id, duration } => { + Task::UpdateSlowScore { + id, + factor, + 
duration, + } => { // Fine-tuned, `SlowScore` only takes the I/O jitters on the disk into account. - self.slow_score.record( - id, - duration.delays_on_disk_io(false), - !self.store_stat.maybe_busy(), - ); + self.slow_score + .record(id, factor, &duration, !self.store_stat.maybe_busy()); } Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { @@ -2314,6 +2424,9 @@ where Task::ControlGrpcServer(event) => { self.handle_control_grpc_server(event); } + Task::InspectLatency { factor } => { + self.handle_inspect_latency(factor); + } }; } @@ -2322,93 +2435,6 @@ where } } -impl RunnableWithTimer for Runner -where - EK: KvEngine, - ER: RaftEngine, - T: PdClient + 'static, -{ - fn on_timeout(&mut self) { - // Record a fairly great value when timeout - self.slow_trend_cause.record(500_000, Instant::now()); - - // The health status is recovered to serving as long as any tick - // does not timeout. - if self.curr_health_status == ServingStatus::ServiceUnknown - && self.slow_score.last_tick_finished - { - self.update_health_status(ServingStatus::Serving); - } - if !self.slow_score.last_tick_finished { - // If the last tick is not finished, it means that the current store might - // be busy on handling requests or delayed on I/O operations. And only when - // the current store is not busy, it should record the last_tick as a timeout. - if !self.store_stat.maybe_busy() { - self.slow_score.record_timeout(); - } - // If the last slow_score already reached abnormal state and was delayed for - // reporting by `store-heartbeat` to PD, we should report it here manually as - // a FAKE `store-heartbeat`. 
- if self.slow_score.should_force_report_slow_store() && self.is_store_heartbeat_delayed() - { - self.handle_fake_store_heartbeat(); - } - } - let scheduler = self.scheduler.clone(); - let id = self.slow_score.last_tick_id + 1; - self.slow_score.last_tick_id += 1; - self.slow_score.last_tick_finished = false; - - if self.slow_score.last_tick_id % self.slow_score.round_ticks == 0 { - // `last_update_time` is refreshed every round. If no update happens in a whole - // round, we set the status to unknown. - if self.curr_health_status == ServingStatus::Serving - && self.slow_score.last_record_time < self.slow_score.last_update_time - { - self.update_health_status(ServingStatus::ServiceUnknown); - } - let slow_score = self.slow_score.update(); - STORE_SLOW_SCORE_GAUGE.set(slow_score); - } - - let inspector = LatencyInspector::new( - id, - Box::new(move |id, duration| { - let dur = duration.sum(); - - STORE_INSPECT_DURATION_HISTOGRAM - .with_label_values(&["store_process"]) - .observe(tikv_util::time::duration_to_sec( - duration.store_process_duration.unwrap_or_default(), - )); - STORE_INSPECT_DURATION_HISTOGRAM - .with_label_values(&["store_wait"]) - .observe(tikv_util::time::duration_to_sec( - duration.store_wait_duration.unwrap_or_default(), - )); - - STORE_INSPECT_DURATION_HISTOGRAM - .with_label_values(&["all"]) - .observe(tikv_util::time::duration_to_sec(dur)); - if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { id, duration }) { - warn!("schedule pd task failed"; "err" => ?e); - } - }), - ); - let msg = StoreMsg::LatencyInspect { - send_time: TiInstant::now(), - inspector, - }; - if let Err(e) = self.router.send_control(msg) { - warn!("pd worker send latency inspecter failed"; "err" => ?e); - } - } - - fn get_interval(&self) -> Duration { - self.slow_score.inspect_interval - } -} - fn new_change_peer_request(change_type: ConfChangeType, peer: metapb::Peer) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ChangePeer); 
@@ -2700,6 +2726,7 @@ mod tests { Duration::from_secs(interval), Duration::from_secs(0), Duration::from_secs(interval), + Duration::default(), WrappedScheduler(scheduler), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); @@ -2801,59 +2828,6 @@ mod tests { assert_eq!(store_stats.peer_stats.len(), 3) } - #[test] - fn test_slow_score() { - let mut slow_score = SlowScore::new(Duration::from_millis(500)); - slow_score.timeout_requests = 5; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(1.5), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 10; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(3.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 20; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(6.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 100; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(12.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 11; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(24.0), - slow_score.update_impl(Duration::from_secs(10)) - ); - - slow_score.timeout_requests = 0; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(19.0), - slow_score.update_impl(Duration::from_secs(15)) - ); - - slow_score.timeout_requests = 0; - slow_score.total_requests = 100; - assert_eq!( - OrderedFloat(1.0), - slow_score.update_impl(Duration::from_secs(57)) - ); - } - use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use metapb::Peer; use resource_metering::{RawRecord, TagInfos}; @@ -3006,6 +2980,7 @@ mod tests { Duration::from_secs(interval), Duration::from_secs(0), Duration::from_secs(interval), + Duration::default(), WrappedScheduler(pd_worker.scheduler()), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); diff --git a/components/server/src/server.rs 
b/components/server/src/server.rs index 7a1108e54c2..54f159ed383 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -64,8 +64,8 @@ use raftstore::{ }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, snapshot_backup::PrepareDiskSnapObserver, - AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, - SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, + AutoSplitController, CheckLeaderRunner, DiskCheckRunner, LocalReader, SnapManager, + SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, RaftRouterCompactedEventSender, }; @@ -772,6 +772,13 @@ where let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); self.core.config.raft_store.optimize_for(false); + self.core + .config + .raft_store + .optimize_inspector(path_in_diff_mount_point( + engines.engines.raft.get_engine_path().to_string().as_str(), + engines.engines.kv.path(), + )); self.core .config .raft_store @@ -975,6 +982,8 @@ where .registry .register_consistency_check_observer(100, observer); + let disk_check_runner = DiskCheckRunner::new(self.core.store_path.clone()); + node.start( engines.engines.clone(), server.transport(), @@ -988,6 +997,7 @@ where self.concurrency_manager.clone(), collector_reg_handle, self.causal_ts_provider.clone(), + disk_check_runner, self.grpc_service_mgr.clone(), safe_point.clone(), ) diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index f429f27ff8b..6b743a62e1b 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -334,6 +334,7 @@ impl Simulator for NodeCluster { cm, CollectorRegHandle::new_for_test(), None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), )?; diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index f5c64fa86e9..a6673ebb66e 100644 --- a/components/test_raftstore/src/server.rs +++ 
b/components/test_raftstore/src/server.rs @@ -37,8 +37,9 @@ use raftstore::{ store::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, - AutoSplitController, Callback, CheckLeaderRunner, LocalReader, RegionSnapshot, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, + AutoSplitController, Callback, CheckLeaderRunner, DiskCheckRunner, LocalReader, + RegionSnapshot, SnapManager, SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, + StoreMetaDelegate, }, Result, }; @@ -617,6 +618,7 @@ impl ServerCluster { concurrency_manager.clone(), collector_reg_handle, causal_ts_provider, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), )?; diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 82764047d0c..4e509c7f153 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -43,6 +43,7 @@ num-traits = "0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" +ordered-float = "2.6" parking_lot_core = "0.9.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index b8aa578a878..59f255faaeb 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -5,6 +5,7 @@ #![feature(box_patterns)] #![feature(vec_into_raw_parts)] #![feature(let_chains)] +#![feature(div_duration)] #[cfg(test)] extern crate test; @@ -54,6 +55,7 @@ pub mod memory; pub mod metrics; pub mod mpsc; pub mod quota_limiter; +pub mod slow_score; pub mod store; pub mod stream; pub mod sys; @@ -612,6 +614,22 @@ pub fn set_vec_capacity(v: &mut Vec, cap: usize) { } } +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum InspectFactor { + RaftDisk = 0, + KvDisk, + // TODO: Add more factors, like network io. 
+} + +impl InspectFactor { + pub fn as_str(&self) -> &str { + match *self { + InspectFactor::RaftDisk => "raft", + InspectFactor::KvDisk => "kvdb", + } + } +} + #[cfg(test)] mod tests { use std::{ diff --git a/components/tikv_util/src/slow_score.rs b/components/tikv_util/src/slow_score.rs new file mode 100644 index 00000000000..676dbef4efa --- /dev/null +++ b/components/tikv_util/src/slow_score.rs @@ -0,0 +1,287 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cmp, + time::{Duration, Instant}, +}; + +use ordered_float::OrderedFloat; + +/// The result of a tick of the slow score. +pub struct SlowScoreTickResult { + pub tick_id: u64, + // None if skipped in this tick + pub updated_score: Option, + pub has_new_record: bool, + pub should_force_report_slow_store: bool, +} + +/// Interval for updating the slow score. +const UPDATE_INTERVALS: Duration = Duration::from_secs(10); +/// Recovery intervals for the slow score. +/// If the score has reached 100 and there is no timeout inspecting requests +/// during this interval, the score will go back to 1 after 5min. +const RECOVERY_INTERVALS: Duration = Duration::from_secs(60 * 5); +// Slow score is a value that represents the speed of a store and ranges in [1, +// 100]. It is maintained in the AIMD way. +// If there are some inspecting requests timeout during a round, by default the +// score will be increased at most 1x when above 10% inspecting requests +// timeout. If there is not any timeout inspecting requests, the score will go +// back to 1 in after 5min. +pub struct SlowScore { + value: OrderedFloat, + last_record_time: Instant, + last_update_time: Instant, + + timeout_requests: usize, + total_requests: usize, + + inspect_interval: Duration, + // The maximal tolerated timeout ratio. + ratio_thresh: OrderedFloat, + // Minimal time that the score could be decreased from 100 to 1. + min_ttr: Duration, + + // After how many ticks the value need to be updated. 
+ round_ticks: u64, + // Identify every ticks. + last_tick_id: u64, + // If the last tick does not finished, it would be recorded as a timeout. + last_tick_finished: bool, +} + +impl SlowScore { + pub fn new(inspect_interval: Duration) -> SlowScore { + SlowScore { + value: OrderedFloat(1.0), + + timeout_requests: 0, + total_requests: 0, + + inspect_interval, + ratio_thresh: OrderedFloat(0.1), + min_ttr: RECOVERY_INTERVALS, + last_record_time: Instant::now(), + last_update_time: Instant::now(), + round_ticks: 30, + last_tick_id: 0, + last_tick_finished: true, + } + } + + // Only for kvdb. + pub fn new_with_extra_config(inspect_interval: Duration, timeout_ratio: f64) -> SlowScore { + SlowScore { + value: OrderedFloat(1.0), + + timeout_requests: 0, + total_requests: 0, + + inspect_interval, + ratio_thresh: OrderedFloat(timeout_ratio), + min_ttr: RECOVERY_INTERVALS, + last_record_time: Instant::now(), + last_update_time: Instant::now(), + // The minimal round ticks is 1 for kvdb. + round_ticks: cmp::max( + UPDATE_INTERVALS.div_duration_f64(inspect_interval) as u64, + 1_u64, + ), + last_tick_id: 0, + last_tick_finished: true, + } + } + + pub fn record(&mut self, id: u64, duration: Duration, not_busy: bool) { + self.last_record_time = Instant::now(); + if id != self.last_tick_id { + return; + } + self.last_tick_finished = true; + self.total_requests += 1; + if not_busy && duration >= self.inspect_interval { + self.timeout_requests += 1; + } + } + + pub fn record_timeout(&mut self) { + self.last_tick_finished = true; + self.total_requests += 1; + self.timeout_requests += 1; + } + + pub fn update(&mut self) -> f64 { + let elapsed = self.last_update_time.elapsed(); + self.update_impl(elapsed).into() + } + + pub fn get(&self) -> f64 { + self.value.into() + } + + // Update the score in a AIMD way. 
+ fn update_impl(&mut self, elapsed: Duration) -> OrderedFloat { + if self.timeout_requests == 0 { + let desc = 100.0 * (elapsed.as_millis() as f64 / self.min_ttr.as_millis() as f64); + if OrderedFloat(desc) > self.value - OrderedFloat(1.0) { + self.value = 1.0.into(); + } else { + self.value -= desc; + } + } else { + let timeout_ratio = self.timeout_requests as f64 / self.total_requests as f64; + let near_thresh = + cmp::min(OrderedFloat(timeout_ratio), self.ratio_thresh) / self.ratio_thresh; + let value = self.value * (OrderedFloat(1.0) + near_thresh); + self.value = cmp::min(OrderedFloat(100.0), value); + } + + self.total_requests = 0; + self.timeout_requests = 0; + self.last_update_time = Instant::now(); + self.value + } + + pub fn should_force_report_slow_store(&self) -> bool { + self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) + } + + pub fn get_inspect_interval(&self) -> Duration { + self.inspect_interval + } + + pub fn last_tick_finished(&self) -> bool { + self.last_tick_finished + } + + pub fn tick(&mut self) -> SlowScoreTickResult { + let should_force_report_slow_store = self.should_force_report_slow_store(); + + let id = self.last_tick_id + 1; + self.last_tick_id += 1; + self.last_tick_finished = false; + + let (updated_score, has_new_record) = if self.last_tick_id % self.round_ticks == 0 { + // `last_update_time` is refreshed every round. If no update happens in a whole + // round, we set the status to unknown. 
+ let has_new_record = self.last_record_time >= self.last_update_time; + let slow_score = self.update(); + (Some(slow_score), has_new_record) + } else { + (None, false) + }; + + SlowScoreTickResult { + tick_id: id, + updated_score, + has_new_record, + should_force_report_slow_store, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_slow_score() { + let mut slow_score = SlowScore::new(Duration::from_millis(500)); + slow_score.timeout_requests = 5; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(1.5), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 10; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(3.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 20; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(6.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 100; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(12.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 11; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(24.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 0; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(19.0), + slow_score.update_impl(Duration::from_secs(15)) + ); + + slow_score.timeout_requests = 0; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(1.0), + slow_score.update_impl(Duration::from_secs(57)) + ); + } + + #[test] + fn test_slow_score_extra() { + let mut slow_score = SlowScore::new_with_extra_config(Duration::from_millis(1000), 0.6); + slow_score.timeout_requests = 1; + slow_score.total_requests = 10; + let score = slow_score.update_impl(Duration::from_secs(10)); + assert!(score > OrderedFloat(1.16)); + assert!(score < OrderedFloat(1.17)); + + slow_score.timeout_requests = 2; + slow_score.total_requests = 
10; + let score = slow_score.update_impl(Duration::from_secs(10)); + assert!(score > OrderedFloat(1.5)); + assert!(score < OrderedFloat(1.6)); + + slow_score.timeout_requests = 0; + slow_score.total_requests = 100; + assert_eq!( + OrderedFloat(1.0), + slow_score.update_impl(Duration::from_secs(57)) + ); + + slow_score.timeout_requests = 3; + slow_score.total_requests = 10; + assert_eq!( + OrderedFloat(1.5), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 6; + slow_score.total_requests = 10; + assert_eq!( + OrderedFloat(3.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + slow_score.timeout_requests = 10; + slow_score.total_requests = 10; + assert_eq!( + OrderedFloat(6.0), + slow_score.update_impl(Duration::from_secs(10)) + ); + + // Test too large inspect interval. + let slow_score = SlowScore::new_with_extra_config(Duration::from_secs(11), 0.1); + assert_eq!(slow_score.round_ticks, 1); + } +} diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index b729113a03a..c48a3317cd6 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -9587,11 +9587,11 @@ "targets": [ { "exemplar": true, - "expr": "tikv_raftstore_slow_score{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "expr": "tikv_raftstore_slow_score{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"} by (instance, type)", "format": "time_series", "interval": "", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}}-{{type}}", "refId": "A", "step": 4 } diff --git a/src/server/node.rs b/src/server/node.rs index 228f679ed14..4e7ca0fcfe6 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -20,8 +20,8 @@ use raftstore::{ store::{ self, fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, - initial_region, AutoSplitController, Config as StoreConfig, GlobalReplicationState, 
PdTask, - RefreshConfigTask, SnapManager, SplitCheckTask, Transport, + initial_region, AutoSplitController, Config as StoreConfig, DiskCheckRunner, + GlobalReplicationState, PdTask, RefreshConfigTask, SnapManager, SplitCheckTask, Transport, }, }; use resource_metering::CollectorRegHandle; @@ -173,6 +173,7 @@ where concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, causal_ts_provider: Option>, // used for rawkv apiv2 + disk_check_runner: DiskCheckRunner, grpc_service_mgr: GrpcServiceManager, safe_point: Arc, ) -> Result<()> @@ -212,6 +213,7 @@ where concurrency_manager, collector_reg_handle, causal_ts_provider, + disk_check_runner, grpc_service_mgr, safe_point, )?; @@ -461,6 +463,7 @@ where concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, causal_ts_provider: Option>, // used for rawkv apiv2 + disk_check_runner: DiskCheckRunner, grpc_service_mgr: GrpcServiceManager, safe_point: Arc, ) -> Result<()> @@ -496,6 +499,7 @@ where collector_reg_handle, self.health_service.clone(), causal_ts_provider, + disk_check_runner, grpc_service_mgr, safe_point, )?; diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 4d6551ea27c..0489103018f 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -15,7 +15,7 @@ use raftstore::{ store::{ config::{Config, RaftstoreConfigManager}, fsm::{StoreMeta, *}, - AutoSplitController, SnapManager, StoreMsg, Transport, + AutoSplitController, DiskCheckRunner, SnapManager, StoreMsg, Transport, }, Result, }; @@ -113,6 +113,7 @@ fn start_raftstore( CollectorRegHandle::new_for_test(), None, None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), ) diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index bca389b26e6..69fdb446b75 100644 --- 
a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -13,7 +13,10 @@ use engine_traits::{ use kvproto::{kvrpcpb::ApiVersion, metapb, raft_serverpb::RegionLocalState}; use raftstore::{ coprocessor::CoprocessorHost, - store::{bootstrap_store, fsm, fsm::store::StoreMeta, AutoSplitController, SnapManager}, + store::{ + bootstrap_store, fsm, fsm::store::StoreMeta, AutoSplitController, DiskCheckRunner, + SnapManager, + }, }; use raftstore_v2::router::PeerMsg; use resource_metering::CollectorRegHandle; @@ -121,6 +124,7 @@ fn test_node_bootstrap_with_prepared_data() { ConcurrencyManager::new(1.into()), CollectorRegHandle::new_for_test(), None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), ) diff --git a/tests/integrations/raftstore/test_status_command.rs b/tests/integrations/raftstore/test_status_command.rs index 8565d936d9f..22caef23dc7 100644 --- a/tests/integrations/raftstore/test_status_command.rs +++ b/tests/integrations/raftstore/test_status_command.rs @@ -4,7 +4,7 @@ use raftstore::store::{msg::StoreMsg as StoreMsgV1, util::LatencyInspector}; use raftstore_v2::router::StoreMsg as StoreMsgV2; use test_raftstore::Simulator as S1; use test_raftstore_v2::Simulator as S2; -use tikv_util::{time::Instant, HandyRwLock}; +use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock, InspectFactor}; #[test] fn test_region_detail() { @@ -32,6 +32,7 @@ fn test_region_detail() { fn test_latency_inspect() { let mut cluster_v1 = test_raftstore::new_node_cluster(0, 1); cluster_v1.cfg.raft_store.store_io_pool_size = 2; + cluster_v1.cfg.raft_store.inspect_kvdb_interval = ReadableDuration::millis(500); cluster_v1.run(); let mut cluster_v2 = test_raftstore_v2::new_node_cluster(0, 1); cluster_v2.run(); @@ -42,19 +43,24 @@ fn test_latency_inspect() { { // Test send LatencyInspect to V1. 
let (tx, rx) = std::sync::mpsc::sync_channel(10); - let inspector = LatencyInspector::new( - 1, - Box::new(move |_, duration| { - let dur = duration.sum(); - tx.send(dur).unwrap(); - }), - ); - let msg = StoreMsgV1::LatencyInspect { - send_time: Instant::now(), - inspector, - }; - router_v1.send_control(msg).unwrap(); - rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + // Inspect different factors. + for factor in [InspectFactor::RaftDisk, InspectFactor::KvDisk].iter() { + let cloned_tx = tx.clone(); + let inspector = LatencyInspector::new( + 1, + Box::new(move |_, duration| { + let dur = duration.sum(); + cloned_tx.send(dur).unwrap(); + }), + ); + let msg = StoreMsgV1::LatencyInspect { + factor: *factor, + send_time: Instant::now(), + inspector, + }; + router_v1.send_control(msg).unwrap(); + rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + } } { // Test send LatencyInspect to V2. @@ -82,17 +88,22 @@ fn test_sync_latency_inspect() { cluster.run(); let router = cluster.sim.wl().get_router(1).unwrap(); let (tx, rx) = std::sync::mpsc::sync_channel(10); - let inspector = LatencyInspector::new( - 1, - Box::new(move |_, duration| { - let dur = duration.sum(); - tx.send(dur).unwrap(); - }), - ); - let msg = StoreMsgV1::LatencyInspect { - send_time: Instant::now(), - inspector, - }; - router.send_control(msg).unwrap(); - rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + // Inspect different factors. 
+ for factor in [InspectFactor::RaftDisk, InspectFactor::KvDisk].iter() { + let cloned_tx = tx.clone(); + let inspector = LatencyInspector::new( + 1, + Box::new(move |_, duration| { + let dur = duration.sum(); + cloned_tx.send(dur).unwrap(); + }), + ); + let msg = StoreMsgV1::LatencyInspect { + factor: *factor, + send_time: Instant::now(), + inspector, + }; + router.send_control(msg).unwrap(); + rx.recv_timeout(std::time::Duration::from_secs(2)).unwrap(); + } } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 24b6a87bfa5..164fafdf964 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -29,7 +29,7 @@ use pd_client::PdClient; use raft::eraftpb; use raftstore::{ coprocessor::CoprocessorHost, - store::{fsm::store::StoreMeta, AutoSplitController, SnapManager}, + store::{fsm::store::StoreMeta, AutoSplitController, DiskCheckRunner, SnapManager}, }; use resource_metering::CollectorRegHandle; use service::service_manager::GrpcServiceManager; @@ -1410,6 +1410,7 @@ fn test_double_run_node() { ConcurrencyManager::new(1.into()), CollectorRegHandle::new_for_test(), None, + DiskCheckRunner::dummy(), GrpcServiceManager::dummy(), Arc::new(AtomicU64::new(0)), ) From 7e739586ce3c1d8b5cc91c6832ebf9173ddceda9 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Tue, 3 Dec 2024 14:02:32 +0800 Subject: [PATCH 217/220] cdc: skip loading old values for un-observed ranges (#17878) (#17885) close tikv/tikv#17876, fix tikv/tikv#17876, close tikv/tikv#17877 cdc: skip loading old values for un-observed ranges Signed-off-by: qupeng Co-authored-by: qupeng --- components/cdc/src/delegate.rs | 173 +++++++++++++++++++++------------ 1 file changed, 109 insertions(+), 64 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 70d6835bbc3..70d0d605c6d 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -462,9 +462,9 @@ impl Delegate { 
self.txn_extra_op.as_ref() } - fn broadcast(&self, send: F) -> Result<()> + fn broadcast(&self, mut send: F) -> Result<()> where - F: Fn(&Downstream) -> Result<()>, + F: FnMut(&Downstream) -> Result<()>, { let downstreams = self.downstreams(); assert!( @@ -677,25 +677,22 @@ impl Delegate { is_one_pc: bool, ) -> Result<()> { debug_assert_eq!(self.txn_extra_op.load(), TxnExtraOp::ReadOldValue); - let mut read_old_value = |row: &mut EventRow, read_old_ts| -> Result<()> { + let read_old_value = |row: &mut EventRow, read_old_ts| -> Result<()> { let key = Key::from_raw(&row.key).append_ts(row.start_ts.into()); let old_value = old_value_cb(key, read_old_ts, old_value_cache, statistics)?; row.old_value = old_value.unwrap_or_default(); Ok(()) }; - // map[key] -> (event, has_value). - let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); + // map[key] -> (event, has_value, old_value_ts). + let mut txn_rows: HashMap, (EventRow, bool, Option)> = + HashMap::default(); let mut raw_rows: Vec = Vec::new(); for mut req in requests { let res = match req.get_cmd_type() { - CmdType::Put => self.sink_put( - req.take_put(), - is_one_pc, - &mut txn_rows, - &mut raw_rows, - &mut read_old_value, - ), + CmdType::Put => { + self.sink_put(req.take_put(), is_one_pc, &mut txn_rows, &mut raw_rows) + } CmdType::Delete => self.sink_delete(req.take_delete()), _ => { debug!( @@ -712,27 +709,85 @@ impl Delegate { } } - let mut rows = Vec::with_capacity(txn_rows.len()); - for (_, (v, has_value)) in txn_rows { - if v.r_type == EventLogType::Prewrite && v.op_type == EventRowOpType::Put && !has_value + self.sink_downstream_tidb(txn_rows.into_values(), read_old_value)?; + self.sink_downstream_raw(raw_rows, index)?; + Ok(()) + } + + fn sink_downstream_tidb( + &mut self, + entries: impl Iterator)>, + mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, + ) -> Result<()> { + let mut entries = entries + .filter(|x| !TxnSource::is_lossy_ddl_reorg_source_set(x.0.txn_source)) + // 
It's possible that a prewrite command only contains lock but without + // default. It's not documented by classic Percolator but introduced with + // Large-Transaction. Those prewrites are not complete, we must skip them. + .filter(|x| !(x.0.r_type == EventLogType::Prewrite && x.0.op_type == EventRowOpType::Put && !x.1)) + .map(|x| (x.0, x.2)) + .collect::>(); + + let downstreams = self.downstreams(); + assert!( + !downstreams.is_empty(), + "region {} miss downstream", + self.region_id + ); + + let region_id = self.region_id; + let send = move |downstream: &Downstream| { + // No ready downstream or a downstream that does not match the kv_api type, will + // be ignored. There will be one region that contains both Txn & Raw entries. + // The judgement here is for sending entries to downstreams with correct kv_api. + if !downstream.state.load().ready_for_change_events() + || downstream.kv_api != ChangeDataRequestKvApi::TiDb { - // It's possible that a prewrite command only contains lock but without - // default. It's not documented by classic Percolator but introduced with - // Large-Transaction. Those prewrites are not complete, we must skip them. - continue; + return Ok(()); + } + + let mut d_entries = Vec::with_capacity(entries.len()); + for (r, old_value_ts) in &mut entries { + if !downstream.observed_range.contains_raw_key(&r.key) + || downstream.filter_loop && TxnSource::is_cdc_write_source_set(r.txn_source) + { + continue; + } + if let Some(ts) = old_value_ts { + read_old_value(r, *ts)?; + *old_value_ts = None; + } + d_entries.push(r.clone()); + } + + if d_entries.is_empty() { + return Ok(()); + } + + let event = Event { + region_id, + request_id: downstream.get_req_id(), + event: Some(Event_oneof_event::Entries(EventEntries { + entries: d_entries.into(), + ..Default::default() + })), + ..Default::default() + }; + + // Do not force send for real time change data events. 
+ let force_send = false; + downstream.sink_event(event, force_send) + }; + match self.broadcast(send) { + Ok(()) => Ok(()), + Err(e) => { + self.mark_failed(); + Err(e) } - rows.push(v); } - self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; - self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv) } - fn sink_downstream( - &mut self, - entries: Vec, - index: u64, - kv_api: ChangeDataRequestKvApi, - ) -> Result<()> { + fn sink_downstream_raw(&mut self, entries: Vec, index: u64) -> Result<()> { if entries.is_empty() { return Ok(()); } @@ -775,7 +830,9 @@ impl Delegate { // No ready downstream or a downstream that does not match the kv_api type, will // be ignored. There will be one region that contains both Txn & Raw entries. // The judgement here is for sending entries to downstreams with correct kv_api. - if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { + if !downstream.state.load().ready_for_change_events() + || downstream.kv_api != ChangeDataRequestKvApi::RawKv + { return Ok(()); } if downstream.filter_loop && filtered_entries.is_none() { @@ -822,15 +879,14 @@ impl Delegate { &mut self, put: PutRequest, is_one_pc: bool, - txn_rows: &mut HashMap, (EventRow, bool)>, + txn_rows: &mut HashMap, (EventRow, bool, Option)>, raw_rows: &mut Vec, - read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { let key_mode = ApiV2::parse_key_mode(put.get_key()); if key_mode == KeyMode::Raw { self.sink_raw_put(put, raw_rows) } else { - self.sink_txn_put(put, is_one_pc, txn_rows, read_old_value) + self.sink_txn_put(put, is_one_pc, txn_rows) } } @@ -845,21 +901,19 @@ impl Delegate { &mut self, mut put: PutRequest, is_one_pc: bool, - rows: &mut HashMap, (EventRow, bool)>, - mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, + rows: &mut HashMap, (EventRow, bool, Option)>, ) -> Result<()> { match put.cf.as_str() { "write" => { - let (mut row, mut has_value) = 
(EventRow::default(), false); + let (mut row, mut has_value, mut old_value_ts) = (EventRow::default(), false, None); if decode_write(put.take_key(), &put.value, &mut row, &mut has_value, true) { return Ok(()); } let commit_ts = if is_one_pc { set_event_row_type(&mut row, EventLogType::Committed); - let commit_ts = TimeStamp::from(row.commit_ts); - read_old_value(&mut row, commit_ts.prev())?; - Some(commit_ts) + old_value_ts = Some(TimeStamp::from(row.commit_ts)); + Some(TimeStamp::from(row.commit_ts)) } else { // 2PC if row.commit_ts == 0 { @@ -885,9 +939,12 @@ impl Delegate { let o = o.into_mut(); mem::swap(&mut o.0.value, &mut row.value); o.0 = row; + if old_value_ts.is_some() { + o.2 = old_value_ts; + } } HashMapEntry::Vacant(v) => { - v.insert((row, has_value)); + v.insert((row, has_value, old_value_ts)); } } } @@ -899,8 +956,7 @@ impl Delegate { return Ok(()); } - let read_old_ts = std::cmp::max(for_update_ts, row.start_ts.into()); - read_old_value(&mut row, read_old_ts)?; + let old_value_ts = Some(std::cmp::max(for_update_ts, row.start_ts.into())); // In order to compute resolved ts, we must track inflight txns. 
match self.resolver { @@ -922,8 +978,9 @@ impl Delegate { assert!(!has_value); has_value = true; mem::swap(&mut occupied.0.value, &mut row.value); + occupied.2 = old_value_ts; } - *occupied = (row, has_value); + *occupied = (row, has_value, old_value_ts); } "" | "default" => { let key = Key::from_encoded(put.take_key()).truncate_ts().unwrap(); @@ -1221,6 +1278,10 @@ impl ObservedRange { entries.retain(|e| self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, &e.key)); entries } + + fn contains_raw_key(&self, key: &[u8]) -> bool { + self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, key) + } } #[cfg(test)] @@ -1530,21 +1591,14 @@ mod tests { put.key.clone(), 1.into(), 10, - None, + Some(b"value".to_vec()), TimeStamp::zero(), 0, TimeStamp::zero(), false, ) .to_bytes(); - delegate - .sink_txn_put( - put, - false, - &mut map, - |_: &mut EventRow, _: TimeStamp| Ok(()), - ) - .unwrap(); + delegate.sink_txn_put(put, false, &mut map).unwrap(); } assert_eq!(map.len(), 5); @@ -1564,9 +1618,8 @@ mod tests { observed_range, }; delegate.add_downstream(downstream); - let entries = map.values().map(|(r, _)| r).cloned().collect(); delegate - .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .sink_downstream_tidb(map.into_values(), |_, _| Ok(())) .unwrap(); let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); @@ -1602,7 +1655,7 @@ mod tests { put.key.clone(), 1.into(), 10, - None, + Some(b"value".to_vec()), TimeStamp::zero(), 0, TimeStamp::zero(), @@ -1613,14 +1666,7 @@ mod tests { lock = lock.set_txn_source(txn_source.into()); } put.value = lock.to_bytes(); - delegate - .sink_txn_put( - put, - false, - &mut map, - |_: &mut EventRow, _: TimeStamp| Ok(()), - ) - .unwrap(); + delegate.sink_txn_put(put, false, &mut map).unwrap(); } assert_eq!(map.len(), 5); @@ -1640,9 +1686,8 @@ mod tests { observed_range, }; delegate.add_downstream(downstream); - let entries = map.values().map(|(r, _)| r).cloned().collect(); delegate - .sink_downstream(entries, 1, 
ChangeDataRequestKvApi::TiDb) + .sink_downstream_tidb(map.into_values(), |_, _| Ok(())) .unwrap();

let (mut tx, mut rx) = futures::channel::mpsc::unbounded();

From 6cc6d1483803af9009b00f3f009fba5279683061 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot
Date: Wed, 4 Dec 2024 14:50:47 +0800
Subject: [PATCH 218/220] raftstore: add write batch limit for raft command batch (#17823) (#17924)

close tikv/tikv#17701

add write batch limit for raft command batch

Signed-off-by: SpadeA-Tang
Signed-off-by: SpadeA-Tang
Co-authored-by: SpadeA-Tang
Co-authored-by: SpadeA-Tang
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
---
 components/raftstore/src/store/fsm/peer.rs | 44 ++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs
index 145d5779b7f..702b7a6bd4c 100644
--- a/components/raftstore/src/store/fsm/peer.rs
+++ b/components/raftstore/src/store/fsm/peer.rs
@@ -131,6 +131,10 @@ enum DelayReason {
 /// in most case.
 const MAX_REGIONS_IN_ERROR: usize = 10;
 const REGION_SPLIT_SKIP_MAX_COUNT: usize = 3;
+/// Limits the request size that can be batched in a single RaftCmdRequest.
+// todo: this figure may be changed to a more suitable value.
+#[allow(clippy::identity_op)]
+const MAX_BATCH_SIZE_LIMIT: u64 = 1 * 1024 * 1024;
 const UNSAFE_RECOVERY_STATE_TIMEOUT: Duration = Duration::from_secs(60);
 pub const MAX_PROPOSAL_SIZE_RATIO: f64 = 0.4;
@@ -440,8 +444,13 @@ where
 // No batch request whose size exceed 20% of raft_entry_max_size,
 // so total size of request in batch_raft_request would not exceed
 // (40% + 20%) of raft_entry_max_size
+ // Also, to prevent the write batch size from becoming too large when
+ // raft_entry_max_size is set too high (all requests in a RaftCmdRequest will be
+ // written in one RocksDB write batch), we use MAX_BATCH_SIZE_LIMIT to
+ // limit the number of requests batched within a single RaftCmdRequest.
if req.get_requests().is_empty() || req_size as u64 > (cfg.raft_entry_max_size.0 as f64 * 0.2) as u64 + || (self.batch_req_size + req_size as u64) > MAX_BATCH_SIZE_LIMIT { return false; } @@ -7352,4 +7361,39 @@ mod tests { assert!(flag.load(Ordering::Acquire)); } } + + #[test] + fn test_batch_raft_cmd_request_builder_size_limit() { + let mut cfg = Config::default(); + cfg.raft_entry_max_size = ReadableSize::gb(1); + let mut q = Request::default(); + let mut builder = BatchRaftCmdRequestBuilder::::new(); + + let mut req = RaftCmdRequest::default(); + let mut put = PutRequest::default(); + put.set_key(b"aaaa".to_vec()); + let val = (0..200_000).map(|_| 0).collect_vec(); + put.set_value(val); + q.set_cmd_type(CmdType::Put); + q.set_put(put); + req.mut_requests().push(q.clone()); + let _ = q.take_put(); + let req_size = req.compute_size(); + assert!(builder.can_batch(&cfg, &req, req_size)); + let cb = Callback::write_ext(Box::new(move |_| {}), None, None); + let cmd = RaftCommand::new(req.clone(), cb); + builder.add(cmd, req_size); + + let mut req = RaftCmdRequest::default(); + let mut put = PutRequest::default(); + put.set_key(b"aaaa".to_vec()); + let val = (0..900_000).map(|_| 0).collect_vec(); + put.set_value(val); + q.set_cmd_type(CmdType::Put); + q.set_put(put); + req.mut_requests().push(q.clone()); + let _ = q.take_put(); + let req_size = req.compute_size(); + assert!(!builder.can_batch(&cfg, &req, req_size)); + } } From 908a384cc918c17783703655fec8fa3c89f80807 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 4 Dec 2024 15:50:26 +0800 Subject: [PATCH 219/220] raft-engine: fix panics when reading entries on compacted raft logs (#17765) (#17921) close tikv/tikv#17383, close tikv/tikv#17760 To address the corner case where a read thread encounters a panic due to reading with a stale index from the `Memtable` in raft-engine, which has been updated by a background thread that has already purged the stale logs. 
Signed-off-by: lucasliang Co-authored-by: lucasliang Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 24 +++++++------- Cargo.toml | 4 +++ cmd/tikv-ctl/Cargo.toml | 48 +++++++++++---------------- cmd/tikv-server/Cargo.toml | 18 +++++----- components/raft_log_engine/Cargo.toml | 2 +- 5 files changed, 47 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a1b014fe04..de82973eb31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3087,9 +3087,9 @@ dependencies = [ [[package]] name = "lz4-sys" -version = "1.9.4" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +checksum = "e9764018d143cc854c9f17f0b907de70f14393b1f502da6375dce70f00514eb3" dependencies = [ "cc", "libc 0.2.146", @@ -4313,7 +4313,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.4.1" -source = "git+https://github.com/tikv/raft-engine.git#e505d631c8c6d63f7fc63d83ea6e8fb88cf970a5" +source = "git+https://github.com/tikv/raft-engine.git?branch=tikv-7.5#8cb23f127f0caf48076a3d3d0e4e2a27a8c8b0ed" dependencies = [ "byteorder", "crc32fast", @@ -4347,7 +4347,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.4.1" -source = "git+https://github.com/tikv/raft-engine.git#e505d631c8c6d63f7fc63d83ea6e8fb88cf970a5" +source = "git+https://github.com/tikv/raft-engine.git?branch=tikv-7.5#8cb23f127f0caf48076a3d3d0e4e2a27a8c8b0ed" dependencies = [ "clap 3.1.6", "env_logger 0.10.0", @@ -5250,9 +5250,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.147" +version = "1.0.194" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +checksum = "0b114498256798c94a0689e1a15fec6005dee8ac1f41de56404b67afc2a4b773" dependencies = [ "serde_derive", ] @@ -5269,13 +5269,13 @@ dependencies = [ [[package]] name = 
"serde_derive" -version = "1.0.147" +version = "1.0.194" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +checksum = "a3385e45322e8f9931410f01b3031ec534c3947d0e94c18049af4d9f9907d4e0" dependencies = [ "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] @@ -5776,7 +5776,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" dependencies = [ - "strum_macros 0.25.0", + "strum_macros 0.25.3", ] [[package]] @@ -5806,9 +5806,9 @@ dependencies = [ [[package]] name = "strum_macros" -version = "0.25.0" +version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe9f3bd7d2e45dcc5e265fbb88d6513e4747d8ef9444cf01a533119bce28a157" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ "heck 0.4.1", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index c4db583fb8e..06278920f60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -392,6 +392,10 @@ txn_types = { path = "components/txn_types" } raft = { version = "0.7.0", default-features = false, features = [ "protobuf-codec", ] } +raft-engine = { git = "https://github.com/tikv/raft-engine.git", branch = "tikv-7.5", features = [ + "swap", +] } +raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git", branch = "tikv-7.5" } grpcio = { version = "0.10.4", default-features = false, features = [ "openssl-vendored", "protobuf-codec", diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index a36e72b3c64..b8854f98f46 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -6,7 +6,13 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = [ + "test-engine-kv-rocksdb", + 
"test-engine-raft-raft-engine", + "cloud-aws", + "cloud-gcp", + "cloud-azure", +] tcmalloc = ["tikv/tcmalloc"] jemalloc = ["tikv/jemalloc"] mimalloc = ["tikv/mimalloc"] @@ -15,32 +21,15 @@ portable = ["tikv/portable"] sse = ["tikv/sse"] mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] -cloud-aws = [ - "encryption_export/cloud-aws", - "backup/cloud-aws", -] -cloud-gcp = [ - "encryption_export/cloud-gcp", - "backup/cloud-gcp", -] -cloud-azure = [ - "encryption_export/cloud-azure", - "backup/cloud-azure", -] +cloud-aws = ["encryption_export/cloud-aws", "backup/cloud-aws"] +cloud-gcp = ["encryption_export/cloud-gcp", "backup/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure", "backup/cloud-azure"] cloud-storage-grpc = ["backup/cloud-storage-grpc"] cloud-storage-dylib = ["backup/cloud-storage-dylib"] -test-engine-kv-rocksdb = [ - "tikv/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "tikv/test-engine-raft-raft-engine" -] -test-engines-rocksdb = [ - "tikv/test-engines-rocksdb", -] -test-engines-panic = [ - "tikv/test-engines-panic", -] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] nortcheck = ["engine_rocks/nortcheck"] @@ -65,14 +54,17 @@ hex = "0.4" keys = { workspace = true } kvproto = { workspace = true } libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { workspace = true } -raft-engine = { git = "https://github.com/tikv/raft-engine.git" } -raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git" } 
+raft-engine = { workspace = true } +raft-engine-ctl = { workspace = true } raft_log_engine = { workspace = true } raftstore = { workspace = true } rand = "0.8" diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index 409dc84a62d..6ac8ae03714 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -6,7 +6,13 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = [ + "test-engine-kv-rocksdb", + "test-engine-raft-raft-engine", + "cloud-aws", + "cloud-gcp", + "cloud-azure", +] trace-tablet-lifetime = ["tikv/trace-tablet-lifetime"] tcmalloc = ["server/tcmalloc"] jemalloc = ["server/jemalloc"] @@ -18,12 +24,8 @@ failpoints = ["server/failpoints"] cloud-aws = ["server/cloud-aws"] cloud-gcp = ["server/cloud-gcp"] cloud-azure = ["server/cloud-azure"] -test-engine-kv-rocksdb = [ - "server/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "server/test-engine-raft-raft-engine" -] +test-engine-kv-rocksdb = ["server/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["server/test-engine-raft-raft-engine"] test-engines-rocksdb = ["server/test-engines-rocksdb"] test-engines-panic = ["server/test-engines-panic"] @@ -37,7 +39,7 @@ encryption_export = { workspace = true } engine_traits = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } -raft-engine = { git = "https://github.com/tikv/raft-engine.git" } +raft-engine = { workspace = true } regex = "1" serde_json = { version = "1.0", features = ["preserve_order"] } server = { workspace = true } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 0e640991eea..29012dee8e7 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -18,7 +18,7 @@ num_cpus = "1" online_config = { workspace = true } protobuf = "2" raft = { workspace = true } -raft-engine = { git 
= "https://github.com/tikv/raft-engine.git", features = ["swap"] } +raft-engine = { workspace = true } serde = "1.0" serde_derive = "1.0" slog = { workspace = true } From 567bff82759505615d7a67848b39a6e6ee9735f7 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 5 Dec 2024 16:15:08 +0800 Subject: [PATCH 220/220] resolve Signed-off-by: Calvin Neo --- Cargo.lock | 309 ++++++++++++++---- proxy_components/engine_tiflash/Cargo.toml | 1 + .../engine_tiflash/src/cf_options.rs | 17 + .../engine_tiflash/src/compact.rs | 4 + .../engine_tiflash/src/db_options.rs | 12 + proxy_components/engine_tiflash/src/misc.rs | 161 ++++++++- .../engine_tiflash/src/properties.rs | 148 +++++++-- .../engine_tiflash/src/range_properties.rs | 7 +- .../engine_tiflash/src/sst_partitioner.rs | 2 + proxy_components/proxy_ffi/Cargo.toml | 5 +- 10 files changed, 554 insertions(+), 112 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 551619f85c8..40261cdb2cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,6 +131,36 @@ dependencies = [ "nodrop", ] +[[package]] +name = "arrow" +version = "13.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6bee230122beb516ead31935a61f683715f987c6f003eff44ad6986624105a" +dependencies = [ + "bitflags", + "chrono", + "csv", + "flatbuffers", + "half", + "hex 0.4.2", + "indexmap 1.6.2", + "lazy_static", + "lexical-core", + "multiversion", + "num 0.4.3", + "rand 0.8.5", + "regex", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "assert-type-eq" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd49a41856ee21a0cfb2b1cfbfcca0f1d3e6c257c38939f0d6ecfaf177f2ea47" + [[package]] name = "async-channel" version = "1.6.1" @@ -2144,6 +2174,17 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flatbuffers" +version = "2.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b428b715fdbdd1c364b84573b5fdc0f84f8e423661b9f398735278bc7f2b6a" +dependencies = [ + "bitflags", + "smallvec", + "thiserror", +] + [[package]] name = "flate2" version = "1.0.11" @@ -2179,10 +2220,11 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" dependencies = [ + "matches", "percent-encoding", ] @@ -2618,6 +2660,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "heck" version = "0.3.1" @@ -2825,10 +2873,11 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" dependencies = [ + "matches", "unicode-bidi", "unicode-normalization", ] @@ -2851,12 +2900,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.15.2", ] [[package]] @@ -3063,6 +3112,70 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + [[package]] name = "libc" version = "0.1.12" @@ -3497,6 +3610,26 @@ dependencies = [ "serde", ] +[[package]] +name = "multiversion" +version = "0.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" +dependencies = [ + "multiversion-macros", +] + +[[package]] +name = "multiversion-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.103", +] + [[package]] name = "mur3" version = "0.1.0" @@ -3660,10 +3793,34 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab3e176191bc4faad357e3122c4747aa098ac880e88b168f106386128736cf4a" dependencies = [ - "num-complex", + "num-complex 0.3.0", + "num-integer", + "num-iter", + "num-rational 0.3.0", + "num-traits", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex 0.4.6", "num-integer", "num-iter", - "num-rational", + "num-rational 0.4.2", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", "num-traits", ] @@ -3676,6 +3833,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-derive" version = "0.3.0" @@ -3710,19 +3876,18 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-iter" -version = "0.1.42" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" dependencies = [ "autocfg", "num-integer", @@ -3740,11 +3905,22 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -4023,9 +4199,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "perfcnt" @@ -4201,14 +4377,14 @@ version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e" dependencies = [ - "portable-atomic 1.4.2", + "portable-atomic 1.10.0", ] [[package]] name = "portable-atomic" -version = "1.4.2" +version = "1.10.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f32154ba0af3a075eefa1eda8bb414ee928f62303a54ea85b8d6638ff1a6ee9e" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "pprof" @@ -4751,6 +4927,7 @@ dependencies = [ "prometheus-static-metric", "protobuf", "rayon", + "rhai", "scopeguard", "serde", "serde_repr", @@ -4758,16 +4935,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "raft-engine-ctl" -version = "0.4.1" -source = "git+https://github.com/tikv/raft-engine.git?branch=tikv-7.5#8cb23f127f0caf48076a3d3d0e4e2a27a8c8b0ed" -dependencies = [ - "clap 3.1.6", - "env_logger 0.10.0", - "raft-engine", -] - [[package]] name = "raft-proto" version = "0.7.0" @@ -5306,6 +5473,32 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "rhai" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" +dependencies = [ + "ahash 0.7.4", + "bitflags", + "instant", + "num-traits", + "rhai_codegen", + "smallvec", + "smartstring", +] + +[[package]] +name = "rhai_codegen" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a39bc2aa9258b282ee5518dac493491a9c4c11a6d7361b9d2644c922fc6488" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.103", +] + [[package]] name = "ring" version = "0.16.16" @@ -5969,6 +6162,17 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check 0.9.4", +] + [[package]] name = "snap_recovery" version = "0.1.0" @@ -6815,7 +7019,7 @@ dependencies = [ "log_wrappers", 
"match-template", "nom 7.1.0", - "num", + "num 0.3.0", "num-derive 0.3.0", "num-traits", "ordered-float", @@ -6877,7 +7081,7 @@ dependencies = [ "hex 0.4.2", "log_wrappers", "match-template", - "num", + "num 0.3.0", "num-traits", "openssl", "panic_hook", @@ -7232,21 +7436,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - [[package]] name = "tipb" version = "0.0.1" @@ -7387,9 +7576,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" @@ -7397,7 +7586,7 @@ version = "0.19.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c500344a19072298cd05a7224b3c0c629348b78692bf48466c5238656e315a78" dependencies = [ - "indexmap 2.0.0", + "indexmap 2.7.0", "toml_datetime", "winnow", ] @@ -7621,9 +7810,12 @@ checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +dependencies = [ + "matches", +] [[package]] name = "unicode-ident" @@ -7633,11 +7825,11 @@ checksum = 
"6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "5479532badd04e128284890390c1e876ef7a993d0570b3597ae43dfa1d59afa4" dependencies = [ - "tinyvec", + "smallvec", ] [[package]] @@ -7666,12 +7858,13 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.4.0" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", + "matches", "percent-encoding", "serde", ] diff --git a/proxy_components/engine_tiflash/Cargo.toml b/proxy_components/engine_tiflash/Cargo.toml index f70029b4732..b218c0d6d9e 100644 --- a/proxy_components/engine_tiflash/Cargo.toml +++ b/proxy_components/engine_tiflash/Cargo.toml @@ -83,6 +83,7 @@ yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } git = "https://github.com/tikv/rust-rocksdb.git" package = "rocksdb" features = ["encryption"] +branch = "tikv-7.5" [dev-dependencies] rand = "0.8" diff --git a/proxy_components/engine_tiflash/src/cf_options.rs b/proxy_components/engine_tiflash/src/cf_options.rs index 1162c67f210..6a2372fb31f 100644 --- a/proxy_components/engine_tiflash/src/cf_options.rs +++ b/proxy_components/engine_tiflash/src/cf_options.rs @@ -40,6 +40,23 @@ impl RocksCfOptions { pub fn into_raw(self) -> RawCfOptions { self.0 } + + pub fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + pub fn get_flush_size(&self) -> 
Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } impl Deref for RocksCfOptions { diff --git a/proxy_components/engine_tiflash/src/compact.rs b/proxy_components/engine_tiflash/src/compact.rs index 199b7d9f3be..f64c9a7d49e 100644 --- a/proxy_components/engine_tiflash/src/compact.rs +++ b/proxy_components/engine_tiflash/src/compact.rs @@ -121,6 +121,10 @@ impl CompactExt for RocksEngine { db.compact_files_cf(handle, &opts, &files, output_level) .map_err(r2e) } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.as_inner().check_in_range(start, end).map_err(r2e) + } } #[cfg(test)] diff --git a/proxy_components/engine_tiflash/src/db_options.rs b/proxy_components/engine_tiflash/src/db_options.rs index c9ef2cfda98..c95f81f8297 100644 --- a/proxy_components/engine_tiflash/src/db_options.rs +++ b/proxy_components/engine_tiflash/src/db_options.rs @@ -100,6 +100,14 @@ impl DbOptions for RocksDbOptions { Ok(()) } + fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { if let Some(m) = self.0.get_write_buffer_manager() { m.set_flush_oldest_first(f); @@ -112,6 +120,10 @@ impl DbOptions for RocksDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } + + fn set_track_and_verify_wals_in_manifest(&mut self, v: bool) { + self.0.set_track_and_verify_wals_in_manifest(v) + } } pub struct RocksTitanDbOptions(RawTitanDBOptions); diff --git a/proxy_components/engine_tiflash/src/misc.rs b/proxy_components/engine_tiflash/src/misc.rs index 29d665ce563..c043e9b2050 100644 --- a/proxy_components/engine_tiflash/src/misc.rs +++ b/proxy_components/engine_tiflash/src/misc.rs @@ -1,16 
+1,20 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::{get_range_stats, STORE_ENGINE_EVENT_COUNTER_VEC}; + use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, Range, RangeStats, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, - WriteOptions, ALL_CFS, + WriteOptions, }; use rocksdb::{FlushOptions, Range as RocksRange}; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, r2e, rocks_metrics::RocksStatisticsReporter, rocks_metrics_defs::*, - sst::RocksSstWriterBuilder, util, RocksSstWriter, + engine::RocksEngine, + r2e, + rocks_metrics::{RocksStatisticsReporter, STORE_ENGINE_EVENT_COUNTER_VEC}, + rocks_metrics_defs::*, + sst::RocksSstWriterBuilder, + util, RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -315,7 +319,7 @@ impl MiscExt for RocksEngine { fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; - for cf in ALL_CFS { + for cf in self.cf_names() { let handle = util::get_cf_handle(self.as_inner(), cf)?; used_size += util::get_engine_cf_used_size(self.as_inner(), handle); } @@ -330,16 +334,26 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn disable_manual_compaction(&self) -> Result<()> { + self.as_inner().disable_manual_compaction(); + Ok(()) + } + + fn enable_manual_compaction(&self) -> Result<()> { + self.as_inner().enable_manual_compaction(); + Ok(()) + } + fn pause_background_work(&self) -> Result<()> { // This will make manual compaction return error instead of waiting. In practice // we might want to identify this case by parsing error message. 
- self.as_inner().disable_manual_compaction(); + self.disable_manual_compaction()?; self.as_inner().pause_bg_work(); Ok(()) } fn continue_background_work(&self) -> Result<()> { - self.as_inner().enable_manual_compaction(); + self.enable_manual_compaction()?; self.as_inner().continue_bg_work(); Ok(()) } @@ -411,7 +425,7 @@ impl MiscExt for RocksEngine { } fn get_range_stats(&self, cf: &str, start: &[u8], end: &[u8]) -> Result> { - Ok(get_range_stats(&self.rocks, cf, start, end)) + Ok(crate::properties::get_range_stats(self, cf, start, end)) } fn is_stalled_or_stopped(&self) -> bool { @@ -449,7 +463,8 @@ impl MiscExt for RocksEngine { #[cfg(test)] mod tests { use engine_traits::{ - DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, ALL_CFS, + CompactExt, DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, + ALL_CFS, }; use tempfile::Builder; @@ -505,7 +520,8 @@ mod tests { wb.write().unwrap(); check_data(&db, ALL_CFS, kvs.as_slice()); - db.delete_ranges_cfs(strategy, ranges).unwrap(); + db.delete_ranges_cfs(&WriteOptions::default(), strategy, ranges) + .unwrap(); let mut kvs_left: Vec<_> = kvs; for r in ranges { @@ -643,10 +659,18 @@ mod tests { } check_data(&db, ALL_CFS, kvs.as_slice()); - db.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[Range::new(b"k2", b"k4")]) - .unwrap(); - db.delete_ranges_cfs(DeleteStrategy::DeleteBlobs, &[Range::new(b"k2", b"k4")]) - .unwrap(); + db.delete_ranges_cfs( + &WriteOptions::default(), + DeleteStrategy::DeleteFiles, + &[Range::new(b"k2", b"k4")], + ) + .unwrap(); + db.delete_ranges_cfs( + &WriteOptions::default(), + DeleteStrategy::DeleteBlobs, + &[Range::new(b"k2", b"k4")], + ) + .unwrap(); check_data(&db, ALL_CFS, kvs_left.as_slice()); } @@ -691,10 +715,119 @@ mod tests { // Delete all in ["k2", "k4"). 
db.delete_ranges_cfs( + &WriteOptions::default(), DeleteStrategy::DeleteByRange, &[Range::new(b"kabcdefg2", b"kabcdefg4")], ) .unwrap(); check_data(&db, &[cf], kvs_left.as_slice()); } + + #[test] + fn test_get_sst_key_ranges() { + let path = Builder::new() + .prefix("test_get_sst_key_ranges") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + opts.enable_multi_batch_write(true); + + let mut cf_opts = RocksCfOptions::default(); + // Prefix extractor(trim the timestamp at tail) for write cf. + cf_opts + .set_prefix_extractor( + "FixedSuffixSliceTransform", + crate::util::FixedSuffixSliceTransform::new(8), + ) + .unwrap_or_else(|err| panic!("{:?}", err)); + // Create prefix bloom filter for memtable. + cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); + let cf = "default"; + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![ + (b"k1", b"v1"), + (b"k2", b"v2"), + (b"k6", b"v3"), + (b"k7", b"v4"), + ]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k7".to_vec())]; + assert_eq!(sst_range, expected); + + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"v1"), (b"k4", b"v2"), (b"k8", b"v3")]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![ + (b"k3".to_vec(), b"k8".to_vec()), + (b"k1".to_vec(), b"k7".to_vec()), + ]; + assert_eq!(sst_range, expected); + + db.compact_range_cf(cf, None, None, false, 1).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + assert_eq!(sst_range.len(), 0); + let 
sst_range = db.get_sst_key_ranges(cf, 1).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k8".to_vec())]; + assert_eq!(sst_range, expected); + } + + #[test] + fn test_flush_oldest() { + let path = Builder::new() + .prefix("test_flush_oldest") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + + let db = new_engine(path_str, ALL_CFS).unwrap(); + db.put_cf("default", b"k", b"v").unwrap(); + std::thread::sleep(std::time::Duration::from_secs(1)); + db.put_cf("write", b"k", b"v").unwrap(); + db.put_cf("lock", b"k", b"v").unwrap(); + assert_eq!( + db.get_total_sst_files_size_cf("default").unwrap().unwrap(), + 0 + ); + assert_eq!(db.get_total_sst_files_size_cf("write").unwrap().unwrap(), 0); + assert_eq!(db.get_total_sst_files_size_cf("lock").unwrap().unwrap(), 0); + let now = std::time::SystemTime::now(); + assert!( + !db.flush_oldest_cf(true, Some(now - std::time::Duration::from_secs(5))) + .unwrap() + ); + assert_eq!( + db.get_total_sst_files_size_cf("default").unwrap().unwrap(), + 0 + ); + assert_eq!(db.get_total_sst_files_size_cf("write").unwrap().unwrap(), 0); + assert_eq!(db.get_total_sst_files_size_cf("lock").unwrap().unwrap(), 0); + assert!( + db.flush_oldest_cf(true, Some(now - std::time::Duration::from_secs(1))) + .unwrap() + ); + assert_eq!(db.get_total_sst_files_size_cf("write").unwrap().unwrap(), 0); + assert_eq!(db.get_total_sst_files_size_cf("lock").unwrap().unwrap(), 0); + assert!(db.get_total_sst_files_size_cf("default").unwrap().unwrap() > 0); + } } diff --git a/proxy_components/engine_tiflash/src/properties.rs b/proxy_components/engine_tiflash/src/properties.rs index b92fe57c30c..1739fb1036e 100644 --- a/proxy_components/engine_tiflash/src/properties.rs +++ b/proxy_components/engine_tiflash/src/properties.rs @@ -8,7 +8,8 @@ use std::{ u64, }; -use engine_traits::{MvccProperties, Range}; +use api_version::{ApiV2, KeyMode, KvFormat}; +use 
engine_traits::{raw_ttl::ttl_current_ts, MvccProperties, Range, RangeStats}; use rocksdb::{ DBEntryType, TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, UserCollectedProperties, @@ -130,12 +131,6 @@ impl<'a> DecodeProperties for UserCollectedPropertiesDecoder<'a> { } } -#[derive(Debug, Clone, PartialEq, Copy)] -pub enum RangeOffsetKind { - Size, - Keys, -} - #[derive(Debug, Default, Clone, Copy)] pub struct RangeOffsets { pub size: u64, @@ -205,7 +200,6 @@ impl RangeProperties { } /// Returns `size` and `keys`. - #[allow(clippy::redundant_closure)] pub fn get_approximate_distance_in_range(&self, start: &[u8], end: &[u8]) -> (u64, u64) { assert!(start <= end); if start == end { @@ -387,7 +381,8 @@ impl TablePropertiesCollectorFactory for RangeProperti } } -/// Can only be used for write CF. +/// Can be used for write CF in TiDB & TxnKV scenario, or be used for default CF +/// in RawKV scenario. pub struct MvccPropertiesCollector { props: MvccProperties, last_row: Vec, @@ -395,10 +390,12 @@ pub struct MvccPropertiesCollector { row_versions: u64, cur_index_handle: IndexHandle, row_index_handles: IndexHandles, + key_mode: KeyMode, // Use KeyMode::Txn for both TiDB & TxnKV, KeyMode::Raw for RawKV. + current_ts: u64, } impl MvccPropertiesCollector { - fn new() -> MvccPropertiesCollector { + fn new(key_mode: KeyMode) -> MvccPropertiesCollector { MvccPropertiesCollector { props: MvccProperties::new(), last_row: Vec::new(), @@ -406,6 +403,8 @@ impl MvccPropertiesCollector { row_versions: 0, cur_index_handle: IndexHandle::default(), row_index_handles: IndexHandles::new(), + key_mode, + current_ts: ttl_current_ts(), } } } @@ -415,7 +414,10 @@ impl TablePropertiesCollector for MvccPropertiesCollector { // TsFilter filters sst based on max_ts and min_ts during iterating. // To prevent seeing outdated (GC) records, we should consider // RocksDB delete entry type. 
- if entry_type != DBEntryType::Put && entry_type != DBEntryType::Delete { + if entry_type != DBEntryType::Put + && entry_type != DBEntryType::Delete + && entry_type != DBEntryType::BlobIndex + { return; } @@ -453,18 +455,43 @@ impl TablePropertiesCollector for MvccPropertiesCollector { self.props.max_row_versions = self.row_versions; } - let write_type = match Write::parse_type(value) { - Ok(v) => v, - Err(_) => { - self.num_errors += 1; - return; + if entry_type != DBEntryType::BlobIndex { + if self.key_mode == KeyMode::Raw { + let decode_raw_value = ApiV2::decode_raw_value(value); + match decode_raw_value { + Ok(raw_value) => { + if raw_value.is_valid(self.current_ts) { + self.props.num_puts += 1; + } else { + self.props.num_deletes += 1; + } + if let Some(expire_ts) = raw_value.expire_ts { + self.props.ttl.add(expire_ts); + } + } + Err(_) => { + self.num_errors += 1; + } + } + } else { + let write_type = match Write::parse_type(value) { + Ok(v) => v, + Err(_) => { + self.num_errors += 1; + return; + } + }; + + match write_type { + WriteType::Put => self.props.num_puts += 1, + WriteType::Delete => self.props.num_deletes += 1, + _ => {} + } } - }; - - match write_type { - WriteType::Put => self.props.num_puts += 1, - WriteType::Delete => self.props.num_deletes += 1, - _ => {} + } else { + // NOTE: if titan is enabled, the entry will always be treated as PUT. + // Be careful if you try to enable Titan on CF_WRITE. + self.props.num_puts += 1; } // Add new row. @@ -494,22 +521,33 @@ impl TablePropertiesCollector for MvccPropertiesCollector { } } -/// Can only be used for write CF. +/// Can be used for write CF of TiDB/TxnKV, default CF of RawKV. 
#[derive(Default)] pub struct MvccPropertiesCollectorFactory {} impl TablePropertiesCollectorFactory for MvccPropertiesCollectorFactory { fn create_table_properties_collector(&mut self, _: u32) -> MvccPropertiesCollector { - MvccPropertiesCollector::new() + MvccPropertiesCollector::new(KeyMode::Txn) + } +} + +#[derive(Default)] +pub struct RawMvccPropertiesCollectorFactory {} + +impl TablePropertiesCollectorFactory + for RawMvccPropertiesCollectorFactory +{ + fn create_table_properties_collector(&mut self, _: u32) -> MvccPropertiesCollector { + MvccPropertiesCollector::new(KeyMode::Raw) } } -pub fn get_range_entries_and_versions( +pub fn get_range_stats( engine: &crate::RocksEngine, cf: &str, start: &[u8], end: &[u8], -) -> Option<(u64, u64)> { +) -> Option { let range = Range::new(start, end); let collection = match engine.get_properties_of_tables_in_range(cf, &[range]) { Ok(v) => v, @@ -531,12 +569,17 @@ pub fn get_range_entries_and_versions( num_entries += v.num_entries(); props.add(&mvcc); } - - Some((num_entries, props.num_versions)) + Some(RangeStats { + num_entries, + num_versions: props.num_versions, + num_rows: props.num_rows, + num_deletes: props.num_deletes, + }) } #[cfg(test)] mod tests { + use api_version::RawValue; use engine_traits::{MiscExt, SyncMutable, CF_WRITE, LARGE_CFS}; use rand::Rng; use tempfile::Builder; @@ -746,10 +789,9 @@ mod tests { let start_keys = keys::data_key(&[]); let end_keys = keys::data_end_key(&[]); - let (entries, versions) = - get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); - assert_eq!(entries, (cases.len() * 2) as u64); - assert_eq!(versions, cases.len() as u64); + let range_stats = get_range_stats(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); + assert_eq!(range_stats.num_entries, (cases.len() * 2) as u64); + assert_eq!(range_stats.num_versions, cases.len() as u64); } #[test] @@ -765,7 +807,7 @@ mod tests { ("ef", 6, WriteType::Put, DBEntryType::Delete), ("gh", 7, WriteType::Delete, 
DBEntryType::Put), ]; - let mut collector = MvccPropertiesCollector::new(); + let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); for &(key, ts, write_type, entry_type) in &cases { let ts = ts.into(); let k = Key::from_raw(key.as_bytes()).append_ts(ts); @@ -784,6 +826,44 @@ mod tests { assert_eq!(props.max_row_versions, 3); } + #[test] + fn test_mvcc_properties_rawkv_mode() { + let test_raws = vec![ + (b"r\0a", 1, false, u64::MAX), + (b"r\0a", 5, false, u64::MAX), + (b"r\0a", 7, false, u64::MAX), + (b"r\0b", 1, false, u64::MAX), + (b"r\0b", 1, true, u64::MAX), + (b"r\0c", 1, true, 10), + (b"r\0d", 1, true, 10), + ]; + + let mut collector = MvccPropertiesCollector::new(KeyMode::Raw); + for &(key, ts, is_delete, expire_ts) in &test_raws { + let encode_key = ApiV2::encode_raw_key(key, Some(ts.into())); + let k = keys::data_key(encode_key.as_encoded()); + let v = ApiV2::encode_raw_value(RawValue { + user_value: &[0; 10][..], + expire_ts: Some(expire_ts), + is_delete, + }); + collector.add(&k, &v, DBEntryType::Put, 0, 0); + } + + let result = UserProperties(collector.finish()); + + let props = RocksMvccProperties::decode(&result).unwrap(); + assert_eq!(props.min_ts, 1.into()); + assert_eq!(props.max_ts, 7.into()); + assert_eq!(props.num_rows, 4); + assert_eq!(props.num_deletes, 3); + assert_eq!(props.num_puts, 4); + assert_eq!(props.num_versions, 7); + assert_eq!(props.max_row_versions, 3); + assert_eq!(props.ttl.max_expire_ts, Some(u64::MAX)); + assert_eq!(props.ttl.min_expire_ts, Some(10)); + } + #[bench] fn bench_mvcc_properties(b: &mut Bencher) { let ts = 1.into(); @@ -797,7 +877,7 @@ mod tests { entries.push((k, w.as_ref().to_bytes())); } - let mut collector = MvccPropertiesCollector::new(); + let mut collector = MvccPropertiesCollector::new(KeyMode::Txn); b.iter(|| { for &(ref k, ref v) in &entries { collector.add(k, v, DBEntryType::Put, 0, 0); diff --git a/proxy_components/engine_tiflash/src/range_properties.rs 
b/proxy_components/engine_tiflash/src/range_properties.rs index 101a004982a..dfc41db5f6e 100644 --- a/proxy_components/engine_tiflash/src/range_properties.rs +++ b/proxy_components/engine_tiflash/src/range_properties.rs @@ -9,7 +9,7 @@ use tikv_util::{box_err, box_try, debug, info}; use crate::{ engine::RocksEngine, - properties::{get_range_entries_and_versions, RangeProperties}, + properties::{get_range_stats, RangeProperties}, }; impl RangePropertiesExt for RocksEngine { @@ -27,9 +27,8 @@ impl RangePropertiesExt for RocksEngine { let start = &range.start_key; let end = &range.end_key; - let (_, keys) = - get_range_entries_and_versions(self, CF_WRITE, start, end).unwrap_or_default(); - Ok(keys) + let range_stats = get_range_stats(self, CF_WRITE, start, end).unwrap_or_default(); + Ok(range_stats.num_versions) } fn get_range_approximate_keys_cf( diff --git a/proxy_components/engine_tiflash/src/sst_partitioner.rs b/proxy_components/engine_tiflash/src/sst_partitioner.rs index fc1dcd40270..f642a94f28f 100644 --- a/proxy_components/engine_tiflash/src/sst_partitioner.rs +++ b/proxy_components/engine_tiflash/src/sst_partitioner.rs @@ -23,6 +23,8 @@ impl rocksdb::SstPartitionerFactory output_level: context.output_level, smallest_key: context.smallest_key, largest_key: context.largest_key, + next_level_boundaries: context.next_level_boundaries.clone(), + next_level_sizes: context.next_level_sizes.clone(), }; self.0.create_partitioner(&ctx).map(RocksSstPartitioner) } diff --git a/proxy_components/proxy_ffi/Cargo.toml b/proxy_components/proxy_ffi/Cargo.toml index 6b32f62c7f8..6241342f930 100644 --- a/proxy_components/proxy_ffi/Cargo.toml +++ b/proxy_components/proxy_ffi/Cargo.toml @@ -46,10 +46,11 @@ tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } tracker = { workspace = true, default-features = false } reqwest = { version = "0.11", features = ["blocking"] } -url = 
"2.4.0" +url = "2" collections = { workspace = true } [dependencies.rocksdb] git = "https://github.com/tikv/rust-rocksdb.git" package = "rocksdb" -features = ["encryption"] \ No newline at end of file +features = ["encryption"] +branch = "tikv-7.5" \ No newline at end of file