diff --git a/curvefs/src/mds/heartbeat/heartbeat_manager.cpp b/curvefs/src/mds/heartbeat/heartbeat_manager.cpp
index c41864c71c..5bd9d4bccf 100644
--- a/curvefs/src/mds/heartbeat/heartbeat_manager.cpp
+++ b/curvefs/src/mds/heartbeat/heartbeat_manager.cpp
@@ -43,12 +43,13 @@ namespace heartbeat {
 HeartbeatManager::HeartbeatManager(
     const HeartbeatOption &option, const std::shared_ptr<Topology> &topology,
     const std::shared_ptr<Coordinator> &coordinator,
+    const std::shared_ptr<TopologyManager> &topologyManager,
     const std::shared_ptr<SpaceManager> &spaceManager)
     : topology_(topology), spaceManager_(spaceManager) {
     healthyChecker_ =
         std::make_shared<HeartbeatHealthyChecker>(option, topology);
 
-    topoUpdater_ = std::make_shared<TopoUpdater>(topology);
+    topoUpdater_ = std::make_shared<TopoUpdater>(topology, topologyManager);
 
     copysetConfGenerator_ = std::make_shared<CopysetConfGenerator>(
         topology, coordinator, option.mdsStartTime,
diff --git a/curvefs/src/mds/heartbeat/heartbeat_manager.h b/curvefs/src/mds/heartbeat/heartbeat_manager.h
index 29eded4beb..7244fe83fa 100644
--- a/curvefs/src/mds/heartbeat/heartbeat_manager.h
+++ b/curvefs/src/mds/heartbeat/heartbeat_manager.h
@@ -68,6 +68,7 @@ class HeartbeatManager {
     HeartbeatManager(const HeartbeatOption &option,
                      const std::shared_ptr<Topology> &topology,
                      const std::shared_ptr<Coordinator> &coordinator,
+                     const std::shared_ptr<TopologyManager> &topologyManager,
                      const std::shared_ptr<SpaceManager> &spaceManager);
 
     ~HeartbeatManager() { Stop(); }
diff --git a/curvefs/src/mds/heartbeat/topo_updater.cpp b/curvefs/src/mds/heartbeat/topo_updater.cpp
index 5d7351a180..1686043b28 100644
--- a/curvefs/src/mds/heartbeat/topo_updater.cpp
+++ b/curvefs/src/mds/heartbeat/topo_updater.cpp
@@ -222,6 +222,31 @@ void TopoUpdater::UpdatePartitionTopo(
             LOG(WARNING) << "hearbeat report partition which is not in topo"
                          << ", copysetId = " << copySetId
                          << ", partitionId = " << it.GetPartitionId();
+
+            // The reported partition is unknown to topology: fetch the
+            // members of its copyset and ask them to delete it.
+            const int maxRetries = 3;
+            int retries = 0;
+            std::set<std::string> copysetMemberAddr;
+            TopoStatusCode ret;
+            do {
+                ret = topologyManager_->GetCopysetMembers(
+                    it.GetPoolId(), copySetId, &copysetMemberAddr);
+                if (ret == TopoStatusCode::TOPO_OK) {
+                    break;
+                }
+                ++retries;
+            } while (retries < maxRetries);
+
+            if (ret != TopoStatusCode::TOPO_OK) {
+                LOG(ERROR) << "GetCopysetMembers failed, poolId = "
+                           << it.GetPoolId()
+                           << ", copysetId = " << copySetId;
+            } else {
+                topologyManager_->DeleteAbnormalPartition(
+                    it.GetPoolId(), copySetId, it.GetPartitionId(),
+                    copysetMemberAddr);
+            }
             continue;
         }
 
diff --git a/curvefs/src/mds/heartbeat/topo_updater.h b/curvefs/src/mds/heartbeat/topo_updater.h
index 921972aea2..a9392033d9 100644
--- a/curvefs/src/mds/heartbeat/topo_updater.h
+++ b/curvefs/src/mds/heartbeat/topo_updater.h
@@ -37,7 +37,9 @@ namespace heartbeat {
 using curvefs::mds::topology::CopySetIdType;
 class TopoUpdater {
  public:
-    explicit TopoUpdater(const std::shared_ptr<Topology> &topo) : topo_(topo) {}
+    explicit TopoUpdater(const std::shared_ptr<Topology> &topo,
+                         const std::shared_ptr<TopologyManager> &topologyManager)
+        : topo_(topo), topologyManager_(topologyManager) {}
     ~TopoUpdater() {}
 
     /*
@@ -65,6 +67,8 @@ class TopoUpdater {
 
  private:
     std::shared_ptr<Topology> topo_;
+    // resolves copyset members and deletes partitions unknown to topology
+    std::shared_ptr<TopologyManager> topologyManager_;
 };
 }  // namespace heartbeat
 }  // namespace mds
diff --git a/curvefs/src/mds/mds.cpp b/curvefs/src/mds/mds.cpp
index 6e5d8644b5..3a13442bc0 100644
--- a/curvefs/src/mds/mds.cpp
+++ b/curvefs/src/mds/mds.cpp
@@ -422,7 +422,8 @@ void MDS::InitHeartbeatManager() {
     heartbeatOption.mdsStartTime = steady_clock::now();
 
     heartbeatManager_ = std::make_shared<HeartbeatManager>(
-        heartbeatOption, topology_, coordinator_, spaceManager_);
+        heartbeatOption, topology_, coordinator_, topologyManager_,
+        spaceManager_);
     heartbeatManager_->Init();
 }
 
diff --git a/curvefs/src/mds/topology/topology_manager.cpp b/curvefs/src/mds/topology/topology_manager.cpp
index c546857bbc..852c414ff7 100644
--- a/curvefs/src/mds/topology/topology_manager.cpp
+++ b/curvefs/src/mds/topology/topology_manager.cpp
@@ -765,6 +765,19 @@ TopoStatusCode TopologyManager::DeletePartition(uint32_t partitionId) {
     return TopoStatusCode::TOPO_OK;
 }
 
+void TopologyManager::DeleteAbnormalPartition(
+    uint32_t poolId, uint32_t copysetId, uint32_t partitionId,
+    const std::set<std::string> &addrs) {
+    // No status is returned to the caller: a failed delete is only logged.
+    auto fret = metaserverClient_->DeletePartition(poolId, copysetId,
+                                                   partitionId, addrs);
+    if (fret != FSStatusCode::OK) {
+        LOG(ERROR) << "Failed to delete partition. PoolId: " << poolId
+                   << ", CopysetId: " << copysetId
+                   << ", PartitionId: " << partitionId;
+    }
+}
+
 void TopologyManager::DeletePartition(const DeletePartitionRequest *request,
                                       DeletePartitionResponse *response) {
     uint32_t partitionId = request->partitionid();
@@ -1258,6 +1271,10 @@ void TopologyManager::GetTopology(ListTopologyResponse *response) {
     ListMetaserverOfCluster(response->mutable_metaservers());
 }
 
+std::shared_ptr<MetaserverClient> TopologyManager::GetMetaserverClient() {
+    return metaserverClient_;
+}
+
 void TopologyManager::ListZone(ListZoneResponse *response) {
     response->set_statuscode(TopoStatusCode::TOPO_OK);
     auto zoneIdVec = topology_->GetZoneInCluster();
diff --git a/curvefs/src/mds/topology/topology_manager.h b/curvefs/src/mds/topology/topology_manager.h
index d31be90b9a..a508ec98d7 100644
--- a/curvefs/src/mds/topology/topology_manager.h
+++ b/curvefs/src/mds/topology/topology_manager.h
@@ -109,6 +109,18 @@ class TopologyManager {
     virtual void CreatePartitions(const CreatePartitionRequest *request,
                                   CreatePartitionResponse *response);
 
+    /**
+     * @brief delete a partition that heartbeat reported but topology does
+     *        not know about; a failure is logged, not returned
+     * @param poolId pool id passed to the metaserver client
+     * @param copysetId copyset the partition was reported under
+     * @param partitionId partition to delete
+     * @param addrs copyset member addresses, passed to the metaserver client
+     */
+    virtual void DeleteAbnormalPartition(uint32_t poolId, uint32_t copysetId,
+                                         uint32_t partitionId,
+                                         const std::set<std::string> &addrs);
+
     virtual void DeletePartition(const DeletePartitionRequest *request,
                                  DeletePartitionResponse *response);
 
@@ -162,6 +174,9 @@ class TopologyManager {
 
     virtual void GetTopology(ListTopologyResponse* response);
 
+    // returns the metaserver client held by this manager
+    virtual std::shared_ptr<MetaserverClient> GetMetaserverClient();
+
     virtual void ListZone(ListZoneResponse* response);
 
     virtual void ListServer(ListServerResponse* response);
diff --git a/curvefs/test/mds/heartbeat/topo_update_test.cpp b/curvefs/test/mds/heartbeat/topo_update_test.cpp
index 4ef911fd58..0557a64c3c 100644
--- a/curvefs/test/mds/heartbeat/topo_update_test.cpp
+++ b/curvefs/test/mds/heartbeat/topo_update_test.cpp
@@ -197,6 +197,15 @@ TEST_F(TestTopoUpdater, test_UpdatePartitionTopo_case4) {
 
     EXPECT_CALL(*topology_, GetPartition(_, _)).WillOnce(Return(false));
 
+    std::set<std::string> copysetMemberAddr;
+    EXPECT_CALL(*topologyManager_, GetCopysetMembers(_, _, _))
+        .Times(AtMost(3))
+        .WillRepeatedly(DoAll(SetArgPointee<2>(copysetMemberAddr),
+                              Return(TopoStatusCode::TOPO_OK)));
+
+    EXPECT_CALL(*topologyManager_, DeleteAbnormalPartition(_, _, _, _))
+        .Times(1);
+
     std::list<::curvefs::mds::topology::Partition> partitionList;
     partitionList.push_back(partition);
 