Skip to content

Commit

Permalink
add PreparingSleep status
Browse files Browse the repository at this point in the history
  • Loading branch information
huyongqii committed Dec 30, 2024
1 parent 2fdf0ed commit 9b407e5
Show file tree
Hide file tree
Showing 27 changed files with 1,301 additions and 707 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@ cmake-build-*
build/
.cache/

.cache/*
build/*

third_party/*

# Header only
Expand Down
1 change: 0 additions & 1 deletion dependencies/cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ add_subdirectory(mongodb-cxx-driver)
add_subdirectory(ranges-v3)
add_subdirectory(backward-cpp)
add_subdirectory(fpm)
add_subdirectory(influxdb-cxx)
add_subdirectory(fast-cpp-csv-parser)
#add_subdirectory(mariadb-connector-c)

Expand Down
5 changes: 2 additions & 3 deletions dependencies/cmake/fast-cpp-csv-parser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,22 @@ set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
include(FetchContent)

if (CRANE_USE_GITEE_SOURCE)
set(CSV_PARSER_URL "https://gitee.com/mirrors/fast-cpp-csv-parser/raw/master/csv.h")
# TODO: replace with gitee source
else ()
set(CSV_PARSER_URL "https://raw.githubusercontent.com/ben-strasser/fast-cpp-csv-parser/master/csv.h")
endif ()

FetchContent_Declare(
fast_cpp_csv_parser
URL ${CSV_PARSER_URL}
DOWNLOAD_NO_EXTRACT TRUE # because it is a single header file
DOWNLOAD_NO_EXTRACT TRUE
INACTIVITY_TIMEOUT 5
)

FetchContent_GetProperties(fast_cpp_csv_parser)
if(NOT fast_cpp_csv_parser_POPULATED)
FetchContent_Populate(fast_cpp_csv_parser)

# Create the INTERFACE library
add_library(fast_cpp_csv_parser INTERFACE)
target_include_directories(fast_cpp_csv_parser INTERFACE
${fast_cpp_csv_parser_SOURCE_DIR}
Expand Down
22 changes: 0 additions & 22 deletions dependencies/cmake/influxdb-cxx/CMakeLists.txt

This file was deleted.

14 changes: 14 additions & 0 deletions etc/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,24 @@ Nodes:
- name: "cn[15-16]"
cpu: 2
memory: 2G
bmc:
ip: "10.10.10.10"
username: "ADMIN"
password: "ADMIN"
ssh:
username: "root"
password: "root"

- name: "cn[17-18]"
cpu: 2
memory: 2G
bmc:
ip: "10.10.10.11"
username: "ADMIN"
password: "ADMIN"
ssh:
username: "root"
password: "root"
gres:
- name: gpu
type: a100
Expand Down
7 changes: 7 additions & 0 deletions etc/database.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,10 @@ DbHost: localhost
DbPort: 27017
DbReplSetName: crane_rs
DbName: crane_db

# InfluxDB settings
InfluxDbUrl: YOUR_INFLUXDB_URL
InfluxDbToken: "YOUR_INFLUXDB_TOKEN"
InfluxDbOrg: YOUR_INFLUXDB_ORG
InfluxDbNodeBucket: YOUR_INFLUXDB_NODE_BUCKET
InfluxDbTaskBucket: YOUR_INFLUXDB_TASK_BUCKET
29 changes: 23 additions & 6 deletions protos/Crane.proto
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,22 @@ message QueryCranedRemoteMetaReply {
CranedRemoteMeta craned_remote_meta = 2;
}

// Request message of the Craned.QueryCranedNICInfo RPC. It has no fields.
message QueryCranedNICInfoRequest {}

// Reply message of the Craned.QueryCranedNICInfo RPC.
// The ctld-side stub copies interface_name and mac_address into
// CranedRemoteMeta::nic.
message QueryCranedNICInfoReply{
// NOTE(review): presumably true when the craned node resolved its NIC
// information successfully -- confirm against the craned-side handler.
bool ok = 1;
// Name of the network interface reported by the craned node.
string interface_name = 2;
// MAC address reported for that interface.
// NOTE(review): textual format (e.g. colon-separated hex) is not visible
// here -- confirm with the producer.
string mac_address = 3;
// NOTE(review): presumably a human-readable failure reason that is only
// meaningful when ok is false -- confirm.
string error_message = 4;
}

// Request message of the Craned.SuspendCraned RPC. It has no fields.
message SuspendCranedRequest {}

// Reply message of the Craned.SuspendCraned RPC.
message SuspendCranedReply {
// NOTE(review): presumably true when the craned node accepted the suspend
// request -- confirm against the craned-side handler.
bool ok = 1;
// NOTE(review): presumably a human-readable failure reason that is only
// meaningful when ok is false -- confirm.
string error_message = 2;
}

message StreamCrunRequest {
enum CrunRequestType {
TASK_REQUEST = 0;
Expand Down Expand Up @@ -807,6 +823,10 @@ service CraneCtld {
rpc QueryPartitionInfo(QueryPartitionInfoRequest) returns (QueryPartitionInfoReply);
rpc ModifyTask(ModifyTaskRequest) returns (ModifyTaskReply);
rpc ModifyNode(ModifyCranedStateRequest) returns (ModifyCranedStateReply);
rpc SleepCraned(SleepCranedRequest) returns (SleepCranedReply);
rpc WakeupCraned(WakeupCranedRequest) returns (WakeupCranedReply);
rpc ShutdownCraned(ShutdownCranedRequest) returns (ShutdownCranedReply);
rpc PowerOnCraned(PowerOnCranedRequest) returns (PowerOnCranedReply);

/* RPCs called from cacctmgr */
rpc AddAccount(AddAccountRequest) returns (AddAccountReply);
Expand All @@ -832,11 +852,6 @@ service CraneCtld {

/* common RPCs */
rpc QueryTasksInfo(QueryTasksInfoRequest) returns (QueryTasksInfoReply);

rpc SleepCraned(SleepCranedRequest) returns (SleepCranedReply) {}
rpc WakeupCraned(WakeupCranedRequest) returns (WakeupCranedReply) {}
rpc ShutdownCraned(ShutdownCranedRequest) returns (ShutdownCranedReply) {}
rpc PowerOnCraned(PowerOnCranedRequest) returns (PowerOnCranedReply) {}
}

service Craned {
Expand All @@ -849,7 +864,9 @@ service Craned {
rpc ReleaseCgroupForTasks(ReleaseCgroupForTasksRequest) returns(ReleaseCgroupForTasksReply);

rpc QueryCranedRemoteMeta(QueryCranedRemoteMetaRequest) returns(QueryCranedRemoteMetaReply);

rpc QueryCranedNICInfo(QueryCranedNICInfoRequest) returns (QueryCranedNICInfoReply);
rpc SuspendCraned(SuspendCranedRequest) returns (SuspendCranedReply);

/*
If the task is an interactive task, the resource uuid is also revoked.
If there's no process in this interactive task, just deallocate all the resources.
Expand Down
7 changes: 4 additions & 3 deletions protos/PublicDefs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,10 @@ enum CranedResourceState {
CRANE_MIX = 2;

// Not Running
CRANE_SLEEPED = 4;
CRANE_SHUTDOWN = 5;
CRANE_WAKING_UP = 6;
CRANE_SLEEPED = 3;
CRANE_SHUTDOWN = 4;
CRANE_WAKING_UP = 5;
CRANE_PREPARING_SLEEP = 6;
CRANE_POWERING_UP = 7;
CRANE_SHUTTING_DOWN = 8;
CRANE_UNKNOWN = 9;
Expand Down
1 change: 0 additions & 1 deletion src/CraneCtld/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ target_link_libraries(cranectld PRIVATE

Backward::Interface

influxdb_cpp
CURL::libcurl
fast_cpp_csv_parser
ssh
Expand Down
5 changes: 1 addition & 4 deletions src/CraneCtld/CraneCtld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,17 +362,14 @@ void ParseConfig(int argc, char** argv) {

if (node["bmc"]) {
auto bmc = node["bmc"].as<YAML::Node>();
// TODO: Communicate how to set the BMC IP address format.
node_ptr->bmc.ip = bmc["ip"].as<std::string>();
node_ptr->bmc.port = bmc["port"].as<uint32_t>();
node_ptr->bmc.username = bmc["username"].as<std::string>();
node_ptr->bmc.password = bmc["password"].as<std::string>();
node_ptr->bmc.interface = bmc["interface"].as<std::string>();
}

if (node["ssh"]) {
auto ssh = node["ssh"].as<YAML::Node>();
node_ptr->ssh.ip = ssh["ip"].as<std::string>();
node_ptr->ssh.port = ssh["port"].as<uint32_t>();
node_ptr->ssh.username = ssh["username"].as<std::string>();
node_ptr->ssh.password = ssh["password"].as<std::string>();
}
Expand Down
41 changes: 41 additions & 0 deletions src/CraneCtld/CranedKeeper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,46 @@ CraneErr CranedStub::QueryCranedRemoteMeta(CranedRemoteMeta *meta) {
return CraneErr::kGenericFailure;
}

// Queries the craned node behind this stub for its NIC information through
// the QueryCranedNICInfo RPC and stores the reported interface name and MAC
// address in meta->nic.
//
// @param meta  Destination for the NIC information. It is only written when
//              the call succeeds.
// @return CraneErr::kOk             the RPC succeeded and the node reported
//                                   ok; meta->nic has been updated.
//         CraneErr::kRpcFailure     the gRPC call itself failed.
//         CraneErr::kGenericFailure meta is null, or the node answered with
//                                   ok == false (its error_message is logged).
CraneErr CranedStub::QueryCranedNICInfo(CranedRemoteMeta *meta) {
  using crane::grpc::QueryCranedNICInfoReply;
  using crane::grpc::QueryCranedNICInfoRequest;

  if (meta == nullptr) {
    CRANE_ERROR("Failed to query NIC info for craned {}: null meta pointer",
                m_craned_id_);
    return CraneErr::kGenericFailure;
  }

  ClientContext context;
  QueryCranedNICInfoRequest request;
  QueryCranedNICInfoReply reply;

  const Status status = m_stub_->QueryCranedNICInfo(&context, request, &reply);
  if (!status.ok()) {
    CRANE_ERROR("Failed to query NIC info for craned {}: {}", m_craned_id_,
                status.error_message());
    return CraneErr::kRpcFailure;
  }

  // A successful transport status only means the RPC was delivered. The node
  // reports its own outcome in the reply, so do not copy NIC fields from a
  // reply that is flagged as failed.
  if (!reply.ok()) {
    CRANE_ERROR("Failed to query NIC info for craned {}: {}", m_craned_id_,
                reply.error_message());
    return CraneErr::kGenericFailure;
  }

  meta->nic.interface_name = reply.interface_name();
  meta->nic.mac_address = reply.mac_address();
  return CraneErr::kOk;
}

// Sends the SuspendCraned RPC to the craned node behind this stub.
//
// @return CraneErr::kOk             the RPC succeeded and the node reported ok.
//         CraneErr::kRpcFailure     the gRPC call itself failed.
//         CraneErr::kGenericFailure the node answered with ok == false (its
//                                   error_message is logged).
CraneErr CranedStub::SuspendCraned() {
  using crane::grpc::SuspendCranedReply;
  using crane::grpc::SuspendCranedRequest;

  ClientContext context;
  SuspendCranedRequest request;
  SuspendCranedReply reply;

  const Status status = m_stub_->SuspendCraned(&context, request, &reply);
  if (!status.ok()) {
    CRANE_ERROR("SuspendCraned RPC for Node {} returned with status not ok: {}",
                m_craned_id_, status.error_message());
    return CraneErr::kRpcFailure;
  }

  // The transport succeeded, but the node may still have refused or failed
  // the request. Surface that instead of reporting success.
  if (!reply.ok()) {
    CRANE_ERROR("SuspendCraned for Node {} was rejected by craned: {}",
                m_craned_id_, reply.error_message());
    return CraneErr::kGenericFailure;
  }

  return CraneErr::kOk;
}

crane::grpc::ExecuteTasksRequest CranedStub::NewExecuteTasksRequests(
const CranedId &craned_id, const std::vector<TaskInCtld *> &tasks) {
crane::grpc::ExecuteTasksRequest request;
Expand Down Expand Up @@ -726,6 +766,7 @@ void CranedKeeper::ConnectCranedNode_(CranedId const &craned_id) {
craned->m_stub_ = crane::grpc::Craned::NewStub(craned->m_channel_);

craned->m_craned_id_ = craned_id;
craned->m_craned_ip_ = ip_addr;
craned->m_clean_up_cb_ = CranedChannelConnectFail_;

CqTag *tag = m_tag_sync_allocator_->new_object<CqTag>(
Expand Down
7 changes: 7 additions & 0 deletions src/CraneCtld/CranedKeeper.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,14 @@ class CranedStub {

CraneErr QueryCranedRemoteMeta(CranedRemoteMeta *meta);

CraneErr QueryCranedNICInfo(CranedRemoteMeta *meta);

CraneErr SuspendCraned();

bool Invalid() const { return m_invalid_; }

// Returns a copy of the IP address string stored in m_craned_ip_.
std::string GetCranedIp() const { return m_craned_ip_; }

private:
CranedKeeper *m_craned_keeper_;

Expand All @@ -77,6 +83,7 @@ class CranedStub {
uint32_t m_failure_retry_times_;

CranedId m_craned_id_;
std::string m_craned_ip_;

// void* parameter is m_data_. Used to free m_data_ when CranedStub is being
// destructed.
Expand Down
Loading

0 comments on commit 9b407e5

Please sign in to comment.