Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

User QoS resource total limit #370

Merged
merged 17 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dependencies/cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ add_subdirectory(mongodb-cxx-driver)
add_subdirectory(ranges-v3)
add_subdirectory(backward-cpp)
add_subdirectory(fpm)
add_subdirectory(parallel-hashmap)


include(${CMAKE_SOURCE_DIR}/CMakeModule/SuppressHeaderWarning.cmake)
Expand Down
11 changes: 11 additions & 0 deletions dependencies/cmake/parallel-hashmap/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
include(FetchContent)

set(HASHMAP_SRC_URL "https://github.com/greg7mdp/parallel-hashmap/archive/refs/tags/v1.4.1.tar.gz")

FetchContent_Declare(parallel-hashmap
URL ${HASHMAP_SRC_URL}
URL_HASH SHA256=949874f4207b8735422438b23b884fb1f4b926689bb5eebff38cc4d357d09cd2
INACTIVITY_TIMEOUT 5
)

FetchContent_MakeAvailable(parallel-hashmap)
13 changes: 8 additions & 5 deletions src/CraneCtld/AccountManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "AccountManager.h"

#include "AccountMetaContainer.h"
#include "protos/PublicDefs.pb.h"
#include "range/v3/algorithm/contains.hpp"

Expand Down Expand Up @@ -832,6 +833,7 @@ AccountManager::CraneExpected<void> AccountManager::ModifyQos(
// Mongodb
Qos qos;
g_db_client->SelectQos("name", name, &qos);

*m_qos_map_[name] = std::move(qos);

return {};
Expand Down Expand Up @@ -969,15 +971,14 @@ std::expected<void, std::string> AccountManager::CheckAndApplyQosLimitOnTask(
} else if (task->time_limit > qos_share_ptr->max_time_limit_per_task)
return std::unexpected("time-limit reached the user's limit.");

if (static_cast<double>(task->cpus_per_task) >
qos_share_ptr->max_cpus_per_user)
return std::unexpected("cpus-per-task reached the user's limit.");
if (!g_account_meta_container->CheckAndMallocQosResourceFromUser(
user_share_ptr->name, *task, *qos_share_ptr))
return std::unexpected("The requested QoS resources have reached the user's limit.");

return {};
}

std::expected<void, std::string> AccountManager::CheckUidIsAdmin(
uint32_t uid) {
std::expected<void, std::string> AccountManager::CheckUidIsAdmin(uint32_t uid) {
util::read_lock_guard user_guard(m_rw_user_mutex_);
auto user_result = GetUserInfoByUidNoLock_(uid);
if (!user_result) {
Expand Down Expand Up @@ -1767,6 +1768,8 @@ AccountManager::CraneExpected<void> AccountManager::DeleteUser_(
m_account_map_[coordinatorAccount]->coordinators.remove(name);
}

g_account_meta_container->DeleteUserResource(name);

m_user_map_[name] = std::make_unique<User>(std::move(res_user));

return {};
Expand Down
74 changes: 74 additions & 0 deletions src/CraneCtld/AccountMetaContainer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
* Copyright (c) 2024 Peking University and Peking University
* Changsha Institute for Computing and Digital Economy
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#include "AccountMetaContainer.h"

#include "AccountManager.h"

namespace Ctld {

bool AccountMetaContainer::CheckAndMallocQosResourceFromUser(
const std::string& username, const TaskInCtld& task, const Qos& qos) {
if (static_cast<double>(task.cpus_per_task) > qos.max_cpus_per_user ||
qos.max_jobs_per_user == 0)
return false;

bool result = true;

ResourceView resource_view{task.requested_node_res_view * task.node_num};

user_meta_map_.try_emplace_l(
username,
[&](std::pair<const std::string, QosToResourceMap>& pair) {
auto& qos_to_resource_map = pair.second;
auto iter = qos_to_resource_map.find(task.qos);
if (iter == qos_to_resource_map.end()) {
qos_to_resource_map.emplace(task.qos, QosResource{std::move(resource_view), 1});
return ;
}

auto& val = iter->second;
if (val.resource.CpuCount() + static_cast<double>(task.cpus_per_task) >
qos.max_cpus_per_user || val.jobs_per_user >= qos.max_jobs_per_user) {
result = false;
return;
}
val.resource.GetAllocatableRes() +=
(task.requested_node_res_view * task.node_num).GetAllocatableRes();
val.jobs_per_user++;
},
QosToResourceMap{{task.qos, QosResource{std::move(resource_view), 1}}});

return result;
}

void AccountMetaContainer::FreeQosResource(const std::string& username,
const TaskInCtld& task) {
user_meta_map_.modify_if(username, [&](std::pair<const std::string, QosToResourceMap>& pair) {
auto& val = pair.second[task.qos];
val.resource.GetAllocatableRes() -=
(task.requested_node_res_view * task.node_num).GetAllocatableRes();
val.jobs_per_user--;
});
}

void AccountMetaContainer::DeleteUserResource(const std::string& username) {
user_meta_map_.erase(username);
}

} // namespace Ctld
55 changes: 55 additions & 0 deletions src/CraneCtld/AccountMetaContainer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* Copyright (c) 2024 Peking University and Peking University
* Changsha Institute for Computing and Digital Economy
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include "CtldPublicDefs.h"
// Precompiled header comes first!

namespace Ctld {

class AccountMetaContainer final {
public:

using QosToResourceMap = std::unordered_map<std::string, // qos_name
QosResource>;

using UserResourceMetaMap = phmap::parallel_flat_hash_map<
std::string, //username
QosToResourceMap, phmap::priv::hash_default_hash<std::string>,
phmap::priv::hash_default_eq<std::string>,
std::allocator<std::pair<const std::string, QosToResourceMap>>, 4, std::shared_mutex>;

AccountMetaContainer() = default;
~AccountMetaContainer() = default;

bool CheckAndMallocQosResourceFromUser(const std::string& username,
const TaskInCtld& task,
const Qos& qos);

void FreeQosResource(const std::string& username, const TaskInCtld& task);

void DeleteUserResource(const std::string& username);

private:
UserResourceMetaMap user_meta_map_;
};

inline std::unique_ptr<Ctld::AccountMetaContainer> g_account_meta_container;

} // namespace Ctld
4 changes: 4 additions & 0 deletions src/CraneCtld/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ add_executable(cranectld
CranedMetaContainer.cpp
AccountManager.h
AccountManager.cpp
AccountMetaContainer.h
AccountMetaContainer.cpp
EmbeddedDbClient.cpp
EmbeddedDbClient.h
CraneCtld.cpp
Expand All @@ -38,6 +40,8 @@ target_link_libraries(cranectld PRIVATE
absl::synchronization
absl::flat_hash_map

phmap

crane_proto_lib

bs_thread_pool
Expand Down
3 changes: 3 additions & 0 deletions src/CraneCtld/CraneCtld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <filesystem>

#include "AccountManager.h"
#include "AccountMetaContainer.h"
#include "CranedKeeper.h"
#include "CranedMetaContainer.h"
#include "CtldGrpcServer.h"
Expand Down Expand Up @@ -683,6 +684,8 @@ void InitializeCtldGlobalVariables() {
g_meta_container = std::make_unique<CranedMetaContainer>();
g_meta_container->InitFromConfig(g_config);

g_account_meta_container = std::make_unique<AccountMetaContainer>();

bool ok;
g_embedded_db_client = std::make_unique<Ctld::EmbeddedDbClient>();
ok = g_embedded_db_client->Init(g_config.CraneCtldDbPath);
Expand Down
3 changes: 3 additions & 0 deletions src/CraneCtld/CtldPreCompiledHeader.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@
#include <absl/synchronization/mutex.h>
#include <absl/time/time.h> // NOLINT(modernize-deprecated-headers)

// parallel-hashmap
#include <parallel_hashmap/phmap.h>

// Thread pool
#include <BS_thread_pool.hpp>

Expand Down
5 changes: 5 additions & 0 deletions src/CraneCtld/CtldPublicDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,11 @@ inline bool CheckIfTimeLimitIsValid(absl::Duration d) {
return CheckIfTimeLimitSecIsValid(sec);
}

struct QosResource {
ResourceView resource;
uint32_t jobs_per_user;
};

} // namespace Ctld

inline std::unique_ptr<BS::thread_pool> g_thread_pool;
5 changes: 3 additions & 2 deletions src/CraneCtld/TaskScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "TaskScheduler.h"

#include "AccountManager.h"
#include "AccountMetaContainer.h"
#include "CranedKeeper.h"
#include "CranedMetaContainer.h"
#include "CtldPublicDefs.h"
Expand Down Expand Up @@ -521,7 +522,6 @@ void TaskScheduler::PutRecoveredTaskIntoRunningQueueLock_(
for (const CranedId& craned_id : task->CranedIds())
g_meta_container->MallocResourceFromNode(craned_id, task->TaskId(),
task->Resources());

// The order of LockGuards matters.
LockGuard running_guard(&m_running_task_map_mtx_);
LockGuard indexes_guard(&m_task_indexes_mtx_);
Expand Down Expand Up @@ -971,6 +971,7 @@ void TaskScheduler::ScheduleThread_() {
auto& task = it.first;
for (CranedId const& craned_id : task->CranedIds())
g_meta_container->FreeResourceFromNode(craned_id, task->TaskId());
g_account_meta_container->FreeQosResource(task->Username(), *task);
}

// Construct the map for cgroups to be released of all failed tasks
Expand Down Expand Up @@ -1690,6 +1691,7 @@ void TaskScheduler::CleanTaskStatusChangeQueueCb_() {
for (CranedId const& craned_id : task->CranedIds()) {
g_meta_container->FreeResourceFromNode(craned_id, task_id);
}
g_account_meta_container->FreeQosResource(task->Username(), *task);

task_raw_ptr_vec.emplace_back(task.get());
task_ptr_vec.emplace_back(std::move(task));
Expand Down Expand Up @@ -2528,7 +2530,6 @@ void MinLoadFirst::NodeSelect(
for (CranedId const& craned_id : craned_ids)
g_meta_container->MallocResourceFromNode(craned_id, task->TaskId(),
task->Resources());

std::unique_ptr<TaskInCtld> moved_task;

// Move task out of pending_task_map and insert it to the
Expand Down
Loading