Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/bench_vmm/bench_vmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cuda.h>
#include <cuda_runtime.h>

#include "cuda_utils.hpp"
#include "gpu_utils.hpp"
Comment thread
jiarong0907 marked this conversation as resolved.

// Benchmark configuration constants.
static constexpr int kNumThds = 1;
static constexpr size_t kPageSize = 2ul << 20; // 2 MiB (2 * 2^20)
Expand Down
89 changes: 43 additions & 46 deletions csrc/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@

#include <memory>
#include <mutex>
#include <torch/extension.h>
#include <unordered_map>

#include "allocator.hpp"
#include "constants.hpp"
#include "cuda_utils.hpp"
#include "ftensor.hpp"
#include "gpu_utils.hpp"
#include "page.hpp"
#include "torch_utils.hpp"

namespace kvcached {
// Global configurable page size
Expand All @@ -20,32 +18,42 @@ size_t kPageSize = 2 * 1024 * 1024; // Default 2MB
// Out-of-class definitions of FTensorAllocator's static members: the owning
// pointer that init() assigns a freshly constructed allocator to, and a mutex
// declared alongside it (presumably guards g_allocator_ — confirm at the use
// sites).
std::unique_ptr<FTensorAllocator> FTensorAllocator::g_allocator_;
std::mutex FTensorAllocator::g_allocator_mutex_;

// Builds a shared_ptr<Page> for the given device: a GPUPage when
// dev.is_cuda() holds, a CPUPage when dev.is_cpu() holds. Any other device
// type trips the ASSERT at the bottom; the trailing `return nullptr` is only
// reached if that ASSERT does not terminate the process.
//
// page_id and page_size are forwarded unchanged to the page constructor.
static inline std::shared_ptr<Page> make_shared_page(const torch::Device &dev,
static inline std::shared_ptr<Page> make_shared_page(const c10::Device &dev,
page_id_t page_id,
size_t page_size = 0) {
// Use the device's explicit index when it is non-negative; otherwise fall
// back to whatever gpu_vmm::current_device() reports.
auto resolve_device_index = [](const c10::Device &device) -> int {
if (device.index() >= 0) {
return device.index();
}
return gpu_vmm::current_device();
};

// is_cuda() returns true for both NVIDIA (CUDA) and AMD (HIP/ROCm) devices,
// because PyTorch's ROCm build masquerades HIP devices as CUDA.
if (dev.is_cuda()) {
return std::make_shared<GPUPage>(page_id, dev.index(), page_size);
return std::make_shared<GPUPage>(page_id, resolve_device_index(dev),
page_size);
} else if (dev.is_cpu()) {
return std::make_shared<CPUPage>(page_id, page_size);
}
ASSERT(false, "Unsupported device type.");
return nullptr;
}

static inline size_t get_v_base_offset(const torch::Tensor &tensor) {
static inline size_t get_v_base_offset(const at::Tensor &tensor) {
  // Despite its name, num_eles is the tensor's total size in bytes: the
  // element count multiplied by the per-element width.
  const auto element_count = tensor.numel();
  const auto bytes_per_element = tensor.element_size();
  const size_t num_eles = element_count * bytes_per_element;

  // The byte size must be a whole multiple of two pages so that the halfway
  // point returned below is itself a multiple of kPageSize.
  ASSERT(num_eles % (2 * kPageSize) == 0,
         "Invalid tensor size: %zu, must be a multiple of 2 * page size %zu",
         num_eles, 2 * kPageSize);

  // The offset handed back is exactly half of the tensor's byte size.
  return num_eles / 2;
}

// Constructs an allocator bound to `device`. num_layers_ and
// kv_tensor_size_per_layer_ start at 0 and the requested layout flag is
// recorded as-is. When the device reports is_cuda(), the GPU initialization
// routine is run as part of construction; other devices need no extra setup
// here.
FTensorAllocator::FTensorAllocator(const torch::Device &device,
FTensorAllocator::FTensorAllocator(const c10::Device &device,
bool contiguous_layout)
: dev_(device), num_layers_(0), contiguous_layout_(contiguous_layout),
kv_tensor_size_per_layer_(0) {
if (dev_.is_cuda()) {
init_cuda_();
init_gpu_();
}
}

Expand Down Expand Up @@ -78,7 +86,7 @@ void FTensorAllocator::init(const std::string &dev_str, size_t page_size,
kPageSize = page_size;
}

auto device = torch::Device(dev_str);
auto device = c10::Device(dev_str);
g_allocator_ = std::make_unique<FTensorAllocator>(device, contiguous_layout);
}

Expand All @@ -95,8 +103,8 @@ void FTensorAllocator::shutdown() {
}
}

std::vector<torch::Tensor> FTensorAllocator::create_kv_tensors(
size_t size, torch::Dtype dtype, const std::string &dev_str,
std::vector<at::Tensor> FTensorAllocator::create_kv_tensors(
size_t size, c10::ScalarType dtype, const std::string &dev_str,
int64_t num_layers, int64_t num_kv_buffers) {
std::lock_guard<std::mutex> lock(mtx_);

Expand Down Expand Up @@ -218,10 +226,10 @@ std::string FTensorAllocator::get_anon_tensor_name_() {
return std::string(prefix) + std::to_string(counter++);
}

std::vector<torch::Tensor> FTensorAllocator::create_kv_tensors_per_layer_(
std::string_view prefix, size_t size, torch::Dtype dtype,
std::vector<at::Tensor> FTensorAllocator::create_kv_tensors_per_layer_(
std::string_view prefix, size_t size, c10::ScalarType dtype,
const std::string &dev_str, int64_t num_layers) {
std::vector<torch::Tensor> ftensors;
std::vector<at::Tensor> ftensors;
for (int64_t i = 0; i < num_layers; i++) {
auto name = std::string(prefix) + std::to_string(i);
auto tensor = create_ftensor_(size, dtype, dev_str, name);
Expand All @@ -230,8 +238,8 @@ std::vector<torch::Tensor> FTensorAllocator::create_kv_tensors_per_layer_(
return ftensors;
}

std::vector<torch::Tensor> FTensorAllocator::create_kv_tensors_contiguous_(
size_t size, torch::Dtype dtype, const std::string &dev_str,
std::vector<at::Tensor> FTensorAllocator::create_kv_tensors_contiguous_(
size_t size, c10::ScalarType dtype, const std::string &dev_str,
int64_t num_layers, size_t compound_page_size) {
// In contiguous layout, Python passes per-layer size, and we multiply by
// num_layers to get total size
Expand All @@ -249,16 +257,16 @@ std::vector<torch::Tensor> FTensorAllocator::create_kv_tensors_contiguous_(
}

/** this function is not thread-safe */
torch::Tensor FTensorAllocator::create_ftensor_(size_t size, torch::Dtype dtype,
const std::string &dev_str,
std::string name) {
at::Tensor FTensorAllocator::create_ftensor_(size_t size, c10::ScalarType dtype,
const std::string &dev_str,
std::string name) {
if (name.empty())
name = get_anon_tensor_name_();

if (ftensors_.find(name) != ftensors_.end()) {
auto tensor = ftensors_[name].get()->get_tensor();
assert(tensor.numel() * tensor.element_size() == size);
assert(tensor.device() == torch::Device(dev_str));
assert(tensor.device() == c10::Device(dev_str));
return tensor;
}

Expand All @@ -269,44 +277,33 @@ torch::Tensor FTensorAllocator::create_ftensor_(size_t size, torch::Dtype dtype,
}

/** this function is not thread-safe */
void FTensorAllocator::free_ftensor_(torch::Tensor &ftensor) {
void FTensorAllocator::free_ftensor_(at::Tensor &ftensor) {
  // Look the tensor up by its name; a tensor this allocator does not track is
  // silently ignored.
  const auto entry = ftensors_.find(ftensor.name());
  if (entry != ftensors_.end()) {
    // Dropping the map entry releases the allocator's record for this name.
    ftensors_.erase(entry);
  }
}

// One-time GPU setup performed from the constructor for is_cuda() devices:
// initializes the GPU runtime, selects the target device, asserts that the
// device supports virtual memory management (VMM), and asserts that kPageSize
// is a whole multiple of the allocation granularity reported for a pinned
// device allocation.
void FTensorAllocator::init_cuda_() {
CHECK_RT(cudaFree(0));
void FTensorAllocator::init_gpu_() {
CHECK_GPU(gpu_vmm::initialize_runtime());

CUdevice dev;
CHECK_DRV(cuCtxGetDevice(&dev));
// Use the configured device index when it is non-negative; otherwise use the
// device gpu_vmm reports as current.
int dev_idx = dev_.index() >= 0 ? dev_.index() : gpu_vmm::current_device();
CHECK_GPU(gpu_vmm::set_device(dev_idx));

int supportsVMM = 0;
CHECK_DRV(cuDeviceGetAttribute(
&supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
dev));
// LOGE("Supports VMM: %d", supportsVMM);

CUcontext context;
CHECK_DRV(cuCtxGetCurrent(&context));

CUmemAllocationProp prop{
.type = CU_MEM_ALLOCATION_TYPE_PINNED,
.location =
{
.type = CU_MEM_LOCATION_TYPE_DEVICE,
.id = dev,
},
};
// Missing VMM support is treated as a hard error rather than being ignored.
int supports_vmm = 0;
CHECK_GPU(gpu_vmm::get_vmm_support(&supports_vmm, dev_idx));
ASSERT(supports_vmm != 0,
"VMM is not supported on %s device %d. kvcached requires GPU VMM "
"support.",
gpu_vmm::backend_name(), dev_idx);

// Query the allocation granularity for a pinned allocation on this device;
// the result is written to chunk_sz.
auto prop = gpu_vmm::make_pinned_device_allocation_prop(dev_idx);
size_t chunk_sz = 0;
CHECK_DRV(cuMemGetAllocationGranularity(&chunk_sz, &prop,
CU_MEM_ALLOC_GRANULARITY_MINIMUM));
CHECK_GPU(gpu_vmm::get_allocation_granularity(&chunk_sz, &prop));
// NOTE(review): kPageSize and chunk_sz are size_t but are formatted with
// %lu; %zu is the portable specifier for size_t — confirm how ASSERT formats
// its arguments before changing.
ASSERT(kPageSize % chunk_sz == 0,
"Invalid page size: %lu must be a multiple of CUDA granularity %lu\n",
kPageSize, chunk_sz);
"Invalid page size: %lu must be a multiple of %s granularity %lu\n",
kPageSize, gpu_vmm::backend_name(), chunk_sz);
}

} // namespace kvcached
68 changes: 43 additions & 25 deletions csrc/ftensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,39 @@
#include <fcntl.h>
#include <sys/mman.h>

#include <ATen/ops/from_blob.h>
#include <c10/core/ScalarType.h>

#include "constants.hpp"
#include "cuda_utils.hpp"
#include "ftensor.hpp"
#include "gpu_utils.hpp"
#include "page.hpp"

namespace kvcached {

static std::atomic<size_t> g_vaddr_allocated_offset = 0;

static inline generic_ptr_t alloc_virtual_mem(const torch::Device &dev,
// Picks the GPU ordinal to use for `dev`: the device's own index when it
// carries a non-negative one, otherwise the device gpu_vmm reports as current.
static inline int resolve_device_index(const c10::Device &dev) {
  const int explicit_index = dev.index();
  return explicit_index >= 0 ? explicit_index : gpu_vmm::current_device();
}

static inline generic_ptr_t alloc_virtual_mem(const c10::Device &dev,
size_t size) {
size_t alignment_2mb = 2 * 1024 * 1024;
ASSERT(size % alignment_2mb == 0,
"alloc size not aligned."); // Ensure alignment.

generic_ptr_t vaddr;
size_t offset = g_vaddr_allocated_offset.fetch_add(size);
// is_cuda() returns true for both NVIDIA (CUDA) and AMD (HIP/ROCm) devices,
// because PyTorch's ROCm build masquerades HIP devices as CUDA.
if (dev.is_cuda()) {
CHECK_DRV(cuMemAddressReserve(reinterpret_cast<CUdeviceptr *>(&vaddr), size,
alignment_2mb, kStartAddr + offset, 0ULL));
CHECK_GPU(gpu_vmm::address_reserve(
reinterpret_cast<void **>(&vaddr), size, alignment_2mb,
reinterpret_cast<void *>(kStartAddr + offset)));
} else {
vaddr = mmap(reinterpret_cast<void *>(kStartAddr + offset), size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
Expand All @@ -33,40 +46,45 @@ static inline generic_ptr_t alloc_virtual_mem(const torch::Device &dev,
return vaddr;
}

// Builds a uniquely-owned Page for the given device: a GPUPage when
// dev.is_cuda() holds, a CPUPage when dev.is_cpu() holds. Any other device
// type trips the ASSERT at the bottom; the trailing `return nullptr` is only
// reached if that ASSERT does not terminate the process.
//
// page_id and page_size are forwarded unchanged to the page constructor.
static inline std::unique_ptr<Page> make_unique_page(const torch::Device &dev,
static inline std::unique_ptr<Page> make_unique_page(const c10::Device &dev,
page_id_t page_id,
size_t page_size = 0) {
if (dev.is_cuda()) {
return std::make_unique<GPUPage>(page_id, dev.index(), page_size);
// The GPU page is given a resolved device index, so a device without an
// explicit (non-negative) index falls back to the current device.
return std::make_unique<GPUPage>(page_id, resolve_device_index(dev),
page_size);
} else if (dev.is_cpu()) {
return std::make_unique<CPUPage>(page_id, page_size);
}
ASSERT(false, "Unsupported device type.");
return nullptr;
}

// Constructs a named tensor over a freshly reserved virtual address range.
//
// name      - identifier stored in name_.
// size      - size of the range passed to alloc_virtual_mem (which asserts it
//             is 2 MiB aligned); also used to derive the element count.
// dtype     - element type of the resulting tensor.
// dev       - device the range is reserved on and the tensor is tagged with.
// zero_page - shared page kept in zero_page_ (presumably what
//             init_with_zero_() maps across the range — confirm).
// page_size - per-page size; a value of 0 selects the global kPageSize.
FTensor::FTensor(const std::string &name, size_t size, torch::Dtype dtype,
torch::Device dev, std::shared_ptr<Page> zero_page,
FTensor::FTensor(const std::string &name, size_t size, c10::ScalarType dtype,
c10::Device dev, std::shared_ptr<Page> zero_page,
size_t page_size)
: name_(name), vaddr_(nullptr), size_(size),
page_size_(page_size > 0 ? page_size : kPageSize), dtype_(dtype),
dev_(dev), zero_page_(zero_page) {
vaddr_ = alloc_virtual_mem(dev_, size_);
init_with_zero_();

// Element count is size divided by the element width (integer division, so
// any remainder bytes are not covered by the tensor).
auto num_elems = static_cast<int64_t>(size / torch::elementSize(dtype_));
auto num_elems = static_cast<int64_t>(size / c10::elementSize(dtype_));
auto options =
torch::TensorOptions().dtype(dtype_).device(dev_).requires_grad(false);
at::TensorOptions().dtype(dtype_).device(dev_).requires_grad(false);
// Wrap the reserved range in a 1-D tensor view starting at vaddr_.
tensor_ =
torch::from_blob(reinterpret_cast<void *>(vaddr_), {num_elems}, options);
at::from_blob(reinterpret_cast<void *>(vaddr_), {num_elems}, options);
}

// Tears down the tensor's backing memory: clears every per-page mapping,
// releases this object's reference to the zero page, and then releases the
// virtual address range if one was reserved (vaddr_ non-null).
FTensor::~FTensor() {
mapping_.clear(); // Free all physical pages directly.
zero_page_.reset();
if (vaddr_) {
CHECK_DRV(cuMemUnmap(reinterpret_cast<CUdeviceptr>(vaddr_), size_));
CHECK_DRV(cuMemAddressFree(reinterpret_cast<CUdeviceptr>(vaddr_), size_));
if (dev_.is_cuda()) {
// GPU range: unmap the whole range first, then free the address
// reservation.
CHECK_GPU(gpu_vmm::mem_unmap(vaddr_, size_));
CHECK_GPU(gpu_vmm::address_free(vaddr_, size_));
} else if (dev_.is_cpu()) {
// NOTE(review): munmap() is invoked inside the ASSERT condition. If ASSERT
// can be compiled out in some build configuration, the unmap would be
// skipped entirely — confirm ASSERT always evaluates its condition.
ASSERT(munmap(vaddr_, size_) == 0, "munmap failed.");
}
}
}

Expand All @@ -81,7 +99,9 @@ bool FTensor::map(offset_t offset) {

auto vaddr = reinterpret_cast<generic_ptr_t>(
reinterpret_cast<uintptr_t>(vaddr_) + offset);
CHECK_DRV(cuMemUnmap(reinterpret_cast<CUdeviceptr>(vaddr), page_size_));
if (dev_.is_cuda()) {
CHECK_GPU(gpu_vmm::mem_unmap(vaddr, page_size_));
}

mapping_[page_id] = make_unique_page(dev_, page_id, page_size_);
mapping_[page_id]->map(vaddr);
Expand All @@ -99,7 +119,9 @@ bool FTensor::unmap(offset_t offset) {

auto vaddr = reinterpret_cast<generic_ptr_t>(
reinterpret_cast<uintptr_t>(vaddr_) + offset);
CHECK_DRV(cuMemUnmap(reinterpret_cast<CUdeviceptr>(vaddr), page_size_));
if (dev_.is_cuda()) {
CHECK_GPU(gpu_vmm::mem_unmap(vaddr, page_size_));
}

// Map the zero page instead to ensure memory integrity.
map_(zero_page_.get(), offset);
Expand All @@ -117,16 +139,12 @@ bool FTensor::map_(Page *page, offset_t offset, bool set_access) {
}

// Grants the tensor's device read/write access to the `size` bytes starting
// at `addr`. For devices where is_cuda() is false nothing is done. Returns
// true on every path shown here; failures of the GPU call are handled by the
// CHECK_* macro rather than by the return value.
bool FTensor::set_access_(generic_ptr_t addr, size_t size) {
CUmemAccessDesc accessDesc_{
.location =
{
.type = CU_MEM_LOCATION_TYPE_DEVICE,
.id = dev_.index(),
},
.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
};
CHECK_DRV(cuMemSetAccess(reinterpret_cast<CUdeviceptr>(addr), size,
&accessDesc_, 1));
// Access descriptors only apply to GPU mappings, so other devices return
// early.
if (!dev_.is_cuda()) {
return true;
}
// Build a single read/write descriptor for the resolved device index and
// apply it to the range.
auto access_desc =
gpu_vmm::make_device_rw_access_desc(resolve_device_index(dev_));
CHECK_GPU(gpu_vmm::set_access(addr, size, &access_desc, 1));
return true;
}

Expand Down
Loading