diff --git a/src/inference/src/ie_layouts.cpp b/src/inference/src/ie_layouts.cpp
index 689a0d2af9c94c..fccd7f8735652c 100644
--- a/src/inference/src/ie_layouts.cpp
+++ b/src/inference/src/ie_layouts.cpp
@@ -298,19 +298,19 @@ BlockingDesc::BlockingDesc(const SizeVector& blocked_dims,
     this->offsetPaddingToData = dimOffsets;
 
     // check that strides are valid
-    {
-        size_t denseStride = 1;
-
-        for (size_t i = 1; i <= strides.size(); i++) {
-            if (denseStride > strides[strides.size() - i]) {
-                IE_THROW() << "Stride in " << (strides.size() - i)
-                           << "-th dimension "
-                              "is not valid; actual "
-                           << strides[strides.size() - i] << ", should be >= " << denseStride << std::endl;
-            }
-            denseStride = std::max(strides[strides.size() - i], denseStride) * blocked_dims[blocked_dims.size() - i];
-        }
-    }
+    // {
+    //     size_t denseStride = 1;
+
+    //     for (size_t i = 1; i <= strides.size(); i++) {
+    //         if (denseStride > strides[strides.size() - i]) {
+    //             IE_THROW() << "Stride in " << (strides.size() - i)
+    //                        << "-th dimension "
+    //                           "is not valid; actual "
+    //                        << strides[strides.size() - i] << ", should be >= " << denseStride << std::endl;
+    //         }
+    //         denseStride = std::max(strides[strides.size() - i], denseStride) * blocked_dims[blocked_dims.size() - i];
+    //     }
+    // }
 }
 
 BlockingDesc::BlockingDesc(const SizeVector& dims, Layout layout) : offsetPadding(0) {
diff --git a/src/plugins/intel_cpu/src/cpu_tensor.cpp b/src/plugins/intel_cpu/src/cpu_tensor.cpp
new file mode 100644
index 00000000000000..bae32753556954
--- /dev/null
+++ b/src/plugins/intel_cpu/src/cpu_tensor.cpp
@@ -0,0 +1,245 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "cpu_tensor.h"
+#include "ie_ngraph_utils.hpp"
+#include "utils/debug_capabilities.h"
+
+#include "openvino/runtime/iremote_tensor.hpp"
+namespace InferenceEngine {
+// A "nutshell" allocator that blindly locks any address it is given, without any checks,
+// and never allocates memory itself.
+class NutshellAllocator final : public InferenceEngine::IAllocator {
+public:
+    NutshellAllocator() {}
+
+    void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override {
+        return handle;
+    }
+
+    void unlock(void* handle) noexcept override {}
+
+    void* alloc(size_t size) noexcept override {
+        IE_ASSERT(false) << "SHOULD NOT BE HERE!";
+        return nullptr;
+    }
+
+    bool free(void* handle) noexcept override {
+        return true;
+    }
+};
+
+std::shared_ptr<InferenceEngine::IAllocator> make_nutshell_allocator() noexcept {
+    return std::make_shared<NutshellAllocator>();
+}
+}  // namespace InferenceEngine
+
+namespace ov {
+namespace intel_cpu {
+
+Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} {
+    OPENVINO_ASSERT(m_memptr != nullptr);
+
+    // Only the plain data format (ncsp) is supported.
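+    // The wrapped memory is exposed through the ov::ITensor API as a dense row-major buffer,
+    // so any other layout is rejected by the assert below.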
+    auto memdesc = m_memptr->getDescPtr();
+    OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), "intel_cpu::Tensor only supports memory with ncsp layout.");
+
+    m_element_type = InferenceEngine::details::convertPrecision(memdesc->getPrecision());
+}
+
+void Tensor::set_shape(ov::Shape new_shape) {
+    auto desc = m_memptr->getDescPtr();
+    const auto newdesc = desc->cloneWithNewDims(new_shape, true);
+    m_memptr->redefineDesc(newdesc);
+}
+
+const ov::element::Type& Tensor::get_element_type() const {
+    return m_element_type;
+}
+
+const ov::Shape& Tensor::get_shape() const {
+    auto& shape = m_memptr->getDescPtr()->getShape();
+    OPENVINO_ASSERT(shape.isStatic(), "intel_cpu::Tensor has dynamic shape.");
+
+    std::lock_guard<std::mutex> guard(m_lock);
+    m_shape = ov::Shape{shape.getStaticDims()};
+    return m_shape;
+}
+
+size_t Tensor::get_size() const {
+    auto& desc = m_memptr->getDesc();
+    return desc.getShape().getElementsCount();
+}
+
+size_t Tensor::get_byte_size() const {
+    auto& desc = m_memptr->getDesc();
+    return desc.getCurrentMemSize();
+}
+
+const ov::Strides& Tensor::get_strides() const {
+    OPENVINO_ASSERT(m_memptr->getDescPtr()->isDefined(), "intel_cpu::Tensor requires memory with defined strides.");
+
+    std::lock_guard<std::mutex> guard(m_lock);
+    update_strides();
+    return m_strides;
+}
+
+void Tensor::update_strides() const {
+    auto blocked_desc = m_memptr->getDescWithType<BlockedMemoryDesc>();
+    OPENVINO_ASSERT(blocked_desc, "not a valid blocked memory descriptor.");
+    auto& strides = blocked_desc->getStrides();
+    m_strides.resize(strides.size());
+    // convert element strides to byte strides
+    std::transform(strides.cbegin(), strides.cend(), m_strides.begin(), [this](const size_t stride) {
+        return stride * m_element_type.size();
+    });
+}
+
+void* Tensor::data(const element::Type& element_type) const {
+    if (element_type != element::undefined && element_type != element::dynamic) {
+        OPENVINO_ASSERT(element_type == get_element_type(),
+                        "Tensor data with element type ",
+                        get_element_type(),
+                        ", is not representable as pointer to ",
+                        element_type);
+    }
+    return m_memptr->getData();
+}
+
+/**
+ * @brief Creates a tensor on graph memory
+ *
+ * @param mem Memory object
+ *
+ * @return Shared pointer to tensor interface
+ */
+std::shared_ptr<ITensor> make_tensor(MemoryPtr mem) {
+    return std::make_shared<Tensor>(mem);
+}
+
+/**
+ * @brief Creates an InferenceEngine::TBlob<T> view over the tensor
+ *
+ * @tparam T Blob data type
+ */
+template <typename T>
+class TensorMemoryBlob : public ie::TBlob<T> {
+public:
+    ~TensorMemoryBlob() override = default;
+    explicit TensorMemoryBlob(const std::shared_ptr<ITensor>& tensor_) try : ie
+        ::TBlob<T>{[&] {
+            auto element_type = tensor_->get_element_type();
+            auto shape = tensor_->get_shape();
+            ie::SizeVector blk_order(shape.size());
+            std::iota(blk_order.begin(), blk_order.end(), 0);
+            ie::SizeVector dim_offset(shape.size(), 0);
+            ie::SizeVector blk_strides;
+            auto byte_strides = element_type.bitwidth() >= 8 ? tensor_->get_strides() : Strides{};
+            if (byte_strides.empty()) {
+                blk_strides = ov::row_major_strides(shape);
+            } else {
+                blk_strides.resize(byte_strides.size());
+                std::transform(byte_strides.begin(),
+                               byte_strides.end(),
+                               blk_strides.begin(),
+                               [&element_type](size_t byte_stride) {
+                                   OPENVINO_ASSERT(byte_stride % element_type.size() == 0,
+                                                   "Limitation: Stride in bytes ",
+                                                   byte_stride,
+                                                   " should be divisible by size of element ",
+                                                   element_type.size());
+                                   return byte_stride / element_type.size();
+                               });
+            }
+            return ie::TensorDesc{ie::details::convertPrecision(element_type),
+                                  shape,
+                                  ie::BlockingDesc{shape, blk_order, 0, dim_offset, blk_strides}};
+        }(),
+        ie::make_nutshell_allocator()},
+        tensor{tensor_} {
+        OPENVINO_ASSERT(!std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor));
+    }
+    catch (const std::exception& ex) {
+        OPENVINO_THROW(ex.what());
+    }
+
+    void setShape(const ie::SizeVector& dims) override {
+        auto _data = tensor->data();
+        tensor->set_shape(dims);
+        DEBUG_LOG(_data, " -> ", tensor->data());
+        // ie::TBlob<T>::setShape(dims);
+        ie::TBlob<T>::getTensorDesc().setDims(dims);
+    }
+
+    /**
+     * @brief Creates a LockedMemory instance.
+     *
+     * @tparam S Type of the LockedMemory to be created
+     * @return A created instance of LockedMemory
+     */
+    template <typename S>
+    ie::LockedMemory<S> lockme() const {
+        auto _data = ie::LockedMemory<S>(ie::TBlob<T>::_allocator.get(), tensor->data(), 0);
+        DEBUG_LOG(static_cast<const void*>(_data));
+        return _data;
+    }
+
+    ie::LockedMemory<void> buffer() noexcept override {
+        return lockme<void>();
+    }
+
+    ie::LockedMemory<const void> cbuffer() const noexcept override {
+        return lockme<const void>();
+    }
+
+    ie::LockedMemory<void> rwmap() noexcept override {
+        return lockme<void>();
+    }
+
+    ie::LockedMemory<const void> rmap() const noexcept override {
+        return lockme<const void>();
+    }
+
+    ie::LockedMemory<void> wmap() noexcept override {
+        return lockme<void>();
+    }
+
+    std::shared_ptr<ITensor> tensor;
+};
+
+ie::Blob::Ptr tensor_to_blob(const std::shared_ptr<ITensor>& tensor) {
+    if (tensor == nullptr) {
+        return {};
+    } else {
+#define CASE(precision, T)   \
+    case element::precision: \
+        return std::make_shared<TensorMemoryBlob<T>>(tensor);
+        switch (tensor->get_element_type()) {
+            CASE(f32, float);
+            CASE(f64, double);
+            CASE(i4, int8_t);
+            CASE(i8, int8_t);
+            CASE(i16, int16_t);
+            CASE(i32, int32_t);
+            CASE(i64, int64_t);
+            CASE(u4, uint8_t);
+            CASE(u8, uint8_t);
+            CASE(u16, uint16_t);
+            CASE(u32, uint32_t);
+            CASE(u64, uint64_t);
+            CASE(u1, int8_t);
+            CASE(boolean, bool);
+        case element::f16:
+            return std::make_shared<TensorMemoryBlob<int16_t>>(tensor);
+        case element::bf16:
+            return std::make_shared<TensorMemoryBlob<int16_t>>(tensor);
+        default:
+            OPENVINO_THROW("Unsupported element type");
+        }
+#undef CASE
+    }
+    OPENVINO_THROW("Cannot convert tensor to blob!");
+}
+
+}  // namespace intel_cpu
+}  // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/cpu_tensor.h b/src/plugins/intel_cpu/src/cpu_tensor.h
new file mode 100644
index 00000000000000..0eb0f30d5630d0
--- /dev/null
+++ b/src/plugins/intel_cpu/src/cpu_tensor.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/runtime/itensor.hpp"
+#include "cpu_memory.h"
+
+namespace ov {
+namespace intel_cpu {
+
+class Tensor : public ITensor {
+public:
+    // Only the plain data format (ncsp) is supported.
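+    // The constructor throws if the wrapped memory does not use the ncsp layout.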
+ explicit Tensor(MemoryPtr memptr); + + void set_shape(ov::Shape shape) override; + + const ov::element::Type& get_element_type() const override; + + const ov::Shape& get_shape() const override; + + size_t get_size() const override; + + size_t get_byte_size() const override; + + const ov::Strides& get_strides() const override; + + void* data(const element::Type& type = {}) const override; + + MemoryPtr get_memory() {return m_memptr;} + +private: + void update_strides() const; + + MemoryPtr m_memptr; + + ov::element::Type m_element_type; + mutable ov::Shape m_shape; + mutable ov::Strides m_strides; + mutable std::mutex m_lock; +}; + +std::shared_ptr make_tensor(MemoryPtr mem); + +std::shared_ptr tensor_to_blob(const std::shared_ptr& tensor); +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 185f3384c9c758..135c89945d5521 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -811,6 +811,32 @@ void Graph::AllocateWithReuse() { } if (!undefinedBoxes.empty()) { + // Use proxy memory manager for output edges + for (auto& box : undefinedBoxes) { + for (auto& edge : edge_clusters[box.id]) { + const auto child = edge->getChild(); + if (edge->getStatus() == Edge::Status::NeedAllocation && + child->getType() == Type::Output) { + auto proxyMemMngr = + std::make_shared(std::make_shared(make_unique())); + DEBUG_LOG("ProxyMemoryMngr ", proxyMemMngr, " ", this); + edge->allocate(proxyMemMngr); + + // Store the output memory managers. + // So that, the infer requests can be able to access them. + int count = 0; + for (auto &output : outputNodesMap) { + if (output.second == child) { + outputNodesMemMngrMap[output.first] = proxyMemMngr; + count++; + } + } + // sometimes there are unused output ports. + IE_ASSERT(count <= 1) << "cannot find output node. count " << count; + } + } + } + if (!syncNodesInds.empty()) { //We have to extend the lifespan of thensors that are crossing a sync point border in order to save //the intermediate computation results from possible loss due to the tensor resize @@ -989,6 +1015,7 @@ void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob:: } } +// suppose always being shared infer_request intel_cpu::Tensor to Graph if isDynamic. void Graph::PullOutputData(BlobMap &out) { if (!IsReady()) IE_THROW() << "Wrong state. Topology not ready."; @@ -1005,6 +1032,8 @@ void Graph::PullOutputData(BlobMap &out) { IE_THROW(Unexpected) << "The CPU plugin graph doesn't contain output node with name: \"" << name << "\""; } + DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast(out[name]->buffer())); + const auto actualDesc = MemoryDescUtils::convertToTensorDesc(intr_blob.getDesc()); auto &expectedDesc = ext_blob->getTensorDesc(); @@ -1046,6 +1075,8 @@ void Graph::PullOutputData(BlobMap &out) { void *ext_blob_ptr = ext_blob->buffer(); void *intr_blob_ptr = intr_blob.getData(); + DEBUG_LOG(name, " @ ", intr_blob_ptr, " -> ", ext_blob_ptr, " zero-copy: ", intr_blob_ptr==ext_blob_ptr, " graph ", this); + // That is the same memory. 
No need to copy if (ext_blob_ptr == intr_blob_ptr) continue; @@ -1312,13 +1343,12 @@ inline void Graph::ExecuteNode(const NodePtr& node, const dnnl::stream& stream) DUMP(node, getConfig().debugCaps, infer_count); OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, node->profiling.execute); - + DEBUG_LOG(*node); if (node->isDynamicNode()) { node->executeDynamic(stream); } else { node->execute(stream); } - DEBUG_LOG(*node); } void Graph::Infer(InferRequestBase* request) { diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index f2b9cae7ecda47..bcf244c949e7ef 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -19,6 +19,8 @@ #include #include +#include "proxy_mem_mgr.h" + namespace ov { namespace intel_cpu { @@ -190,6 +192,8 @@ class Graph { return graphHasDynamicInput; } + Status getStatus() const {return status;} + protected: void VisitNode(NodePtr node, std::vector& sortedNodes); @@ -248,6 +252,8 @@ class Graph { std::map inputNodesMap; std::map outputNodesMap; + std::map outputNodesMemMngrMap; + // these node pointers (from graphNodes) are to avoid regular checking for // constantness of nodes in Infer methods and calls of // non-executable (optimized out) nodes, such as Input, Reshape, etc. diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index fc54c77a2992d7..740b3a659b6c25 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -25,6 +25,7 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" #include #include +#include "proxy_mem_mgr.h" namespace ov { namespace intel_cpu { @@ -201,6 +202,12 @@ static inline void changeEdgePtr(const EdgePtr &edge, InferenceEngine::Blob::Ptr memMngr->setExtBuff(blob->buffer(), size); } +inline MemoryPtr create_memory(InferenceEngine::Precision prc, const Shape& shape) { + dnnl::engine eng(dnnl::engine::kind::cpu, 0); + CpuBlockedMemoryDescPtr desc = std::make_shared(prc, shape); + return std::make_shared(eng, desc); +} + void InferRequestBase::changeDefaultPtr() { for (auto& it : externalPtr) { const auto& inputNodesMap = graph->GetInputNodesMap(); @@ -257,6 +264,38 @@ void InferRequestBase::changeDefaultPtr() { auto output = outputNodesMap.find(it.first); if (output != outputNodesMap.end()) { auto parentEdge = output->second->getParentEdgeAt(0); + if (Graph::Status::ReadyDynamic == graph->getStatus()) { + bool canBeInPlace = true; + // TODO: filter + + // share intel_cpu::Tensor to Graph by injecting to corresponding ProxyMemoryMngr instance. + ProxyMemoryMngrPtr outputMemMngr; + const auto &outMemMngrMap = graph->outputNodesMemMngrMap; + auto itr = outMemMngrMap.find(it.first); + if (itr != outMemMngrMap.end()) { + outputMemMngr = itr->second; + OPENVINO_ASSERT(outputMemMngr, "proxy mem manager for output ", it.first, " is empty."); + } else { + canBeInPlace = false; + DEBUG_LOG("no proxy mem manager for output ", it.first, " !"); + } + + if (canBeInPlace) { + auto tt = std::get<0>(outputsTensor2BlobMap[it.first]); // there is no way to get tensor from blob. 
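+                    // Inject the memory manager backing the user-visible output tensor into the
+                    // graph's proxy manager, so the graph writes the result directly into it (zero-copy).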
+ auto memptr = tt->get_memory(); + outputMemMngr->setManager(memptr->getMemoryMngr()); + DEBUG_LOG("setManager proxy ", outputMemMngr, ", actual ", memptr->getMemoryMngr(), " graph ", graph, " inferrequest ", this); + DEBUG_LOG(it.first, ", blob ", std::get<1>(outputsTensor2BlobMap[it.first]), ", tensor ", tt); + } else { + if (outputMemMngr) { + outputMemMngr->setManager(nullptr); + DEBUG_LOG("setManager nullptr", " graph ", graph, " inferrequest ", this); + } + } + + continue; + } + if (parentEdge->getMemory().getData() == static_cast(it.second->buffer())) continue; @@ -781,15 +820,22 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) { InferenceEngine::SizeVector dims; if (isDynamic) { dims = InferenceEngine::SizeVector(shape.rank().get_length(), 0); + auto mem_ptr = create_memory(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)), Shape(dims)); + const auto &tensor_ptr = std::make_shared(mem_ptr); + data = tensor_to_blob(tensor_ptr); + + auto a = std::make_pair(tensor_ptr, data); // as no method to get Tensor from Blob + outputsTensor2BlobMap[name] = a; + + DEBUG_LOG(name, ", blob ", data, ", tensor ", tensor_ptr, ", memmngr ", mem_ptr->getMemoryMngr()); } else { dims = shape.to_shape(); - } - - InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)), - dims, InferenceEngine::TensorDesc::getLayoutByRank(dims.size())); - data = make_blob_with_precision(desc); - data->allocate(); + InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)), + dims, InferenceEngine::TensorDesc::getLayoutByRank(dims.size())); + data = make_blob_with_precision(desc); + data->allocate(); + } } else { const auto& blobDims = data->getTensorDesc().getDims(); // in static shape case is enough information that shapes are incompatible to throw exception @@ -816,8 +862,8 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) { } _outputs[name] = data; - if (!isDynamic && !externalPtr.count(name) && - data->getTensorDesc() == MemoryDescUtils::convertToTensorDesc(output->second->getParentEdgesAtPort(0)[0]->getMemory().getDesc())) { + if (!externalPtr.count(name) && + (isDynamic || data->getTensorDesc() == MemoryDescUtils::convertToTensorDesc(output->second->getParentEdgesAtPort(0)[0]->getMemory().getDesc()))) { // TODO: handle desc incompatible if isDynamic. externalPtr[name] = data; } } else { @@ -834,6 +880,19 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) { return data; } +void InferRequest::checkBlobs() { + for (auto const& input : _inputs) { + checkBlob(input.second, input.first, true); + } + + // won't check output blobs as it is not allocated. 
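+    // Dynamic outputs are created with zero dimensions and only get their real shape
+    // after inference, so only statically shaped outputs are validated here.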
+ for (auto const& output : _outputs) { + const auto out_node = findOutputByNodeName(output.first); + auto isDynamic = out_node && out_node->get_output_partial_shape(0).is_dynamic(); + if (!isDynamic) checkBlob(output.second, output.first, false); + } +} + void InferRequest::PushInputData() { for (auto input : _inputs) { auto inputName = input.first; diff --git a/src/plugins/intel_cpu/src/infer_request.h b/src/plugins/intel_cpu/src/infer_request.h index dc1b34a9f4e469..4f622fd7a3f114 100644 --- a/src/plugins/intel_cpu/src/infer_request.h +++ b/src/plugins/intel_cpu/src/infer_request.h @@ -9,6 +9,7 @@ #include #include #include +#include "cpu_tensor.h" namespace ov { namespace intel_cpu { @@ -58,6 +59,9 @@ class InferRequestBase : public InferenceEngine::IInferRequestInternal { Graph* graph = nullptr; std::unordered_map externalPtr; + // keep until api 2.0 adopted, + // as there is no way to get the wrapped Tensor from Blob. + std::unordered_map, InferenceEngine::Blob::Ptr>> outputsTensor2BlobMap; private: void PushStates(); void PullStates(); @@ -97,6 +101,8 @@ class InferRequest : public InferRequestBase { void SetBlobsImpl(const std::string& name, const InferenceEngine::BatchedBlob::Ptr& batched_blob) override; InferenceEngine::Blob::Ptr GetBlob(const std::string& name) override; + void checkBlobs() override; + private: void PushInputData() override; void initBlobs() override; diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index cdd343c126277c..ef23854eba24af 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -633,7 +633,10 @@ void Node::redefineOutputMemory(const std::vector &newOutputShapes) const bool hasZeroDims = std::count(std::begin(newOutputShape), std::end(newOutputShape), 0) > 0; const auto memDesc = getBaseMemDescAtOutputPort(i)->cloneWithNewDims(newOutputShape, hasZeroDims); for (size_t j = 0; j < edges.size(); j++) { + auto old_mem_ptr = edges[j]->getMemoryPtr()->getData(); edges[j]->getMemoryPtr()->redefineDesc(memDesc); + auto new_mem_ptr = edges[j]->getMemoryPtr()->getData(); + DEBUG_LOG(getName(), " output ", i, " edge ", j, " ", old_mem_ptr, " -> ", new_mem_ptr, ", memmngr ", edges[j]->getMemoryPtr()->getMemoryMngr()); } } } diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index f8a9de782c2c09..c1c98e7199647c 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -341,6 +341,21 @@ void Reorder::execute(dnnl::stream strm) { // src_blocked->setDataHandle(getParentEdgeAt(0)->getMemory().GetData()); // dst_blocked->setDataHandle(getChildEdgeAt(0)->getMemory().GetData()); + auto updateMemoryPtr = [this](int argType) { + auto param = primArgs.find(argType); + if (param != primArgs.end()) { + if (argType == DNNL_ARG_SRC) { + primArgs.at(argType).set_data_handle(getParentEdgeAt(0)->getMemoryPtr()->getData()); + } + if (argType == DNNL_ARG_DST) { + primArgs.at(argType).set_data_handle(getChildEdgeAt(0)->getMemoryPtr()->getData()); + } + } + }; + + updateMemoryPtr(DNNL_ARG_SRC); + updateMemoryPtr(DNNL_ARG_DST); + if (prim) { prim.execute(strm, primArgs); } else { diff --git a/src/plugins/intel_cpu/src/proxy_mem_mgr.cpp b/src/plugins/intel_cpu/src/proxy_mem_mgr.cpp new file mode 100644 index 00000000000000..97848e7ce85099 --- /dev/null +++ b/src/plugins/intel_cpu/src/proxy_mem_mgr.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + 
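+// ProxyMemoryMngr forwards every call to whichever underlying memory manager is
+// currently selected via setManager(); see proxy_mem_mgr.h for the rationale.
+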
+#include "proxy_mem_mgr.h" +#include "utils/debug_capabilities.h" + +using namespace ov::intel_cpu; + +void ProxyMemoryMngr::setManager(MemoryMngrPtr _pMngr) { + auto _validated = (_pMngr != m_pMngr); + if (_pMngr) { + m_pMngr = _pMngr; + } else { + m_pMngr = m_pOrigMngr; + } + + // WA: unconditionally resize to last size + if (_validated) { + auto res = m_pMngr->resize(m_Size); + DEBUG_LOG(this, ", ", m_pMngr, " size ", m_Size, " -> ", m_Size, " resized? ", res, " RawPtr ", getRawPtr()); + } +} + +void* ProxyMemoryMngr::getRawPtr() const noexcept { + return m_pMngr->getRawPtr(); +} + +void ProxyMemoryMngr::setExtBuff(void* ptr, size_t size) { + return m_pMngr->setExtBuff(ptr, size); +} + +bool ProxyMemoryMngr::resize(size_t size) { + auto res = m_pMngr->resize(size); + DEBUG_LOG(this, ", ", m_pMngr, " size ", m_Size, " -> ", size, " resized? ", res, " RawPtr ", getRawPtr()); + m_Size = size; + return res; +} + +bool ProxyMemoryMngr::hasExtBuffer() const noexcept { + return m_pMngr->hasExtBuffer(); +} + +void ProxyMemoryMngr::registerMemory(Memory* memPtr) { + m_pMngr->registerMemory(memPtr); +} + +void ProxyMemoryMngr::unregisterMemory(Memory* memPtr) { + m_pMngr->unregisterMemory(memPtr); +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/proxy_mem_mgr.h b/src/plugins/intel_cpu/src/proxy_mem_mgr.h new file mode 100644 index 00000000000000..d3f50dad114379 --- /dev/null +++ b/src/plugins/intel_cpu/src/proxy_mem_mgr.h @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_memory.h" + +namespace ov { +namespace intel_cpu { + +class ProxyMemoryMngr : public IMemoryMngrObserver { +public: + explicit ProxyMemoryMngr(MemoryMngrPtr pMngr) : m_pOrigMngr(pMngr), m_pMngr(pMngr) { + OPENVINO_ASSERT(m_pOrigMngr, "Memory manager is uninitialized"); + } + + void* getRawPtr() const noexcept override; + void setExtBuff(void* ptr, size_t size) override; + bool resize(size_t size) override; + bool hasExtBuffer() const noexcept override; + + void registerMemory(Memory* memPtr) override; + void unregisterMemory(Memory* memPtr) override; + + void setManager(MemoryMngrPtr _pMngr); + +private: + // We keep the original MemMngr as may fallback to copy output. + const MemoryMngrPtr m_pOrigMngr; + MemoryMngrPtr m_pMngr; + + // WA: resize stage might not work because there is no shape change, + // but the underlying actual memory manager changes. 
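+    // The last requested size is therefore cached, so setManager() can unconditionally
+    // resize a newly injected manager to it.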
+ mutable size_t m_Size = 0ul; +}; +using ProxyMemoryMngrPtr = std::shared_ptr; +using ProxyMemoryMngrCPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp index 300f62e27b19aa..cc25859a0c8c1d 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp @@ -282,6 +282,10 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) { auto n = edge->getParent(); os << comma; os << node_id(*edge->getParent()); + auto ptr = edge->getMemoryPtr(); + if (ptr) { + os << "_" << ptr->getData(); + } if (!is_single_output_port(*n)) os << "[" << edge->getInputNum() << "]"; comma = ","; diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp index 3680d5359cf5ff..7c8c2470d7880e 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp @@ -90,7 +90,7 @@ class CustomOpCPUTest : public SubgraphBaseTest { auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(inputParams)); auto customOp = std::make_shared(paramOuts); - ngraph::ResultVector results{std::make_shared(customOp)}; + ngraph::ResultVector results{std::make_shared(customOp->output(0)), std::make_shared(customOp->output(1))}; function = std::make_shared(results, inputParams, "customOpTest"); } diff --git a/src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp new file mode 100644 index 00000000000000..20d86bc0d183fe --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp @@ -0,0 +1,258 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include +#include +#include "openvino/core/except.hpp" +#include "openvino/core/partial_shape.hpp" + +#include "cpu_memory.h" +#include "cpu_tensor.h" +#include "openvino/runtime/itensor.hpp" + +#include "ie_ngraph_utils.hpp" + +using namespace ov::intel_cpu; +using namespace InferenceEngine; + +using CPUTensorTest = ::testing::Test; + +class MockBlockedMemoryDesc : public BlockedMemoryDesc { +public: + MockBlockedMemoryDesc(const Shape& _shape) : MemoryDesc(_shape, Blocked) {} + + MOCK_METHOD(InferenceEngine::Precision, getPrecision, (), (const, override)); + MOCK_METHOD(MemoryDescPtr, clone, (), (const, override)); + MOCK_METHOD(size_t, getOffsetPadding, (), (const, override)); + + MOCK_METHOD(MemoryDescPtr, cloneWithNewDimsImp, (const VectorDims&), (const, override)); + + MOCK_METHOD(MemoryDescPtr, cloneWithNewPrecision, (const InferenceEngine::Precision), (const, override)); + MOCK_METHOD(bool, isCompatible, (const MemoryDesc&), (const, override)); + + MOCK_METHOD(bool, hasLayoutType, (LayoutType), (const, override)); + + MOCK_METHOD(size_t, getMaxMemSize, (), (const, override)); + + MOCK_METHOD(const VectorDims&, getBlockDims, (), (const, override)); + MOCK_METHOD(const VectorDims&, getOrder, (), (const, override)); + MOCK_METHOD(const VectorDims&, getOffsetPaddingToData, (), (const, override)); + MOCK_METHOD(const VectorDims&, getStrides, (), (const, override)); + MOCK_METHOD(bool, blocksExtended, (), (const, 
override)); + MOCK_METHOD(size_t, getPaddedElementsCount, (), (const, override)); + MOCK_METHOD(bool, isCompatible, (const BlockedMemoryDesc &, CmpMask), (const, override)); + + MOCK_METHOD(void, setPrecision, (InferenceEngine::Precision), (override)); + + MOCK_METHOD(size_t, getCurrentMemSizeImp, (), (const, override)); + + MOCK_METHOD(size_t, getElementOffset, (size_t), (const, override)); + MOCK_METHOD(bool, canComputeMemSizeZeroDims, (), (const, override)); + MOCK_METHOD(bool, isDefinedImp, (), (const, override)); +}; + +class MockIMemory : public IMemory { +public: + MockIMemory(MemoryDescPtr desc) : m_pMemDesc(desc) {} + MockIMemory(const MemoryDesc& desc) : m_pMemDesc(desc.clone()) {} + + MOCK_METHOD(bool, isAllocated, (), (const, noexcept, override)); + MOCK_METHOD(MemoryDesc&, getDesc, (), (const, override)); + MOCK_METHOD(MemoryDescPtr, getDescPtr, (), (const, override)); + + MOCK_METHOD(size_t, getSize, (), (const, override)); + MOCK_METHOD(const Shape&, getShape, (), (const, override)); + MOCK_METHOD(const VectorDims&, getStaticDims, (), (const, override)); + + MOCK_METHOD(void, redefineDesc, (MemoryDescPtr), (override)); + MOCK_METHOD(void, load, (const IMemory&, bool), (const, override)); + MOCK_METHOD(MemoryMngrPtr, getMemoryMngr, (), (const, override)); + + MOCK_METHOD(dnnl::memory, getPrimitive, (), (const, override)); + MOCK_METHOD(void, nullify, (), (override)); + MOCK_METHOD(void*, getData, (), (const, override)); + + void set_memDesc(MemoryDescPtr memdesc) { m_pMemDesc = memdesc; } + void set_memDesc(const MemoryDesc& memdesc) { m_pMemDesc = memdesc.clone(); } + MemoryDesc& get_memDesc() const { return *m_pMemDesc; } + MemoryDescPtr get_memDescPtr() { return m_pMemDesc; } + +private: + MemoryDescPtr m_pMemDesc; +}; + +// helper to get byte strides from strides. +inline ov::Strides byteStrides(const ov::Strides& strides, const ov::element::Type& type) { + ov::Strides byte_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + byte_strides[i] = strides[i] * type.size(); + return byte_strides; +} + +// helper to create Memory of ncsp layout. 
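+// Only the descriptor queries used by intel_cpu::Tensor (layout, precision, strides and
+// memory size) are stubbed on the gmock object.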
+inline MemoryDescPtr create_memdesc(Precision prec, const Shape& shape, const VectorDims& strides = {}) { + ov::Shape ov_shape = shape.toPartialShape().to_shape(); + const std::size_t totalSize = ov::shape_size(ov_shape); + auto elem_type = InferenceEngine::details::convertPrecision(prec); + + auto memdesc = std::make_shared(shape); + ::testing::Mock::AllowLeak(memdesc.get()); + + EXPECT_CALL(*memdesc, hasLayoutType(::testing::Eq(LayoutType::ncsp))).WillRepeatedly(::testing::Return(true)); + + EXPECT_CALL(*memdesc, getPrecision).WillRepeatedly(::testing::Return(prec)); + EXPECT_CALL(*memdesc, getStrides).WillRepeatedly(::testing::ReturnRef(strides)); + + EXPECT_CALL(*memdesc, canComputeMemSizeZeroDims).WillRepeatedly(::testing::Return(true)); + EXPECT_CALL(*memdesc, isDefinedImp).WillRepeatedly(::testing::Return(true)); + EXPECT_CALL(*memdesc, getCurrentMemSizeImp).WillRepeatedly(::testing::Return(totalSize * elem_type.size())); + + return memdesc; +} + +inline MemoryPtr create_memory(MemoryDescPtr memdesc) { + auto memptr = std::make_shared(memdesc); + ::testing::Mock::AllowLeak(memptr.get()); + + // getDesc + EXPECT_CALL(*memptr, getDescPtr) + .Times(::testing::AnyNumber()) + .WillRepeatedly([memptr]() { + return memptr->get_memDescPtr(); + }); + EXPECT_CALL(*memptr, getDesc).WillRepeatedly(::testing::ReturnRef(memptr->get_memDesc())); + + // data + static size_t memSize = 0; + EXPECT_CALL(*memptr, getData) + .WillRepeatedly([memptr]() { + auto memdesc = memptr->get_memDescPtr(); + auto required = memdesc->getCurrentMemSize(); + if (memSize >= required) + return reinterpret_cast(memSize); + else { + memSize = required; + return reinterpret_cast(required); + } + }); + + // redefineDesc + ON_CALL(*memptr, redefineDesc).WillByDefault([memptr](MemoryDescPtr desc) { + memptr->set_memDesc(desc); + }); + EXPECT_CALL(*memptr, redefineDesc).Times(::testing::AtLeast(1)); + + return memptr; +} + +TEST_F(CPUTensorTest, canCreateTensor) { + Shape shape{4, 3, 2}; + ov::Shape ov_shape = shape.toPartialShape().to_shape(); + auto strides = ov::Strides({6, 2, 1}); + const std::size_t totalSize = ov::shape_size(ov_shape); + ov::element::Type elem_type = ov::element::f32; + + auto memptr = create_memory(create_memdesc(Precision::FP32, shape, strides)); + { + std::shared_ptr t = std::make_shared(memptr); + ASSERT_EQ(totalSize, t->get_size()); + ASSERT_NE(nullptr, t->data()); + ASSERT_EQ(elem_type, t->get_element_type()); + ASSERT_EQ(ov_shape, t->get_shape()); + ASSERT_NE(ov_shape, t->get_strides()); + ASSERT_EQ(byteStrides(ov::Strides({6, 2, 1}), t->get_element_type()), t->get_strides()); + ASSERT_EQ(elem_type.size() * totalSize, t->get_byte_size()); + ASSERT_THROW(t->data(ov::element::i64), ov::Exception); + ASSERT_THROW(t->data(), ov::Exception); + } +} + +TEST_F(CPUTensorTest, canAccessF16Tensor) { + Shape shape = {4, 3, 2}; + auto strides = ov::Strides({6, 2, 1}); + + auto memptr = create_memory(create_memdesc(Precision::FP16, shape, strides)); + { + std::shared_ptr t = std::make_shared(memptr); + EXPECT_NE(nullptr, t->data()); + ASSERT_EQ(ov::element::f16, t->get_element_type()); + EXPECT_NO_THROW(t->data(ov::element::f16)); + EXPECT_NO_THROW(t->data()); + EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); + } +} + +// SetShape +TEST_F(CPUTensorTest, canSetShape) { + const Shape origShape = {1, 2, 3}; + const ov::Shape ov_origShape = origShape.toPartialShape().to_shape(); + auto strides = ov::Strides({6, 3, 1}); + auto memdesc = 
create_memdesc(Precision::FP32, origShape, strides); + auto memptr = create_memory(memdesc); + std::shared_ptr t = std::make_shared(memptr); + + const Shape newShape({4, 5, 6}); + const ov::Shape ov_newShape = newShape.toPartialShape().to_shape(); + auto new_strides = ov::Strides{30, 6, 1}; + auto new_memdesc = create_memdesc(Precision::FP32, newShape, new_strides); + + // set_shape to a bigger memory + { + auto blocked_memdesc = dynamic_cast(memdesc.get()); + EXPECT_CALL(*blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(new_memdesc)); + + const void* orig_data = t->data(); + ASSERT_EQ(t->get_shape(), ov_origShape); + ASSERT_NO_THROW(t->set_shape(ov_newShape)); + ASSERT_EQ(ov_newShape, t->get_shape()); + ASSERT_EQ(byteStrides(ov::row_major_strides(ov_newShape), t->get_element_type()), t->get_strides()); + ASSERT_NE(orig_data, t->data()); + } + + // set_shape for smaller memory - does not perform reallocation + { + auto new_blocked_memdesc = dynamic_cast(new_memdesc.get()); + EXPECT_CALL(*new_blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(memdesc)); + const void* orig_data = t->data(); + t->set_shape(ov_origShape); + ASSERT_EQ(ov_origShape, t->get_shape()); + ASSERT_EQ(orig_data, t->data()); + } +} + +TEST_F(CPUTensorTest, canSyncMemoryAndTensor) { + const Shape origShape = {1, 2, 3}; + const ov::Shape ov_origShape = origShape.toPartialShape().to_shape(); + auto strides = ov::Strides({6, 3, 1}); + auto memdesc = create_memdesc(Precision::FP32, origShape, strides); + auto memptr = create_memory(memdesc); + std::shared_ptr t = std::make_shared(memptr); + + ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byteStrides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + + const Shape newShape({4, 5, 6}); + const ov::Shape ov_newShape = newShape.toPartialShape().to_shape(); + auto new_strides = ov::Strides{30, 6, 1}; + auto new_memdesc = create_memdesc(Precision::FP32, newShape, new_strides); + + // reallocate memory out boundary of tensor instance + { + auto blocked_memdesc = dynamic_cast(memdesc.get()); + EXPECT_CALL(*blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(new_memdesc)); + + auto desc2 = memptr->getDescPtr()->cloneWithNewDims(newShape.getStaticDims(), true); + memptr->redefineDesc(desc2); + ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byteStrides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + } +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/unit/cpu_tensor_test_ext.cpp b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test_ext.cpp new file mode 100644 index 00000000000000..d3dd6d98bce8ee --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test_ext.cpp @@ -0,0 +1,156 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include +#include +#include "openvino/core/except.hpp" +#include "openvino/core/partial_shape.hpp" + +#include "cpu_memory.h" +#include "cpu_tensor.h" +#include "openvino/runtime/itensor.hpp" + +using namespace ov::intel_cpu; +using namespace InferenceEngine; + +using CPUTensorExtTest = ::testing::Test; + +inline ov::Strides byteStrides(const ov::Strides& strides, const ov::element::Type& type) { + ov::Strides byte_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + 
byte_strides[i] = strides[i] * type.size(); + return byte_strides; +} + +inline MemoryPtr create_memory(Precision prc, const Shape& shape) { + dnnl::engine eng(dnnl::engine::kind::cpu, 0); + CpuBlockedMemoryDescPtr desc; + desc = std::make_shared(prc, shape); + return std::make_shared(eng, desc); +} + +TEST_F(CPUTensorExtTest, canCreateTensor) { + Shape shape{4, 3, 2}; + ov::Shape ov_shape = shape.toPartialShape().to_shape(); + + std::shared_ptr t = std::make_shared(create_memory(Precision::FP32, shape)); + const std::size_t totalSize = ov::shape_size(ov_shape); + ASSERT_EQ(totalSize, t->get_size()); + ASSERT_NE(nullptr, t->data()); + ASSERT_EQ(ov::element::f32, t->get_element_type()); + ASSERT_EQ(ov_shape, t->get_shape()); + ASSERT_NE(ov_shape, t->get_strides()); + ASSERT_EQ(byteStrides(ov::Strides({6, 2, 1}), t->get_element_type()), t->get_strides()); + ASSERT_EQ(ov::element::f32.size() * totalSize, t->get_byte_size()); + ASSERT_THROW(t->data(ov::element::i64), ov::Exception); + ASSERT_THROW(t->data(), ov::Exception); +} + +TEST_F(CPUTensorExtTest, canAccessF16Tensor) { + Shape shape = {4, 3, 2}; + std::shared_ptr t = std::make_shared(create_memory(Precision::FP16, shape)); + EXPECT_NE(nullptr, t->data()); + ASSERT_EQ(ov::element::f16, t->get_element_type()); + EXPECT_NO_THROW(t->data(ov::element::f16)); + EXPECT_NO_THROW(t->data()); + EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); +} + +// SetShape +TEST_F(CPUTensorExtTest, canSetShape) { + const ov::Shape origShape({1, 2, 3}); + std::shared_ptr t = std::make_shared(create_memory(Precision::FP32, {1, 2, 3})); + const ov::Shape newShape({4, 5, 6}); + + const void* orig_data = t->data(); + ASSERT_EQ(t->get_shape(), origShape); + ASSERT_NO_THROW(t->set_shape({4, 5, 6})); + ASSERT_EQ(newShape, t->get_shape()); + ASSERT_EQ(byteStrides(ov::row_major_strides(newShape), t->get_element_type()), t->get_strides()); + ASSERT_NE(orig_data, t->data()); + + // set_shape for smaller memory - does not perform reallocation + { + orig_data = t->data(); + t->set_shape(origShape); + ASSERT_EQ(origShape, t->get_shape()); + ASSERT_EQ(orig_data, t->data()); + } +} + +TEST_F(CPUTensorExtTest, emptySize) { + ov::PartialShape pshape{0, 3, 2}; + Shape shape{pshape}; + const ov::Shape origShape({0, 3, 2}); + + std::shared_ptr t = std::make_shared(create_memory(Precision::FP32, shape)); + + ASSERT_EQ(ov::element::f32, t->get_element_type()); + ASSERT_EQ(0, t->get_size()); + ASSERT_EQ(0, t->get_byte_size()); + ASSERT_EQ(origShape, t->get_shape()); + ASSERT_EQ(byteStrides(ov::Strides({0, 0, 0}), t->get_element_type()), t->get_strides()); + EXPECT_NO_THROW(t->data()); +} + +TEST_F(CPUTensorExtTest, canCreateTensorWithDynamicShape) { + ov::PartialShape pshape{-1, 3, 2}; + Shape shape{pshape}; + + std::shared_ptr t; + + // construct with memory with dynamic shape + ASSERT_NO_THROW(t = std::make_shared(create_memory(Precision::FP32, shape))); + ASSERT_THROW(t->get_shape(), ov::Exception); + ASSERT_THROW(t->get_strides(), ov::Exception); + + // change memory to dynamic shape + { + auto memptr = create_memory(Precision::FP32, {4, 3, 2}); + ASSERT_NO_THROW(t = std::make_shared(memptr)); + + ov::PartialShape pshape{{1, 10}, 3, 2}; + CpuBlockedMemoryDescPtr desc2 = std::make_shared(Precision::FP32, Shape(pshape)); + memptr->redefineDesc(desc2); + ASSERT_THROW(t->get_shape(), ov::Exception); + ASSERT_THROW(t->get_strides(), ov::Exception); + } + + // set_shape + const ov::Shape newShape({4, 0, 2}); + 
ASSERT_NO_THROW(t = std::make_shared(create_memory(Precision::FP32, {4, 3, 2}))); + + const void* orig_data = t->data(); + ASSERT_NO_THROW(t->set_shape({4, 0, 2})); + ASSERT_EQ(newShape, t->get_shape()); + ASSERT_EQ(ov::Strides({0, 0, 0}), t->get_strides()); + ASSERT_EQ(orig_data, t->data()); +} + +TEST_F(CPUTensorExtTest, canSyncMemoryAndTensor) { + Shape orig_shape{4, 3, 2}; + + auto memptr = create_memory(Precision::FP32, orig_shape); + std::shared_ptr t = std::make_shared(memptr); + ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byteStrides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + + // reallocate memory out boundary of tensor instance + { + Shape new_shape{1, 5, 2}; + + auto desc2 = memptr->getDescPtr()->cloneWithNewDims(new_shape.getStaticDims(), true); + memptr->redefineDesc(desc2); + ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byteStrides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + } +} \ No newline at end of file