diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 9a232b08434d29..6368d5575f3328 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -113,11 +113,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, } // if (create_local_scope) } // if (create_vars) + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); VLOG(4) << op->DebugStringEx(local_scope); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(op->Type(), pool.Get(place_)); op->Run(*local_scope, place_); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index d68caea99719b3..e649adfc86c34c 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -37,7 +37,7 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) -cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_library(profiler SRCS profiler.cc DEPS device_context paddle_memory) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) if(NOT WITH_C_API AND WITH_FLUID) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 2a8afc940393ba..40ca1c3c6418d0 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -20,6 +20,14 @@ limitations under the License. */ namespace paddle { namespace platform { +/* start profiler event name */ +constexpr char kStartProfiler[] = "_start_profiler_"; + +/* stop profiler event name */ +constexpr char kStopProfiler[] = "_stop_profiler_"; + +/* start cuda profiler on every device event name */ + // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; // To record which timer the profiler used, CUDA or CPU. @@ -60,6 +68,13 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id, } #endif cpu_ns_ = GetTimeInNsec(); + + if (has_cuda_) { + memory_used_chars_ = memory::memory_usage(dev_ctx->GetPlace()); + } else { + memory_used_chars_ = memory::memory_usage(platform::CPUPlace()); + } + VLOG(3) << "create event : " << memory_used_chars_; } std::string Event::kind() const { @@ -92,6 +107,11 @@ double Event::CudaElapsedMs(const Event& e) const { #endif } +double Event::MemoryUsed(const Event& e) const { + return static_cast(e.memory_used_chars_ - memory_used_chars_) / + (1024 * 1024); +} + #ifdef PADDLE_WITH_CUDA static void ForEachDevice(std::function func) { auto original_device = GetCurrentDeviceId(); @@ -162,7 +182,7 @@ void EnableProfiler(ProfilerState state) { } #endif // Mark the profiling start. - Mark("_start_profiler_", nullptr); + Mark(kStartProfiler, nullptr); } void ResetProfiler() { @@ -187,7 +207,7 @@ void DisableProfiler(EventSortingKey sorted_key) { PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, "Can't disable profiling, since it's not starting."); // Mark the profiling stop. - Mark("_stop_profiler_", nullptr); + Mark(kStopProfiler, nullptr); g_state = ProfilerState::kDisabled; std::vector> all_events = GetAllEvents(); @@ -238,13 +258,22 @@ void ParseEvents(std::vector>& events, std::vector> events_table; size_t max_name_width = 0; + double app_total_time = 0; + for (size_t i = 0; i < events.size(); i++) { std::list pushed_events; std::vector event_items; std::unordered_map event_idx; for (size_t j = 0; j < events[i].size(); j++) { - if (events[i][j].kind() == "push") { + if (events[i][j].kind() == "mark") { + if (events[i][j].name() == kStartProfiler) { + app_total_time -= events[i][j].GetCpuNs() / 1000000.0; + } + if (events[i][j].name() == kStopProfiler) { + app_total_time += events[i][j].GetCpuNs() / 1000000.0; + } + } else if (events[i][j].kind() == "push") { pushed_events.push_back(events[i][j]); } else if (events[i][j].kind() == "pop") { std::list::reverse_iterator rit = pushed_events.rbegin(); @@ -257,14 +286,21 @@ void ParseEvents(std::vector>& events, double event_time = (g_profiler_place == "CUDA") ? rit->CudaElapsedMs(events[i][j]) : rit->CpuElapsedMs(events[i][j]); + + double event_memory_used = rit->MemoryUsed(events[i][j]); + double total_memory_used = + static_cast(rit->GetMemoryUsed()) / (1024 * 1024); std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); max_name_width = std::max(max_name_width, event_name.size()); if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); - EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time}; + EventItem event_item = {event_name, 1, + event_time, event_time, + event_time, event_time, + total_memory_used, event_memory_used, + event_memory_used}; event_items.push_back(event_item); } else { int index = event_idx[event_name]; @@ -277,6 +313,14 @@ void ParseEvents(std::vector>& events, // max time event_items[index].max_time = std::max(event_time, event_items[index].max_time); + + // min memory used + event_items[index].min_memory_used = + std::min(event_memory_used, event_items[index].min_memory_used); + + // max memory used + event_items[index].max_memory_used = + std::max(event_memory_used, event_items[index].max_memory_used); } // remove the push marker from the list @@ -288,7 +332,7 @@ void ParseEvents(std::vector>& events, } } } - // average time + // average time and memory used for (auto& item : event_items) { item.ave_time = item.total_time / item.calls; } @@ -308,35 +352,68 @@ void ParseEvents(std::vector>& events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, + app_total_time); } void PrintProfiler(std::vector>& events_table, std::string& sorted_domain, const size_t name_width, - const size_t data_width) { + const size_t data_width, double app_total_time) { + double app_total_memory = .0; + + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + app_total_memory = + std::max(event_item.total_memory_used, app_total_memory); + } + } + // Output header information std::cout << "\n------------------------->" << " Profiling Report " << "<-------------------------\n\n"; - std::cout << "Place: " << g_profiler_place << std::endl; - std::cout << "Time unit: ms" << std::endl; - std::cout << "Sorted by " << sorted_domain - << " in descending order in the same thread\n\n"; + + // clang-format off + std::cout << "Place: " << g_profiler_place << "\t" + << "Total Time:" << app_total_time << "ms" << "\t" + << "Total Memory:" << app_total_memory << "MB" << "\t" + << "Sorted by " << sorted_domain + << " in descending order in the same thread\n" + << std::endl; + // clang-format on + // Output events table + // clang-format off std::cout.setf(std::ios::left); - std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total" - << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." << std::endl; + std::cout << std::setw(name_width) << "Event" + << std::setw(data_width) << "Calls" + << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." + << std::setw(data_width) << "Max." + << std::setw(data_width) << "Ave." + << std::setw(data_width) << "Total Memory." + << std::setw(data_width) << "Min Memory." + << std::setw(data_width) << "Max Memory." + << std::endl; + // clang-format on + for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { EventItem& event_item = events_table[i][j]; + + // clang-format off std::cout << std::setw(name_width) << event_item.name << std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.total_time << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time - << std::setw(data_width) << event_item.ave_time << std::endl; + << std::setw(data_width) << event_item.ave_time + << std::setw(data_width) << event_item.total_memory_used + << std::setw(data_width) << event_item.min_memory_used + << std::setw(data_width) << event_item.max_memory_used + << std::endl; + // clang-format on } } std::cout << std::endl; diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 8de1e6ad296d1e..9755f8af1cf796 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include #include #include + +#include "paddle/memory/memory.h" #include "paddle/platform/device_context.h" namespace paddle { @@ -43,12 +46,18 @@ class Event { double CpuElapsedMs(const Event& e) const; double CudaElapsedMs(const Event& e) const; + double MemoryUsed(const Event& e) const; + /* Get memory used attribute */ + int64_t GetMemoryUsed() const { return memory_used_chars_; } + /* Get cpu time used attribute */ + int64_t GetCpuNs() const { return cpu_ns_; } private: EventKind kind_; std::string name_; uint32_t thread_id_; int64_t cpu_ns_; + int64_t memory_used_chars_; bool has_cuda_; #ifdef PADDLE_WITH_CUDA cudaEvent_t event_ = nullptr; @@ -124,6 +133,9 @@ struct EventItem { double min_time; double max_time; double ave_time; + double total_memory_used; + double min_memory_used; + double max_memory_used; }; // Candidate keys to sort the profiling report @@ -144,7 +156,7 @@ void ParseEvents(std::vector>&, // Print results void PrintProfiler(std::vector>& events_table, std::string& sorted_domain, const size_t name_width, - const size_t data_width); + const size_t data_width, double app_total_time); } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 81f10c91342f76..23a0aaf40e85b8 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -18,15 +18,20 @@ limitations under the License. */ TEST(Event, CpuElapsedTime) { using paddle::platform::Event; using paddle::platform::EventKind; + using namespace paddle::platform; - Event start_event(EventKind::kPushRange, "test", 0, nullptr); + DeviceContext* dev_ctx = new CPUDeviceContext(); + + Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); EXPECT_TRUE(start_event.has_cuda() == false); int counter = 0; while (counter != 1000) { counter++; } - Event stop_event(EventKind::kPopRange, "test", 0, nullptr); + Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); + + delete dev_ctx; } #ifdef PADDLE_WITH_CUDA @@ -46,27 +51,26 @@ TEST(Event, CudaElapsedTime) { } Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); + + delete dev_ctx; } #endif TEST(RecordEvent, RecordEvent) { - using paddle::platform::DeviceContext; - using paddle::platform::Event; - using paddle::platform::EventKind; - using paddle::platform::RecordEvent; - using paddle::platform::ProfilerState; - using paddle::platform::EventSortingKey; + using namespace paddle::platform; ProfilerState state = ProfilerState::kCPU; DeviceContext* dev_ctx = nullptr; #ifdef PADDLE_WITH_CUDA - using paddle::platform::CUDADeviceContext; - using paddle::platform::CUDAPlace; state = ProfilerState::kCUDA; dev_ctx = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); +#else + dev_ctx = new CPUDeviceContext(); #endif + EnableProfiler(state); + PADDLE_ENFORCE_NOT_NULL(dev_ctx); /* Usage 1: * PushEvent(evt_name, dev_ctx); @@ -75,6 +79,7 @@ TEST(RecordEvent, RecordEvent) { * ... * PopEvent(evt_name, dev_ctx); */ + for (int loop = 0; loop < 3; ++loop) { for (int i = 1; i < 5; ++i) { std::string name = "op_" + std::to_string(i); @@ -121,6 +126,8 @@ TEST(RecordEvent, RecordEvent) { } } } + + delete dev_ctx; EXPECT_EQ(cuda_startup_count % 5, 0); EXPECT_EQ(start_profiler_count, 1);