Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/framework/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
} // if (create_local_scope)
} // if (create_vars)

platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();

for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(4) << op->DebugStringEx(local_scope);

platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));

op->Run(*local_scope, place_);
Expand Down
2 changes: 1 addition & 1 deletion paddle/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

cc_library(profiler SRCS profiler.cc DEPS device_context)
cc_library(profiler SRCS profiler.cc DEPS device_context paddle_memory)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

if(NOT WITH_C_API AND WITH_FLUID)
Expand Down
111 changes: 94 additions & 17 deletions paddle/platform/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ limitations under the License. */
namespace paddle {
namespace platform {

/* start profiler event name */
constexpr char kStartProfiler[] = "_start_profiler_";

/* stop profiler event name */
constexpr char kStopProfiler[] = "_stop_profiler_";

/* start cuda profiler on every device event name */

// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler used, CUDA or CPU.
Expand Down Expand Up @@ -60,6 +68,13 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
}
#endif
cpu_ns_ = GetTimeInNsec();

if (has_cuda_) {
memory_used_chars_ = memory::memory_usage(dev_ctx->GetPlace());
} else {
memory_used_chars_ = memory::memory_usage(platform::CPUPlace());
}
VLOG(3) << "create event : " << memory_used_chars_;
}

std::string Event::kind() const {
Expand Down Expand Up @@ -92,6 +107,11 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif
}

double Event::MemoryUsed(const Event& e) const {
return static_cast<double>(e.memory_used_chars_ - memory_used_chars_) /
(1024 * 1024);
}

#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
Expand Down Expand Up @@ -162,7 +182,7 @@ void EnableProfiler(ProfilerState state) {
}
#endif
// Mark the profiling start.
Mark("_start_profiler_", nullptr);
Mark(kStartProfiler, nullptr);
}

void ResetProfiler() {
Expand All @@ -187,7 +207,7 @@ void DisableProfiler(EventSortingKey sorted_key) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
Mark(kStopProfiler, nullptr);
g_state = ProfilerState::kDisabled;

std::vector<std::vector<Event>> all_events = GetAllEvents();
Expand Down Expand Up @@ -238,13 +258,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events,

std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0;
double app_total_time = 0;

for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
std::unordered_map<std::string, int> event_idx;

for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") {
if (events[i][j].kind() == "mark") {
if (events[i][j].name() == kStartProfiler) {
app_total_time -= events[i][j].GetCpuNs() / 1000000.0;
}
if (events[i][j].name() == kStopProfiler) {
app_total_time += events[i][j].GetCpuNs() / 1000000.0;
}
} else if (events[i][j].kind() == "push") {
pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
Expand All @@ -257,14 +286,21 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
double event_time = (g_profiler_place == "CUDA")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);

double event_memory_used = rit->MemoryUsed(events[i][j]);
double total_memory_used =
static_cast<double>(rit->GetMemoryUsed()) / (1024 * 1024);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

double total_memory_used = static_cast<double>(rit->GetMemoryUsed()) / (1024 * 1024);

==>

double total_memory_used = static_cast<double>(rit->GetMemoryUsed() + event_memory_used)  * kMegabyte;

Where kMegabyte equals to 1.0/1024/1024.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

event_memory_used means the memory of this operator creating's.
total_memory_used means, up to now, the total memory has been used, it should include event_memory_used.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we must overload the placement new operator to reach the goal.

std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size());

if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
EventItem event_item = {event_name, 1,
event_time, event_time,
event_time, event_time,
total_memory_used, event_memory_used,
event_memory_used};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
Expand All @@ -277,6 +313,14 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);

// min memory used
event_items[index].min_memory_used =
std::min(event_memory_used, event_items[index].min_memory_used);

// max memory used
event_items[index].max_memory_used =
std::max(event_memory_used, event_items[index].max_memory_used);
}

// remove the push marker from the list
Expand All @@ -288,7 +332,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}
}
}
// average time
// average time and memory used
for (auto& item : event_items) {
item.ave_time = item.total_time / item.calls;
}
Expand All @@ -308,35 +352,68 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}

// Print report
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
app_total_time);
}

void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
const size_t data_width, double app_total_time) {
double app_total_memory = .0;

for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
app_total_memory =
std::max(event_item.total_memory_used, app_total_memory);
}
}

// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";

// clang-format off
std::cout << "Place: " << g_profiler_place << "\t"
<< "Total Time:" << app_total_time << "ms" << "\t"
<< "Total Memory:" << app_total_memory << "MB" << "\t"
<< "Sorted by " << sorted_domain
<< " in descending order in the same thread\n"
<< std::endl;
// clang-format on

// Output events table
// clang-format off
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
std::cout << std::setw(name_width) << "Event"
<< std::setw(data_width) << "Calls"
<< std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min."
<< std::setw(data_width) << "Max."
<< std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Total Memory."
<< std::setw(data_width) << "Min Memory."
<< std::setw(data_width) << "Max Memory."
<< std::endl;
// clang-format on

for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];

// clang-format off
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.total_memory_used
<< std::setw(data_width) << event_item.min_memory_used
<< std::setw(data_width) << event_item.max_memory_used
<< std::endl;
// clang-format on
}
}
std::cout << std::endl;
Expand Down
14 changes: 13 additions & 1 deletion paddle/platform/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <forward_list>
#include <list>
#include <mutex>
#include <vector>

#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"

namespace paddle {
Expand All @@ -43,12 +46,18 @@ class Event {

double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;
double MemoryUsed(const Event& e) const;
/* Get memory used attribute */
int64_t GetMemoryUsed() const { return memory_used_chars_; }
/* Get cpu time used attribute */
int64_t GetCpuNs() const { return cpu_ns_; }

private:
EventKind kind_;
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
int64_t memory_used_chars_;
bool has_cuda_;
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_ = nullptr;
Expand Down Expand Up @@ -124,6 +133,9 @@ struct EventItem {
double min_time;
double max_time;
double ave_time;
double total_memory_used;
double min_memory_used;
double max_memory_used;
};

// Candidate keys to sort the profiling report
Expand All @@ -144,7 +156,7 @@ void ParseEvents(std::vector<std::vector<Event>>&,
// Print results
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
const size_t data_width, double app_total_time);

} // namespace platform
} // namespace paddle
27 changes: 17 additions & 10 deletions paddle/platform/profiler_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,20 @@ limitations under the License. */
TEST(Event, CpuElapsedTime) {
using paddle::platform::Event;
using paddle::platform::EventKind;
using namespace paddle::platform;

Event start_event(EventKind::kPushRange, "test", 0, nullptr);
DeviceContext* dev_ctx = new CPUDeviceContext();

Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0;
while (counter != 1000) {
counter++;
}
Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);

delete dev_ctx;
}

#ifdef PADDLE_WITH_CUDA
Expand All @@ -46,27 +51,26 @@ TEST(Event, CudaElapsedTime) {
}
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);

delete dev_ctx;
}
#endif

TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext;
using paddle::platform::Event;
using paddle::platform::EventKind;
using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;
using namespace paddle::platform;

ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA;
dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#else
dev_ctx = new CPUDeviceContext();
#endif

EnableProfiler(state);
PADDLE_ENFORCE_NOT_NULL(dev_ctx);

/* Usage 1:
* PushEvent(evt_name, dev_ctx);
Expand All @@ -75,6 +79,7 @@ TEST(RecordEvent, RecordEvent) {
* ...
* PopEvent(evt_name, dev_ctx);
*/

for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
Expand Down Expand Up @@ -121,6 +126,8 @@ TEST(RecordEvent, RecordEvent) {
}
}
}

delete dev_ctx;
EXPECT_EQ(cuda_startup_count % 5, 0);
EXPECT_EQ(start_profiler_count, 1);

Expand Down