Support multi-GPU monitoring (#15)
Inspired by PR #12 from @herrnils

* Add support for monitoring multiple GPUs (usage and memory)
* Update `show-graph` tool to visualize all GPU metrics in a single view
* Enable GPU monitoring in `uprof-sample` if GPU_MONITOR_NVIDIA is set
* Fix invalid visualization in `show-graph` tool when multiple event
files are passed time-unordered
cedric-chedaleux authored Oct 4, 2024
2 parents 04f7ecd + 4fa5dd3 commit 3f145e4
Showing 8 changed files with 139 additions and 64 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -10,9 +10,9 @@

This project provides a tiny C++ profiling library for monitoring:
* execution time
* CPU usage
* CPU(s) usage
* memory usage
* GPU usage and memory
* GPU(s) usage and memory

This library aims at collecting metrics on embedded devices to monitor device
performance while running heavy tasks or booting, for example. Those metrics can
@@ -69,11 +69,13 @@ To monitor a specific GPU, you must subclass `IGPUMonitor`:
class MyGPUMonitor: public uprofile::IGPUMonitor {
public:
float getUsage() override;
void getMemory(int& usedMem, int& totalMem) override;
const std::vector<float>& getUsage() const override;
void getMemory(std::vector<int>& usedMem, std::vector<int>& totalMem) const override;
};
```

As you can see from the interface methods, `cppuprofile` **supports multi-GPU monitoring**.
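
For illustration, a filled-in sketch of such a subclass might look like the following. Everything here is hypothetical: the hard-coded values are invented, the include path is assumed, and `start()`, `stop()` and `watching()` are assumed to be the interface's remaining virtual methods (as the `NvidiaMonitor` overrides further down in this commit suggest).

```cpp
#include <vector>
#include <igpumonitor.h> // include path assumed

class MyGPUMonitor : public uprofile::IGPUMonitor {
public:
    void start(int /*period*/) override { m_watching = true; }
    void stop() override { m_watching = false; }
    bool watching() const override { return m_watching; }

    // One entry per GPU, in percent
    const std::vector<float>& getUsage() const override { return m_usages; }

    // One entry per GPU, in KiB
    void getMemory(std::vector<int>& usedMem, std::vector<int>& totalMem) const override
    {
        usedMem = {512 * 1024, 256 * 1024};            // 512 MiB and 256 MiB used
        totalMem = {8 * 1024 * 1024, 8 * 1024 * 1024}; // 8 GiB per GPU
    }

private:
    bool m_watching = false;
    std::vector<float> m_usages{42.f, 17.f}; // fake readings for two GPUs
};
```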

And then inject it at runtime to the `uprofile` monitoring system:

```cpp
12 changes: 7 additions & 5 deletions lib/igpumonitor.h
@@ -10,11 +10,13 @@
#ifndef IGPUMONITOR_H_
#define IGPUMONITOR_H_

#include <vector>

namespace uprofile
{

/**
* Interface to implement for monitoring GPU usage and memory
* Interface to implement for monitoring GPU(s) usage and memory
*
* No generic abstraction of GPU metrics exists
* on Linux or Windows. So a specific IGPUMonitor class should
@@ -33,10 +35,10 @@ class IGPUMonitor
// Return if monitor is currently watching data
virtual bool watching() const = 0;

// Usage should be in percentage
virtual float getUsage() const = 0;
// usedMem and totalMem should be returned as KiB
virtual void getMemory(int& usedMem, int& totalMem) const = 0;
// Usage values should be in percentage (one entry per GPU)
virtual const std::vector<float>& getUsage() const = 0;
// usedMem and totalMem should be returned as KiB (one entry per GPU)
virtual void getMemory(std::vector<int>& usedMem, std::vector<int>& totalMem) const = 0;
};

}
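
Since `getUsage()` and `getMemory()` now report one entry per GPU, a caller could consume the interface roughly as sketched below. The function name and printing are illustrative only, the include path is an assumption, and units follow the comments above (percent and KiB).

```cpp
#include <cstdio>
#include <vector>
#include <igpumonitor.h> // include path assumed

// Print usage (%) and memory (KiB) for every GPU reported by a monitor
void printGpuStats(const uprofile::IGPUMonitor& monitor)
{
    std::vector<int> usedMem, totalMem;
    monitor.getMemory(usedMem, totalMem); // filled with one entry per GPU, in KiB

    const std::vector<float>& usages = monitor.getUsage(); // one entry per GPU, in percent
    for (size_t i = 0; i < usages.size(); ++i) {
        int used = i < usedMem.size() ? usedMem[i] : 0;
        int total = i < totalMem.size() ? totalMem[i] : 0;
        std::printf("GPU %zu: %.1f %% | %d / %d KiB\n", i, usages[i], used, total);
    }
}
```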
104 changes: 75 additions & 29 deletions lib/monitors/nvidiamonitor.cpp
@@ -20,39 +20,81 @@
#include <sys/wait.h>
#include <unistd.h>
#endif
#include <memory>

using namespace std;

const string nvidiaSmiCmdName = "/usr/bin/nvidia-smi";
const string errorMsg = "Failed to monitor nvidia-smi process";

struct RawMetric {
string index;
string usage;
string usedMem;
string totalMem;
};

#if defined(__linux__)
int read_nvidia_smi_stdout(int fd, string& gpuUsage, string& usedMem, string& totalMem)
int read_nvidia_smi_stdout(int fd, vector<RawMetric>& metrics)
{
string line;
while (line.find('\n') == string::npos) { // full line read
char buffer[4096];
ssize_t count = read(fd, buffer, sizeof(buffer)); // if child process crashes, we gonna be blocked here forever
if (count == -1) {
return errno;
} else if (count > 0) { // there is something to read
line += string(buffer, count);
size_t nbCollected = 0;
while (nbCollected < metrics.size()) {
string line;

// nvidia-smi dumps metrics for each GPU line by line
// so read the stdout line by line and fill the input metrics buffer
while (line.find('\n') == string::npos) { // full line read
char buffer[4096];
ssize_t count = read(fd, buffer, sizeof(buffer)); // if the child process crashes, this read blocks forever
if (count == -1) {
return errno;
} else if (count > 0) { // there is something to read
line += string(buffer, count);
}
}
}

// Remove colon to have only spaces and use istringstream
auto noSpaceEnd = remove(line.begin(), line.end(), ',');
if (noSpaceEnd == line.end()) { // output trace does not have comma so something went wrong with the command
return ENODATA;
// Remove commas to keep only spaces and use istringstream
auto noSpaceEnd = remove(line.begin(), line.end(), ',');
if (noSpaceEnd == line.end()) { // output line has no comma, so something went wrong with the command
return ENODATA;
}
line.erase(noSpaceEnd, line.end());
istringstream ss(line);
RawMetric metric;
ss >> metric.index >> metric.usage >> metric.usedMem >> metric.totalMem;
metrics[nbCollected] = metric;
nbCollected++;
}

line.erase(noSpaceEnd, line.end());
std::istringstream ss(line);
ss >> gpuUsage >> usedMem >> totalMem;

return 0;
}
#endif

uprofile::NvidiaMonitor::NvidiaMonitor()
{
// Run nvidia-smi once to retrieve the number of GPUs
// and initialize the usage and memory vectors
try {
char buffer[128];
string result = "";
string cmd = nvidiaSmiCmdName;
cmd += " --query-gpu=count --format=csv,noheader,nounits";
FILE* pipe = popen(cmd.c_str(), "r");
if (!pipe) {
throw runtime_error("popen() failed!");
}
while (!feof(pipe)) {
if (fgets(buffer, 128, pipe) != NULL)
result += buffer;
}
pclose(pipe);
m_nbGPUs = static_cast<size_t>(std::stoull(result));
m_totalMems = vector<int>(m_nbGPUs, 0);
m_usedMems = vector<int>(m_nbGPUs, 0);
m_gpuUsages = vector<float>(m_nbGPUs, 0.0);
} catch (const exception& err) {
cerr << errorMsg << endl;
}
}

uprofile::NvidiaMonitor::~NvidiaMonitor()
Expand All @@ -70,17 +112,17 @@ void uprofile::NvidiaMonitor::stop()
abortWatchGPU();
}

float uprofile::NvidiaMonitor::getUsage() const
const std::vector<float>& uprofile::NvidiaMonitor::getUsage() const
{
std::lock_guard<std::mutex> lock(m_mutex);
return m_gpuUsage;
return m_gpuUsages;
}

void uprofile::NvidiaMonitor::getMemory(int& usedMem, int& totalMem) const
void uprofile::NvidiaMonitor::getMemory(std::vector<int>& usedMem, std::vector<int>& totalMem) const
{
std::lock_guard<std::mutex> lock(m_mutex);
usedMem = m_usedMem;
totalMem = m_totalMem;
usedMem = m_usedMems;
totalMem = m_totalMems;
}

void uprofile::NvidiaMonitor::watchGPU(int period)
@@ -94,7 +136,7 @@ void uprofile::NvidiaMonitor::watchGPU(int period)
args[0] = (char*)"/usr/bin/nvidia-smi";
string period_arg = "-lms=" + to_string(period); // -lms repeats the query every <period> milliseconds
args[1] = (char*)period_arg.c_str();
args[2] = (char*)"--query-gpu=utilization.gpu,memory.used,memory.total";
args[2] = (char*)"--query-gpu=index,utilization.gpu,memory.used,memory.total";
args[3] = (char*)"--format=csv,noheader,nounits";
args[4] = NULL;
string output;
@@ -128,9 +170,9 @@ void uprofile::NvidiaMonitor::watchGPU(int period)
m_watching = true;
m_watcherThread = unique_ptr<std::thread>(new thread([stdout_fd, pid, this]() {
while (watching()) {
string gpuUsage, usedMem, totalMem;
vector<RawMetric> metrics(m_nbGPUs);
// if the child process crashes, an error is raised here and the thread ends
int err = read_nvidia_smi_stdout(stdout_fd, gpuUsage, usedMem, totalMem);
int err = read_nvidia_smi_stdout(stdout_fd, metrics);
if (err != 0) {
cerr << errorMsg << ": read_error = " << strerror(err) << endl;
unique_lock<mutex> lk(m_mutex);
@@ -140,9 +182,13 @@ void uprofile::NvidiaMonitor::watchGPU(int period)
}

unique_lock<mutex> lk(m_mutex);
m_gpuUsage = !gpuUsage.empty() ? stof(gpuUsage) : 0.f;
m_usedMem = !usedMem.empty() ? stoi(usedMem) * 1024 : 0; // MiB to KiB
m_totalMem = !totalMem.empty() ? stoi(totalMem) * 1024 : 0; // MiB to KiB
for (size_t i = 0; i < metrics.size(); ++i) {
const auto& m = metrics[i];
size_t idx = stoull(m.index);
m_gpuUsages[idx] = !m.usage.empty() ? stof(m.usage) : 0.f;
m_usedMems[idx] = !m.usedMem.empty() ? stoi(m.usedMem) * 1024 : 0; // MiB to KiB
m_totalMems[idx] = !m.totalMem.empty() ? stoi(m.totalMem) * 1024 : 0; // MiB to KiB
}
lk.unlock();
}
}));
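
Putting this file's pieces together: for a 200 ms period the watcher spawns `/usr/bin/nvidia-smi -lms=200 --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits`, which prints one comma-separated line per GPU on every refresh. The standalone sketch below is not part of the commit and the sample line is invented, but it shows how such a line splits into the `RawMetric` fields with the same comma-strip plus `istringstream` approach used in `read_nvidia_smi_stdout()`.

```cpp
#include <algorithm>
#include <iostream>
#include <sstream>
#include <string>

int main()
{
    // Hypothetical nvidia-smi output line: index, usage %, used MiB, total MiB
    std::string line = "1, 42, 1534, 8192";

    // Same technique as read_nvidia_smi_stdout(): drop commas, then stream-split on whitespace
    line.erase(std::remove(line.begin(), line.end(), ','), line.end());
    std::istringstream ss(line);

    std::string index, usage, usedMem, totalMem;
    ss >> index >> usage >> usedMem >> totalMem;

    std::cout << "GPU " << index << ": " << usage << " %, "
              << usedMem << " / " << totalMem << " MiB" << std::endl;
}
```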
13 changes: 6 additions & 7 deletions lib/monitors/nvidiamonitor.h
@@ -17,8 +17,6 @@
#include <thread>
#include <vector>

using namespace std;

namespace uprofile
{
class NvidiaMonitor : public IGPUMonitor
@@ -30,8 +28,8 @@ class NvidiaMonitor : public IGPUMonitor
UPROFAPI void start(int period) override;
UPROFAPI void stop() override;
UPROFAPI bool watching() const override;
UPROFAPI float getUsage() const override;
UPROFAPI void getMemory(int& usedMem, int& totalMem) const override;
UPROFAPI const std::vector<float>& getUsage() const override;
UPROFAPI void getMemory(std::vector<int>& usedMem, std::vector<int>& totalMem) const override;

private:
void watchGPU(int period);
@@ -40,9 +38,10 @@
mutable std::mutex m_mutex;
std::unique_ptr<std::thread> m_watcherThread;
bool m_watching = false;
int m_totalMem = 0;
int m_usedMem = 0;
float m_gpuUsage = 0.f;
std::vector<int> m_totalMems;
std::vector<int> m_usedMems;
std::vector<float> m_gpuUsages;
size_t m_nbGPUs = 0;
};

}
14 changes: 9 additions & 5 deletions lib/uprofileimpl.cpp
@@ -190,8 +190,10 @@ void UProfileImpl::dumpGpuUsage()
return;
}

float usage = m_gpuMonitor->getUsage();
write(ProfilingType::GPU_USAGE, {std::to_string(usage)});
auto const& usage = m_gpuMonitor->getUsage();
for (size_t i = 0; i < usage.size(); ++i) {
write(ProfilingType::GPU_USAGE, {std::to_string(i), std::to_string(usage[i])});
}
}

void UProfileImpl::dumpGpuMemory()
@@ -200,9 +202,11 @@
return;
}

int usedMem, totalMem;
m_gpuMonitor->getMemory(usedMem, totalMem);
write(ProfilingType::GPU_MEMORY, {std::to_string(usedMem), std::to_string(totalMem)});
vector<int> usedMems, totalMems;
m_gpuMonitor->getMemory(usedMems, totalMems);
for (size_t i = 0; i < usedMems.size(); ++i) {
write(ProfilingType::GPU_MEMORY, {std::to_string(i), std::to_string(usedMems[i]), std::to_string(totalMems[i])});
}
}

vector<float> UProfileImpl::getInstantCpuUsage()
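
With this change every GPU gets its own record per sampling tick, keyed by its index. Assuming `write()` joins the metric tag, the timestamp and the extra fields with colons, as the parsing comments in `tools/show-graph` describe, the event file could contain lines like the following (timestamps and values invented for illustration):

```
gpu:123456:0:42.000000
gpu:123456:1:17.000000
gpu_mem:123456:0:1572864:8388608
gpu_mem:123456:1:786432:8388608
```

The GPU index in the third field is what lets `show-graph` split the data into one trace per GPU.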
4 changes: 4 additions & 0 deletions sample/CMakeLists.txt
@@ -21,6 +21,10 @@ ELSE()
add_compile_options(-Wall -Werror)
ENDIF()

IF (GPU_MONITOR_NVIDIA)
ADD_DEFINITIONS(-DGPU_MONITOR_NVIDIA)
ENDIF()

SET(Sample_SRCS
main.cpp
)
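
`GPU_MONITOR_NVIDIA` is presumably a CMake option defined outside this diff (in the top-level `CMakeLists.txt`); when it is enabled at configure time, for example with something like `cmake -DGPU_MONITOR_NVIDIA=ON`, the compile definition above activates the GPU-monitoring code paths in the sample shown below.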
8 changes: 8 additions & 0 deletions sample/main.cpp
@@ -11,6 +11,9 @@
#include <stdlib.h>
#include <thread>
#include <uprofile.h>
#if defined(GPU_MONITOR_NVIDIA)
#include <monitors/nvidiamonitor.h>
#endif

void printSystemMemory()
{
@@ -32,6 +35,11 @@ int main(int argc, char* argv[])
printf(")\n");

// --- START MONITORING ---
#if defined(GPU_MONITOR_NVIDIA)
uprofile::addGPUMonitor(new uprofile::NvidiaMonitor);
uprofile::startGPUMemoryMonitoring(200);
uprofile::startGPUUsageMonitoring(200);
#endif
uprofile::startCPUUsageMonitoring(200);
uprofile::startSystemMemoryMonitoring(200);
uprofile::startProcessMemoryMonitoring(200);
38 changes: 24 additions & 14 deletions tools/show-graph
@@ -95,7 +95,7 @@ def create_cpu_graphs(df):


def create_sys_mem_graphs(df):
# 'sys_mem' metrics (format is 'mem:<timestamp>:<total>:<available>:<free>')
# 'sys_mem' metrics (format is 'sys_mem:<timestamp>:<total>:<available>:<free>')
if df.empty:
return None

@@ -108,7 +108,7 @@


def create_proc_mem_graphs(df):
# 'proc_mem' metrics (format is 'mem:<timestamp>:<rss>:<shared>')
# 'proc_mem' metrics (format is 'proc_mem:<timestamp>:<rss>:<shared>')
if df.empty:
return None

@@ -121,27 +121,34 @@


def create_gpu_mem_graphs(df):
# 'gpu_mem' metrics (format is 'gpu_mem:<timestamp>:<total>:<used>')
# 'gpu_mem' metrics (format is 'gpu_mem:<timestamp>:<gpu_number>:<used>:<total>')
if df.empty:
return None

names = ["Used", "Total"]
for index in range(2):
yield go.Scatter(x=pd.to_datetime(df['timestamp'], unit='ms'),
y=(pd.to_numeric(df["extra_{}".format(index + 1)], downcast="integer") / 1024),
name=names[index],
showlegend=True)

gpus = pd.unique(df['extra_1'])
for gpu in gpus:
gpu_df = df[df['extra_1'] == gpu]
for index in range(2):
yield go.Scatter(x=pd.to_datetime(gpu_df['timestamp'], unit='ms'),
y=(pd.to_numeric(gpu_df["extra_{}".format(index + 2)], downcast="integer") / 1024),
name="GPU {} {}".format(gpu, names[index]),
showlegend=True)


def create_gpu_usage_graphs(df):
# 'gpu' metrics (format is 'gpu:<timestamp>:<percentage_usage>')
# 'gpu' metrics (format is 'gpu:<timestamp>:<gpu_number>:<percentage_usage>')
if df.empty:
return None

yield go.Scatter(x=pd.to_datetime(df['timestamp'], unit='ms'),
y=pd.to_numeric(df['extra_1']),
name="GPU usage",
showlegend=True)
gpus = pd.unique(df['extra_1'])
for gpu in gpus:
gpu_df = df[df['extra_1'] == gpu]
yield go.Scatter(x=pd.to_datetime(gpu_df['timestamp'], unit='ms'),
y=pd.to_numeric(gpu_df['extra_2']),
name="GPU {} usage".format(gpu),
showlegend=True)


def build_graphs(input_files, metrics):
@@ -165,7 +172,10 @@ def build_graphs(input_files, metrics):
global_df = pd.DataFrame()
for input in input_files:
with open(input) as f:
global_df = pd.concat([global_df, read(f)])
global_df = pd.concat([global_df, read(f)], sort=True)

# Make sure data are sorted by ascending timestamp
global_df = global_df.sort_values('timestamp')

for index, metric in enumerate(metrics):
row_index = index + 1
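
Ordering matters here because Plotly connects points in row order: if event files are concatenated out of chronological order, each trace zig-zags back and forth along the time axis, which is the broken visualization the commit message refers to. Note that `sort_values` returns a new DataFrame rather than sorting in place, hence the reassignment above.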
