From 564aecee3fda5205eb2c782f9d0921f6636309ad Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 11:23:37 +0200 Subject: [PATCH 01/52] Files for tensort rt pose detection, for now nothing done. --- .../3_extract_from_image_TensorRT.cpp | 134 ++++++++++++++ .../openpose/pose/poseExtractorTensorRT.hpp | 52 ++++++ src/openpose/pose/poseExtractorTensorRT.cpp | 170 ++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 examples/tutorial_pose/3_extract_from_image_TensorRT.cpp create mode 100644 include/openpose/pose/poseExtractorTensorRT.hpp create mode 100644 src/openpose/pose/poseExtractorTensorRT.cpp diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp new file mode 100644 index 000000000..48cbcbb96 --- /dev/null +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -0,0 +1,134 @@ +// ------------------------- OpenPose Library Tutorial - Pose - Example 1 - Extract from Image ------------------------- +// This first example shows the user how to: + // 1. Load an image (`filestream` module) + // 2. Extract the pose of that image (`pose` module) + // 3. Render the pose on a resized copy of the input image (`pose` module) + // 4. Display the rendered pose (`gui` module) +// In addition to the previous OpenPose modules, we also need to use: + // 1. `core` module: for the Array class that the `pose` module needs + // 2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively + +// 3rdparty dependencies +#include // DEFINE_bool, DEFINE_int32, DEFINE_int64, DEFINE_uint64, DEFINE_double, DEFINE_string +#include // google::InitGoogleLogging +// OpenPose dependencies +#include +#include +#include +#include +#include + +// See all the available parameter options withe the `--help` flag. E.g. `./build/examples/openpose/openpose.bin --help`. +// Note: This command will show you flags for other unnecessary 3rdparty files. Check only the flags for the OpenPose +// executable. E.g. for `openpose.bin`, look for `Flags from examples/openpose/openpose.cpp:`. +// Debugging +DEFINE_int32(logging_level, 3, "The logging level. Integer in the range [0, 255]. 0 will output any log() message, while" + " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for" + " low priority messages and 4 for important ones."); +// Producer +DEFINE_string(image_path, "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image."); +// OpenPose +DEFINE_string(model_pose, "COCO", "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), " + "`MPI_4_layers` (15 keypoints, even faster but less accurate)."); +DEFINE_string(model_folder, "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located."); +DEFINE_string(net_resolution, "656x368", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased," + " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect" + " ratio possible to the images or videos to be processed. E.g. the default `656x368` is" + " optimal for 16:9 videos, e.g. full HD (1980x1080) and HD (1280x720) videos."); +DEFINE_string(resolution, "1280x720", "The image resolution (display and output). Use \"-1x-1\" to force the program to use the" + " default images resolution."); +DEFINE_int32(num_gpu_start, 0, "GPU device start number."); +DEFINE_double(scale_gap, 0.3, "Scale gap between scales. 
No effect unless scale_number > 1. Initial scale is always 1." + " If you want to change the initial scale, you actually want to multiply the" + " `net_resolution` by your desired initial scale."); +DEFINE_int32(scale_number, 1, "Number of scales to average."); +// OpenPose Rendering +DEFINE_bool(disable_blending, false, "If blending is enabled, it will merge the results with the original frame. If disabled, it" + " will only display the results on a black background."); +DEFINE_double(render_threshold, 0.05, "Only estimated keypoints whose score confidences are higher than this threshold will be" + " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;" + " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also" + " more false positives (i.e. wrong detections)."); +DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" + " hide it. Only valid for GPU rendering."); + +int openPoseTutorialPose1() +{ + op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High); + // ------------------------- INITIALIZATION ------------------------- + // Step 1 - Set logging level + // - 0 will output all the logging messages + // - 255 will output nothing + op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.", __LINE__, __FUNCTION__, __FILE__); + op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level); + op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Step 2 - Read Google flags (user defined configuration) + // outputSize + const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720"); + // netInputSize + const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "656x368"); + // netOutputSize + const auto netOutputSize = netInputSize; + // poseModel + const auto poseModel = op::flagsToPoseModel(FLAGS_model_pose); + // Check no contradictory flags enabled + if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.) + op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__); + if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1) + op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__); + // Logging + op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Step 3 - Initialize all required classes + op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap}; + op::CvMatToOpOutput cvMatToOpOutput{outputSize}; + op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, + FLAGS_model_folder, FLAGS_num_gpu_start}; + op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold, + !FLAGS_disable_blending, (float)FLAGS_alpha_pose}; + op::OpOutputToCvMat opOutputToCvMat{outputSize}; + const op::Point windowedSize = outputSize; + op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 1"}; + // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. 
we init resources here) + poseExtractorCaffe.initializationOnThread(); + poseRenderer.initializationOnThread(); + + // ------------------------- POSE ESTIMATION AND RENDERING ------------------------- + // Step 1 - Read and load image, error if empty (possibly wrong path) + cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); + if(inputImage.empty()) + op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__); + // Step 2 - Format input image to OpenPose input and output formats + op::Array netInputArray; + std::vector scaleRatios; + std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage); + double scaleInputToOutput; + op::Array outputArray; + std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); + // Step 3 - Estimate poseKeypoints + poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); + const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); + // Step 4 - Render poseKeypoints + poseRenderer.renderPose(outputArray, poseKeypoints); + // Step 5 - OpenPose output format to cv::Mat + auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); + + // ------------------------- SHOWING RESULT AND CLOSING ------------------------- + // Step 1 - Show results + frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0) + // Step 2 - Logging information message + op::log("Example 1 successfully finished.", op::Priority::High); + // Return successful message + return 0; +} + +int main(int argc, char *argv[]) +{ + // Initializing google logging (Caffe uses it for logging) + google::InitGoogleLogging("openPoseTutorialPose1"); + + // Parsing command line flags + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Running openPoseTutorialPose1 + return openPoseTutorialPose1(); +} diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp new file mode 100644 index 000000000..d745c30f6 --- /dev/null +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -0,0 +1,52 @@ +#ifdef USE_CAFFE +#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#define OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace op +{ + class OP_API PoseExtractorCaffe : public PoseExtractor + { + public: + PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, + const ScaleMode heatMapScale = ScaleMode::ZeroToOne); + + virtual ~PoseExtractorCaffe(); + + void netInitializationOnThread(); + + void forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios = {1.f}); + + const float* getHeatMapCpuConstPtr() const; + + const float* getHeatMapGpuConstPtr() const; + + const float* getPoseGpuConstPtr() const; + + private: + const float mResizeScale; + std::shared_ptr spNet; + std::shared_ptr> spResizeAndMergeCaffe; + std::shared_ptr> spNmsCaffe; + std::shared_ptr> spBodyPartConnectorCaffe; + // Init with thread + boost::shared_ptr> spCaffeNetOutputBlob; + std::shared_ptr> spHeatMapsBlob; + std::shared_ptr> spPeaksBlob; + std::shared_ptr> spPoseBlob; + + DELETE_COPY(PoseExtractorCaffe); + }; +} + +#endif // 
OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#endif diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp new file mode 100644 index 000000000..bc4374782 --- /dev/null +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -0,0 +1,170 @@ +#ifdef USE_CAFFE +#include +#include +#include +#include +#include +#include +#include + +namespace op +{ + PoseExtractorCaffe::PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, + const ScaleMode heatMapScale) : + PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, + mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, + spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, + modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + spResizeAndMergeCaffe{std::make_shared>()}, + spNmsCaffe{std::make_shared>()}, + spBodyPartConnectorCaffe{std::make_shared>()} + { + try + { + const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; + const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); + if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) + error("Net input and output size must be proportional. resizeScaleCheck = " + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + PoseExtractorCaffe::~PoseExtractorCaffe() + { + } + + void PoseExtractorCaffe::netInitializationOnThread() + { + try + { + log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + + // Caffe net + spNet->initializationOnThread(); + spCaffeNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + // HeatMaps extractor blob and layer + spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + // Pose extractor blob and layer + spPeaksBlob = {std::make_shared>(1,1,1,1)}; + spNmsCaffe->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + // Pose extractor blob and layer + spPoseBlob = {std::make_shared>(1,1,1,1)}; + spBodyPartConnectorCaffe->setPoseModel(mPoseModel); + spBodyPartConnectorCaffe->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + void PoseExtractorCaffe::forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios) + { + try + { + // Security checks + if (inputNetData.empty()) + error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + + // 1. Caffe deep network + spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + + // 2. 
Resize heat maps + merge different scales + spResizeAndMergeCaffe->setScaleRatios(scaleRatios); + #ifndef CPU_ONLY + spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else + error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + + // 3. Get peaks by Non-Maximum Suppression + spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); + #ifndef CPU_ONLY + spNmsCaffe->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else + error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + + // Get scale net to output + const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); + const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; + mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; + + // 4. Connecting body parts + spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); + spBodyPartConnectorCaffe->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + + // GPU version not implemented yet + spBodyPartConnectorCaffe->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); + // spBodyPartConnectorCaffe->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + const float* PoseExtractorCaffe::getHeatMapCpuConstPtr() const + { + try + { + checkThread(); + return spHeatMapsBlob->cpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + const float* PoseExtractorCaffe::getHeatMapGpuConstPtr() const + { + try + { + checkThread(); + return spHeatMapsBlob->gpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + const float* PoseExtractorCaffe::getPoseGpuConstPtr() const + { + try + { + error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + checkThread(); + return spPoseBlob->gpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } +} + +#endif From dfc1f827c0dba611ed535c5fb03ce1febc772b86 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 11:27:48 +0200 Subject: [PATCH 02/52] Adding timer in new demo and checking build before replacing inference. 
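The timing added here is the plain std::chrono pattern; a minimal, self-contained sketch of it
(illustrative only — the diff below inlines the same calls directly into the example):

    #include <chrono>
    #include <string>

    int main()
    {
        // Record a start point, do the work, then convert the elapsed
        // nanoseconds to seconds for the log message (same calls as below).
        const auto timerBegin = std::chrono::high_resolution_clock::now();
        // ... pose estimation steps to be timed ...
        const auto now = std::chrono::high_resolution_clock::now();
        const auto totalTimeSec =
            (double)std::chrono::duration_cast<std::chrono::nanoseconds>(now - timerBegin).count() * 1e-9;
        const auto message = "Total time: " + std::to_string(totalTimeSec) + " seconds.";
        return 0;
    }
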
--- examples/tutorial_pose/3_extract_from_image_TensorRT.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index 48cbcbb96..f431d322d 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -54,6 +54,9 @@ DEFINE_double(alpha_pose, 0.6, "Blending factor (range int openPoseTutorialPose1() { + op::log("Starting pose estimation.", op::Priority::High); + const auto timerBegin = std::chrono::high_resolution_clock::now(); + op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- // Step 1 - Set logging level @@ -117,6 +120,11 @@ int openPoseTutorialPose1() frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0) // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); + + const auto now = std::chrono::high_resolution_clock::now(); + const auto totalTimeSec = (double)std::chrono::duration_cast(now-timerBegin).count() * 1e-9; + const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; + // Return successful message return 0; } From a4885e0f15bf67bfaeb08fa5a96cacccf5ded733 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 09:41:54 +0000 Subject: [PATCH 03/52] PoseExtractorTensorRT changed names for build conflicts but still performs Caffe inference. --- .../openpose/pose/poseExtractorTensorRT.hpp | 22 +++---- src/openpose/pose/poseExtractorTensorRT.cpp | 62 +++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index d745c30f6..270d2a8f4 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,6 +1,6 @@ #ifdef USE_CAFFE -#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP -#define OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP +#define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #include #include @@ -13,14 +13,14 @@ namespace op { - class OP_API PoseExtractorCaffe : public PoseExtractor + class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, const ScaleMode heatMapScale = ScaleMode::ZeroToOne); - virtual ~PoseExtractorCaffe(); + virtual ~PoseExtractorTensorRT(); void netInitializationOnThread(); @@ -35,18 +35,18 @@ namespace op private: const float mResizeScale; std::shared_ptr spNet; - std::shared_ptr> spResizeAndMergeCaffe; - std::shared_ptr> spNmsCaffe; - std::shared_ptr> spBodyPartConnectorCaffe; + std::shared_ptr> spResizeAndMergeTensorRT; + std::shared_ptr> spNmsTensorRT; + std::shared_ptr> spBodyPartConnectorTensorRT; // Init with thread - boost::shared_ptr> spCaffeNetOutputBlob; + boost::shared_ptr> spTensorRTNetOutputBlob; std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; - 
DELETE_COPY(PoseExtractorCaffe); + DELETE_COPY(PoseExtractorTensorRT); }; } -#endif // OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #endif diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index bc4374782..0bd1dc6df 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -5,20 +5,20 @@ #include #include #include -#include +#include namespace op { - PoseExtractorCaffe::PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, const ScaleMode heatMapScale) : PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, - spResizeAndMergeCaffe{std::make_shared>()}, - spNmsCaffe{std::make_shared>()}, - spBodyPartConnectorCaffe{std::make_shared>()} + spResizeAndMergeTensorRT{std::make_shared>()}, + spNmsTensorRT{std::make_shared>()}, + spBodyPartConnectorTensorRT{std::make_shared>()} { try { @@ -33,35 +33,35 @@ namespace op } } - PoseExtractorCaffe::~PoseExtractorCaffe() + PoseExtractorTensorRT::~PoseExtractorTensorRT() { } - void PoseExtractorCaffe::netInitializationOnThread() + void PoseExtractorTensorRT::netInitializationOnThread() { try { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); - // Caffe net + // TensorRT net spNet->initializationOnThread(); - spCaffeNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); + spTensorRTNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + spResizeAndMergeTensorRT->Reshape({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer spPeaksBlob = {std::make_shared>(1,1,1,1)}; - spNmsCaffe->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + spNmsTensorRT->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer spPoseBlob = {std::make_shared>(1,1,1,1)}; - spBodyPartConnectorCaffe->setPoseModel(mPoseModel); - spBodyPartConnectorCaffe->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); + spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); + spBodyPartConnectorTensorRT->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); cudaCheck(__LINE__, __FUNCTION__, __FILE__); log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); @@ -72,7 +72,7 @@ namespace op } } - void PoseExtractorCaffe::forwardPass(const Array& inputNetData, const Point& inputDataSize, 
const std::vector& scaleRatios) + void PoseExtractorTensorRT::forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios) { try { @@ -80,25 +80,25 @@ namespace op if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - // 1. Caffe deep network + // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms // 2. Resize heat maps + merge different scales - spResizeAndMergeCaffe->setScaleRatios(scaleRatios); + spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); #ifndef CPU_ONLY - spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else - error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif // 3. Get peaks by Non-Maximum Suppression - spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); + spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY - spNmsCaffe->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms + spNmsTensorRT->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else - error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif // Get scale net to output @@ -107,15 +107,15 @@ namespace op mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; // 4. 
Connecting body parts - spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); - spBodyPartConnectorCaffe->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); + spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); // GPU version not implemented yet - spBodyPartConnectorCaffe->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); - // spBodyPartConnectorCaffe->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + spBodyPartConnectorTensorRT->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); + // spBodyPartConnectorTensorRT->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); } catch (const std::exception& e) { @@ -123,7 +123,7 @@ namespace op } } - const float* PoseExtractorCaffe::getHeatMapCpuConstPtr() const + const float* PoseExtractorTensorRT::getHeatMapCpuConstPtr() const { try { @@ -137,7 +137,7 @@ namespace op } } - const float* PoseExtractorCaffe::getHeatMapGpuConstPtr() const + const float* PoseExtractorTensorRT::getHeatMapGpuConstPtr() const { try { @@ -151,7 +151,7 @@ namespace op } } - const float* PoseExtractorCaffe::getPoseGpuConstPtr() const + const float* PoseExtractorTensorRT::getPoseGpuConstPtr() const { try { From c05580d2838e4c9e33bba560de72ba7ec895b01b Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 09:55:25 +0000 Subject: [PATCH 04/52] Started modifying tutorial pose 3. --- .../tutorial_pose/3_extract_from_image_TensorRT.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index f431d322d..14e13fac5 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -1,4 +1,4 @@ -// ------------------------- OpenPose Library Tutorial - Pose - Example 1 - Extract from Image ------------------------- +// ------------------------- OpenPose Library Tutorial - Pose - Example 3 - Extract from Image with TensorRT ------------------------- // This first example shows the user how to: // 1. Load an image (`filestream` module) // 2. Extract the pose of that image (`pose` module) @@ -52,12 +52,12 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering."); -int openPoseTutorialPose1() +int openPoseTutorialPose3() { op::log("Starting pose estimation.", op::Priority::High); const auto timerBegin = std::chrono::high_resolution_clock::now(); - op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High); + op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- // Step 1 - Set logging level // - 0 will output all the logging messages @@ -114,6 +114,7 @@ int openPoseTutorialPose1() poseRenderer.renderPose(outputArray, poseKeypoints); // Step 5 - OpenPose output format to cv::Mat auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); + const auto now = std::chrono::high_resolution_clock::now(); // ------------------------- SHOWING RESULT AND CLOSING ------------------------- // Step 1 - Show results @@ -121,9 +122,9 @@ int openPoseTutorialPose1() // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); - const auto now = std::chrono::high_resolution_clock::now(); const auto totalTimeSec = (double)std::chrono::duration_cast(now-timerBegin).count() * 1e-9; const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; + op::log(message, op::Priority::High); // Return successful message return 0; @@ -132,11 +133,11 @@ int openPoseTutorialPose1() int main(int argc, char *argv[]) { // Initializing google logging (Caffe uses it for logging) - google::InitGoogleLogging("openPoseTutorialPose1"); + google::InitGoogleLogging("openPoseTutorialPose3"); // Parsing command line flags gflags::ParseCommandLineFlags(&argc, &argv, true); // Running openPoseTutorialPose1 - return openPoseTutorialPose1(); + return openPoseTutorialPose3(); } From 9a97e934eb5d76fa6d5e61f0738b83c146a75464 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 10:43:08 +0000 Subject: [PATCH 05/52] More precise timing. --- .../3_extract_from_image_TensorRT.cpp | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index 14e13fac5..a8e0b9e38 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -52,11 +52,21 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering."); + +static std::vector> timings; + +static void time_now(const std::string& label){ + const auto now = std::chrono::high_resolution_clock::now(); + const auto timing = std::make_pair(label, now); + timings.push_back(timing); +} + int openPoseTutorialPose3() { op::log("Starting pose estimation.", op::Priority::High); - const auto timerBegin = std::chrono::high_resolution_clock::now(); - + + time_now("Start"); + op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- // Step 1 - Set logging level @@ -100,6 +110,7 @@ int openPoseTutorialPose3() cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); if(inputImage.empty()) op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__); + time_now("Step 1"); // Step 2 - Format input image to OpenPose input and output formats op::Array netInputArray; std::vector scaleRatios; @@ -107,14 +118,17 @@ int openPoseTutorialPose3() double scaleInputToOutput; op::Array outputArray; std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); + time_now("Step 2"); // Step 3 - Estimate poseKeypoints poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); + time_now("Step 3"); // Step 4 - Render poseKeypoints poseRenderer.renderPose(outputArray, poseKeypoints); + time_now("Step 4"); // Step 5 - OpenPose output format to cv::Mat auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); - const auto now = std::chrono::high_resolution_clock::now(); + time_now("Step 5"); // ------------------------- SHOWING RESULT AND CLOSING ------------------------- // Step 1 - Show results @@ -122,9 +136,15 @@ int openPoseTutorialPose3() // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); - const auto totalTimeSec = (double)std::chrono::duration_cast(now-timerBegin).count() * 1e-9; + const auto totalTimeSec = (double)std::chrono::duration_cast(timings.back().second-timings.front().second).count() * 1e-9; const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; op::log(message, op::Priority::High); + + for(const auto timing : timings) { + const auto log_time = timing.first + " - " + std::to_string((double)std::chrono::duration_cast>(timing.second - timings.front().second).count()); + op::log(log_time, op::Priority::High); + } + // Return successful message return 0; From 4778ed6c257972f66c8a35f8d819d25e3e58e10e Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:10:21 +0000 Subject: [PATCH 06/52] More precise timings before replacing inference. --- .../3_extract_from_image_TensorRT.cpp | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index a8e0b9e38..b36f362b9 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -52,20 +52,26 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering."); +typedef std::vector> OpTimings; -static std::vector> timings; +static OpTimings timings; -static void time_now(const std::string& label){ +static void timeNow(const std::string& label){ const auto now = std::chrono::high_resolution_clock::now(); const auto timing = std::make_pair(label, now); timings.push_back(timing); } +static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, + const std::chrono::high_resolution_clock::time_point& t2 ) { + return std::to_string((double)std::chrono::duration_cast>(t1 - t2).count()); +} + int openPoseTutorialPose3() { op::log("Starting pose estimation.", op::Priority::High); - time_now("Start"); + timeNow("Start"); op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- @@ -104,13 +110,15 @@ int openPoseTutorialPose3() // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. we init resources here) poseExtractorCaffe.initializationOnThread(); poseRenderer.initializationOnThread(); + + timeNow("Initialization"); // ------------------------- POSE ESTIMATION AND RENDERING ------------------------- // Step 1 - Read and load image, error if empty (possibly wrong path) cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); if(inputImage.empty()) op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__); - time_now("Step 1"); + timeNow("Step 1"); // Step 2 - Format input image to OpenPose input and output formats op::Array netInputArray; std::vector scaleRatios; @@ -118,17 +126,17 @@ int openPoseTutorialPose3() double scaleInputToOutput; op::Array outputArray; std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); - time_now("Step 2"); + timeNow("Step 2"); // Step 3 - Estimate poseKeypoints poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); - time_now("Step 3"); + timeNow("Step 3"); // Step 4 - Render poseKeypoints poseRenderer.renderPose(outputArray, poseKeypoints); - time_now("Step 4"); + timeNow("Step 4"); // Step 5 - OpenPose output format to cv::Mat auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); - time_now("Step 5"); + timeNow("Step 5"); // ------------------------- SHOWING RESULT AND CLOSING ------------------------- // Step 1 - Show results @@ -136,12 +144,12 @@ int openPoseTutorialPose3() // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); - const auto totalTimeSec = (double)std::chrono::duration_cast(timings.back().second-timings.front().second).count() * 1e-9; - const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. 
Total time: " + totalTimeSec + " seconds."; op::log(message, op::Priority::High); - for(const auto timing : timings) { - const auto log_time = timing.first + " - " + std::to_string((double)std::chrono::duration_cast>(timing.second - timings.front().second).count()); + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); op::log(log_time, op::Priority::High); } From 9c258b71f6008ad7457a2201dd0e130e21802957 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:23:35 +0000 Subject: [PATCH 07/52] Clearer timing display. --- examples/tutorial_pose/3_extract_from_image_TensorRT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index b36f362b9..1b7ed7f6d 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -64,7 +64,7 @@ static void timeNow(const std::string& label){ static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, const std::chrono::high_resolution_clock::time_point& t2 ) { - return std::to_string((double)std::chrono::duration_cast>(t1 - t2).count()); + return std::to_string((double)std::chrono::duration_cast>(t1 - t2).count() * 1e3) + " ms"; } int openPoseTutorialPose3() From e6fbd253786bcbdda372e4f08bc373f60f1d2998 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:52:38 +0000 Subject: [PATCH 08/52] Replaced poseExtractorCaffe with poseExtractorTensorRT --- examples/tutorial_pose/3_extract_from_image_TensorRT.cpp | 8 ++++---- include/openpose/pose/headers.hpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index 1b7ed7f6d..f4e7eace1 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -100,7 +100,7 @@ int openPoseTutorialPose3() // Step 3 - Initialize all required classes op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap}; op::CvMatToOpOutput cvMatToOpOutput{outputSize}; - op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, + op::PoseExtractorTensorRT poseExtractorTensorRT{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, FLAGS_model_folder, FLAGS_num_gpu_start}; op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold, !FLAGS_disable_blending, (float)FLAGS_alpha_pose}; @@ -108,7 +108,7 @@ int openPoseTutorialPose3() const op::Point windowedSize = outputSize; op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 1"}; // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. 
we init resources here) - poseExtractorCaffe.initializationOnThread(); + poseExtractorTensorRT.initializationOnThread(); poseRenderer.initializationOnThread(); timeNow("Initialization"); @@ -128,8 +128,8 @@ int openPoseTutorialPose3() std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); timeNow("Step 2"); // Step 3 - Estimate poseKeypoints - poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); - const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); + poseExtractorTensorRT.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); + const auto poseKeypoints = poseExtractorTensorRT.getPoseKeypoints(); timeNow("Step 3"); // Step 4 - Render poseKeypoints poseRenderer.renderPose(outputArray, poseKeypoints); diff --git a/include/openpose/pose/headers.hpp b/include/openpose/pose/headers.hpp index 4fe06d461..4d336060a 100644 --- a/include/openpose/pose/headers.hpp +++ b/include/openpose/pose/headers.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include From f290fc57f750db208923d34695de4a018d317f29 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:53:37 +0000 Subject: [PATCH 09/52] Added inference sample code at end of poseExtractorTensorRT to work on laptop. DOES NOT compile, convenience commit. --- src/openpose/pose/poseExtractorTensorRT.cpp | 448 ++++++++++++++++++++ 1 file changed, 448 insertions(+) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 0bd1dc6df..0ea2e30e6 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -168,3 +168,451 @@ namespace op } #endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ +{ \ + if (status != 0) \ + { \ + std::cout << "Cuda failure: " << status; \ + abort(); \ + } \ +} + +struct Params +{ + std::string deployFile, modelFile, engine, calibrationCache; + std::vector outputs; + int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; + bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; +} gParams; + +static inline int volume(DimsCHW dims) +{ + return dims.c()*dims.h()*dims.w(); +} + +std::vector gInputs; +std::map gInputDimensions; + +// Logger for GIE info/warning/errors +class Logger : public ILogger +{ + void log(Severity severity, const char* msg) override + { + // suppress info-level messages + if (severity != Severity::kINFO || gParams.verbose) + std::cout << msg << std::endl; + } +} gLogger; + +class RndInt8Calibrator : public IInt8EntropyCalibrator +{ +public: + RndInt8Calibrator(int totalSamples = 1) + : mTotalSamples(totalSamples) + , mCurrentSample(0) + { + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + for(auto& elem: gInputDimensions) + { + int elemCount = volume(elem.second); + + std::vector rnd_data(elemCount); + for(auto& val: rnd_data) + val = distribution(generator); + + void * data; + CHECK(cudaMalloc(&data, elemCount * sizeof(float))); + CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice)); + + mInputDeviceBuffers.insert(std::make_pair(elem.first, data)); + } + } + + ~RndInt8Calibrator() + { + for(auto& elem: 
mInputDeviceBuffers) + CHECK(cudaFree(elem.second)); + } + + int getBatchSize() const override + { + return 1; + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) override + { + if (mCurrentSample >= mTotalSamples) + return false; + + for(int i = 0; i < nbBindings; ++i) + bindings[i] = mInputDeviceBuffers[names[i]]; + + ++mCurrentSample; + return true; + } + + const void* readCalibrationCache(size_t&) override + { + return nullptr; + } + + virtual void writeCalibrationCache(const void*, size_t) override + { + } + +private: + int mTotalSamples; + int mCurrentSample; + std::map mInputDeviceBuffers; +}; + +ICudaEngine* caffeToGIEModel() +{ + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), + gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), + *network, + gParams.half2 ? DataType::kHALF:DataType::kFLOAT); + + + if (!blobNameToTensor) + return nullptr; + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // specify which tensors are outputs + for (auto& s : gParams.outputs) + { + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + } + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(gParams.batchSize); + builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); + builder->setHalf2Mode(gParams.half2); + + RndInt8Calibrator calibrator; + if (gParams.int8) + { + builder->setInt8Mode(true); + builder->setInt8Calibrator(&calibrator); + } + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + return engine; +} + +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, 
localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + +void doInference(ICudaEngine& engine) +{ + IExecutionContext *context = engine.createExecutionContext(); + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. + + std::vector buffers(gInputs.size() + gParams.outputs.size()); + for (size_t i = 0; i < gInputs.size(); i++) + createMemory(engine, buffers, gInputs[i]); + + for (size_t i = 0; i < gParams.outputs.size(); i++) + createMemory(engine, buffers, gParams.outputs[i]); + + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); + + for (int j = 0; j < gParams.iterations; j++) + { + float total = 0, ms; + for (int i = 0; i < gParams.avgRuns; i++) + { + if (gParams.hostTime) + { + auto t_start = std::chrono::high_resolution_clock::now(); + context->execute(gParams.batchSize, &buffers[0]); + auto t_end = std::chrono::high_resolution_clock::now(); + ms = std::chrono::duration(t_end - t_start).count(); + } + else + { + cudaEventRecord(start, stream); + context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); + cudaEventRecord(end, stream); + cudaEventSynchronize(end); + cudaEventElapsedTime(&ms, start, end); + } + total += ms; + } + total /= gParams.avgRuns; + std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; + } + + + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); +} + + + +static void printUsage() +{ + printf("\n"); + printf("Mandatory params:\n"); + printf(" --deploy= Caffe deploy file\n"); + printf(" --output= Output blob name (can be specified multiple times)\n"); + + printf("\nOptional params:\n"); + + printf(" --model= Caffe model file (default = no model, random weights used)\n"); + printf(" --batch=N Set batch size (default = %d)\n", gParams.batchSize); + printf(" --device=N Set cuda device to N (default = %d)\n", gParams.device); + printf(" --iterations=N Run N iterations (default = %d)\n", gParams.iterations); + printf(" --avgRuns=N Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", gParams.avgRuns); + printf(" --workspace=N Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize); + printf(" --half2 Run in paired fp16 mode (default = false)\n"); + printf(" --int8 Run in int8 mode (default = false)\n"); + printf(" --verbose Use verbose logging (default = false)\n"); + printf(" --hostTime Measure host time rather than GPU time (default = false)\n"); + printf(" --engine= Generate a serialized GIE engine\n"); + printf(" --calib= Read INT8 calibration cache file\n"); + + fflush(stdout); +} + +bool parseString(const char* arg, const char* name, std::string& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; + if (match) + { + value = arg + n + 3; + std::cout << name << ": " << value << std::endl; + } + return match; +} + +bool parseInt(const char* arg, const char* name, int& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; + if (match) + { + value = atoi(arg + n + 3); + std::cout << name << ": " << value << std::endl; + } + return match; +} + +bool parseBool(const char* arg, const char* 
name, bool& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n); + if (match) + { + std::cout << name << std::endl; + value = true; + } + return match; + +} + + +bool parseArgs(int argc, char* argv[]) +{ + if (argc < 3) + { + printUsage(); + return false; + } + + for (int j = 1; j < argc; j++) + { + if (parseString(argv[j], "model", gParams.modelFile) || parseString(argv[j], "deploy", gParams.deployFile) || parseString(argv[j], "engine", gParams.engine)) + continue; + + if (parseString(argv[j], "calib", gParams.calibrationCache)) + continue; + + std::string output; + if (parseString(argv[j], "output", output)) + { + gParams.outputs.push_back(output); + continue; + } + + if (parseInt(argv[j], "batch", gParams.batchSize) || parseInt(argv[j], "iterations", gParams.iterations) || parseInt(argv[j], "avgRuns", gParams.avgRuns) + || parseInt(argv[j], "device", gParams.device) || parseInt(argv[j], "workspace", gParams.workspaceSize)) + continue; + + if (parseBool(argv[j], "half2", gParams.half2) || parseBool(argv[j], "int8", gParams.int8) + || parseBool(argv[j], "verbose", gParams.verbose) || parseBool(argv[j], "hostTime", gParams.hostTime)) + continue; + + printf("Unknown argument: %s\n", argv[j]); + return false; + } + return true; +} + +static ICudaEngine* createEngine() +{ + ICudaEngine *engine; + + if (!gParams.deployFile.empty()) { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + + if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + } + return engine; + } + + // load directly from serialized engine file if deploy not specified + if (!gParams.engine.empty()) { + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(gParams.engine, std::ios::binary); + if (file.good()) { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + if (gieModelStream) delete [] gieModelStream; + + // assume input to be "data" for deserialized engine + gInputs.push_back("data"); + return engine; + } + + // complain about empty deploy file + std::cerr << "Deploy file not specified" << std::endl; + return nullptr; +} + +int main(int argc, char** argv) +{ + // create a GIE model from the caffe model and serialize it to a stream + + if (!parseArgs(argc, argv)) + return -1; + + cudaSetDevice(gParams.device); + + if (gParams.outputs.size() == 0) + { + std::cerr << "At least one network output must be defined" << std::endl; + return -1; + } + + ICudaEngine* engine = createEngine(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return -1; + } + + doInference(*engine); + engine->destroy(); + + return 0; +} From ddc23969a8574ca4e99752ea65a7f273981f6c55 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 11:48:58 +0200 Subject: [PATCH 10/52] First code adaptation trial. Will not compile, still loads to replace. 
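The adaptation keeps the sample's engine-build path unchanged for now; stripped of option
parsing, it reduces to roughly the sequence below (condensed from the code in this diff; the
file paths and output blob name are placeholders, not values taken from this patch):

    // Parse a Caffe deploy/model pair and build a half2 (paired fp16) TensorRT
    // engine, using the API calls from the bundled sample; error checks omitted.
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    const IBlobNameToTensor* blobs = parser->parse("pose_deploy.prototxt",      // placeholder path
                                                   "pose_iter.caffemodel",      // placeholder path
                                                   *network, DataType::kHALF);
    network->markOutput(*blobs->find("net_output"));  // placeholder output blob name
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(16 << 20);           // 16 MB workspace
    builder->setHalf2Mode(true);
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    IHostMemory* plan = engine->serialize();          // optionally written to disk and reloaded later
    parser->destroy();
    network->destroy();
    builder->destroy();
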
--- .../openpose/pose/poseExtractorTensorRT.hpp | 12 + src/openpose/pose/poseExtractorTensorRT.cpp | 728 +++++++----------- 2 files changed, 293 insertions(+), 447 deletions(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 270d2a8f4..33f781b8a 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -43,6 +43,18 @@ namespace op std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; + + // TensorRT stuff + const Point mNetInputSize; + const Point mNetOutputSize; + const Point mOutputSize; + const int mScaleNumber; + const PoseModel mPoseModel; + const std::string mModelFolder; + const int mGpuId; + const std::vector mHeatMapTypes; + const ScaleMode mHeatMapScale; + ICudaEngine* cudaEngine; DELETE_COPY(PoseExtractorTensorRT); }; diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 0ea2e30e6..c7f3311d8 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -7,6 +7,261 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ +{ \ +if (status != 0) \ +{ \ +std::cout << "Cuda failure: " << status; \ +abort(); \ +} \ +} + +struct Params +{ + std::string deployFile, modelFile, engine, calibrationCache; + std::vector outputs; + int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; + bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; +} gParams; + +static inline int volume(DimsCHW dims) +{ + return dims.c()*dims.h()*dims.w(); +} + +std::vector gInputs; +std::map gInputDimensions; + +// Logger for GIE info/warning/errors +class Logger : public ILogger +{ + void log(Severity severity, const char* msg) override + { + // suppress info-level messages + if (severity != Severity::kINFO || gParams.verbose) + std::cout << msg << std::endl; + } +} gLogger; + + +ICudaEngine* caffeToGIEModel() +{ + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), + gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), + *network, + gParams.half2 ? 
DataType::kHALF:DataType::kFLOAT); + + + if (!blobNameToTensor) + return nullptr; + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // specify which tensors are outputs + + for (auto& s : gParams.outputs) + { + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + } + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); + builder->setHalf2Mode(true); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + return engine; +} + +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + +void doInference(ICudaEngine& engine) +{ + IExecutionContext *context = engine.createExecutionContext(); + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. 
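+    // Each named blob is looked up with getBindingIndex(), so the order of the
+    // createMemory() calls below does not matter: every binding slot in
+    // `buffers` just has to be filled before execute()/enqueue() runs.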
+ + std::vector buffers(gInputs.size() + gParams.outputs.size()); + for (size_t i = 0; i < gInputs.size(); i++) + createMemory(engine, buffers, gInputs[i]); + + for (size_t i = 0; i < gParams.outputs.size(); i++) + createMemory(engine, buffers, gParams.outputs[i]); + + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); + + for (int j = 0; j < gParams.iterations; j++) + { + float total = 0, ms; + for (int i = 0; i < gParams.avgRuns; i++) + { + if (gParams.hostTime) + { + auto t_start = std::chrono::high_resolution_clock::now(); + context->execute(gParams.batchSize, &buffers[0]); + auto t_end = std::chrono::high_resolution_clock::now(); + ms = std::chrono::duration(t_end - t_start).count(); + } + else + { + cudaEventRecord(start, stream); + context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); + cudaEventRecord(end, stream); + cudaEventSynchronize(end); + cudaEventElapsedTime(&ms, start, end); + } + total += ms; + } + total /= gParams.avgRuns; + std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; + } + + + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); +} + + +static ICudaEngine* createEngine() +{ + // TODO replace all gParams with corresponding parameters + ICudaEngine *engine; + + if (!gParams.deployFile.empty()) { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + + if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + } + return engine; + } + + // load directly from serialized engine file if deploy not specified + if (!gParams.engine.empty()) { + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(gParams.engine, std::ios::binary); + if (file.good()) { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + if (gieModelStream) delete [] gieModelStream; + + // assume input to be "data" for deserialized engine + gInputs.push_back("data"); + return engine; + } + + // complain about empty deploy file + std::cerr << "Deploy file not specified" << std::endl; + return nullptr; +} + + + namespace op { PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, @@ -18,7 +273,16 @@ namespace op modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()} + spBodyPartConnectorTensorRT{std::make_shared>()}, + mNetInputSize(netInputSize), + mNetOutputSize(netOutputSize), + mOutputSize(outputSize), + mScaleNumber(scaleNumber), + mPoseModel(poseModel), + mModelFolder(modelFolder), + mGpuId(gpuId), + mHeatMapTypes(heatMapTypes), + mHeatMapScale(heatMapScale) { try { @@ -35,6 +299,9 @@ namespace op PoseExtractorTensorRT::~PoseExtractorTensorRT() { + if(cudaEngine) 
+ engine->destroy(); + } void PoseExtractorTensorRT::netInitializationOnThread() @@ -42,6 +309,13 @@ namespace op try { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + + cudaEngine = createEngine(); + if (!cudaEngine) + { + std::cerr << "Engine could not be created" << std::endl; + return -1; + } // TensorRT net spNet->initializationOnThread(); @@ -81,7 +355,12 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); // 1. TensorRT deep network - spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + //spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + + doInference(inputNetData.getConstPtr()); + + // Replace spNet->forward pass, but how to propagate to next + // Replace spTensorRTNetOututBlob.get() ? // 2. Resize heat maps + merge different scales spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); @@ -168,451 +447,6 @@ namespace op } #endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "NvInfer.h" -#include "NvCaffeParser.h" -using namespace nvinfer1; -using namespace nvcaffeparser1; - -#define CHECK(status) \ -{ \ - if (status != 0) \ - { \ - std::cout << "Cuda failure: " << status; \ - abort(); \ - } \ -} -struct Params -{ - std::string deployFile, modelFile, engine, calibrationCache; - std::vector outputs; - int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; - bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; -} gParams; - -static inline int volume(DimsCHW dims) -{ - return dims.c()*dims.h()*dims.w(); -} - -std::vector gInputs; -std::map gInputDimensions; - -// Logger for GIE info/warning/errors -class Logger : public ILogger -{ - void log(Severity severity, const char* msg) override - { - // suppress info-level messages - if (severity != Severity::kINFO || gParams.verbose) - std::cout << msg << std::endl; - } -} gLogger; - -class RndInt8Calibrator : public IInt8EntropyCalibrator -{ -public: - RndInt8Calibrator(int totalSamples = 1) - : mTotalSamples(totalSamples) - , mCurrentSample(0) - { - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0F, 1.0F); - for(auto& elem: gInputDimensions) - { - int elemCount = volume(elem.second); - - std::vector rnd_data(elemCount); - for(auto& val: rnd_data) - val = distribution(generator); - - void * data; - CHECK(cudaMalloc(&data, elemCount * sizeof(float))); - CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice)); - - mInputDeviceBuffers.insert(std::make_pair(elem.first, data)); - } - } - - ~RndInt8Calibrator() - { - for(auto& elem: mInputDeviceBuffers) - CHECK(cudaFree(elem.second)); - } - - int getBatchSize() const override - { - return 1; - } - - bool getBatch(void* bindings[], const char* names[], int nbBindings) override - { - if (mCurrentSample >= mTotalSamples) - return false; - - for(int i = 0; i < nbBindings; ++i) - bindings[i] = mInputDeviceBuffers[names[i]]; - - ++mCurrentSample; - return true; - } - - const void* readCalibrationCache(size_t&) override - { - return nullptr; - } - - virtual void writeCalibrationCache(const void*, size_t) override - { - } - -private: - int mTotalSamples; - int mCurrentSample; - std::map mInputDeviceBuffers; -}; - -ICudaEngine* caffeToGIEModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set 
the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), - gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), - *network, - gParams.half2 ? DataType::kHALF:DataType::kFLOAT); - - - if (!blobNameToTensor) - return nullptr; - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // specify which tensors are outputs - for (auto& s : gParams.outputs) - { - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - std::cout << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - } - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // Build the engine - builder->setMaxBatchSize(gParams.batchSize); - builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); - builder->setHalf2Mode(gParams.half2); - - RndInt8Calibrator calibrator; - if (gParams.int8) - { - builder->setInt8Mode(true); - builder->setInt8Calibrator(&calibrator); - } - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - return engine; -} - -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - -void doInference(ICudaEngine& engine) -{ - IExecutionContext *context = engine.createExecutionContext(); - // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), - // of these, but in this case we know that there is exactly one input and one output. 
- - std::vector buffers(gInputs.size() + gParams.outputs.size()); - for (size_t i = 0; i < gInputs.size(); i++) - createMemory(engine, buffers, gInputs[i]); - - for (size_t i = 0; i < gParams.outputs.size(); i++) - createMemory(engine, buffers, gParams.outputs[i]); - - cudaStream_t stream; - CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; - CHECK(cudaEventCreate(&start)); - CHECK(cudaEventCreate(&end)); - - for (int j = 0; j < gParams.iterations; j++) - { - float total = 0, ms; - for (int i = 0; i < gParams.avgRuns; i++) - { - if (gParams.hostTime) - { - auto t_start = std::chrono::high_resolution_clock::now(); - context->execute(gParams.batchSize, &buffers[0]); - auto t_end = std::chrono::high_resolution_clock::now(); - ms = std::chrono::duration(t_end - t_start).count(); - } - else - { - cudaEventRecord(start, stream); - context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); - cudaEventRecord(end, stream); - cudaEventSynchronize(end); - cudaEventElapsedTime(&ms, start, end); - } - total += ms; - } - total /= gParams.avgRuns; - std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; - } - - - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); -} - - - -static void printUsage() -{ - printf("\n"); - printf("Mandatory params:\n"); - printf(" --deploy= Caffe deploy file\n"); - printf(" --output= Output blob name (can be specified multiple times)\n"); - - printf("\nOptional params:\n"); - - printf(" --model= Caffe model file (default = no model, random weights used)\n"); - printf(" --batch=N Set batch size (default = %d)\n", gParams.batchSize); - printf(" --device=N Set cuda device to N (default = %d)\n", gParams.device); - printf(" --iterations=N Run N iterations (default = %d)\n", gParams.iterations); - printf(" --avgRuns=N Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", gParams.avgRuns); - printf(" --workspace=N Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize); - printf(" --half2 Run in paired fp16 mode (default = false)\n"); - printf(" --int8 Run in int8 mode (default = false)\n"); - printf(" --verbose Use verbose logging (default = false)\n"); - printf(" --hostTime Measure host time rather than GPU time (default = false)\n"); - printf(" --engine= Generate a serialized GIE engine\n"); - printf(" --calib= Read INT8 calibration cache file\n"); - - fflush(stdout); -} - -bool parseString(const char* arg, const char* name, std::string& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = arg + n + 3; - std::cout << name << ": " << value << std::endl; - } - return match; -} - -bool parseInt(const char* arg, const char* name, int& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = atoi(arg + n + 3); - std::cout << name << ": " << value << std::endl; - } - return match; -} - -bool parseBool(const char* arg, const char* name, bool& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n); - if (match) - { - std::cout << name << std::endl; - value = true; - } - return match; - -} - - -bool parseArgs(int argc, char* argv[]) -{ - if (argc < 3) - { - printUsage(); - return false; - } - - for (int j = 1; j < argc; j++) - { - if (parseString(argv[j], "model", gParams.modelFile) || 
parseString(argv[j], "deploy", gParams.deployFile) || parseString(argv[j], "engine", gParams.engine)) - continue; - - if (parseString(argv[j], "calib", gParams.calibrationCache)) - continue; - - std::string output; - if (parseString(argv[j], "output", output)) - { - gParams.outputs.push_back(output); - continue; - } - - if (parseInt(argv[j], "batch", gParams.batchSize) || parseInt(argv[j], "iterations", gParams.iterations) || parseInt(argv[j], "avgRuns", gParams.avgRuns) - || parseInt(argv[j], "device", gParams.device) || parseInt(argv[j], "workspace", gParams.workspaceSize)) - continue; - - if (parseBool(argv[j], "half2", gParams.half2) || parseBool(argv[j], "int8", gParams.int8) - || parseBool(argv[j], "verbose", gParams.verbose) || parseBool(argv[j], "hostTime", gParams.hostTime)) - continue; - - printf("Unknown argument: %s\n", argv[j]); - return false; - } - return true; -} - -static ICudaEngine* createEngine() -{ - ICudaEngine *engine; - - if (!gParams.deployFile.empty()) { - engine = caffeToGIEModel(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - - - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - } - return engine; - } - - // load directly from serialized engine file if deploy not specified - if (!gParams.engine.empty()) { - char *gieModelStream{nullptr}; - size_t size{0}; - std::ifstream file(gParams.engine, std::ios::binary); - if (file.good()) { - file.seekg(0, file.end); - size = file.tellg(); - file.seekg(0, file.beg); - gieModelStream = new char[size]; - assert(gieModelStream); - file.read(gieModelStream, size); - file.close(); - } - - IRuntime* infer = createInferRuntime(gLogger); - engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); - if (gieModelStream) delete [] gieModelStream; - - // assume input to be "data" for deserialized engine - gInputs.push_back("data"); - return engine; - } - - // complain about empty deploy file - std::cerr << "Deploy file not specified" << std::endl; - return nullptr; -} - -int main(int argc, char** argv) -{ - // create a GIE model from the caffe model and serialize it to a stream - - if (!parseArgs(argc, argv)) - return -1; - - cudaSetDevice(gParams.device); - - if (gParams.outputs.size() == 0) - { - std::cerr << "At least one network output must be defined" << std::endl; - return -1; - } - - ICudaEngine* engine = createEngine(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return -1; - } - - doInference(*engine); - engine->destroy(); - - return 0; -} From f09f27b498ac2aaa8c54f9ae33b7c5864bdb3b95 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 16:49:22 +0200 Subject: [PATCH 11/52] New netTensorRT version, cleaner, ready for debug, loads of questions. 
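
The giexec leftovers move out of poseExtractorTensorRT.cpp into a dedicated
op::NetTensorRT that mirrors the NetCaffe interface, so the extractor no
longer needs to know which backend runs the network. Intended call pattern,
as a sketch (argument types follow the NetCaffe equivalent; caffe::Blob stays
the exchange type for now):

    auto net = std::make_shared<op::NetTensorRT>(
        std::array<int, 4>{scaleNumber, 3, netInputSize.y, netInputSize.x},
        modelFolder + POSE_PROTOTXT[(int)poseModel],
        modelFolder + POSE_TRAINED_MODEL[(int)poseModel],
        gpuId);                          // lastBlobName defaults to "net_output"
    net->initializationOnThread();       // builds the GIE engine from the prototxt
    net->forwardPass(inputData);         // one TensorRT inference on the input buffer
    auto output = net->getOutputBlob();  // boost::shared_ptr<caffe::Blob<float>>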
--- include/openpose/core/netTensorRT.hpp | 62 ++++ .../openpose/pose/poseExtractorTensorRT.hpp | 12 - src/openpose/core/netTensorRT.cpp | 313 ++++++++++++++++++ src/openpose/pose/poseExtractorTensorRT.cpp | 268 +-------------- 4 files changed, 378 insertions(+), 277 deletions(-) create mode 100644 include/openpose/core/netTensorRT.hpp create mode 100644 src/openpose/core/netTensorRT.cpp diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp new file mode 100644 index 000000000..0874f475b --- /dev/null +++ b/include/openpose/core/netTensorRT.hpp @@ -0,0 +1,62 @@ +#ifdef USE_CAFFE +#ifndef OPENPOSE_CORE_NET_TENSORRT_HPP +#define OPENPOSE_CORE_NET_TENSORRT_HPP + +#include +#include +#include + +#include + +namespace op +{ + class OP_API NetTensorRT : public Net + { + public: + NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0, + const std::string& lastBlobName = "net_output"); + + virtual ~NetTensorRT(); + + void initializationOnThread(); + + // Alternative a) getInputDataCpuPtr or getInputDataGpuPtr + forwardPass + float* getInputDataCpuPtr() const; + + float* getInputDataGpuPtr() const; + + // Alternative b) + void forwardPass(const float* const inputNetData = nullptr) const; + + boost::shared_ptr> getOutputBlob() const; + + private: + // Init with constructor + const int mGpuId; + const std::array mNetInputSize4D; + const unsigned long mNetInputMemory; + const std::string mCaffeProto; + const std::string mCaffeTrainedModel; + const std::string mLastBlobName; + // Init with thread + std::unique_ptr> upTensorRTNet; + boost::shared_ptr> spOutputBlob; + + // TensorRT stuff + const Point mNetInputSize; + const Point mNetOutputSize; + const Point mOutputSize; + const int mScaleNumber; + const PoseModel mPoseModel; + const std::string mModelFolder; + const int mGpuId; + const std::vector mHeatMapTypes; + const ScaleMode mHeatMapScale; + nvinfer1::ICudaEngine* cudaEngine; + + DELETE_COPY(NetTensorRT); + }; +} + +#endif // OPENPOSE_CORE_NET_TENSORRT_HPP +#endif diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 33f781b8a..270d2a8f4 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -43,18 +43,6 @@ namespace op std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; - - // TensorRT stuff - const Point mNetInputSize; - const Point mNetOutputSize; - const Point mOutputSize; - const int mScaleNumber; - const PoseModel mPoseModel; - const std::string mModelFolder; - const int mGpuId; - const std::vector mHeatMapTypes; - const ScaleMode mHeatMapScale; - ICudaEngine* cudaEngine; DELETE_COPY(PoseExtractorTensorRT); }; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp new file mode 100644 index 000000000..fe6c7202f --- /dev/null +++ b/src/openpose/core/netTensorRT.cpp @@ -0,0 +1,313 @@ +#ifdef USE_CAFFE +#include // std::accumulate +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ +{ \ +if (status != 0) \ +{ \ +std::cout << "Cuda failure: " << status; \ +abort(); \ +} \ +} + +std::vector gInputs; +std::vector gInputDimensions; + + + +// Logger 
for GIE info/warning/errors +class Logger : public ILogger +{ + void log(Severity severity, const char* msg) override + { + // if suppress info-level message: if (severity != Severity::kINFO) + std::cout << msg << std::endl; + } +} gLogger; + + +ICudaEngine* caffeToGIEModel() +{ + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), + mCaffeTrainedModel.c_str(), + *network, + DataType::kHALF); + + if (!blobNameToTensor) + return nullptr; + + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // specify which tensors are outputs + + + // TODO, if it works switch to something more generic, add as parameter etc + std::string s("net_output"); + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + // 16 megabytes, default in giexec. No idea what's best for Jetson though, + // maybe check dusty_nv's code on github + builder->setMaxWorkspaceSize(16<<20); + builder->setHalf2Mode(true); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + return engine; +} + +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + const int batchSize = 1; + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + + +static ICudaEngine* createEngine() +{ + ICudaEngine *engine; + + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + /* TODO seems unneeded, remove if so. 
+ if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + }*/ + return engine; +} + + +namespace op +{ + NetTensorRT::NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName) : + mGpuId{gpuId}, + // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++ + mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]}, + mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies()) * sizeof(float)}, + mCaffeProto{caffeProto}, + mCaffeTrainedModel{caffeTrainedModel}, + mLastBlobName{lastBlobName} + { + } + + NetTensorRT::~NetTensorRT() + { + if (cudaEngine) + cudaEngine->destroy(); + } + + void NetTensorRT::initializationOnThread() + { + try + { + // Initialize net + cudaSetDevice(mGpuId); + + cudaEngine = createEngine(); + if (!cudaEngine) + { + std::cerr << "Engine could not be created" << std::endl; + return; + } + + // For tensor RT is done in caffeToGIE + /* + //caffe::TensorRT::SetDevice(mGpuId); + upTensorRTNet.reset(new caffe::Net{mTensorRTProto, caffe::TEST}); + upTensorRTNet->CopyTrainedLayersFrom(mTensorRTTrainedModel); + upTensorRTNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); + upTensorRTNet->Reshape(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + // Set spOutputBlob + spOutputBlob = upTensorRTNet->blob_by_name(mLastBlobName); + if (spOutputBlob == nullptr) + error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); + cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + float* NetTensorRT::getInputDataCpuPtr() const + { + try + { + return upTensorRTNet->blobs().at(0)->mutable_cpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + float* NetTensorRT::getInputDataGpuPtr() const + { + try + { + return upTensorRTNet->blobs().at(0)->mutable_gpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + void NetTensorRT::forwardPass(const float* const inputData) const + { + try + { + // Copy frame data to GPU memory + if (inputData != nullptr) + { + + // OLD + //auto* gpuImagePtr = upTensorRTNet->blobs().at(0)->mutable_gpu_data(); + //cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); + + // Tensor RT version + + // TODO maybe move this to init and keep only the execute part + IExecutionContext *context = cudaEngine.createExecutionContext(); + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. 
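+            // Scaffolding warning: a fresh IExecutionContext (and fresh device
+            // buffers below) on every forward pass is bring-up code only. Once
+            // this works, the context and buffers belong in
+            // initializationOnThread() so the hot path is just execute().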
+ + std::vector buffers(gInputs.size() + 1); + for (size_t i = 0; i < gInputs.size(); i++) + createMemory(engine, buffers, gInputs[i]); + + + createMemory(engine, buffers, std::string("net_output")); + + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); + + int batchSize = 1; + context->execute(batchSize, &buffers[0]); + + + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); + + } + // Old Perform deep network forward pass + //upTensorRTNet->ForwardFrom(0); + //cudaCheck(__LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + boost::shared_ptr> NetTensorRT::getOutputBlob() const + { + try + { + return spOutputBlob; + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } +} + +#endif diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index c7f3311d8..000510f1b 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,5 +1,6 @@ #ifdef USE_CAFFE #include +#include #include #include #include @@ -7,260 +8,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvCaffeParser.h" - -using namespace nvinfer1; -using namespace nvcaffeparser1; - -#define CHECK(status) \ -{ \ -if (status != 0) \ -{ \ -std::cout << "Cuda failure: " << status; \ -abort(); \ -} \ -} - -struct Params -{ - std::string deployFile, modelFile, engine, calibrationCache; - std::vector outputs; - int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; - bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; -} gParams; - -static inline int volume(DimsCHW dims) -{ - return dims.c()*dims.h()*dims.w(); -} - -std::vector gInputs; -std::map gInputDimensions; - -// Logger for GIE info/warning/errors -class Logger : public ILogger -{ - void log(Severity severity, const char* msg) override - { - // suppress info-level messages - if (severity != Severity::kINFO || gParams.verbose) - std::cout << msg << std::endl; - } -} gLogger; - - -ICudaEngine* caffeToGIEModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), - gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), - *network, - gParams.half2 ? 
DataType::kHALF:DataType::kFLOAT); - - - if (!blobNameToTensor) - return nullptr; - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // specify which tensors are outputs - - for (auto& s : gParams.outputs) - { - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - std::cout << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - } - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // Build the engine - builder->setMaxBatchSize(1); - builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); - builder->setHalf2Mode(true); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - return engine; -} - -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - -void doInference(ICudaEngine& engine) -{ - IExecutionContext *context = engine.createExecutionContext(); - // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), - // of these, but in this case we know that there is exactly one input and one output. 
- - std::vector buffers(gInputs.size() + gParams.outputs.size()); - for (size_t i = 0; i < gInputs.size(); i++) - createMemory(engine, buffers, gInputs[i]); - - for (size_t i = 0; i < gParams.outputs.size(); i++) - createMemory(engine, buffers, gParams.outputs[i]); - - cudaStream_t stream; - CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; - CHECK(cudaEventCreate(&start)); - CHECK(cudaEventCreate(&end)); - - for (int j = 0; j < gParams.iterations; j++) - { - float total = 0, ms; - for (int i = 0; i < gParams.avgRuns; i++) - { - if (gParams.hostTime) - { - auto t_start = std::chrono::high_resolution_clock::now(); - context->execute(gParams.batchSize, &buffers[0]); - auto t_end = std::chrono::high_resolution_clock::now(); - ms = std::chrono::duration(t_end - t_start).count(); - } - else - { - cudaEventRecord(start, stream); - context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); - cudaEventRecord(end, stream); - cudaEventSynchronize(end); - cudaEventElapsedTime(&ms, start, end); - } - total += ms; - } - total /= gParams.avgRuns; - std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; - } - - - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); -} - - -static ICudaEngine* createEngine() -{ - // TODO replace all gParams with corresponding parameters - ICudaEngine *engine; - - if (!gParams.deployFile.empty()) { - engine = caffeToGIEModel(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - - - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - } - return engine; - } - - // load directly from serialized engine file if deploy not specified - if (!gParams.engine.empty()) { - char *gieModelStream{nullptr}; - size_t size{0}; - std::ifstream file(gParams.engine, std::ios::binary); - if (file.good()) { - file.seekg(0, file.end); - size = file.tellg(); - file.seekg(0, file.beg); - gieModelStream = new char[size]; - assert(gieModelStream); - file.read(gieModelStream, size); - file.close(); - } - - IRuntime* infer = createInferRuntime(gLogger); - engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); - if (gieModelStream) delete [] gieModelStream; - - // assume input to be "data" for deserialized engine - gInputs.push_back("data"); - return engine; - } - - // complain about empty deploy file - std::cerr << "Deploy file not specified" << std::endl; - return nullptr; -} - - namespace op { @@ -269,7 +16,7 @@ namespace op const ScaleMode heatMapScale) : PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, - spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, + spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, @@ -299,9 +46,6 @@ namespace op PoseExtractorTensorRT::~PoseExtractorTensorRT() { - if(cudaEngine) - engine->destroy(); - } void PoseExtractorTensorRT::netInitializationOnThread() @@ -310,16 +54,10 @@ namespace op { log("Starting initialization on 
thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); - cudaEngine = createEngine(); - if (!cudaEngine) - { - std::cerr << "Engine could not be created" << std::endl; - return -1; - } // TensorRT net spNet->initializationOnThread(); - spTensorRTNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); + spTensorRTNetOutputBlob = ((NetTensorRT*)spNet.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer From ba2b435c0178fb09625eef4fdeb1999b3e067ac5 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 15:34:26 +0000 Subject: [PATCH 12/52] Fixed everything to compile, runs, reads network and convert but then segfault, need precise step logs or debug. --- Makefile | 3 ++ include/openpose/core/netTensorRT.hpp | 11 +------ models/pose/coco/pose_deploy_linevec.prototxt | 4 +-- src/openpose/core/netTensorRT.cpp | 32 +++++++++---------- src/openpose/pose/poseExtractorTensorRT.cpp | 14 ++------ 5 files changed, 24 insertions(+), 40 deletions(-) diff --git a/Makefile b/Makefile index 0cdc9bf39..c46c08e25 100644 --- a/Makefile +++ b/Makefile @@ -145,6 +145,9 @@ ifneq ($(CPU_ONLY), 1) LIBRARIES += cudart cublas curand endif +# TensorRT +LIBRARIES += nvinfer nvcaffe_parser + # LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 caffe LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 0874f475b..05f7bc860 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include "NvInfer.h" namespace op { @@ -43,15 +43,6 @@ namespace op boost::shared_ptr> spOutputBlob; // TensorRT stuff - const Point mNetInputSize; - const Point mNetOutputSize; - const Point mOutputSize; - const int mScaleNumber; - const PoseModel mPoseModel; - const std::string mModelFolder; - const int mGpuId; - const std::vector mHeatMapTypes; - const ScaleMode mHeatMapScale; nvinfer1::ICudaEngine* cudaEngine; DELETE_COPY(NetTensorRT); diff --git a/models/pose/coco/pose_deploy_linevec.prototxt b/models/pose/coco/pose_deploy_linevec.prototxt index fbe0c8245..6e4322812 100755 --- a/models/pose/coco/pose_deploy_linevec.prototxt +++ b/models/pose/coco/pose_deploy_linevec.prototxt @@ -1,8 +1,8 @@ input: "image" input_dim: 1 input_dim: 3 -input_dim: 1 # This value will be defined at runtime -input_dim: 1 # This value will be defined at runtime +input_dim: 96 # This value will be defined at runtime +input_dim: 128 # This value will be defined at runtime layer { name: "conv1_1" type: "Convolution" diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index fe6c7202f..0fc5e6e58 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -23,7 +23,7 @@ using namespace nvinfer1; using namespace nvcaffeparser1; -#define CHECK(status) \ +#define CUDA_TENSORRT_CHECK(status) \ { \ if (status != 0) \ { \ @@ -33,7 +33,7 @@ abort(); \ } std::vector gInputs; -std::vector gInputDimensions; +std::map gInputDimensions; @@ -48,7 +48,7 @@ class Logger : public ILogger } gLogger; -ICudaEngine* caffeToGIEModel() +ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& caffeTrainedModel) { // create the builder IBuilder* builder = createInferBuilder(gLogger); @@ -56,8 +56,8 @@ ICudaEngine* caffeToGIEModel() // parse the caffe model to populate the network, then set the outputs INetworkDefinition* 
network = builder->createNetwork(); ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), - mCaffeTrainedModel.c_str(), + const IBlobNameToTensor* blobNameToTensor = parser->parse(caffeProto.c_str(), + caffeTrainedModel.c_str(), *network, DataType::kHALF); @@ -124,24 +124,24 @@ void createMemory(const ICudaEngine& engine, std::vector& buffers, const localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); + CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; exit(1); } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); delete[] localMem; buffers[bindingIndex] = deviceMem; } -static ICudaEngine* createEngine() +static ICudaEngine* createEngine(const std::string& caffeProto, const std::string& caffeTrainedModel) { ICudaEngine *engine; - engine = caffeToGIEModel(); + engine = caffeToGIEModel(caffeProto, caffeTrainedModel); if (!engine) { std::cerr << "Engine could not be created" << std::endl; @@ -192,7 +192,7 @@ namespace op // Initialize net cudaSetDevice(mGpuId); - cudaEngine = createEngine(); + cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) { std::cerr << "Engine could not be created" << std::endl; @@ -260,22 +260,22 @@ namespace op // Tensor RT version // TODO maybe move this to init and keep only the execute part - IExecutionContext *context = cudaEngine.createExecutionContext(); + IExecutionContext *context = cudaEngine->createExecutionContext(); // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. std::vector buffers(gInputs.size() + 1); for (size_t i = 0; i < gInputs.size(); i++) - createMemory(engine, buffers, gInputs[i]); + createMemory(*cudaEngine, buffers, gInputs[i]); - createMemory(engine, buffers, std::string("net_output")); + createMemory(*cudaEngine, buffers, std::string("net_output")); cudaStream_t stream; - CHECK(cudaStreamCreate(&stream)); + CUDA_TENSORRT_CHECK(cudaStreamCreate(&stream)); cudaEvent_t start, end; - CHECK(cudaEventCreate(&start)); - CHECK(cudaEventCreate(&end)); + CUDA_TENSORRT_CHECK(cudaEventCreate(&start)); + CUDA_TENSORRT_CHECK(cudaEventCreate(&end)); int batchSize = 1; context->execute(batchSize, &buffers[0]); diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 000510f1b..722524eee 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -20,16 +20,7 @@ namespace op modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()}, - mNetInputSize(netInputSize), - mNetOutputSize(netOutputSize), - mOutputSize(outputSize), - mScaleNumber(scaleNumber), - mPoseModel(poseModel), - mModelFolder(modelFolder), - mGpuId(gpuId), - mHeatMapTypes(heatMapTypes), - mHeatMapScale(heatMapScale) + spBodyPartConnectorTensorRT{std::make_shared>()} { try { @@ -93,9 +84,8 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); // 1. 
TensorRT deep network - //spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + spNet->forwardPass(inputNetData.getConstPtr()); - doInference(inputNetData.getConstPtr()); // Replace spNet->forward pass, but how to propagate to next // Replace spTensorRTNetOututBlob.get() ? From 97bbc05fc01bc78244d0e82ee7c708183ae2bd89 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 17:49:12 +0200 Subject: [PATCH 13/52] Debug logs. --- src/openpose/core/netTensorRT.cpp | 27 +++++++++++++++++++++ src/openpose/pose/poseExtractorTensorRT.cpp | 4 +++ 2 files changed, 31 insertions(+) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 0fc5e6e58..7759bfbbd 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -187,11 +187,18 @@ namespace op void NetTensorRT::initializationOnThread() { + + std::cout << "InitializationOnThread : start" << std::endl; + try { + + std::cout << "Forward Pass : setting device" << std::endl; // Initialize net cudaSetDevice(mGpuId); + std::cout << "Forward Pass : creating engine" << std::endl; + cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) { @@ -199,6 +206,8 @@ namespace op return; } + std::cout << "Forward Pass : done" << std::endl; + // For tensor RT is done in caffeToGIE /* //caffe::TensorRT::SetDevice(mGpuId); @@ -247,6 +256,8 @@ namespace op void NetTensorRT::forwardPass(const float* const inputData) const { + + std::cout << "Forward Pass : start" << std::endl; try { // Copy frame data to GPU memory @@ -260,10 +271,15 @@ namespace op // Tensor RT version // TODO maybe move this to init and keep only the execute part + + std::cout << "Forward Pass : creating execution context" << std::endl; + IExecutionContext *context = cudaEngine->createExecutionContext(); // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. + std::cout << "Forward Pass : creating CUDA memory" << std::endl; + std::vector buffers(gInputs.size() + 1); for (size_t i = 0; i < gInputs.size(); i++) createMemory(*cudaEngine, buffers, gInputs[i]); @@ -271,16 +287,24 @@ namespace op createMemory(*cudaEngine, buffers, std::string("net_output")); + + std::cout << "Forward Pass : memory created" << std::endl; + cudaStream_t stream; CUDA_TENSORRT_CHECK(cudaStreamCreate(&stream)); cudaEvent_t start, end; CUDA_TENSORRT_CHECK(cudaEventCreate(&start)); CUDA_TENSORRT_CHECK(cudaEventCreate(&end)); + + std::cout << "Forward Pass : executing inference" << std::endl; + int batchSize = 1; context->execute(batchSize, &buffers[0]); + std::cout << "Forward Pass : inference done !" << std::endl; + cudaStreamDestroy(stream); cudaEventDestroy(start); cudaEventDestroy(end); @@ -298,6 +322,7 @@ namespace op boost::shared_ptr> NetTensorRT::getOutputBlob() const { + std::cout << "Getting output blob." << std::endl; try { return spOutputBlob; @@ -307,6 +332,8 @@ namespace op error(e.what(), __LINE__, __FUNCTION__, __FILE__); return nullptr; } + + std::cout << "Got something..." 
<< std::endl; } } diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 722524eee..a9a0abb35 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -83,9 +83,13 @@ namespace op if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + + + std::cout << "Forward Pass Pose: tensorrt forward pass" << std::endl; // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); + std::cout << "Forward Pass Pose: tensorrt passed !" << std::endl; // Replace spNet->forward pass, but how to propagate to next // Replace spTensorRTNetOututBlob.get() ? From c666163991f53c6d6cd88556b85eb30580821447 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 25 Sep 2017 14:51:17 +0200 Subject: [PATCH 14/52] First try on tensorRT inference with caffe Blobs. --- include/openpose/core/netTensorRT.hpp | 3 +- src/openpose/core/netTensorRT.cpp | 80 +++++++++++++++------------ 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 05f7bc860..6e92ddb07 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -39,7 +39,8 @@ namespace op const std::string mCaffeTrainedModel; const std::string mLastBlobName; // Init with thread - std::unique_ptr> upTensorRTNet; + + boost::shared_ptr> spInputBlob; boost::shared_ptr> spOutputBlob; // TensorRT stuff diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 7759bfbbd..648265d7c 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -110,32 +110,6 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c return engine; } -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - const int batchSize = 1; - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - static ICudaEngine* createEngine(const std::string& caffeProto, const std::string& caffeTrainedModel) { @@ -193,11 +167,11 @@ namespace op try { - std::cout << "Forward Pass : setting device" << std::endl; + std::cout << "InitializationOnThread : setting device" << std::endl; // Initialize net cudaSetDevice(mGpuId); - std::cout << "Forward Pass : creating engine" << std::endl; + std::cout << "InitializationOnThread : creating engine" << std::endl; cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) @@ -206,7 +180,12 @@ namespace op return; } - std::cout << "Forward Pass : done" << std::endl; + std::cout << "InitializationOnThread : done" << std::endl; + + + + spInputBlob = 
std::make_shared>({1, 3, 128, 96}); + spOutputBlob = std::make_shared>({1, 57, 46, 82}); // For tensor RT is done in caffeToGIE /* @@ -215,9 +194,9 @@ namespace op upTensorRTNet->CopyTrainedLayersFrom(mTensorRTTrainedModel); upTensorRTNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); upTensorRTNet->Reshape(); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ // Set spOutputBlob - spOutputBlob = upTensorRTNet->blob_by_name(mLastBlobName); + /* if (spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ @@ -280,12 +259,40 @@ namespace op std::cout << "Forward Pass : creating CUDA memory" << std::endl; - std::vector buffers(gInputs.size() + 1); - for (size_t i = 0; i < gInputs.size(); i++) - createMemory(*cudaEngine, buffers, gInputs[i]); + /* + const int batchSize = 1; + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + std::cout"name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; + */ + + std::vector buffers(2); + buffers[0] = spInputBlob->mutable_gpu_data(); + buffers[1] = spOutputBlob->mutable_gpu_data(); + + //createMemory(*cudaEngine, buffers, gInputs[i]); - createMemory(*cudaEngine, buffers, std::string("net_output")); + + //createMemory(*cudaEngine, buffers, std::string("net_output")); std::cout << "Forward Pass : memory created" << std::endl; @@ -303,8 +310,11 @@ namespace op context->execute(batchSize, &buffers[0]); + std::cout << "Forward Pass : inference done !" << std::endl; + + cudaStreamDestroy(stream); cudaEventDestroy(start); cudaEventDestroy(end); From 1c77534a0def82067d4946d9741473f95685640d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 25 Sep 2017 17:37:08 +0000 Subject: [PATCH 15/52] Running, but not pose recognition. Find a way to copy memory correctly. --- src/openpose/core/netCaffe.cpp | 1 + src/openpose/core/netTensorRT.cpp | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/openpose/core/netCaffe.cpp b/src/openpose/core/netCaffe.cpp index 12562ff85..ac03d80c0 100644 --- a/src/openpose/core/netCaffe.cpp +++ b/src/openpose/core/netCaffe.cpp @@ -34,6 +34,7 @@ namespace op cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Set spOutputBlob spOutputBlob = upCaffeNet->blob_by_name(mLastBlobName); + std::cout << spOutputBlob->num() << " " << spOutputBlob->channels() << " " << spOutputBlob->height() << " " << spOutputBlob->width(); if (spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? 
(Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__); diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 648265d7c..28eb09f09 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "NvInfer.h" #include "NvCaffeParser.h" @@ -184,8 +185,8 @@ namespace op - spInputBlob = std::make_shared>({1, 3, 128, 96}); - spOutputBlob = std::make_shared>({1, 57, 46, 82}); + spInputBlob = boost::make_shared>(1, 3, 128, 96); + spOutputBlob = boost::make_shared>(1, 57, 46, 82); // For tensor RT is done in caffeToGIE /* @@ -211,7 +212,7 @@ namespace op { try { - return upTensorRTNet->blobs().at(0)->mutable_cpu_data(); + return spInputBlob->mutable_cpu_data(); } catch (const std::exception& e) { @@ -224,7 +225,7 @@ namespace op { try { - return upTensorRTNet->blobs().at(0)->mutable_gpu_data(); + return spInputBlob->mutable_gpu_data(); } catch (const std::exception& e) { @@ -307,8 +308,9 @@ namespace op std::cout << "Forward Pass : executing inference" << std::endl; int batchSize = 1; + spInputBlob->Update(); context->execute(batchSize, &buffers[0]); - + spOutputBlob->Update(); std::cout << "Forward Pass : inference done !" << std::endl; From 1380b140ddcd8edcb0f8a09010d4ce8c68d3ba6a Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 25 Sep 2017 17:38:24 +0000 Subject: [PATCH 16/52] pose.sh script --- pose.sh | 1 + 1 file changed, 1 insertion(+) create mode 100755 pose.sh diff --git a/pose.sh b/pose.sh new file mode 100755 index 000000000..14f3a4deb --- /dev/null +++ b/pose.sh @@ -0,0 +1 @@ +./build/examples/openpose/openpose.bin -camera_resolution 640x480 -net_resolution 128x96 From 32f53873f8002323e9208e05a5bb05cd8d39bdfa Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 09:54:12 +0000 Subject: [PATCH 17/52] Timing in original pose demo --- .../tutorial_pose/1_extract_from_image.cpp | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/examples/tutorial_pose/1_extract_from_image.cpp b/examples/tutorial_pose/1_extract_from_image.cpp index 48cbcbb96..417f73de0 100644 --- a/examples/tutorial_pose/1_extract_from_image.cpp +++ b/examples/tutorial_pose/1_extract_from_image.cpp @@ -52,10 +52,29 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering.");
 
+typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;
+
+static OpTimings timings;
+
+static void timeNow(const std::string& label){
+    const auto now = std::chrono::high_resolution_clock::now();
+    const auto timing = std::make_pair(label, now);
+    timings.push_back(timing);
+}
+
+static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1,
+                                    const std::chrono::high_resolution_clock::time_point& t2 ) {
+    return std::to_string((double)std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms";
+}
+
+
 int openPoseTutorialPose1()
 {
     op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High);
     // ------------------------- INITIALIZATION -------------------------
+
+    timeNow("Start");
+
     // Step 1 - Set logging level
     //     - 0 will output all the logging messages
     //     - 255 will output nothing
@@ -92,11 +111,14 @@ int openPoseTutorialPose1()
     poseExtractorCaffe.initializationOnThread();
     poseRenderer.initializationOnThread();
+
+    timeNow("Initialization");
+
     // ------------------------- POSE ESTIMATION AND RENDERING -------------------------
     // Step 1 - Read and load image, error if empty (possibly wrong path)
     cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
     if(inputImage.empty())
         op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
+    timeNow("Step 1");
     // Step 2 - Format input image to OpenPose input and output formats
     op::Array<float> netInputArray;
     std::vector<float> scaleRatios;
@@ -104,20 +126,35 @@
     double scaleInputToOutput;
     op::Array<float> outputArray;
     std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
+    timeNow("Step 2");
     // Step 3 - Estimate poseKeypoints
     poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
     const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints();
+    timeNow("Step 3");
     // Step 4 - Render poseKeypoints
     poseRenderer.renderPose(outputArray, poseKeypoints);
+    timeNow("Step 4");
     // Step 5 - OpenPose output format to cv::Mat
     auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
-
+    timeNow("Step 5");
+
     // ------------------------- SHOWING RESULT AND CLOSING -------------------------
     // Step 1 - Show results
     frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0)
     // Step 2 - Logging information message
     op::log("Example 1 successfully finished.", op::Priority::High);
     // Return successful message
+
+    const auto totalTime = timeDiffToString(timings.back().second, timings.front().second);
+    const auto message = "Pose estimation successfully finished. Total time: " + totalTime + ".";
+    op::log(message, op::Priority::High);
+
+    for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) {
+        const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second);
+        op::log(log_time, op::Priority::High);
+    }
+
     return 0;
 }

From d2310db589d2c9b85f52697f9936fb8100c90ff6 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Tue, 26 Sep 2017 11:56:41 +0200
Subject: [PATCH 18/52] Did not take into account forwardPass input data!
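
Until now forwardPass() bound spInputBlob's GPU pointer without ever copying the
caller's inputData into it, so the engine ran on whatever happened to be in that
device memory. Roughly, the intended flow is the sketch below (a minimal
illustration of the copy-then-bind pattern, not the literal patch):

    // Copy the host-side input into the blob that backs binding 0, then run.
    auto* gpuImagePtr = spInputBlob->mutable_gpu_data();   // device pointer owned by the blob
    cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice);
    std::vector<void*> buffers(2);
    buffers[0] = spInputBlob->mutable_gpu_data();          // input binding
    buffers[1] = spOutputBlob->mutable_gpu_data();         // output binding
    context->execute(/*batchSize=*/1, &buffers[0]);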
--- src/openpose/core/netTensorRT.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 28eb09f09..ca593522c 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -244,9 +244,11 @@ namespace op if (inputData != nullptr) { + + // OLD - //auto* gpuImagePtr = upTensorRTNet->blobs().at(0)->mutable_gpu_data(); - //cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); + auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); + cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); // Tensor RT version From 576c055fffb2de7e76a244e34c1392111d713325 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 14:10:12 +0000 Subject: [PATCH 19/52] Data copied to cuda memory. Correct sizes hardcoded, no CUDA error anymore, still not working. --- src/openpose/core/netCaffe.cpp | 4 +++- src/openpose/core/netTensorRT.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/openpose/core/netCaffe.cpp b/src/openpose/core/netCaffe.cpp index ac03d80c0..2108d0178 100644 --- a/src/openpose/core/netCaffe.cpp +++ b/src/openpose/core/netCaffe.cpp @@ -32,9 +32,11 @@ namespace op upCaffeNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); upCaffeNet->Reshape(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); + boost::shared_ptr> spInputBlob = upCaffeNet->blobs().at(0); + std::cout << "Input Blob size : " << spInputBlob->num() << " " << spInputBlob->channels() << " " << spInputBlob->height() << " " << spInputBlob->width() << std::endl; // Set spOutputBlob spOutputBlob = upCaffeNet->blob_by_name(mLastBlobName); - std::cout << spOutputBlob->num() << " " << spOutputBlob->channels() << " " << spOutputBlob->height() << " " << spOutputBlob->width(); + std::cout << "Output Blob size : " << spOutputBlob->num() << " " << spOutputBlob->channels() << " " << spOutputBlob->height() << " " << spOutputBlob->width() << std::endl; if (spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__); diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index ca593522c..6bca6acd4 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -60,7 +60,7 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c const IBlobNameToTensor* blobNameToTensor = parser->parse(caffeProto.c_str(), caffeTrainedModel.c_str(), *network, - DataType::kHALF); + DataType::kFLOAT); if (!blobNameToTensor) return nullptr; @@ -97,8 +97,8 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c builder->setMaxBatchSize(1); // 16 megabytes, default in giexec. 
No idea what's best for Jetson though, // maybe check dusty_nv's code on github - builder->setMaxWorkspaceSize(16<<20); - builder->setHalf2Mode(true); + builder->setMaxWorkspaceSize(32<<20); + builder->setHalf2Mode(false); ICudaEngine* engine = builder->buildCudaEngine(*network); if (engine == nullptr) @@ -185,7 +185,7 @@ namespace op - spInputBlob = boost::make_shared>(1, 3, 128, 96); + spInputBlob = boost::make_shared>(1, 3, 368, 656); spOutputBlob = boost::make_shared>(1, 57, 46, 82); // For tensor RT is done in caffeToGIE @@ -248,7 +248,7 @@ namespace op // OLD auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); - cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); + CUDA_TENSORRT_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); // Tensor RT version From e5d27fec2553be3c2081661e68e1d2a471c379b0 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 15:09:21 +0000 Subject: [PATCH 20/52] Tutorial pose 3 working !!!! Gaining x2 inference time, now time for cleaning. --- models/pose/coco/pose_deploy_linevec.prototxt | 4 +- src/openpose/core/netTensorRT.cpp | 57 +++++++++++++++++-- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/models/pose/coco/pose_deploy_linevec.prototxt b/models/pose/coco/pose_deploy_linevec.prototxt index 6e4322812..c310c8785 100755 --- a/models/pose/coco/pose_deploy_linevec.prototxt +++ b/models/pose/coco/pose_deploy_linevec.prototxt @@ -1,8 +1,8 @@ input: "image" input_dim: 1 input_dim: 3 -input_dim: 96 # This value will be defined at runtime -input_dim: 128 # This value will be defined at runtime +input_dim: 368 # This value will be defined at runtime +input_dim: 656 # This value will be defined at runtime layer { name: "conv1_1" type: "Convolution" diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 6bca6acd4..2ffc550f7 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -48,6 +48,31 @@ class Logger : public ILogger } } gLogger; +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*1, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& caffeTrainedModel) { @@ -294,6 +319,26 @@ namespace op //createMemory(*cudaEngine, buffers, gInputs[i]); + const int batchSize = 1; + size_t eltCount = 1*57*46*82*batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << 
std::endl; + exit(1); + } + CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + + buffers[1] = deviceMem; + //spOutputBlob->set_gpu_data((float*)deviceMem); + //createMemory(*cudaEngine, buffers, std::string("net_output")); @@ -309,12 +354,14 @@ namespace op std::cout << "Forward Pass : executing inference" << std::endl; - int batchSize = 1; - spInputBlob->Update(); + //spInputBlob->Update(); context->execute(batchSize, &buffers[0]); - spOutputBlob->Update(); - - + //spOutputBlob->Update(); + spOutputBlob->set_gpu_data((float*)deviceMem); + //CUDA_TENSORRT_CHECK(cudaMemcpy(localMem, buffers[1], memSize, cudaMemcpyDeviceToHost)); + //spOutputBlob->set_cpu_data((float*)localMem); + + delete[] localMem; std::cout << "Forward Pass : inference done !" << std::endl; From 7d370957d82c8c402fabe6e1992b5f86dcc203b0 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 18:13:17 +0000 Subject: [PATCH 21/52] TensorRT Net input and output dimensions at runtime. --- include/openpose/core/netTensorRT.hpp | 1 + src/openpose/core/netTensorRT.cpp | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 6e92ddb07..36436ca93 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -34,6 +34,7 @@ namespace op // Init with constructor const int mGpuId; const std::array mNetInputSize4D; + std::array mNetOutputSize4D; const unsigned long mNetInputMemory; const std::string mCaffeProto; const std::string mCaffeTrainedModel; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 2ffc550f7..4991dba3a 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -97,6 +97,9 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c gInputs.push_back(network->getInput(i)->getName()); gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; + if( i > 0) + std::err << "Multiple output unsupported for now!" << std:endl; } // specify which tensors are outputs @@ -209,9 +212,10 @@ namespace op std::cout << "InitializationOnThread : done" << std::endl; - - spInputBlob = boost::make_shared>(1, 3, 368, 656); - spOutputBlob = boost::make_shared>(1, 57, 46, 82); + std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; + + spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); + spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); // For tensor RT is done in caffeToGIE /* From f3a898c553074f8bf8dc9ee214def049377761d3 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:15:45 +0200 Subject: [PATCH 22/52] NetTensorRT cleaning. 
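
Engine construction, previously free functions, moves into NetTensorRT, and the
IExecutionContext is now created once in initializationOnThread() instead of on
every forward pass. Schematically, the consolidated build path is as follows (a
sketch of the TensorRT Caffe-parser flow already used above, error handling
omitted):

    // Parse the deploy prototxt and weights into a TensorRT network,
    // mark the output tensor, then build the engine and a reusable context.
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    const IBlobNameToTensor* blobNameToTensor =
        parser->parse(mCaffeProto.c_str(), mCaffeTrainedModel.c_str(), *network, DataType::kFLOAT);
    network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str()));
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(32 << 20);   // 32 MB scratch space, as in the patch
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    IExecutionContext* context = engine->createExecutionContext(); // reused by forwardPass()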
--- include/openpose/core/netTensorRT.hpp | 3 + src/openpose/core/netTensorRT.cpp | 346 ++++++++++---------------- 2 files changed, 132 insertions(+), 217 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 36436ca93..41df6141a 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -46,6 +46,9 @@ namespace op // TensorRT stuff nvinfer1::ICudaEngine* cudaEngine; + nvinfer1::IExecutionContext* cudaContext; + ICudaEngine* caffeToGIEModel(); + ICudaEngine* createEngine(); DELETE_COPY(NetTensorRT); }; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 4991dba3a..8c6a4fc48 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -24,20 +24,11 @@ using namespace nvinfer1; using namespace nvcaffeparser1; -#define CUDA_TENSORRT_CHECK(status) \ -{ \ -if (status != 0) \ -{ \ -std::cout << "Cuda failure: " << status; \ -abort(); \ -} \ -} std::vector gInputs; std::map gInputDimensions; - // Logger for GIE info/warning/errors class Logger : public ILogger { @@ -48,126 +39,6 @@ class Logger : public ILogger } } gLogger; -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*1, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - - -ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& caffeTrainedModel) -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(caffeProto.c_str(), - caffeTrainedModel.c_str(), - *network, - DataType::kFLOAT); - - if (!blobNameToTensor) - return nullptr; - - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; - if( i > 0) - std::err << "Multiple output unsupported for now!" 
<< std:endl; - } - - // specify which tensors are outputs - - - // TODO, if it works switch to something more generic, add as parameter etc - std::string s("net_output"); - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - std::cout << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // Build the engine - builder->setMaxBatchSize(1); - // 16 megabytes, default in giexec. No idea what's best for Jetson though, - // maybe check dusty_nv's code on github - builder->setMaxWorkspaceSize(32<<20); - builder->setHalf2Mode(false); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - return engine; -} - - -static ICudaEngine* createEngine(const std::string& caffeProto, const std::string& caffeTrainedModel) -{ - ICudaEngine *engine; - - engine = caffeToGIEModel(caffeProto, caffeTrainedModel); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - - /* TODO seems unneeded, remove if so. - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - }*/ - return engine; -} - namespace op { @@ -180,14 +51,119 @@ namespace op mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&end)); } NetTensorRT::~NetTensorRT() { + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); + if (cudaEngine) cudaEngine->destroy(); } + + NetTensorRT::ICudaEngine* caffeToGIEModel() + { + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), + mCaffeTrainedModel.c_str(), + *network, + DataType::kFLOAT); + + if (!blobNameToTensor) + return nullptr; + + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; + if( i > 0) + std::err << "Multiple output unsupported for now!" 
<< std:endl; + } + + // specify which tensors are outputs + + + // TODO, if it works switch to something more generic, add as parameter etc + std::string s("net_output"); + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + // 16 megabytes, default in giexec. No idea what's best for Jetson though, + // maybe check dusty_nv's code on github + builder->setMaxWorkspaceSize(32<<20); + builder->setHalf2Mode(false); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + + return engine; + } + + + NetTensorRT::ICudaEngine* createEngine() + { + ICudaEngine *engine; + + engine = caffeToGIEModel(caffeProto, caffeTrainedModel); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + /* TODO Serialize and load engines for given net size as optim quite long + if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + }*/ + return engine; + } + void NetTensorRT::initializationOnThread() { @@ -205,7 +181,16 @@ namespace op cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) { - std::cerr << "Engine could not be created" << std::endl; + std::cerr << "cudaEngine could not be created" << std::endl; + return; + } + + std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; + + cudaContext = cudaEngine->createExecutionContext(); + if (!cudaContext) + { + std::cerr << "cudaContext could not be created" << std::endl; return; } @@ -217,19 +202,7 @@ namespace op spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); - // For tensor RT is done in caffeToGIE - /* - //caffe::TensorRT::SetDevice(mGpuId); - upTensorRTNet.reset(new caffe::Net{mTensorRTProto, caffe::TEST}); - upTensorRTNet->CopyTrainedLayersFrom(mTensorRTTrainedModel); - upTensorRTNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); - upTensorRTNet->Reshape(); - cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ - // Set spOutputBlob - /* - if (spOutputBlob == nullptr) - error("The output blob is a nullptr. Did you use the same name than the prototxt? 
(Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); - cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) { @@ -272,51 +245,14 @@ namespace op // Copy frame data to GPU memory if (inputData != nullptr) { - - - - // OLD auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); - CUDA_TENSORRT_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); - // Tensor RT version - - // TODO maybe move this to init and keep only the execute part - - std::cout << "Forward Pass : creating execution context" << std::endl; - - IExecutionContext *context = cudaEngine->createExecutionContext(); // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. std::cout << "Forward Pass : creating CUDA memory" << std::endl; - - /* - const int batchSize = 1; - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - std::cout"name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; - */ - std::vector buffers(2); buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); @@ -331,53 +267,29 @@ namespace op localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; void* deviceMem; - CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); + CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; exit(1); } - CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); buffers[1] = deviceMem; - //spOutputBlob->set_gpu_data((float*)deviceMem); - - - //createMemory(*cudaEngine, buffers, std::string("net_output")); - - - std::cout << "Forward Pass : memory created" << std::endl; - - cudaStream_t stream; - CUDA_TENSORRT_CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; - CUDA_TENSORRT_CHECK(cudaEventCreate(&start)); - CUDA_TENSORRT_CHECK(cudaEventCreate(&end)); - - - std::cout << "Forward Pass : executing inference" << std::endl; - - //spInputBlob->Update(); - context->execute(batchSize, &buffers[0]); - //spOutputBlob->Update(); - spOutputBlob->set_gpu_data((float*)deviceMem); - //CUDA_TENSORRT_CHECK(cudaMemcpy(localMem, buffers[1], memSize, cudaMemcpyDeviceToHost)); - //spOutputBlob->set_cpu_data((float*)localMem); - delete[] localMem; - std::cout << "Forward Pass : inference done !" 
<< std::endl; - - - - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); + std::cout << "Forward Pass : memory created" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } - // Old Perform deep network forward pass - //upTensorRTNet->ForwardFrom(0); - //cudaCheck(__LINE__, __FUNCTION__, __FILE__); + std::cout << "Forward Pass : executing inference" << std::endl; + + context->execute(batchSize, &buffers[0]); + + spOutputBlob->set_gpu_data((float*)deviceMem); + + std::cout << "Forward Pass : inference done !" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) { From 5c630b52ec7d3eab879299db03a90bd00cbed7a6 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:19:37 +0200 Subject: [PATCH 23/52] NetTensorRT cleaning bis. --- src/openpose/core/netTensorRT.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 8c6a4fc48..3c72003a2 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -92,17 +92,12 @@ namespace op gInputs.push_back(network->getInput(i)->getName()); gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; if( i > 0) std::err << "Multiple output unsupported for now!" << std:endl; } - // specify which tensors are outputs - - - // TODO, if it works switch to something more generic, add as parameter etc - std::string s("net_output"); - if (blobNameToTensor->find(s.c_str()) == nullptr) + // Specify which tensor is output (multiple unsupported) + if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) { std::cout << "could not find output blob " << s << std::endl; return nullptr; @@ -114,6 +109,7 @@ namespace op { DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; } // Build the engine From a61758319d33f30af1b4be6d6961f8538162cbb1 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 20:00:08 +0000 Subject: [PATCH 24/52] Cleaning compilation fix. 
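
Most of the breakage was out-of-class member-definition syntax: the class
qualifier belongs on the function name, not on the return type, and the CUDA
stream/events have to be members to be reachable from the destructor. A minimal
illustration (hypothetical `Net` standing in for NetTensorRT):

    #include "NvInfer.h"   // for nvinfer1::ICudaEngine
    struct Net
    {
        nvinfer1::ICudaEngine* caffeToGIEModel();   // declared inside the class
    };
    // Wrong (as in the previous patch): "NetTensorRT::ICudaEngine* caffeToGIEModel()"
    // scopes the return type and leaves the function a free function.
    // Right: scope the function name instead.
    nvinfer1::ICudaEngine* Net::caffeToGIEModel() { return nullptr; /* stub body */ }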
--- include/openpose/core/netTensorRT.hpp | 6 +++-- src/openpose/core/netTensorRT.cpp | 36 +++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 41df6141a..00e176ab0 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -47,8 +47,10 @@ namespace op // TensorRT stuff nvinfer1::ICudaEngine* cudaEngine; nvinfer1::IExecutionContext* cudaContext; - ICudaEngine* caffeToGIEModel(); - ICudaEngine* createEngine(); + nvinfer1::ICudaEngine* caffeToGIEModel(); + nvinfer1::ICudaEngine* createEngine(); + cudaStream_t stream; + cudaEvent_t start, end; DELETE_COPY(NetTensorRT); }; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 3c72003a2..79892b1ff 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -51,9 +51,8 @@ namespace op mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} { - cudaStream_t stream; + std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; CUDA_CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; CUDA_CHECK(cudaEventCreate(&start)); CUDA_CHECK(cudaEventCreate(&end)); } @@ -69,7 +68,7 @@ namespace op } - NetTensorRT::ICudaEngine* caffeToGIEModel() + ICudaEngine* NetTensorRT::caffeToGIEModel() { // create the builder IBuilder* builder = createInferBuilder(gLogger); @@ -93,16 +92,16 @@ namespace op gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; if( i > 0) - std::err << "Multiple output unsupported for now!" << std:endl; + std::cerr << "Multiple output unsupported for now!"; } // Specify which tensor is output (multiple unsupported) if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) { - std::cout << "could not find output blob " << s << std::endl; + std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; return nullptr; } - network->markOutput(*blobNameToTensor->find(s.c_str())); + network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); for (int i = 0, n = network->getNbOutputs(); i < n; i++) @@ -132,11 +131,11 @@ namespace op } - NetTensorRT::ICudaEngine* createEngine() + ICudaEngine* NetTensorRT::createEngine() { ICudaEngine *engine; - engine = caffeToGIEModel(caffeProto, caffeTrainedModel); + engine = caffeToGIEModel(); if (!engine) { std::cerr << "Engine could not be created" << std::endl; @@ -174,7 +173,7 @@ namespace op std::cout << "InitializationOnThread : creating engine" << std::endl; - cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); + cudaEngine = createEngine(); if (!cudaEngine) { std::cerr << "cudaEngine could not be created" << std::endl; @@ -238,6 +237,7 @@ namespace op std::cout << "Forward Pass : start" << std::endl; try { + const int batchSize = 1; // Copy frame data to GPU memory if (inputData != nullptr) { @@ -253,10 +253,7 @@ namespace op buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); - //createMemory(*cudaEngine, buffers, gInputs[i]); - - const int batchSize = 1; - size_t eltCount = 1*57*46*82*batchSize, memSize = eltCount * sizeof(float); + size_t eltCount = mNetOutputSize4D[0]*mNetOutputSize4D[1]*mNetOutputSize4D[2]*mNetOutputSize4D[3]*batchSize, memSize = eltCount * sizeof(float); float* localMem = new float[eltCount]; for 
(size_t i = 0; i < eltCount; i++) @@ -277,15 +274,16 @@ namespace op std::cout << "Forward Pass : memory created" << std::endl; cudaCheck(__LINE__, __FUNCTION__, __FILE__); - } - std::cout << "Forward Pass : executing inference" << std::endl; - context->execute(batchSize, &buffers[0]); + std::cout << "Forward Pass : executing inference" << std::endl; - spOutputBlob->set_gpu_data((float*)deviceMem); + cudaContext->execute(batchSize, &buffers[0]); - std::cout << "Forward Pass : inference done !" << std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + spOutputBlob->set_gpu_data((float*)deviceMem); + + std::cout << "Forward Pass : inference done !" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + } } catch (const std::exception& e) { From d3a31e05c0b73d2ca785dfb6c88cb9fc335a7c7d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 20:46:20 +0000 Subject: [PATCH 25/52] caffeToGIE needs fixed input size and cannot be determined at runtime for now. --- models/pose/coco/pose_deploy_linevec.prototxt | 4 +- .../coco/pose_deploy_linevec.prototxt_368x656 | 2976 +++++++++++++++++ src/openpose/core/netTensorRT.cpp | 2 +- 3 files changed, 2979 insertions(+), 3 deletions(-) create mode 100755 models/pose/coco/pose_deploy_linevec.prototxt_368x656 diff --git a/models/pose/coco/pose_deploy_linevec.prototxt b/models/pose/coco/pose_deploy_linevec.prototxt index c310c8785..fbe0c8245 100755 --- a/models/pose/coco/pose_deploy_linevec.prototxt +++ b/models/pose/coco/pose_deploy_linevec.prototxt @@ -1,8 +1,8 @@ input: "image" input_dim: 1 input_dim: 3 -input_dim: 368 # This value will be defined at runtime -input_dim: 656 # This value will be defined at runtime +input_dim: 1 # This value will be defined at runtime +input_dim: 1 # This value will be defined at runtime layer { name: "conv1_1" type: "Convolution" diff --git a/models/pose/coco/pose_deploy_linevec.prototxt_368x656 b/models/pose/coco/pose_deploy_linevec.prototxt_368x656 new file mode 100755 index 000000000..c310c8785 --- /dev/null +++ b/models/pose/coco/pose_deploy_linevec.prototxt_368x656 @@ -0,0 +1,2976 @@ +input: "image" +input_dim: 1 +input_dim: 3 +input_dim: 368 # This value will be defined at runtime +input_dim: 656 # This value will be defined at runtime +layer { + name: "conv1_1" + type: "Convolution" + bottom: "image" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1_stage1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1_stage1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: 
"gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2_stage1" + type: "Pooling" + bottom: "conv2_2" + top: "pool2_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2_stage1" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "conv3_4" + type: "Convolution" + bottom: "conv3_3" + top: "conv3_4" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_4" + type: "ReLU" + bottom: "conv3_4" + top: "conv3_4" +} +layer { + name: "pool3_stage1" + type: "Pooling" + bottom: "conv3_4" + top: "pool3_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3_stage1" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3_CPM" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3_CPM" + param { + 
lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_3_CPM" + type: "ReLU" + bottom: "conv4_3_CPM" + top: "conv4_3_CPM" +} +layer { + name: "conv4_4_CPM" + type: "Convolution" + bottom: "conv4_3_CPM" + top: "conv4_4_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_4_CPM" + type: "ReLU" + bottom: "conv4_4_CPM" + top: "conv4_4_CPM" +} +layer { + name: "conv5_1_CPM_L1" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L1" + type: "ReLU" + bottom: "conv5_1_CPM_L1" + top: "conv5_1_CPM_L1" +} +layer { + name: "conv5_1_CPM_L2" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L2" + type: "ReLU" + bottom: "conv5_1_CPM_L2" + top: "conv5_1_CPM_L2" +} +layer { + name: "conv5_2_CPM_L1" + type: "Convolution" + bottom: "conv5_1_CPM_L1" + top: "conv5_2_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L1" + type: "ReLU" + bottom: "conv5_2_CPM_L1" + top: "conv5_2_CPM_L1" +} +layer { + name: "conv5_2_CPM_L2" + type: "Convolution" + bottom: "conv5_1_CPM_L2" + top: "conv5_2_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L2" + type: "ReLU" + bottom: "conv5_2_CPM_L2" + top: "conv5_2_CPM_L2" +} +layer { + name: "conv5_3_CPM_L1" + type: "Convolution" + bottom: "conv5_2_CPM_L1" + top: "conv5_3_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L1" + type: "ReLU" + bottom: "conv5_3_CPM_L1" + top: "conv5_3_CPM_L1" +} +layer { + name: "conv5_3_CPM_L2" + type: "Convolution" + bottom: "conv5_2_CPM_L2" + top: "conv5_3_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L2" + type: "ReLU" + bottom: "conv5_3_CPM_L2" + top: 
"conv5_3_CPM_L2" +} +layer { + name: "conv5_4_CPM_L1" + type: "Convolution" + bottom: "conv5_3_CPM_L1" + top: "conv5_4_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L1" + type: "ReLU" + bottom: "conv5_4_CPM_L1" + top: "conv5_4_CPM_L1" +} +layer { + name: "conv5_4_CPM_L2" + type: "Convolution" + bottom: "conv5_3_CPM_L2" + top: "conv5_4_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L2" + type: "ReLU" + bottom: "conv5_4_CPM_L2" + top: "conv5_4_CPM_L2" +} +layer { + name: "conv5_5_CPM_L1" + type: "Convolution" + bottom: "conv5_4_CPM_L1" + top: "conv5_5_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv5_5_CPM_L2" + type: "Convolution" + bottom: "conv5_4_CPM_L2" + top: "conv5_5_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage2" + type: "Concat" + bottom: "conv5_5_CPM_L1" + bottom: "conv5_5_CPM_L2" + bottom: "conv4_4_CPM" + top: "concat_stage2" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage2_L1" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L1" + type: "ReLU" + bottom: "Mconv1_stage2_L1" + top: "Mconv1_stage2_L1" +} +layer { + name: "Mconv1_stage2_L2" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L2" + type: "ReLU" + bottom: "Mconv1_stage2_L2" + top: "Mconv1_stage2_L2" +} +layer { + name: "Mconv2_stage2_L1" + type: "Convolution" + bottom: "Mconv1_stage2_L1" + top: "Mconv2_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L1" + type: "ReLU" + bottom: "Mconv2_stage2_L1" + top: "Mconv2_stage2_L1" +} +layer { + name: "Mconv2_stage2_L2" + type: "Convolution" + bottom: "Mconv1_stage2_L2" + top: "Mconv2_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + 
weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L2" + type: "ReLU" + bottom: "Mconv2_stage2_L2" + top: "Mconv2_stage2_L2" +} +layer { + name: "Mconv3_stage2_L1" + type: "Convolution" + bottom: "Mconv2_stage2_L1" + top: "Mconv3_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L1" + type: "ReLU" + bottom: "Mconv3_stage2_L1" + top: "Mconv3_stage2_L1" +} +layer { + name: "Mconv3_stage2_L2" + type: "Convolution" + bottom: "Mconv2_stage2_L2" + top: "Mconv3_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L2" + type: "ReLU" + bottom: "Mconv3_stage2_L2" + top: "Mconv3_stage2_L2" +} +layer { + name: "Mconv4_stage2_L1" + type: "Convolution" + bottom: "Mconv3_stage2_L1" + top: "Mconv4_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L1" + type: "ReLU" + bottom: "Mconv4_stage2_L1" + top: "Mconv4_stage2_L1" +} +layer { + name: "Mconv4_stage2_L2" + type: "Convolution" + bottom: "Mconv3_stage2_L2" + top: "Mconv4_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L2" + type: "ReLU" + bottom: "Mconv4_stage2_L2" + top: "Mconv4_stage2_L2" +} +layer { + name: "Mconv5_stage2_L1" + type: "Convolution" + bottom: "Mconv4_stage2_L1" + top: "Mconv5_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L1" + type: "ReLU" + bottom: "Mconv5_stage2_L1" + top: "Mconv5_stage2_L1" +} +layer { + name: "Mconv5_stage2_L2" + type: "Convolution" + bottom: "Mconv4_stage2_L2" + top: "Mconv5_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L2" + type: "ReLU" + bottom: "Mconv5_stage2_L2" + top: "Mconv5_stage2_L2" +} +layer { + name: "Mconv6_stage2_L1" + type: "Convolution" + bottom: "Mconv5_stage2_L1" + top: "Mconv6_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L1" + type: "ReLU" + bottom: "Mconv6_stage2_L1" + top: "Mconv6_stage2_L1" +} +layer { 
+ name: "Mconv6_stage2_L2" + type: "Convolution" + bottom: "Mconv5_stage2_L2" + top: "Mconv6_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L2" + type: "ReLU" + bottom: "Mconv6_stage2_L2" + top: "Mconv6_stage2_L2" +} +layer { + name: "Mconv7_stage2_L1" + type: "Convolution" + bottom: "Mconv6_stage2_L1" + top: "Mconv7_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage2_L2" + type: "Convolution" + bottom: "Mconv6_stage2_L2" + top: "Mconv7_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage3" + type: "Concat" + bottom: "Mconv7_stage2_L1" + bottom: "Mconv7_stage2_L2" + bottom: "conv4_4_CPM" + top: "concat_stage3" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage3_L1" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L1" + type: "ReLU" + bottom: "Mconv1_stage3_L1" + top: "Mconv1_stage3_L1" +} +layer { + name: "Mconv1_stage3_L2" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L2" + type: "ReLU" + bottom: "Mconv1_stage3_L2" + top: "Mconv1_stage3_L2" +} +layer { + name: "Mconv2_stage3_L1" + type: "Convolution" + bottom: "Mconv1_stage3_L1" + top: "Mconv2_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L1" + type: "ReLU" + bottom: "Mconv2_stage3_L1" + top: "Mconv2_stage3_L1" +} +layer { + name: "Mconv2_stage3_L2" + type: "Convolution" + bottom: "Mconv1_stage3_L2" + top: "Mconv2_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L2" + type: "ReLU" + bottom: "Mconv2_stage3_L2" + top: "Mconv2_stage3_L2" +} +layer { + name: "Mconv3_stage3_L1" + type: "Convolution" + bottom: "Mconv2_stage3_L1" + top: "Mconv3_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + 
kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L1" + type: "ReLU" + bottom: "Mconv3_stage3_L1" + top: "Mconv3_stage3_L1" +} +layer { + name: "Mconv3_stage3_L2" + type: "Convolution" + bottom: "Mconv2_stage3_L2" + top: "Mconv3_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L2" + type: "ReLU" + bottom: "Mconv3_stage3_L2" + top: "Mconv3_stage3_L2" +} +layer { + name: "Mconv4_stage3_L1" + type: "Convolution" + bottom: "Mconv3_stage3_L1" + top: "Mconv4_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L1" + type: "ReLU" + bottom: "Mconv4_stage3_L1" + top: "Mconv4_stage3_L1" +} +layer { + name: "Mconv4_stage3_L2" + type: "Convolution" + bottom: "Mconv3_stage3_L2" + top: "Mconv4_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L2" + type: "ReLU" + bottom: "Mconv4_stage3_L2" + top: "Mconv4_stage3_L2" +} +layer { + name: "Mconv5_stage3_L1" + type: "Convolution" + bottom: "Mconv4_stage3_L1" + top: "Mconv5_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L1" + type: "ReLU" + bottom: "Mconv5_stage3_L1" + top: "Mconv5_stage3_L1" +} +layer { + name: "Mconv5_stage3_L2" + type: "Convolution" + bottom: "Mconv4_stage3_L2" + top: "Mconv5_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L2" + type: "ReLU" + bottom: "Mconv5_stage3_L2" + top: "Mconv5_stage3_L2" +} +layer { + name: "Mconv6_stage3_L1" + type: "Convolution" + bottom: "Mconv5_stage3_L1" + top: "Mconv6_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L1" + type: "ReLU" + bottom: "Mconv6_stage3_L1" + top: "Mconv6_stage3_L1" +} +layer { + name: "Mconv6_stage3_L2" + type: "Convolution" + bottom: "Mconv5_stage3_L2" + top: "Mconv6_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L2" + type: "ReLU" + bottom: "Mconv6_stage3_L2" + top: 
"Mconv6_stage3_L2" +} +layer { + name: "Mconv7_stage3_L1" + type: "Convolution" + bottom: "Mconv6_stage3_L1" + top: "Mconv7_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage3_L2" + type: "Convolution" + bottom: "Mconv6_stage3_L2" + top: "Mconv7_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage4" + type: "Concat" + bottom: "Mconv7_stage3_L1" + bottom: "Mconv7_stage3_L2" + bottom: "conv4_4_CPM" + top: "concat_stage4" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage4_L1" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L1" + type: "ReLU" + bottom: "Mconv1_stage4_L1" + top: "Mconv1_stage4_L1" +} +layer { + name: "Mconv1_stage4_L2" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L2" + type: "ReLU" + bottom: "Mconv1_stage4_L2" + top: "Mconv1_stage4_L2" +} +layer { + name: "Mconv2_stage4_L1" + type: "Convolution" + bottom: "Mconv1_stage4_L1" + top: "Mconv2_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L1" + type: "ReLU" + bottom: "Mconv2_stage4_L1" + top: "Mconv2_stage4_L1" +} +layer { + name: "Mconv2_stage4_L2" + type: "Convolution" + bottom: "Mconv1_stage4_L2" + top: "Mconv2_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L2" + type: "ReLU" + bottom: "Mconv2_stage4_L2" + top: "Mconv2_stage4_L2" +} +layer { + name: "Mconv3_stage4_L1" + type: "Convolution" + bottom: "Mconv2_stage4_L1" + top: "Mconv3_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L1" + type: "ReLU" + bottom: "Mconv3_stage4_L1" + top: "Mconv3_stage4_L1" +} +layer { + name: "Mconv3_stage4_L2" + type: "Convolution" + bottom: "Mconv2_stage4_L2" + top: "Mconv3_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + 
num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L2" + type: "ReLU" + bottom: "Mconv3_stage4_L2" + top: "Mconv3_stage4_L2" +} +layer { + name: "Mconv4_stage4_L1" + type: "Convolution" + bottom: "Mconv3_stage4_L1" + top: "Mconv4_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L1" + type: "ReLU" + bottom: "Mconv4_stage4_L1" + top: "Mconv4_stage4_L1" +} +layer { + name: "Mconv4_stage4_L2" + type: "Convolution" + bottom: "Mconv3_stage4_L2" + top: "Mconv4_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L2" + type: "ReLU" + bottom: "Mconv4_stage4_L2" + top: "Mconv4_stage4_L2" +} +layer { + name: "Mconv5_stage4_L1" + type: "Convolution" + bottom: "Mconv4_stage4_L1" + top: "Mconv5_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L1" + type: "ReLU" + bottom: "Mconv5_stage4_L1" + top: "Mconv5_stage4_L1" +} +layer { + name: "Mconv5_stage4_L2" + type: "Convolution" + bottom: "Mconv4_stage4_L2" + top: "Mconv5_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L2" + type: "ReLU" + bottom: "Mconv5_stage4_L2" + top: "Mconv5_stage4_L2" +} +layer { + name: "Mconv6_stage4_L1" + type: "Convolution" + bottom: "Mconv5_stage4_L1" + top: "Mconv6_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L1" + type: "ReLU" + bottom: "Mconv6_stage4_L1" + top: "Mconv6_stage4_L1" +} +layer { + name: "Mconv6_stage4_L2" + type: "Convolution" + bottom: "Mconv5_stage4_L2" + top: "Mconv6_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L2" + type: "ReLU" + bottom: "Mconv6_stage4_L2" + top: "Mconv6_stage4_L2" +} +layer { + name: "Mconv7_stage4_L1" + type: "Convolution" + bottom: "Mconv6_stage4_L1" + top: "Mconv7_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage4_L2" + type: "Convolution" + bottom: 
"Mconv6_stage4_L2" + top: "Mconv7_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage5" + type: "Concat" + bottom: "Mconv7_stage4_L1" + bottom: "Mconv7_stage4_L2" + bottom: "conv4_4_CPM" + top: "concat_stage5" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage5_L1" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L1" + type: "ReLU" + bottom: "Mconv1_stage5_L1" + top: "Mconv1_stage5_L1" +} +layer { + name: "Mconv1_stage5_L2" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L2" + type: "ReLU" + bottom: "Mconv1_stage5_L2" + top: "Mconv1_stage5_L2" +} +layer { + name: "Mconv2_stage5_L1" + type: "Convolution" + bottom: "Mconv1_stage5_L1" + top: "Mconv2_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L1" + type: "ReLU" + bottom: "Mconv2_stage5_L1" + top: "Mconv2_stage5_L1" +} +layer { + name: "Mconv2_stage5_L2" + type: "Convolution" + bottom: "Mconv1_stage5_L2" + top: "Mconv2_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L2" + type: "ReLU" + bottom: "Mconv2_stage5_L2" + top: "Mconv2_stage5_L2" +} +layer { + name: "Mconv3_stage5_L1" + type: "Convolution" + bottom: "Mconv2_stage5_L1" + top: "Mconv3_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L1" + type: "ReLU" + bottom: "Mconv3_stage5_L1" + top: "Mconv3_stage5_L1" +} +layer { + name: "Mconv3_stage5_L2" + type: "Convolution" + bottom: "Mconv2_stage5_L2" + top: "Mconv3_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L2" + type: "ReLU" + bottom: "Mconv3_stage5_L2" + top: "Mconv3_stage5_L2" +} +layer { + name: "Mconv4_stage5_L1" + type: "Convolution" + bottom: "Mconv3_stage5_L1" + top: "Mconv4_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + 
convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L1" + type: "ReLU" + bottom: "Mconv4_stage5_L1" + top: "Mconv4_stage5_L1" +} +layer { + name: "Mconv4_stage5_L2" + type: "Convolution" + bottom: "Mconv3_stage5_L2" + top: "Mconv4_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L2" + type: "ReLU" + bottom: "Mconv4_stage5_L2" + top: "Mconv4_stage5_L2" +} +layer { + name: "Mconv5_stage5_L1" + type: "Convolution" + bottom: "Mconv4_stage5_L1" + top: "Mconv5_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L1" + type: "ReLU" + bottom: "Mconv5_stage5_L1" + top: "Mconv5_stage5_L1" +} +layer { + name: "Mconv5_stage5_L2" + type: "Convolution" + bottom: "Mconv4_stage5_L2" + top: "Mconv5_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L2" + type: "ReLU" + bottom: "Mconv5_stage5_L2" + top: "Mconv5_stage5_L2" +} +layer { + name: "Mconv6_stage5_L1" + type: "Convolution" + bottom: "Mconv5_stage5_L1" + top: "Mconv6_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L1" + type: "ReLU" + bottom: "Mconv6_stage5_L1" + top: "Mconv6_stage5_L1" +} +layer { + name: "Mconv6_stage5_L2" + type: "Convolution" + bottom: "Mconv5_stage5_L2" + top: "Mconv6_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L2" + type: "ReLU" + bottom: "Mconv6_stage5_L2" + top: "Mconv6_stage5_L2" +} +layer { + name: "Mconv7_stage5_L1" + type: "Convolution" + bottom: "Mconv6_stage5_L1" + top: "Mconv7_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage5_L2" + type: "Convolution" + bottom: "Mconv6_stage5_L2" + top: "Mconv7_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage6" + type: "Concat" + bottom: "Mconv7_stage5_L1" + bottom: "Mconv7_stage5_L2" + bottom: "conv4_4_CPM" + top: "concat_stage6" + 
concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage6_L1" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L1" + type: "ReLU" + bottom: "Mconv1_stage6_L1" + top: "Mconv1_stage6_L1" +} +layer { + name: "Mconv1_stage6_L2" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L2" + type: "ReLU" + bottom: "Mconv1_stage6_L2" + top: "Mconv1_stage6_L2" +} +layer { + name: "Mconv2_stage6_L1" + type: "Convolution" + bottom: "Mconv1_stage6_L1" + top: "Mconv2_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L1" + type: "ReLU" + bottom: "Mconv2_stage6_L1" + top: "Mconv2_stage6_L1" +} +layer { + name: "Mconv2_stage6_L2" + type: "Convolution" + bottom: "Mconv1_stage6_L2" + top: "Mconv2_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L2" + type: "ReLU" + bottom: "Mconv2_stage6_L2" + top: "Mconv2_stage6_L2" +} +layer { + name: "Mconv3_stage6_L1" + type: "Convolution" + bottom: "Mconv2_stage6_L1" + top: "Mconv3_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L1" + type: "ReLU" + bottom: "Mconv3_stage6_L1" + top: "Mconv3_stage6_L1" +} +layer { + name: "Mconv3_stage6_L2" + type: "Convolution" + bottom: "Mconv2_stage6_L2" + top: "Mconv3_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L2" + type: "ReLU" + bottom: "Mconv3_stage6_L2" + top: "Mconv3_stage6_L2" +} +layer { + name: "Mconv4_stage6_L1" + type: "Convolution" + bottom: "Mconv3_stage6_L1" + top: "Mconv4_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L1" + type: "ReLU" + bottom: "Mconv4_stage6_L1" + top: "Mconv4_stage6_L1" +} +layer { + name: "Mconv4_stage6_L2" + type: "Convolution" + bottom: "Mconv3_stage6_L2" + top: "Mconv4_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + 
decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L2" + type: "ReLU" + bottom: "Mconv4_stage6_L2" + top: "Mconv4_stage6_L2" +} +layer { + name: "Mconv5_stage6_L1" + type: "Convolution" + bottom: "Mconv4_stage6_L1" + top: "Mconv5_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L1" + type: "ReLU" + bottom: "Mconv5_stage6_L1" + top: "Mconv5_stage6_L1" +} +layer { + name: "Mconv5_stage6_L2" + type: "Convolution" + bottom: "Mconv4_stage6_L2" + top: "Mconv5_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L2" + type: "ReLU" + bottom: "Mconv5_stage6_L2" + top: "Mconv5_stage6_L2" +} +layer { + name: "Mconv6_stage6_L1" + type: "Convolution" + bottom: "Mconv5_stage6_L1" + top: "Mconv6_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L1" + type: "ReLU" + bottom: "Mconv6_stage6_L1" + top: "Mconv6_stage6_L1" +} +layer { + name: "Mconv6_stage6_L2" + type: "Convolution" + bottom: "Mconv5_stage6_L2" + top: "Mconv6_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L2" + type: "ReLU" + bottom: "Mconv6_stage6_L2" + top: "Mconv6_stage6_L2" +} +layer { + name: "Mconv7_stage6_L1" + type: "Convolution" + bottom: "Mconv6_stage6_L1" + top: "Mconv7_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage6_L2" + type: "Convolution" + bottom: "Mconv6_stage6_L2" + top: "Mconv7_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage7" + type: "Concat" + bottom: "Mconv7_stage6_L2" + bottom: "Mconv7_stage6_L1" + # top: "concat_stage7" + top: "net_output" + concat_param { + axis: 1 + } +} diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 79892b1ff..4ab8ab62d 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -47,7 +47,7 @@ namespace op // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++ mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]}, 
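+ // Note: suffixing the proto path with the network input size (below) lets several
+ // per-resolution prototxt variants coexist; the next patch reuses the same name to
+ // key the serialized TensorRT engine cache.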
mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies<int>()) * sizeof(float)}, - mCaffeProto{caffeProto}, + mCaffeProto{caffeProto + "_" + std::to_string(mNetInputSize4D[2]) + "x" + std::to_string(mNetInputSize4D[3])}, mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} {
From f6df326db6ad480f20899e8631628f54d9fb1542 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:14:13 +0000 Subject: [PATCH 26/52] Engine serialization and deserialization. --- src/openpose/core/netTensorRT.cpp | 77 ++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 23 deletions(-)
diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 4ab8ab62d..01c659872 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -108,7 +108,6 @@ namespace op { DimsCHW dims = static_cast<DimsCHW&&>(network->getOutput(i)->getDimensions()); std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; } // Build the engine @@ -130,32 +129,62 @@ namespace op return engine; } + inline bool file_exists(const std::string& file_path) { + struct stat buffer; + return (stat(file_path.c_str(), &buffer) == 0); + } ICudaEngine* NetTensorRT::createEngine() { ICudaEngine *engine; - engine = caffeToGIEModel(); - if (!engine) + std::string serializedEnginePath = mCaffeProto + ".bin"; + + std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl;
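+ // Assumption: file_exists() relies on stat(2) and so needs <sys/stat.h> among this
+ // file's includes; the ".bin" plan sits next to the (already resolution-suffixed) proto.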
+ if (file_exists(serializedEnginePath)) { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; + std::cout << "Found serialized TensorRT engine, deserializing..." << std::endl; + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(serializedEnginePath, std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + infer->destroy(); + if (gieModelStream) delete [] gieModelStream; + + return engine; + } + else + { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + else // serialize engine so later runs can skip the slow build + { + std::ofstream p(serializedEnginePath, std::ios::binary); + if (!p) + { + std::cerr << "could not serialize engine" << std::endl; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast<const char*>(ptr->data()), ptr->size()); + ptr->destroy(); + } }
- - /* TODO Serialize and load engines for given net size as optim quite long - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast<const char*>(ptr->data()), ptr->size()); - ptr->destroy(); - }*/ return engine; }
@@ -188,15 +217,17 @@ namespace op std::cerr << "cudaContext could not be created" << std::endl; return; } - - std::cout << "InitializationOnThread : done" << std::endl; - + + DimsCHW outputDims = static_cast<DimsCHW&&>(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); + mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; + std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; spInputBlob = boost::make_shared<caffe::Blob<float>>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); spOutputBlob = boost::make_shared<caffe::Blob<float>>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); + std::cout << "InitializationOnThread : done" << std::endl; cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e)
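For readers following the series: the createEngine() change above amounts to a build-once cache for TensorRT plans, keyed by the per-resolution prototxt name. The sketch below restates that scheme in isolation; it assumes the TensorRT 2/3-era API this series targets (createInferRuntime, IRuntime::deserializeCudaEngine, ICudaEngine::serialize), and buildEngine(), SketchLogger and getOrBuildEngine() are illustrative names, not functions from the patch.

    #include <cstdio>
    #include <fstream>
    #include <iterator>
    #include <string>
    #include <vector>
    #include <NvInfer.h>

    using namespace nvinfer1;

    // Stand-in for the Caffe-to-TensorRT conversion (caffeToGIEModel() above).
    ICudaEngine* buildEngine();

    // Minimal logger; a real implementation would forward to op::log.
    struct SketchLogger : public ILogger
    {
        void log(Severity, const char* msg) override { std::fputs(msg, stderr); }
    };
    static SketchLogger gSketchLogger;

    // Return a deserialized engine when a cached plan exists at cachePath;
    // otherwise build one from the Caffe model and persist the plan.
    ICudaEngine* getOrBuildEngine(const std::string& cachePath)
    {
        std::ifstream cached{cachePath, std::ios::binary};
        if (cached.good())
        {
            // Cache hit: slurp the plan file and deserialize it (fast).
            const std::vector<char> blob{std::istreambuf_iterator<char>(cached),
                                         std::istreambuf_iterator<char>()};
            IRuntime* runtime = createInferRuntime(gSketchLogger);
            ICudaEngine* engine = runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
            runtime->destroy();
            return engine;
        }
        // Cache miss: build the engine (slow), then serialize it for next time.
        ICudaEngine* engine = buildEngine();
        if (engine != nullptr)
        {
            IHostMemory* plan = engine->serialize();
            std::ofstream out{cachePath, std::ios::binary};
            out.write(static_cast<const char*>(plan->data()), plan->size());
            plan->destroy();
        }
        return engine;
    }

The cache pays off because building (parsing the Caffe model, layer fusion, kernel autotuning) can take minutes, while deserializing a plan is near-instant; a serialized plan is specific to the GPU, the TensorRT version and the input resolution, so the cache file must be regenerated whenever any of these changes.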
From 404077a8b59f80e09ca3df2034e72ee69df3a5fb Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:44:37 +0000 Subject: [PATCH 27/52] Targeting highest possible FPS in demo. --- .../3_extract_from_image_TensorRT.cpp | 6 +- include/openpose/wrapper/wrapper.hpp | 2 +- .../coco/pose_deploy_linevec.prototxt_96x128 | 2976 +++++++++++++++++ 3 files changed, 2980 insertions(+), 4 deletions(-) create mode 100755 models/pose/coco/pose_deploy_linevec.prototxt_96x128
diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index f4e7eace1..adba661b0 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -31,9 +31,9 @@ DEFINE_string(image_path, "examples/media/COCO_val2014_00000000019 DEFINE_string(model_pose, "COCO", "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), " "`MPI_4_layers` (15 keypoints, even faster but less accurate)."); DEFINE_string(model_folder, "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located."); -DEFINE_string(net_resolution, "656x368", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased," +DEFINE_string(net_resolution, "128x96", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased," " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect" - " ratio possible to the images or videos to be processed. E.g. the default `656x368` is" + " ratio possible to the images or videos to be processed. E.g. `128x96` here trades accuracy for maximum speed; `656x368` is" " optimal for 16:9 videos, e.g. full HD (1920x1080) and HD (1280x720) videos."); DEFINE_string(resolution, "1280x720", "The image resolution (display and output). Use \"-1x-1\" to force the program to use the" " default images resolution."); @@ -85,7 +85,7 @@ int openPoseTutorialPose3() // outputSize const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720"); // netInputSize - const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "656x368"); + const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "128x96"); // netOutputSize const auto netOutputSize = netInputSize; // poseModel
diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp index c543a4717..eacbf385d 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -570,7 +570,7 @@ namespace op const Point<int>& poseNetOutputSize = wrapperStructPose.netInputSize; std::vector<std::shared_ptr<PoseExtractor>> poseExtractors; for (auto gpuId = 0; gpuId < gpuNumber; gpuId++) - poseExtractors.emplace_back(std::make_shared<PoseExtractorCaffe>( + poseExtractors.emplace_back(std::make_shared<PoseExtractorTensorRT>( wrapperStructPose.netInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, wrapperStructPose.modelFolder, gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
diff --git a/models/pose/coco/pose_deploy_linevec.prototxt_96x128 b/models/pose/coco/pose_deploy_linevec.prototxt_96x128 new file mode 100755 index 000000000..6e4322812 --- /dev/null +++ b/models/pose/coco/pose_deploy_linevec.prototxt_96x128 @@ -0,0 +1,2976 @@ +input: "image" +input_dim: 1 +input_dim: 3 +input_dim: 96 # This value will be defined at runtime +input_dim: 128 # This value will be defined at runtime +layer { + name: "conv1_1" + type: "Convolution" + bottom: "image" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1_stage1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1_stage1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { 
name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2_stage1" + type: "Pooling" + bottom: "conv2_2" + top: "pool2_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2_stage1" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "conv3_4" + type: "Convolution" + bottom: "conv3_3" + top: "conv3_4" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_4" + type: "ReLU" + bottom: "conv3_4" + top: "conv3_4" +} +layer { + name: "pool3_stage1" + type: "Pooling" + bottom: "conv3_4" + top: "pool3_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3_stage1" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3_CPM" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + 
convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_3_CPM" + type: "ReLU" + bottom: "conv4_3_CPM" + top: "conv4_3_CPM" +} +layer { + name: "conv4_4_CPM" + type: "Convolution" + bottom: "conv4_3_CPM" + top: "conv4_4_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_4_CPM" + type: "ReLU" + bottom: "conv4_4_CPM" + top: "conv4_4_CPM" +} +layer { + name: "conv5_1_CPM_L1" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L1" + type: "ReLU" + bottom: "conv5_1_CPM_L1" + top: "conv5_1_CPM_L1" +} +layer { + name: "conv5_1_CPM_L2" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L2" + type: "ReLU" + bottom: "conv5_1_CPM_L2" + top: "conv5_1_CPM_L2" +} +layer { + name: "conv5_2_CPM_L1" + type: "Convolution" + bottom: "conv5_1_CPM_L1" + top: "conv5_2_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L1" + type: "ReLU" + bottom: "conv5_2_CPM_L1" + top: "conv5_2_CPM_L1" +} +layer { + name: "conv5_2_CPM_L2" + type: "Convolution" + bottom: "conv5_1_CPM_L2" + top: "conv5_2_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L2" + type: "ReLU" + bottom: "conv5_2_CPM_L2" + top: "conv5_2_CPM_L2" +} +layer { + name: "conv5_3_CPM_L1" + type: "Convolution" + bottom: "conv5_2_CPM_L1" + top: "conv5_3_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L1" + type: "ReLU" + bottom: "conv5_3_CPM_L1" + top: "conv5_3_CPM_L1" +} +layer { + name: "conv5_3_CPM_L2" + type: "Convolution" + bottom: "conv5_2_CPM_L2" + top: "conv5_3_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L2" + type: "ReLU" + bottom: "conv5_3_CPM_L2" + top: "conv5_3_CPM_L2" +} +layer { + name: "conv5_4_CPM_L1" + type: "Convolution" + bottom: 
"conv5_3_CPM_L1" + top: "conv5_4_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L1" + type: "ReLU" + bottom: "conv5_4_CPM_L1" + top: "conv5_4_CPM_L1" +} +layer { + name: "conv5_4_CPM_L2" + type: "Convolution" + bottom: "conv5_3_CPM_L2" + top: "conv5_4_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L2" + type: "ReLU" + bottom: "conv5_4_CPM_L2" + top: "conv5_4_CPM_L2" +} +layer { + name: "conv5_5_CPM_L1" + type: "Convolution" + bottom: "conv5_4_CPM_L1" + top: "conv5_5_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv5_5_CPM_L2" + type: "Convolution" + bottom: "conv5_4_CPM_L2" + top: "conv5_5_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage2" + type: "Concat" + bottom: "conv5_5_CPM_L1" + bottom: "conv5_5_CPM_L2" + bottom: "conv4_4_CPM" + top: "concat_stage2" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage2_L1" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L1" + type: "ReLU" + bottom: "Mconv1_stage2_L1" + top: "Mconv1_stage2_L1" +} +layer { + name: "Mconv1_stage2_L2" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L2" + type: "ReLU" + bottom: "Mconv1_stage2_L2" + top: "Mconv1_stage2_L2" +} +layer { + name: "Mconv2_stage2_L1" + type: "Convolution" + bottom: "Mconv1_stage2_L1" + top: "Mconv2_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L1" + type: "ReLU" + bottom: "Mconv2_stage2_L1" + top: "Mconv2_stage2_L1" +} +layer { + name: "Mconv2_stage2_L2" + type: "Convolution" + bottom: "Mconv1_stage2_L2" + top: "Mconv2_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + 
} + } +} +layer { + name: "Mrelu2_stage2_L2" + type: "ReLU" + bottom: "Mconv2_stage2_L2" + top: "Mconv2_stage2_L2" +} +layer { + name: "Mconv3_stage2_L1" + type: "Convolution" + bottom: "Mconv2_stage2_L1" + top: "Mconv3_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L1" + type: "ReLU" + bottom: "Mconv3_stage2_L1" + top: "Mconv3_stage2_L1" +} +layer { + name: "Mconv3_stage2_L2" + type: "Convolution" + bottom: "Mconv2_stage2_L2" + top: "Mconv3_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L2" + type: "ReLU" + bottom: "Mconv3_stage2_L2" + top: "Mconv3_stage2_L2" +} +layer { + name: "Mconv4_stage2_L1" + type: "Convolution" + bottom: "Mconv3_stage2_L1" + top: "Mconv4_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L1" + type: "ReLU" + bottom: "Mconv4_stage2_L1" + top: "Mconv4_stage2_L1" +} +layer { + name: "Mconv4_stage2_L2" + type: "Convolution" + bottom: "Mconv3_stage2_L2" + top: "Mconv4_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L2" + type: "ReLU" + bottom: "Mconv4_stage2_L2" + top: "Mconv4_stage2_L2" +} +layer { + name: "Mconv5_stage2_L1" + type: "Convolution" + bottom: "Mconv4_stage2_L1" + top: "Mconv5_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L1" + type: "ReLU" + bottom: "Mconv5_stage2_L1" + top: "Mconv5_stage2_L1" +} +layer { + name: "Mconv5_stage2_L2" + type: "Convolution" + bottom: "Mconv4_stage2_L2" + top: "Mconv5_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L2" + type: "ReLU" + bottom: "Mconv5_stage2_L2" + top: "Mconv5_stage2_L2" +} +layer { + name: "Mconv6_stage2_L1" + type: "Convolution" + bottom: "Mconv5_stage2_L1" + top: "Mconv6_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L1" + type: "ReLU" + bottom: "Mconv6_stage2_L1" + top: "Mconv6_stage2_L1" +} +layer { + name: "Mconv6_stage2_L2" + type: "Convolution" + bottom: "Mconv5_stage2_L2" + top: 
"Mconv6_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L2" + type: "ReLU" + bottom: "Mconv6_stage2_L2" + top: "Mconv6_stage2_L2" +} +layer { + name: "Mconv7_stage2_L1" + type: "Convolution" + bottom: "Mconv6_stage2_L1" + top: "Mconv7_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage2_L2" + type: "Convolution" + bottom: "Mconv6_stage2_L2" + top: "Mconv7_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage3" + type: "Concat" + bottom: "Mconv7_stage2_L1" + bottom: "Mconv7_stage2_L2" + bottom: "conv4_4_CPM" + top: "concat_stage3" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage3_L1" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L1" + type: "ReLU" + bottom: "Mconv1_stage3_L1" + top: "Mconv1_stage3_L1" +} +layer { + name: "Mconv1_stage3_L2" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L2" + type: "ReLU" + bottom: "Mconv1_stage3_L2" + top: "Mconv1_stage3_L2" +} +layer { + name: "Mconv2_stage3_L1" + type: "Convolution" + bottom: "Mconv1_stage3_L1" + top: "Mconv2_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L1" + type: "ReLU" + bottom: "Mconv2_stage3_L1" + top: "Mconv2_stage3_L1" +} +layer { + name: "Mconv2_stage3_L2" + type: "Convolution" + bottom: "Mconv1_stage3_L2" + top: "Mconv2_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L2" + type: "ReLU" + bottom: "Mconv2_stage3_L2" + top: "Mconv2_stage3_L2" +} +layer { + name: "Mconv3_stage3_L1" + type: "Convolution" + bottom: "Mconv2_stage3_L1" + top: "Mconv3_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: 
"constant" + } + } +} +layer { + name: "Mrelu3_stage3_L1" + type: "ReLU" + bottom: "Mconv3_stage3_L1" + top: "Mconv3_stage3_L1" +} +layer { + name: "Mconv3_stage3_L2" + type: "Convolution" + bottom: "Mconv2_stage3_L2" + top: "Mconv3_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L2" + type: "ReLU" + bottom: "Mconv3_stage3_L2" + top: "Mconv3_stage3_L2" +} +layer { + name: "Mconv4_stage3_L1" + type: "Convolution" + bottom: "Mconv3_stage3_L1" + top: "Mconv4_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L1" + type: "ReLU" + bottom: "Mconv4_stage3_L1" + top: "Mconv4_stage3_L1" +} +layer { + name: "Mconv4_stage3_L2" + type: "Convolution" + bottom: "Mconv3_stage3_L2" + top: "Mconv4_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L2" + type: "ReLU" + bottom: "Mconv4_stage3_L2" + top: "Mconv4_stage3_L2" +} +layer { + name: "Mconv5_stage3_L1" + type: "Convolution" + bottom: "Mconv4_stage3_L1" + top: "Mconv5_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L1" + type: "ReLU" + bottom: "Mconv5_stage3_L1" + top: "Mconv5_stage3_L1" +} +layer { + name: "Mconv5_stage3_L2" + type: "Convolution" + bottom: "Mconv4_stage3_L2" + top: "Mconv5_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L2" + type: "ReLU" + bottom: "Mconv5_stage3_L2" + top: "Mconv5_stage3_L2" +} +layer { + name: "Mconv6_stage3_L1" + type: "Convolution" + bottom: "Mconv5_stage3_L1" + top: "Mconv6_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L1" + type: "ReLU" + bottom: "Mconv6_stage3_L1" + top: "Mconv6_stage3_L1" +} +layer { + name: "Mconv6_stage3_L2" + type: "Convolution" + bottom: "Mconv5_stage3_L2" + top: "Mconv6_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L2" + type: "ReLU" + bottom: "Mconv6_stage3_L2" + top: "Mconv6_stage3_L2" +} +layer { + name: "Mconv7_stage3_L1" + type: "Convolution" + bottom: 
"Mconv6_stage3_L1" + top: "Mconv7_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage3_L2" + type: "Convolution" + bottom: "Mconv6_stage3_L2" + top: "Mconv7_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage4" + type: "Concat" + bottom: "Mconv7_stage3_L1" + bottom: "Mconv7_stage3_L2" + bottom: "conv4_4_CPM" + top: "concat_stage4" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage4_L1" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L1" + type: "ReLU" + bottom: "Mconv1_stage4_L1" + top: "Mconv1_stage4_L1" +} +layer { + name: "Mconv1_stage4_L2" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L2" + type: "ReLU" + bottom: "Mconv1_stage4_L2" + top: "Mconv1_stage4_L2" +} +layer { + name: "Mconv2_stage4_L1" + type: "Convolution" + bottom: "Mconv1_stage4_L1" + top: "Mconv2_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L1" + type: "ReLU" + bottom: "Mconv2_stage4_L1" + top: "Mconv2_stage4_L1" +} +layer { + name: "Mconv2_stage4_L2" + type: "Convolution" + bottom: "Mconv1_stage4_L2" + top: "Mconv2_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L2" + type: "ReLU" + bottom: "Mconv2_stage4_L2" + top: "Mconv2_stage4_L2" +} +layer { + name: "Mconv3_stage4_L1" + type: "Convolution" + bottom: "Mconv2_stage4_L1" + top: "Mconv3_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L1" + type: "ReLU" + bottom: "Mconv3_stage4_L1" + top: "Mconv3_stage4_L1" +} +layer { + name: "Mconv3_stage4_L2" + type: "Convolution" + bottom: "Mconv2_stage4_L2" + top: "Mconv3_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + 
} + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L2" + type: "ReLU" + bottom: "Mconv3_stage4_L2" + top: "Mconv3_stage4_L2" +} +layer { + name: "Mconv4_stage4_L1" + type: "Convolution" + bottom: "Mconv3_stage4_L1" + top: "Mconv4_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L1" + type: "ReLU" + bottom: "Mconv4_stage4_L1" + top: "Mconv4_stage4_L1" +} +layer { + name: "Mconv4_stage4_L2" + type: "Convolution" + bottom: "Mconv3_stage4_L2" + top: "Mconv4_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L2" + type: "ReLU" + bottom: "Mconv4_stage4_L2" + top: "Mconv4_stage4_L2" +} +layer { + name: "Mconv5_stage4_L1" + type: "Convolution" + bottom: "Mconv4_stage4_L1" + top: "Mconv5_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L1" + type: "ReLU" + bottom: "Mconv5_stage4_L1" + top: "Mconv5_stage4_L1" +} +layer { + name: "Mconv5_stage4_L2" + type: "Convolution" + bottom: "Mconv4_stage4_L2" + top: "Mconv5_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L2" + type: "ReLU" + bottom: "Mconv5_stage4_L2" + top: "Mconv5_stage4_L2" +} +layer { + name: "Mconv6_stage4_L1" + type: "Convolution" + bottom: "Mconv5_stage4_L1" + top: "Mconv6_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L1" + type: "ReLU" + bottom: "Mconv6_stage4_L1" + top: "Mconv6_stage4_L1" +} +layer { + name: "Mconv6_stage4_L2" + type: "Convolution" + bottom: "Mconv5_stage4_L2" + top: "Mconv6_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L2" + type: "ReLU" + bottom: "Mconv6_stage4_L2" + top: "Mconv6_stage4_L2" +} +layer { + name: "Mconv7_stage4_L1" + type: "Convolution" + bottom: "Mconv6_stage4_L1" + top: "Mconv7_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage4_L2" + type: "Convolution" + bottom: "Mconv6_stage4_L2" + top: "Mconv7_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { 
+ lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage5" + type: "Concat" + bottom: "Mconv7_stage4_L1" + bottom: "Mconv7_stage4_L2" + bottom: "conv4_4_CPM" + top: "concat_stage5" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage5_L1" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L1" + type: "ReLU" + bottom: "Mconv1_stage5_L1" + top: "Mconv1_stage5_L1" +} +layer { + name: "Mconv1_stage5_L2" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L2" + type: "ReLU" + bottom: "Mconv1_stage5_L2" + top: "Mconv1_stage5_L2" +} +layer { + name: "Mconv2_stage5_L1" + type: "Convolution" + bottom: "Mconv1_stage5_L1" + top: "Mconv2_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L1" + type: "ReLU" + bottom: "Mconv2_stage5_L1" + top: "Mconv2_stage5_L1" +} +layer { + name: "Mconv2_stage5_L2" + type: "Convolution" + bottom: "Mconv1_stage5_L2" + top: "Mconv2_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L2" + type: "ReLU" + bottom: "Mconv2_stage5_L2" + top: "Mconv2_stage5_L2" +} +layer { + name: "Mconv3_stage5_L1" + type: "Convolution" + bottom: "Mconv2_stage5_L1" + top: "Mconv3_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L1" + type: "ReLU" + bottom: "Mconv3_stage5_L1" + top: "Mconv3_stage5_L1" +} +layer { + name: "Mconv3_stage5_L2" + type: "Convolution" + bottom: "Mconv2_stage5_L2" + top: "Mconv3_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L2" + type: "ReLU" + bottom: "Mconv3_stage5_L2" + top: "Mconv3_stage5_L2" +} +layer { + name: "Mconv4_stage5_L1" + type: "Convolution" + bottom: "Mconv3_stage5_L1" + top: "Mconv4_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + 
std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L1" + type: "ReLU" + bottom: "Mconv4_stage5_L1" + top: "Mconv4_stage5_L1" +} +layer { + name: "Mconv4_stage5_L2" + type: "Convolution" + bottom: "Mconv3_stage5_L2" + top: "Mconv4_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L2" + type: "ReLU" + bottom: "Mconv4_stage5_L2" + top: "Mconv4_stage5_L2" +} +layer { + name: "Mconv5_stage5_L1" + type: "Convolution" + bottom: "Mconv4_stage5_L1" + top: "Mconv5_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L1" + type: "ReLU" + bottom: "Mconv5_stage5_L1" + top: "Mconv5_stage5_L1" +} +layer { + name: "Mconv5_stage5_L2" + type: "Convolution" + bottom: "Mconv4_stage5_L2" + top: "Mconv5_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L2" + type: "ReLU" + bottom: "Mconv5_stage5_L2" + top: "Mconv5_stage5_L2" +} +layer { + name: "Mconv6_stage5_L1" + type: "Convolution" + bottom: "Mconv5_stage5_L1" + top: "Mconv6_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L1" + type: "ReLU" + bottom: "Mconv6_stage5_L1" + top: "Mconv6_stage5_L1" +} +layer { + name: "Mconv6_stage5_L2" + type: "Convolution" + bottom: "Mconv5_stage5_L2" + top: "Mconv6_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L2" + type: "ReLU" + bottom: "Mconv6_stage5_L2" + top: "Mconv6_stage5_L2" +} +layer { + name: "Mconv7_stage5_L1" + type: "Convolution" + bottom: "Mconv6_stage5_L1" + top: "Mconv7_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage5_L2" + type: "Convolution" + bottom: "Mconv6_stage5_L2" + top: "Mconv7_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage6" + type: "Concat" + bottom: "Mconv7_stage5_L1" + bottom: "Mconv7_stage5_L2" + bottom: "conv4_4_CPM" + top: "concat_stage6" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage6_L1" + type: "Convolution" + bottom: 
"concat_stage6" + top: "Mconv1_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L1" + type: "ReLU" + bottom: "Mconv1_stage6_L1" + top: "Mconv1_stage6_L1" +} +layer { + name: "Mconv1_stage6_L2" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L2" + type: "ReLU" + bottom: "Mconv1_stage6_L2" + top: "Mconv1_stage6_L2" +} +layer { + name: "Mconv2_stage6_L1" + type: "Convolution" + bottom: "Mconv1_stage6_L1" + top: "Mconv2_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L1" + type: "ReLU" + bottom: "Mconv2_stage6_L1" + top: "Mconv2_stage6_L1" +} +layer { + name: "Mconv2_stage6_L2" + type: "Convolution" + bottom: "Mconv1_stage6_L2" + top: "Mconv2_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L2" + type: "ReLU" + bottom: "Mconv2_stage6_L2" + top: "Mconv2_stage6_L2" +} +layer { + name: "Mconv3_stage6_L1" + type: "Convolution" + bottom: "Mconv2_stage6_L1" + top: "Mconv3_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L1" + type: "ReLU" + bottom: "Mconv3_stage6_L1" + top: "Mconv3_stage6_L1" +} +layer { + name: "Mconv3_stage6_L2" + type: "Convolution" + bottom: "Mconv2_stage6_L2" + top: "Mconv3_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L2" + type: "ReLU" + bottom: "Mconv3_stage6_L2" + top: "Mconv3_stage6_L2" +} +layer { + name: "Mconv4_stage6_L1" + type: "Convolution" + bottom: "Mconv3_stage6_L1" + top: "Mconv4_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L1" + type: "ReLU" + bottom: "Mconv4_stage6_L1" + top: "Mconv4_stage6_L1" +} +layer { + name: "Mconv4_stage6_L2" + type: "Convolution" + bottom: "Mconv3_stage6_L2" + top: "Mconv4_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + 
type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L2" + type: "ReLU" + bottom: "Mconv4_stage6_L2" + top: "Mconv4_stage6_L2" +} +layer { + name: "Mconv5_stage6_L1" + type: "Convolution" + bottom: "Mconv4_stage6_L1" + top: "Mconv5_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L1" + type: "ReLU" + bottom: "Mconv5_stage6_L1" + top: "Mconv5_stage6_L1" +} +layer { + name: "Mconv5_stage6_L2" + type: "Convolution" + bottom: "Mconv4_stage6_L2" + top: "Mconv5_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L2" + type: "ReLU" + bottom: "Mconv5_stage6_L2" + top: "Mconv5_stage6_L2" +} +layer { + name: "Mconv6_stage6_L1" + type: "Convolution" + bottom: "Mconv5_stage6_L1" + top: "Mconv6_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L1" + type: "ReLU" + bottom: "Mconv6_stage6_L1" + top: "Mconv6_stage6_L1" +} +layer { + name: "Mconv6_stage6_L2" + type: "Convolution" + bottom: "Mconv5_stage6_L2" + top: "Mconv6_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L2" + type: "ReLU" + bottom: "Mconv6_stage6_L2" + top: "Mconv6_stage6_L2" +} +layer { + name: "Mconv7_stage6_L1" + type: "Convolution" + bottom: "Mconv6_stage6_L1" + top: "Mconv7_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage6_L2" + type: "Convolution" + bottom: "Mconv6_stage6_L2" + top: "Mconv7_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage7" + type: "Concat" + bottom: "Mconv7_stage6_L2" + bottom: "Mconv7_stage6_L1" + # top: "concat_stage7" + top: "net_output" + concat_param { + axis: 1 + } +} From 1971baa4f3f14d04d639768ab92eefc791fbe76c Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:54:05 +0000 Subject: [PATCH 28/52] Asynchronous inference. 
--- src/openpose/core/netTensorRT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 01c659872..fd4a174d2 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -308,7 +308,7 @@ namespace op std::cout << "Forward Pass : executing inference" << std::endl; - cudaContext->execute(batchSize, &buffers[0]); + cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); spOutputBlob->set_gpu_data((float*)deviceMem); From 330d4bbf0dcedfc9b7a595c0d7af61cb1e428204 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 22:03:38 +0000 Subject: [PATCH 29/52] Way simpler inference code, a lot was useless. --- src/openpose/core/netTensorRT.cpp | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index fd4a174d2..bb02d5041 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -277,41 +277,14 @@ namespace op // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. - - std::cout << "Forward Pass : creating CUDA memory" << std::endl; - std::vector buffers(2); buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); - - size_t eltCount = mNetOutputSize4D[0]*mNetOutputSize4D[1]*mNetOutputSize4D[2]*mNetOutputSize4D[3]*batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CUDA_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - - buffers[1] = deviceMem; - delete[] localMem; - - std::cout << "Forward Pass : memory created" << std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); std::cout << "Forward Pass : executing inference" << std::endl; cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); - spOutputBlob->set_gpu_data((float*)deviceMem); - std::cout << "Forward Pass : inference done !" << std::endl; cudaCheck(__LINE__, __FUNCTION__, __FILE__); } From c2be9aa06170054108bcd159be30b14e6b148830 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 23:11:23 +0000 Subject: [PATCH 30/52] Removing log to speedup inference. --- src/openpose/core/netTensorRT.cpp | 7 +------ src/openpose/pose/poseExtractorTensorRT.cpp | 8 -------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index bb02d5041..6d2c81293 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -264,8 +264,6 @@ namespace op void NetTensorRT::forwardPass(const float* const inputData) const { - - std::cout << "Forward Pass : start" << std::endl; try { const int batchSize = 1; @@ -281,12 +279,9 @@ namespace op buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); - std::cout << "Forward Pass : executing inference" << std::endl; - cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); - std::cout << "Forward Pass : inference done !" 
<< std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + //cudaCheck(__LINE__, __FUNCTION__, __FILE__); } } catch (const std::exception& e) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index a9a0abb35..a1b07f00e 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -83,17 +83,9 @@ namespace op if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - - - std::cout << "Forward Pass Pose: tensorrt forward pass" << std::endl; // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); - std::cout << "Forward Pass Pose: tensorrt passed !" << std::endl; - - // Replace spNet->forward pass, but how to propagate to next - // Replace spTensorRTNetOututBlob.get() ? - // 2. Resize heat maps + merge different scales spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); #ifndef CPU_ONLY From 89e3b443e8fb9330e4d6eb4ac8cbc72730974164 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 13:10:08 +0000 Subject: [PATCH 31/52] ResizeAndMergeBase CPU version. --- src/openpose/core/resizeAndMergeBase.cpp | 71 +++++++++--------------- 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/src/openpose/core/resizeAndMergeBase.cpp b/src/openpose/core/resizeAndMergeBase.cpp index ea324a251..3872e9614 100644 --- a/src/openpose/core/resizeAndMergeBase.cpp +++ b/src/openpose/core/resizeAndMergeBase.cpp @@ -1,6 +1,8 @@ -// #include +#include +#include #include + namespace op { template @@ -9,50 +11,29 @@ namespace op { try { - UNUSED(targetPtr); - UNUSED(sourcePtr); - UNUSED(scaleRatios); - UNUSED(targetSize); - UNUSED(sourceSize); - error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__); - - // TODO: THIS CODE IS WORKING, BUT IT DOES NOT CONSIDER THE SCALES (I.E. 
SCALE NUMBER, START AND GAP) - // const int num = bottom->shape(0); - // const int channel = bottom->shape(1); - // const int sourceHeight = bottom->shape(2); - // const int sourceWidth = bottom->shape(3); - // const int targetHeight = top->shape(2); - // const int targetWidth = top->shape(3); - - // //stupid method - // for (int n = 0; n < num; n++) - // { - // for (int c = 0; c < channel; c++) - // { - // //fill source - // cv::Mat source(sourceWidth, sourceHeight, CV_32FC1); - // const auto sourceOffsetChannel = sourceHeight * sourceWidth; - // const auto sourceOffsetNum = sourceOffsetChannel * channel; - // const auto sourceOffset = n*sourceOffsetNum + c*sourceOffsetChannel; - // const T* const sourcePtr = bottom->cpu_data(); - // for (int y = 0; y < sourceHeight; y++) - // for (int x = 0; x < sourceWidth; x++) - // source.at(x,y) = sourcePtr[sourceOffset + y*sourceWidth + x]; - - // // spatial resize - // cv::Mat target; - // cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC); - - // //fill top - // const auto targetOffsetChannel = targetHeight * targetWidth; - // const auto targetOffsetNum = targetOffsetChannel * channel; - // const auto targetOffset = n*targetOffsetNum + c*targetOffsetChannel; - // T* targetPtr = top->mutable_cpu_data(); - // for (int y = 0; y < targetHeight; y++) - // for (int x = 0; x < targetWidth; x++) - // targetPtr[targetOffset + y*targetWidth + x] = target.at(x,y); - // } - // } + const int num = sourceSize[0]; + const int channels = sourceSize[1]; + const int sourceHeight = sourceSize[2]; + const int sourceWidth = sourceSize[3]; + const int targetHeight = targetSize[2]; + const int targetWidth = targetSize[3]; + + const auto sourceChannelOffset = sourceHeight * sourceWidth; + const auto targetChannelOffset = targetWidth * targetHeight; + + // Perform resize + merging + const auto sourceNumOffset = channels * sourceChannelOffset; + for (auto c = 0 ; c < channels ; c++) { + cv::Mat target (targetHeight, targetWidth, CV_32F, (void*)(targetPtr + c * targetChannelOffset)); + cv::multiply(target, 0.f, target); + cv::Mat t; + for (auto n = 0; n < num; n++) { + cv::Mat source(std::rint(sourceHeight * scaleRatios[n]), std::rint(sourceWidth * scaleRatios[n]), CV_32F, (void*)(sourcePtr + c * sourceChannelOffset + n * sourceNumOffset)); + cv::resize(source, t, cv::Size(targetWidth, targetHeight), 0., 0., cv::INTER_CUBIC); + cv::add(target, t, target); + } + cv::divide(target, (float)num, target); + } } catch (const std::exception& e) { From b54ae119c325926e85f9465110a850b24d9bc248 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 13:11:37 +0000 Subject: [PATCH 32/52] Inference model for pose net size 256x192 --- .../coco/pose_deploy_linevec.prototxt_192x256 | 2976 +++++++++++++++++ 1 file changed, 2976 insertions(+) create mode 100755 models/pose/coco/pose_deploy_linevec.prototxt_192x256 diff --git a/models/pose/coco/pose_deploy_linevec.prototxt_192x256 b/models/pose/coco/pose_deploy_linevec.prototxt_192x256 new file mode 100755 index 000000000..99cc4e4fe --- /dev/null +++ b/models/pose/coco/pose_deploy_linevec.prototxt_192x256 @@ -0,0 +1,2976 @@ +input: "image" +input_dim: 1 +input_dim: 3 +input_dim: 192 # This value will be defined at runtime +input_dim: 256 # This value will be defined at runtime +layer { + name: "conv1_1" + type: "Convolution" + bottom: "image" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + 
kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1_stage1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1_stage1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2_stage1" + type: "Pooling" + bottom: "conv2_2" + top: "pool2_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2_stage1" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "conv3_4" + type: "Convolution" + bottom: "conv3_3" + top: "conv3_4" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_4" + type: "ReLU" + bottom: "conv3_4" + top: "conv3_4" +} +layer { + name: "pool3_stage1" + type: "Pooling" + bottom: "conv3_4" + 
top: "pool3_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3_stage1" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3_CPM" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_3_CPM" + type: "ReLU" + bottom: "conv4_3_CPM" + top: "conv4_3_CPM" +} +layer { + name: "conv4_4_CPM" + type: "Convolution" + bottom: "conv4_3_CPM" + top: "conv4_4_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_4_CPM" + type: "ReLU" + bottom: "conv4_4_CPM" + top: "conv4_4_CPM" +} +layer { + name: "conv5_1_CPM_L1" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L1" + type: "ReLU" + bottom: "conv5_1_CPM_L1" + top: "conv5_1_CPM_L1" +} +layer { + name: "conv5_1_CPM_L2" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L2" + type: "ReLU" + bottom: "conv5_1_CPM_L2" + top: "conv5_1_CPM_L2" +} +layer { + name: "conv5_2_CPM_L1" + type: "Convolution" + bottom: "conv5_1_CPM_L1" + top: "conv5_2_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L1" + type: "ReLU" + bottom: "conv5_2_CPM_L1" + top: "conv5_2_CPM_L1" +} +layer { + name: "conv5_2_CPM_L2" + type: "Convolution" + bottom: "conv5_1_CPM_L2" + top: "conv5_2_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + 
type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L2" + type: "ReLU" + bottom: "conv5_2_CPM_L2" + top: "conv5_2_CPM_L2" +} +layer { + name: "conv5_3_CPM_L1" + type: "Convolution" + bottom: "conv5_2_CPM_L1" + top: "conv5_3_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L1" + type: "ReLU" + bottom: "conv5_3_CPM_L1" + top: "conv5_3_CPM_L1" +} +layer { + name: "conv5_3_CPM_L2" + type: "Convolution" + bottom: "conv5_2_CPM_L2" + top: "conv5_3_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L2" + type: "ReLU" + bottom: "conv5_3_CPM_L2" + top: "conv5_3_CPM_L2" +} +layer { + name: "conv5_4_CPM_L1" + type: "Convolution" + bottom: "conv5_3_CPM_L1" + top: "conv5_4_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L1" + type: "ReLU" + bottom: "conv5_4_CPM_L1" + top: "conv5_4_CPM_L1" +} +layer { + name: "conv5_4_CPM_L2" + type: "Convolution" + bottom: "conv5_3_CPM_L2" + top: "conv5_4_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L2" + type: "ReLU" + bottom: "conv5_4_CPM_L2" + top: "conv5_4_CPM_L2" +} +layer { + name: "conv5_5_CPM_L1" + type: "Convolution" + bottom: "conv5_4_CPM_L1" + top: "conv5_5_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv5_5_CPM_L2" + type: "Convolution" + bottom: "conv5_4_CPM_L2" + top: "conv5_5_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage2" + type: "Concat" + bottom: "conv5_5_CPM_L1" + bottom: "conv5_5_CPM_L2" + bottom: "conv4_4_CPM" + top: "concat_stage2" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage2_L1" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L1" + type: "ReLU" + bottom: "Mconv1_stage2_L1" + top: "Mconv1_stage2_L1" +} +layer { + name: "Mconv1_stage2_L2" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + 
decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L2" + type: "ReLU" + bottom: "Mconv1_stage2_L2" + top: "Mconv1_stage2_L2" +} +layer { + name: "Mconv2_stage2_L1" + type: "Convolution" + bottom: "Mconv1_stage2_L1" + top: "Mconv2_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L1" + type: "ReLU" + bottom: "Mconv2_stage2_L1" + top: "Mconv2_stage2_L1" +} +layer { + name: "Mconv2_stage2_L2" + type: "Convolution" + bottom: "Mconv1_stage2_L2" + top: "Mconv2_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L2" + type: "ReLU" + bottom: "Mconv2_stage2_L2" + top: "Mconv2_stage2_L2" +} +layer { + name: "Mconv3_stage2_L1" + type: "Convolution" + bottom: "Mconv2_stage2_L1" + top: "Mconv3_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L1" + type: "ReLU" + bottom: "Mconv3_stage2_L1" + top: "Mconv3_stage2_L1" +} +layer { + name: "Mconv3_stage2_L2" + type: "Convolution" + bottom: "Mconv2_stage2_L2" + top: "Mconv3_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L2" + type: "ReLU" + bottom: "Mconv3_stage2_L2" + top: "Mconv3_stage2_L2" +} +layer { + name: "Mconv4_stage2_L1" + type: "Convolution" + bottom: "Mconv3_stage2_L1" + top: "Mconv4_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L1" + type: "ReLU" + bottom: "Mconv4_stage2_L1" + top: "Mconv4_stage2_L1" +} +layer { + name: "Mconv4_stage2_L2" + type: "Convolution" + bottom: "Mconv3_stage2_L2" + top: "Mconv4_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L2" + type: "ReLU" + bottom: "Mconv4_stage2_L2" + top: "Mconv4_stage2_L2" +} +layer { + name: "Mconv5_stage2_L1" + type: "Convolution" + bottom: "Mconv4_stage2_L1" + top: "Mconv5_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: 
"Mrelu5_stage2_L1" + type: "ReLU" + bottom: "Mconv5_stage2_L1" + top: "Mconv5_stage2_L1" +} +layer { + name: "Mconv5_stage2_L2" + type: "Convolution" + bottom: "Mconv4_stage2_L2" + top: "Mconv5_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L2" + type: "ReLU" + bottom: "Mconv5_stage2_L2" + top: "Mconv5_stage2_L2" +} +layer { + name: "Mconv6_stage2_L1" + type: "Convolution" + bottom: "Mconv5_stage2_L1" + top: "Mconv6_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L1" + type: "ReLU" + bottom: "Mconv6_stage2_L1" + top: "Mconv6_stage2_L1" +} +layer { + name: "Mconv6_stage2_L2" + type: "Convolution" + bottom: "Mconv5_stage2_L2" + top: "Mconv6_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L2" + type: "ReLU" + bottom: "Mconv6_stage2_L2" + top: "Mconv6_stage2_L2" +} +layer { + name: "Mconv7_stage2_L1" + type: "Convolution" + bottom: "Mconv6_stage2_L1" + top: "Mconv7_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage2_L2" + type: "Convolution" + bottom: "Mconv6_stage2_L2" + top: "Mconv7_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage3" + type: "Concat" + bottom: "Mconv7_stage2_L1" + bottom: "Mconv7_stage2_L2" + bottom: "conv4_4_CPM" + top: "concat_stage3" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage3_L1" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L1" + type: "ReLU" + bottom: "Mconv1_stage3_L1" + top: "Mconv1_stage3_L1" +} +layer { + name: "Mconv1_stage3_L2" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L2" + type: "ReLU" + bottom: "Mconv1_stage3_L2" + top: "Mconv1_stage3_L2" +} +layer { + name: "Mconv2_stage3_L1" + type: "Convolution" + bottom: "Mconv1_stage3_L1" + top: "Mconv2_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + 
param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L1" + type: "ReLU" + bottom: "Mconv2_stage3_L1" + top: "Mconv2_stage3_L1" +} +layer { + name: "Mconv2_stage3_L2" + type: "Convolution" + bottom: "Mconv1_stage3_L2" + top: "Mconv2_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L2" + type: "ReLU" + bottom: "Mconv2_stage3_L2" + top: "Mconv2_stage3_L2" +} +layer { + name: "Mconv3_stage3_L1" + type: "Convolution" + bottom: "Mconv2_stage3_L1" + top: "Mconv3_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L1" + type: "ReLU" + bottom: "Mconv3_stage3_L1" + top: "Mconv3_stage3_L1" +} +layer { + name: "Mconv3_stage3_L2" + type: "Convolution" + bottom: "Mconv2_stage3_L2" + top: "Mconv3_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L2" + type: "ReLU" + bottom: "Mconv3_stage3_L2" + top: "Mconv3_stage3_L2" +} +layer { + name: "Mconv4_stage3_L1" + type: "Convolution" + bottom: "Mconv3_stage3_L1" + top: "Mconv4_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L1" + type: "ReLU" + bottom: "Mconv4_stage3_L1" + top: "Mconv4_stage3_L1" +} +layer { + name: "Mconv4_stage3_L2" + type: "Convolution" + bottom: "Mconv3_stage3_L2" + top: "Mconv4_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L2" + type: "ReLU" + bottom: "Mconv4_stage3_L2" + top: "Mconv4_stage3_L2" +} +layer { + name: "Mconv5_stage3_L1" + type: "Convolution" + bottom: "Mconv4_stage3_L1" + top: "Mconv5_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L1" + type: "ReLU" + bottom: "Mconv5_stage3_L1" + top: "Mconv5_stage3_L1" +} +layer { + name: "Mconv5_stage3_L2" + type: "Convolution" + bottom: "Mconv4_stage3_L2" + top: "Mconv5_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer 
{ + name: "Mrelu5_stage3_L2" + type: "ReLU" + bottom: "Mconv5_stage3_L2" + top: "Mconv5_stage3_L2" +} +layer { + name: "Mconv6_stage3_L1" + type: "Convolution" + bottom: "Mconv5_stage3_L1" + top: "Mconv6_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L1" + type: "ReLU" + bottom: "Mconv6_stage3_L1" + top: "Mconv6_stage3_L1" +} +layer { + name: "Mconv6_stage3_L2" + type: "Convolution" + bottom: "Mconv5_stage3_L2" + top: "Mconv6_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L2" + type: "ReLU" + bottom: "Mconv6_stage3_L2" + top: "Mconv6_stage3_L2" +} +layer { + name: "Mconv7_stage3_L1" + type: "Convolution" + bottom: "Mconv6_stage3_L1" + top: "Mconv7_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage3_L2" + type: "Convolution" + bottom: "Mconv6_stage3_L2" + top: "Mconv7_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage4" + type: "Concat" + bottom: "Mconv7_stage3_L1" + bottom: "Mconv7_stage3_L2" + bottom: "conv4_4_CPM" + top: "concat_stage4" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage4_L1" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L1" + type: "ReLU" + bottom: "Mconv1_stage4_L1" + top: "Mconv1_stage4_L1" +} +layer { + name: "Mconv1_stage4_L2" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L2" + type: "ReLU" + bottom: "Mconv1_stage4_L2" + top: "Mconv1_stage4_L2" +} +layer { + name: "Mconv2_stage4_L1" + type: "Convolution" + bottom: "Mconv1_stage4_L1" + top: "Mconv2_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L1" + type: "ReLU" + bottom: "Mconv2_stage4_L1" + top: "Mconv2_stage4_L1" +} +layer { + name: "Mconv2_stage4_L2" + type: "Convolution" + bottom: "Mconv1_stage4_L2" + top: "Mconv2_stage4_L2" + param { + lr_mult: 4.0 + 
decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L2" + type: "ReLU" + bottom: "Mconv2_stage4_L2" + top: "Mconv2_stage4_L2" +} +layer { + name: "Mconv3_stage4_L1" + type: "Convolution" + bottom: "Mconv2_stage4_L1" + top: "Mconv3_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L1" + type: "ReLU" + bottom: "Mconv3_stage4_L1" + top: "Mconv3_stage4_L1" +} +layer { + name: "Mconv3_stage4_L2" + type: "Convolution" + bottom: "Mconv2_stage4_L2" + top: "Mconv3_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L2" + type: "ReLU" + bottom: "Mconv3_stage4_L2" + top: "Mconv3_stage4_L2" +} +layer { + name: "Mconv4_stage4_L1" + type: "Convolution" + bottom: "Mconv3_stage4_L1" + top: "Mconv4_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L1" + type: "ReLU" + bottom: "Mconv4_stage4_L1" + top: "Mconv4_stage4_L1" +} +layer { + name: "Mconv4_stage4_L2" + type: "Convolution" + bottom: "Mconv3_stage4_L2" + top: "Mconv4_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L2" + type: "ReLU" + bottom: "Mconv4_stage4_L2" + top: "Mconv4_stage4_L2" +} +layer { + name: "Mconv5_stage4_L1" + type: "Convolution" + bottom: "Mconv4_stage4_L1" + top: "Mconv5_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L1" + type: "ReLU" + bottom: "Mconv5_stage4_L1" + top: "Mconv5_stage4_L1" +} +layer { + name: "Mconv5_stage4_L2" + type: "Convolution" + bottom: "Mconv4_stage4_L2" + top: "Mconv5_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L2" + type: "ReLU" + bottom: "Mconv5_stage4_L2" + top: "Mconv5_stage4_L2" +} +layer { + name: "Mconv6_stage4_L1" + type: "Convolution" + bottom: "Mconv5_stage4_L1" + top: "Mconv6_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: 
"constant" + } + } +} +layer { + name: "Mrelu6_stage4_L1" + type: "ReLU" + bottom: "Mconv6_stage4_L1" + top: "Mconv6_stage4_L1" +} +layer { + name: "Mconv6_stage4_L2" + type: "Convolution" + bottom: "Mconv5_stage4_L2" + top: "Mconv6_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L2" + type: "ReLU" + bottom: "Mconv6_stage4_L2" + top: "Mconv6_stage4_L2" +} +layer { + name: "Mconv7_stage4_L1" + type: "Convolution" + bottom: "Mconv6_stage4_L1" + top: "Mconv7_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage4_L2" + type: "Convolution" + bottom: "Mconv6_stage4_L2" + top: "Mconv7_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage5" + type: "Concat" + bottom: "Mconv7_stage4_L1" + bottom: "Mconv7_stage4_L2" + bottom: "conv4_4_CPM" + top: "concat_stage5" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage5_L1" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L1" + type: "ReLU" + bottom: "Mconv1_stage5_L1" + top: "Mconv1_stage5_L1" +} +layer { + name: "Mconv1_stage5_L2" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L2" + type: "ReLU" + bottom: "Mconv1_stage5_L2" + top: "Mconv1_stage5_L2" +} +layer { + name: "Mconv2_stage5_L1" + type: "Convolution" + bottom: "Mconv1_stage5_L1" + top: "Mconv2_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L1" + type: "ReLU" + bottom: "Mconv2_stage5_L1" + top: "Mconv2_stage5_L1" +} +layer { + name: "Mconv2_stage5_L2" + type: "Convolution" + bottom: "Mconv1_stage5_L2" + top: "Mconv2_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L2" + type: "ReLU" + bottom: "Mconv2_stage5_L2" + top: "Mconv2_stage5_L2" +} +layer { + name: "Mconv3_stage5_L1" + type: "Convolution" + bottom: "Mconv2_stage5_L1" + top: "Mconv3_stage5_L1" + param 
{ + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L1" + type: "ReLU" + bottom: "Mconv3_stage5_L1" + top: "Mconv3_stage5_L1" +} +layer { + name: "Mconv3_stage5_L2" + type: "Convolution" + bottom: "Mconv2_stage5_L2" + top: "Mconv3_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L2" + type: "ReLU" + bottom: "Mconv3_stage5_L2" + top: "Mconv3_stage5_L2" +} +layer { + name: "Mconv4_stage5_L1" + type: "Convolution" + bottom: "Mconv3_stage5_L1" + top: "Mconv4_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L1" + type: "ReLU" + bottom: "Mconv4_stage5_L1" + top: "Mconv4_stage5_L1" +} +layer { + name: "Mconv4_stage5_L2" + type: "Convolution" + bottom: "Mconv3_stage5_L2" + top: "Mconv4_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L2" + type: "ReLU" + bottom: "Mconv4_stage5_L2" + top: "Mconv4_stage5_L2" +} +layer { + name: "Mconv5_stage5_L1" + type: "Convolution" + bottom: "Mconv4_stage5_L1" + top: "Mconv5_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L1" + type: "ReLU" + bottom: "Mconv5_stage5_L1" + top: "Mconv5_stage5_L1" +} +layer { + name: "Mconv5_stage5_L2" + type: "Convolution" + bottom: "Mconv4_stage5_L2" + top: "Mconv5_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L2" + type: "ReLU" + bottom: "Mconv5_stage5_L2" + top: "Mconv5_stage5_L2" +} +layer { + name: "Mconv6_stage5_L1" + type: "Convolution" + bottom: "Mconv5_stage5_L1" + top: "Mconv6_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L1" + type: "ReLU" + bottom: "Mconv6_stage5_L1" + top: "Mconv6_stage5_L1" +} +layer { + name: "Mconv6_stage5_L2" + type: "Convolution" + bottom: "Mconv5_stage5_L2" + top: "Mconv6_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler 
{ + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L2" + type: "ReLU" + bottom: "Mconv6_stage5_L2" + top: "Mconv6_stage5_L2" +} +layer { + name: "Mconv7_stage5_L1" + type: "Convolution" + bottom: "Mconv6_stage5_L1" + top: "Mconv7_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage5_L2" + type: "Convolution" + bottom: "Mconv6_stage5_L2" + top: "Mconv7_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage6" + type: "Concat" + bottom: "Mconv7_stage5_L1" + bottom: "Mconv7_stage5_L2" + bottom: "conv4_4_CPM" + top: "concat_stage6" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage6_L1" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L1" + type: "ReLU" + bottom: "Mconv1_stage6_L1" + top: "Mconv1_stage6_L1" +} +layer { + name: "Mconv1_stage6_L2" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L2" + type: "ReLU" + bottom: "Mconv1_stage6_L2" + top: "Mconv1_stage6_L2" +} +layer { + name: "Mconv2_stage6_L1" + type: "Convolution" + bottom: "Mconv1_stage6_L1" + top: "Mconv2_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L1" + type: "ReLU" + bottom: "Mconv2_stage6_L1" + top: "Mconv2_stage6_L1" +} +layer { + name: "Mconv2_stage6_L2" + type: "Convolution" + bottom: "Mconv1_stage6_L2" + top: "Mconv2_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L2" + type: "ReLU" + bottom: "Mconv2_stage6_L2" + top: "Mconv2_stage6_L2" +} +layer { + name: "Mconv3_stage6_L1" + type: "Convolution" + bottom: "Mconv2_stage6_L1" + top: "Mconv3_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L1" + type: "ReLU" + bottom: "Mconv3_stage6_L1" + top: "Mconv3_stage6_L1" +} +layer { + name: "Mconv3_stage6_L2" + type: "Convolution" + bottom: "Mconv2_stage6_L2" + top: 
"Mconv3_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L2" + type: "ReLU" + bottom: "Mconv3_stage6_L2" + top: "Mconv3_stage6_L2" +} +layer { + name: "Mconv4_stage6_L1" + type: "Convolution" + bottom: "Mconv3_stage6_L1" + top: "Mconv4_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L1" + type: "ReLU" + bottom: "Mconv4_stage6_L1" + top: "Mconv4_stage6_L1" +} +layer { + name: "Mconv4_stage6_L2" + type: "Convolution" + bottom: "Mconv3_stage6_L2" + top: "Mconv4_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L2" + type: "ReLU" + bottom: "Mconv4_stage6_L2" + top: "Mconv4_stage6_L2" +} +layer { + name: "Mconv5_stage6_L1" + type: "Convolution" + bottom: "Mconv4_stage6_L1" + top: "Mconv5_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L1" + type: "ReLU" + bottom: "Mconv5_stage6_L1" + top: "Mconv5_stage6_L1" +} +layer { + name: "Mconv5_stage6_L2" + type: "Convolution" + bottom: "Mconv4_stage6_L2" + top: "Mconv5_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L2" + type: "ReLU" + bottom: "Mconv5_stage6_L2" + top: "Mconv5_stage6_L2" +} +layer { + name: "Mconv6_stage6_L1" + type: "Convolution" + bottom: "Mconv5_stage6_L1" + top: "Mconv6_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L1" + type: "ReLU" + bottom: "Mconv6_stage6_L1" + top: "Mconv6_stage6_L1" +} +layer { + name: "Mconv6_stage6_L2" + type: "Convolution" + bottom: "Mconv5_stage6_L2" + top: "Mconv6_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L2" + type: "ReLU" + bottom: "Mconv6_stage6_L2" + top: "Mconv6_stage6_L2" +} +layer { + name: "Mconv7_stage6_L1" + type: "Convolution" + bottom: "Mconv6_stage6_L1" + top: "Mconv7_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + 
std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage6_L2" + type: "Convolution" + bottom: "Mconv6_stage6_L2" + top: "Mconv7_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage7" + type: "Concat" + bottom: "Mconv7_stage6_L2" + bottom: "Mconv7_stage6_L1" + # top: "concat_stage7" + top: "net_output" + concat_param { + axis: 1 + } +} From 7808f896c9fb6ed021c0dc713f2efb12b4bd5555 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 14:37:42 +0000 Subject: [PATCH 33/52] Detailed poseExtractor Timings. --- src/openpose/pose/poseExtractorCaffe.cpp | 38 ++++++++++++++++--- src/openpose/pose/poseExtractorTensorRT.cpp | 41 +++++++++++++++++---- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp index bc4374782..923405af0 100644 --- a/src/openpose/pose/poseExtractorCaffe.cpp +++ b/src/openpose/pose/poseExtractorCaffe.cpp @@ -7,6 +7,22 @@ #include #include +typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings; + +static OpTimings timings; + +static void timeNow(const std::string& label){ + const auto now = std::chrono::high_resolution_clock::now(); + const auto timing = std::make_pair(label, now); + timings.push_back(timing); +} + +static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, + const std::chrono::high_resolution_clock::time_point& t2 ) { + return std::to_string((double)std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms"; +} + + namespace op { PoseExtractorCaffe::PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, @@ -79,10 +95,10 @@ namespace op // Security checks if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - + timeNow("Start"); // 1. Caffe deep network spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms - + timeNow("Caffe Forward"); // 2. Resize heat maps + merge different scales spResizeAndMergeCaffe->setScaleRatios(scaleRatios); #ifndef CPU_ONLY @@ -91,7 +107,7 @@ namespace op #else error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Resize Heat Maps"); // 3. Get peaks by Non-Maximum Suppression spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY @@ -100,22 +116,32 @@ namespace op #else error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Peaks by nms"); // Get scale net to output const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; - + timeNow("Scale net to output"); // 4. 
Connecting body parts spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); spBodyPartConnectorCaffe->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet spBodyPartConnectorCaffe->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); // spBodyPartConnectorCaffe->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + timeNow("Connect Body Parts"); + + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. Total time: " + totalTimeSec + "."; + op::log(message, op::Priority::High); + + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); + op::log(log_time, op::Priority::High); + } + } catch (const std::exception& e) { diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index a1b07f00e..155a1f425 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -8,6 +8,21 @@ #include #include +typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings; + +static OpTimings timings; + +static void timeNow(const std::string& label){ + const auto now = std::chrono::high_resolution_clock::now(); + const auto timing = std::make_pair(label, now); + timings.push_back(timing); +} + +static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, + const std::chrono::high_resolution_clock::time_point& t2 ) { + return std::to_string((double)std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms"; +} + namespace op { @@ -82,19 +97,22 @@ namespace op // Security checks if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - + timeNow("Start"); // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); - + timeNow("TensorRT forward"); // 2. Resize heat maps + merge different scales spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); + timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + spResizeAndMergeTensorRT->Forward_cpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); + timeNow("CudaCheck"); #else error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Resize heat Maps"); // 3. 
Get peaks by Non-Maximum Suppression spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY @@ -103,22 +121,31 @@ namespace op #else error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Peaks by nms"); // Get scale net to output const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; - + timeNow("Scale net to output"); // 4. Connecting body parts spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet spBodyPartConnectorTensorRT->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); // spBodyPartConnectorTensorRT->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + timeNow("Connect Body Parts"); + + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. Total time: " + totalTimeSec + "."; + op::log(message, op::Priority::High); + + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); + op::log(log_time, op::Priority::High); + } } catch (const std::exception& e) {
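For reference, the instrumentation this patch adds to both extractors boils down to the following self-contained sketch; the `main()` driver and its stage labels are illustrative stand-ins, not OpenPose code. Each `timeNow()` call appends a label/time-point pair, the total is the last checkpoint minus the first, and each per-stage figure is a checkpoint minus its predecessor:

    #include <chrono>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;

    static OpTimings timings;

    // Append one labelled checkpoint.
    static void timeNow(const std::string& label)
    {
        timings.push_back(std::make_pair(label, std::chrono::high_resolution_clock::now()));
    }

    // Elapsed time between two checkpoints, formatted in milliseconds.
    static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1,
                                        const std::chrono::high_resolution_clock::time_point& t2)
    {
        return std::to_string(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms";
    }

    int main()
    {
        timeNow("Start");
        // ... stage 1 (e.g. the network forward pass) would run here ...
        timeNow("Forward");
        // ... stage 2 (e.g. resize + NMS) would run here ...
        timeNow("Postprocessing");

        // Total: last checkpoint minus first.
        std::cout << "Total time: " << timeDiffToString(timings.back().second, timings.front().second) << std::endl;
        // Per stage: each checkpoint minus the one before it.
        for (OpTimings::iterator timing = timings.begin() + 1; timing != timings.end(); ++timing)
            std::cout << timing->first << " - " << timeDiffToString(timing->second, (timing - 1)->second) << std::endl;
        return 0;
    }

One caveat of the pattern as written: the checkpoints live in a file-scope static vector that is never cleared, so across repeated forward passes the vector keeps growing and the "total" spans every frame processed so far, not just the current one.

From 8023fb1e570226ba7163d13a5426d8a1bfd94d3d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 15:55:31 +0000 Subject: [PATCH 34/52] Faster Resize and Merge.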
--- src/openpose/core/resizeAndMergeBase.cu | 60 +++++++++------------ src/openpose/pose/poseExtractorTensorRT.cpp | 2 +- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/src/openpose/core/resizeAndMergeBase.cu b/src/openpose/core/resizeAndMergeBase.cu index b60b6b11c..6b551e815 100644 --- a/src/openpose/core/resizeAndMergeBase.cu +++ b/src/openpose/core/resizeAndMergeBase.cu @@ -7,18 +7,15 @@ namespace op const auto THREADS_PER_BLOCK_1D = 16u; template - __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, - const int targetHeight) + __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight, const T invScaleWidth, const T invScaleHeight) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; if (x < targetWidth && y < targetHeight) { - const auto scaleWidth = targetWidth / T(sourceWidth); - const auto scaleHeight = targetHeight / T(sourceHeight); - const T xSource = (x + 0.5f) / scaleWidth - 0.5f; - const T ySource = (y + 0.5f) / scaleHeight - 0.5f; + const T xSource = (x + 0.5f) * invScaleWidth - 0.5f; + const T ySource = (y + 0.5f) * invScaleHeight - 0.5f; targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); } @@ -30,7 +27,14 @@ namespace op { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + + const auto currentWidth = sourceWidth; + const auto currentHeight = sourceHeight; + const auto scaleWidth = targetWidth / currentWidth; + const auto scaleHeight = targetHeight / currentHeight; + + if (x < targetWidth && y < targetHeight) { auto& targetPixel = targetPtr[y*targetWidth+x]; @@ -38,17 +42,11 @@ namespace op // targetPixel = -1000.f; // For fastMax for (auto n = 0; n < num; n++) { - const auto currentWidth = sourceWidth * scaleRatios[n]; - const auto currentHeight = sourceHeight * scaleRatios[n]; - - const auto scaleWidth = targetWidth / currentWidth; - const auto scaleHeight = targetHeight / currentHeight; const T xSource = (x + 0.5f) / scaleWidth - 0.5f; const T ySource = (y + 0.5f) / scaleHeight - 0.5f; const T* const sourcePtrN = sourcePtr + n * sourceNumOffset; - const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, intRound(currentWidth), - intRound(currentHeight), sourceWidth); + const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); targetPixel += interpolated; // targetPixel = fastMax(targetPixel, interpolated); } @@ -73,44 +71,38 @@ namespace op const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x), getNumberCudaBlocks(targetHeight, threadsPerBlock.y)}; const auto sourceChannelOffset = sourceHeight * sourceWidth; const auto targetChannelOffset = targetWidth * targetHeight; - + const auto scaleWidth = sourceWidth/T(targetWidth); + const auto scaleHeight = sourceHeight/T(targetHeight); // No multi-scale merging - if (targetSize[0] > 1) + /*if (targetSize[0] > 1) { for (auto n = 0; n < num; n++) - { - const auto offsetBase = n*channels; + {*/ for (auto c = 0 ; c < channels ; c++) { - const auto offset = offsetBase + c; - resizeKernel<<>>(targetPtr + offset * targetChannelOffset, - sourcePtr + offset * sourceChannelOffset, - sourceWidth, sourceHeight, targetWidth, targetHeight); + 
resizeKernel<<>>(targetPtr + c * targetChannelOffset, + sourcePtr + c * sourceChannelOffset, + sourceWidth, sourceHeight, targetWidth, targetHeight, scaleWidth, scaleHeight); } +/* } } // Multi-scale merging else { - // If scale_number > 1 --> scaleRatios must be set - if (scaleRatios.size() != num) - error("The scale ratios size must be equal than the number of scales.", __LINE__, __FUNCTION__, __FILE__); - const auto maxScales = 10; - if (scaleRatios.size() > maxScales) - error("The maximum number of scales is " + std::to_string(maxScales) + ".", __LINE__, __FUNCTION__, __FILE__); - // Copy scaleRatios - T* scaleRatiosGpuPtr; - cudaMalloc((void**)&scaleRatiosGpuPtr, maxScales * sizeof(T)); - cudaMemcpy(scaleRatiosGpuPtr, scaleRatios.data(), scaleRatios.size() * sizeof(T), cudaMemcpyHostToDevice); + const auto currentWidth = sourceWidth; + const auto currentHeight = sourceHeight; + + const auto scaleWidth = targetWidth / currentWidth; + const auto scaleHeight = targetHeight / currentHeight; + // Perform resize + merging const auto sourceNumOffset = channels * sourceChannelOffset; for (auto c = 0 ; c < channels ; c++) resizeKernelAndMerge<<>>(targetPtr + c * targetChannelOffset, sourcePtr + c * sourceChannelOffset, sourceNumOffset, num, scaleRatiosGpuPtr, sourceWidth, sourceHeight, targetWidth, targetHeight); - // Free memory - cudaFree(scaleRatiosGpuPtr); - } + }*/ cudaCheck(__LINE__, __FUNCTION__, __FILE__); } diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 155a1f425..a367110bd 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -105,7 +105,7 @@ spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - spResizeAndMergeTensorRT->Forward_cpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); timeNow("CudaCheck"); #else
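The essence of the kernel change in this patch, as a minimal CPU-side sketch (names illustrative, not the actual CUDA code): the old kernel derived the scale and then divided for every target pixel, while the new signature receives precomputed reciprocals (invScaleWidth, invScaleHeight) from the host and only multiplies inside the hot loop. Both mappings are algebraically identical:

    #include <cstdio>

    // Old form: a scale is derived from, and divided by, once per pixel.
    static float xSourceNaive(int x, int sourceWidth, int targetWidth)
    {
        const float scaleWidth = targetWidth / (float)sourceWidth;
        return (x + 0.5f) / scaleWidth - 0.5f;
    }

    // New form: the host precomputes the reciprocal once per kernel launch.
    static float xSourceHoisted(int x, float invScaleWidth)
    {
        return (x + 0.5f) * invScaleWidth - 0.5f;
    }

    int main()
    {
        const int sourceWidth = 46, targetWidth = 368;                // illustrative net-output to frame upscale
        const float invScaleWidth = sourceWidth / (float)targetWidth; // hoisted out of the per-pixel loop
        for (int x = 0; x < 3; x++)                                   // both mappings agree
            std::printf("%.4f %.4f\n", xSourceNaive(x, sourceWidth, targetWidth), xSourceHoisted(x, invScaleWidth));
        return 0;
    }

Note that the multi-scale merging branch is simply commented out here rather than ported, so this version only handles a single scale; the hunk below also restores the Forward_gpu call that PATCH 33 had accidentally switched to Forward_cpu, and PATCH 37 further down reverts these kernel changes altogether.

From ec58a48c8ddba7ab2406464673ec1b539638d27f Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 11 Oct 2017 21:58:53 +0200 Subject: [PATCH 35/52] TENSORRT precompiler guards --- Makefile | 6 +++++- include/openpose/core/netTensorRT.hpp | 4 ++-- include/openpose/pose/poseExtractorTensorRT.hpp | 4 ++-- include/openpose/wrapper/wrapper.hpp | 4 ++++ src/openpose/core/netTensorRT.cpp | 4 ++-- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 7bc109ae7..061fa8de7 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,8 @@ ifeq ($(DEEP_NET), tensorflow) # Torch else ifeq ($(DEEP_NET), torch) # COMMON_FLAGS += -DUSE_TORCH +else ifeq ($(DEEP_NET), tensorrt) + COMMON_FLAGS += -DUSE_TENSORRT # Caffe else COMMON_FLAGS += -DUSE_CAFFE @@ -146,7 +148,9 @@ ifeq ($(USE_CUDA), 1) endif # TensorRT -LIBRARIES += nvinfer nvcaffe_parser +ifeq ($(DEEP_NET), tensorrt) + LIBRARIES += nvinfer nvcaffe_parser +endif # LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 caffe LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 00e176ab0..0eaaaf7d3 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #ifndef OPENPOSE_CORE_NET_TENSORRT_HPP #define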
OPENPOSE_CORE_NET_TENSORRT_HPP @@ -57,4 +57,4 @@ namespace op } #endif // OPENPOSE_CORE_NET_TENSORRT_HPP -#endif +#endif // USE_TENSORRT diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 270d2a8f4..f358d03ca 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP @@ -49,4 +49,4 @@ namespace op } #endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP -#endif +#endif // USE_TENSORRT diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp index b063e971c..bb8d54d91 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -638,7 +638,11 @@ namespace op { // Pose estimators for (auto gpuId = 0; gpuId < gpuNumber; gpuId++) +#ifndef USE_TENSORRT poseExtractors.emplace_back(std::make_shared( +#else + poseExtractors.emplace_back(std::make_shared( +#endif poseNetInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, modelFolder, gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale, diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 6d2c81293..8894aeac3 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #include // std::accumulate #include #include @@ -307,4 +307,4 @@ namespace op } } -#endif +#endif // USE_TENSORRT From 33aa099dbead80be092340e6d0d0eb0e62fbdb25 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 11 Oct 2017 22:09:14 +0200 Subject: [PATCH 36/52] TENSORRT compilation is still partly using caffe --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 061fa8de7..2de2c73a7 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,10 @@ else LDFLAGS += -Wl,-rpath=$(CAFFE_DIR)/lib INCLUDE_DIRS += $(CAFFE_DIR)/include LIBRARY_DIRS += $(CAFFE_DIR)/lib + + ifeq ($(DEEP_NET), tensorrt) + COMMON_FLAGS += -DUSE_TENSORRT + endif endif ############################## From 359b601fdbaec3c4fcb44093dc76d03ae57a3d2b Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 11 Oct 2017 23:35:41 +0200 Subject: [PATCH 37/52] Missing guards for TensorRT --- .../3_extract_from_image_TensorRT.cpp | 3 + src/openpose/core/resizeAndMergeBase.cpp | 71 ++++++++++++------- src/openpose/core/resizeAndMergeBase.cu | 34 +++++---- src/openpose/pose/poseExtractorTensorRT.cpp | 2 +- 4 files changed, 65 insertions(+), 45 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index adba661b0..4a522fbc2 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -69,6 +69,7 @@ static std::string timeDiffToString(const std::chrono::high_resolution_clock::ti int openPoseTutorialPose3() { +#ifdef USE_TENSORRT op::log("Starting pose estimation.", op::Priority::High); timeNow("Start"); @@ -153,6 +154,7 @@ int openPoseTutorialPose3() op::log(log_time, op::Priority::High); } +#endif // USE_TENSORRT // Return successful message return 0; @@ -169,3 +171,4 @@ int main(int argc, char *argv[]) // Running openPoseTutorialPose1 return openPoseTutorialPose3(); } + diff --git 
a/src/openpose/core/resizeAndMergeBase.cpp b/src/openpose/core/resizeAndMergeBase.cpp index 2a47de7fe..b825bb311 100644 --- a/src/openpose/core/resizeAndMergeBase.cpp +++ b/src/openpose/core/resizeAndMergeBase.cpp @@ -1,8 +1,6 @@ -#include -#include +// #include #include - namespace op { template @@ -11,29 +9,50 @@ namespace op { try { - const int num = sourceSize[0]; - const int channels = sourceSize[1]; - const int sourceHeight = sourceSize[2]; - const int sourceWidth = sourceSize[3]; - const int targetHeight = targetSize[2]; - const int targetWidth = targetSize[3]; - - const auto sourceChannelOffset = sourceHeight * sourceWidth; - const auto targetChannelOffset = targetWidth * targetHeight; - - // Perform resize + merging - const auto sourceNumOffset = channels * sourceChannelOffset; - for (auto c = 0 ; c < channels ; c++) { - cv::Mat target (targetHeight, targetWidth, CV_32F, (void*)(targetPtr + c * targetChannelOffset)); - cv::multiply(target, 0.f, target); - cv::Mat t; - for (auto n = 0; n < num; n++) { - cv::Mat source(std::rint(sourceHeight * scaleRatios[n]), std::rint(sourceWidth * scaleRatios[n]), CV_32F, (void*)(sourcePtr + c * sourceChannelOffset + n * sourceNumOffset)); - cv::resize(source, t, cv::Size(targetWidth, targetHeight), 0., 0., cv::INTER_CUBIC); - cv::add(target, t, target); - } - cv::divide(target, (float)num, target); - } + UNUSED(targetPtr); + UNUSED(sourcePtr); + UNUSED(scaleInputToNetInputs); + UNUSED(targetSize); + UNUSED(sourceSize); + error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__); + + // TODO: THIS CODE IS WORKING, BUT IT DOES NOT CONSIDER THE SCALES (I.E. SCALE NUMBER, START AND GAP) + // const int num = bottom->shape(0); + // const int channel = bottom->shape(1); + // const int sourceHeight = bottom->shape(2); + // const int sourceWidth = bottom->shape(3); + // const int targetHeight = top->shape(2); + // const int targetWidth = top->shape(3); + + // //stupid method + // for (int n = 0; n < num; n++) + // { + // for (int c = 0; c < channel; c++) + // { + // //fill source + // cv::Mat source(sourceWidth, sourceHeight, CV_32FC1); + // const auto sourceOffsetChannel = sourceHeight * sourceWidth; + // const auto sourceOffsetNum = sourceOffsetChannel * channel; + // const auto sourceOffset = n*sourceOffsetNum + c*sourceOffsetChannel; + // const T* const sourcePtr = bottom->cpu_data(); + // for (int y = 0; y < sourceHeight; y++) + // for (int x = 0; x < sourceWidth; x++) + // source.at(x,y) = sourcePtr[sourceOffset + y*sourceWidth + x]; + + // // spatial resize + // cv::Mat target; + // cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC); + + // //fill top + // const auto targetOffsetChannel = targetHeight * targetWidth; + // const auto targetOffsetNum = targetOffsetChannel * channel; + // const auto targetOffset = n*targetOffsetNum + c*targetOffsetChannel; + // T* targetPtr = top->mutable_cpu_data(); + // for (int y = 0; y < targetHeight; y++) + // for (int x = 0; x < targetWidth; x++) + // targetPtr[targetOffset + y*targetWidth + x] = target.at(x,y); + // } + // } } catch (const std::exception& e) { diff --git a/src/openpose/core/resizeAndMergeBase.cu b/src/openpose/core/resizeAndMergeBase.cu index 7742b75c9..d7900aa24 100644 --- a/src/openpose/core/resizeAndMergeBase.cu +++ b/src/openpose/core/resizeAndMergeBase.cu @@ -7,15 +7,18 @@ namespace op const auto THREADS_PER_BLOCK_1D = 16u; template - __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int 
sourceHeight, const int targetWidth, const int targetHeight, const T invScaleWidth, const T invScaleHeight) + __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, + const int targetHeight) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; if (x < targetWidth && y < targetHeight) { - const T xSource = (x + 0.5f) * invScaleWidth - 0.5f; - const T ySource = (y + 0.5f) * invScaleHeight - 0.5f; + const auto scaleWidth = targetWidth / T(sourceWidth); + const auto scaleHeight = targetHeight / T(sourceHeight); + const T xSource = (x + 0.5f) / scaleWidth - 0.5f; + const T ySource = (y + 0.5f) / scaleHeight - 0.5f; targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); } @@ -27,14 +30,7 @@ namespace op { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; - - const auto currentWidth = sourceWidth; - const auto currentHeight = sourceHeight; - const auto scaleWidth = targetWidth / currentWidth; - const auto scaleHeight = targetHeight / currentHeight; - - if (x < targetWidth && y < targetHeight) { auto& targetPixel = targetPtr[y*targetWidth+x]; @@ -51,7 +47,8 @@ namespace op const T ySource = (y + 0.5f) / scaleHeight - 0.5f; const T* const sourcePtrN = sourcePtr + n * sourceNumOffset; - const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); + const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, intRound(currentWidth), + intRound(currentHeight), sourceWidth); targetPixel += interpolated; // targetPixel = fastMax(targetPixel, interpolated); } @@ -76,18 +73,19 @@ namespace op const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x), getNumberCudaBlocks(targetHeight, threadsPerBlock.y)}; const auto sourceChannelOffset = sourceHeight * sourceWidth; const auto targetChannelOffset = targetWidth * targetHeight; - const auto scaleWidth = sourceWidth/T(targetWidth); - const auto scaleHeight = sourceHeight/T(targetHeight); + // No multi-scale merging - /*if (targetSize[0] > 1) + if (targetSize[0] > 1) { for (auto n = 0; n < num; n++) - {*/ + { + const auto offsetBase = n*channels; for (auto c = 0 ; c < channels ; c++) { - resizeKernel<<>>(targetPtr + c * targetChannelOffset, - sourcePtr + c * sourceChannelOffset, - sourceWidth, sourceHeight, targetWidth, targetHeight, scaleWidth, scaleHeight); + const auto offset = offsetBase + c; + resizeKernel<<>>(targetPtr + offset * targetChannelOffset, + sourcePtr + offset * sourceChannelOffset, + sourceWidth, sourceHeight, targetWidth, targetHeight); } } } diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index a367110bd..744997001 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #include #include #include From d4a89d05bc39801ae0fbc47ce0d0b520c53c5655 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 17:02:08 +0000 Subject: [PATCH 38/52] =?UTF-8?q?PIMPL=C2=A0version=20of=20poseExtractorTe?= =?UTF-8?q?nsorRT,=20still=20having=20template=20compilation=20errors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 - include/openpose/pose/headers.hpp | 5 +- 
.../openpose/pose/poseExtractorTensorRT.hpp | 40 +++--- include/openpose/wrapper/wrapper.hpp | 6 +- src/openpose/pose/poseExtractorTensorRT.cpp | 124 +++++++++++++----- .../Makefile.config.Ubuntu16_cuda8_JetsonTX2 | 2 +- .../install_openpose_JetsonTX2_JetPack3.1.sh | 1 - 7 files changed, 113 insertions(+), 67 deletions(-) diff --git a/Makefile b/Makefile index 2de2c73a7..7bbc41229 100644 --- a/Makefile +++ b/Makefile @@ -61,8 +61,6 @@ ifeq ($(DEEP_NET), tensorflow) # Torch else ifeq ($(DEEP_NET), torch) # COMMON_FLAGS += -DUSE_TORCH -else ifeq ($(DEEP_NET), tensorrt) - COMMON_FLAGS += -DUSE_TENSORRT # Caffe else COMMON_FLAGS += -DUSE_CAFFE diff --git a/include/openpose/pose/headers.hpp b/include/openpose/pose/headers.hpp index c23a9e47b..9e23af7b7 100644 --- a/include/openpose/pose/headers.hpp +++ b/include/openpose/pose/headers.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,8 +16,4 @@ #include #include -#ifdef USE_TENSORRT - #include -#endif // USE_TENSORRT - #endif // OPENPOSE_POSE_HEADERS_HPP diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index f358d03ca..5695baba1 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,52 +1,48 @@ -#ifdef USE_TENSORRT #ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #include #include -#include -#include -#include -#include #include -#include namespace op { class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, - const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, - const ScaleMode heatMapScale = ScaleMode::ZeroToOne); + PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, + const Point& outputSize, const int scaleNumber, const PoseModel poseModel, + const std::string& modelFolder, const int gpuId, + const std::vector& heatMapTypes = {}, + const ScaleMode heatMapScale = ScaleMode::ZeroToOne, + const bool enableGoogleLogging = true); virtual ~PoseExtractorTensorRT(); void netInitializationOnThread(); - void forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios = {1.f}); + void forwardPass(const Array& inputNetData, const Point& inputDataSize, + const std::vector& scaleRatios = {1.f}); + const float* getHeatMapCpuConstPtr() const; const float* getHeatMapGpuConstPtr() const; + std::vector getHeatMapSize() const; + const float* getPoseGpuConstPtr() const; - private: - const float mResizeScale; - std::shared_ptr spNet; - std::shared_ptr> spResizeAndMergeTensorRT; - std::shared_ptr> spNmsTensorRT; - std::shared_ptr> spBodyPartConnectorTensorRT; - // Init with thread - boost::shared_ptr> spTensorRTNetOutputBlob; - std::shared_ptr> spHeatMapsBlob; - std::shared_ptr> spPeaksBlob; - std::shared_ptr> spPoseBlob; + private: + // PIMPL idiom + // http://www.cppsamples.com/common-tasks/pimpl.html + struct ImplPoseExtractorTensorRT; + std::unique_ptr<ImplPoseExtractorTensorRT> upImpl; + // PIMPL requires DELETE_COPY & destructor, or extra code + // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html DELETE_COPY(PoseExtractorTensorRT); }; } #endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP -#endif // USE_TENSORRT
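The PIMPL idiom referenced in the comment above, reduced to a minimal self-contained sketch (the Widget names are illustrative, not OpenPose classes): the public header exposes only a forward declaration plus a std::unique_ptr, so the heavy implementation headers stay confined to a single translation unit. The out-of-line destructor matters because std::unique_ptr must see the complete Impl type at the point where it deletes it:

    // widget.hpp: the public header leaks no implementation details.
    #include <memory>

    class Widget
    {
    public:
        Widget();
        ~Widget();                    // only declared; defined where Impl is complete
        void doWork();
    private:
        struct Impl;                  // forward declaration, layout hidden from clients
        std::unique_ptr<Impl> upImpl;
    };

    // widget.cpp: the only file that would need the heavy headers (Caffe, TensorRT, CUDA, ...).
    #include <iostream>

    struct Widget::Impl
    {
        int state = 0;                // stand-in for nets, blobs and GPU handles
    };

    Widget::Widget() : upImpl{new Impl{}} {}
    Widget::~Widget() = default;      // Impl is complete here, so unique_ptr can delete it
    void Widget::doWork() { std::cout << "state = " << ++upImpl->state << std::endl; }

This is also why the class above declares ~PoseExtractorTensorRT() in the header but defines it in the .cpp, and why copy operations are deleted via DELETE_COPY.

diff --git a/include/openpose/wrapper/wrapper.hpp 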
b/include/openpose/wrapper/wrapper.hpp index bb8d54d91..6370d3dfc 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -638,10 +638,10 @@ namespace op { // Pose estimators for (auto gpuId = 0; gpuId < gpuNumber; gpuId++) -#ifndef USE_TENSORRT - poseExtractors.emplace_back(std::make_shared( -#else +#ifdef USE_TENSORRT poseExtractors.emplace_back(std::make_shared( +#else + poseExtractors.emplace_back(std::make_shared( #endif poseNetInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, modelFolder, gpuId + gpuNumberStart, diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 744997001..ebbd40457 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,6 +1,10 @@ -#ifdef USE_TENSORRT -#include +#ifdef USE_CAFFE +#include +#endif #include +#include +#include +#include #include #include #include @@ -24,18 +28,51 @@ static std::string timeDiffToString(const std::chrono::high_resolution_clock::ti } -namespace op +nameupImpl->space op { - PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, - const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, - const ScaleMode heatMapScale) : - PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, + + struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT + { + #ifdef USE_TENSORRT // implies USE_CAFFE for now + const float upImpl->mResizeScale; + std::shared_ptr upImpl->spNet; + std::shared_ptr> upImpl->spResizeAndMergeTensorRT; + std::shared_ptr> upImpl->spNmsTensorRT; + std::shared_ptr> upImpl->spBodyPartConnectorTensorRT; + // Init with thread + boost::shared_ptr> upImpl->spTensorRTNetOutputBlob; + std::shared_ptr> upImpl->spHeatMapsBlob; + std::shared_ptr> upImpl->spPeaksBlob; + std::shared_ptr> upImpl->spPoseBlob; + + + ImplPoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, + const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const int gpuId, + const std::string& modelFolder, const bool enableGoogleLogging) : mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, - spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, - modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + spNet{std::make_shared(std::array{scaleNumber, 3, + (int)netInputSize.y, (int)netInputSize.x}, + modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, spBodyPartConnectorTensorRT{std::make_shared>()} + { + } + #endif + } + + PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, + const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const std::string& modelFolder, + const int gpuId, const std::vector& heatMapTypes, + const ScaleMode heatMapScale, const bool enableGoogleLogging) : + PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, + #ifdef USE_TENSORRT + , upImpl{new ImplPoseExtractorTensorRT{netInputSize, netOutputSize, scaleNumber, poseModel, + gpuId, modelFolder, enableGoogleLogging}} + #endif { try { @@ -62,24 +99,24 @@ namespace op 
// TensorRT net - spNet->initializationOnThread(); - spTensorRTNetOutputBlob = ((NetTensorRT*)spNet.get())->getOutputBlob(); + upImpl->spNet->initializationOnThread(); + upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer - spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - spResizeAndMergeTensorRT->Reshape({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer - spPeaksBlob = {std::make_shared>(1,1,1,1)}; - spNmsTensorRT->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer - spPoseBlob = {std::make_shared>(1,1,1,1)}; - spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); - spBodyPartConnectorTensorRT->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); + upImpl->spPoseBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); + upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); cudaCheck(__LINE__, __FUNCTION__, __FILE__); log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); @@ -99,13 +136,13 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); timeNow("Start"); // 1. TensorRT deep network - spNet->forwardPass(inputNetData.getConstPtr()); + upImpl->spNet->forwardPass(inputNetData.getConstPtr()); timeNow("TensorRT forward"); // 2. Resize heat maps + merge different scales - spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); + upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); timeNow("CudaCheck"); @@ -114,9 +151,9 @@ namespace op #endif timeNow("Resize heat Maps"); // 3. Get peaks by Non-Maximum Suppression - spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); + upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY - spNmsTensorRT->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms + upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); @@ -128,14 +165,14 @@ namespace op mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; timeNow("Scale net to output"); // 4. 
Connecting body parts - spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); - spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); + upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); // GPU version not implemented yet - spBodyPartConnectorTensorRT->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); - // spBodyPartConnectorTensorRT->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); + // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); timeNow("Connect Body Parts"); const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); @@ -158,7 +195,7 @@ namespace op try { checkThread(); - return spHeatMapsBlob->cpu_data(); + return upImpl->spHeatMapsBlob->cpu_data(); } catch (const std::exception& e) { @@ -172,7 +209,7 @@ namespace op try { checkThread(); - return spHeatMapsBlob->gpu_data(); + return upImpl->spHeatMapsBlob->gpu_data(); } catch (const std::exception& e) { @@ -181,13 +218,33 @@ namespace op } } + + std::vector PoseExtractorTensorRT::getHeatMapSize() const + { + try + { + #ifdef USE_CAFFE + checkThread(); + return upImpl->spHeatMapsBlob->shape(); + #else + return {}; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return {}; + } + } + + const float* PoseExtractorTensorRT::getPoseGpuConstPtr() const { try { error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); checkThread(); - return spPoseBlob->gpu_data(); + return upImpl->spPoseBlob->gpu_data(); } catch (const std::exception& e) { @@ -197,7 +254,6 @@ namespace op } } -#endif diff --git a/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 b/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 index 476fc9f92..d3241f886 100644 --- a/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 +++ b/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 @@ -53,7 +53,7 @@ CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \ # DEEP_NET choice: # caffe for Caffe (default and only option so far) -DEEP_NET := caffe +DEEP_NET := tensorrt # Caffe directory CAFFE_DIR := 3rdparty/caffe/distribute diff --git a/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh b/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh index 57d71638e..7387e3bba 100755 --- a/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh +++ b/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh @@ -51,7 +51,6 @@ echo "" echo "------------------------- Compiling OpenPose 
-------------------------" # Go back to main folder -cd .. # Copy Makefile.config cp ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 Makefile.config # Compile OpenPose From 766c44ae0970bd7fcdea32aa223098b7d633a8a0 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 18:13:41 +0100 Subject: [PATCH 39/52] Spot the differences part 1. --- include/openpose/pose/poseExtractorTensorRT.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 5695baba1..6d8f53f15 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,9 +1,9 @@ #ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP -#include #include #include +#include namespace op { From c12dd28af48ea3ec60a5cd5f89edf53bf7ba2266 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 18:23:41 +0100 Subject: [PATCH 40/52] Spot the differences part 2 --- src/openpose/pose/poseExtractorTensorRT.cpp | 108 ++++++++++++-------- 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index ebbd40457..19f538211 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,5 +1,5 @@ #ifdef USE_CAFFE -#include + #include #endif #include #include @@ -28,38 +28,38 @@ static std::string timeDiffToString(const std::chrono::high_resolution_clock::ti } -nameupImpl->space op +namespace op { struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT { #ifdef USE_TENSORRT // implies USE_CAFFE for now - const float upImpl->mResizeScale; - std::shared_ptr upImpl->spNet; - std::shared_ptr> upImpl->spResizeAndMergeTensorRT; - std::shared_ptr> upImpl->spNmsTensorRT; - std::shared_ptr> upImpl->spBodyPartConnectorTensorRT; + const float mResizeScale; + std::shared_ptr spNet; + std::shared_ptr> spResizeAndMergeTensorRT; + std::shared_ptr> spNmsTensorRT; + std::shared_ptr> spBodyPartConnectorTensorRT; // Init with thread - boost::shared_ptr> upImpl->spTensorRTNetOutputBlob; - std::shared_ptr> upImpl->spHeatMapsBlob; - std::shared_ptr> upImpl->spPeaksBlob; - std::shared_ptr> upImpl->spPoseBlob; + boost::shared_ptr> spTensorRTNetOutputBlob; + std::shared_ptr> spHeatMapsBlob; + std::shared_ptr> spPeaksBlob; + std::shared_ptr> spPoseBlob; ImplPoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const int gpuId, const std::string& modelFolder, const bool enableGoogleLogging) : - mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, - spNet{std::make_shared(std::array{scaleNumber, 3, - (int)netInputSize.y, (int)netInputSize.x}, - modelFolder + POSE_PROTOTXT[(int)poseModel], - modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, - spResizeAndMergeTensorRT{std::make_shared>()}, - spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()} - { - } + mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, + spNet{std::make_shared(std::array{scaleNumber, 3, + (int)netInputSize.y, (int)netInputSize.x}, + modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + spResizeAndMergeTensorRT{std::make_shared>()}, + spNmsTensorRT{std::make_shared>()}, + spBodyPartConnectorTensorRT{std::make_shared>()} + { + } #endif } 
@@ -76,10 +76,27 @@ nameupImpl->space op { try { - const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; - const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); - if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) - error("Net input and output size must be proportional. resizeScaleCheck = " + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); + #ifdef USE_TENSORRT + const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; + const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); + if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) + error("Net input and output size must be proportional. resizeScaleCheck = " + + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); + // Layers parameters + upImpl->spBodyPartConnectorCaffe->setPoseModel(mPoseModel); + #else + UNUSED(netInputSize); + UNUSED(netOutputSize); + UNUSED(outputSize); + UNUSED(scaleNumber); + UNUSED(poseModel); + UNUSED(modelFolder); + UNUSED(gpuId); + UNUSED(heatMapTypes); + UNUSED(heatMapScale); + error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + " functionality.", __LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { @@ -97,29 +114,30 @@ nameupImpl->space op { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + #ifdef USE_TENSORRT + // TensorRT net + upImpl->spNet->initializationOnThread(); + upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - // TensorRT net - upImpl->spNet->initializationOnThread(); - upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // HeatMaps extractor blob and layer - upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // Pose extractor blob and layer - upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + // HeatMaps extractor blob and layer + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - // Pose extractor blob and layer - upImpl->spPoseBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); - upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + // Pose extractor blob and layer + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Pose extractor blob and layer + upImpl->spPoseBlob = 
{std::make_shared>(1,1,1,1)}; + upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); + upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { From 047d18b831df921c4fdf14951c24b58aecffde2d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 18:37:01 +0100 Subject: [PATCH 41/52] Spot the differences 3 --- src/openpose/pose/poseExtractorTensorRT.cpp | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 19f538211..df8782e97 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -226,8 +226,12 @@ namespace op { try { - checkThread(); - return upImpl->spHeatMapsBlob->gpu_data(); + #ifdef USE_CAFFE + checkThread(); + return upImpl->spHeatMapsBlob->gpu_data(); + #else + return nullptr; + #endif } catch (const std::exception& e) { @@ -236,7 +240,6 @@ namespace op } } - std::vector PoseExtractorTensorRT::getHeatMapSize() const { try @@ -255,14 +258,17 @@ namespace op } } - const float* PoseExtractorTensorRT::getPoseGpuConstPtr() const { try { - error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); - checkThread(); - return upImpl->spPoseBlob->gpu_data(); + #ifdef USE_CAFFE + error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + checkThread(); + return upImpl->spPoseBlob->gpu_data(); + #else + return nullptr; + #endif } catch (const std::exception& e) { @@ -271,7 +277,3 @@ namespace op } } } - - - - From 9e4d903880c9afd427b90b3d9b663ff3000e86ad Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 14 Nov 2017 06:02:22 +0000 Subject: [PATCH 42/52] Fixed compilation without TensorRT --- src/openpose/pose/poseExtractorTensorRT.cpp | 130 ++++++++++---------- 1 file changed, 68 insertions(+), 62 deletions(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index df8782e97..44e07831c 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -33,7 +33,7 @@ namespace op struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT { - #ifdef USE_TENSORRT // implies USE_CAFFE for now + #ifdef USE_TENSORRT // implies USE_TENSORRT for now const float mResizeScale; std::shared_ptr spNet; std::shared_ptr> spResizeAndMergeTensorRT; @@ -54,21 +54,21 @@ namespace op spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, modelFolder + POSE_PROTOTXT[(int)poseModel], - modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId, enableGoogleLogging)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, spBodyPartConnectorTensorRT{std::make_shared>()} { } #endif - } + }; PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, const ScaleMode heatMapScale, const bool enableGoogleLogging) : - PoseExtractor{netOutputSize, outputSize, 
poseModel, heatMapTypes, heatMapScale}, + PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale} #ifdef USE_TENSORRT , upImpl{new ImplPoseExtractorTensorRT{netInputSize, netOutputSize, scaleNumber, poseModel, gpuId, modelFolder, enableGoogleLogging}} @@ -94,7 +94,7 @@ namespace op UNUSED(gpuId); UNUSED(heatMapTypes); UNUSED(heatMapScale); - error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + error("OpenPose must be compiled with the `USE_TENSORRT` macro definition in order to use this" " functionality.", __LINE__, __FUNCTION__, __FILE__); #endif } @@ -149,58 +149,60 @@ namespace op { try { - // Security checks - if (inputNetData.empty()) - error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - timeNow("Start"); - // 1. TensorRT deep network - upImpl->spNet->forwardPass(inputNetData.getConstPtr()); - timeNow("TensorRT forward"); - // 2. Resize heat maps + merge different scales - upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); - timeNow("SpResizeAndMergeTensorRT"); - #ifndef CPU_ONLY - upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms - timeNow("RaM forward_gpu"); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - timeNow("CudaCheck"); - #else - error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); - #endif - timeNow("Resize heat Maps"); - // 3. Get peaks by Non-Maximum Suppression - upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); - #ifndef CPU_ONLY - upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - #else - error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); - #endif - timeNow("Peaks by nms"); - // Get scale net to output - const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); - const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; - mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; - timeNow("Scale net to output"); - // 4. Connecting body parts - upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); - upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet - upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); - // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); - timeNow("Connect Body Parts"); - - const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); - const auto message = "Pose estimation successfully finished. 
Total time: " + totalTimeSec + " seconds."; - op::log(message, op::Priority::High); + #ifdef USE_TENSORRT + // Security checks + if (inputNetData.empty()) + error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + timeNow("Start"); + // 1. TensorRT deep network + upImpl->spNet->forwardPass(inputNetData.getConstPtr()); + timeNow("TensorRT forward"); + // 2. Resize heat maps + merge different scales + upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); + timeNow("SpResizeAndMergeTensorRT"); + #ifndef CPU_ONLY + upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms + timeNow("RaM forward_gpu"); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + timeNow("CudaCheck"); + #else + error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + timeNow("Resize heat Maps"); + // 3. Get peaks by Non-Maximum Suppression + upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); + #ifndef CPU_ONLY + upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else + error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + timeNow("Peaks by nms"); + // Get scale net to output + const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); + const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; + mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; + timeNow("Scale net to output"); + // 4. Connecting body parts + upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); + upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + // GPU version not implemented yet + upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); + // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); + timeNow("Connect Body Parts"); + + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. 
Total time: " + totalTimeSec + " seconds."; + op::log(message, op::Priority::High); - for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { - const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); - op::log(log_time, op::Priority::High); - } + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); + op::log(log_time, op::Priority::High); + } + #endif } catch (const std::exception& e) { @@ -211,9 +213,13 @@ namespace op const float* PoseExtractorTensorRT::getHeatMapCpuConstPtr() const { try - { - checkThread(); - return upImpl->spHeatMapsBlob->cpu_data(); + { + #ifdef USE_TENSORRT + checkThread(); + return upImpl->spHeatMapsBlob->cpu_data(); + #else + return nullptr; + #endif } catch (const std::exception& e) { @@ -226,7 +232,7 @@ namespace op { try { - #ifdef USE_CAFFE + #ifdef USE_TENSORRT checkThread(); return upImpl->spHeatMapsBlob->gpu_data(); #else @@ -244,7 +250,7 @@ namespace op { try { - #ifdef USE_CAFFE + #ifdef USE_TENSORRT checkThread(); return upImpl->spHeatMapsBlob->shape(); #else @@ -262,7 +268,7 @@ namespace op { try { - #ifdef USE_CAFFE + #ifdef USE_TENSORRT error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); checkThread(); return upImpl->spPoseBlob->gpu_data(); From b3655d0fcde1b87c96f9ee65219937579831a047 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 14 Nov 2017 06:20:33 +0000 Subject: [PATCH 43/52] Fix attempt --- include/openpose/pose/poseExtractorTensorRT.hpp | 4 +--- include/openpose/wrapper/wrapper.hpp | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 6d8f53f15..09bd43383 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -10,9 +10,7 @@ namespace op class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, - const Point& outputSize, const int scaleNumber, const PoseModel poseModel, - const std::string& modelFolder, const int gpuId, + PoseExtractorTensorRT(const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, const ScaleMode heatMapScale = ScaleMode::ZeroToOne, const bool enableGoogleLogging = true); diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp index 6370d3dfc..893cd7dd6 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -643,7 +643,6 @@ namespace op #else poseExtractors.emplace_back(std::make_shared( #endif - poseNetInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, modelFolder, gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale, wrapperStructPose.enableGoogleLogging From e76dc7194ad2b38be6a7cc7d3dae9bc35e18cf4f Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 14 Nov 2017 10:48:19 +0000 Subject: [PATCH 44/52] Wrong variable name --- src/openpose/pose/poseExtractorTensorRT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 44e07831c..5675c5bad 100644 --- 
a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -50,7 +50,7 @@ namespace op
                                   const Point<int>& outputSize, const int scaleNumber,
                                   const PoseModel poseModel, const int gpuId,
                                   const std::string& modelFolder, const bool enableGoogleLogging) :
-            mResizeScale{mNetOutputSize.x / (float)netInputSize.x},
+            mResizeScale{netOutputSize.x / (float)netInputSize.x},
             spNet{std::make_shared<NetTensorRT>(std::array<int,4>{scaleNumber, 3,
                                                 (int)netInputSize.y, (int)netInputSize.x},
                                                 modelFolder + POSE_PROTOTXT[(int)poseModel],

From 6456dff7137aa775de0cf6dde34df31190c9c1c9 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Tue, 14 Nov 2017 13:52:34 +0000
Subject: [PATCH 45/52] Too much has changed in poseExtractorCaffe; rewrite
 the TensorRT extractor from scratch.

---
 src/openpose/pose/poseExtractorTensorRT.cpp | 96 +++++++++++++++++----
 1 file changed, 80 insertions(+), 16 deletions(-)

diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index 5675c5bad..c9afbff08 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -34,27 +34,28 @@ namespace op
     struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT
     {
         #ifdef USE_TENSORRT // implies USE_TENSORRT for now
-            const float mResizeScale;
-            std::shared_ptr<Net> spNet;
+            const PoseModel mPoseModel;
+            const int mGpuId;
+            const std::string mModelFolder;
+            const bool mEnableGoogleLogging;
+            // General parameters
+            std::vector<std::shared_ptr<NetTensorRT>> spTensorRTNets;
             std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeTensorRT;
             std::shared_ptr<NmsCaffe<float>> spNmsTensorRT;
             std::shared_ptr<BodyPartConnectorCaffe<float>> spBodyPartConnectorTensorRT;
             // Init with thread
-            boost::shared_ptr<caffe::Blob<float>> spTensorRTNetOutputBlob;
+            std::vector<boost::shared_ptr<caffe::Blob<float>>> spTensorRTNetOutputBlobs;
             std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
             std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
             std::shared_ptr<caffe::Blob<float>> spPoseBlob;
 
-        ImplPoseExtractorTensorRT(const Point<int>& netInputSize, const Point<int>& netOutputSize,
-                                  const Point<int>& outputSize, const int scaleNumber,
-                                  const PoseModel poseModel, const int gpuId,
+        ImplPoseExtractorTensorRT(const PoseModel poseModel, const int gpuId,
                                   const std::string& modelFolder, const bool enableGoogleLogging) :
-            mResizeScale{netOutputSize.x / (float)netInputSize.x},
-            spNet{std::make_shared<NetTensorRT>(std::array<int,4>{scaleNumber, 3,
-                                                (int)netInputSize.y, (int)netInputSize.x},
-                                                modelFolder + POSE_PROTOTXT[(int)poseModel],
-                                                modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId, enableGoogleLogging)},
+            mPoseModel{poseModel},
+            mGpuId{gpuId},
+            mModelFolder{modelFolder},
+            mEnableGoogleLoggin{enableGoogleLogging},
             spResizeAndMergeTensorRT{std::make_shared<ResizeAndMergeCaffe<float>>()},
             spNmsTensorRT{std::make_shared<NmsCaffe<float>>()},
             spBodyPartConnectorTensorRT{std::make_shared<BodyPartConnectorCaffe<float>>()}
         {
         }
         #endif
     };
 
+    inline void reshapePoseExtractorCaffe(std::shared_ptr<ResizeAndMergeCaffe<float>>& resizeAndMergeCaffe,
+                                          std::shared_ptr<NmsCaffe<float>>& nmsCaffe,
+                                          std::shared_ptr<BodyPartConnectorCaffe<float>>& bodyPartConnectorCaffe,
+                                          std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob,
+                                          std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
+                                          std::shared_ptr<caffe::Blob<float>>& peaksBlob,
+                                          std::shared_ptr<caffe::Blob<float>>& poseBlob,
+                                          const float scaleInputToNetInput,
+                                          const PoseModel poseModel)
+    {
+        try
+        {
+            // HeatMaps extractor blob and layer
+            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
+            resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
+                                         POSE_CCN_DECREASE_FACTOR[(int)poseModel], 1.f/scaleInputToNetInput);
+            // Pose extractor blob and layer
+            nmsCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}, POSE_MAX_PEAKS[(int)poseModel]);
+            // Pose extractor blob and layer
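+            // (The three Reshape calls in this helper size the whole post-processing chain
+            // to the current net output: the resize-and-merge stage for the part heatmaps,
+            // the non-maximum-suppression stage that extracts peak candidates, and the
+            // body-part connector's output blob. POSE_CCN_DECREASE_FACTOR and
+            // POSE_MAX_PEAKS are per-model constants indexed by the pose model.)
+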
bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()}, {poseBlob.get()}); + // Cuda check + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + void addTensorRTNetOnThread(std::vector>& netTensorRT, + std::vector>>& caffeNetOutputBlob, + const PoseModel poseModel, const int gpuId, + const std::string& modelFolder, const bool enableGoogleLogging) + { + try + { + // Add Caffe Net + netTensorRT.emplace_back( + std::make_shared(modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], + gpuId, enableGoogleLogging) + ); + // Initializing them on the thread + netTensorRT.back()->initializationOnThread(); + caffeNetOutputBlob.emplace_back(netTensorRT.back()->getOutputBlob()); + // Security checks + if (netTensorRT.size() != caffeNetOutputBlob.size()) + error("Weird error, this should not happen. Notify us.", __LINE__, __FUNCTION__, __FILE__); + // Cuda check + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + #endif + PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, @@ -116,13 +180,13 @@ namespace op #ifdef USE_TENSORRT // TensorRT net - upImpl->spNet->initializationOnThread(); - upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); + upImpl->spTensorRTNets->initializationOnThread(); + upImpl->spTensorRTNetOutputBlobs = ((NetTensorRT*)upImpl->spTensorRTNets.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer @@ -155,13 +219,13 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); timeNow("Start"); // 1. TensorRT deep network - upImpl->spNet->forwardPass(inputNetData.getConstPtr()); + upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr()); timeNow("TensorRT forward"); // 2. 
Resize heat maps + merge different scales upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms + upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); timeNow("CudaCheck"); From b3673e6429f213b045f893a4ebed62c66c0dff84 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 15 Nov 2017 10:10:14 +0100 Subject: [PATCH 46/52] PIMPL for netTensorRT --- include/openpose/core/netTensorRT.hpp | 32 +- .../openpose/pose/poseExtractorTensorRT.hpp | 9 +- src/openpose/core/netTensorRT.cpp | 614 ++++++++++-------- src/openpose/pose/poseExtractorTensorRT.cpp | 269 +++++--- 4 files changed, 527 insertions(+), 397 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 0eaaaf7d3..0c0ae02e1 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -1,11 +1,10 @@ -#ifdef USE_TENSORRT #ifndef OPENPOSE_CORE_NET_TENSORRT_HPP #define OPENPOSE_CORE_NET_TENSORRT_HPP -#include #include #include + #include "NvInfer.h" namespace op @@ -31,30 +30,15 @@ namespace op boost::shared_ptr> getOutputBlob() const; private: - // Init with constructor - const int mGpuId; - const std::array mNetInputSize4D; - std::array mNetOutputSize4D; - const unsigned long mNetInputMemory; - const std::string mCaffeProto; - const std::string mCaffeTrainedModel; - const std::string mLastBlobName; - // Init with thread - - boost::shared_ptr> spInputBlob; - boost::shared_ptr> spOutputBlob; - - // TensorRT stuff - nvinfer1::ICudaEngine* cudaEngine; - nvinfer1::IExecutionContext* cudaContext; - nvinfer1::ICudaEngine* caffeToGIEModel(); - nvinfer1::ICudaEngine* createEngine(); - cudaStream_t stream; - cudaEvent_t start, end; - + // PIMPL idiom + // http://www.cppsamples.com/common-tasks/pimpl.html + struct ImplNetTensorRT; + std::unique_ptr upImpl; + + // PIMP requires DELETE_COPY & destructor, or extra code + // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html DELETE_COPY(NetTensorRT); }; } #endif // OPENPOSE_CORE_NET_TENSORRT_HPP -#endif // USE_TENSORRT diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 09bd43383..48f856e70 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -10,7 +10,7 @@ namespace op class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorTensorRT(const std::string& modelFolder, const int gpuId, + PoseExtractorTensorRT(const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, const ScaleMode heatMapScale = ScaleMode::ZeroToOne, const bool enableGoogleLogging = true); @@ -19,10 +19,9 @@ namespace op void netInitializationOnThread(); - void forwardPass(const Array& inputNetData, const Point& inputDataSize, - const std::vector& scaleRatios = {1.f}); + void forwardPass(const std::vector>& inputNetData, const Point& inputDataSize, + const std::vector& scaleInputToNetInputs = {1.f}); - const float* getHeatMapCpuConstPtr() const; const float* getHeatMapGpuConstPtr() const; @@ -31,7 +30,7 @@ namespace op const float* getPoseGpuConstPtr() const; - private: + private: // PIMPL idiom 
// http://www.cppsamples.com/common-tasks/pimpl.html struct ImplPoseExtractorTensorRT; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 8894aeac3..416d0c752 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -1,310 +1,390 @@ -#ifdef USE_TENSORRT #include // std::accumulate +#ifdef USE_TENSORRT + #include + #include + #include + #include // google::InitGoogleLogging +#endif #include +#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "NvInfer.h" -#include "NvCaffeParser.h" +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include -using namespace nvinfer1; -using namespace nvcaffeparser1; +#ifdef USE_TENSORRT + #include "NvInfer.h" + #include "NvCaffeParser.h" -std::vector gInputs; -std::map gInputDimensions; + using namespace nvinfer1; + using namespace nvcaffeparser1; +//std::vector gInputs; +//std::map gInputDimensions; +#endif // USE_TENSORRT // Logger for GIE info/warning/errors class Logger : public ILogger { - void log(Severity severity, const char* msg) override - { - // if suppress info-level message: if (severity != Severity::kINFO) - std::cout << msg << std::endl; - } + void log(Severity severity, const char* msg) override + { + // if suppress info-level message: if (severity != Severity::kINFO) + std::cout << msg << std::endl; + } } gLogger; - namespace op { - NetTensorRT::NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName) : - mGpuId{gpuId}, - // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++ - mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]}, - mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies()) * sizeof(float)}, - mCaffeProto{caffeProto + "_" + std::to_string(mNetInputSize4D[2]) + "x" + std::to_string(mNetInputSize4D[3])}, - mCaffeTrainedModel{caffeTrainedModel}, - mLastBlobName{lastBlobName} - { - std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; - CUDA_CHECK(cudaStreamCreate(&stream)); - CUDA_CHECK(cudaEventCreate(&start)); - CUDA_CHECK(cudaEventCreate(&end)); - } - - NetTensorRT::~NetTensorRT() - { - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); - - if (cudaEngine) - cudaEngine->destroy(); - } - - - ICudaEngine* NetTensorRT::caffeToGIEModel() - { - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), - mCaffeTrainedModel.c_str(), - *network, - DataType::kFLOAT); - - if (!blobNameToTensor) - return nullptr; - + std::mutex sMutexNetTensorRT; + std::atomic sGoogleLoggingInitialized{false}; - for (int i = 0, n = network->getNbInputs(); i < n; i++) + struct NetTensorRT::ImplNetTensorRT { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), 
dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - if( i > 0) - std::cerr << "Multiple output unsupported for now!"; - } + #ifdef USE_TENSORRT + // Init with constructor + const int mGpuId; + const std::string mCaffeProto; + const std::string mCaffeTrainedModel; + const std::string mLastBlobName; + std::vector mNetInputSize4D; + // Init with thread + boost::shared_ptr> spInputBlob; + boost::shared_ptr> spOutputBlob; + + // Init with constructor + //const std::array mNetInputSize4D; + //std::array mNetOutputSize4D; + //const unsigned long mNetInputMemory; + // Init with thread + + // TensorRT stuff + nvinfer1::ICudaEngine* cudaEngine; + nvinfer1::IExecutionContext* cudaContext; + nvinfer1::ICudaEngine* caffeToGIEModel(); + nvinfer1::ICudaEngine* createEngine(); + cudaStream_t stream; + cudaEvent_t start, end; - // Specify which tensor is output (multiple unsupported) - if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) - { - std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); + ImplNetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, + const bool enableGoogleLogging, const std::string& lastBlobName) : + mGpuId{gpuId}, + mCaffeProto{caffeProto}, // TODO, no size, how to proceed ? + mCaffeTrainedModel{caffeTrainedModel}, + mLastBlobName{lastBlobName} + { + const std::string message{".\nPossible causes:\n\t1. Not downloading the OpenPose trained models." + "\n\t2. Not running OpenPose from the same directory where the `model`" + " folder is located.\n\t3. Using paths with spaces."}; + if (!existFile(mCaffeProto)) + error("Prototxt file not found: " + mCaffeProto + message, __LINE__, __FUNCTION__, __FILE__); + if (!existFile(mCaffeTrainedModel)) + error("Caffe trained model file not found: " + mCaffeTrainedModel + message, + __LINE__, __FUNCTION__, __FILE__); + // Double if condition in order to speed up the program if it is called several times + if (enableGoogleLogging && !sGoogleLoggingInitialized) + { + std::lock_guard lock{sMutexNetTensorRT}; + if (enableGoogleLogging && !sGoogleLoggingInitialized) + { + google::InitGoogleLogging("OpenPose"); + sGoogleLoggingInitialized = true; + } + } + } + #endif + }; - for (int i = 0, n = network->getNbOutputs(); i < n; i++) +#ifdef USE_TENSORRT + ICudaEngine* NetTensorRT::caffeToGIEModel() { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), + mCaffeTrainedModel.c_str(), + *network, + DataType::kFLOAT); + + if (!blobNameToTensor) + return nullptr; + + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" 
<< dims.h() << "x" << dims.w() << std::endl; + if( i > 0) + std::cerr << "Multiple output unsupported for now!"; + } + + // Specify which tensor is output (multiple unsupported) + if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) + { + std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); + + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + // 16 megabytes, default in giexec. No idea what's best for Jetson though, + // maybe check dusty_nv's code on github + builder->setMaxWorkspaceSize(32<<20); + builder->setHalf2Mode(false); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + + return engine; } - - // Build the engine - builder->setMaxBatchSize(1); - // 16 megabytes, default in giexec. No idea what's best for Jetson though, - // maybe check dusty_nv's code on github - builder->setMaxWorkspaceSize(32<<20); - builder->setHalf2Mode(false); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - - return engine; - } - - inline bool file_exists(const std::string& file_path) { - struct stat buffer; - return (stat(file_path.c_str(), &buffer) == 0); - } - - ICudaEngine* NetTensorRT::createEngine() - { - ICudaEngine *engine; - - std::string serializedEnginePath = mCaffeProto + ".bin"; - std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl; - if (file_exists(serializedEnginePath)) + ICudaEngine* NetTensorRT::createEngine() { - std::cout << "Found serialized TensorRT engine, deserializing..." << std::endl; - char *gieModelStream{nullptr}; - size_t size{0}; - std::ifstream file(serializedEnginePath, std::ios::binary); - if (file.good()) - { - file.seekg(0, file.end); - size = file.tellg(); - file.seekg(0, file.beg); - gieModelStream = new char[size]; - assert(gieModelStream); - file.read(gieModelStream, size); - file.close(); - } - - IRuntime* infer = createInferRuntime(gLogger); - engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); - if (gieModelStream) delete [] gieModelStream; - - return engine; - } - else - { - engine = caffeToGIEModel(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - else // serialize engine - { - std::ofstream p(serializedEnginePath); - if (!p) + ICudaEngine *engine; + + std::string serializedEnginePath = mCaffeProto + ".bin"; + + std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl; + if (existFile(serializedEnginePath)) { - std::cerr << "could not serialize engine" << std::endl; + std::cout << "Found serialized TensorRT engine, deserializing..." 
<< std::endl; + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(serializedEnginePath, std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + if (gieModelStream) delete [] gieModelStream; + + return engine; } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - } + else + { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + else // serialize engine + { + std::ofstream p(serializedEnginePath); + if (!p) + { + std::cerr << "could not serialize engine" << std::endl; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + } + } + return engine; } - return engine; - } - - void NetTensorRT::initializationOnThread() - { - - std::cout << "InitializationOnThread : start" << std::endl; - try - { - - std::cout << "InitializationOnThread : setting device" << std::endl; - // Initialize net - cudaSetDevice(mGpuId); - - std::cout << "InitializationOnThread : creating engine" << std::endl; - - cudaEngine = createEngine(); - if (!cudaEngine) - { - std::cerr << "cudaEngine could not be created" << std::endl; - return; - } - - std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; - - cudaContext = cudaEngine->createExecutionContext(); - if (!cudaContext) - { - std::cerr << "cudaContext could not be created" << std::endl; - return; - } - - DimsCHW outputDims = static_cast(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); - mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; - - - std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; - - spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); - spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); - - std::cout << "InitializationOnThread : done" << std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - } - catch (const std::exception& e) - { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - } - } - - float* NetTensorRT::getInputDataCpuPtr() const - { - try - { - return spInputBlob->mutable_cpu_data(); - } - catch (const std::exception& e) - { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return nullptr; - } - } - - float* NetTensorRT::getInputDataGpuPtr() const - { - try + inline void reshapeNetTensorRT(caffe::Net* caffeNet, const std::vector& dimensions) { - return spInputBlob->mutable_gpu_data(); + try + { + caffeNet->blobs()[0]->Reshape(dimensions); + caffeNet->Reshape(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - catch (const std::exception& e) +#endif + + NetTensorRT::NetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, + const bool enableGoogleLogging, const std::string& lastBlobName) +#ifdef USE_TENSORRT + : upImpl{new 
ImplNetTensorRT{caffeProto, caffeTrainedModel, gpuId, enableGoogleLogging, + lastBlobName}} +#endif { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return nullptr; + try + { + #ifdef USE_TENSORRT + std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; + CUDA_CHECK(cudaStreamCreate(&stream)); + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&end)); + #else + UNUSED(netInputSize4D); + UNUSED(caffeProto); + UNUSED(caffeTrainedModel); + UNUSED(gpuId); + UNUSED(lastBlobName); + error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + " functionality.", __LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - } - - void NetTensorRT::forwardPass(const float* const inputData) const - { - try + + NetTensorRT::~NetTensorRT() { - const int batchSize = 1; - // Copy frame data to GPU memory - if (inputData != nullptr) - { - auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); - CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); - // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), - // of these, but in this case we know that there is exactly one input and one output. - std::vector buffers(2); - buffers[0] = spInputBlob->mutable_gpu_data(); - buffers[1] = spOutputBlob->mutable_gpu_data(); - - cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); - - //cudaCheck(__LINE__, __FUNCTION__, __FILE__); - } + if (cudaEngine) + cudaEngine->destroy(); } - catch (const std::exception& e) + + void NetTensorRT::initializationOnThread() { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); + std::cout << "InitializationOnThread : start" << std::endl; + try + { + #ifdef USE_TENSORRT + std::cout << "InitializationOnThread : setting device" << std::endl; + // Initialize net + cudaSetDevice(mGpuId); + + std::cout << "InitializationOnThread : creating engine" << std::endl; + + cudaEngine = createEngine(); + if (!cudaEngine) + { + std::cerr << "cudaEngine could not be created" << std::endl; + return; + } + + std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; + + cudaContext = cudaEngine->createExecutionContext(); + if (!cudaContext) + { + std::cerr << "cudaContext could not be created" << std::endl; + return; + } + + DimsCHW outputDims = static_cast(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); + mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; + + + std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; + + upImpl->spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); + upImpl->spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); + + std::cout << "InitializationOnThread : done" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - } - - boost::shared_ptr> NetTensorRT::getOutputBlob() const - { - std::cout << "Getting output blob." 
<< std::endl; - try + + void NetTensorRT::forwardPass(const Array& inputData) const { - return spOutputBlob; + try + { + #ifdef USE_TENSORRT + // Security checks + if (inputData.empty()) + error("The Array inputData cannot be empty.", __LINE__, __FUNCTION__, __FILE__); + if (inputData.getNumberDimensions() != 4 || inputData.getSize(1) != 3) + error("The Array inputData must have 4 dimensions: [batch size, 3 (RGB), height, width].", + __LINE__, __FUNCTION__, __FILE__); + // Reshape Caffe net if required + if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize())) + { + upImpl->mNetInputSize4D = inputData.getSize(); + reshapeNetTensorRT(upImpl->upCaffeNet.get(), inputData.getSize()); + } + + // Copy frame data to GPU memory + auto* gpuImagePtr = upImpl->spInputBlob->mutable_gpu_data(); + CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), mNetInputMemory, cudaMemcpyHostToDevice)); + + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. + std::vector buffers(2); + buffers[0] = upImpl->spInputBlob->mutable_gpu_data(); + buffers[1] = upImpl->spOutputBlob->mutable_gpu_data(); + + // Perform deep network forward pass + cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); + + // Cuda checks + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - catch (const std::exception& e) + + boost::shared_ptr> NetTensorRT::getOutputBlob() const { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return nullptr; + try + { + #ifdef USE_TENSORRT + return upImpl->spOutputBlob; + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } } - - std::cout << "Got something..." 
<< std::endl; - } } - -#endif // USE_TENSORRT + diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index c9afbff08..9ab8b2b96 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include typedef std::vector> OpTimings; @@ -40,39 +41,59 @@ namespace op const bool mEnableGoogleLogging; // General parameters std::vector> spTensorRTNets; - std::shared_ptr> spResizeAndMergeTensorRT; - std::shared_ptr> spNmsTensorRT; - std::shared_ptr> spBodyPartConnectorTensorRT; + std::shared_ptr> spResizeAndMergeCaffe; + std::shared_ptr> spNmsCaffe; + std::shared_ptr> spBodyPartConnectorCaffe; + std::vector> mNetInput4DSizes; + std::vector mScaleInputToNetInputs; // Init with thread std::vector>> spTensorRTNetOutputBlobs; std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; - ImplPoseExtractorTensorRT(const PoseModel poseModel, const int gpuId, const std::string& modelFolder, const bool enableGoogleLogging) : mPoseModel{poseModel}, mGpuId{gpuId}, mModelFolder{modelFolder}, - mEnableGoogleLoggin{enableGoogleLogging}, - spResizeAndMergeTensorRT{std::make_shared>()}, - spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()} + mEnableGoogleLogging{enableGoogleLogging}, + spResizeAndMergeCaffe{std::make_shared>()}, + spNmsCaffe{std::make_shared>()}, + spBodyPartConnectorCaffe{std::make_shared>()} { } #endif }; - inline void reshapePoseExtractorCaffe(std::shared_ptr>& resizeAndMergeCaffe, - std::shared_ptr>& nmsCaffe, - std::shared_ptr>& bodyPartConnectorCaffe, - std::vector>>& caffeNetOutputBlob, - std::shared_ptr>& heatMapsBlob, - std::shared_ptr>& peaksBlob, - std::shared_ptr>& poseBlob, - const float scaleInputToNetInput, - const PoseModel poseModel) + #ifdef USE_CAFFE + std::vector*> caffeNetSharedToPtr( + std::vector>>& caffeNetOutputBlob) + { + try + { + // Prepare spCaffeNetOutputBlobss + std::vector*> caffeNetOutputBlobs(caffeNetOutputBlob.size()); + for (auto i = 0u ; i < caffeNetOutputBlobs.size() ; i++) + caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get(); + return caffeNetOutputBlobs; + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return {}; + } + } + + inline void reshapePoseExtractorCaffe(std::shared_ptr>& resizeAndMergeCaffe, + std::shared_ptr>& nmsCaffe, + std::shared_ptr>& bodyPartConnectorCaffe, + std::vector>>& caffeNetOutputBlob, + std::shared_ptr>& heatMapsBlob, + std::shared_ptr>& peaksBlob, + std::shared_ptr>& poseBlob, + const float scaleInputToNetInput, + const PoseModel poseModel) { try { @@ -85,29 +106,29 @@ namespace op // Pose extractor blob and layer bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()}, {poseBlob.get()}); // Cuda check - #ifdef USE_CUDA - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - #endif + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } } - - void addTensorRTNetOnThread(std::vector>& netTensorRT, - std::vector>>& caffeNetOutputBlob, - const PoseModel poseModel, const int gpuId, - const std::string& modelFolder, const bool enableGoogleLogging) + + void addTensorRTNetOnThread(std::vector>& netTensorRT, + std::vector>>& caffeNetOutputBlob, + const PoseModel poseModel, const int gpuId, + const std::string& modelFolder, const bool enableGoogleLogging) { try { // 
Add Caffe Net netTensorRT.emplace_back( - std::make_shared(modelFolder + POSE_PROTOTXT[(int)poseModel], - modelFolder + POSE_TRAINED_MODEL[(int)poseModel], - gpuId, enableGoogleLogging) - ); + std::make_shared(modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], + gpuId, enableGoogleLogging) + ); // Initializing them on the thread netTensorRT.back()->initializationOnThread(); caffeNetOutputBlob.emplace_back(netTensorRT.back()->getOutputBlob()); @@ -115,9 +136,9 @@ namespace op if (netTensorRT.size() != caffeNetOutputBlob.size()) error("Weird error, this should not happen. Notify us.", __LINE__, __FUNCTION__, __FILE__); // Cuda check - #ifdef USE_CUDA - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - #endif + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { @@ -127,39 +148,27 @@ namespace op #endif - PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, - const Point& outputSize, const int scaleNumber, - const PoseModel poseModel, const std::string& modelFolder, + PoseExtractorTensorRT::PoseExtractorTensorRT(const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, const ScaleMode heatMapScale, const bool enableGoogleLogging) : - PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale} + PoseExtractor{poseModel, heatMapTypes, heatMapScale} #ifdef USE_TENSORRT - , upImpl{new ImplPoseExtractorTensorRT{netInputSize, netOutputSize, scaleNumber, poseModel, - gpuId, modelFolder, enableGoogleLogging}} + , upImpl{new ImplPoseExtractorTensorRT{poseModel, gpuId, modelFolder, enableGoogleLogging}} #endif { try { #ifdef USE_TENSORRT - const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; - const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); - if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) - error("Net input and output size must be proportional. 
resizeScaleCheck = " - + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); - // Layers parameters - upImpl->spBodyPartConnectorCaffe->setPoseModel(mPoseModel); + // Layers parameters + upImpl->spBodyPartConnectorCaffe->setPoseModel(mPoseModel); #else - UNUSED(netInputSize); - UNUSED(netOutputSize); - UNUSED(outputSize); - UNUSED(scaleNumber); - UNUSED(poseModel); - UNUSED(modelFolder); - UNUSED(gpuId); - UNUSED(heatMapTypes); - UNUSED(heatMapScale); - error("OpenPose must be compiled with the `USE_TENSORRT` macro definition in order to use this" - " functionality.", __LINE__, __FUNCTION__, __FILE__); + UNUSED(poseModel); + UNUSED(modelFolder); + UNUSED(gpuId); + UNUSED(heatMapTypes); + UNUSED(heatMapScale); + error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + " functionality.", __LINE__, __FUNCTION__, __FILE__); #endif } catch (const std::exception& e) @@ -176,30 +185,24 @@ namespace op { try { - log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); - #ifdef USE_TENSORRT - // TensorRT net - upImpl->spTensorRTNets->initializationOnThread(); - upImpl->spTensorRTNetOutputBlobs = ((NetTensorRT*)upImpl->spTensorRTNets.get())->getOutputBlob(); + + // Logging + log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Initialize Caffe net + addTensorRTNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel, + upImpl->mGpuId, upImpl->mModelFolder, upImpl->mEnableGoogleLogging); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // HeatMaps extractor blob and layer + + // Initialize blobs upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // Pose extractor blob and layer upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // Pose extractor blob and layer upImpl->spPoseBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); - upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - + + // Logging log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); #endif } @@ -217,45 +220,105 @@ namespace op // Security checks if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + for (const auto& inputNetDataI : inputNetData) + if (inputNetDataI.empty()) + error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + if (inputNetData.size() != scaleInputToNetInputs.size()) + error("Size(inputNetData) must be same than size(scaleInputToNetInputs).", + __LINE__, __FUNCTION__, __FILE__); + timeNow("Start"); - // 1. 
TensorRT deep network - upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr()); - timeNow("TensorRT forward"); + + // Resize std::vectors if required + const auto numberScales = inputNetData.size(); + upImpl->mNetInput4DSizes.resize(numberScales); + while (upImpl->spCaffeNets.size() < numberScales) + addCaffeNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel, + upImpl->mGpuId, upImpl->mModelFolder, false); + + // Process each image + for (auto i = 0u ; i < inputNetData.size(); i++) + { + // 1. TensorRT deep network + upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr()); + + // Reshape blobs if required + // Note: In order to resize to input size to have same results as Matlab, uncomment the commented + // lines + if (!vectorsAreEqual(upImpl->mNetInput4DSizes.at(i), inputNetData[i].getSize())) + // || !vectorsAreEqual(upImpl->mScaleInputToNetInputs, scaleInputToNetInputs)) + { + upImpl->mNetInput4DSizes.at(i) = inputNetData[i].getSize(); + mNetOutputSize = Point{upImpl->mNetInput4DSizes[0][3], + upImpl->mNetInput4DSizes[0][2]}; + // upImpl->mScaleInputToNetInputs = scaleInputToNetInputs; + reshapePoseExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spNmsCaffe, + upImpl->spBodyPartConnectorCaffe, upImpl->spCaffeNetOutputBlobs, + upImpl->spHeatMapsBlob, upImpl->spPeaksBlob, upImpl->spPoseBlob, + 1.f, mPoseModel); + // scaleInputToNetInputs[i], mPoseModel); + } + } + + timeNow("TensorRT forwards"); + // 2. Resize heat maps + merge different scales - upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); - timeNow("SpResizeAndMergeTensorRT"); - #ifndef CPU_ONLY - upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms - timeNow("RaM forward_gpu"); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - timeNow("CudaCheck"); - #else - error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs); + const std::vector floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end()); + upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios); + #ifdef USE_CUDA // Implied by tensorrt + upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, // ~5ms + {upImpl->spHeatMapsBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else // Never reached, suppress ? + upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()}, + {upImpl->spHeatMapsBlob.get()}); #endif + timeNow("Resize heat Maps"); + // 3. Get peaks by Non-Maximum Suppression - upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); - #ifndef CPU_ONLY - upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + upImpl->spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); + #ifdef USE_CUDA + upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else - error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif + timeNow("Peaks by nms"); - // Get scale net to output + + // Get scale net to output (i.e. 
image input) + // Note: In order to resize to input size, (un)comment the following lines const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); - const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; - mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; + const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), + intRound(scaleProducerToNetInput*inputDataSize.y)}; + mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, inputDataSize)}; + // mScaleNetToOutput = 1.f; + timeNow("Scale net to output"); + // 4. Connecting body parts - upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); - upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + // Get scale net to output (i.e. image input) + upImpl->spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); + upImpl->spBodyPartConnectorCaffe->setInterMinAboveThreshold( + (float)get(PoseProperty::ConnectInterMinAboveThreshold) + ); + upImpl->spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + // GPU version not implemented yet - upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); - // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); + // #ifdef USE_CUDA + // upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(), + // upImpl->spPeaksBlob.get()}, + // {upImpl->spPoseBlob.get()}, mPoseKeypoints); + // #else + upImpl->spBodyPartConnectorCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get(), + upImpl->spPeaksBlob.get()}, + mPoseKeypoints, mPoseScores); + // #endif + timeNow("Connect Body Parts"); const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); @@ -266,6 +329,10 @@ namespace op const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); op::log(log_time, op::Priority::High); } + #else + UNUSED(inputNetData); + UNUSED(inputDataSize); + UNUSED(scaleInputToNetInputs); #endif } catch (const std::exception& e) From 273a3519b2bc18074f9fe3166ddee5c222206c08 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 15 Nov 2017 15:00:58 +0000 Subject: [PATCH 47/52] Fix source issues, example remains. 
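Note: this patch completes the PIMPL conversion begun in the previous one. With the
TensorRT state hidden behind the opaque pointer, every former direct member access is
rewritten as upImpl->..., the destructor must stay declared and defined out-of-line
(std::unique_ptr<Impl> cannot delete an incomplete type from the header), and copying
stays deleted because unique_ptr is move-only, which is what the DELETE_COPY macro in the
header expresses. A minimal, self-contained sketch of the same structure, with
illustrative stand-in names rather than OpenPose's actual API:

    #include <iostream>
    #include <memory>
    #include <string>

    class NetSketch
    {
    public:
        explicit NetSketch(const std::string& model);
        ~NetSketch();                          // declared here, defined where Impl is complete
        NetSketch(const NetSketch&) = delete;  // the moral equivalent of DELETE_COPY
        NetSketch& operator=(const NetSketch&) = delete;
        void forward() const;
    private:
        struct Impl;                           // engine, context and stream members hide here
        std::unique_ptr<Impl> upImpl;
    };

    // Normally in the .cpp, where heavy third-party headers can be included freely:
    struct NetSketch::Impl
    {
        std::string mModel;                    // stand-in for ICudaEngine*, IExecutionContext*, ...
    };

    NetSketch::NetSketch(const std::string& model) : upImpl{new Impl{model}} {}
    NetSketch::~NetSketch() = default;         // Impl is complete here, so deletion compiles
    void NetSketch::forward() const { std::cout << "forward pass on " << upImpl->mModel << '\n'; }

    int main()
    {
        NetSketch net{"pose_deploy.prototxt"};
        net.forward();
        return 0;
    }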
--- include/openpose/core/netTensorRT.hpp | 21 +++--- src/openpose/core/netTensorRT.cpp | 75 ++++++++++----------- src/openpose/pose/poseExtractorTensorRT.cpp | 18 ++--- 3 files changed, 58 insertions(+), 56 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 0c0ae02e1..96b588657 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -5,31 +5,32 @@ #include -#include "NvInfer.h" +#ifdef USE_TENSORRT + #include "NvInfer.h" +#endif namespace op { class OP_API NetTensorRT : public Net { public: - NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0, + NetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0, const bool enableGoogleLogging = true, const std::string& lastBlobName = "net_output"); virtual ~NetTensorRT(); void initializationOnThread(); - // Alternative a) getInputDataCpuPtr or getInputDataGpuPtr + forwardPass - float* getInputDataCpuPtr() const; - - float* getInputDataGpuPtr() const; - - // Alternative b) - void forwardPass(const float* const inputNetData = nullptr) const; + void forwardPass(const Array& inputNetData) const; boost::shared_ptr> getOutputBlob() const; - + private: +#ifdef USE_TENSORRT + nvinfer1::ICudaEngine* caffeToGIEModel(); + + nvinfer1::ICudaEngine* createEngine(); +#endif // PIMPL idiom // http://www.cppsamples.com/common-tasks/pimpl.html struct ImplNetTensorRT; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 416d0c752..897087f00 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -24,7 +24,7 @@ //#include //#include //#include -//#include +#include #ifdef USE_TENSORRT @@ -34,8 +34,8 @@ using namespace nvinfer1; using namespace nvcaffeparser1; -//std::vector gInputs; -//std::map gInputDimensions; + std::vector gInputs; + std::map gInputDimensions; #endif // USE_TENSORRT // Logger for GIE info/warning/errors @@ -68,15 +68,14 @@ namespace op // Init with constructor //const std::array mNetInputSize4D; - //std::array mNetOutputSize4D; - //const unsigned long mNetInputMemory; + std::vector mNetOutputSize4D; // Init with thread // TensorRT stuff nvinfer1::ICudaEngine* cudaEngine; nvinfer1::IExecutionContext* cudaContext; - nvinfer1::ICudaEngine* caffeToGIEModel(); - nvinfer1::ICudaEngine* createEngine(); + //nvinfer1::ICudaEngine* caffeToGIEModel(); + //nvinfer1::ICudaEngine* createEngine(); cudaStream_t stream; cudaEvent_t start, end; @@ -119,8 +118,8 @@ namespace op // parse the caffe model to populate the network, then set the outputs INetworkDefinition* network = builder->createNetwork(); ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), - mCaffeTrainedModel.c_str(), + const IBlobNameToTensor* blobNameToTensor = parser->parse(upImpl->mCaffeProto.c_str(), + upImpl->mCaffeTrainedModel.c_str(), *network, DataType::kFLOAT); @@ -139,12 +138,12 @@ namespace op } // Specify which tensor is output (multiple unsupported) - if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) + if (blobNameToTensor->find(upImpl->mLastBlobName.c_str()) == nullptr) { - std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; + std::cout << "could not find output blob " << upImpl->mLastBlobName.c_str() << std::endl; return nullptr; } - 
network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); + network->markOutput(*blobNameToTensor->find(upImpl->mLastBlobName.c_str())); for (int i = 0, n = network->getNbOutputs(); i < n; i++) @@ -176,7 +175,7 @@ namespace op { ICudaEngine *engine; - std::string serializedEnginePath = mCaffeProto + ".bin"; + std::string serializedEnginePath = upImpl->mCaffeProto + ".bin"; std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl; if (existFile(serializedEnginePath)) @@ -226,12 +225,12 @@ namespace op return engine; } - inline void reshapeNetTensorRT(caffe::Net* caffeNet, const std::vector& dimensions) + inline void reshapeNetTensorRT(boost::shared_ptr> inputBlob, const std::vector& dimensions) { try { - caffeNet->blobs()[0]->Reshape(dimensions); - caffeNet->Reshape(); + inputBlob->Reshape(dimensions); + //caffeNet->Reshape(); TODO find TensorRT equivalent cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) @@ -251,10 +250,10 @@ namespace op try { #ifdef USE_TENSORRT - std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; - CUDA_CHECK(cudaStreamCreate(&stream)); - CUDA_CHECK(cudaEventCreate(&start)); - CUDA_CHECK(cudaEventCreate(&end)); + std::cout << "Caffe file: " << upImpl->mCaffeProto.c_str() << std::endl; + CUDA_CHECK(cudaStreamCreate(&upImpl->stream)); + CUDA_CHECK(cudaEventCreate(&upImpl->start)); + CUDA_CHECK(cudaEventCreate(&upImpl->end)); #else UNUSED(netInputSize4D); UNUSED(caffeProto); @@ -273,12 +272,12 @@ namespace op NetTensorRT::~NetTensorRT() { - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); + cudaStreamDestroy(upImpl->stream); + cudaEventDestroy(upImpl->start); + cudaEventDestroy(upImpl->end); - if (cudaEngine) - cudaEngine->destroy(); + if (upImpl->cudaEngine) + upImpl->cudaEngine->destroy(); } void NetTensorRT::initializationOnThread() @@ -289,12 +288,12 @@ namespace op #ifdef USE_TENSORRT std::cout << "InitializationOnThread : setting device" << std::endl; // Initialize net - cudaSetDevice(mGpuId); + cudaSetDevice(upImpl->mGpuId); std::cout << "InitializationOnThread : creating engine" << std::endl; - cudaEngine = createEngine(); - if (!cudaEngine) + upImpl->cudaEngine = createEngine(); + if (!upImpl->cudaEngine) { std::cerr << "cudaEngine could not be created" << std::endl; return; @@ -302,21 +301,21 @@ namespace op std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; - cudaContext = cudaEngine->createExecutionContext(); - if (!cudaContext) + upImpl->cudaContext = upImpl->cudaEngine->createExecutionContext(); + if (!upImpl->cudaContext) { std::cerr << "cudaContext could not be created" << std::endl; return; } - DimsCHW outputDims = static_cast(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); - mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; + DimsCHW outputDims = static_cast(upImpl->cudaEngine->getBindingDimensions(upImpl->cudaEngine->getNbBindings() - 1)); + upImpl->mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; - std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; + std::cout << "NetInputSize4D: " << upImpl->mNetInputSize4D[0] << " " << upImpl->mNetInputSize4D[1] << " " << upImpl->mNetInputSize4D[2] << " " << upImpl->mNetInputSize4D[3] << std::endl; - upImpl->spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], 
-                upImpl->spOutputBlob = boost::make_shared<caffe::Blob<float>>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]);
+                upImpl->spInputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetInputSize4D[0], upImpl->mNetInputSize4D[1], upImpl->mNetInputSize4D[2], upImpl->mNetInputSize4D[3]);
+                upImpl->spOutputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetOutputSize4D[0], upImpl->mNetOutputSize4D[1], upImpl->mNetOutputSize4D[2], upImpl->mNetOutputSize4D[3]);
 
                 std::cout << "InitializationOnThread : done" << std::endl;
 
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -343,12 +342,12 @@ namespace op
             if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize()))
             {
                 upImpl->mNetInputSize4D = inputData.getSize();
-                reshapeNetTensorRT(upImpl->upCaffeNet.get(), inputData.getSize());
+                reshapeNetTensorRT(upImpl->spInputBlob, inputData.getSize());
             }
 
             // Copy frame data to GPU memory
             auto* gpuImagePtr = upImpl->spInputBlob->mutable_gpu_data();
-            CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), mNetInputMemory, cudaMemcpyHostToDevice));
+            CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), inputData.getVolume() * sizeof(float), cudaMemcpyHostToDevice));
 
             // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
             // of these, but in this case we know that there is exactly one input and one output.
@@ -357,7 +356,7 @@ namespace op
             buffers[1] = upImpl->spOutputBlob->mutable_gpu_data();
 
             // Perform deep network forward pass
-            cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr);
+            upImpl->cudaContext->enqueue(1, &buffers[0], upImpl->stream, nullptr);
 
             // Cuda checks
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
 
diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index 9ab8b2b96..dafa0209c 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -72,7 +72,7 @@ namespace op
     {
         try
         {
-            // Prepare spCaffeNetOutputBlobss
+            // Prepare spTensorRTNetOutputBlobs
             std::vector<caffe::Blob<float>*> caffeNetOutputBlobs(caffeNetOutputBlob.size());
             for (auto i = 0u ; i < caffeNetOutputBlobs.size() ; i++)
                 caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get();
@@ -190,7 +190,7 @@ namespace op
             // Logging
             log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
             // Initialize Caffe net
-            addTensorRTNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel,
+            addTensorRTNetOnThread(upImpl->spTensorRTNets, upImpl->spTensorRTNetOutputBlobs, upImpl->mPoseModel,
                                    upImpl->mGpuId, upImpl->mModelFolder, upImpl->mEnableGoogleLogging);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -212,7 +212,9 @@ namespace op
         }
     }
 
-    void PoseExtractorTensorRT::forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios)
+    void PoseExtractorTensorRT::forwardPass(const std::vector<Array<float>>& inputNetData,
+                                            const Point<int>& inputDataSize,
+                                            const std::vector<double>& scaleInputToNetInputs)
     {
         try
         {
@@ -232,15 +234,15 @@ namespace op
             // Resize std::vectors if required
             const auto numberScales = inputNetData.size();
             upImpl->mNetInput4DSizes.resize(numberScales);
-            while (upImpl->spCaffeNets.size() < numberScales)
-                addCaffeNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel,
+            while (upImpl->spTensorRTNets.size() < numberScales)
+                addTensorRTNetOnThread(upImpl->spTensorRTNets, upImpl->spTensorRTNetOutputBlobs, upImpl->mPoseModel,
                                        upImpl->mGpuId, upImpl->mModelFolder, false);
             // Process each image
             for (auto i = 0u ; i < inputNetData.size(); i++)
             {
                 // 1. TensorRT deep network
-                upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr());
+                upImpl->spTensorRTNets.at(i)->forwardPass(inputNetData[i]);
 
                 // Reshape blobs if required
                 // Note: In order to resize to input size to have same results as Matlab, uncomment the commented
@@ -253,7 +255,7 @@ namespace op
                                              upImpl->mNetInput4DSizes[0][2]};
                     // upImpl->mScaleInputToNetInputs = scaleInputToNetInputs;
                     reshapePoseExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spNmsCaffe,
-                                              upImpl->spBodyPartConnectorCaffe, upImpl->spCaffeNetOutputBlobs,
+                                              upImpl->spBodyPartConnectorCaffe, upImpl->spTensorRTNetOutputBlobs,
                                               upImpl->spHeatMapsBlob, upImpl->spPeaksBlob, upImpl->spPoseBlob,
                                               1.f, mPoseModel);
                     // scaleInputToNetInputs[i], mPoseModel);
@@ -263,7 +265,7 @@ namespace op
             timeNow("TensorRT forwards");
 
             // 2. Resize heat maps + merge different scales
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs);
+            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
             const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
             upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
             #ifdef USE_CUDA // Implied by tensorrt
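
The forwardPass rework in the patch above follows the standard TensorRT 2.x/3.x execution pattern: one device pointer per engine binding, an asynchronous enqueue on a CUDA stream, then a synchronize. A condensed sketch of that pattern, assuming exactly two bindings (input = 0, output = 1); the helper and its names are illustrative, not part of the patch:

    #include "NvInfer.h"       // TensorRT 2.x/3.x-era API, as used by the diff
    #include <cuda_runtime.h>

    // Assumes: context created from the engine, stream from cudaStreamCreate,
    // and dIn/dOut already cudaMalloc'd to the binding volumes.
    void infer(nvinfer1::IExecutionContext* context, cudaStream_t stream,
               float* dIn, float* dOut, const float* hostInput, size_t inputBytes)
    {
        // Host -> device copy of the preprocessed input tensor
        cudaMemcpyAsync(dIn, hostInput, inputBytes, cudaMemcpyHostToDevice, stream);
        // The engine expects one pointer per binding, in binding order
        void* buffers[2] = { dIn, dOut };
        // Batch size 1, as in the patch; results land in dOut
        context->enqueue(1, buffers, stream, nullptr);
        cudaStreamSynchronize(stream);
    }

Rather than raw cudaMalloc allocations, the patch reuses Caffe blobs (mutable_gpu_data()) as the device buffers, which is what lets the downstream resize/NMS Caffe layers consume the TensorRT output directly.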
From ca682c42903fd2ef2a8c15ef9d59ae52771e71b9 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 16:23:14 +0100
Subject: [PATCH 48/52] Fix samples

---
 .../tutorial_pose/1_extract_from_image.cpp |   2 +-
 .../3_extract_from_image_TensorRT.cpp      | 121 ++++++++++--------
 2 files changed, 66 insertions(+), 57 deletions(-)

diff --git a/examples/tutorial_pose/1_extract_from_image.cpp b/examples/tutorial_pose/1_extract_from_image.cpp
index b3dddd747..d975ee00d 100644
--- a/examples/tutorial_pose/1_extract_from_image.cpp
+++ b/examples/tutorial_pose/1_extract_from_image.cpp
@@ -99,7 +99,7 @@ int openPoseTutorialPose1()
     // Check no contradictory flags enabled
     if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.)
         op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
-    if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
+    if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1.)
         op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
 
     // Enabling Google Logging

diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
index 4a522fbc2..13f700f21 100644
--- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
+++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
@@ -1,16 +1,20 @@
 // ------------------------- OpenPose Library Tutorial - Pose - Example 3 - Extract from Image with TensorRT -------------------------
 // This first example shows the user how to:
-    // 1. Load an image (`filestream` module)
-    // 2. Extract the pose of that image (`pose` module)
-    // 3. Render the pose on a resized copy of the input image (`pose` module)
-    // 4. Display the rendered pose (`gui` module)
+// 1. Load an image (`filestream` module)
+// 2. Extract the pose of that image (`pose` module)
+// 3. Render the pose on a resized copy of the input image (`pose` module)
+// 4. Display the rendered pose (`gui` module)
 // In addition to the previous OpenPose modules, we also need to use:
-    // 1. `core` module: for the Array class that the `pose` module needs
-    // 2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively
+// 1. `core` module: for the Array class that the `pose` module needs
+// 2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively
 
 // 3rdparty dependencies
-#include // DEFINE_bool, DEFINE_int32, DEFINE_int64, DEFINE_uint64, DEFINE_double, DEFINE_string
-#include // google::InitGoogleLogging
+// GFlags: DEFINE_bool, _int32, _int64, _uint64, _double, _string
+#include
+// Allow Google Flags in Ubuntu 14
+#ifndef GFLAGS_GFLAGS_H_
+    namespace gflags = google;
+#endif
 // OpenPose dependencies
 #include
 #include
 #include
 #include
 #include
@@ -21,36 +25,40 @@
 // See all the available parameter options with the `--help` flag. E.g. `./build/examples/openpose/openpose.bin --help`.
 // Note: This command will show you flags for other unnecessary 3rdparty files. Check only the flags for the OpenPose
 // executable. E.g. for `openpose.bin`, look for `Flags from examples/openpose/openpose.cpp:`.
-// Debugging
+// Debugging/Other
 DEFINE_int32(logging_level,             3,              "The logging level. Integer in the range [0, 255]. 0 will output any log() message, while"
-                                                        " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for"
-                                                        " low priority messages and 4 for important ones.");
+                                        " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for"
+                                        " low priority messages and 4 for important ones.");
 // Producer
 DEFINE_string(image_path,               "examples/media/COCO_val2014_000000000192.jpg",     "Process the desired image.");
 // OpenPose
 DEFINE_string(model_pose,               "COCO",         "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), "
-                                                        "`MPI_4_layers` (15 keypoints, even faster but less accurate).");
+                                        "`MPI_4_layers` (15 keypoints, even faster but less accurate).");
 DEFINE_string(model_folder,             "models/",      "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
-DEFINE_string(net_resolution,           "128x96",       "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased,"
-                                                        " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect"
-                                                        " ratio possible to the images or videos to be processed. E.g. the default `128x96` is"
-                                                        " optimal for 16:9 videos, e.g. full HD (1920x1080) and HD (1280x720) videos.");
-DEFINE_string(resolution,               "1280x720",     "The image resolution (display and output). Use \"-1x-1\" to force the program to use the"
-                                                        " default images resolution.");
+DEFINE_string(net_resolution,           "-1x368",       "Multiples of 16. If it is increased, the accuracy potentially increases. If it is"
+                                        " decreased, the speed increases. For maximum speed-accuracy balance, it should keep the"
+                                        " closest aspect ratio possible to the images or videos to be processed. Using `-1` in"
+                                        " any of the dimensions, OP will choose the optimal aspect ratio depending on the user's"
+                                        " input value. E.g. the default `-1x368` is equivalent to `656x368` in 16:9 resolutions,"
+                                        " e.g. full HD (1920x1080) and HD (1280x720) resolutions.");
+DEFINE_string(output_resolution,        "-1x-1",        "The image resolution (display and output). Use \"-1x-1\" to force the program to use the"
+                                        " input image resolution.");
 DEFINE_int32(num_gpu_start,             0,              "GPU device start number.");
 DEFINE_double(scale_gap,                0.3,            "Scale gap between scales. No effect unless scale_number > 1. Initial scale is always 1."
-                                                        " If you want to change the initial scale, you actually want to multiply the"
-                                                        " `net_resolution` by your desired initial scale.");
+                                        " If you want to change the initial scale, you actually want to multiply the"
+                                        " `net_resolution` by your desired initial scale.");
 DEFINE_int32(scale_number,              1,              "Number of scales to average.");
 // OpenPose Rendering
-DEFINE_bool(disable_blending,           false,          "If blending is enabled, it will merge the results with the original frame. If disabled, it"
-                                                        " will only display the results on a black background.");
+DEFINE_bool(disable_blending,           false,          "If enabled, it will render the results (keypoint skeletons or heatmaps) on a black"
+                                        " background, instead of being rendered into the original image. Related: `part_to_show`,"
+                                        " `alpha_pose`, and `alpha_pose`.");
 DEFINE_double(render_threshold,         0.05,           "Only estimated keypoints whose score confidences are higher than this threshold will be"
-                                                        " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;"
-                                                        " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also"
-                                                        " more false positives (i.e. wrong detections).");
+                                        " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;"
+                                        " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also"
+                                        " more false positives (i.e. wrong detections).");
 DEFINE_double(alpha_pose,               0.6,            "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will"
-                                                        " hide it. Only valid for GPU rendering.");
+                                        " hide it. Only valid for GPU rendering.");
+
 
 typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;
 
@@ -84,11 +92,9 @@ int openPoseTutorialPose3()
     op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
     // Step 2 - Read Google flags (user defined configuration)
     // outputSize
-    const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720");
+    const auto outputSize = op::flagsToPoint(FLAGS_output_resolution, "-1x-1");
     // netInputSize
-    const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "128x96");
-    // netOutputSize
-    const auto netOutputSize = netInputSize;
+    const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "-1x368");
     // poseModel
     const auto poseModel = op::flagsToPoseModel(FLAGS_model_pose);
     // Check no contradictory flags enabled
@@ -96,18 +102,20 @@ int openPoseTutorialPose3()
         op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
     if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
         op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
+    // Enabling Google Logging
+    const bool enableGoogleLogging = true;
     // Logging
     op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
     // Step 3 - Initialize all required classes
-    op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap};
-    op::CvMatToOpOutput cvMatToOpOutput{outputSize};
-    op::PoseExtractorTensorRT poseExtractorTensorRT{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel,
-                                                    FLAGS_model_folder, FLAGS_num_gpu_start};
-    op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold,
-                                  !FLAGS_disable_blending, (float)FLAGS_alpha_pose};
-    op::OpOutputToCvMat opOutputToCvMat{outputSize};
-    const op::Point<int> windowedSize = outputSize;
-    op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 1"};
+    op::ScaleAndSizeExtractor scaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap);
+    op::CvMatToOpInput cvMatToOpInput;
+    op::CvMatToOpOutput cvMatToOpOutput;
+    op::PoseExtractorCaffe poseExtractorTensorRT{poseModel, FLAGS_model_folder,
+                                                 FLAGS_num_gpu_start, {}, op::ScaleMode::ZeroToOne, enableGoogleLogging};
+    op::PoseCpuRenderer poseRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending,
+                                     (float)FLAGS_alpha_pose};
+    op::OpOutputToCvMat opOutputToCvMat;
+    op::FrameDisplayer frameDisplayer{"OpenPose Tutorial - Example 3", outputSize};
     // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. we init resources here)
     poseExtractorTensorRT.initializationOnThread();
     poseRenderer.initializationOnThread();
@@ -116,26 +124,27 @@ int openPoseTutorialPose3()
 
     // ------------------------- POSE ESTIMATION AND RENDERING -------------------------
     // Step 1 - Read and load image, error if empty (possibly wrong path)
-    cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
+    // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
+    cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
     if(inputImage.empty())
         op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
-    timeNow("Step 1");
-    // Step 2 - Format input image to OpenPose input and output formats
-    op::Array<float> netInputArray;
-    std::vector<float> scaleRatios;
-    std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
+    const op::Point<int> imageSize{inputImage.cols, inputImage.rows};
+    // Step 2 - Get desired scale sizes
+    std::vector<double> scaleInputToNetInputs;
+    std::vector<op::Point<int>> netInputSizes;
     double scaleInputToOutput;
-    op::Array<float> outputArray;
-    std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
-    timeNow("Step 2");
-    // Step 3 - Estimate poseKeypoints
-    poseExtractorTensorRT.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
+    op::Point<int> outputResolution;
+    std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution)
+        = scaleAndSizeExtractor.extract(imageSize);
+    // Step 3 - Format input image to OpenPose input and output formats
+    const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes);
+    auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution);
+    // Step 4 - Estimate poseKeypoints
+    poseExtractorTensorRT.forwardPass(netInputArray, imageSize, scaleInputToNetInputs);
     const auto poseKeypoints = poseExtractorTensorRT.getPoseKeypoints();
-    timeNow("Step 3");
-    // Step 4 - Render poseKeypoints
-    poseRenderer.renderPose(outputArray, poseKeypoints);
-    timeNow("Step 4");
-    // Step 5 - OpenPose output format to cv::Mat
+    // Step 5 - Render poseKeypoints
+    poseRenderer.renderPose(outputArray, poseKeypoints, scaleInputToOutput);
+    // Step 6 - OpenPose output format to cv::Mat
     auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
     timeNow("Step 5");
 
@@ -143,7 +152,7 @@ int openPoseTutorialPose3()
     // Step 1 - Show results
     frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0)
     // Step 2 - Logging information message
-    op::log("Example 1 successfully finished.", op::Priority::High);
+    op::log("Example 3 successfully finished.", op::Priority::High);
 
     const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second);
     const auto message = "Pose estimation successfully finished. Total time: " + totalTimeSec + " seconds.";
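
Patch 48 above migrates the sample from the removed cvMatToOpInput.format() API to the newer staged pipeline, in which scale extraction is its own step. Condensed from the hunks above, the per-frame data flow now reads as follows (template arguments reconstructed, as in the diff):

    // 1. Measure the input image
    const op::Point<int> imageSize{inputImage.cols, inputImage.rows};
    // 2. Derive scales, per-scale net input sizes, and the output resolution
    std::vector<double> scaleInputToNetInputs;
    std::vector<op::Point<int>> netInputSizes;
    double scaleInputToOutput;
    op::Point<int> outputResolution;
    std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution)
        = scaleAndSizeExtractor.extract(imageSize);
    // 3. Format to net input / output arrays
    const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes);
    auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution);
    // 4.-6. Forward pass, render, convert back to cv::Mat
    poseExtractorTensorRT.forwardPass(netInputArray, imageSize, scaleInputToNetInputs);
    poseRenderer.renderPose(outputArray, poseExtractorTensorRT.getPoseKeypoints(), scaleInputToOutput);
    auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);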
From cb0d440dbb763852dcaacb9564ad9776e2084deb Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 16:05:53 +0000
Subject: [PATCH 49/52] Compilation fixed, TensorRT net optimisation works,
 segfault on inference

---
 .../tutorial_pose/3_extract_from_image_TensorRT.cpp |  5 +----
 src/openpose/core/netTensorRT.cpp                   | 10 +++++-----
 src/openpose/pose/poseExtractorTensorRT.cpp         |  6 +++---
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
index 13f700f21..a855fa3da 100644
--- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
+++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
@@ -110,7 +110,7 @@ int openPoseTutorialPose3()
     op::ScaleAndSizeExtractor scaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap);
     op::CvMatToOpInput cvMatToOpInput;
     op::CvMatToOpOutput cvMatToOpOutput;
-    op::PoseExtractorCaffe poseExtractorTensorRT{poseModel, FLAGS_model_folder,
+    op::PoseExtractorTensorRT poseExtractorTensorRT{poseModel, FLAGS_model_folder,
                                                  FLAGS_num_gpu_start, {}, op::ScaleMode::ZeroToOne, enableGoogleLogging};
     op::PoseCpuRenderer poseRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending,
                                      (float)FLAGS_alpha_pose};
@@ -171,9 +171,6 @@ int openPoseTutorialPose3()
 
 int main(int argc, char *argv[])
 {
-    // Initializing google logging (Caffe uses it for logging)
-    google::InitGoogleLogging("openPoseTutorialPose3");
-
     // Parsing command line flags
     gflags::ParseCommandLineFlags(&argc, &argv, true);
 
diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp
index 897087f00..b59d57a7d 100644
--- a/src/openpose/core/netTensorRT.cpp
+++ b/src/openpose/core/netTensorRT.cpp
@@ -51,7 +51,7 @@ class Logger : public ILogger
 namespace op
 {
     std::mutex sMutexNetTensorRT;
-    std::atomic<bool> sGoogleLoggingInitialized{false};
+    std::atomic<bool> sGoogleLoggingInitializedTensorRT{false}; // Already defined in netCaffe
 
     struct NetTensorRT::ImplNetTensorRT
     {
@@ -82,7 +82,7 @@ namespace op
         ImplNetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel,
                         const int gpuId, const bool enableGoogleLogging, const std::string& lastBlobName) :
             mGpuId{gpuId},
-            mCaffeProto{caffeProto}, // TODO, no size, how to proceed ?
+            mCaffeProto{caffeProto + std::string("_368x656")}, // TODO, no size, how to proceed ?
             mCaffeTrainedModel{caffeTrainedModel},
             mLastBlobName{lastBlobName}
         {
@@ -95,13 +95,13 @@ namespace op
                 error("Caffe trained model file not found: " + mCaffeTrainedModel + message, __LINE__, __FUNCTION__, __FILE__);
 
             // Double if condition in order to speed up the program if it is called several times
-            if (enableGoogleLogging && !sGoogleLoggingInitialized)
+            if (enableGoogleLogging && !sGoogleLoggingInitializedTensorRT)
             {
                 std::lock_guard<std::mutex> lock{sMutexNetTensorRT};
-                if (enableGoogleLogging && !sGoogleLoggingInitialized)
+                if (enableGoogleLogging && !sGoogleLoggingInitializedTensorRT)
                 {
                     google::InitGoogleLogging("OpenPose");
-                    sGoogleLoggingInitialized = true;
+                    sGoogleLoggingInitializedTensorRT = true;
                 }
             }
         }
diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index dafa0209c..8dc981f30 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -67,7 +67,7 @@ namespace op
     };
 
     #ifdef USE_CAFFE
-        std::vector<caffe::Blob<float>*> caffeNetSharedToPtr(
+        std::vector<caffe::Blob<float>*> tensorRTNetSharedToPtr(
             std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob)
         {
             try
@@ -98,7 +98,7 @@ namespace op
         try
         {
             // HeatMaps extractor blob and layer
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
+            const auto caffeNetOutputBlobs = tensorRTNetSharedToPtr(caffeNetOutputBlob);
             resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
                                          POSE_CCN_DECREASE_FACTOR[(int)poseModel], 1.f/scaleInputToNetInput);
             // Pose extractor blob and layer
@@ -265,7 +265,7 @@ namespace op
             timeNow("TensorRT forwards");
 
             // 2. Resize heat maps + merge different scales
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
+            const auto caffeNetOutputBlobs = tensorRTNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
             const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
             upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
             #ifdef USE_CUDA // Implied by tensorrt
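
The sGoogleLoggingInitializedTensorRT rename above avoids a duplicate-symbol clash with the flag of the same name in netCaffe.cpp; the underlying pattern is a double-checked, mutex-guarded one-time initialization. A minimal generic sketch, with illustrative names:

    #include <atomic>
    #include <mutex>

    std::mutex sInitMutex;
    std::atomic<bool> sInitialized{false};

    void initOnce()
    {
        // First check is lock-free and handles the common already-initialized case
        if (!sInitialized)
        {
            std::lock_guard<std::mutex> lock{sInitMutex};
            // Second check: another thread may have finished while we waited
            if (!sInitialized)
            {
                // ... expensive one-time setup, e.g. google::InitGoogleLogging ...
                sInitialized = true;
            }
        }
    }

In modern C++ the same effect is usually obtained with std::call_once or a function-local static; the diff keeps the explicit flag to mirror the existing netCaffe.cpp convention.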
From 827510b1a71d3709375c46de3f10f49f622c84ca Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 16:58:38 +0000
Subject: [PATCH 50/52] Code kind of works; the incomplete pipeline leads to no
 shape being displayed; sizes hardcoded.

---
 src/openpose/core/netTensorRT.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp
index b59d57a7d..1b2b43151 100644
--- a/src/openpose/core/netTensorRT.cpp
+++ b/src/openpose/core/netTensorRT.cpp
@@ -307,14 +307,15 @@ namespace op
                     std::cerr << "cudaContext could not be created" << std::endl;
                     return;
                 }
 
                 DimsCHW outputDims = static_cast<DimsCHW&&>(upImpl->cudaEngine->getBindingDimensions(upImpl->cudaEngine->getNbBindings() - 1));
-                upImpl->mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() };
-
+                upImpl->mNetOutputSize4D.push_back(1);
+                upImpl->mNetOutputSize4D.push_back(outputDims.c());
+                upImpl->mNetOutputSize4D.push_back(outputDims.h());
+                upImpl->mNetOutputSize4D.push_back(outputDims.w());
 
-                std::cout << "NetInputSize4D: " << upImpl->mNetInputSize4D[0] << " " << upImpl->mNetInputSize4D[1] << " " << upImpl->mNetInputSize4D[2] << " " << upImpl->mNetInputSize4D[3] << std::endl;
+                //std::cout << "NetInputSize4D: " << upImpl->mNetInputSize4D.at(0) << " " << upImpl->mNetInputSize4D.at(1) << " " << upImpl->mNetInputSize4D.at(2) << " " << upImpl->mNetInputSize4D.at(3) << std::endl;
 
-                upImpl->spInputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetInputSize4D[0], upImpl->mNetInputSize4D[1], upImpl->mNetInputSize4D[2], upImpl->mNetInputSize4D[3]);
+                upImpl->spInputBlob = boost::make_shared<caffe::Blob<float>>(1, 3, 368, 656);
                 upImpl->spOutputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetOutputSize4D[0], upImpl->mNetOutputSize4D[1], upImpl->mNetOutputSize4D[2], upImpl->mNetOutputSize4D[3]);
 
                 std::cout << "InitializationOnThread : done" << std::endl;
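
Patch 50 above hardcodes the input blob to 1x3x368x656 (matching the "_368x656" prototxt suffix introduced in patch 49) and sizes the output from the last engine binding. The general, non-hardcoded approach is to compute both volumes from the bindings, sketched here against the same TensorRT-era API the diff uses; the helper is illustrative, not part of the patch:

    // Assumes a built nvinfer1::ICudaEngine* with CHW bindings, as in the diff.
    size_t bindingVolume(const nvinfer1::ICudaEngine* engine, const int binding)
    {
        // DimsCHW and the rvalue cast are the TensorRT 2.x/3.x sample idiom
        const auto dims = static_cast<nvinfer1::DimsCHW&&>(engine->getBindingDimensions(binding));
        return static_cast<size_t>(dims.c()) * dims.h() * dims.w();
    }
    // inputVolume  = bindingVolume(engine, 0);                            // floats per image
    // outputVolume = bindingVolume(engine, engine->getNbBindings() - 1);

The fixed shape works here because the sample feeds the net a fixed net_resolution; a genuine runtime reshape would require rebuilding or re-deserializing the engine, which is why the reshape path is flagged as untested in the final patch below.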
From a1619fa3c8aa881568ed27d20a1efb76cfcc690a Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 22:02:37 +0100
Subject: [PATCH 51/52] Remove useless preprocessor macros

---
 src/openpose/pose/poseExtractorTensorRT.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index 8dc981f30..c61d17dd6 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -268,14 +268,10 @@ namespace op
             const auto caffeNetOutputBlobs = tensorRTNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
             const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
             upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
-            #ifdef USE_CUDA // Implied by tensorrt
+
             upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, // ~5ms
                                                        {upImpl->spHeatMapsBlob.get()});
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
-            #else // Never reached, suppress ?
-                upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
-                                                           {upImpl->spHeatMapsBlob.get()});
-            #endif
 
             timeNow("Resize heat Maps");
 

From 344ab674b08e8a90fc79a55bfa08ae5b965d45b7 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 22:06:03 +0100
Subject: [PATCH 52/52] NetTensorRT modifications

---
 src/openpose/core/netTensorRT.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp
index 1b2b43151..ed93b3662 100644
--- a/src/openpose/core/netTensorRT.cpp
+++ b/src/openpose/core/netTensorRT.cpp
@@ -339,9 +339,10 @@ namespace op
             if (inputData.getNumberDimensions() != 4 || inputData.getSize(1) != 3)
                 error("The Array inputData must have 4 dimensions: [batch size, 3 (RGB), height, width].",
                       __LINE__, __FUNCTION__, __FILE__);
-            // Reshape Caffe net if required
+            // Reshape TensorRT net if required
             if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize()))
             {
+                std::cout << "Reshaping TensorRT net: WARNING NOT TESTED, probably won't work" << std::endl;
                 upImpl->mNetInputSize4D = inputData.getSize();
                 reshapeNetTensorRT(upImpl->spInputBlob, inputData.getSize());
             }
@@ -357,7 +358,7 @@ namespace op
             buffers[1] = upImpl->spOutputBlob->mutable_gpu_data();
 
             // Perform deep network forward pass
-            upImpl->cudaContext->enqueue(1, &buffers[0], upImpl->stream, nullptr);
+            upImpl->cudaContext->enqueue(inputData.getSize(0), &buffers[0], upImpl->stream, nullptr);
 
             // Cuda checks
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
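
The final patch derives the enqueue batch size from inputData.getSize(0) instead of hardcoding 1. One caveat worth recording, since it is easy to trip over with this API: under TensorRT's implicit-batch model, the runtime batch passed to enqueue() must not exceed the maxBatchSize the engine was built with (builder call sketched below; the value shown is illustrative):

    builder->setMaxBatchSize(1);                 // at build time, in caffeToGIEModel()
    // ...
    const int batchSize = inputData.getSize(0);  // at run time, per forward pass
    // batchSize > maxBatchSize here would make enqueue() fail

And because patch 47 caches the serialized engine on disk (the mCaffeProto + ".bin" path), an engine built for one batch size keeps constraining later runs until that cache file is regenerated.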