From 564aecee3fda5205eb2c782f9d0921f6636309ad Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 11:23:37 +0200 Subject: [PATCH 01/52] Files for tensort rt pose detection, for now nothing done. --- .../3_extract_from_image_TensorRT.cpp | 134 ++++++++++++++ .../openpose/pose/poseExtractorTensorRT.hpp | 52 ++++++ src/openpose/pose/poseExtractorTensorRT.cpp | 170 ++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 examples/tutorial_pose/3_extract_from_image_TensorRT.cpp create mode 100644 include/openpose/pose/poseExtractorTensorRT.hpp create mode 100644 src/openpose/pose/poseExtractorTensorRT.cpp diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp new file mode 100644 index 000000000..48cbcbb96 --- /dev/null +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -0,0 +1,134 @@ +// ------------------------- OpenPose Library Tutorial - Pose - Example 1 - Extract from Image ------------------------- +// This first example shows the user how to: + // 1. Load an image (`filestream` module) + // 2. Extract the pose of that image (`pose` module) + // 3. Render the pose on a resized copy of the input image (`pose` module) + // 4. Display the rendered pose (`gui` module) +// In addition to the previous OpenPose modules, we also need to use: + // 1. `core` module: for the Array class that the `pose` module needs + // 2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively + +// 3rdparty dependencies +#include // DEFINE_bool, DEFINE_int32, DEFINE_int64, DEFINE_uint64, DEFINE_double, DEFINE_string +#include // google::InitGoogleLogging +// OpenPose dependencies +#include +#include +#include +#include +#include + +// See all the available parameter options withe the `--help` flag. E.g. `./build/examples/openpose/openpose.bin --help`. +// Note: This command will show you flags for other unnecessary 3rdparty files. Check only the flags for the OpenPose +// executable. E.g. for `openpose.bin`, look for `Flags from examples/openpose/openpose.cpp:`. +// Debugging +DEFINE_int32(logging_level, 3, "The logging level. Integer in the range [0, 255]. 0 will output any log() message, while" + " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for" + " low priority messages and 4 for important ones."); +// Producer +DEFINE_string(image_path, "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image."); +// OpenPose +DEFINE_string(model_pose, "COCO", "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), " + "`MPI_4_layers` (15 keypoints, even faster but less accurate)."); +DEFINE_string(model_folder, "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located."); +DEFINE_string(net_resolution, "656x368", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased," + " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect" + " ratio possible to the images or videos to be processed. E.g. the default `656x368` is" + " optimal for 16:9 videos, e.g. full HD (1980x1080) and HD (1280x720) videos."); +DEFINE_string(resolution, "1280x720", "The image resolution (display and output). Use \"-1x-1\" to force the program to use the" + " default images resolution."); +DEFINE_int32(num_gpu_start, 0, "GPU device start number."); +DEFINE_double(scale_gap, 0.3, "Scale gap between scales. 
No effect unless scale_number > 1. Initial scale is always 1." + " If you want to change the initial scale, you actually want to multiply the" + " `net_resolution` by your desired initial scale."); +DEFINE_int32(scale_number, 1, "Number of scales to average."); +// OpenPose Rendering +DEFINE_bool(disable_blending, false, "If blending is enabled, it will merge the results with the original frame. If disabled, it" + " will only display the results on a black background."); +DEFINE_double(render_threshold, 0.05, "Only estimated keypoints whose score confidences are higher than this threshold will be" + " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;" + " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also" + " more false positives (i.e. wrong detections)."); +DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" + " hide it. Only valid for GPU rendering."); + +int openPoseTutorialPose1() +{ + op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High); + // ------------------------- INITIALIZATION ------------------------- + // Step 1 - Set logging level + // - 0 will output all the logging messages + // - 255 will output nothing + op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.", __LINE__, __FUNCTION__, __FILE__); + op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level); + op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Step 2 - Read Google flags (user defined configuration) + // outputSize + const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720"); + // netInputSize + const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "656x368"); + // netOutputSize + const auto netOutputSize = netInputSize; + // poseModel + const auto poseModel = op::flagsToPoseModel(FLAGS_model_pose); + // Check no contradictory flags enabled + if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.) + op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__); + if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1) + op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__); + // Logging + op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Step 3 - Initialize all required classes + op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap}; + op::CvMatToOpOutput cvMatToOpOutput{outputSize}; + op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, + FLAGS_model_folder, FLAGS_num_gpu_start}; + op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold, + !FLAGS_disable_blending, (float)FLAGS_alpha_pose}; + op::OpOutputToCvMat opOutputToCvMat{outputSize}; + const op::Point windowedSize = outputSize; + op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 1"}; + // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. 
we init resources here) + poseExtractorCaffe.initializationOnThread(); + poseRenderer.initializationOnThread(); + + // ------------------------- POSE ESTIMATION AND RENDERING ------------------------- + // Step 1 - Read and load image, error if empty (possibly wrong path) + cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); + if(inputImage.empty()) + op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__); + // Step 2 - Format input image to OpenPose input and output formats + op::Array netInputArray; + std::vector scaleRatios; + std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage); + double scaleInputToOutput; + op::Array outputArray; + std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); + // Step 3 - Estimate poseKeypoints + poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); + const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); + // Step 4 - Render poseKeypoints + poseRenderer.renderPose(outputArray, poseKeypoints); + // Step 5 - OpenPose output format to cv::Mat + auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); + + // ------------------------- SHOWING RESULT AND CLOSING ------------------------- + // Step 1 - Show results + frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0) + // Step 2 - Logging information message + op::log("Example 1 successfully finished.", op::Priority::High); + // Return successful message + return 0; +} + +int main(int argc, char *argv[]) +{ + // Initializing google logging (Caffe uses it for logging) + google::InitGoogleLogging("openPoseTutorialPose1"); + + // Parsing command line flags + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Running openPoseTutorialPose1 + return openPoseTutorialPose1(); +} diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp new file mode 100644 index 000000000..d745c30f6 --- /dev/null +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -0,0 +1,52 @@ +#ifdef USE_CAFFE +#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#define OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace op +{ + class OP_API PoseExtractorCaffe : public PoseExtractor + { + public: + PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, + const ScaleMode heatMapScale = ScaleMode::ZeroToOne); + + virtual ~PoseExtractorCaffe(); + + void netInitializationOnThread(); + + void forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios = {1.f}); + + const float* getHeatMapCpuConstPtr() const; + + const float* getHeatMapGpuConstPtr() const; + + const float* getPoseGpuConstPtr() const; + + private: + const float mResizeScale; + std::shared_ptr spNet; + std::shared_ptr> spResizeAndMergeCaffe; + std::shared_ptr> spNmsCaffe; + std::shared_ptr> spBodyPartConnectorCaffe; + // Init with thread + boost::shared_ptr> spCaffeNetOutputBlob; + std::shared_ptr> spHeatMapsBlob; + std::shared_ptr> spPeaksBlob; + std::shared_ptr> spPoseBlob; + + DELETE_COPY(PoseExtractorCaffe); + }; +} + +#endif // 
OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#endif diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp new file mode 100644 index 000000000..bc4374782 --- /dev/null +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -0,0 +1,170 @@ +#ifdef USE_CAFFE +#include +#include +#include +#include +#include +#include +#include + +namespace op +{ + PoseExtractorCaffe::PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, + const ScaleMode heatMapScale) : + PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, + mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, + spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, + modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + spResizeAndMergeCaffe{std::make_shared>()}, + spNmsCaffe{std::make_shared>()}, + spBodyPartConnectorCaffe{std::make_shared>()} + { + try + { + const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; + const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); + if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) + error("Net input and output size must be proportional. resizeScaleCheck = " + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + PoseExtractorCaffe::~PoseExtractorCaffe() + { + } + + void PoseExtractorCaffe::netInitializationOnThread() + { + try + { + log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + + // Caffe net + spNet->initializationOnThread(); + spCaffeNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + // HeatMaps extractor blob and layer + spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + // Pose extractor blob and layer + spPeaksBlob = {std::make_shared>(1,1,1,1)}; + spNmsCaffe->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + // Pose extractor blob and layer + spPoseBlob = {std::make_shared>(1,1,1,1)}; + spBodyPartConnectorCaffe->setPoseModel(mPoseModel); + spBodyPartConnectorCaffe->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + void PoseExtractorCaffe::forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios) + { + try + { + // Security checks + if (inputNetData.empty()) + error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + + // 1. Caffe deep network + spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + + // 2. 
Resize heat maps + merge different scales + spResizeAndMergeCaffe->setScaleRatios(scaleRatios); + #ifndef CPU_ONLY + spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else + error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + + // 3. Get peaks by Non-Maximum Suppression + spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); + #ifndef CPU_ONLY + spNmsCaffe->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else + error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + + // Get scale net to output + const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); + const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; + mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; + + // 4. Connecting body parts + spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); + spBodyPartConnectorCaffe->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + + // GPU version not implemented yet + spBodyPartConnectorCaffe->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); + // spBodyPartConnectorCaffe->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + const float* PoseExtractorCaffe::getHeatMapCpuConstPtr() const + { + try + { + checkThread(); + return spHeatMapsBlob->cpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + const float* PoseExtractorCaffe::getHeatMapGpuConstPtr() const + { + try + { + checkThread(); + return spHeatMapsBlob->gpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + const float* PoseExtractorCaffe::getPoseGpuConstPtr() const + { + try + { + error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + checkThread(); + return spPoseBlob->gpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } +} + +#endif From dfc1f827c0dba611ed535c5fb03ce1febc772b86 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 11:27:48 +0200 Subject: [PATCH 02/52] Adding timer in new demo and checking build before replacing inference. 
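The timing added here is the plain std::chrono pattern; a minimal, self-contained sketch of it
(illustrative only — the diff below inlines the same calls directly into the example):

    #include <chrono>
    #include <string>

    int main()
    {
        // Record a start point, do the work, then convert the elapsed
        // nanoseconds to seconds for the log message (same calls as below).
        const auto timerBegin = std::chrono::high_resolution_clock::now();
        // ... pose estimation steps to be timed ...
        const auto now = std::chrono::high_resolution_clock::now();
        const auto totalTimeSec =
            (double)std::chrono::duration_cast<std::chrono::nanoseconds>(now - timerBegin).count() * 1e-9;
        const auto message = "Total time: " + std::to_string(totalTimeSec) + " seconds.";
        return 0;
    }
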
--- examples/tutorial_pose/3_extract_from_image_TensorRT.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index 48cbcbb96..f431d322d 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -54,6 +54,9 @@ DEFINE_double(alpha_pose, 0.6, "Blending factor (range int openPoseTutorialPose1() { + op::log("Starting pose estimation.", op::Priority::High); + const auto timerBegin = std::chrono::high_resolution_clock::now(); + op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- // Step 1 - Set logging level @@ -117,6 +120,11 @@ int openPoseTutorialPose1() frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0) // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); + + const auto now = std::chrono::high_resolution_clock::now(); + const auto totalTimeSec = (double)std::chrono::duration_cast(now-timerBegin).count() * 1e-9; + const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; + // Return successful message return 0; } From a4885e0f15bf67bfaeb08fa5a96cacccf5ded733 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 09:41:54 +0000 Subject: [PATCH 03/52] PoseExtractorTensorRT changed names for build conflicts but still performs Caffe inference. --- .../openpose/pose/poseExtractorTensorRT.hpp | 22 +++---- src/openpose/pose/poseExtractorTensorRT.cpp | 62 +++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index d745c30f6..270d2a8f4 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,6 +1,6 @@ #ifdef USE_CAFFE -#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP -#define OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP +#define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #include #include @@ -13,14 +13,14 @@ namespace op { - class OP_API PoseExtractorCaffe : public PoseExtractor + class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, const ScaleMode heatMapScale = ScaleMode::ZeroToOne); - virtual ~PoseExtractorCaffe(); + virtual ~PoseExtractorTensorRT(); void netInitializationOnThread(); @@ -35,18 +35,18 @@ namespace op private: const float mResizeScale; std::shared_ptr spNet; - std::shared_ptr> spResizeAndMergeCaffe; - std::shared_ptr> spNmsCaffe; - std::shared_ptr> spBodyPartConnectorCaffe; + std::shared_ptr> spResizeAndMergeTensorRT; + std::shared_ptr> spNmsTensorRT; + std::shared_ptr> spBodyPartConnectorTensorRT; // Init with thread - boost::shared_ptr> spCaffeNetOutputBlob; + boost::shared_ptr> spTensorRTNetOutputBlob; std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; - 
DELETE_COPY(PoseExtractorCaffe); + DELETE_COPY(PoseExtractorTensorRT); }; } -#endif // OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP +#endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #endif diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index bc4374782..0bd1dc6df 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -5,20 +5,20 @@ #include #include #include -#include +#include namespace op { - PoseExtractorCaffe::PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, + PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, const ScaleMode heatMapScale) : PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, - spResizeAndMergeCaffe{std::make_shared>()}, - spNmsCaffe{std::make_shared>()}, - spBodyPartConnectorCaffe{std::make_shared>()} + spResizeAndMergeTensorRT{std::make_shared>()}, + spNmsTensorRT{std::make_shared>()}, + spBodyPartConnectorTensorRT{std::make_shared>()} { try { @@ -33,35 +33,35 @@ namespace op } } - PoseExtractorCaffe::~PoseExtractorCaffe() + PoseExtractorTensorRT::~PoseExtractorTensorRT() { } - void PoseExtractorCaffe::netInitializationOnThread() + void PoseExtractorTensorRT::netInitializationOnThread() { try { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); - // Caffe net + // TensorRT net spNet->initializationOnThread(); - spCaffeNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); + spTensorRTNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + spResizeAndMergeTensorRT->Reshape({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer spPeaksBlob = {std::make_shared>(1,1,1,1)}; - spNmsCaffe->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + spNmsTensorRT->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer spPoseBlob = {std::make_shared>(1,1,1,1)}; - spBodyPartConnectorCaffe->setPoseModel(mPoseModel); - spBodyPartConnectorCaffe->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); + spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); + spBodyPartConnectorTensorRT->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); cudaCheck(__LINE__, __FUNCTION__, __FILE__); log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); @@ -72,7 +72,7 @@ namespace op } } - void PoseExtractorCaffe::forwardPass(const Array& inputNetData, const Point& inputDataSize, 
const std::vector& scaleRatios) + void PoseExtractorTensorRT::forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios) { try { @@ -80,25 +80,25 @@ namespace op if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - // 1. Caffe deep network + // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms // 2. Resize heat maps + merge different scales - spResizeAndMergeCaffe->setScaleRatios(scaleRatios); + spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); #ifndef CPU_ONLY - spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else - error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif // 3. Get peaks by Non-Maximum Suppression - spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); + spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY - spNmsCaffe->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms + spNmsTensorRT->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else - error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif // Get scale net to output @@ -107,15 +107,15 @@ namespace op mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; // 4. 
Connecting body parts - spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); - spBodyPartConnectorCaffe->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); + spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); // GPU version not implemented yet - spBodyPartConnectorCaffe->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); - // spBodyPartConnectorCaffe->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + spBodyPartConnectorTensorRT->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); + // spBodyPartConnectorTensorRT->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); } catch (const std::exception& e) { @@ -123,7 +123,7 @@ namespace op } } - const float* PoseExtractorCaffe::getHeatMapCpuConstPtr() const + const float* PoseExtractorTensorRT::getHeatMapCpuConstPtr() const { try { @@ -137,7 +137,7 @@ namespace op } } - const float* PoseExtractorCaffe::getHeatMapGpuConstPtr() const + const float* PoseExtractorTensorRT::getHeatMapGpuConstPtr() const { try { @@ -151,7 +151,7 @@ namespace op } } - const float* PoseExtractorCaffe::getPoseGpuConstPtr() const + const float* PoseExtractorTensorRT::getPoseGpuConstPtr() const { try { From c05580d2838e4c9e33bba560de72ba7ec895b01b Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 09:55:25 +0000 Subject: [PATCH 04/52] Started modifying tutorial pose 3. --- .../tutorial_pose/3_extract_from_image_TensorRT.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index f431d322d..14e13fac5 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -1,4 +1,4 @@ -// ------------------------- OpenPose Library Tutorial - Pose - Example 1 - Extract from Image ------------------------- +// ------------------------- OpenPose Library Tutorial - Pose - Example 3 - Extract from Image with TensorRT ------------------------- // This first example shows the user how to: // 1. Load an image (`filestream` module) // 2. Extract the pose of that image (`pose` module) @@ -52,12 +52,12 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering."); -int openPoseTutorialPose1() +int openPoseTutorialPose3() { op::log("Starting pose estimation.", op::Priority::High); const auto timerBegin = std::chrono::high_resolution_clock::now(); - op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High); + op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- // Step 1 - Set logging level // - 0 will output all the logging messages @@ -114,6 +114,7 @@ int openPoseTutorialPose1() poseRenderer.renderPose(outputArray, poseKeypoints); // Step 5 - OpenPose output format to cv::Mat auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); + const auto now = std::chrono::high_resolution_clock::now(); // ------------------------- SHOWING RESULT AND CLOSING ------------------------- // Step 1 - Show results @@ -121,9 +122,9 @@ int openPoseTutorialPose1() // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); - const auto now = std::chrono::high_resolution_clock::now(); const auto totalTimeSec = (double)std::chrono::duration_cast(now-timerBegin).count() * 1e-9; const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; + op::log(message, op::Priority::High); // Return successful message return 0; @@ -132,11 +133,11 @@ int openPoseTutorialPose1() int main(int argc, char *argv[]) { // Initializing google logging (Caffe uses it for logging) - google::InitGoogleLogging("openPoseTutorialPose1"); + google::InitGoogleLogging("openPoseTutorialPose3"); // Parsing command line flags gflags::ParseCommandLineFlags(&argc, &argv, true); // Running openPoseTutorialPose1 - return openPoseTutorialPose1(); + return openPoseTutorialPose3(); } From 9a97e934eb5d76fa6d5e61f0738b83c146a75464 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 10:43:08 +0000 Subject: [PATCH 05/52] More precise timing. --- .../3_extract_from_image_TensorRT.cpp | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index 14e13fac5..a8e0b9e38 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -52,11 +52,21 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering."); + +static std::vector> timings; + +static void time_now(const std::string& label){ + const auto now = std::chrono::high_resolution_clock::now(); + const auto timing = std::make_pair(label, now); + timings.push_back(timing); +} + int openPoseTutorialPose3() { op::log("Starting pose estimation.", op::Priority::High); - const auto timerBegin = std::chrono::high_resolution_clock::now(); - + + time_now("Start"); + op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- // Step 1 - Set logging level @@ -100,6 +110,7 @@ int openPoseTutorialPose3() cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); if(inputImage.empty()) op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__); + time_now("Step 1"); // Step 2 - Format input image to OpenPose input and output formats op::Array netInputArray; std::vector scaleRatios; @@ -107,14 +118,17 @@ int openPoseTutorialPose3() double scaleInputToOutput; op::Array outputArray; std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); + time_now("Step 2"); // Step 3 - Estimate poseKeypoints poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); + time_now("Step 3"); // Step 4 - Render poseKeypoints poseRenderer.renderPose(outputArray, poseKeypoints); + time_now("Step 4"); // Step 5 - OpenPose output format to cv::Mat auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); - const auto now = std::chrono::high_resolution_clock::now(); + time_now("Step 5"); // ------------------------- SHOWING RESULT AND CLOSING ------------------------- // Step 1 - Show results @@ -122,9 +136,15 @@ int openPoseTutorialPose3() // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); - const auto totalTimeSec = (double)std::chrono::duration_cast(now-timerBegin).count() * 1e-9; + const auto totalTimeSec = (double)std::chrono::duration_cast(timings.back().second-timings.front().second).count() * 1e-9; const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; op::log(message, op::Priority::High); + + for(const auto timing : timings) { + const auto log_time = timing.first + " - " + std::to_string((double)std::chrono::duration_cast>(timing.second - timings.front().second).count()); + op::log(log_time, op::Priority::High); + } + // Return successful message return 0; From 4778ed6c257972f66c8a35f8d819d25e3e58e10e Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:10:21 +0000 Subject: [PATCH 06/52] More precise timings before replacing inference. --- .../3_extract_from_image_TensorRT.cpp | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index a8e0b9e38..b36f362b9 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -52,20 +52,26 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering."); +typedef std::vector> OpTimings; -static std::vector> timings; +static OpTimings timings; -static void time_now(const std::string& label){ +static void timeNow(const std::string& label){ const auto now = std::chrono::high_resolution_clock::now(); const auto timing = std::make_pair(label, now); timings.push_back(timing); } +static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, + const std::chrono::high_resolution_clock::time_point& t2 ) { + return std::to_string((double)std::chrono::duration_cast>(t1 - t2).count()); +} + int openPoseTutorialPose3() { op::log("Starting pose estimation.", op::Priority::High); - time_now("Start"); + timeNow("Start"); op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High); // ------------------------- INITIALIZATION ------------------------- @@ -104,13 +110,15 @@ int openPoseTutorialPose3() // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. we init resources here) poseExtractorCaffe.initializationOnThread(); poseRenderer.initializationOnThread(); + + timeNow("Initialization"); // ------------------------- POSE ESTIMATION AND RENDERING ------------------------- // Step 1 - Read and load image, error if empty (possibly wrong path) cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); if(inputImage.empty()) op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__); - time_now("Step 1"); + timeNow("Step 1"); // Step 2 - Format input image to OpenPose input and output formats op::Array netInputArray; std::vector scaleRatios; @@ -118,17 +126,17 @@ int openPoseTutorialPose3() double scaleInputToOutput; op::Array outputArray; std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); - time_now("Step 2"); + timeNow("Step 2"); // Step 3 - Estimate poseKeypoints poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); - time_now("Step 3"); + timeNow("Step 3"); // Step 4 - Render poseKeypoints poseRenderer.renderPose(outputArray, poseKeypoints); - time_now("Step 4"); + timeNow("Step 4"); // Step 5 - OpenPose output format to cv::Mat auto outputImage = opOutputToCvMat.formatToCvMat(outputArray); - time_now("Step 5"); + timeNow("Step 5"); // ------------------------- SHOWING RESULT AND CLOSING ------------------------- // Step 1 - Show results @@ -136,12 +144,12 @@ int openPoseTutorialPose3() // Step 2 - Logging information message op::log("Example 1 successfully finished.", op::Priority::High); - const auto totalTimeSec = (double)std::chrono::duration_cast(timings.back().second-timings.front().second).count() * 1e-9; - const auto message = "Pose estimation successfully finished. Total time: " + std::to_string(totalTimeSec) + " seconds."; + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. 
Total time: " + totalTimeSec + " seconds."; op::log(message, op::Priority::High); - for(const auto timing : timings) { - const auto log_time = timing.first + " - " + std::to_string((double)std::chrono::duration_cast>(timing.second - timings.front().second).count()); + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); op::log(log_time, op::Priority::High); } From 9c258b71f6008ad7457a2201dd0e130e21802957 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:23:35 +0000 Subject: [PATCH 07/52] Clearer timing display. --- examples/tutorial_pose/3_extract_from_image_TensorRT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index b36f362b9..1b7ed7f6d 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -64,7 +64,7 @@ static void timeNow(const std::string& label){ static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, const std::chrono::high_resolution_clock::time_point& t2 ) { - return std::to_string((double)std::chrono::duration_cast>(t1 - t2).count()); + return std::to_string((double)std::chrono::duration_cast>(t1 - t2).count() * 1e3) + " ms"; } int openPoseTutorialPose3() From e6fbd253786bcbdda372e4f08bc373f60f1d2998 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:52:38 +0000 Subject: [PATCH 08/52] Replaced poseExtractorCaffe with poseExtractorTensorRT --- examples/tutorial_pose/3_extract_from_image_TensorRT.cpp | 8 ++++---- include/openpose/pose/headers.hpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index 1b7ed7f6d..f4e7eace1 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -100,7 +100,7 @@ int openPoseTutorialPose3() // Step 3 - Initialize all required classes op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap}; op::CvMatToOpOutput cvMatToOpOutput{outputSize}; - op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, + op::PoseExtractorTensorRT poseExtractorTensorRT{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, FLAGS_model_folder, FLAGS_num_gpu_start}; op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold, !FLAGS_disable_blending, (float)FLAGS_alpha_pose}; @@ -108,7 +108,7 @@ int openPoseTutorialPose3() const op::Point windowedSize = outputSize; op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 1"}; // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. 
we init resources here) - poseExtractorCaffe.initializationOnThread(); + poseExtractorTensorRT.initializationOnThread(); poseRenderer.initializationOnThread(); timeNow("Initialization"); @@ -128,8 +128,8 @@ int openPoseTutorialPose3() std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage); timeNow("Step 2"); // Step 3 - Estimate poseKeypoints - poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); - const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints(); + poseExtractorTensorRT.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios); + const auto poseKeypoints = poseExtractorTensorRT.getPoseKeypoints(); timeNow("Step 3"); // Step 4 - Render poseKeypoints poseRenderer.renderPose(outputArray, poseKeypoints); diff --git a/include/openpose/pose/headers.hpp b/include/openpose/pose/headers.hpp index 4fe06d461..4d336060a 100644 --- a/include/openpose/pose/headers.hpp +++ b/include/openpose/pose/headers.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include From f290fc57f750db208923d34695de4a018d317f29 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Thu, 21 Sep 2017 12:53:37 +0000 Subject: [PATCH 09/52] Added inference sample code at end of poseExtractorTensorRT to work on laptop. DOES NOT compile, convenience commit. --- src/openpose/pose/poseExtractorTensorRT.cpp | 448 ++++++++++++++++++++ 1 file changed, 448 insertions(+) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 0bd1dc6df..0ea2e30e6 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -168,3 +168,451 @@ namespace op } #endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ +{ \ + if (status != 0) \ + { \ + std::cout << "Cuda failure: " << status; \ + abort(); \ + } \ +} + +struct Params +{ + std::string deployFile, modelFile, engine, calibrationCache; + std::vector outputs; + int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; + bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; +} gParams; + +static inline int volume(DimsCHW dims) +{ + return dims.c()*dims.h()*dims.w(); +} + +std::vector gInputs; +std::map gInputDimensions; + +// Logger for GIE info/warning/errors +class Logger : public ILogger +{ + void log(Severity severity, const char* msg) override + { + // suppress info-level messages + if (severity != Severity::kINFO || gParams.verbose) + std::cout << msg << std::endl; + } +} gLogger; + +class RndInt8Calibrator : public IInt8EntropyCalibrator +{ +public: + RndInt8Calibrator(int totalSamples = 1) + : mTotalSamples(totalSamples) + , mCurrentSample(0) + { + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + for(auto& elem: gInputDimensions) + { + int elemCount = volume(elem.second); + + std::vector rnd_data(elemCount); + for(auto& val: rnd_data) + val = distribution(generator); + + void * data; + CHECK(cudaMalloc(&data, elemCount * sizeof(float))); + CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice)); + + mInputDeviceBuffers.insert(std::make_pair(elem.first, data)); + } + } + + ~RndInt8Calibrator() + { + for(auto& elem: 
mInputDeviceBuffers) + CHECK(cudaFree(elem.second)); + } + + int getBatchSize() const override + { + return 1; + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) override + { + if (mCurrentSample >= mTotalSamples) + return false; + + for(int i = 0; i < nbBindings; ++i) + bindings[i] = mInputDeviceBuffers[names[i]]; + + ++mCurrentSample; + return true; + } + + const void* readCalibrationCache(size_t&) override + { + return nullptr; + } + + virtual void writeCalibrationCache(const void*, size_t) override + { + } + +private: + int mTotalSamples; + int mCurrentSample; + std::map mInputDeviceBuffers; +}; + +ICudaEngine* caffeToGIEModel() +{ + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), + gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), + *network, + gParams.half2 ? DataType::kHALF:DataType::kFLOAT); + + + if (!blobNameToTensor) + return nullptr; + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // specify which tensors are outputs + for (auto& s : gParams.outputs) + { + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + } + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(gParams.batchSize); + builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); + builder->setHalf2Mode(gParams.half2); + + RndInt8Calibrator calibrator; + if (gParams.int8) + { + builder->setInt8Mode(true); + builder->setInt8Calibrator(&calibrator); + } + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + return engine; +} + +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, 
localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + +void doInference(ICudaEngine& engine) +{ + IExecutionContext *context = engine.createExecutionContext(); + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. + + std::vector buffers(gInputs.size() + gParams.outputs.size()); + for (size_t i = 0; i < gInputs.size(); i++) + createMemory(engine, buffers, gInputs[i]); + + for (size_t i = 0; i < gParams.outputs.size(); i++) + createMemory(engine, buffers, gParams.outputs[i]); + + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); + + for (int j = 0; j < gParams.iterations; j++) + { + float total = 0, ms; + for (int i = 0; i < gParams.avgRuns; i++) + { + if (gParams.hostTime) + { + auto t_start = std::chrono::high_resolution_clock::now(); + context->execute(gParams.batchSize, &buffers[0]); + auto t_end = std::chrono::high_resolution_clock::now(); + ms = std::chrono::duration(t_end - t_start).count(); + } + else + { + cudaEventRecord(start, stream); + context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); + cudaEventRecord(end, stream); + cudaEventSynchronize(end); + cudaEventElapsedTime(&ms, start, end); + } + total += ms; + } + total /= gParams.avgRuns; + std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; + } + + + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); +} + + + +static void printUsage() +{ + printf("\n"); + printf("Mandatory params:\n"); + printf(" --deploy= Caffe deploy file\n"); + printf(" --output= Output blob name (can be specified multiple times)\n"); + + printf("\nOptional params:\n"); + + printf(" --model= Caffe model file (default = no model, random weights used)\n"); + printf(" --batch=N Set batch size (default = %d)\n", gParams.batchSize); + printf(" --device=N Set cuda device to N (default = %d)\n", gParams.device); + printf(" --iterations=N Run N iterations (default = %d)\n", gParams.iterations); + printf(" --avgRuns=N Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", gParams.avgRuns); + printf(" --workspace=N Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize); + printf(" --half2 Run in paired fp16 mode (default = false)\n"); + printf(" --int8 Run in int8 mode (default = false)\n"); + printf(" --verbose Use verbose logging (default = false)\n"); + printf(" --hostTime Measure host time rather than GPU time (default = false)\n"); + printf(" --engine= Generate a serialized GIE engine\n"); + printf(" --calib= Read INT8 calibration cache file\n"); + + fflush(stdout); +} + +bool parseString(const char* arg, const char* name, std::string& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; + if (match) + { + value = arg + n + 3; + std::cout << name << ": " << value << std::endl; + } + return match; +} + +bool parseInt(const char* arg, const char* name, int& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; + if (match) + { + value = atoi(arg + n + 3); + std::cout << name << ": " << value << std::endl; + } + return match; +} + +bool parseBool(const char* arg, const char* 
name, bool& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n); + if (match) + { + std::cout << name << std::endl; + value = true; + } + return match; + +} + + +bool parseArgs(int argc, char* argv[]) +{ + if (argc < 3) + { + printUsage(); + return false; + } + + for (int j = 1; j < argc; j++) + { + if (parseString(argv[j], "model", gParams.modelFile) || parseString(argv[j], "deploy", gParams.deployFile) || parseString(argv[j], "engine", gParams.engine)) + continue; + + if (parseString(argv[j], "calib", gParams.calibrationCache)) + continue; + + std::string output; + if (parseString(argv[j], "output", output)) + { + gParams.outputs.push_back(output); + continue; + } + + if (parseInt(argv[j], "batch", gParams.batchSize) || parseInt(argv[j], "iterations", gParams.iterations) || parseInt(argv[j], "avgRuns", gParams.avgRuns) + || parseInt(argv[j], "device", gParams.device) || parseInt(argv[j], "workspace", gParams.workspaceSize)) + continue; + + if (parseBool(argv[j], "half2", gParams.half2) || parseBool(argv[j], "int8", gParams.int8) + || parseBool(argv[j], "verbose", gParams.verbose) || parseBool(argv[j], "hostTime", gParams.hostTime)) + continue; + + printf("Unknown argument: %s\n", argv[j]); + return false; + } + return true; +} + +static ICudaEngine* createEngine() +{ + ICudaEngine *engine; + + if (!gParams.deployFile.empty()) { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + + if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + } + return engine; + } + + // load directly from serialized engine file if deploy not specified + if (!gParams.engine.empty()) { + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(gParams.engine, std::ios::binary); + if (file.good()) { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + if (gieModelStream) delete [] gieModelStream; + + // assume input to be "data" for deserialized engine + gInputs.push_back("data"); + return engine; + } + + // complain about empty deploy file + std::cerr << "Deploy file not specified" << std::endl; + return nullptr; +} + +int main(int argc, char** argv) +{ + // create a GIE model from the caffe model and serialize it to a stream + + if (!parseArgs(argc, argv)) + return -1; + + cudaSetDevice(gParams.device); + + if (gParams.outputs.size() == 0) + { + std::cerr << "At least one network output must be defined" << std::endl; + return -1; + } + + ICudaEngine* engine = createEngine(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return -1; + } + + doInference(*engine); + engine->destroy(); + + return 0; +} From ddc23969a8574ca4e99752ea65a7f273981f6c55 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 11:48:58 +0200 Subject: [PATCH 10/52] First code adaptation trial. Will not compile, still loads to replace. 
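The adaptation keeps the sample's engine-build path unchanged for now; stripped of option
parsing, it reduces to roughly the sequence below (condensed from the code in this diff; the
file paths and output blob name are placeholders, not values taken from this patch):

    // Parse a Caffe deploy/model pair and build a half2 (paired fp16) TensorRT
    // engine, using the API calls from the bundled sample; error checks omitted.
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    const IBlobNameToTensor* blobs = parser->parse("pose_deploy.prototxt",      // placeholder path
                                                   "pose_iter.caffemodel",      // placeholder path
                                                   *network, DataType::kHALF);
    network->markOutput(*blobs->find("net_output"));  // placeholder output blob name
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(16 << 20);           // 16 MB workspace
    builder->setHalf2Mode(true);
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    IHostMemory* plan = engine->serialize();          // optionally written to disk and reloaded later
    parser->destroy();
    network->destroy();
    builder->destroy();
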
--- .../openpose/pose/poseExtractorTensorRT.hpp | 12 + src/openpose/pose/poseExtractorTensorRT.cpp | 728 +++++++----------- 2 files changed, 293 insertions(+), 447 deletions(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 270d2a8f4..33f781b8a 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -43,6 +43,18 @@ namespace op std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; + + // TensorRT stuff + const Point mNetInputSize; + const Point mNetOutputSize; + const Point mOutputSize; + const int mScaleNumber; + const PoseModel mPoseModel; + const std::string mModelFolder; + const int mGpuId; + const std::vector mHeatMapTypes; + const ScaleMode mHeatMapScale; + ICudaEngine* cudaEngine; DELETE_COPY(PoseExtractorTensorRT); }; diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 0ea2e30e6..c7f3311d8 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -7,6 +7,261 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ +{ \ +if (status != 0) \ +{ \ +std::cout << "Cuda failure: " << status; \ +abort(); \ +} \ +} + +struct Params +{ + std::string deployFile, modelFile, engine, calibrationCache; + std::vector outputs; + int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; + bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; +} gParams; + +static inline int volume(DimsCHW dims) +{ + return dims.c()*dims.h()*dims.w(); +} + +std::vector gInputs; +std::map gInputDimensions; + +// Logger for GIE info/warning/errors +class Logger : public ILogger +{ + void log(Severity severity, const char* msg) override + { + // suppress info-level messages + if (severity != Severity::kINFO || gParams.verbose) + std::cout << msg << std::endl; + } +} gLogger; + + +ICudaEngine* caffeToGIEModel() +{ + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), + gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), + *network, + gParams.half2 ? 
DataType::kHALF:DataType::kFLOAT); + + + if (!blobNameToTensor) + return nullptr; + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // specify which tensors are outputs + + for (auto& s : gParams.outputs) + { + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + } + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); + builder->setHalf2Mode(true); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + return engine; +} + +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + +void doInference(ICudaEngine& engine) +{ + IExecutionContext *context = engine.createExecutionContext(); + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. 
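+    // Each named blob is looked up with getBindingIndex(), so the order of the
+    // createMemory() calls below does not matter: every binding slot in
+    // `buffers` just has to be filled before execute()/enqueue() runs.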
+ + std::vector buffers(gInputs.size() + gParams.outputs.size()); + for (size_t i = 0; i < gInputs.size(); i++) + createMemory(engine, buffers, gInputs[i]); + + for (size_t i = 0; i < gParams.outputs.size(); i++) + createMemory(engine, buffers, gParams.outputs[i]); + + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); + + for (int j = 0; j < gParams.iterations; j++) + { + float total = 0, ms; + for (int i = 0; i < gParams.avgRuns; i++) + { + if (gParams.hostTime) + { + auto t_start = std::chrono::high_resolution_clock::now(); + context->execute(gParams.batchSize, &buffers[0]); + auto t_end = std::chrono::high_resolution_clock::now(); + ms = std::chrono::duration(t_end - t_start).count(); + } + else + { + cudaEventRecord(start, stream); + context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); + cudaEventRecord(end, stream); + cudaEventSynchronize(end); + cudaEventElapsedTime(&ms, start, end); + } + total += ms; + } + total /= gParams.avgRuns; + std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; + } + + + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); +} + + +static ICudaEngine* createEngine() +{ + // TODO replace all gParams with corresponding parameters + ICudaEngine *engine; + + if (!gParams.deployFile.empty()) { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + + if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + } + return engine; + } + + // load directly from serialized engine file if deploy not specified + if (!gParams.engine.empty()) { + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(gParams.engine, std::ios::binary); + if (file.good()) { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + if (gieModelStream) delete [] gieModelStream; + + // assume input to be "data" for deserialized engine + gInputs.push_back("data"); + return engine; + } + + // complain about empty deploy file + std::cerr << "Deploy file not specified" << std::endl; + return nullptr; +} + + + namespace op { PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, @@ -18,7 +273,16 @@ namespace op modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()} + spBodyPartConnectorTensorRT{std::make_shared>()}, + mNetInputSize(netInputSize), + mNetOutputSize(netOutputSize), + mOutputSize(outputSize), + mScaleNumber(scaleNumber), + mPoseModel(poseModel), + mModelFolder(modelFolder), + mGpuId(gpuId), + mHeatMapTypes(heatMapTypes), + mHeatMapScale(heatMapScale) { try { @@ -35,6 +299,9 @@ namespace op PoseExtractorTensorRT::~PoseExtractorTensorRT() { + if(cudaEngine) 
+ engine->destroy(); + } void PoseExtractorTensorRT::netInitializationOnThread() @@ -42,6 +309,13 @@ namespace op try { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + + cudaEngine = createEngine(); + if (!cudaEngine) + { + std::cerr << "Engine could not be created" << std::endl; + return -1; + } // TensorRT net spNet->initializationOnThread(); @@ -81,7 +355,12 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); // 1. TensorRT deep network - spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + //spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + + doInference(inputNetData.getConstPtr()); + + // Replace spNet->forward pass, but how to propagate to next + // Replace spTensorRTNetOututBlob.get() ? // 2. Resize heat maps + merge different scales spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); @@ -168,451 +447,6 @@ namespace op } #endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "NvInfer.h" -#include "NvCaffeParser.h" -using namespace nvinfer1; -using namespace nvcaffeparser1; - -#define CHECK(status) \ -{ \ - if (status != 0) \ - { \ - std::cout << "Cuda failure: " << status; \ - abort(); \ - } \ -} -struct Params -{ - std::string deployFile, modelFile, engine, calibrationCache; - std::vector outputs; - int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; - bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; -} gParams; - -static inline int volume(DimsCHW dims) -{ - return dims.c()*dims.h()*dims.w(); -} - -std::vector gInputs; -std::map gInputDimensions; - -// Logger for GIE info/warning/errors -class Logger : public ILogger -{ - void log(Severity severity, const char* msg) override - { - // suppress info-level messages - if (severity != Severity::kINFO || gParams.verbose) - std::cout << msg << std::endl; - } -} gLogger; - -class RndInt8Calibrator : public IInt8EntropyCalibrator -{ -public: - RndInt8Calibrator(int totalSamples = 1) - : mTotalSamples(totalSamples) - , mCurrentSample(0) - { - std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0F, 1.0F); - for(auto& elem: gInputDimensions) - { - int elemCount = volume(elem.second); - - std::vector rnd_data(elemCount); - for(auto& val: rnd_data) - val = distribution(generator); - - void * data; - CHECK(cudaMalloc(&data, elemCount * sizeof(float))); - CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice)); - - mInputDeviceBuffers.insert(std::make_pair(elem.first, data)); - } - } - - ~RndInt8Calibrator() - { - for(auto& elem: mInputDeviceBuffers) - CHECK(cudaFree(elem.second)); - } - - int getBatchSize() const override - { - return 1; - } - - bool getBatch(void* bindings[], const char* names[], int nbBindings) override - { - if (mCurrentSample >= mTotalSamples) - return false; - - for(int i = 0; i < nbBindings; ++i) - bindings[i] = mInputDeviceBuffers[names[i]]; - - ++mCurrentSample; - return true; - } - - const void* readCalibrationCache(size_t&) override - { - return nullptr; - } - - virtual void writeCalibrationCache(const void*, size_t) override - { - } - -private: - int mTotalSamples; - int mCurrentSample; - std::map mInputDeviceBuffers; -}; - -ICudaEngine* caffeToGIEModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set 
the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), - gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), - *network, - gParams.half2 ? DataType::kHALF:DataType::kFLOAT); - - - if (!blobNameToTensor) - return nullptr; - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // specify which tensors are outputs - for (auto& s : gParams.outputs) - { - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - std::cout << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - } - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // Build the engine - builder->setMaxBatchSize(gParams.batchSize); - builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); - builder->setHalf2Mode(gParams.half2); - - RndInt8Calibrator calibrator; - if (gParams.int8) - { - builder->setInt8Mode(true); - builder->setInt8Calibrator(&calibrator); - } - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - return engine; -} - -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - -void doInference(ICudaEngine& engine) -{ - IExecutionContext *context = engine.createExecutionContext(); - // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), - // of these, but in this case we know that there is exactly one input and one output. 
- - std::vector buffers(gInputs.size() + gParams.outputs.size()); - for (size_t i = 0; i < gInputs.size(); i++) - createMemory(engine, buffers, gInputs[i]); - - for (size_t i = 0; i < gParams.outputs.size(); i++) - createMemory(engine, buffers, gParams.outputs[i]); - - cudaStream_t stream; - CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; - CHECK(cudaEventCreate(&start)); - CHECK(cudaEventCreate(&end)); - - for (int j = 0; j < gParams.iterations; j++) - { - float total = 0, ms; - for (int i = 0; i < gParams.avgRuns; i++) - { - if (gParams.hostTime) - { - auto t_start = std::chrono::high_resolution_clock::now(); - context->execute(gParams.batchSize, &buffers[0]); - auto t_end = std::chrono::high_resolution_clock::now(); - ms = std::chrono::duration(t_end - t_start).count(); - } - else - { - cudaEventRecord(start, stream); - context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); - cudaEventRecord(end, stream); - cudaEventSynchronize(end); - cudaEventElapsedTime(&ms, start, end); - } - total += ms; - } - total /= gParams.avgRuns; - std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; - } - - - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); -} - - - -static void printUsage() -{ - printf("\n"); - printf("Mandatory params:\n"); - printf(" --deploy= Caffe deploy file\n"); - printf(" --output= Output blob name (can be specified multiple times)\n"); - - printf("\nOptional params:\n"); - - printf(" --model= Caffe model file (default = no model, random weights used)\n"); - printf(" --batch=N Set batch size (default = %d)\n", gParams.batchSize); - printf(" --device=N Set cuda device to N (default = %d)\n", gParams.device); - printf(" --iterations=N Run N iterations (default = %d)\n", gParams.iterations); - printf(" --avgRuns=N Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", gParams.avgRuns); - printf(" --workspace=N Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize); - printf(" --half2 Run in paired fp16 mode (default = false)\n"); - printf(" --int8 Run in int8 mode (default = false)\n"); - printf(" --verbose Use verbose logging (default = false)\n"); - printf(" --hostTime Measure host time rather than GPU time (default = false)\n"); - printf(" --engine= Generate a serialized GIE engine\n"); - printf(" --calib= Read INT8 calibration cache file\n"); - - fflush(stdout); -} - -bool parseString(const char* arg, const char* name, std::string& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = arg + n + 3; - std::cout << name << ": " << value << std::endl; - } - return match; -} - -bool parseInt(const char* arg, const char* name, int& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = atoi(arg + n + 3); - std::cout << name << ": " << value << std::endl; - } - return match; -} - -bool parseBool(const char* arg, const char* name, bool& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n); - if (match) - { - std::cout << name << std::endl; - value = true; - } - return match; - -} - - -bool parseArgs(int argc, char* argv[]) -{ - if (argc < 3) - { - printUsage(); - return false; - } - - for (int j = 1; j < argc; j++) - { - if (parseString(argv[j], "model", gParams.modelFile) || 
parseString(argv[j], "deploy", gParams.deployFile) || parseString(argv[j], "engine", gParams.engine)) - continue; - - if (parseString(argv[j], "calib", gParams.calibrationCache)) - continue; - - std::string output; - if (parseString(argv[j], "output", output)) - { - gParams.outputs.push_back(output); - continue; - } - - if (parseInt(argv[j], "batch", gParams.batchSize) || parseInt(argv[j], "iterations", gParams.iterations) || parseInt(argv[j], "avgRuns", gParams.avgRuns) - || parseInt(argv[j], "device", gParams.device) || parseInt(argv[j], "workspace", gParams.workspaceSize)) - continue; - - if (parseBool(argv[j], "half2", gParams.half2) || parseBool(argv[j], "int8", gParams.int8) - || parseBool(argv[j], "verbose", gParams.verbose) || parseBool(argv[j], "hostTime", gParams.hostTime)) - continue; - - printf("Unknown argument: %s\n", argv[j]); - return false; - } - return true; -} - -static ICudaEngine* createEngine() -{ - ICudaEngine *engine; - - if (!gParams.deployFile.empty()) { - engine = caffeToGIEModel(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - - - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - } - return engine; - } - - // load directly from serialized engine file if deploy not specified - if (!gParams.engine.empty()) { - char *gieModelStream{nullptr}; - size_t size{0}; - std::ifstream file(gParams.engine, std::ios::binary); - if (file.good()) { - file.seekg(0, file.end); - size = file.tellg(); - file.seekg(0, file.beg); - gieModelStream = new char[size]; - assert(gieModelStream); - file.read(gieModelStream, size); - file.close(); - } - - IRuntime* infer = createInferRuntime(gLogger); - engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); - if (gieModelStream) delete [] gieModelStream; - - // assume input to be "data" for deserialized engine - gInputs.push_back("data"); - return engine; - } - - // complain about empty deploy file - std::cerr << "Deploy file not specified" << std::endl; - return nullptr; -} - -int main(int argc, char** argv) -{ - // create a GIE model from the caffe model and serialize it to a stream - - if (!parseArgs(argc, argv)) - return -1; - - cudaSetDevice(gParams.device); - - if (gParams.outputs.size() == 0) - { - std::cerr << "At least one network output must be defined" << std::endl; - return -1; - } - - ICudaEngine* engine = createEngine(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return -1; - } - - doInference(*engine); - engine->destroy(); - - return 0; -} From f09f27b498ac2aaa8c54f9ae33b7c5864bdb3b95 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 16:49:22 +0200 Subject: [PATCH 11/52] New netTensorRT version, cleaner, ready for debug, loads of questions. 
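
The giexec leftovers move out of poseExtractorTensorRT.cpp into a dedicated
op::NetTensorRT that mirrors the NetCaffe interface, so the extractor no
longer needs to know which backend runs the network. Intended call pattern,
as a sketch (argument types follow the NetCaffe equivalent; caffe::Blob stays
the exchange type for now):

    auto net = std::make_shared<op::NetTensorRT>(
        std::array<int, 4>{scaleNumber, 3, netInputSize.y, netInputSize.x},
        modelFolder + POSE_PROTOTXT[(int)poseModel],
        modelFolder + POSE_TRAINED_MODEL[(int)poseModel],
        gpuId);                          // lastBlobName defaults to "net_output"
    net->initializationOnThread();       // builds the GIE engine from the prototxt
    net->forwardPass(inputData);         // one TensorRT inference on the input buffer
    auto output = net->getOutputBlob();  // boost::shared_ptr<caffe::Blob<float>>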
--- include/openpose/core/netTensorRT.hpp | 62 ++++ .../openpose/pose/poseExtractorTensorRT.hpp | 12 - src/openpose/core/netTensorRT.cpp | 313 ++++++++++++++++++ src/openpose/pose/poseExtractorTensorRT.cpp | 268 +-------------- 4 files changed, 378 insertions(+), 277 deletions(-) create mode 100644 include/openpose/core/netTensorRT.hpp create mode 100644 src/openpose/core/netTensorRT.cpp diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp new file mode 100644 index 000000000..0874f475b --- /dev/null +++ b/include/openpose/core/netTensorRT.hpp @@ -0,0 +1,62 @@ +#ifdef USE_CAFFE +#ifndef OPENPOSE_CORE_NET_TENSORRT_HPP +#define OPENPOSE_CORE_NET_TENSORRT_HPP + +#include +#include +#include + +#include + +namespace op +{ + class OP_API NetTensorRT : public Net + { + public: + NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0, + const std::string& lastBlobName = "net_output"); + + virtual ~NetTensorRT(); + + void initializationOnThread(); + + // Alternative a) getInputDataCpuPtr or getInputDataGpuPtr + forwardPass + float* getInputDataCpuPtr() const; + + float* getInputDataGpuPtr() const; + + // Alternative b) + void forwardPass(const float* const inputNetData = nullptr) const; + + boost::shared_ptr> getOutputBlob() const; + + private: + // Init with constructor + const int mGpuId; + const std::array mNetInputSize4D; + const unsigned long mNetInputMemory; + const std::string mCaffeProto; + const std::string mCaffeTrainedModel; + const std::string mLastBlobName; + // Init with thread + std::unique_ptr> upTensorRTNet; + boost::shared_ptr> spOutputBlob; + + // TensorRT stuff + const Point mNetInputSize; + const Point mNetOutputSize; + const Point mOutputSize; + const int mScaleNumber; + const PoseModel mPoseModel; + const std::string mModelFolder; + const int mGpuId; + const std::vector mHeatMapTypes; + const ScaleMode mHeatMapScale; + nvinfer1::ICudaEngine* cudaEngine; + + DELETE_COPY(NetTensorRT); + }; +} + +#endif // OPENPOSE_CORE_NET_TENSORRT_HPP +#endif diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 33f781b8a..270d2a8f4 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -43,18 +43,6 @@ namespace op std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; - - // TensorRT stuff - const Point mNetInputSize; - const Point mNetOutputSize; - const Point mOutputSize; - const int mScaleNumber; - const PoseModel mPoseModel; - const std::string mModelFolder; - const int mGpuId; - const std::vector mHeatMapTypes; - const ScaleMode mHeatMapScale; - ICudaEngine* cudaEngine; DELETE_COPY(PoseExtractorTensorRT); }; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp new file mode 100644 index 000000000..fe6c7202f --- /dev/null +++ b/src/openpose/core/netTensorRT.cpp @@ -0,0 +1,313 @@ +#ifdef USE_CAFFE +#include // std::accumulate +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ +{ \ +if (status != 0) \ +{ \ +std::cout << "Cuda failure: " << status; \ +abort(); \ +} \ +} + +std::vector gInputs; +std::vector gInputDimensions; + + + +// Logger 
for GIE info/warning/errors +class Logger : public ILogger +{ + void log(Severity severity, const char* msg) override + { + // if suppress info-level message: if (severity != Severity::kINFO) + std::cout << msg << std::endl; + } +} gLogger; + + +ICudaEngine* caffeToGIEModel() +{ + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), + mCaffeTrainedModel.c_str(), + *network, + DataType::kHALF); + + if (!blobNameToTensor) + return nullptr; + + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // specify which tensors are outputs + + + // TODO, if it works switch to something more generic, add as parameter etc + std::string s("net_output"); + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + // 16 megabytes, default in giexec. No idea what's best for Jetson though, + // maybe check dusty_nv's code on github + builder->setMaxWorkspaceSize(16<<20); + builder->setHalf2Mode(true); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + return engine; +} + +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + const int batchSize = 1; + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + + +static ICudaEngine* createEngine() +{ + ICudaEngine *engine; + + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + /* TODO seems unneeded, remove if so. 
+ if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + }*/ + return engine; +} + + +namespace op +{ + NetTensorRT::NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName) : + mGpuId{gpuId}, + // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++ + mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]}, + mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies()) * sizeof(float)}, + mCaffeProto{caffeProto}, + mCaffeTrainedModel{caffeTrainedModel}, + mLastBlobName{lastBlobName} + { + } + + NetTensorRT::~NetTensorRT() + { + if (cudaEngine) + cudaEngine->destroy(); + } + + void NetTensorRT::initializationOnThread() + { + try + { + // Initialize net + cudaSetDevice(mGpuId); + + cudaEngine = createEngine(); + if (!cudaEngine) + { + std::cerr << "Engine could not be created" << std::endl; + return; + } + + // For tensor RT is done in caffeToGIE + /* + //caffe::TensorRT::SetDevice(mGpuId); + upTensorRTNet.reset(new caffe::Net{mTensorRTProto, caffe::TEST}); + upTensorRTNet->CopyTrainedLayersFrom(mTensorRTTrainedModel); + upTensorRTNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); + upTensorRTNet->Reshape(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + // Set spOutputBlob + spOutputBlob = upTensorRTNet->blob_by_name(mLastBlobName); + if (spOutputBlob == nullptr) + error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); + cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + float* NetTensorRT::getInputDataCpuPtr() const + { + try + { + return upTensorRTNet->blobs().at(0)->mutable_cpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + float* NetTensorRT::getInputDataGpuPtr() const + { + try + { + return upTensorRTNet->blobs().at(0)->mutable_gpu_data(); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + void NetTensorRT::forwardPass(const float* const inputData) const + { + try + { + // Copy frame data to GPU memory + if (inputData != nullptr) + { + + // OLD + //auto* gpuImagePtr = upTensorRTNet->blobs().at(0)->mutable_gpu_data(); + //cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); + + // Tensor RT version + + // TODO maybe move this to init and keep only the execute part + IExecutionContext *context = cudaEngine.createExecutionContext(); + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. 
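+            // Scaffolding warning: a fresh IExecutionContext (and fresh device
+            // buffers below) on every forward pass is bring-up code only. Once
+            // this works, the context and buffers belong in
+            // initializationOnThread() so the hot path is just execute().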
+ + std::vector buffers(gInputs.size() + 1); + for (size_t i = 0; i < gInputs.size(); i++) + createMemory(engine, buffers, gInputs[i]); + + + createMemory(engine, buffers, std::string("net_output")); + + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); + + int batchSize = 1; + context->execute(batchSize, &buffers[0]); + + + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); + + } + // Old Perform deep network forward pass + //upTensorRTNet->ForwardFrom(0); + //cudaCheck(__LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + boost::shared_ptr> NetTensorRT::getOutputBlob() const + { + try + { + return spOutputBlob; + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } +} + +#endif diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index c7f3311d8..000510f1b 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,5 +1,6 @@ #ifdef USE_CAFFE #include +#include #include #include #include @@ -7,260 +8,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvCaffeParser.h" - -using namespace nvinfer1; -using namespace nvcaffeparser1; - -#define CHECK(status) \ -{ \ -if (status != 0) \ -{ \ -std::cout << "Cuda failure: " << status; \ -abort(); \ -} \ -} - -struct Params -{ - std::string deployFile, modelFile, engine, calibrationCache; - std::vector outputs; - int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 }; - bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false }; -} gParams; - -static inline int volume(DimsCHW dims) -{ - return dims.c()*dims.h()*dims.w(); -} - -std::vector gInputs; -std::map gInputDimensions; - -// Logger for GIE info/warning/errors -class Logger : public ILogger -{ - void log(Severity severity, const char* msg) override - { - // suppress info-level messages - if (severity != Severity::kINFO || gParams.verbose) - std::cout << msg << std::endl; - } -} gLogger; - - -ICudaEngine* caffeToGIEModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), - gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(), - *network, - gParams.half2 ? 
DataType::kHALF:DataType::kFLOAT); - - - if (!blobNameToTensor) - return nullptr; - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // specify which tensors are outputs - - for (auto& s : gParams.outputs) - { - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - std::cout << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - } - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // Build the engine - builder->setMaxBatchSize(1); - builder->setMaxWorkspaceSize(gParams.workspaceSize<<20); - builder->setHalf2Mode(true); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - return engine; -} - -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - -void doInference(ICudaEngine& engine) -{ - IExecutionContext *context = engine.createExecutionContext(); - // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), - // of these, but in this case we know that there is exactly one input and one output. 
- - std::vector buffers(gInputs.size() + gParams.outputs.size()); - for (size_t i = 0; i < gInputs.size(); i++) - createMemory(engine, buffers, gInputs[i]); - - for (size_t i = 0; i < gParams.outputs.size(); i++) - createMemory(engine, buffers, gParams.outputs[i]); - - cudaStream_t stream; - CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; - CHECK(cudaEventCreate(&start)); - CHECK(cudaEventCreate(&end)); - - for (int j = 0; j < gParams.iterations; j++) - { - float total = 0, ms; - for (int i = 0; i < gParams.avgRuns; i++) - { - if (gParams.hostTime) - { - auto t_start = std::chrono::high_resolution_clock::now(); - context->execute(gParams.batchSize, &buffers[0]); - auto t_end = std::chrono::high_resolution_clock::now(); - ms = std::chrono::duration(t_end - t_start).count(); - } - else - { - cudaEventRecord(start, stream); - context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); - cudaEventRecord(end, stream); - cudaEventSynchronize(end); - cudaEventElapsedTime(&ms, start, end); - } - total += ms; - } - total /= gParams.avgRuns; - std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl; - } - - - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); -} - - -static ICudaEngine* createEngine() -{ - // TODO replace all gParams with corresponding parameters - ICudaEngine *engine; - - if (!gParams.deployFile.empty()) { - engine = caffeToGIEModel(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - - - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - } - return engine; - } - - // load directly from serialized engine file if deploy not specified - if (!gParams.engine.empty()) { - char *gieModelStream{nullptr}; - size_t size{0}; - std::ifstream file(gParams.engine, std::ios::binary); - if (file.good()) { - file.seekg(0, file.end); - size = file.tellg(); - file.seekg(0, file.beg); - gieModelStream = new char[size]; - assert(gieModelStream); - file.read(gieModelStream, size); - file.close(); - } - - IRuntime* infer = createInferRuntime(gLogger); - engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); - if (gieModelStream) delete [] gieModelStream; - - // assume input to be "data" for deserialized engine - gInputs.push_back("data"); - return engine; - } - - // complain about empty deploy file - std::cerr << "Deploy file not specified" << std::endl; - return nullptr; -} - - namespace op { @@ -269,7 +16,7 @@ namespace op const ScaleMode heatMapScale) : PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, - spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, + spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, @@ -299,9 +46,6 @@ namespace op PoseExtractorTensorRT::~PoseExtractorTensorRT() { - if(cudaEngine) - engine->destroy(); - } void PoseExtractorTensorRT::netInitializationOnThread() @@ -310,16 +54,10 @@ namespace op { log("Starting initialization on 
thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); - cudaEngine = createEngine(); - if (!cudaEngine) - { - std::cerr << "Engine could not be created" << std::endl; - return -1; - } // TensorRT net spNet->initializationOnThread(); - spTensorRTNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob(); + spTensorRTNetOutputBlob = ((NetTensorRT*)spNet.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer From ba2b435c0178fb09625eef4fdeb1999b3e067ac5 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 15:34:26 +0000 Subject: [PATCH 12/52] Fixed everything to compile, runs, reads network and convert but then segfault, need precise step logs or debug. --- Makefile | 3 ++ include/openpose/core/netTensorRT.hpp | 11 +------ models/pose/coco/pose_deploy_linevec.prototxt | 4 +-- src/openpose/core/netTensorRT.cpp | 32 +++++++++---------- src/openpose/pose/poseExtractorTensorRT.cpp | 14 ++------ 5 files changed, 24 insertions(+), 40 deletions(-) diff --git a/Makefile b/Makefile index 0cdc9bf39..c46c08e25 100644 --- a/Makefile +++ b/Makefile @@ -145,6 +145,9 @@ ifneq ($(CPU_ONLY), 1) LIBRARIES += cudart cublas curand endif +# TensorRT +LIBRARIES += nvinfer nvcaffe_parser + # LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 caffe LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 0874f475b..05f7bc860 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include "NvInfer.h" namespace op { @@ -43,15 +43,6 @@ namespace op boost::shared_ptr> spOutputBlob; // TensorRT stuff - const Point mNetInputSize; - const Point mNetOutputSize; - const Point mOutputSize; - const int mScaleNumber; - const PoseModel mPoseModel; - const std::string mModelFolder; - const int mGpuId; - const std::vector mHeatMapTypes; - const ScaleMode mHeatMapScale; nvinfer1::ICudaEngine* cudaEngine; DELETE_COPY(NetTensorRT); diff --git a/models/pose/coco/pose_deploy_linevec.prototxt b/models/pose/coco/pose_deploy_linevec.prototxt index fbe0c8245..6e4322812 100755 --- a/models/pose/coco/pose_deploy_linevec.prototxt +++ b/models/pose/coco/pose_deploy_linevec.prototxt @@ -1,8 +1,8 @@ input: "image" input_dim: 1 input_dim: 3 -input_dim: 1 # This value will be defined at runtime -input_dim: 1 # This value will be defined at runtime +input_dim: 96 # This value will be defined at runtime +input_dim: 128 # This value will be defined at runtime layer { name: "conv1_1" type: "Convolution" diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index fe6c7202f..0fc5e6e58 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -23,7 +23,7 @@ using namespace nvinfer1; using namespace nvcaffeparser1; -#define CHECK(status) \ +#define CUDA_TENSORRT_CHECK(status) \ { \ if (status != 0) \ { \ @@ -33,7 +33,7 @@ abort(); \ } std::vector gInputs; -std::vector gInputDimensions; +std::map gInputDimensions; @@ -48,7 +48,7 @@ class Logger : public ILogger } gLogger; -ICudaEngine* caffeToGIEModel() +ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& caffeTrainedModel) { // create the builder IBuilder* builder = createInferBuilder(gLogger); @@ -56,8 +56,8 @@ ICudaEngine* caffeToGIEModel() // parse the caffe model to populate the network, then set the outputs INetworkDefinition* 
network = builder->createNetwork(); ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), - mCaffeTrainedModel.c_str(), + const IBlobNameToTensor* blobNameToTensor = parser->parse(caffeProto.c_str(), + caffeTrainedModel.c_str(), *network, DataType::kHALF); @@ -124,24 +124,24 @@ void createMemory(const ICudaEngine& engine, std::vector& buffers, const localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); + CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; exit(1); } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); delete[] localMem; buffers[bindingIndex] = deviceMem; } -static ICudaEngine* createEngine() +static ICudaEngine* createEngine(const std::string& caffeProto, const std::string& caffeTrainedModel) { ICudaEngine *engine; - engine = caffeToGIEModel(); + engine = caffeToGIEModel(caffeProto, caffeTrainedModel); if (!engine) { std::cerr << "Engine could not be created" << std::endl; @@ -192,7 +192,7 @@ namespace op // Initialize net cudaSetDevice(mGpuId); - cudaEngine = createEngine(); + cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) { std::cerr << "Engine could not be created" << std::endl; @@ -260,22 +260,22 @@ namespace op // Tensor RT version // TODO maybe move this to init and keep only the execute part - IExecutionContext *context = cudaEngine.createExecutionContext(); + IExecutionContext *context = cudaEngine->createExecutionContext(); // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. std::vector buffers(gInputs.size() + 1); for (size_t i = 0; i < gInputs.size(); i++) - createMemory(engine, buffers, gInputs[i]); + createMemory(*cudaEngine, buffers, gInputs[i]); - createMemory(engine, buffers, std::string("net_output")); + createMemory(*cudaEngine, buffers, std::string("net_output")); cudaStream_t stream; - CHECK(cudaStreamCreate(&stream)); + CUDA_TENSORRT_CHECK(cudaStreamCreate(&stream)); cudaEvent_t start, end; - CHECK(cudaEventCreate(&start)); - CHECK(cudaEventCreate(&end)); + CUDA_TENSORRT_CHECK(cudaEventCreate(&start)); + CUDA_TENSORRT_CHECK(cudaEventCreate(&end)); int batchSize = 1; context->execute(batchSize, &buffers[0]); diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 000510f1b..722524eee 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -20,16 +20,7 @@ namespace op modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()}, - mNetInputSize(netInputSize), - mNetOutputSize(netOutputSize), - mOutputSize(outputSize), - mScaleNumber(scaleNumber), - mPoseModel(poseModel), - mModelFolder(modelFolder), - mGpuId(gpuId), - mHeatMapTypes(heatMapTypes), - mHeatMapScale(heatMapScale) + spBodyPartConnectorTensorRT{std::make_shared>()} { try { @@ -93,9 +84,8 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); // 1. 
TensorRT deep network - //spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms + spNet->forwardPass(inputNetData.getConstPtr()); - doInference(inputNetData.getConstPtr()); // Replace spNet->forward pass, but how to propagate to next // Replace spTensorRTNetOututBlob.get() ? From 97bbc05fc01bc78244d0e82ee7c708183ae2bd89 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Fri, 22 Sep 2017 17:49:12 +0200 Subject: [PATCH 13/52] Debug logs. --- src/openpose/core/netTensorRT.cpp | 27 +++++++++++++++++++++ src/openpose/pose/poseExtractorTensorRT.cpp | 4 +++ 2 files changed, 31 insertions(+) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 0fc5e6e58..7759bfbbd 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -187,11 +187,18 @@ namespace op void NetTensorRT::initializationOnThread() { + + std::cout << "InitializationOnThread : start" << std::endl; + try { + + std::cout << "Forward Pass : setting device" << std::endl; // Initialize net cudaSetDevice(mGpuId); + std::cout << "Forward Pass : creating engine" << std::endl; + cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) { @@ -199,6 +206,8 @@ namespace op return; } + std::cout << "Forward Pass : done" << std::endl; + // For tensor RT is done in caffeToGIE /* //caffe::TensorRT::SetDevice(mGpuId); @@ -247,6 +256,8 @@ namespace op void NetTensorRT::forwardPass(const float* const inputData) const { + + std::cout << "Forward Pass : start" << std::endl; try { // Copy frame data to GPU memory @@ -260,10 +271,15 @@ namespace op // Tensor RT version // TODO maybe move this to init and keep only the execute part + + std::cout << "Forward Pass : creating execution context" << std::endl; + IExecutionContext *context = cudaEngine->createExecutionContext(); // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. + std::cout << "Forward Pass : creating CUDA memory" << std::endl; + std::vector buffers(gInputs.size() + 1); for (size_t i = 0; i < gInputs.size(); i++) createMemory(*cudaEngine, buffers, gInputs[i]); @@ -271,16 +287,24 @@ namespace op createMemory(*cudaEngine, buffers, std::string("net_output")); + + std::cout << "Forward Pass : memory created" << std::endl; + cudaStream_t stream; CUDA_TENSORRT_CHECK(cudaStreamCreate(&stream)); cudaEvent_t start, end; CUDA_TENSORRT_CHECK(cudaEventCreate(&start)); CUDA_TENSORRT_CHECK(cudaEventCreate(&end)); + + std::cout << "Forward Pass : executing inference" << std::endl; + int batchSize = 1; context->execute(batchSize, &buffers[0]); + std::cout << "Forward Pass : inference done !" << std::endl; + cudaStreamDestroy(stream); cudaEventDestroy(start); cudaEventDestroy(end); @@ -298,6 +322,7 @@ namespace op boost::shared_ptr> NetTensorRT::getOutputBlob() const { + std::cout << "Getting output blob." << std::endl; try { return spOutputBlob; @@ -307,6 +332,8 @@ namespace op error(e.what(), __LINE__, __FUNCTION__, __FILE__); return nullptr; } + + std::cout << "Got something..." 
<< std::endl; } } diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 722524eee..a9a0abb35 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -83,9 +83,13 @@ namespace op if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + + + std::cout << "Forward Pass Pose: tensorrt forward pass" << std::endl; // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); + std::cout << "Forward Pass Pose: tensorrt passed !" << std::endl; // Replace spNet->forward pass, but how to propagate to next // Replace spTensorRTNetOututBlob.get() ? From c666163991f53c6d6cd88556b85eb30580821447 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 25 Sep 2017 14:51:17 +0200 Subject: [PATCH 14/52] First try on tensorRT inference with caffe Blobs. --- include/openpose/core/netTensorRT.hpp | 3 +- src/openpose/core/netTensorRT.cpp | 80 +++++++++++++++------------ 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 05f7bc860..6e92ddb07 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -39,7 +39,8 @@ namespace op const std::string mCaffeTrainedModel; const std::string mLastBlobName; // Init with thread - std::unique_ptr> upTensorRTNet; + + boost::shared_ptr> spInputBlob; boost::shared_ptr> spOutputBlob; // TensorRT stuff diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 7759bfbbd..648265d7c 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -110,32 +110,6 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c return engine; } -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - const int batchSize = 1; - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - static ICudaEngine* createEngine(const std::string& caffeProto, const std::string& caffeTrainedModel) { @@ -193,11 +167,11 @@ namespace op try { - std::cout << "Forward Pass : setting device" << std::endl; + std::cout << "InitializationOnThread : setting device" << std::endl; // Initialize net cudaSetDevice(mGpuId); - std::cout << "Forward Pass : creating engine" << std::endl; + std::cout << "InitializationOnThread : creating engine" << std::endl; cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) @@ -206,7 +180,12 @@ namespace op return; } - std::cout << "Forward Pass : done" << std::endl; + std::cout << "InitializationOnThread : done" << std::endl; + + + + spInputBlob = 
std::make_shared>({1, 3, 128, 96}); + spOutputBlob = std::make_shared>({1, 57, 46, 82}); // For tensor RT is done in caffeToGIE /* @@ -215,9 +194,9 @@ namespace op upTensorRTNet->CopyTrainedLayersFrom(mTensorRTTrainedModel); upTensorRTNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); upTensorRTNet->Reshape(); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ // Set spOutputBlob - spOutputBlob = upTensorRTNet->blob_by_name(mLastBlobName); + /* if (spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ @@ -280,12 +259,40 @@ namespace op std::cout << "Forward Pass : creating CUDA memory" << std::endl; - std::vector buffers(gInputs.size() + 1); - for (size_t i = 0; i < gInputs.size(); i++) - createMemory(*cudaEngine, buffers, gInputs[i]); + /* + const int batchSize = 1; + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + std::cout"name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; + */ + + std::vector buffers(2); + buffers[0] = spInputBlob->mutable_gpu_data(); + buffers[1] = spOutputBlob->mutable_gpu_data(); + + //createMemory(*cudaEngine, buffers, gInputs[i]); - createMemory(*cudaEngine, buffers, std::string("net_output")); + + //createMemory(*cudaEngine, buffers, std::string("net_output")); std::cout << "Forward Pass : memory created" << std::endl; @@ -303,8 +310,11 @@ namespace op context->execute(batchSize, &buffers[0]); + std::cout << "Forward Pass : inference done !" << std::endl; + + cudaStreamDestroy(stream); cudaEventDestroy(start); cudaEventDestroy(end); From 1c77534a0def82067d4946d9741473f95685640d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 25 Sep 2017 17:37:08 +0000 Subject: [PATCH 15/52] Running, but not pose recognition. Find a way to copy memory correctly. --- src/openpose/core/netCaffe.cpp | 1 + src/openpose/core/netTensorRT.cpp | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/openpose/core/netCaffe.cpp b/src/openpose/core/netCaffe.cpp index 12562ff85..ac03d80c0 100644 --- a/src/openpose/core/netCaffe.cpp +++ b/src/openpose/core/netCaffe.cpp @@ -34,6 +34,7 @@ namespace op cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Set spOutputBlob spOutputBlob = upCaffeNet->blob_by_name(mLastBlobName); + std::cout << spOutputBlob->num() << " " << spOutputBlob->channels() << " " << spOutputBlob->height() << " " << spOutputBlob->width(); if (spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? 
(Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__); diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 648265d7c..28eb09f09 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "NvInfer.h" #include "NvCaffeParser.h" @@ -184,8 +185,8 @@ namespace op - spInputBlob = std::make_shared>({1, 3, 128, 96}); - spOutputBlob = std::make_shared>({1, 57, 46, 82}); + spInputBlob = boost::make_shared>(1, 3, 128, 96); + spOutputBlob = boost::make_shared>(1, 57, 46, 82); // For tensor RT is done in caffeToGIE /* @@ -211,7 +212,7 @@ namespace op { try { - return upTensorRTNet->blobs().at(0)->mutable_cpu_data(); + return spInputBlob->mutable_cpu_data(); } catch (const std::exception& e) { @@ -224,7 +225,7 @@ namespace op { try { - return upTensorRTNet->blobs().at(0)->mutable_gpu_data(); + return spInputBlob->mutable_gpu_data(); } catch (const std::exception& e) { @@ -307,8 +308,9 @@ namespace op std::cout << "Forward Pass : executing inference" << std::endl; int batchSize = 1; + spInputBlob->Update(); context->execute(batchSize, &buffers[0]); - + spOutputBlob->Update(); std::cout << "Forward Pass : inference done !" << std::endl; From 1380b140ddcd8edcb0f8a09010d4ce8c68d3ba6a Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 25 Sep 2017 17:38:24 +0000 Subject: [PATCH 16/52] pose.sh script --- pose.sh | 1 + 1 file changed, 1 insertion(+) create mode 100755 pose.sh diff --git a/pose.sh b/pose.sh new file mode 100755 index 000000000..14f3a4deb --- /dev/null +++ b/pose.sh @@ -0,0 +1 @@ +./build/examples/openpose/openpose.bin -camera_resolution 640x480 -net_resolution 128x96 From 32f53873f8002323e9208e05a5bb05cd8d39bdfa Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 09:54:12 +0000 Subject: [PATCH 17/52] Timing in original pose demo --- .../tutorial_pose/1_extract_from_image.cpp | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/examples/tutorial_pose/1_extract_from_image.cpp b/examples/tutorial_pose/1_extract_from_image.cpp index 48cbcbb96..417f73de0 100644 --- a/examples/tutorial_pose/1_extract_from_image.cpp +++ b/examples/tutorial_pose/1_extract_from_image.cpp @@ -52,10 +52,29 @@ DEFINE_double(render_threshold, 0.05, "Only estimated keypoint DEFINE_double(alpha_pose, 0.6, "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will" " hide it. 
Only valid for GPU rendering.");
 
+typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;
+
+static OpTimings timings;
+
+static void timeNow(const std::string& label){
+    const auto now = std::chrono::high_resolution_clock::now();
+    const auto timing = std::make_pair(label, now);
+    timings.push_back(timing);
+}
+
+static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1,
+                                    const std::chrono::high_resolution_clock::time_point& t2 ) {
+    return std::to_string((double)std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms";
+}
+
+
 int openPoseTutorialPose1()
 {
     op::log("OpenPose Library Tutorial - Example 1.", op::Priority::High);
     // ------------------------- INITIALIZATION -------------------------
+
+    timeNow("Start");
+
     // Step 1 - Set logging level
     //     - 0 will output all the logging messages
     //     - 255 will output nothing
@@ -92,11 +111,14 @@ int openPoseTutorialPose1()
     poseExtractorCaffe.initializationOnThread();
     poseRenderer.initializationOnThread();
+
+    timeNow("Initialization");
+
     // ------------------------- POSE ESTIMATION AND RENDERING -------------------------
     // Step 1 - Read and load image, error if empty (possibly wrong path)
     cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
     if(inputImage.empty())
         op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
+    timeNow("Step 1");
     // Step 2 - Format input image to OpenPose input and output formats
     op::Array<float> netInputArray;
     std::vector<float> scaleRatios;
@@ -104,20 +126,35 @@
     double scaleInputToOutput;
     op::Array<float> outputArray;
     std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
+    timeNow("Step 2");
     // Step 3 - Estimate poseKeypoints
     poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
     const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints();
+    timeNow("Step 3");
     // Step 4 - Render poseKeypoints
     poseRenderer.renderPose(outputArray, poseKeypoints);
+    timeNow("Step 4");
     // Step 5 - OpenPose output format to cv::Mat
     auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
-
+    timeNow("Step 5");
+
     // ------------------------- SHOWING RESULT AND CLOSING -------------------------
     // Step 1 - Show results
     frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0)
     // Step 2 - Logging information message
     op::log("Example 1 successfully finished.", op::Priority::High);
     // Return successful message
+
+    const auto totalTime = timeDiffToString(timings.back().second, timings.front().second);
+    const auto message = "Pose estimation successfully finished. Total time: " + totalTime + ".";
+    op::log(message, op::Priority::High);
+
+    for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) {
+        const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second);
+        op::log(log_time, op::Priority::High);
+    }
+
     return 0;
 }

From d2310db589d2c9b85f52697f9936fb8100c90ff6 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Tue, 26 Sep 2017 11:56:41 +0200
Subject: [PATCH 18/52] Did not take into account forwardPass input data!
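
Until now forwardPass() bound spInputBlob's GPU pointer without ever copying the
caller's inputData into it, so the engine ran on whatever happened to be in that
device memory. Roughly, the intended flow is the sketch below (a minimal
illustration of the copy-then-bind pattern, not the literal patch):

    // Copy the host-side input into the blob that backs binding 0, then run.
    auto* gpuImagePtr = spInputBlob->mutable_gpu_data();   // device pointer owned by the blob
    cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice);
    std::vector<void*> buffers(2);
    buffers[0] = spInputBlob->mutable_gpu_data();          // input binding
    buffers[1] = spOutputBlob->mutable_gpu_data();         // output binding
    context->execute(/*batchSize=*/1, &buffers[0]);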
--- src/openpose/core/netTensorRT.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 28eb09f09..ca593522c 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -244,9 +244,11 @@ namespace op if (inputData != nullptr) { + + // OLD - //auto* gpuImagePtr = upTensorRTNet->blobs().at(0)->mutable_gpu_data(); - //cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); + auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); + cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); // Tensor RT version From 576c055fffb2de7e76a244e34c1392111d713325 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 14:10:12 +0000 Subject: [PATCH 19/52] Data copied to cuda memory. Correct sizes hardcoded, no CUDA error anymore, still not working. --- src/openpose/core/netCaffe.cpp | 4 +++- src/openpose/core/netTensorRT.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/openpose/core/netCaffe.cpp b/src/openpose/core/netCaffe.cpp index ac03d80c0..2108d0178 100644 --- a/src/openpose/core/netCaffe.cpp +++ b/src/openpose/core/netCaffe.cpp @@ -32,9 +32,11 @@ namespace op upCaffeNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); upCaffeNet->Reshape(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); + boost::shared_ptr> spInputBlob = upCaffeNet->blobs().at(0); + std::cout << "Input Blob size : " << spInputBlob->num() << " " << spInputBlob->channels() << " " << spInputBlob->height() << " " << spInputBlob->width() << std::endl; // Set spOutputBlob spOutputBlob = upCaffeNet->blob_by_name(mLastBlobName); - std::cout << spOutputBlob->num() << " " << spOutputBlob->channels() << " " << spOutputBlob->height() << " " << spOutputBlob->width(); + std::cout << "Output Blob size : " << spOutputBlob->num() << " " << spOutputBlob->channels() << " " << spOutputBlob->height() << " " << spOutputBlob->width() << std::endl; if (spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__); diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index ca593522c..6bca6acd4 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -60,7 +60,7 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c const IBlobNameToTensor* blobNameToTensor = parser->parse(caffeProto.c_str(), caffeTrainedModel.c_str(), *network, - DataType::kHALF); + DataType::kFLOAT); if (!blobNameToTensor) return nullptr; @@ -97,8 +97,8 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c builder->setMaxBatchSize(1); // 16 megabytes, default in giexec. 
No idea what's best for Jetson though, // maybe check dusty_nv's code on github - builder->setMaxWorkspaceSize(16<<20); - builder->setHalf2Mode(true); + builder->setMaxWorkspaceSize(32<<20); + builder->setHalf2Mode(false); ICudaEngine* engine = builder->buildCudaEngine(*network); if (engine == nullptr) @@ -185,7 +185,7 @@ namespace op - spInputBlob = boost::make_shared>(1, 3, 128, 96); + spInputBlob = boost::make_shared>(1, 3, 368, 656); spOutputBlob = boost::make_shared>(1, 57, 46, 82); // For tensor RT is done in caffeToGIE @@ -248,7 +248,7 @@ namespace op // OLD auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); - cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice); + CUDA_TENSORRT_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); // Tensor RT version From e5d27fec2553be3c2081661e68e1d2a471c379b0 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 15:09:21 +0000 Subject: [PATCH 20/52] Tutorial pose 3 working !!!! Gaining x2 inference time, now time for cleaning. --- models/pose/coco/pose_deploy_linevec.prototxt | 4 +- src/openpose/core/netTensorRT.cpp | 57 +++++++++++++++++-- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/models/pose/coco/pose_deploy_linevec.prototxt b/models/pose/coco/pose_deploy_linevec.prototxt index 6e4322812..c310c8785 100755 --- a/models/pose/coco/pose_deploy_linevec.prototxt +++ b/models/pose/coco/pose_deploy_linevec.prototxt @@ -1,8 +1,8 @@ input: "image" input_dim: 1 input_dim: 3 -input_dim: 96 # This value will be defined at runtime -input_dim: 128 # This value will be defined at runtime +input_dim: 368 # This value will be defined at runtime +input_dim: 656 # This value will be defined at runtime layer { name: "conv1_1" type: "Convolution" diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 6bca6acd4..2ffc550f7 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -48,6 +48,31 @@ class Logger : public ILogger } } gLogger; +void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) +{ + size_t bindingIndex = engine.getBindingIndex(name.c_str()); + printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); + assert(bindingIndex < buffers.size()); + DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); + size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*1, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + delete[] localMem; + buffers[bindingIndex] = deviceMem; +} + ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& caffeTrainedModel) { @@ -294,6 +319,26 @@ namespace op //createMemory(*cudaEngine, buffers, gInputs[i]); + const int batchSize = 1; + size_t eltCount = 1*57*46*82*batchSize, memSize = eltCount * sizeof(float); + + float* localMem = new float[eltCount]; + for (size_t i = 0; i < eltCount; i++) + localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; + + void* deviceMem; + CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << 
std::endl; + exit(1); + } + CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + + + buffers[1] = deviceMem; + //spOutputBlob->set_gpu_data((float*)deviceMem); + //createMemory(*cudaEngine, buffers, std::string("net_output")); @@ -309,12 +354,14 @@ namespace op std::cout << "Forward Pass : executing inference" << std::endl; - int batchSize = 1; - spInputBlob->Update(); + //spInputBlob->Update(); context->execute(batchSize, &buffers[0]); - spOutputBlob->Update(); - - + //spOutputBlob->Update(); + spOutputBlob->set_gpu_data((float*)deviceMem); + //CUDA_TENSORRT_CHECK(cudaMemcpy(localMem, buffers[1], memSize, cudaMemcpyDeviceToHost)); + //spOutputBlob->set_cpu_data((float*)localMem); + + delete[] localMem; std::cout << "Forward Pass : inference done !" << std::endl; From 7d370957d82c8c402fabe6e1992b5f86dcc203b0 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 18:13:17 +0000 Subject: [PATCH 21/52] TensorRT Net input and output dimensions at runtime. --- include/openpose/core/netTensorRT.hpp | 1 + src/openpose/core/netTensorRT.cpp | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 6e92ddb07..36436ca93 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -34,6 +34,7 @@ namespace op // Init with constructor const int mGpuId; const std::array mNetInputSize4D; + std::array mNetOutputSize4D; const unsigned long mNetInputMemory; const std::string mCaffeProto; const std::string mCaffeTrainedModel; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 2ffc550f7..4991dba3a 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -97,6 +97,9 @@ ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& c gInputs.push_back(network->getInput(i)->getName()); gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; + if( i > 0) + std::err << "Multiple output unsupported for now!" << std:endl; } // specify which tensors are outputs @@ -209,9 +212,10 @@ namespace op std::cout << "InitializationOnThread : done" << std::endl; - - spInputBlob = boost::make_shared>(1, 3, 368, 656); - spOutputBlob = boost::make_shared>(1, 57, 46, 82); + std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; + + spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); + spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); // For tensor RT is done in caffeToGIE /* From f3a898c553074f8bf8dc9ee214def049377761d3 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:15:45 +0200 Subject: [PATCH 22/52] NetTensorRT cleaning. 
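
Engine construction, previously free functions, moves into NetTensorRT, and the
IExecutionContext is now created once in initializationOnThread() instead of on
every forward pass. Schematically, the consolidated build path is as follows (a
sketch of the TensorRT Caffe-parser flow already used above, error handling
omitted):

    // Parse the deploy prototxt and weights into a TensorRT network,
    // mark the output tensor, then build the engine and a reusable context.
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    const IBlobNameToTensor* blobNameToTensor =
        parser->parse(mCaffeProto.c_str(), mCaffeTrainedModel.c_str(), *network, DataType::kFLOAT);
    network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str()));
    builder->setMaxBatchSize(1);
    builder->setMaxWorkspaceSize(32 << 20);   // 32 MB scratch space, as in the patch
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    IExecutionContext* context = engine->createExecutionContext(); // reused by forwardPass()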
--- include/openpose/core/netTensorRT.hpp | 3 + src/openpose/core/netTensorRT.cpp | 346 ++++++++++---------------- 2 files changed, 132 insertions(+), 217 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 36436ca93..41df6141a 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -46,6 +46,9 @@ namespace op // TensorRT stuff nvinfer1::ICudaEngine* cudaEngine; + nvinfer1::IExecutionContext* cudaContext; + ICudaEngine* caffeToGIEModel(); + ICudaEngine* createEngine(); DELETE_COPY(NetTensorRT); }; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 4991dba3a..8c6a4fc48 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -24,20 +24,11 @@ using namespace nvinfer1; using namespace nvcaffeparser1; -#define CUDA_TENSORRT_CHECK(status) \ -{ \ -if (status != 0) \ -{ \ -std::cout << "Cuda failure: " << status; \ -abort(); \ -} \ -} std::vector gInputs; std::map gInputDimensions; - // Logger for GIE info/warning/errors class Logger : public ILogger { @@ -48,126 +39,6 @@ class Logger : public ILogger } } gLogger; -void createMemory(const ICudaEngine& engine, std::vector& buffers, const std::string& name) -{ - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - printf("name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*1, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; -} - - -ICudaEngine* caffeToGIEModel(const std::string& caffeProto, const std::string& caffeTrainedModel) -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(caffeProto.c_str(), - caffeTrainedModel.c_str(), - *network, - DataType::kFLOAT); - - if (!blobNameToTensor) - return nullptr; - - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; - if( i > 0) - std::err << "Multiple output unsupported for now!" 
<< std:endl; - } - - // specify which tensors are outputs - - - // TODO, if it works switch to something more generic, add as parameter etc - std::string s("net_output"); - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - std::cout << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - } - - // Build the engine - builder->setMaxBatchSize(1); - // 16 megabytes, default in giexec. No idea what's best for Jetson though, - // maybe check dusty_nv's code on github - builder->setMaxWorkspaceSize(32<<20); - builder->setHalf2Mode(false); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - return engine; -} - - -static ICudaEngine* createEngine(const std::string& caffeProto, const std::string& caffeTrainedModel) -{ - ICudaEngine *engine; - - engine = caffeToGIEModel(caffeProto, caffeTrainedModel); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - - /* TODO seems unneeded, remove if so. - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - }*/ - return engine; -} - namespace op { @@ -180,14 +51,119 @@ namespace op mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cudaEvent_t start, end; + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&end)); } NetTensorRT::~NetTensorRT() { + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); + if (cudaEngine) cudaEngine->destroy(); } + + NetTensorRT::ICudaEngine* caffeToGIEModel() + { + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), + mCaffeTrainedModel.c_str(), + *network, + DataType::kFLOAT); + + if (!blobNameToTensor) + return nullptr; + + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; + if( i > 0) + std::err << "Multiple output unsupported for now!" 
<< std:endl; + } + + // specify which tensors are outputs + + + // TODO, if it works switch to something more generic, add as parameter etc + std::string s("net_output"); + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + std::cout << "could not find output blob " << s << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(s.c_str())); + + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + // 16 megabytes, default in giexec. No idea what's best for Jetson though, + // maybe check dusty_nv's code on github + builder->setMaxWorkspaceSize(32<<20); + builder->setHalf2Mode(false); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + + return engine; + } + + + NetTensorRT::ICudaEngine* createEngine() + { + ICudaEngine *engine; + + engine = caffeToGIEModel(caffeProto, caffeTrainedModel); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + + /* TODO Serialize and load engines for given net size as optim quite long + if (!gParams.engine.empty()) + { + std::ofstream p(gParams.engine); + if (!p) + { + std::cerr << "could not open plan output file" << std::endl; + return nullptr; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + }*/ + return engine; + } + void NetTensorRT::initializationOnThread() { @@ -205,7 +181,16 @@ namespace op cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); if (!cudaEngine) { - std::cerr << "Engine could not be created" << std::endl; + std::cerr << "cudaEngine could not be created" << std::endl; + return; + } + + std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; + + cudaContext = cudaEngine->createExecutionContext(); + if (!cudaContext) + { + std::cerr << "cudaContext could not be created" << std::endl; return; } @@ -217,19 +202,7 @@ namespace op spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); - // For tensor RT is done in caffeToGIE - /* - //caffe::TensorRT::SetDevice(mGpuId); - upTensorRTNet.reset(new caffe::Net{mTensorRTProto, caffe::TEST}); - upTensorRTNet->CopyTrainedLayersFrom(mTensorRTTrainedModel); - upTensorRTNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]}); - upTensorRTNet->Reshape(); - cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ - // Set spOutputBlob - /* - if (spOutputBlob == nullptr) - error("The output blob is a nullptr. Did you use the same name than the prototxt? 
(Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); - cudaCheck(__LINE__, __FUNCTION__, __FILE__);*/ + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) { @@ -272,51 +245,14 @@ namespace op // Copy frame data to GPU memory if (inputData != nullptr) { - - - - // OLD auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); - CUDA_TENSORRT_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); - // Tensor RT version - - // TODO maybe move this to init and keep only the execute part - - std::cout << "Forward Pass : creating execution context" << std::endl; - - IExecutionContext *context = cudaEngine->createExecutionContext(); // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. std::cout << "Forward Pass : creating CUDA memory" << std::endl; - - /* - const int batchSize = 1; - size_t bindingIndex = engine.getBindingIndex(name.c_str()); - std::cout"name=%s, bindingIndex=%d, buffers.size()=%d\n", name.c_str(), (int)bindingIndex, (int)buffers.size()); - assert(bindingIndex < buffers.size()); - DimsCHW dimensions = static_cast(engine.getBindingDimensions((int)bindingIndex)); - size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - delete[] localMem; - buffers[bindingIndex] = deviceMem; - */ - std::vector buffers(2); buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); @@ -331,53 +267,29 @@ namespace op localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; void* deviceMem; - CUDA_TENSORRT_CHECK(cudaMalloc(&deviceMem, memSize)); + CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); if (deviceMem == nullptr) { std::cerr << "Out of memory" << std::endl; exit(1); } - CUDA_TENSORRT_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); buffers[1] = deviceMem; - //spOutputBlob->set_gpu_data((float*)deviceMem); - - - //createMemory(*cudaEngine, buffers, std::string("net_output")); - - - std::cout << "Forward Pass : memory created" << std::endl; - - cudaStream_t stream; - CUDA_TENSORRT_CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; - CUDA_TENSORRT_CHECK(cudaEventCreate(&start)); - CUDA_TENSORRT_CHECK(cudaEventCreate(&end)); - - - std::cout << "Forward Pass : executing inference" << std::endl; - - //spInputBlob->Update(); - context->execute(batchSize, &buffers[0]); - //spOutputBlob->Update(); - spOutputBlob->set_gpu_data((float*)deviceMem); - //CUDA_TENSORRT_CHECK(cudaMemcpy(localMem, buffers[1], memSize, cudaMemcpyDeviceToHost)); - //spOutputBlob->set_cpu_data((float*)localMem); - delete[] localMem; - std::cout << "Forward Pass : inference done !" 
<< std::endl; - - - - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); + std::cout << "Forward Pass : memory created" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } - // Old Perform deep network forward pass - //upTensorRTNet->ForwardFrom(0); - //cudaCheck(__LINE__, __FUNCTION__, __FILE__); + std::cout << "Forward Pass : executing inference" << std::endl; + + context->execute(batchSize, &buffers[0]); + + spOutputBlob->set_gpu_data((float*)deviceMem); + + std::cout << "Forward Pass : inference done !" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) { From 5c630b52ec7d3eab879299db03a90bd00cbed7a6 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:19:37 +0200 Subject: [PATCH 23/52] NetTensorRT cleaning bis. --- src/openpose/core/netTensorRT.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 8c6a4fc48..3c72003a2 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -92,17 +92,12 @@ namespace op gInputs.push_back(network->getInput(i)->getName()); gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; if( i > 0) std::err << "Multiple output unsupported for now!" << std:endl; } - // specify which tensors are outputs - - - // TODO, if it works switch to something more generic, add as parameter etc - std::string s("net_output"); - if (blobNameToTensor->find(s.c_str()) == nullptr) + // Specify which tensor is output (multiple unsupported) + if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) { std::cout << "could not find output blob " << s << std::endl; return nullptr; @@ -114,6 +109,7 @@ namespace op { DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; } // Build the engine From a61758319d33f30af1b4be6d6961f8538162cbb1 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 20:00:08 +0000 Subject: [PATCH 24/52] Cleaning compilation fix. 
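
Most of the breakage was out-of-class member-definition syntax: the class
qualifier belongs on the function name, not on the return type, and the CUDA
stream/events have to be members to be reachable from the destructor. A minimal
illustration (hypothetical `Net` standing in for NetTensorRT):

    #include "NvInfer.h"   // for nvinfer1::ICudaEngine
    struct Net
    {
        nvinfer1::ICudaEngine* caffeToGIEModel();   // declared inside the class
    };
    // Wrong (as in the previous patch): "NetTensorRT::ICudaEngine* caffeToGIEModel()"
    // scopes the return type and leaves the function a free function.
    // Right: scope the function name instead.
    nvinfer1::ICudaEngine* Net::caffeToGIEModel() { return nullptr; /* stub body */ }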
--- include/openpose/core/netTensorRT.hpp | 6 +++-- src/openpose/core/netTensorRT.cpp | 36 +++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 41df6141a..00e176ab0 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -47,8 +47,10 @@ namespace op // TensorRT stuff nvinfer1::ICudaEngine* cudaEngine; nvinfer1::IExecutionContext* cudaContext; - ICudaEngine* caffeToGIEModel(); - ICudaEngine* createEngine(); + nvinfer1::ICudaEngine* caffeToGIEModel(); + nvinfer1::ICudaEngine* createEngine(); + cudaStream_t stream; + cudaEvent_t start, end; DELETE_COPY(NetTensorRT); }; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 3c72003a2..79892b1ff 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -51,9 +51,8 @@ namespace op mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} { - cudaStream_t stream; + std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; CUDA_CHECK(cudaStreamCreate(&stream)); - cudaEvent_t start, end; CUDA_CHECK(cudaEventCreate(&start)); CUDA_CHECK(cudaEventCreate(&end)); } @@ -69,7 +68,7 @@ namespace op } - NetTensorRT::ICudaEngine* caffeToGIEModel() + ICudaEngine* NetTensorRT::caffeToGIEModel() { // create the builder IBuilder* builder = createInferBuilder(gLogger); @@ -93,16 +92,16 @@ namespace op gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; if( i > 0) - std::err << "Multiple output unsupported for now!" << std:endl; + std::cerr << "Multiple output unsupported for now!"; } // Specify which tensor is output (multiple unsupported) if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) { - std::cout << "could not find output blob " << s << std::endl; + std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; return nullptr; } - network->markOutput(*blobNameToTensor->find(s.c_str())); + network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); for (int i = 0, n = network->getNbOutputs(); i < n; i++) @@ -132,11 +131,11 @@ namespace op } - NetTensorRT::ICudaEngine* createEngine() + ICudaEngine* NetTensorRT::createEngine() { ICudaEngine *engine; - engine = caffeToGIEModel(caffeProto, caffeTrainedModel); + engine = caffeToGIEModel(); if (!engine) { std::cerr << "Engine could not be created" << std::endl; @@ -174,7 +173,7 @@ namespace op std::cout << "InitializationOnThread : creating engine" << std::endl; - cudaEngine = createEngine(mCaffeProto, mCaffeTrainedModel); + cudaEngine = createEngine(); if (!cudaEngine) { std::cerr << "cudaEngine could not be created" << std::endl; @@ -238,6 +237,7 @@ namespace op std::cout << "Forward Pass : start" << std::endl; try { + const int batchSize = 1; // Copy frame data to GPU memory if (inputData != nullptr) { @@ -253,10 +253,7 @@ namespace op buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); - //createMemory(*cudaEngine, buffers, gInputs[i]); - - const int batchSize = 1; - size_t eltCount = 1*57*46*82*batchSize, memSize = eltCount * sizeof(float); + size_t eltCount = mNetOutputSize4D[0]*mNetOutputSize4D[1]*mNetOutputSize4D[2]*mNetOutputSize4D[3]*batchSize, memSize = eltCount * sizeof(float); float* localMem = new float[eltCount]; for 
(size_t i = 0; i < eltCount; i++) @@ -277,15 +274,16 @@ namespace op std::cout << "Forward Pass : memory created" << std::endl; cudaCheck(__LINE__, __FUNCTION__, __FILE__); - } - std::cout << "Forward Pass : executing inference" << std::endl; - context->execute(batchSize, &buffers[0]); + std::cout << "Forward Pass : executing inference" << std::endl; - spOutputBlob->set_gpu_data((float*)deviceMem); + cudaContext->execute(batchSize, &buffers[0]); - std::cout << "Forward Pass : inference done !" << std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + spOutputBlob->set_gpu_data((float*)deviceMem); + + std::cout << "Forward Pass : inference done !" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + } } catch (const std::exception& e) { From d3a31e05c0b73d2ca785dfb6c88cb9fc335a7c7d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 20:46:20 +0000 Subject: [PATCH 25/52] caffeToGIE needs fixed input size and cannot be determined at runtime for now. --- models/pose/coco/pose_deploy_linevec.prototxt | 4 +- .../coco/pose_deploy_linevec.prototxt_368x656 | 2976 +++++++++++++++++ src/openpose/core/netTensorRT.cpp | 2 +- 3 files changed, 2979 insertions(+), 3 deletions(-) create mode 100755 models/pose/coco/pose_deploy_linevec.prototxt_368x656 diff --git a/models/pose/coco/pose_deploy_linevec.prototxt b/models/pose/coco/pose_deploy_linevec.prototxt index c310c8785..fbe0c8245 100755 --- a/models/pose/coco/pose_deploy_linevec.prototxt +++ b/models/pose/coco/pose_deploy_linevec.prototxt @@ -1,8 +1,8 @@ input: "image" input_dim: 1 input_dim: 3 -input_dim: 368 # This value will be defined at runtime -input_dim: 656 # This value will be defined at runtime +input_dim: 1 # This value will be defined at runtime +input_dim: 1 # This value will be defined at runtime layer { name: "conv1_1" type: "Convolution" diff --git a/models/pose/coco/pose_deploy_linevec.prototxt_368x656 b/models/pose/coco/pose_deploy_linevec.prototxt_368x656 new file mode 100755 index 000000000..c310c8785 --- /dev/null +++ b/models/pose/coco/pose_deploy_linevec.prototxt_368x656 @@ -0,0 +1,2976 @@ +input: "image" +input_dim: 1 +input_dim: 3 +input_dim: 368 # This value will be defined at runtime +input_dim: 656 # This value will be defined at runtime +layer { + name: "conv1_1" + type: "Convolution" + bottom: "image" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1_stage1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1_stage1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: 
"gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2_stage1" + type: "Pooling" + bottom: "conv2_2" + top: "pool2_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2_stage1" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "conv3_4" + type: "Convolution" + bottom: "conv3_3" + top: "conv3_4" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_4" + type: "ReLU" + bottom: "conv3_4" + top: "conv3_4" +} +layer { + name: "pool3_stage1" + type: "Pooling" + bottom: "conv3_4" + top: "pool3_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3_stage1" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3_CPM" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3_CPM" + param { + 
lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_3_CPM" + type: "ReLU" + bottom: "conv4_3_CPM" + top: "conv4_3_CPM" +} +layer { + name: "conv4_4_CPM" + type: "Convolution" + bottom: "conv4_3_CPM" + top: "conv4_4_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_4_CPM" + type: "ReLU" + bottom: "conv4_4_CPM" + top: "conv4_4_CPM" +} +layer { + name: "conv5_1_CPM_L1" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L1" + type: "ReLU" + bottom: "conv5_1_CPM_L1" + top: "conv5_1_CPM_L1" +} +layer { + name: "conv5_1_CPM_L2" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L2" + type: "ReLU" + bottom: "conv5_1_CPM_L2" + top: "conv5_1_CPM_L2" +} +layer { + name: "conv5_2_CPM_L1" + type: "Convolution" + bottom: "conv5_1_CPM_L1" + top: "conv5_2_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L1" + type: "ReLU" + bottom: "conv5_2_CPM_L1" + top: "conv5_2_CPM_L1" +} +layer { + name: "conv5_2_CPM_L2" + type: "Convolution" + bottom: "conv5_1_CPM_L2" + top: "conv5_2_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L2" + type: "ReLU" + bottom: "conv5_2_CPM_L2" + top: "conv5_2_CPM_L2" +} +layer { + name: "conv5_3_CPM_L1" + type: "Convolution" + bottom: "conv5_2_CPM_L1" + top: "conv5_3_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L1" + type: "ReLU" + bottom: "conv5_3_CPM_L1" + top: "conv5_3_CPM_L1" +} +layer { + name: "conv5_3_CPM_L2" + type: "Convolution" + bottom: "conv5_2_CPM_L2" + top: "conv5_3_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L2" + type: "ReLU" + bottom: "conv5_3_CPM_L2" + top: 
"conv5_3_CPM_L2" +} +layer { + name: "conv5_4_CPM_L1" + type: "Convolution" + bottom: "conv5_3_CPM_L1" + top: "conv5_4_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L1" + type: "ReLU" + bottom: "conv5_4_CPM_L1" + top: "conv5_4_CPM_L1" +} +layer { + name: "conv5_4_CPM_L2" + type: "Convolution" + bottom: "conv5_3_CPM_L2" + top: "conv5_4_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L2" + type: "ReLU" + bottom: "conv5_4_CPM_L2" + top: "conv5_4_CPM_L2" +} +layer { + name: "conv5_5_CPM_L1" + type: "Convolution" + bottom: "conv5_4_CPM_L1" + top: "conv5_5_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv5_5_CPM_L2" + type: "Convolution" + bottom: "conv5_4_CPM_L2" + top: "conv5_5_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage2" + type: "Concat" + bottom: "conv5_5_CPM_L1" + bottom: "conv5_5_CPM_L2" + bottom: "conv4_4_CPM" + top: "concat_stage2" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage2_L1" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L1" + type: "ReLU" + bottom: "Mconv1_stage2_L1" + top: "Mconv1_stage2_L1" +} +layer { + name: "Mconv1_stage2_L2" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L2" + type: "ReLU" + bottom: "Mconv1_stage2_L2" + top: "Mconv1_stage2_L2" +} +layer { + name: "Mconv2_stage2_L1" + type: "Convolution" + bottom: "Mconv1_stage2_L1" + top: "Mconv2_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L1" + type: "ReLU" + bottom: "Mconv2_stage2_L1" + top: "Mconv2_stage2_L1" +} +layer { + name: "Mconv2_stage2_L2" + type: "Convolution" + bottom: "Mconv1_stage2_L2" + top: "Mconv2_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + 
weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L2" + type: "ReLU" + bottom: "Mconv2_stage2_L2" + top: "Mconv2_stage2_L2" +} +layer { + name: "Mconv3_stage2_L1" + type: "Convolution" + bottom: "Mconv2_stage2_L1" + top: "Mconv3_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L1" + type: "ReLU" + bottom: "Mconv3_stage2_L1" + top: "Mconv3_stage2_L1" +} +layer { + name: "Mconv3_stage2_L2" + type: "Convolution" + bottom: "Mconv2_stage2_L2" + top: "Mconv3_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L2" + type: "ReLU" + bottom: "Mconv3_stage2_L2" + top: "Mconv3_stage2_L2" +} +layer { + name: "Mconv4_stage2_L1" + type: "Convolution" + bottom: "Mconv3_stage2_L1" + top: "Mconv4_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L1" + type: "ReLU" + bottom: "Mconv4_stage2_L1" + top: "Mconv4_stage2_L1" +} +layer { + name: "Mconv4_stage2_L2" + type: "Convolution" + bottom: "Mconv3_stage2_L2" + top: "Mconv4_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L2" + type: "ReLU" + bottom: "Mconv4_stage2_L2" + top: "Mconv4_stage2_L2" +} +layer { + name: "Mconv5_stage2_L1" + type: "Convolution" + bottom: "Mconv4_stage2_L1" + top: "Mconv5_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L1" + type: "ReLU" + bottom: "Mconv5_stage2_L1" + top: "Mconv5_stage2_L1" +} +layer { + name: "Mconv5_stage2_L2" + type: "Convolution" + bottom: "Mconv4_stage2_L2" + top: "Mconv5_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L2" + type: "ReLU" + bottom: "Mconv5_stage2_L2" + top: "Mconv5_stage2_L2" +} +layer { + name: "Mconv6_stage2_L1" + type: "Convolution" + bottom: "Mconv5_stage2_L1" + top: "Mconv6_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L1" + type: "ReLU" + bottom: "Mconv6_stage2_L1" + top: "Mconv6_stage2_L1" +} +layer { 
+ name: "Mconv6_stage2_L2" + type: "Convolution" + bottom: "Mconv5_stage2_L2" + top: "Mconv6_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L2" + type: "ReLU" + bottom: "Mconv6_stage2_L2" + top: "Mconv6_stage2_L2" +} +layer { + name: "Mconv7_stage2_L1" + type: "Convolution" + bottom: "Mconv6_stage2_L1" + top: "Mconv7_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage2_L2" + type: "Convolution" + bottom: "Mconv6_stage2_L2" + top: "Mconv7_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage3" + type: "Concat" + bottom: "Mconv7_stage2_L1" + bottom: "Mconv7_stage2_L2" + bottom: "conv4_4_CPM" + top: "concat_stage3" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage3_L1" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L1" + type: "ReLU" + bottom: "Mconv1_stage3_L1" + top: "Mconv1_stage3_L1" +} +layer { + name: "Mconv1_stage3_L2" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L2" + type: "ReLU" + bottom: "Mconv1_stage3_L2" + top: "Mconv1_stage3_L2" +} +layer { + name: "Mconv2_stage3_L1" + type: "Convolution" + bottom: "Mconv1_stage3_L1" + top: "Mconv2_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L1" + type: "ReLU" + bottom: "Mconv2_stage3_L1" + top: "Mconv2_stage3_L1" +} +layer { + name: "Mconv2_stage3_L2" + type: "Convolution" + bottom: "Mconv1_stage3_L2" + top: "Mconv2_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L2" + type: "ReLU" + bottom: "Mconv2_stage3_L2" + top: "Mconv2_stage3_L2" +} +layer { + name: "Mconv3_stage3_L1" + type: "Convolution" + bottom: "Mconv2_stage3_L1" + top: "Mconv3_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + 
kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L1" + type: "ReLU" + bottom: "Mconv3_stage3_L1" + top: "Mconv3_stage3_L1" +} +layer { + name: "Mconv3_stage3_L2" + type: "Convolution" + bottom: "Mconv2_stage3_L2" + top: "Mconv3_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L2" + type: "ReLU" + bottom: "Mconv3_stage3_L2" + top: "Mconv3_stage3_L2" +} +layer { + name: "Mconv4_stage3_L1" + type: "Convolution" + bottom: "Mconv3_stage3_L1" + top: "Mconv4_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L1" + type: "ReLU" + bottom: "Mconv4_stage3_L1" + top: "Mconv4_stage3_L1" +} +layer { + name: "Mconv4_stage3_L2" + type: "Convolution" + bottom: "Mconv3_stage3_L2" + top: "Mconv4_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L2" + type: "ReLU" + bottom: "Mconv4_stage3_L2" + top: "Mconv4_stage3_L2" +} +layer { + name: "Mconv5_stage3_L1" + type: "Convolution" + bottom: "Mconv4_stage3_L1" + top: "Mconv5_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L1" + type: "ReLU" + bottom: "Mconv5_stage3_L1" + top: "Mconv5_stage3_L1" +} +layer { + name: "Mconv5_stage3_L2" + type: "Convolution" + bottom: "Mconv4_stage3_L2" + top: "Mconv5_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L2" + type: "ReLU" + bottom: "Mconv5_stage3_L2" + top: "Mconv5_stage3_L2" +} +layer { + name: "Mconv6_stage3_L1" + type: "Convolution" + bottom: "Mconv5_stage3_L1" + top: "Mconv6_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L1" + type: "ReLU" + bottom: "Mconv6_stage3_L1" + top: "Mconv6_stage3_L1" +} +layer { + name: "Mconv6_stage3_L2" + type: "Convolution" + bottom: "Mconv5_stage3_L2" + top: "Mconv6_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L2" + type: "ReLU" + bottom: "Mconv6_stage3_L2" + top: 
"Mconv6_stage3_L2" +} +layer { + name: "Mconv7_stage3_L1" + type: "Convolution" + bottom: "Mconv6_stage3_L1" + top: "Mconv7_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage3_L2" + type: "Convolution" + bottom: "Mconv6_stage3_L2" + top: "Mconv7_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage4" + type: "Concat" + bottom: "Mconv7_stage3_L1" + bottom: "Mconv7_stage3_L2" + bottom: "conv4_4_CPM" + top: "concat_stage4" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage4_L1" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L1" + type: "ReLU" + bottom: "Mconv1_stage4_L1" + top: "Mconv1_stage4_L1" +} +layer { + name: "Mconv1_stage4_L2" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L2" + type: "ReLU" + bottom: "Mconv1_stage4_L2" + top: "Mconv1_stage4_L2" +} +layer { + name: "Mconv2_stage4_L1" + type: "Convolution" + bottom: "Mconv1_stage4_L1" + top: "Mconv2_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L1" + type: "ReLU" + bottom: "Mconv2_stage4_L1" + top: "Mconv2_stage4_L1" +} +layer { + name: "Mconv2_stage4_L2" + type: "Convolution" + bottom: "Mconv1_stage4_L2" + top: "Mconv2_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L2" + type: "ReLU" + bottom: "Mconv2_stage4_L2" + top: "Mconv2_stage4_L2" +} +layer { + name: "Mconv3_stage4_L1" + type: "Convolution" + bottom: "Mconv2_stage4_L1" + top: "Mconv3_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L1" + type: "ReLU" + bottom: "Mconv3_stage4_L1" + top: "Mconv3_stage4_L1" +} +layer { + name: "Mconv3_stage4_L2" + type: "Convolution" + bottom: "Mconv2_stage4_L2" + top: "Mconv3_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + 
num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L2" + type: "ReLU" + bottom: "Mconv3_stage4_L2" + top: "Mconv3_stage4_L2" +} +layer { + name: "Mconv4_stage4_L1" + type: "Convolution" + bottom: "Mconv3_stage4_L1" + top: "Mconv4_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L1" + type: "ReLU" + bottom: "Mconv4_stage4_L1" + top: "Mconv4_stage4_L1" +} +layer { + name: "Mconv4_stage4_L2" + type: "Convolution" + bottom: "Mconv3_stage4_L2" + top: "Mconv4_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L2" + type: "ReLU" + bottom: "Mconv4_stage4_L2" + top: "Mconv4_stage4_L2" +} +layer { + name: "Mconv5_stage4_L1" + type: "Convolution" + bottom: "Mconv4_stage4_L1" + top: "Mconv5_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L1" + type: "ReLU" + bottom: "Mconv5_stage4_L1" + top: "Mconv5_stage4_L1" +} +layer { + name: "Mconv5_stage4_L2" + type: "Convolution" + bottom: "Mconv4_stage4_L2" + top: "Mconv5_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L2" + type: "ReLU" + bottom: "Mconv5_stage4_L2" + top: "Mconv5_stage4_L2" +} +layer { + name: "Mconv6_stage4_L1" + type: "Convolution" + bottom: "Mconv5_stage4_L1" + top: "Mconv6_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L1" + type: "ReLU" + bottom: "Mconv6_stage4_L1" + top: "Mconv6_stage4_L1" +} +layer { + name: "Mconv6_stage4_L2" + type: "Convolution" + bottom: "Mconv5_stage4_L2" + top: "Mconv6_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L2" + type: "ReLU" + bottom: "Mconv6_stage4_L2" + top: "Mconv6_stage4_L2" +} +layer { + name: "Mconv7_stage4_L1" + type: "Convolution" + bottom: "Mconv6_stage4_L1" + top: "Mconv7_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage4_L2" + type: "Convolution" + bottom: 
"Mconv6_stage4_L2" + top: "Mconv7_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage5" + type: "Concat" + bottom: "Mconv7_stage4_L1" + bottom: "Mconv7_stage4_L2" + bottom: "conv4_4_CPM" + top: "concat_stage5" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage5_L1" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L1" + type: "ReLU" + bottom: "Mconv1_stage5_L1" + top: "Mconv1_stage5_L1" +} +layer { + name: "Mconv1_stage5_L2" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L2" + type: "ReLU" + bottom: "Mconv1_stage5_L2" + top: "Mconv1_stage5_L2" +} +layer { + name: "Mconv2_stage5_L1" + type: "Convolution" + bottom: "Mconv1_stage5_L1" + top: "Mconv2_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L1" + type: "ReLU" + bottom: "Mconv2_stage5_L1" + top: "Mconv2_stage5_L1" +} +layer { + name: "Mconv2_stage5_L2" + type: "Convolution" + bottom: "Mconv1_stage5_L2" + top: "Mconv2_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L2" + type: "ReLU" + bottom: "Mconv2_stage5_L2" + top: "Mconv2_stage5_L2" +} +layer { + name: "Mconv3_stage5_L1" + type: "Convolution" + bottom: "Mconv2_stage5_L1" + top: "Mconv3_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L1" + type: "ReLU" + bottom: "Mconv3_stage5_L1" + top: "Mconv3_stage5_L1" +} +layer { + name: "Mconv3_stage5_L2" + type: "Convolution" + bottom: "Mconv2_stage5_L2" + top: "Mconv3_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L2" + type: "ReLU" + bottom: "Mconv3_stage5_L2" + top: "Mconv3_stage5_L2" +} +layer { + name: "Mconv4_stage5_L1" + type: "Convolution" + bottom: "Mconv3_stage5_L1" + top: "Mconv4_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + 
convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L1" + type: "ReLU" + bottom: "Mconv4_stage5_L1" + top: "Mconv4_stage5_L1" +} +layer { + name: "Mconv4_stage5_L2" + type: "Convolution" + bottom: "Mconv3_stage5_L2" + top: "Mconv4_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L2" + type: "ReLU" + bottom: "Mconv4_stage5_L2" + top: "Mconv4_stage5_L2" +} +layer { + name: "Mconv5_stage5_L1" + type: "Convolution" + bottom: "Mconv4_stage5_L1" + top: "Mconv5_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L1" + type: "ReLU" + bottom: "Mconv5_stage5_L1" + top: "Mconv5_stage5_L1" +} +layer { + name: "Mconv5_stage5_L2" + type: "Convolution" + bottom: "Mconv4_stage5_L2" + top: "Mconv5_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L2" + type: "ReLU" + bottom: "Mconv5_stage5_L2" + top: "Mconv5_stage5_L2" +} +layer { + name: "Mconv6_stage5_L1" + type: "Convolution" + bottom: "Mconv5_stage5_L1" + top: "Mconv6_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L1" + type: "ReLU" + bottom: "Mconv6_stage5_L1" + top: "Mconv6_stage5_L1" +} +layer { + name: "Mconv6_stage5_L2" + type: "Convolution" + bottom: "Mconv5_stage5_L2" + top: "Mconv6_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L2" + type: "ReLU" + bottom: "Mconv6_stage5_L2" + top: "Mconv6_stage5_L2" +} +layer { + name: "Mconv7_stage5_L1" + type: "Convolution" + bottom: "Mconv6_stage5_L1" + top: "Mconv7_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage5_L2" + type: "Convolution" + bottom: "Mconv6_stage5_L2" + top: "Mconv7_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage6" + type: "Concat" + bottom: "Mconv7_stage5_L1" + bottom: "Mconv7_stage5_L2" + bottom: "conv4_4_CPM" + top: "concat_stage6" + 
concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage6_L1" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L1" + type: "ReLU" + bottom: "Mconv1_stage6_L1" + top: "Mconv1_stage6_L1" +} +layer { + name: "Mconv1_stage6_L2" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L2" + type: "ReLU" + bottom: "Mconv1_stage6_L2" + top: "Mconv1_stage6_L2" +} +layer { + name: "Mconv2_stage6_L1" + type: "Convolution" + bottom: "Mconv1_stage6_L1" + top: "Mconv2_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L1" + type: "ReLU" + bottom: "Mconv2_stage6_L1" + top: "Mconv2_stage6_L1" +} +layer { + name: "Mconv2_stage6_L2" + type: "Convolution" + bottom: "Mconv1_stage6_L2" + top: "Mconv2_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L2" + type: "ReLU" + bottom: "Mconv2_stage6_L2" + top: "Mconv2_stage6_L2" +} +layer { + name: "Mconv3_stage6_L1" + type: "Convolution" + bottom: "Mconv2_stage6_L1" + top: "Mconv3_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L1" + type: "ReLU" + bottom: "Mconv3_stage6_L1" + top: "Mconv3_stage6_L1" +} +layer { + name: "Mconv3_stage6_L2" + type: "Convolution" + bottom: "Mconv2_stage6_L2" + top: "Mconv3_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L2" + type: "ReLU" + bottom: "Mconv3_stage6_L2" + top: "Mconv3_stage6_L2" +} +layer { + name: "Mconv4_stage6_L1" + type: "Convolution" + bottom: "Mconv3_stage6_L1" + top: "Mconv4_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L1" + type: "ReLU" + bottom: "Mconv4_stage6_L1" + top: "Mconv4_stage6_L1" +} +layer { + name: "Mconv4_stage6_L2" + type: "Convolution" + bottom: "Mconv3_stage6_L2" + top: "Mconv4_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + 
decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L2" + type: "ReLU" + bottom: "Mconv4_stage6_L2" + top: "Mconv4_stage6_L2" +} +layer { + name: "Mconv5_stage6_L1" + type: "Convolution" + bottom: "Mconv4_stage6_L1" + top: "Mconv5_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L1" + type: "ReLU" + bottom: "Mconv5_stage6_L1" + top: "Mconv5_stage6_L1" +} +layer { + name: "Mconv5_stage6_L2" + type: "Convolution" + bottom: "Mconv4_stage6_L2" + top: "Mconv5_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L2" + type: "ReLU" + bottom: "Mconv5_stage6_L2" + top: "Mconv5_stage6_L2" +} +layer { + name: "Mconv6_stage6_L1" + type: "Convolution" + bottom: "Mconv5_stage6_L1" + top: "Mconv6_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L1" + type: "ReLU" + bottom: "Mconv6_stage6_L1" + top: "Mconv6_stage6_L1" +} +layer { + name: "Mconv6_stage6_L2" + type: "Convolution" + bottom: "Mconv5_stage6_L2" + top: "Mconv6_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L2" + type: "ReLU" + bottom: "Mconv6_stage6_L2" + top: "Mconv6_stage6_L2" +} +layer { + name: "Mconv7_stage6_L1" + type: "Convolution" + bottom: "Mconv6_stage6_L1" + top: "Mconv7_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage6_L2" + type: "Convolution" + bottom: "Mconv6_stage6_L2" + top: "Mconv7_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage7" + type: "Concat" + bottom: "Mconv7_stage6_L2" + bottom: "Mconv7_stage6_L1" + # top: "concat_stage7" + top: "net_output" + concat_param { + axis: 1 + } +} diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 79892b1ff..4ab8ab62d 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -47,7 +47,7 @@ namespace op // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++ mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]}, 
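+ // Note: suffixing the proto path with the network input size (below) lets several
+ // per-resolution prototxt variants coexist; the next patch reuses the same name to
+ // key the serialized TensorRT engine cache.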
mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies<int>()) * sizeof(float)}, - mCaffeProto{caffeProto}, + mCaffeProto{caffeProto + "_" + std::to_string(mNetInputSize4D[2]) + "x" + std::to_string(mNetInputSize4D[3])}, mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} {
From f6df326db6ad480f20899e8631628f54d9fb1542 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:14:13 +0000 Subject: [PATCH 26/52] Engine serialization and deserialization. --- src/openpose/core/netTensorRT.cpp | 77 ++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 23 deletions(-)
diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 4ab8ab62d..01c659872 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -108,7 +108,6 @@ namespace op { DimsCHW dims = static_cast<DimsCHW&&>(network->getOutput(i)->getDimensions()); std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - mNetOutputSize4D = { 1, dims.c(), dims.h(), dims.w() }; } // Build the engine @@ -130,32 +129,62 @@ namespace op return engine; } + inline bool file_exists(const std::string& file_path) { + struct stat buffer; + return (stat(file_path.c_str(), &buffer) == 0); + } ICudaEngine* NetTensorRT::createEngine() { ICudaEngine *engine; - engine = caffeToGIEModel(); - if (!engine) + std::string serializedEnginePath = mCaffeProto + ".bin"; + + std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl;
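+ // Assumption: file_exists() relies on stat(2) and so needs <sys/stat.h> among this
+ // file's includes; the ".bin" plan sits next to the (already resolution-suffixed) proto.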
+ if (file_exists(serializedEnginePath)) { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; + std::cout << "Found serialized TensorRT engine, deserializing..." << std::endl; + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(serializedEnginePath, std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + infer->destroy(); + if (gieModelStream) delete [] gieModelStream; + + return engine; + } + else + { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + else // serialize engine so later runs can skip the slow build + { + std::ofstream p(serializedEnginePath, std::ios::binary); + if (!p) + { + std::cerr << "could not serialize engine" << std::endl; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast<const char*>(ptr->data()), ptr->size()); + ptr->destroy(); + } }
- - /* TODO Serialize and load engines for given net size as optim quite long - if (!gParams.engine.empty()) - { - std::ofstream p(gParams.engine); - if (!p) - { - std::cerr << "could not open plan output file" << std::endl; - return nullptr; - } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast<const char*>(ptr->data()), ptr->size()); - ptr->destroy(); - }*/ return engine; }
@@ -188,15 +217,17 @@ namespace op std::cerr << "cudaContext could not be created" << std::endl; return; } - - std::cout << "InitializationOnThread : done" << std::endl; - + + DimsCHW outputDims = static_cast<DimsCHW&&>(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); + mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; + std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; spInputBlob = boost::make_shared<caffe::Blob<float>>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); spOutputBlob = boost::make_shared<caffe::Blob<float>>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); + std::cout << "InitializationOnThread : done" << std::endl; cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e)
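For readers following the series: the createEngine() change above amounts to a build-once cache for TensorRT plans, keyed by the per-resolution prototxt name. The sketch below restates that scheme in isolation; it assumes the TensorRT 2/3-era API this series targets (createInferRuntime, IRuntime::deserializeCudaEngine, ICudaEngine::serialize), and buildEngine(), SketchLogger and getOrBuildEngine() are illustrative names, not functions from the patch.

    #include <cstdio>
    #include <fstream>
    #include <iterator>
    #include <string>
    #include <vector>
    #include <NvInfer.h>

    using namespace nvinfer1;

    // Stand-in for the Caffe-to-TensorRT conversion (caffeToGIEModel() above).
    ICudaEngine* buildEngine();

    // Minimal logger; a real implementation would forward to op::log.
    struct SketchLogger : public ILogger
    {
        void log(Severity, const char* msg) override { std::fputs(msg, stderr); }
    };
    static SketchLogger gSketchLogger;

    // Return a deserialized engine when a cached plan exists at cachePath;
    // otherwise build one from the Caffe model and persist the plan.
    ICudaEngine* getOrBuildEngine(const std::string& cachePath)
    {
        std::ifstream cached{cachePath, std::ios::binary};
        if (cached.good())
        {
            // Cache hit: slurp the plan file and deserialize it (fast).
            const std::vector<char> blob{std::istreambuf_iterator<char>(cached),
                                         std::istreambuf_iterator<char>()};
            IRuntime* runtime = createInferRuntime(gSketchLogger);
            ICudaEngine* engine = runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
            runtime->destroy();
            return engine;
        }
        // Cache miss: build the engine (slow), then serialize it for next time.
        ICudaEngine* engine = buildEngine();
        if (engine != nullptr)
        {
            IHostMemory* plan = engine->serialize();
            std::ofstream out{cachePath, std::ios::binary};
            out.write(static_cast<const char*>(plan->data()), plan->size());
            plan->destroy();
        }
        return engine;
    }

The cache pays off because building (parsing the Caffe model, layer fusion, kernel autotuning) can take minutes, while deserializing a plan is near-instant; a serialized plan is specific to the GPU, the TensorRT version and the input resolution, so the cache file must be regenerated whenever any of these changes.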
From 404077a8b59f80e09ca3df2034e72ee69df3a5fb Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:44:37 +0000 Subject: [PATCH 27/52] Targeting highest possible FPS in demo. --- .../3_extract_from_image_TensorRT.cpp | 6 +- include/openpose/wrapper/wrapper.hpp | 2 +- .../coco/pose_deploy_linevec.prototxt_96x128 | 2976 +++++++++++++++++ 3 files changed, 2980 insertions(+), 4 deletions(-) create mode 100755 models/pose/coco/pose_deploy_linevec.prototxt_96x128
diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index f4e7eace1..adba661b0 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -31,9 +31,9 @@ DEFINE_string(image_path, "examples/media/COCO_val2014_00000000019 DEFINE_string(model_pose, "COCO", "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), " "`MPI_4_layers` (15 keypoints, even faster but less accurate)."); DEFINE_string(model_folder, "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located."); -DEFINE_string(net_resolution, "656x368", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased," +DEFINE_string(net_resolution, "128x96", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased," " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect" - " ratio possible to the images or videos to be processed. E.g. the default `656x368` is" + " ratio possible to the images or videos to be processed. E.g. `128x96` here trades accuracy for maximum speed; `656x368` is" " optimal for 16:9 videos, e.g. full HD (1920x1080) and HD (1280x720) videos."); DEFINE_string(resolution, "1280x720", "The image resolution (display and output). Use \"-1x-1\" to force the program to use the" " default images resolution."); @@ -85,7 +85,7 @@ int openPoseTutorialPose3() // outputSize const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720"); // netInputSize - const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "656x368"); + const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "128x96"); // netOutputSize const auto netOutputSize = netInputSize; // poseModel
diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp index c543a4717..eacbf385d 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -570,7 +570,7 @@ namespace op const Point<int>& poseNetOutputSize = wrapperStructPose.netInputSize; std::vector<std::shared_ptr<PoseExtractor>> poseExtractors; for (auto gpuId = 0; gpuId < gpuNumber; gpuId++) - poseExtractors.emplace_back(std::make_shared<PoseExtractorCaffe>( + poseExtractors.emplace_back(std::make_shared<PoseExtractorTensorRT>( wrapperStructPose.netInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, wrapperStructPose.modelFolder, gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
diff --git a/models/pose/coco/pose_deploy_linevec.prototxt_96x128 b/models/pose/coco/pose_deploy_linevec.prototxt_96x128 new file mode 100755 index 000000000..6e4322812 --- /dev/null +++ b/models/pose/coco/pose_deploy_linevec.prototxt_96x128 @@ -0,0 +1,2976 @@ +input: "image" +input_dim: 1 +input_dim: 3 +input_dim: 96 # This value will be defined at runtime +input_dim: 128 # This value will be defined at runtime +layer { + name: "conv1_1" + type: "Convolution" + bottom: "image" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1_stage1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1_stage1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { 
name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2_stage1" + type: "Pooling" + bottom: "conv2_2" + top: "pool2_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2_stage1" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "conv3_4" + type: "Convolution" + bottom: "conv3_3" + top: "conv3_4" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_4" + type: "ReLU" + bottom: "conv3_4" + top: "conv3_4" +} +layer { + name: "pool3_stage1" + type: "Pooling" + bottom: "conv3_4" + top: "pool3_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3_stage1" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3_CPM" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + 
convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_3_CPM" + type: "ReLU" + bottom: "conv4_3_CPM" + top: "conv4_3_CPM" +} +layer { + name: "conv4_4_CPM" + type: "Convolution" + bottom: "conv4_3_CPM" + top: "conv4_4_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_4_CPM" + type: "ReLU" + bottom: "conv4_4_CPM" + top: "conv4_4_CPM" +} +layer { + name: "conv5_1_CPM_L1" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L1" + type: "ReLU" + bottom: "conv5_1_CPM_L1" + top: "conv5_1_CPM_L1" +} +layer { + name: "conv5_1_CPM_L2" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L2" + type: "ReLU" + bottom: "conv5_1_CPM_L2" + top: "conv5_1_CPM_L2" +} +layer { + name: "conv5_2_CPM_L1" + type: "Convolution" + bottom: "conv5_1_CPM_L1" + top: "conv5_2_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L1" + type: "ReLU" + bottom: "conv5_2_CPM_L1" + top: "conv5_2_CPM_L1" +} +layer { + name: "conv5_2_CPM_L2" + type: "Convolution" + bottom: "conv5_1_CPM_L2" + top: "conv5_2_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L2" + type: "ReLU" + bottom: "conv5_2_CPM_L2" + top: "conv5_2_CPM_L2" +} +layer { + name: "conv5_3_CPM_L1" + type: "Convolution" + bottom: "conv5_2_CPM_L1" + top: "conv5_3_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L1" + type: "ReLU" + bottom: "conv5_3_CPM_L1" + top: "conv5_3_CPM_L1" +} +layer { + name: "conv5_3_CPM_L2" + type: "Convolution" + bottom: "conv5_2_CPM_L2" + top: "conv5_3_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L2" + type: "ReLU" + bottom: "conv5_3_CPM_L2" + top: "conv5_3_CPM_L2" +} +layer { + name: "conv5_4_CPM_L1" + type: "Convolution" + bottom: 
"conv5_3_CPM_L1" + top: "conv5_4_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L1" + type: "ReLU" + bottom: "conv5_4_CPM_L1" + top: "conv5_4_CPM_L1" +} +layer { + name: "conv5_4_CPM_L2" + type: "Convolution" + bottom: "conv5_3_CPM_L2" + top: "conv5_4_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L2" + type: "ReLU" + bottom: "conv5_4_CPM_L2" + top: "conv5_4_CPM_L2" +} +layer { + name: "conv5_5_CPM_L1" + type: "Convolution" + bottom: "conv5_4_CPM_L1" + top: "conv5_5_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv5_5_CPM_L2" + type: "Convolution" + bottom: "conv5_4_CPM_L2" + top: "conv5_5_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage2" + type: "Concat" + bottom: "conv5_5_CPM_L1" + bottom: "conv5_5_CPM_L2" + bottom: "conv4_4_CPM" + top: "concat_stage2" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage2_L1" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L1" + type: "ReLU" + bottom: "Mconv1_stage2_L1" + top: "Mconv1_stage2_L1" +} +layer { + name: "Mconv1_stage2_L2" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L2" + type: "ReLU" + bottom: "Mconv1_stage2_L2" + top: "Mconv1_stage2_L2" +} +layer { + name: "Mconv2_stage2_L1" + type: "Convolution" + bottom: "Mconv1_stage2_L1" + top: "Mconv2_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L1" + type: "ReLU" + bottom: "Mconv2_stage2_L1" + top: "Mconv2_stage2_L1" +} +layer { + name: "Mconv2_stage2_L2" + type: "Convolution" + bottom: "Mconv1_stage2_L2" + top: "Mconv2_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + 
} + } +} +layer { + name: "Mrelu2_stage2_L2" + type: "ReLU" + bottom: "Mconv2_stage2_L2" + top: "Mconv2_stage2_L2" +} +layer { + name: "Mconv3_stage2_L1" + type: "Convolution" + bottom: "Mconv2_stage2_L1" + top: "Mconv3_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L1" + type: "ReLU" + bottom: "Mconv3_stage2_L1" + top: "Mconv3_stage2_L1" +} +layer { + name: "Mconv3_stage2_L2" + type: "Convolution" + bottom: "Mconv2_stage2_L2" + top: "Mconv3_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L2" + type: "ReLU" + bottom: "Mconv3_stage2_L2" + top: "Mconv3_stage2_L2" +} +layer { + name: "Mconv4_stage2_L1" + type: "Convolution" + bottom: "Mconv3_stage2_L1" + top: "Mconv4_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L1" + type: "ReLU" + bottom: "Mconv4_stage2_L1" + top: "Mconv4_stage2_L1" +} +layer { + name: "Mconv4_stage2_L2" + type: "Convolution" + bottom: "Mconv3_stage2_L2" + top: "Mconv4_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L2" + type: "ReLU" + bottom: "Mconv4_stage2_L2" + top: "Mconv4_stage2_L2" +} +layer { + name: "Mconv5_stage2_L1" + type: "Convolution" + bottom: "Mconv4_stage2_L1" + top: "Mconv5_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L1" + type: "ReLU" + bottom: "Mconv5_stage2_L1" + top: "Mconv5_stage2_L1" +} +layer { + name: "Mconv5_stage2_L2" + type: "Convolution" + bottom: "Mconv4_stage2_L2" + top: "Mconv5_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L2" + type: "ReLU" + bottom: "Mconv5_stage2_L2" + top: "Mconv5_stage2_L2" +} +layer { + name: "Mconv6_stage2_L1" + type: "Convolution" + bottom: "Mconv5_stage2_L1" + top: "Mconv6_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L1" + type: "ReLU" + bottom: "Mconv6_stage2_L1" + top: "Mconv6_stage2_L1" +} +layer { + name: "Mconv6_stage2_L2" + type: "Convolution" + bottom: "Mconv5_stage2_L2" + top: 
"Mconv6_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L2" + type: "ReLU" + bottom: "Mconv6_stage2_L2" + top: "Mconv6_stage2_L2" +} +layer { + name: "Mconv7_stage2_L1" + type: "Convolution" + bottom: "Mconv6_stage2_L1" + top: "Mconv7_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage2_L2" + type: "Convolution" + bottom: "Mconv6_stage2_L2" + top: "Mconv7_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage3" + type: "Concat" + bottom: "Mconv7_stage2_L1" + bottom: "Mconv7_stage2_L2" + bottom: "conv4_4_CPM" + top: "concat_stage3" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage3_L1" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L1" + type: "ReLU" + bottom: "Mconv1_stage3_L1" + top: "Mconv1_stage3_L1" +} +layer { + name: "Mconv1_stage3_L2" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L2" + type: "ReLU" + bottom: "Mconv1_stage3_L2" + top: "Mconv1_stage3_L2" +} +layer { + name: "Mconv2_stage3_L1" + type: "Convolution" + bottom: "Mconv1_stage3_L1" + top: "Mconv2_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L1" + type: "ReLU" + bottom: "Mconv2_stage3_L1" + top: "Mconv2_stage3_L1" +} +layer { + name: "Mconv2_stage3_L2" + type: "Convolution" + bottom: "Mconv1_stage3_L2" + top: "Mconv2_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L2" + type: "ReLU" + bottom: "Mconv2_stage3_L2" + top: "Mconv2_stage3_L2" +} +layer { + name: "Mconv3_stage3_L1" + type: "Convolution" + bottom: "Mconv2_stage3_L1" + top: "Mconv3_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: 
"constant" + } + } +} +layer { + name: "Mrelu3_stage3_L1" + type: "ReLU" + bottom: "Mconv3_stage3_L1" + top: "Mconv3_stage3_L1" +} +layer { + name: "Mconv3_stage3_L2" + type: "Convolution" + bottom: "Mconv2_stage3_L2" + top: "Mconv3_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L2" + type: "ReLU" + bottom: "Mconv3_stage3_L2" + top: "Mconv3_stage3_L2" +} +layer { + name: "Mconv4_stage3_L1" + type: "Convolution" + bottom: "Mconv3_stage3_L1" + top: "Mconv4_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L1" + type: "ReLU" + bottom: "Mconv4_stage3_L1" + top: "Mconv4_stage3_L1" +} +layer { + name: "Mconv4_stage3_L2" + type: "Convolution" + bottom: "Mconv3_stage3_L2" + top: "Mconv4_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L2" + type: "ReLU" + bottom: "Mconv4_stage3_L2" + top: "Mconv4_stage3_L2" +} +layer { + name: "Mconv5_stage3_L1" + type: "Convolution" + bottom: "Mconv4_stage3_L1" + top: "Mconv5_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L1" + type: "ReLU" + bottom: "Mconv5_stage3_L1" + top: "Mconv5_stage3_L1" +} +layer { + name: "Mconv5_stage3_L2" + type: "Convolution" + bottom: "Mconv4_stage3_L2" + top: "Mconv5_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L2" + type: "ReLU" + bottom: "Mconv5_stage3_L2" + top: "Mconv5_stage3_L2" +} +layer { + name: "Mconv6_stage3_L1" + type: "Convolution" + bottom: "Mconv5_stage3_L1" + top: "Mconv6_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L1" + type: "ReLU" + bottom: "Mconv6_stage3_L1" + top: "Mconv6_stage3_L1" +} +layer { + name: "Mconv6_stage3_L2" + type: "Convolution" + bottom: "Mconv5_stage3_L2" + top: "Mconv6_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L2" + type: "ReLU" + bottom: "Mconv6_stage3_L2" + top: "Mconv6_stage3_L2" +} +layer { + name: "Mconv7_stage3_L1" + type: "Convolution" + bottom: 
"Mconv6_stage3_L1" + top: "Mconv7_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage3_L2" + type: "Convolution" + bottom: "Mconv6_stage3_L2" + top: "Mconv7_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage4" + type: "Concat" + bottom: "Mconv7_stage3_L1" + bottom: "Mconv7_stage3_L2" + bottom: "conv4_4_CPM" + top: "concat_stage4" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage4_L1" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L1" + type: "ReLU" + bottom: "Mconv1_stage4_L1" + top: "Mconv1_stage4_L1" +} +layer { + name: "Mconv1_stage4_L2" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L2" + type: "ReLU" + bottom: "Mconv1_stage4_L2" + top: "Mconv1_stage4_L2" +} +layer { + name: "Mconv2_stage4_L1" + type: "Convolution" + bottom: "Mconv1_stage4_L1" + top: "Mconv2_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L1" + type: "ReLU" + bottom: "Mconv2_stage4_L1" + top: "Mconv2_stage4_L1" +} +layer { + name: "Mconv2_stage4_L2" + type: "Convolution" + bottom: "Mconv1_stage4_L2" + top: "Mconv2_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L2" + type: "ReLU" + bottom: "Mconv2_stage4_L2" + top: "Mconv2_stage4_L2" +} +layer { + name: "Mconv3_stage4_L1" + type: "Convolution" + bottom: "Mconv2_stage4_L1" + top: "Mconv3_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L1" + type: "ReLU" + bottom: "Mconv3_stage4_L1" + top: "Mconv3_stage4_L1" +} +layer { + name: "Mconv3_stage4_L2" + type: "Convolution" + bottom: "Mconv2_stage4_L2" + top: "Mconv3_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + 
} + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L2" + type: "ReLU" + bottom: "Mconv3_stage4_L2" + top: "Mconv3_stage4_L2" +} +layer { + name: "Mconv4_stage4_L1" + type: "Convolution" + bottom: "Mconv3_stage4_L1" + top: "Mconv4_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L1" + type: "ReLU" + bottom: "Mconv4_stage4_L1" + top: "Mconv4_stage4_L1" +} +layer { + name: "Mconv4_stage4_L2" + type: "Convolution" + bottom: "Mconv3_stage4_L2" + top: "Mconv4_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L2" + type: "ReLU" + bottom: "Mconv4_stage4_L2" + top: "Mconv4_stage4_L2" +} +layer { + name: "Mconv5_stage4_L1" + type: "Convolution" + bottom: "Mconv4_stage4_L1" + top: "Mconv5_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L1" + type: "ReLU" + bottom: "Mconv5_stage4_L1" + top: "Mconv5_stage4_L1" +} +layer { + name: "Mconv5_stage4_L2" + type: "Convolution" + bottom: "Mconv4_stage4_L2" + top: "Mconv5_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L2" + type: "ReLU" + bottom: "Mconv5_stage4_L2" + top: "Mconv5_stage4_L2" +} +layer { + name: "Mconv6_stage4_L1" + type: "Convolution" + bottom: "Mconv5_stage4_L1" + top: "Mconv6_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L1" + type: "ReLU" + bottom: "Mconv6_stage4_L1" + top: "Mconv6_stage4_L1" +} +layer { + name: "Mconv6_stage4_L2" + type: "Convolution" + bottom: "Mconv5_stage4_L2" + top: "Mconv6_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L2" + type: "ReLU" + bottom: "Mconv6_stage4_L2" + top: "Mconv6_stage4_L2" +} +layer { + name: "Mconv7_stage4_L1" + type: "Convolution" + bottom: "Mconv6_stage4_L1" + top: "Mconv7_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage4_L2" + type: "Convolution" + bottom: "Mconv6_stage4_L2" + top: "Mconv7_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { 
+ lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage5" + type: "Concat" + bottom: "Mconv7_stage4_L1" + bottom: "Mconv7_stage4_L2" + bottom: "conv4_4_CPM" + top: "concat_stage5" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage5_L1" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L1" + type: "ReLU" + bottom: "Mconv1_stage5_L1" + top: "Mconv1_stage5_L1" +} +layer { + name: "Mconv1_stage5_L2" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L2" + type: "ReLU" + bottom: "Mconv1_stage5_L2" + top: "Mconv1_stage5_L2" +} +layer { + name: "Mconv2_stage5_L1" + type: "Convolution" + bottom: "Mconv1_stage5_L1" + top: "Mconv2_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L1" + type: "ReLU" + bottom: "Mconv2_stage5_L1" + top: "Mconv2_stage5_L1" +} +layer { + name: "Mconv2_stage5_L2" + type: "Convolution" + bottom: "Mconv1_stage5_L2" + top: "Mconv2_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L2" + type: "ReLU" + bottom: "Mconv2_stage5_L2" + top: "Mconv2_stage5_L2" +} +layer { + name: "Mconv3_stage5_L1" + type: "Convolution" + bottom: "Mconv2_stage5_L1" + top: "Mconv3_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L1" + type: "ReLU" + bottom: "Mconv3_stage5_L1" + top: "Mconv3_stage5_L1" +} +layer { + name: "Mconv3_stage5_L2" + type: "Convolution" + bottom: "Mconv2_stage5_L2" + top: "Mconv3_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L2" + type: "ReLU" + bottom: "Mconv3_stage5_L2" + top: "Mconv3_stage5_L2" +} +layer { + name: "Mconv4_stage5_L1" + type: "Convolution" + bottom: "Mconv3_stage5_L1" + top: "Mconv4_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + 
std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L1" + type: "ReLU" + bottom: "Mconv4_stage5_L1" + top: "Mconv4_stage5_L1" +} +layer { + name: "Mconv4_stage5_L2" + type: "Convolution" + bottom: "Mconv3_stage5_L2" + top: "Mconv4_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L2" + type: "ReLU" + bottom: "Mconv4_stage5_L2" + top: "Mconv4_stage5_L2" +} +layer { + name: "Mconv5_stage5_L1" + type: "Convolution" + bottom: "Mconv4_stage5_L1" + top: "Mconv5_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L1" + type: "ReLU" + bottom: "Mconv5_stage5_L1" + top: "Mconv5_stage5_L1" +} +layer { + name: "Mconv5_stage5_L2" + type: "Convolution" + bottom: "Mconv4_stage5_L2" + top: "Mconv5_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L2" + type: "ReLU" + bottom: "Mconv5_stage5_L2" + top: "Mconv5_stage5_L2" +} +layer { + name: "Mconv6_stage5_L1" + type: "Convolution" + bottom: "Mconv5_stage5_L1" + top: "Mconv6_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L1" + type: "ReLU" + bottom: "Mconv6_stage5_L1" + top: "Mconv6_stage5_L1" +} +layer { + name: "Mconv6_stage5_L2" + type: "Convolution" + bottom: "Mconv5_stage5_L2" + top: "Mconv6_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L2" + type: "ReLU" + bottom: "Mconv6_stage5_L2" + top: "Mconv6_stage5_L2" +} +layer { + name: "Mconv7_stage5_L1" + type: "Convolution" + bottom: "Mconv6_stage5_L1" + top: "Mconv7_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage5_L2" + type: "Convolution" + bottom: "Mconv6_stage5_L2" + top: "Mconv7_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage6" + type: "Concat" + bottom: "Mconv7_stage5_L1" + bottom: "Mconv7_stage5_L2" + bottom: "conv4_4_CPM" + top: "concat_stage6" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage6_L1" + type: "Convolution" + bottom: 
"concat_stage6" + top: "Mconv1_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L1" + type: "ReLU" + bottom: "Mconv1_stage6_L1" + top: "Mconv1_stage6_L1" +} +layer { + name: "Mconv1_stage6_L2" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L2" + type: "ReLU" + bottom: "Mconv1_stage6_L2" + top: "Mconv1_stage6_L2" +} +layer { + name: "Mconv2_stage6_L1" + type: "Convolution" + bottom: "Mconv1_stage6_L1" + top: "Mconv2_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L1" + type: "ReLU" + bottom: "Mconv2_stage6_L1" + top: "Mconv2_stage6_L1" +} +layer { + name: "Mconv2_stage6_L2" + type: "Convolution" + bottom: "Mconv1_stage6_L2" + top: "Mconv2_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L2" + type: "ReLU" + bottom: "Mconv2_stage6_L2" + top: "Mconv2_stage6_L2" +} +layer { + name: "Mconv3_stage6_L1" + type: "Convolution" + bottom: "Mconv2_stage6_L1" + top: "Mconv3_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L1" + type: "ReLU" + bottom: "Mconv3_stage6_L1" + top: "Mconv3_stage6_L1" +} +layer { + name: "Mconv3_stage6_L2" + type: "Convolution" + bottom: "Mconv2_stage6_L2" + top: "Mconv3_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L2" + type: "ReLU" + bottom: "Mconv3_stage6_L2" + top: "Mconv3_stage6_L2" +} +layer { + name: "Mconv4_stage6_L1" + type: "Convolution" + bottom: "Mconv3_stage6_L1" + top: "Mconv4_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L1" + type: "ReLU" + bottom: "Mconv4_stage6_L1" + top: "Mconv4_stage6_L1" +} +layer { + name: "Mconv4_stage6_L2" + type: "Convolution" + bottom: "Mconv3_stage6_L2" + top: "Mconv4_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + 
type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L2" + type: "ReLU" + bottom: "Mconv4_stage6_L2" + top: "Mconv4_stage6_L2" +} +layer { + name: "Mconv5_stage6_L1" + type: "Convolution" + bottom: "Mconv4_stage6_L1" + top: "Mconv5_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L1" + type: "ReLU" + bottom: "Mconv5_stage6_L1" + top: "Mconv5_stage6_L1" +} +layer { + name: "Mconv5_stage6_L2" + type: "Convolution" + bottom: "Mconv4_stage6_L2" + top: "Mconv5_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L2" + type: "ReLU" + bottom: "Mconv5_stage6_L2" + top: "Mconv5_stage6_L2" +} +layer { + name: "Mconv6_stage6_L1" + type: "Convolution" + bottom: "Mconv5_stage6_L1" + top: "Mconv6_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L1" + type: "ReLU" + bottom: "Mconv6_stage6_L1" + top: "Mconv6_stage6_L1" +} +layer { + name: "Mconv6_stage6_L2" + type: "Convolution" + bottom: "Mconv5_stage6_L2" + top: "Mconv6_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L2" + type: "ReLU" + bottom: "Mconv6_stage6_L2" + top: "Mconv6_stage6_L2" +} +layer { + name: "Mconv7_stage6_L1" + type: "Convolution" + bottom: "Mconv6_stage6_L1" + top: "Mconv7_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage6_L2" + type: "Convolution" + bottom: "Mconv6_stage6_L2" + top: "Mconv7_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage7" + type: "Concat" + bottom: "Mconv7_stage6_L2" + bottom: "Mconv7_stage6_L1" + # top: "concat_stage7" + top: "net_output" + concat_param { + axis: 1 + } +} From 1971baa4f3f14d04d639768ab92eefc791fbe76c Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 21:54:05 +0000 Subject: [PATCH 28/52] Asynchronous inference. 
--- src/openpose/core/netTensorRT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 01c659872..fd4a174d2 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -308,7 +308,7 @@ namespace op std::cout << "Forward Pass : executing inference" << std::endl; - cudaContext->execute(batchSize, &buffers[0]); + cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); spOutputBlob->set_gpu_data((float*)deviceMem); From 330d4bbf0dcedfc9b7a595c0d7af61cb1e428204 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 22:03:38 +0000 Subject: [PATCH 29/52] Way simpler inference code, a lot was useless. --- src/openpose/core/netTensorRT.cpp | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index fd4a174d2..bb02d5041 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -277,41 +277,14 @@ namespace op // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly one input and one output. - - std::cout << "Forward Pass : creating CUDA memory" << std::endl; - std::vector buffers(2); buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); - - size_t eltCount = mNetOutputSize4D[0]*mNetOutputSize4D[1]*mNetOutputSize4D[2]*mNetOutputSize4D[3]*batchSize, memSize = eltCount * sizeof(float); - - float* localMem = new float[eltCount]; - for (size_t i = 0; i < eltCount; i++) - localMem[i] = (float(rand()) / RAND_MAX) * 2 - 1; - - void* deviceMem; - CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); - if (deviceMem == nullptr) - { - std::cerr << "Out of memory" << std::endl; - exit(1); - } - CUDA_CHECK(cudaMemcpy(deviceMem, localMem, memSize, cudaMemcpyHostToDevice)); - - - buffers[1] = deviceMem; - delete[] localMem; - - std::cout << "Forward Pass : memory created" << std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); std::cout << "Forward Pass : executing inference" << std::endl; cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); - spOutputBlob->set_gpu_data((float*)deviceMem); - std::cout << "Forward Pass : inference done !" << std::endl; cudaCheck(__LINE__, __FUNCTION__, __FILE__); } From c2be9aa06170054108bcd159be30b14e6b148830 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 26 Sep 2017 23:11:23 +0000 Subject: [PATCH 30/52] Removing log to speedup inference. --- src/openpose/core/netTensorRT.cpp | 7 +------ src/openpose/pose/poseExtractorTensorRT.cpp | 8 -------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index bb02d5041..6d2c81293 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -264,8 +264,6 @@ namespace op void NetTensorRT::forwardPass(const float* const inputData) const { - - std::cout << "Forward Pass : start" << std::endl; try { const int batchSize = 1; @@ -281,12 +279,9 @@ namespace op buffers[0] = spInputBlob->mutable_gpu_data(); buffers[1] = spOutputBlob->mutable_gpu_data(); - std::cout << "Forward Pass : executing inference" << std::endl; - cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); - std::cout << "Forward Pass : inference done !" 
<< std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + //cudaCheck(__LINE__, __FUNCTION__, __FILE__); } } catch (const std::exception& e) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index a9a0abb35..a1b07f00e 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -83,17 +83,9 @@ namespace op if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - - - std::cout << "Forward Pass Pose: tensorrt forward pass" << std::endl; // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); - std::cout << "Forward Pass Pose: tensorrt passed !" << std::endl; - - // Replace spNet->forward pass, but how to propagate to next - // Replace spTensorRTNetOututBlob.get() ? - // 2. Resize heat maps + merge different scales spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); #ifndef CPU_ONLY From 89e3b443e8fb9330e4d6eb4ac8cbc72730974164 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 13:10:08 +0000 Subject: [PATCH 31/52] ResizeAndMergeBase CPU version. --- src/openpose/core/resizeAndMergeBase.cpp | 71 +++++++++--------------- 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/src/openpose/core/resizeAndMergeBase.cpp b/src/openpose/core/resizeAndMergeBase.cpp index ea324a251..3872e9614 100644 --- a/src/openpose/core/resizeAndMergeBase.cpp +++ b/src/openpose/core/resizeAndMergeBase.cpp @@ -1,6 +1,8 @@ -// #include +#include +#include #include + namespace op { template @@ -9,50 +11,29 @@ namespace op { try { - UNUSED(targetPtr); - UNUSED(sourcePtr); - UNUSED(scaleRatios); - UNUSED(targetSize); - UNUSED(sourceSize); - error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__); - - // TODO: THIS CODE IS WORKING, BUT IT DOES NOT CONSIDER THE SCALES (I.E. 
SCALE NUMBER, START AND GAP) - // const int num = bottom->shape(0); - // const int channel = bottom->shape(1); - // const int sourceHeight = bottom->shape(2); - // const int sourceWidth = bottom->shape(3); - // const int targetHeight = top->shape(2); - // const int targetWidth = top->shape(3); - - // //stupid method - // for (int n = 0; n < num; n++) - // { - // for (int c = 0; c < channel; c++) - // { - // //fill source - // cv::Mat source(sourceWidth, sourceHeight, CV_32FC1); - // const auto sourceOffsetChannel = sourceHeight * sourceWidth; - // const auto sourceOffsetNum = sourceOffsetChannel * channel; - // const auto sourceOffset = n*sourceOffsetNum + c*sourceOffsetChannel; - // const T* const sourcePtr = bottom->cpu_data(); - // for (int y = 0; y < sourceHeight; y++) - // for (int x = 0; x < sourceWidth; x++) - // source.at(x,y) = sourcePtr[sourceOffset + y*sourceWidth + x]; - - // // spatial resize - // cv::Mat target; - // cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC); - - // //fill top - // const auto targetOffsetChannel = targetHeight * targetWidth; - // const auto targetOffsetNum = targetOffsetChannel * channel; - // const auto targetOffset = n*targetOffsetNum + c*targetOffsetChannel; - // T* targetPtr = top->mutable_cpu_data(); - // for (int y = 0; y < targetHeight; y++) - // for (int x = 0; x < targetWidth; x++) - // targetPtr[targetOffset + y*targetWidth + x] = target.at(x,y); - // } - // } + const int num = sourceSize[0]; + const int channels = sourceSize[1]; + const int sourceHeight = sourceSize[2]; + const int sourceWidth = sourceSize[3]; + const int targetHeight = targetSize[2]; + const int targetWidth = targetSize[3]; + + const auto sourceChannelOffset = sourceHeight * sourceWidth; + const auto targetChannelOffset = targetWidth * targetHeight; + + // Perform resize + merging + const auto sourceNumOffset = channels * sourceChannelOffset; + for (auto c = 0 ; c < channels ; c++) { + cv::Mat target (targetHeight, targetWidth, CV_32F, (void*)(targetPtr + c * targetChannelOffset)); + cv::multiply(target, 0.f, target); + cv::Mat t; + for (auto n = 0; n < num; n++) { + cv::Mat source(std::rint(sourceHeight * scaleRatios[n]), std::rint(sourceWidth * scaleRatios[n]), CV_32F, (void*)(sourcePtr + c * sourceChannelOffset + n * sourceNumOffset)); + cv::resize(source, t, cv::Size(targetWidth, targetHeight), 0., 0., cv::INTER_CUBIC); + cv::add(target, t, target); + } + cv::divide(target, (float)num, target); + } } catch (const std::exception& e) { From b54ae119c325926e85f9465110a850b24d9bc248 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 13:11:37 +0000 Subject: [PATCH 32/52] Inference model for pose net size 256x192 --- .../coco/pose_deploy_linevec.prototxt_192x256 | 2976 +++++++++++++++++ 1 file changed, 2976 insertions(+) create mode 100755 models/pose/coco/pose_deploy_linevec.prototxt_192x256 diff --git a/models/pose/coco/pose_deploy_linevec.prototxt_192x256 b/models/pose/coco/pose_deploy_linevec.prototxt_192x256 new file mode 100755 index 000000000..99cc4e4fe --- /dev/null +++ b/models/pose/coco/pose_deploy_linevec.prototxt_192x256 @@ -0,0 +1,2976 @@ +input: "image" +input_dim: 1 +input_dim: 3 +input_dim: 192 # This value will be defined at runtime +input_dim: 256 # This value will be defined at runtime +layer { + name: "conv1_1" + type: "Convolution" + bottom: "image" + top: "conv1_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + 
kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1_stage1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1_stage1" + top: "conv2_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2_stage1" + type: "Pooling" + bottom: "conv2_2" + top: "pool2_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2_stage1" + top: "conv3_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "conv3_4" + type: "Convolution" + bottom: "conv3_3" + top: "conv3_4" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu3_4" + type: "ReLU" + bottom: "conv3_4" + top: "conv3_4" +} +layer { + name: "pool3_stage1" + type: "Pooling" + bottom: "conv3_4" + 
top: "pool3_stage1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3_stage1" + top: "conv4_1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3_CPM" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_3_CPM" + type: "ReLU" + bottom: "conv4_3_CPM" + top: "conv4_3_CPM" +} +layer { + name: "conv4_4_CPM" + type: "Convolution" + bottom: "conv4_3_CPM" + top: "conv4_4_CPM" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu4_4_CPM" + type: "ReLU" + bottom: "conv4_4_CPM" + top: "conv4_4_CPM" +} +layer { + name: "conv5_1_CPM_L1" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L1" + type: "ReLU" + bottom: "conv5_1_CPM_L1" + top: "conv5_1_CPM_L1" +} +layer { + name: "conv5_1_CPM_L2" + type: "Convolution" + bottom: "conv4_4_CPM" + top: "conv5_1_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_1_CPM_L2" + type: "ReLU" + bottom: "conv5_1_CPM_L2" + top: "conv5_1_CPM_L2" +} +layer { + name: "conv5_2_CPM_L1" + type: "Convolution" + bottom: "conv5_1_CPM_L1" + top: "conv5_2_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L1" + type: "ReLU" + bottom: "conv5_2_CPM_L1" + top: "conv5_2_CPM_L1" +} +layer { + name: "conv5_2_CPM_L2" + type: "Convolution" + bottom: "conv5_1_CPM_L2" + top: "conv5_2_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + 
type: "constant" + } + } +} +layer { + name: "relu5_2_CPM_L2" + type: "ReLU" + bottom: "conv5_2_CPM_L2" + top: "conv5_2_CPM_L2" +} +layer { + name: "conv5_3_CPM_L1" + type: "Convolution" + bottom: "conv5_2_CPM_L1" + top: "conv5_3_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L1" + type: "ReLU" + bottom: "conv5_3_CPM_L1" + top: "conv5_3_CPM_L1" +} +layer { + name: "conv5_3_CPM_L2" + type: "Convolution" + bottom: "conv5_2_CPM_L2" + top: "conv5_3_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_3_CPM_L2" + type: "ReLU" + bottom: "conv5_3_CPM_L2" + top: "conv5_3_CPM_L2" +} +layer { + name: "conv5_4_CPM_L1" + type: "Convolution" + bottom: "conv5_3_CPM_L1" + top: "conv5_4_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L1" + type: "ReLU" + bottom: "conv5_4_CPM_L1" + top: "conv5_4_CPM_L1" +} +layer { + name: "conv5_4_CPM_L2" + type: "Convolution" + bottom: "conv5_3_CPM_L2" + top: "conv5_4_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu5_4_CPM_L2" + type: "ReLU" + bottom: "conv5_4_CPM_L2" + top: "conv5_4_CPM_L2" +} +layer { + name: "conv5_5_CPM_L1" + type: "Convolution" + bottom: "conv5_4_CPM_L1" + top: "conv5_5_CPM_L1" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "conv5_5_CPM_L2" + type: "Convolution" + bottom: "conv5_4_CPM_L2" + top: "conv5_5_CPM_L2" + param { + lr_mult: 1.0 + decay_mult: 1 + } + param { + lr_mult: 2.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage2" + type: "Concat" + bottom: "conv5_5_CPM_L1" + bottom: "conv5_5_CPM_L2" + bottom: "conv4_4_CPM" + top: "concat_stage2" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage2_L1" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L1" + type: "ReLU" + bottom: "Mconv1_stage2_L1" + top: "Mconv1_stage2_L1" +} +layer { + name: "Mconv1_stage2_L2" + type: "Convolution" + bottom: "concat_stage2" + top: "Mconv1_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + 
decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage2_L2" + type: "ReLU" + bottom: "Mconv1_stage2_L2" + top: "Mconv1_stage2_L2" +} +layer { + name: "Mconv2_stage2_L1" + type: "Convolution" + bottom: "Mconv1_stage2_L1" + top: "Mconv2_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L1" + type: "ReLU" + bottom: "Mconv2_stage2_L1" + top: "Mconv2_stage2_L1" +} +layer { + name: "Mconv2_stage2_L2" + type: "Convolution" + bottom: "Mconv1_stage2_L2" + top: "Mconv2_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage2_L2" + type: "ReLU" + bottom: "Mconv2_stage2_L2" + top: "Mconv2_stage2_L2" +} +layer { + name: "Mconv3_stage2_L1" + type: "Convolution" + bottom: "Mconv2_stage2_L1" + top: "Mconv3_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L1" + type: "ReLU" + bottom: "Mconv3_stage2_L1" + top: "Mconv3_stage2_L1" +} +layer { + name: "Mconv3_stage2_L2" + type: "Convolution" + bottom: "Mconv2_stage2_L2" + top: "Mconv3_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage2_L2" + type: "ReLU" + bottom: "Mconv3_stage2_L2" + top: "Mconv3_stage2_L2" +} +layer { + name: "Mconv4_stage2_L1" + type: "Convolution" + bottom: "Mconv3_stage2_L1" + top: "Mconv4_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L1" + type: "ReLU" + bottom: "Mconv4_stage2_L1" + top: "Mconv4_stage2_L1" +} +layer { + name: "Mconv4_stage2_L2" + type: "Convolution" + bottom: "Mconv3_stage2_L2" + top: "Mconv4_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage2_L2" + type: "ReLU" + bottom: "Mconv4_stage2_L2" + top: "Mconv4_stage2_L2" +} +layer { + name: "Mconv5_stage2_L1" + type: "Convolution" + bottom: "Mconv4_stage2_L1" + top: "Mconv5_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: 
"Mrelu5_stage2_L1" + type: "ReLU" + bottom: "Mconv5_stage2_L1" + top: "Mconv5_stage2_L1" +} +layer { + name: "Mconv5_stage2_L2" + type: "Convolution" + bottom: "Mconv4_stage2_L2" + top: "Mconv5_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage2_L2" + type: "ReLU" + bottom: "Mconv5_stage2_L2" + top: "Mconv5_stage2_L2" +} +layer { + name: "Mconv6_stage2_L1" + type: "Convolution" + bottom: "Mconv5_stage2_L1" + top: "Mconv6_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L1" + type: "ReLU" + bottom: "Mconv6_stage2_L1" + top: "Mconv6_stage2_L1" +} +layer { + name: "Mconv6_stage2_L2" + type: "Convolution" + bottom: "Mconv5_stage2_L2" + top: "Mconv6_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage2_L2" + type: "ReLU" + bottom: "Mconv6_stage2_L2" + top: "Mconv6_stage2_L2" +} +layer { + name: "Mconv7_stage2_L1" + type: "Convolution" + bottom: "Mconv6_stage2_L1" + top: "Mconv7_stage2_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage2_L2" + type: "Convolution" + bottom: "Mconv6_stage2_L2" + top: "Mconv7_stage2_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage3" + type: "Concat" + bottom: "Mconv7_stage2_L1" + bottom: "Mconv7_stage2_L2" + bottom: "conv4_4_CPM" + top: "concat_stage3" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage3_L1" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L1" + type: "ReLU" + bottom: "Mconv1_stage3_L1" + top: "Mconv1_stage3_L1" +} +layer { + name: "Mconv1_stage3_L2" + type: "Convolution" + bottom: "concat_stage3" + top: "Mconv1_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage3_L2" + type: "ReLU" + bottom: "Mconv1_stage3_L2" + top: "Mconv1_stage3_L2" +} +layer { + name: "Mconv2_stage3_L1" + type: "Convolution" + bottom: "Mconv1_stage3_L1" + top: "Mconv2_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + 
param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L1" + type: "ReLU" + bottom: "Mconv2_stage3_L1" + top: "Mconv2_stage3_L1" +} +layer { + name: "Mconv2_stage3_L2" + type: "Convolution" + bottom: "Mconv1_stage3_L2" + top: "Mconv2_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage3_L2" + type: "ReLU" + bottom: "Mconv2_stage3_L2" + top: "Mconv2_stage3_L2" +} +layer { + name: "Mconv3_stage3_L1" + type: "Convolution" + bottom: "Mconv2_stage3_L1" + top: "Mconv3_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L1" + type: "ReLU" + bottom: "Mconv3_stage3_L1" + top: "Mconv3_stage3_L1" +} +layer { + name: "Mconv3_stage3_L2" + type: "Convolution" + bottom: "Mconv2_stage3_L2" + top: "Mconv3_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage3_L2" + type: "ReLU" + bottom: "Mconv3_stage3_L2" + top: "Mconv3_stage3_L2" +} +layer { + name: "Mconv4_stage3_L1" + type: "Convolution" + bottom: "Mconv3_stage3_L1" + top: "Mconv4_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L1" + type: "ReLU" + bottom: "Mconv4_stage3_L1" + top: "Mconv4_stage3_L1" +} +layer { + name: "Mconv4_stage3_L2" + type: "Convolution" + bottom: "Mconv3_stage3_L2" + top: "Mconv4_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage3_L2" + type: "ReLU" + bottom: "Mconv4_stage3_L2" + top: "Mconv4_stage3_L2" +} +layer { + name: "Mconv5_stage3_L1" + type: "Convolution" + bottom: "Mconv4_stage3_L1" + top: "Mconv5_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage3_L1" + type: "ReLU" + bottom: "Mconv5_stage3_L1" + top: "Mconv5_stage3_L1" +} +layer { + name: "Mconv5_stage3_L2" + type: "Convolution" + bottom: "Mconv4_stage3_L2" + top: "Mconv5_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer 
{ + name: "Mrelu5_stage3_L2" + type: "ReLU" + bottom: "Mconv5_stage3_L2" + top: "Mconv5_stage3_L2" +} +layer { + name: "Mconv6_stage3_L1" + type: "Convolution" + bottom: "Mconv5_stage3_L1" + top: "Mconv6_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L1" + type: "ReLU" + bottom: "Mconv6_stage3_L1" + top: "Mconv6_stage3_L1" +} +layer { + name: "Mconv6_stage3_L2" + type: "Convolution" + bottom: "Mconv5_stage3_L2" + top: "Mconv6_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage3_L2" + type: "ReLU" + bottom: "Mconv6_stage3_L2" + top: "Mconv6_stage3_L2" +} +layer { + name: "Mconv7_stage3_L1" + type: "Convolution" + bottom: "Mconv6_stage3_L1" + top: "Mconv7_stage3_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage3_L2" + type: "Convolution" + bottom: "Mconv6_stage3_L2" + top: "Mconv7_stage3_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage4" + type: "Concat" + bottom: "Mconv7_stage3_L1" + bottom: "Mconv7_stage3_L2" + bottom: "conv4_4_CPM" + top: "concat_stage4" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage4_L1" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L1" + type: "ReLU" + bottom: "Mconv1_stage4_L1" + top: "Mconv1_stage4_L1" +} +layer { + name: "Mconv1_stage4_L2" + type: "Convolution" + bottom: "concat_stage4" + top: "Mconv1_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage4_L2" + type: "ReLU" + bottom: "Mconv1_stage4_L2" + top: "Mconv1_stage4_L2" +} +layer { + name: "Mconv2_stage4_L1" + type: "Convolution" + bottom: "Mconv1_stage4_L1" + top: "Mconv2_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L1" + type: "ReLU" + bottom: "Mconv2_stage4_L1" + top: "Mconv2_stage4_L1" +} +layer { + name: "Mconv2_stage4_L2" + type: "Convolution" + bottom: "Mconv1_stage4_L2" + top: "Mconv2_stage4_L2" + param { + lr_mult: 4.0 + 
decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage4_L2" + type: "ReLU" + bottom: "Mconv2_stage4_L2" + top: "Mconv2_stage4_L2" +} +layer { + name: "Mconv3_stage4_L1" + type: "Convolution" + bottom: "Mconv2_stage4_L1" + top: "Mconv3_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L1" + type: "ReLU" + bottom: "Mconv3_stage4_L1" + top: "Mconv3_stage4_L1" +} +layer { + name: "Mconv3_stage4_L2" + type: "Convolution" + bottom: "Mconv2_stage4_L2" + top: "Mconv3_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage4_L2" + type: "ReLU" + bottom: "Mconv3_stage4_L2" + top: "Mconv3_stage4_L2" +} +layer { + name: "Mconv4_stage4_L1" + type: "Convolution" + bottom: "Mconv3_stage4_L1" + top: "Mconv4_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L1" + type: "ReLU" + bottom: "Mconv4_stage4_L1" + top: "Mconv4_stage4_L1" +} +layer { + name: "Mconv4_stage4_L2" + type: "Convolution" + bottom: "Mconv3_stage4_L2" + top: "Mconv4_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage4_L2" + type: "ReLU" + bottom: "Mconv4_stage4_L2" + top: "Mconv4_stage4_L2" +} +layer { + name: "Mconv5_stage4_L1" + type: "Convolution" + bottom: "Mconv4_stage4_L1" + top: "Mconv5_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L1" + type: "ReLU" + bottom: "Mconv5_stage4_L1" + top: "Mconv5_stage4_L1" +} +layer { + name: "Mconv5_stage4_L2" + type: "Convolution" + bottom: "Mconv4_stage4_L2" + top: "Mconv5_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage4_L2" + type: "ReLU" + bottom: "Mconv5_stage4_L2" + top: "Mconv5_stage4_L2" +} +layer { + name: "Mconv6_stage4_L1" + type: "Convolution" + bottom: "Mconv5_stage4_L1" + top: "Mconv6_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: 
"constant" + } + } +} +layer { + name: "Mrelu6_stage4_L1" + type: "ReLU" + bottom: "Mconv6_stage4_L1" + top: "Mconv6_stage4_L1" +} +layer { + name: "Mconv6_stage4_L2" + type: "Convolution" + bottom: "Mconv5_stage4_L2" + top: "Mconv6_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage4_L2" + type: "ReLU" + bottom: "Mconv6_stage4_L2" + top: "Mconv6_stage4_L2" +} +layer { + name: "Mconv7_stage4_L1" + type: "Convolution" + bottom: "Mconv6_stage4_L1" + top: "Mconv7_stage4_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage4_L2" + type: "Convolution" + bottom: "Mconv6_stage4_L2" + top: "Mconv7_stage4_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage5" + type: "Concat" + bottom: "Mconv7_stage4_L1" + bottom: "Mconv7_stage4_L2" + bottom: "conv4_4_CPM" + top: "concat_stage5" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage5_L1" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L1" + type: "ReLU" + bottom: "Mconv1_stage5_L1" + top: "Mconv1_stage5_L1" +} +layer { + name: "Mconv1_stage5_L2" + type: "Convolution" + bottom: "concat_stage5" + top: "Mconv1_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage5_L2" + type: "ReLU" + bottom: "Mconv1_stage5_L2" + top: "Mconv1_stage5_L2" +} +layer { + name: "Mconv2_stage5_L1" + type: "Convolution" + bottom: "Mconv1_stage5_L1" + top: "Mconv2_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L1" + type: "ReLU" + bottom: "Mconv2_stage5_L1" + top: "Mconv2_stage5_L1" +} +layer { + name: "Mconv2_stage5_L2" + type: "Convolution" + bottom: "Mconv1_stage5_L2" + top: "Mconv2_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage5_L2" + type: "ReLU" + bottom: "Mconv2_stage5_L2" + top: "Mconv2_stage5_L2" +} +layer { + name: "Mconv3_stage5_L1" + type: "Convolution" + bottom: "Mconv2_stage5_L1" + top: "Mconv3_stage5_L1" + param 
{ + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L1" + type: "ReLU" + bottom: "Mconv3_stage5_L1" + top: "Mconv3_stage5_L1" +} +layer { + name: "Mconv3_stage5_L2" + type: "Convolution" + bottom: "Mconv2_stage5_L2" + top: "Mconv3_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage5_L2" + type: "ReLU" + bottom: "Mconv3_stage5_L2" + top: "Mconv3_stage5_L2" +} +layer { + name: "Mconv4_stage5_L1" + type: "Convolution" + bottom: "Mconv3_stage5_L1" + top: "Mconv4_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L1" + type: "ReLU" + bottom: "Mconv4_stage5_L1" + top: "Mconv4_stage5_L1" +} +layer { + name: "Mconv4_stage5_L2" + type: "Convolution" + bottom: "Mconv3_stage5_L2" + top: "Mconv4_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage5_L2" + type: "ReLU" + bottom: "Mconv4_stage5_L2" + top: "Mconv4_stage5_L2" +} +layer { + name: "Mconv5_stage5_L1" + type: "Convolution" + bottom: "Mconv4_stage5_L1" + top: "Mconv5_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L1" + type: "ReLU" + bottom: "Mconv5_stage5_L1" + top: "Mconv5_stage5_L1" +} +layer { + name: "Mconv5_stage5_L2" + type: "Convolution" + bottom: "Mconv4_stage5_L2" + top: "Mconv5_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage5_L2" + type: "ReLU" + bottom: "Mconv5_stage5_L2" + top: "Mconv5_stage5_L2" +} +layer { + name: "Mconv6_stage5_L1" + type: "Convolution" + bottom: "Mconv5_stage5_L1" + top: "Mconv6_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L1" + type: "ReLU" + bottom: "Mconv6_stage5_L1" + top: "Mconv6_stage5_L1" +} +layer { + name: "Mconv6_stage5_L2" + type: "Convolution" + bottom: "Mconv5_stage5_L2" + top: "Mconv6_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler 
{ + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage5_L2" + type: "ReLU" + bottom: "Mconv6_stage5_L2" + top: "Mconv6_stage5_L2" +} +layer { + name: "Mconv7_stage5_L1" + type: "Convolution" + bottom: "Mconv6_stage5_L1" + top: "Mconv7_stage5_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage5_L2" + type: "Convolution" + bottom: "Mconv6_stage5_L2" + top: "Mconv7_stage5_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage6" + type: "Concat" + bottom: "Mconv7_stage5_L1" + bottom: "Mconv7_stage5_L2" + bottom: "conv4_4_CPM" + top: "concat_stage6" + concat_param { + axis: 1 + } +} +layer { + name: "Mconv1_stage6_L1" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L1" + type: "ReLU" + bottom: "Mconv1_stage6_L1" + top: "Mconv1_stage6_L1" +} +layer { + name: "Mconv1_stage6_L2" + type: "Convolution" + bottom: "concat_stage6" + top: "Mconv1_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu1_stage6_L2" + type: "ReLU" + bottom: "Mconv1_stage6_L2" + top: "Mconv1_stage6_L2" +} +layer { + name: "Mconv2_stage6_L1" + type: "Convolution" + bottom: "Mconv1_stage6_L1" + top: "Mconv2_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L1" + type: "ReLU" + bottom: "Mconv2_stage6_L1" + top: "Mconv2_stage6_L1" +} +layer { + name: "Mconv2_stage6_L2" + type: "Convolution" + bottom: "Mconv1_stage6_L2" + top: "Mconv2_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu2_stage6_L2" + type: "ReLU" + bottom: "Mconv2_stage6_L2" + top: "Mconv2_stage6_L2" +} +layer { + name: "Mconv3_stage6_L1" + type: "Convolution" + bottom: "Mconv2_stage6_L1" + top: "Mconv3_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L1" + type: "ReLU" + bottom: "Mconv3_stage6_L1" + top: "Mconv3_stage6_L1" +} +layer { + name: "Mconv3_stage6_L2" + type: "Convolution" + bottom: "Mconv2_stage6_L2" + top: 
"Mconv3_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu3_stage6_L2" + type: "ReLU" + bottom: "Mconv3_stage6_L2" + top: "Mconv3_stage6_L2" +} +layer { + name: "Mconv4_stage6_L1" + type: "Convolution" + bottom: "Mconv3_stage6_L1" + top: "Mconv4_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L1" + type: "ReLU" + bottom: "Mconv4_stage6_L1" + top: "Mconv4_stage6_L1" +} +layer { + name: "Mconv4_stage6_L2" + type: "Convolution" + bottom: "Mconv3_stage6_L2" + top: "Mconv4_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu4_stage6_L2" + type: "ReLU" + bottom: "Mconv4_stage6_L2" + top: "Mconv4_stage6_L2" +} +layer { + name: "Mconv5_stage6_L1" + type: "Convolution" + bottom: "Mconv4_stage6_L1" + top: "Mconv5_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L1" + type: "ReLU" + bottom: "Mconv5_stage6_L1" + top: "Mconv5_stage6_L1" +} +layer { + name: "Mconv5_stage6_L2" + type: "Convolution" + bottom: "Mconv4_stage6_L2" + top: "Mconv5_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 3 + kernel_size: 7 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu5_stage6_L2" + type: "ReLU" + bottom: "Mconv5_stage6_L2" + top: "Mconv5_stage6_L2" +} +layer { + name: "Mconv6_stage6_L1" + type: "Convolution" + bottom: "Mconv5_stage6_L1" + top: "Mconv6_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L1" + type: "ReLU" + bottom: "Mconv6_stage6_L1" + top: "Mconv6_stage6_L1" +} +layer { + name: "Mconv6_stage6_L2" + type: "Convolution" + bottom: "Mconv5_stage6_L2" + top: "Mconv6_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mrelu6_stage6_L2" + type: "ReLU" + bottom: "Mconv6_stage6_L2" + top: "Mconv6_stage6_L2" +} +layer { + name: "Mconv7_stage6_L1" + type: "Convolution" + bottom: "Mconv6_stage6_L1" + top: "Mconv7_stage6_L1" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 38 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + 
std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "Mconv7_stage6_L2" + type: "Convolution" + bottom: "Mconv6_stage6_L2" + top: "Mconv7_stage6_L2" + param { + lr_mult: 4.0 + decay_mult: 1 + } + param { + lr_mult: 8.0 + decay_mult: 0 + } + convolution_param { + num_output: 19 + pad: 0 + kernel_size: 1 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "concat_stage7" + type: "Concat" + bottom: "Mconv7_stage6_L2" + bottom: "Mconv7_stage6_L1" + # top: "concat_stage7" + top: "net_output" + concat_param { + axis: 1 + } +} From 7808f896c9fb6ed021c0dc713f2efb12b4bd5555 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 14:37:42 +0000 Subject: [PATCH 33/52] Detailed poseExtractor Timings. --- src/openpose/pose/poseExtractorCaffe.cpp | 38 ++++++++++++++++--- src/openpose/pose/poseExtractorTensorRT.cpp | 41 +++++++++++++++++---- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp index bc4374782..923405af0 100644 --- a/src/openpose/pose/poseExtractorCaffe.cpp +++ b/src/openpose/pose/poseExtractorCaffe.cpp @@ -7,6 +7,22 @@ #include #include +typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings; + +static OpTimings timings; + +static void timeNow(const std::string& label){ + const auto now = std::chrono::high_resolution_clock::now(); + const auto timing = std::make_pair(label, now); + timings.push_back(timing); +} + +static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, + const std::chrono::high_resolution_clock::time_point& t2 ) { + return std::to_string((double)std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms"; +} + + namespace op { PoseExtractorCaffe::PoseExtractorCaffe(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, @@ -79,10 +95,10 @@ namespace op // Security checks if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - + timeNow("Start"); // 1. Caffe deep network spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms - + timeNow("Caffe Forward"); // 2. Resize heat maps + merge different scales spResizeAndMergeCaffe->setScaleRatios(scaleRatios); #ifndef CPU_ONLY @@ -91,7 +107,7 @@ namespace op #else error("ResizeAndMergeCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Resize Heat Maps"); // 3. Get peaks by Non-Maximum Suppression spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY @@ -100,22 +116,32 @@ namespace op #else error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Peaks by nms"); // Get scale net to output const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; - + timeNow("Scale net to output"); // 4. 
Connecting body parts spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); spBodyPartConnectorCaffe->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet spBodyPartConnectorCaffe->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); // spBodyPartConnectorCaffe->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + timeNow("Connect Body Parts"); + + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. Total time: " + totalTimeSec + "."; + op::log(message, op::Priority::High); + + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); + op::log(log_time, op::Priority::High); + } + } catch (const std::exception& e) { diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index a1b07f00e..155a1f425 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -8,6 +8,21 @@ #include #include +typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings; + +static OpTimings timings; + +static void timeNow(const std::string& label){ + const auto now = std::chrono::high_resolution_clock::now(); + const auto timing = std::make_pair(label, now); + timings.push_back(timing); +} + +static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1, + const std::chrono::high_resolution_clock::time_point& t2 ) { + return std::to_string((double)std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms"; +} + namespace op { @@ -82,19 +97,22 @@ namespace op // Security checks if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - + timeNow("Start"); // 1. TensorRT deep network spNet->forwardPass(inputNetData.getConstPtr()); - + timeNow("TensorRT forward"); // 2. Resize heat maps + merge different scales spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); + timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + spResizeAndMergeTensorRT->Forward_cpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); + timeNow("CudaCheck"); #else error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Resize heat Maps"); // 3. 
Get peaks by Non-Maximum Suppression spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY @@ -103,22 +121,31 @@ namespace op #else error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif - + timeNow("Peaks by nms"); // Get scale net to output const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; - + timeNow("Scale net to output"); // 4. Connecting body parts spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet spBodyPartConnectorTensorRT->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); // spBodyPartConnectorTensorRT->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + timeNow("Connect Body Parts"); + + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. Total time: " + totalTimeSec + "."; + op::log(message, op::Priority::High); + + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); + op::log(log_time, op::Priority::High); + } } catch (const std::exception& e) {
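For reference, the instrumentation this patch adds to both extractors boils down to the following self-contained sketch; the `main()` driver and its stage labels are illustrative stand-ins, not OpenPose code. Each `timeNow()` call appends a label/time-point pair, the total is the last checkpoint minus the first, and each per-stage figure is a checkpoint minus its predecessor:

    #include <chrono>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;

    static OpTimings timings;

    // Append one labelled checkpoint.
    static void timeNow(const std::string& label)
    {
        timings.push_back(std::make_pair(label, std::chrono::high_resolution_clock::now()));
    }

    // Elapsed time between two checkpoints, formatted in milliseconds.
    static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1,
                                        const std::chrono::high_resolution_clock::time_point& t2)
    {
        return std::to_string(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms";
    }

    int main()
    {
        timeNow("Start");
        // ... stage 1 (e.g. the network forward pass) would run here ...
        timeNow("Forward");
        // ... stage 2 (e.g. resize + NMS) would run here ...
        timeNow("Postprocessing");

        // Total: last checkpoint minus first.
        std::cout << "Total time: " << timeDiffToString(timings.back().second, timings.front().second) << std::endl;
        // Per stage: each checkpoint minus the one before it.
        for (OpTimings::iterator timing = timings.begin() + 1; timing != timings.end(); ++timing)
            std::cout << timing->first << " - " << timeDiffToString(timing->second, (timing - 1)->second) << std::endl;
        return 0;
    }

One caveat of the pattern as written: the checkpoints live in a file-scope static vector that is never cleared, so across repeated forward passes the vector keeps growing and the "total" spans every frame processed so far, not just the current one.

From 8023fb1e570226ba7163d13a5426d8a1bfd94d3d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 27 Sep 2017 15:55:31 +0000 Subject: [PATCH 34/52] Faster Resize and Merge.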
--- src/openpose/core/resizeAndMergeBase.cu | 60 +++++++++------------ src/openpose/pose/poseExtractorTensorRT.cpp | 2 +- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/src/openpose/core/resizeAndMergeBase.cu b/src/openpose/core/resizeAndMergeBase.cu index b60b6b11c..6b551e815 100644 --- a/src/openpose/core/resizeAndMergeBase.cu +++ b/src/openpose/core/resizeAndMergeBase.cu @@ -7,18 +7,15 @@ namespace op const auto THREADS_PER_BLOCK_1D = 16u; template - __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, - const int targetHeight) + __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight, const T invScaleWidth, const T invScaleHeight) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; if (x < targetWidth && y < targetHeight) { - const auto scaleWidth = targetWidth / T(sourceWidth); - const auto scaleHeight = targetHeight / T(sourceHeight); - const T xSource = (x + 0.5f) / scaleWidth - 0.5f; - const T ySource = (y + 0.5f) / scaleHeight - 0.5f; + const T xSource = (x + 0.5f) * invScaleWidth - 0.5f; + const T ySource = (y + 0.5f) * invScaleHeight - 0.5f; targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); } @@ -30,7 +27,14 @@ namespace op { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + + const auto currentWidth = sourceWidth; + const auto currentHeight = sourceHeight; + const auto scaleWidth = targetWidth / currentWidth; + const auto scaleHeight = targetHeight / currentHeight; + + if (x < targetWidth && y < targetHeight) { auto& targetPixel = targetPtr[y*targetWidth+x]; @@ -38,17 +42,11 @@ namespace op // targetPixel = -1000.f; // For fastMax for (auto n = 0; n < num; n++) { - const auto currentWidth = sourceWidth * scaleRatios[n]; - const auto currentHeight = sourceHeight * scaleRatios[n]; - - const auto scaleWidth = targetWidth / currentWidth; - const auto scaleHeight = targetHeight / currentHeight; const T xSource = (x + 0.5f) / scaleWidth - 0.5f; const T ySource = (y + 0.5f) / scaleHeight - 0.5f; const T* const sourcePtrN = sourcePtr + n * sourceNumOffset; - const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, intRound(currentWidth), - intRound(currentHeight), sourceWidth); + const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); targetPixel += interpolated; // targetPixel = fastMax(targetPixel, interpolated); } @@ -73,44 +71,38 @@ namespace op const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x), getNumberCudaBlocks(targetHeight, threadsPerBlock.y)}; const auto sourceChannelOffset = sourceHeight * sourceWidth; const auto targetChannelOffset = targetWidth * targetHeight; - + const auto scaleWidth = sourceWidth/T(targetWidth); + const auto scaleHeight = sourceHeight/T(targetHeight); // No multi-scale merging - if (targetSize[0] > 1) + /*if (targetSize[0] > 1) { for (auto n = 0; n < num; n++) - { - const auto offsetBase = n*channels; + {*/ for (auto c = 0 ; c < channels ; c++) { - const auto offset = offsetBase + c; - resizeKernel<<>>(targetPtr + offset * targetChannelOffset, - sourcePtr + offset * sourceChannelOffset, - sourceWidth, sourceHeight, targetWidth, targetHeight); + 
resizeKernel<<>>(targetPtr + c * targetChannelOffset, + sourcePtr + c * sourceChannelOffset, + sourceWidth, sourceHeight, targetWidth, targetHeight, scaleWidth, scaleHeight); } +/* } } // Multi-scale merging else { - // If scale_number > 1 --> scaleRatios must be set - if (scaleRatios.size() != num) - error("The scale ratios size must be equal than the number of scales.", __LINE__, __FUNCTION__, __FILE__); - const auto maxScales = 10; - if (scaleRatios.size() > maxScales) - error("The maximum number of scales is " + std::to_string(maxScales) + ".", __LINE__, __FUNCTION__, __FILE__); - // Copy scaleRatios - T* scaleRatiosGpuPtr; - cudaMalloc((void**)&scaleRatiosGpuPtr, maxScales * sizeof(T)); - cudaMemcpy(scaleRatiosGpuPtr, scaleRatios.data(), scaleRatios.size() * sizeof(T), cudaMemcpyHostToDevice); + const auto currentWidth = sourceWidth; + const auto currentHeight = sourceHeight; + + const auto scaleWidth = targetWidth / currentWidth; + const auto scaleHeight = targetHeight / currentHeight; + // Perform resize + merging const auto sourceNumOffset = channels * sourceChannelOffset; for (auto c = 0 ; c < channels ; c++) resizeKernelAndMerge<<>>(targetPtr + c * targetChannelOffset, sourcePtr + c * sourceChannelOffset, sourceNumOffset, num, scaleRatiosGpuPtr, sourceWidth, sourceHeight, targetWidth, targetHeight); - // Free memory - cudaFree(scaleRatiosGpuPtr); - } + }*/ cudaCheck(__LINE__, __FUNCTION__, __FILE__); } diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 155a1f425..a367110bd 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -105,7 +105,7 @@ spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - spResizeAndMergeTensorRT->Forward_cpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); timeNow("CudaCheck"); #else
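The essence of the kernel change in this patch, as a minimal CPU-side sketch (names illustrative, not the actual CUDA code): the old kernel derived the scale and then divided for every target pixel, while the new signature receives precomputed reciprocals (invScaleWidth, invScaleHeight) from the host and only multiplies inside the hot loop. Both mappings are algebraically identical:

    #include <cstdio>

    // Old form: a scale is derived from, and divided by, once per pixel.
    static float xSourceNaive(int x, int sourceWidth, int targetWidth)
    {
        const float scaleWidth = targetWidth / (float)sourceWidth;
        return (x + 0.5f) / scaleWidth - 0.5f;
    }

    // New form: the host precomputes the reciprocal once per kernel launch.
    static float xSourceHoisted(int x, float invScaleWidth)
    {
        return (x + 0.5f) * invScaleWidth - 0.5f;
    }

    int main()
    {
        const int sourceWidth = 46, targetWidth = 368;                // illustrative net-output to frame upscale
        const float invScaleWidth = sourceWidth / (float)targetWidth; // hoisted out of the per-pixel loop
        for (int x = 0; x < 3; x++)                                   // both mappings agree
            std::printf("%.4f %.4f\n", xSourceNaive(x, sourceWidth, targetWidth), xSourceHoisted(x, invScaleWidth));
        return 0;
    }

Note that the multi-scale merging branch is simply commented out here rather than ported, so this version only handles a single scale; the hunk below also restores the Forward_gpu call that PATCH 33 had accidentally switched to Forward_cpu, and PATCH 37 further down reverts these kernel changes altogether.

From ec58a48c8ddba7ab2406464673ec1b539638d27f Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 11 Oct 2017 21:58:53 +0200 Subject: [PATCH 35/52] TENSORRT precompiler guards --- Makefile | 6 +++++- include/openpose/core/netTensorRT.hpp | 4 ++-- include/openpose/pose/poseExtractorTensorRT.hpp | 4 ++-- include/openpose/wrapper/wrapper.hpp | 4 ++++ src/openpose/core/netTensorRT.cpp | 4 ++-- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 7bc109ae7..061fa8de7 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,8 @@ ifeq ($(DEEP_NET), tensorflow) # Torch else ifeq ($(DEEP_NET), torch) # COMMON_FLAGS += -DUSE_TORCH +else ifeq ($(DEEP_NET), tensorrt) + COMMON_FLAGS += -DUSE_TENSORRT # Caffe else COMMON_FLAGS += -DUSE_CAFFE @@ -146,7 +148,9 @@ ifeq ($(USE_CUDA), 1) endif # TensorRT -LIBRARIES += nvinfer nvcaffe_parser +ifeq ($(DEEP_NET), tensorrt) + LIBRARIES += nvinfer nvcaffe_parser +endif # LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 caffe LIBRARIES += glog gflags boost_system boost_filesystem m hdf5_hl hdf5 diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 00e176ab0..0eaaaf7d3 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #ifndef OPENPOSE_CORE_NET_TENSORRT_HPP #define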
OPENPOSE_CORE_NET_TENSORRT_HPP @@ -57,4 +57,4 @@ namespace op } #endif // OPENPOSE_CORE_NET_TENSORRT_HPP -#endif +#endif // USE_TENSORRT diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 270d2a8f4..f358d03ca 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP @@ -49,4 +49,4 @@ namespace op } #endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP -#endif +#endif // USE_TENSORRT diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp index b063e971c..bb8d54d91 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -638,7 +638,11 @@ namespace op { // Pose estimators for (auto gpuId = 0; gpuId < gpuNumber; gpuId++) +#ifndef USE_TENSORRT poseExtractors.emplace_back(std::make_shared( +#else + poseExtractors.emplace_back(std::make_shared( +#endif poseNetInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, modelFolder, gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale, diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 6d2c81293..8894aeac3 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #include // std::accumulate #include #include @@ -307,4 +307,4 @@ namespace op } } -#endif +#endif // USE_TENSORRT From 33aa099dbead80be092340e6d0d0eb0e62fbdb25 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 11 Oct 2017 22:09:14 +0200 Subject: [PATCH 36/52] TENSORRT compilation is still partly using caffe --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 061fa8de7..2de2c73a7 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,10 @@ else LDFLAGS += -Wl,-rpath=$(CAFFE_DIR)/lib INCLUDE_DIRS += $(CAFFE_DIR)/include LIBRARY_DIRS += $(CAFFE_DIR)/lib + + ifeq ($(DEEP_NET), tensorrt) + COMMON_FLAGS += -DUSE_TENSORRT + endif endif ############################## From 359b601fdbaec3c4fcb44093dc76d03ae57a3d2b Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 11 Oct 2017 23:35:41 +0200 Subject: [PATCH 37/52] Missing guards for TensorRT --- .../3_extract_from_image_TensorRT.cpp | 3 + src/openpose/core/resizeAndMergeBase.cpp | 71 ++++++++++++------- src/openpose/core/resizeAndMergeBase.cu | 34 +++++---- src/openpose/pose/poseExtractorTensorRT.cpp | 2 +- 4 files changed, 65 insertions(+), 45 deletions(-) diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp index adba661b0..4a522fbc2 100644 --- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp +++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp @@ -69,6 +69,7 @@ static std::string timeDiffToString(const std::chrono::high_resolution_clock::ti int openPoseTutorialPose3() { +#ifdef USE_TENSORRT op::log("Starting pose estimation.", op::Priority::High); timeNow("Start"); @@ -153,6 +154,7 @@ int openPoseTutorialPose3() op::log(log_time, op::Priority::High); } +#endif // USE_TENSORRT // Return successful message return 0; @@ -169,3 +171,4 @@ int main(int argc, char *argv[]) // Running openPoseTutorialPose1 return openPoseTutorialPose3(); } + diff --git 
a/src/openpose/core/resizeAndMergeBase.cpp b/src/openpose/core/resizeAndMergeBase.cpp index 2a47de7fe..b825bb311 100644 --- a/src/openpose/core/resizeAndMergeBase.cpp +++ b/src/openpose/core/resizeAndMergeBase.cpp @@ -1,8 +1,6 @@ -#include -#include +// #include #include - namespace op { template @@ -11,29 +9,50 @@ namespace op { try { - const int num = sourceSize[0]; - const int channels = sourceSize[1]; - const int sourceHeight = sourceSize[2]; - const int sourceWidth = sourceSize[3]; - const int targetHeight = targetSize[2]; - const int targetWidth = targetSize[3]; - - const auto sourceChannelOffset = sourceHeight * sourceWidth; - const auto targetChannelOffset = targetWidth * targetHeight; - - // Perform resize + merging - const auto sourceNumOffset = channels * sourceChannelOffset; - for (auto c = 0 ; c < channels ; c++) { - cv::Mat target (targetHeight, targetWidth, CV_32F, (void*)(targetPtr + c * targetChannelOffset)); - cv::multiply(target, 0.f, target); - cv::Mat t; - for (auto n = 0; n < num; n++) { - cv::Mat source(std::rint(sourceHeight * scaleRatios[n]), std::rint(sourceWidth * scaleRatios[n]), CV_32F, (void*)(sourcePtr + c * sourceChannelOffset + n * sourceNumOffset)); - cv::resize(source, t, cv::Size(targetWidth, targetHeight), 0., 0., cv::INTER_CUBIC); - cv::add(target, t, target); - } - cv::divide(target, (float)num, target); - } + UNUSED(targetPtr); + UNUSED(sourcePtr); + UNUSED(scaleInputToNetInputs); + UNUSED(targetSize); + UNUSED(sourceSize); + error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__); + + // TODO: THIS CODE IS WORKING, BUT IT DOES NOT CONSIDER THE SCALES (I.E. SCALE NUMBER, START AND GAP) + // const int num = bottom->shape(0); + // const int channel = bottom->shape(1); + // const int sourceHeight = bottom->shape(2); + // const int sourceWidth = bottom->shape(3); + // const int targetHeight = top->shape(2); + // const int targetWidth = top->shape(3); + + // //stupid method + // for (int n = 0; n < num; n++) + // { + // for (int c = 0; c < channel; c++) + // { + // //fill source + // cv::Mat source(sourceWidth, sourceHeight, CV_32FC1); + // const auto sourceOffsetChannel = sourceHeight * sourceWidth; + // const auto sourceOffsetNum = sourceOffsetChannel * channel; + // const auto sourceOffset = n*sourceOffsetNum + c*sourceOffsetChannel; + // const T* const sourcePtr = bottom->cpu_data(); + // for (int y = 0; y < sourceHeight; y++) + // for (int x = 0; x < sourceWidth; x++) + // source.at(x,y) = sourcePtr[sourceOffset + y*sourceWidth + x]; + + // // spatial resize + // cv::Mat target; + // cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, CV_INTER_CUBIC); + + // //fill top + // const auto targetOffsetChannel = targetHeight * targetWidth; + // const auto targetOffsetNum = targetOffsetChannel * channel; + // const auto targetOffset = n*targetOffsetNum + c*targetOffsetChannel; + // T* targetPtr = top->mutable_cpu_data(); + // for (int y = 0; y < targetHeight; y++) + // for (int x = 0; x < targetWidth; x++) + // targetPtr[targetOffset + y*targetWidth + x] = target.at(x,y); + // } + // } } catch (const std::exception& e) { diff --git a/src/openpose/core/resizeAndMergeBase.cu b/src/openpose/core/resizeAndMergeBase.cu index 7742b75c9..d7900aa24 100644 --- a/src/openpose/core/resizeAndMergeBase.cu +++ b/src/openpose/core/resizeAndMergeBase.cu @@ -7,15 +7,18 @@ namespace op const auto THREADS_PER_BLOCK_1D = 16u; template - __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int 
sourceHeight, const int targetWidth, const int targetHeight, const T invScaleWidth, const T invScaleHeight) + __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, + const int targetHeight) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; if (x < targetWidth && y < targetHeight) { - const T xSource = (x + 0.5f) * invScaleWidth - 0.5f; - const T ySource = (y + 0.5f) * invScaleHeight - 0.5f; + const auto scaleWidth = targetWidth / T(sourceWidth); + const auto scaleHeight = targetHeight / T(sourceHeight); + const T xSource = (x + 0.5f) / scaleWidth - 0.5f; + const T ySource = (y + 0.5f) / scaleHeight - 0.5f; targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); } @@ -27,14 +30,7 @@ namespace op { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; - - const auto currentWidth = sourceWidth; - const auto currentHeight = sourceHeight; - const auto scaleWidth = targetWidth / currentWidth; - const auto scaleHeight = targetHeight / currentHeight; - - if (x < targetWidth && y < targetHeight) { auto& targetPixel = targetPtr[y*targetWidth+x]; @@ -51,7 +47,8 @@ namespace op const T ySource = (y + 0.5f) / scaleHeight - 0.5f; const T* const sourcePtrN = sourcePtr + n * sourceNumOffset; - const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); + const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, intRound(currentWidth), + intRound(currentHeight), sourceWidth); targetPixel += interpolated; // targetPixel = fastMax(targetPixel, interpolated); } @@ -76,18 +73,19 @@ namespace op const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x), getNumberCudaBlocks(targetHeight, threadsPerBlock.y)}; const auto sourceChannelOffset = sourceHeight * sourceWidth; const auto targetChannelOffset = targetWidth * targetHeight; - const auto scaleWidth = sourceWidth/T(targetWidth); - const auto scaleHeight = sourceHeight/T(targetHeight); + // No multi-scale merging - /*if (targetSize[0] > 1) + if (targetSize[0] > 1) { for (auto n = 0; n < num; n++) - {*/ + { + const auto offsetBase = n*channels; for (auto c = 0 ; c < channels ; c++) { - resizeKernel<<>>(targetPtr + c * targetChannelOffset, - sourcePtr + c * sourceChannelOffset, - sourceWidth, sourceHeight, targetWidth, targetHeight, scaleWidth, scaleHeight); + const auto offset = offsetBase + c; + resizeKernel<<>>(targetPtr + offset * targetChannelOffset, + sourcePtr + offset * sourceChannelOffset, + sourceWidth, sourceHeight, targetWidth, targetHeight); } } } diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index a367110bd..744997001 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CAFFE +#ifdef USE_TENSORRT #include #include #include From d4a89d05bc39801ae0fbc47ce0d0b520c53c5655 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 17:02:08 +0000 Subject: [PATCH 38/52] =?UTF-8?q?PIMPL=C2=A0version=20of=20poseExtractorTe?= =?UTF-8?q?nsorRT,=20still=20having=20template=20compilation=20errors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 - include/openpose/pose/headers.hpp | 5 +- 
.../openpose/pose/poseExtractorTensorRT.hpp | 40 +++--- include/openpose/wrapper/wrapper.hpp | 6 +- src/openpose/pose/poseExtractorTensorRT.cpp | 124 +++++++++++++----- .../Makefile.config.Ubuntu16_cuda8_JetsonTX2 | 2 +- .../install_openpose_JetsonTX2_JetPack3.1.sh | 1 - 7 files changed, 113 insertions(+), 67 deletions(-) diff --git a/Makefile b/Makefile index 2de2c73a7..7bbc41229 100644 --- a/Makefile +++ b/Makefile @@ -61,8 +61,6 @@ ifeq ($(DEEP_NET), tensorflow) # Torch else ifeq ($(DEEP_NET), torch) # COMMON_FLAGS += -DUSE_TORCH -else ifeq ($(DEEP_NET), tensorrt) - COMMON_FLAGS += -DUSE_TENSORRT # Caffe else COMMON_FLAGS += -DUSE_CAFFE diff --git a/include/openpose/pose/headers.hpp b/include/openpose/pose/headers.hpp index c23a9e47b..9e23af7b7 100644 --- a/include/openpose/pose/headers.hpp +++ b/include/openpose/pose/headers.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,8 +16,4 @@ #include #include -#ifdef USE_TENSORRT - #include -#endif // USE_TENSORRT - #endif // OPENPOSE_POSE_HEADERS_HPP diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index f358d03ca..5695baba1 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,52 +1,48 @@ -#ifdef USE_TENSORRT #ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #include #include -#include -#include -#include -#include #include -#include namespace op { class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, - const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, - const ScaleMode heatMapScale = ScaleMode::ZeroToOne); + PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, + const Point& outputSize, const int scaleNumber, const PoseModel poseModel, + const std::string& modelFolder, const int gpuId, + const std::vector& heatMapTypes = {}, + const ScaleMode heatMapScale = ScaleMode::ZeroToOne, + const bool enableGoogleLogging = true); virtual ~PoseExtractorTensorRT(); void netInitializationOnThread(); - void forwardPass(const Array& inputNetData, const Point& inputDataSize, const std::vector& scaleRatios = {1.f}); + void forwardPass(const Array& inputNetData, const Point& inputDataSize, + const std::vector& scaleRatios = {1.f}); + const float* getHeatMapCpuConstPtr() const; const float* getHeatMapGpuConstPtr() const; + std::vector getHeatMapSize() const; + const float* getPoseGpuConstPtr() const; - private: - const float mResizeScale; - std::shared_ptr spNet; - std::shared_ptr> spResizeAndMergeTensorRT; - std::shared_ptr> spNmsTensorRT; - std::shared_ptr> spBodyPartConnectorTensorRT; - // Init with thread - boost::shared_ptr> spTensorRTNetOutputBlob; - std::shared_ptr> spHeatMapsBlob; - std::shared_ptr> spPeaksBlob; - std::shared_ptr> spPoseBlob; + private: + // PIMPL idiom + // http://www.cppsamples.com/common-tasks/pimpl.html + struct ImplPoseExtractorTensorRT; + std::unique_ptr<ImplPoseExtractorTensorRT> upImpl; + // PIMPL requires DELETE_COPY & destructor, or extra code + // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html DELETE_COPY(PoseExtractorTensorRT); }; } #endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP -#endif // USE_TENSORRT
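The PIMPL idiom referenced in the comment above, reduced to a minimal self-contained sketch (the Widget names are illustrative, not OpenPose classes): the public header exposes only a forward declaration plus a std::unique_ptr, so the heavy implementation headers stay confined to a single translation unit. The out-of-line destructor matters because std::unique_ptr must see the complete Impl type at the point where it deletes it:

    // widget.hpp: the public header leaks no implementation details.
    #include <memory>

    class Widget
    {
    public:
        Widget();
        ~Widget();                    // only declared; defined where Impl is complete
        void doWork();
    private:
        struct Impl;                  // forward declaration, layout hidden from clients
        std::unique_ptr<Impl> upImpl;
    };

    // widget.cpp: the only file that would need the heavy headers (Caffe, TensorRT, CUDA, ...).
    #include <iostream>

    struct Widget::Impl
    {
        int state = 0;                // stand-in for nets, blobs and GPU handles
    };

    Widget::Widget() : upImpl{new Impl{}} {}
    Widget::~Widget() = default;      // Impl is complete here, so unique_ptr can delete it
    void Widget::doWork() { std::cout << "state = " << ++upImpl->state << std::endl; }

This is also why the class above declares ~PoseExtractorTensorRT() in the header but defines it in the .cpp, and why copy operations are deleted via DELETE_COPY.

diff --git a/include/openpose/wrapper/wrapper.hpp 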
b/include/openpose/wrapper/wrapper.hpp index bb8d54d91..6370d3dfc 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -638,10 +638,10 @@ namespace op { // Pose estimators for (auto gpuId = 0; gpuId < gpuNumber; gpuId++) -#ifndef USE_TENSORRT - poseExtractors.emplace_back(std::make_shared( -#else +#ifdef USE_TENSORRT poseExtractors.emplace_back(std::make_shared( +#else + poseExtractors.emplace_back(std::make_shared( #endif poseNetInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, modelFolder, gpuId + gpuNumberStart, diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 744997001..ebbd40457 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,6 +1,10 @@ -#ifdef USE_TENSORRT -#include +#ifdef USE_CAFFE +#include +#endif #include +#include +#include +#include #include #include #include @@ -24,18 +28,51 @@ static std::string timeDiffToString(const std::chrono::high_resolution_clock::ti } -namespace op +nameupImpl->space op { - PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, - const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, - const ScaleMode heatMapScale) : - PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, + + struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT + { + #ifdef USE_TENSORRT // implies USE_CAFFE for now + const float upImpl->mResizeScale; + std::shared_ptr upImpl->spNet; + std::shared_ptr> upImpl->spResizeAndMergeTensorRT; + std::shared_ptr> upImpl->spNmsTensorRT; + std::shared_ptr> upImpl->spBodyPartConnectorTensorRT; + // Init with thread + boost::shared_ptr> upImpl->spTensorRTNetOutputBlob; + std::shared_ptr> upImpl->spHeatMapsBlob; + std::shared_ptr> upImpl->spPeaksBlob; + std::shared_ptr> upImpl->spPoseBlob; + + + ImplPoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, + const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const int gpuId, + const std::string& modelFolder, const bool enableGoogleLogging) : mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, - spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, - modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + spNet{std::make_shared(std::array{scaleNumber, 3, + (int)netInputSize.y, (int)netInputSize.x}, + modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, spBodyPartConnectorTensorRT{std::make_shared>()} + { + } + #endif + } + + PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, + const Point& outputSize, const int scaleNumber, + const PoseModel poseModel, const std::string& modelFolder, + const int gpuId, const std::vector& heatMapTypes, + const ScaleMode heatMapScale, const bool enableGoogleLogging) : + PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale}, + #ifdef USE_TENSORRT + , upImpl{new ImplPoseExtractorTensorRT{netInputSize, netOutputSize, scaleNumber, poseModel, + gpuId, modelFolder, enableGoogleLogging}} + #endif { try { @@ -62,24 +99,24 @@ namespace op 
// TensorRT net - spNet->initializationOnThread(); - spTensorRTNetOutputBlob = ((NetTensorRT*)spNet.get())->getOutputBlob(); + upImpl->spNet->initializationOnThread(); + upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer - spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - spResizeAndMergeTensorRT->Reshape({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer - spPeaksBlob = {std::make_shared>(1,1,1,1)}; - spNmsTensorRT->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer - spPoseBlob = {std::make_shared>(1,1,1,1)}; - spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); - spBodyPartConnectorTensorRT->Reshape({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}); + upImpl->spPoseBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); + upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); cudaCheck(__LINE__, __FUNCTION__, __FILE__); log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); @@ -99,13 +136,13 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); timeNow("Start"); // 1. TensorRT deep network - spNet->forwardPass(inputNetData.getConstPtr()); + upImpl->spNet->forwardPass(inputNetData.getConstPtr()); timeNow("TensorRT forward"); // 2. Resize heat maps + merge different scales - spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); + upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - spResizeAndMergeTensorRT->Forward_gpu({spTensorRTNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms + upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); timeNow("CudaCheck"); @@ -114,9 +151,9 @@ namespace op #endif timeNow("Resize heat Maps"); // 3. Get peaks by Non-Maximum Suppression - spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); + upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); #ifndef CPU_ONLY - spNmsTensorRT->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()}); // ~2ms + upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); @@ -128,14 +165,14 @@ namespace op mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; timeNow("Scale net to output"); // 4. 
Connecting body parts - spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); - spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); + upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); // GPU version not implemented yet - spBodyPartConnectorTensorRT->Forward_cpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints); - // spBodyPartConnectorTensorRT->Forward_gpu({spHeatMapsBlob.get(), spPeaksBlob.get()}, {spPoseBlob.get()}, mPoseKeypoints); + upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); + // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); timeNow("Connect Body Parts"); const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); @@ -158,7 +195,7 @@ namespace op try { checkThread(); - return spHeatMapsBlob->cpu_data(); + return upImpl->spHeatMapsBlob->cpu_data(); } catch (const std::exception& e) { @@ -172,7 +209,7 @@ namespace op try { checkThread(); - return spHeatMapsBlob->gpu_data(); + return upImpl->spHeatMapsBlob->gpu_data(); } catch (const std::exception& e) { @@ -181,13 +218,33 @@ namespace op } } + + std::vector PoseExtractorTensorRT::getHeatMapSize() const + { + try + { + #ifdef USE_CAFFE + checkThread(); + return upImpl->spHeatMapsBlob->shape(); + #else + return {}; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return {}; + } + } + + const float* PoseExtractorTensorRT::getPoseGpuConstPtr() const { try { error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); checkThread(); - return spPoseBlob->gpu_data(); + return upImpl->spPoseBlob->gpu_data(); } catch (const std::exception& e) { @@ -197,7 +254,6 @@ namespace op } } -#endif diff --git a/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 b/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 index 476fc9f92..d3241f886 100644 --- a/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 +++ b/ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 @@ -53,7 +53,7 @@ CUDA_ARCH := -gencode arch=compute_30,code=sm_30 \ # DEEP_NET choice: # caffe for Caffe (default and only option so far) -DEEP_NET := caffe +DEEP_NET := tensorrt # Caffe directory CAFFE_DIR := 3rdparty/caffe/distribute diff --git a/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh b/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh index 57d71638e..7387e3bba 100755 --- a/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh +++ b/ubuntu/install_openpose_JetsonTX2_JetPack3.1.sh @@ -51,7 +51,6 @@ echo "" echo "------------------------- Compiling OpenPose 
-------------------------" # Go back to main folder -cd .. # Copy Makefile.config cp ubuntu/Makefile.config.Ubuntu16_cuda8_JetsonTX2 Makefile.config # Compile OpenPose From 766c44ae0970bd7fcdea32aa223098b7d633a8a0 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 18:13:41 +0100 Subject: [PATCH 39/52] Spot the differences part 1. --- include/openpose/pose/poseExtractorTensorRT.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 5695baba1..6d8f53f15 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -1,9 +1,9 @@ #ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP #define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP -#include #include #include +#include namespace op { From c12dd28af48ea3ec60a5cd5f89edf53bf7ba2266 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 18:23:41 +0100 Subject: [PATCH 40/52] Spot the differences part 2 --- src/openpose/pose/poseExtractorTensorRT.cpp | 108 ++++++++++++-------- 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index ebbd40457..19f538211 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -1,5 +1,5 @@ #ifdef USE_CAFFE -#include + #include #endif #include #include @@ -28,38 +28,38 @@ static std::string timeDiffToString(const std::chrono::high_resolution_clock::ti } -nameupImpl->space op +namespace op { struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT { #ifdef USE_TENSORRT // implies USE_CAFFE for now - const float upImpl->mResizeScale; - std::shared_ptr upImpl->spNet; - std::shared_ptr> upImpl->spResizeAndMergeTensorRT; - std::shared_ptr> upImpl->spNmsTensorRT; - std::shared_ptr> upImpl->spBodyPartConnectorTensorRT; + const float mResizeScale; + std::shared_ptr spNet; + std::shared_ptr> spResizeAndMergeTensorRT; + std::shared_ptr> spNmsTensorRT; + std::shared_ptr> spBodyPartConnectorTensorRT; // Init with thread - boost::shared_ptr> upImpl->spTensorRTNetOutputBlob; - std::shared_ptr> upImpl->spHeatMapsBlob; - std::shared_ptr> upImpl->spPeaksBlob; - std::shared_ptr> upImpl->spPoseBlob; + boost::shared_ptr> spTensorRTNetOutputBlob; + std::shared_ptr> spHeatMapsBlob; + std::shared_ptr> spPeaksBlob; + std::shared_ptr> spPoseBlob; ImplPoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const int gpuId, const std::string& modelFolder, const bool enableGoogleLogging) : - mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, - spNet{std::make_shared(std::array{scaleNumber, 3, - (int)netInputSize.y, (int)netInputSize.x}, - modelFolder + POSE_PROTOTXT[(int)poseModel], - modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, - spResizeAndMergeTensorRT{std::make_shared>()}, - spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()} - { - } + mResizeScale{mNetOutputSize.x / (float)netInputSize.x}, + spNet{std::make_shared(std::array{scaleNumber, 3, + (int)netInputSize.y, (int)netInputSize.x}, + modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + spResizeAndMergeTensorRT{std::make_shared>()}, + spNmsTensorRT{std::make_shared>()}, + spBodyPartConnectorTensorRT{std::make_shared>()} + { + } #endif } 
@@ -76,10 +76,27 @@ nameupImpl->space op { try { - const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; - const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); - if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) - error("Net input and output size must be proportional. resizeScaleCheck = " + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); + #ifdef USE_TENSORRT + const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; + const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); + if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) + error("Net input and output size must be proportional. resizeScaleCheck = " + + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); + // Layers parameters + upImpl->spBodyPartConnectorCaffe->setPoseModel(mPoseModel); + #else + UNUSED(netInputSize); + UNUSED(netOutputSize); + UNUSED(outputSize); + UNUSED(scaleNumber); + UNUSED(poseModel); + UNUSED(modelFolder); + UNUSED(gpuId); + UNUSED(heatMapTypes); + UNUSED(heatMapScale); + error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + " functionality.", __LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { @@ -97,29 +114,30 @@ nameupImpl->space op { log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + #ifdef USE_TENSORRT + // TensorRT net + upImpl->spNet->initializationOnThread(); + upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - // TensorRT net - upImpl->spNet->initializationOnThread(); - upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // HeatMaps extractor blob and layer - upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // Pose extractor blob and layer - upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + // HeatMaps extractor blob and layer + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - // Pose extractor blob and layer - upImpl->spPoseBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); - upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + // Pose extractor blob and layer + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Pose extractor blob and layer + upImpl->spPoseBlob = 
{std::make_shared>(1,1,1,1)}; + upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); + upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { From 047d18b831df921c4fdf14951c24b58aecffde2d Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Mon, 13 Nov 2017 18:37:01 +0100 Subject: [PATCH 41/52] Spot the differences 3 --- src/openpose/pose/poseExtractorTensorRT.cpp | 24 +++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 19f538211..df8782e97 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -226,8 +226,12 @@ namespace op { try { - checkThread(); - return upImpl->spHeatMapsBlob->gpu_data(); + #ifdef USE_CAFFE + checkThread(); + return upImpl->spHeatMapsBlob->gpu_data(); + #else + return nullptr; + #endif } catch (const std::exception& e) { @@ -236,7 +240,6 @@ namespace op } } - std::vector PoseExtractorTensorRT::getHeatMapSize() const { try @@ -255,14 +258,17 @@ namespace op } } - const float* PoseExtractorTensorRT::getPoseGpuConstPtr() const { try { - error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); - checkThread(); - return upImpl->spPoseBlob->gpu_data(); + #ifdef USE_CAFFE + error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + checkThread(); + return upImpl->spPoseBlob->gpu_data(); + #else + return nullptr; + #endif } catch (const std::exception& e) { @@ -271,7 +277,3 @@ namespace op } } } - - - - From 9e4d903880c9afd427b90b3d9b663ff3000e86ad Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 14 Nov 2017 06:02:22 +0000 Subject: [PATCH 42/52] Fixed compilation without TensorRT --- src/openpose/pose/poseExtractorTensorRT.cpp | 130 ++++++++++---------- 1 file changed, 68 insertions(+), 62 deletions(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index df8782e97..44e07831c 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -33,7 +33,7 @@ namespace op struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT { - #ifdef USE_TENSORRT // implies USE_CAFFE for now + #ifdef USE_TENSORRT // implies USE_TENSORRT for now const float mResizeScale; std::shared_ptr spNet; std::shared_ptr> spResizeAndMergeTensorRT; @@ -54,21 +54,21 @@ namespace op spNet{std::make_shared(std::array{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x}, modelFolder + POSE_PROTOTXT[(int)poseModel], - modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)}, + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId, enableGoogleLogging)}, spResizeAndMergeTensorRT{std::make_shared>()}, spNmsTensorRT{std::make_shared>()}, spBodyPartConnectorTensorRT{std::make_shared>()} { } #endif - } + }; PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, const ScaleMode heatMapScale, const bool enableGoogleLogging) : - PoseExtractor{netOutputSize, outputSize, 
poseModel, heatMapTypes, heatMapScale}, + PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale} #ifdef USE_TENSORRT , upImpl{new ImplPoseExtractorTensorRT{netInputSize, netOutputSize, scaleNumber, poseModel, gpuId, modelFolder, enableGoogleLogging}} @@ -94,7 +94,7 @@ namespace op UNUSED(gpuId); UNUSED(heatMapTypes); UNUSED(heatMapScale); - error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + error("OpenPose must be compiled with the `USE_TENSORRT` macro definition in order to use this" " functionality.", __LINE__, __FUNCTION__, __FILE__); #endif } @@ -149,58 +149,60 @@ namespace op { try { - // Security checks - if (inputNetData.empty()) - error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); - timeNow("Start"); - // 1. TensorRT deep network - upImpl->spNet->forwardPass(inputNetData.getConstPtr()); - timeNow("TensorRT forward"); - // 2. Resize heat maps + merge different scales - upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); - timeNow("SpResizeAndMergeTensorRT"); - #ifndef CPU_ONLY - upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms - timeNow("RaM forward_gpu"); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - timeNow("CudaCheck"); - #else - error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); - #endif - timeNow("Resize heat Maps"); - // 3. Get peaks by Non-Maximum Suppression - upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); - #ifndef CPU_ONLY - upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - #else - error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); - #endif - timeNow("Peaks by nms"); - // Get scale net to output - const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); - const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; - mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; - timeNow("Scale net to output"); - // 4. Connecting body parts - upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); - upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet - upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); - // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); - timeNow("Connect Body Parts"); - - const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); - const auto message = "Pose estimation successfully finished. 
Total time: " + totalTimeSec + " seconds."; - op::log(message, op::Priority::High); + #ifdef USE_TENSORRT + // Security checks + if (inputNetData.empty()) + error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + timeNow("Start"); + // 1. TensorRT deep network + upImpl->spNet->forwardPass(inputNetData.getConstPtr()); + timeNow("TensorRT forward"); + // 2. Resize heat maps + merge different scales + upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); + timeNow("SpResizeAndMergeTensorRT"); + #ifndef CPU_ONLY + upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms + timeNow("RaM forward_gpu"); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + timeNow("CudaCheck"); + #else + error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + timeNow("Resize heat Maps"); + // 3. Get peaks by Non-Maximum Suppression + upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); + #ifndef CPU_ONLY + upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else + error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + #endif + timeNow("Peaks by nms"); + // Get scale net to output + const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); + const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; + mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; + timeNow("Scale net to output"); + // 4. Connecting body parts + upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); + upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); + upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + // GPU version not implemented yet + upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); + // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); + timeNow("Connect Body Parts"); + + const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); + const auto message = "Pose estimation successfully finished. 
Total time: " + totalTimeSec + " seconds."; + op::log(message, op::Priority::High); - for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { - const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); - op::log(log_time, op::Priority::High); - } + for(OpTimings::iterator timing = timings.begin()+1; timing != timings.end(); ++timing) { + const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); + op::log(log_time, op::Priority::High); + } + #endif } catch (const std::exception& e) { @@ -211,9 +213,13 @@ namespace op const float* PoseExtractorTensorRT::getHeatMapCpuConstPtr() const { try - { - checkThread(); - return upImpl->spHeatMapsBlob->cpu_data(); + { + #ifdef USE_TENSORRT + checkThread(); + return upImpl->spHeatMapsBlob->cpu_data(); + #else + return nullptr; + #endif } catch (const std::exception& e) { @@ -226,7 +232,7 @@ namespace op { try { - #ifdef USE_CAFFE + #ifdef USE_TENSORRT checkThread(); return upImpl->spHeatMapsBlob->gpu_data(); #else @@ -244,7 +250,7 @@ namespace op { try { - #ifdef USE_CAFFE + #ifdef USE_TENSORRT checkThread(); return upImpl->spHeatMapsBlob->shape(); #else @@ -262,7 +268,7 @@ namespace op { try { - #ifdef USE_CAFFE + #ifdef USE_TENSORRT error("GPU pointer for people pose data not implemented yet.", __LINE__, __FUNCTION__, __FILE__); checkThread(); return upImpl->spPoseBlob->gpu_data(); From b3655d0fcde1b87c96f9ee65219937579831a047 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 14 Nov 2017 06:20:33 +0000 Subject: [PATCH 43/52] Fix attempt --- include/openpose/pose/poseExtractorTensorRT.hpp | 4 +--- include/openpose/wrapper/wrapper.hpp | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 6d8f53f15..09bd43383 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -10,9 +10,7 @@ namespace op class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, - const Point& outputSize, const int scaleNumber, const PoseModel poseModel, - const std::string& modelFolder, const int gpuId, + PoseExtractorTensorRT(const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, const ScaleMode heatMapScale = ScaleMode::ZeroToOne, const bool enableGoogleLogging = true); diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp index 6370d3dfc..893cd7dd6 100644 --- a/include/openpose/wrapper/wrapper.hpp +++ b/include/openpose/wrapper/wrapper.hpp @@ -643,7 +643,6 @@ namespace op #else poseExtractors.emplace_back(std::make_shared( #endif - poseNetInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.poseModel, modelFolder, gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale, wrapperStructPose.enableGoogleLogging From e76dc7194ad2b38be6a7cc7d3dae9bc35e18cf4f Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Tue, 14 Nov 2017 10:48:19 +0000 Subject: [PATCH 44/52] Wrong variable name --- src/openpose/pose/poseExtractorTensorRT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index 44e07831c..5675c5bad 100644 --- 
a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -50,7 +50,7 @@ namespace op
                                   const Point<int>& outputSize, const int scaleNumber,
                                   const PoseModel poseModel, const int gpuId,
                                   const std::string& modelFolder, const bool enableGoogleLogging) :
-            mResizeScale{mNetOutputSize.x / (float)netInputSize.x},
+            mResizeScale{netOutputSize.x / (float)netInputSize.x},
             spNet{std::make_shared<NetTensorRT>(std::array<int,4>{scaleNumber, 3,
                                                 (int)netInputSize.y, (int)netInputSize.x},
                                                 modelFolder + POSE_PROTOTXT[(int)poseModel],

From 6456dff7137aa775de0cf6dde34df31190c9c1c9 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Tue, 14 Nov 2017 13:52:34 +0000
Subject: [PATCH 45/52] Too much has changed in poseExtractorCaffe; rewrite
 the TensorRT extractor from scratch.

---
 src/openpose/pose/poseExtractorTensorRT.cpp | 96 +++++++++++++++++----
 1 file changed, 80 insertions(+), 16 deletions(-)

diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index 5675c5bad..c9afbff08 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -34,27 +34,28 @@ namespace op
     struct PoseExtractorTensorRT::ImplPoseExtractorTensorRT
     {
         #ifdef USE_TENSORRT // implies USE_TENSORRT for now
-            const float mResizeScale;
-            std::shared_ptr<Net> spNet;
+            const PoseModel mPoseModel;
+            const int mGpuId;
+            const std::string mModelFolder;
+            const bool mEnableGoogleLogging;
+            // General parameters
+            std::vector<std::shared_ptr<NetTensorRT>> spTensorRTNets;
             std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeTensorRT;
             std::shared_ptr<NmsCaffe<float>> spNmsTensorRT;
             std::shared_ptr<BodyPartConnectorCaffe<float>> spBodyPartConnectorTensorRT;
             // Init with thread
-            boost::shared_ptr<caffe::Blob<float>> spTensorRTNetOutputBlob;
+            std::vector<boost::shared_ptr<caffe::Blob<float>>> spTensorRTNetOutputBlobs;
             std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
             std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
             std::shared_ptr<caffe::Blob<float>> spPoseBlob;
 
-        ImplPoseExtractorTensorRT(const Point<int>& netInputSize, const Point<int>& netOutputSize,
-                                  const Point<int>& outputSize, const int scaleNumber,
-                                  const PoseModel poseModel, const int gpuId,
+        ImplPoseExtractorTensorRT(const PoseModel poseModel, const int gpuId,
                                   const std::string& modelFolder, const bool enableGoogleLogging) :
-            mResizeScale{netOutputSize.x / (float)netInputSize.x},
-            spNet{std::make_shared<NetTensorRT>(std::array<int,4>{scaleNumber, 3,
-                                                (int)netInputSize.y, (int)netInputSize.x},
-                                                modelFolder + POSE_PROTOTXT[(int)poseModel],
-                                                modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId, enableGoogleLogging)},
+            mPoseModel{poseModel},
+            mGpuId{gpuId},
+            mModelFolder{modelFolder},
+            mEnableGoogleLoggin{enableGoogleLogging},
             spResizeAndMergeTensorRT{std::make_shared<ResizeAndMergeCaffe<float>>()},
             spNmsTensorRT{std::make_shared<NmsCaffe<float>>()},
             spBodyPartConnectorTensorRT{std::make_shared<BodyPartConnectorCaffe<float>>()}
         {
         }
         #endif
     };
 
+    inline void reshapePoseExtractorCaffe(std::shared_ptr<ResizeAndMergeCaffe<float>>& resizeAndMergeCaffe,
+                                          std::shared_ptr<NmsCaffe<float>>& nmsCaffe,
+                                          std::shared_ptr<BodyPartConnectorCaffe<float>>& bodyPartConnectorCaffe,
+                                          std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob,
+                                          std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
+                                          std::shared_ptr<caffe::Blob<float>>& peaksBlob,
+                                          std::shared_ptr<caffe::Blob<float>>& poseBlob,
+                                          const float scaleInputToNetInput,
+                                          const PoseModel poseModel)
+    {
+        try
+        {
+            // HeatMaps extractor blob and layer
+            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
+            resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
+                                         POSE_CCN_DECREASE_FACTOR[(int)poseModel], 1.f/scaleInputToNetInput);
+            // Pose extractor blob and layer
+            nmsCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}, POSE_MAX_PEAKS[(int)poseModel]);
+            // Pose extractor blob and layer
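+            // (The three Reshape calls in this helper size the whole post-processing chain
+            // to the current net output: the resize-and-merge stage for the part heatmaps,
+            // the non-maximum-suppression stage that extracts peak candidates, and the
+            // body-part connector's output blob. POSE_CCN_DECREASE_FACTOR and
+            // POSE_MAX_PEAKS are per-model constants indexed by the pose model.)
+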
bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()}, {poseBlob.get()}); + // Cuda check + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + void addTensorRTNetOnThread(std::vector>& netTensorRT, + std::vector>>& caffeNetOutputBlob, + const PoseModel poseModel, const int gpuId, + const std::string& modelFolder, const bool enableGoogleLogging) + { + try + { + // Add Caffe Net + netTensorRT.emplace_back( + std::make_shared(modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], + gpuId, enableGoogleLogging) + ); + // Initializing them on the thread + netTensorRT.back()->initializationOnThread(); + caffeNetOutputBlob.emplace_back(netTensorRT.back()->getOutputBlob()); + // Security checks + if (netTensorRT.size() != caffeNetOutputBlob.size()) + error("Weird error, this should not happen. Notify us.", __LINE__, __FUNCTION__, __FILE__); + // Cuda check + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + #endif + PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, const Point& outputSize, const int scaleNumber, @@ -116,13 +180,13 @@ namespace op #ifdef USE_TENSORRT // TensorRT net - upImpl->spNet->initializationOnThread(); - upImpl->spTensorRTNetOutputBlob = ((NetTensorRT*)upImpl->spNet.get())->getOutputBlob(); + upImpl->spTensorRTNets->initializationOnThread(); + upImpl->spTensorRTNetOutputBlobs = ((NetTensorRT*)upImpl->spTensorRTNets.get())->getOutputBlob(); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // HeatMaps extractor blob and layer upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); + upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); cudaCheck(__LINE__, __FUNCTION__, __FILE__); // Pose extractor blob and layer @@ -155,13 +219,13 @@ namespace op error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); timeNow("Start"); // 1. TensorRT deep network - upImpl->spNet->forwardPass(inputNetData.getConstPtr()); + upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr()); timeNow("TensorRT forward"); // 2. 
Resize heat maps + merge different scales upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); timeNow("SpResizeAndMergeTensorRT"); #ifndef CPU_ONLY - upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlob.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms + upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms timeNow("RaM forward_gpu"); cudaCheck(__LINE__, __FUNCTION__, __FILE__); timeNow("CudaCheck"); From b3673e6429f213b045f893a4ebed62c66c0dff84 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 15 Nov 2017 10:10:14 +0100 Subject: [PATCH 46/52] PIMPL for netTensorRT --- include/openpose/core/netTensorRT.hpp | 32 +- .../openpose/pose/poseExtractorTensorRT.hpp | 9 +- src/openpose/core/netTensorRT.cpp | 614 ++++++++++-------- src/openpose/pose/poseExtractorTensorRT.cpp | 269 +++++--- 4 files changed, 527 insertions(+), 397 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 0eaaaf7d3..0c0ae02e1 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -1,11 +1,10 @@ -#ifdef USE_TENSORRT #ifndef OPENPOSE_CORE_NET_TENSORRT_HPP #define OPENPOSE_CORE_NET_TENSORRT_HPP -#include #include #include + #include "NvInfer.h" namespace op @@ -31,30 +30,15 @@ namespace op boost::shared_ptr> getOutputBlob() const; private: - // Init with constructor - const int mGpuId; - const std::array mNetInputSize4D; - std::array mNetOutputSize4D; - const unsigned long mNetInputMemory; - const std::string mCaffeProto; - const std::string mCaffeTrainedModel; - const std::string mLastBlobName; - // Init with thread - - boost::shared_ptr> spInputBlob; - boost::shared_ptr> spOutputBlob; - - // TensorRT stuff - nvinfer1::ICudaEngine* cudaEngine; - nvinfer1::IExecutionContext* cudaContext; - nvinfer1::ICudaEngine* caffeToGIEModel(); - nvinfer1::ICudaEngine* createEngine(); - cudaStream_t stream; - cudaEvent_t start, end; - + // PIMPL idiom + // http://www.cppsamples.com/common-tasks/pimpl.html + struct ImplNetTensorRT; + std::unique_ptr upImpl; + + // PIMP requires DELETE_COPY & destructor, or extra code + // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html DELETE_COPY(NetTensorRT); }; } #endif // OPENPOSE_CORE_NET_TENSORRT_HPP -#endif // USE_TENSORRT diff --git a/include/openpose/pose/poseExtractorTensorRT.hpp b/include/openpose/pose/poseExtractorTensorRT.hpp index 09bd43383..48f856e70 100644 --- a/include/openpose/pose/poseExtractorTensorRT.hpp +++ b/include/openpose/pose/poseExtractorTensorRT.hpp @@ -10,7 +10,7 @@ namespace op class OP_API PoseExtractorTensorRT : public PoseExtractor { public: - PoseExtractorTensorRT(const std::string& modelFolder, const int gpuId, + PoseExtractorTensorRT(const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes = {}, const ScaleMode heatMapScale = ScaleMode::ZeroToOne, const bool enableGoogleLogging = true); @@ -19,10 +19,9 @@ namespace op void netInitializationOnThread(); - void forwardPass(const Array& inputNetData, const Point& inputDataSize, - const std::vector& scaleRatios = {1.f}); + void forwardPass(const std::vector>& inputNetData, const Point& inputDataSize, + const std::vector& scaleInputToNetInputs = {1.f}); - const float* getHeatMapCpuConstPtr() const; const float* getHeatMapGpuConstPtr() const; @@ -31,7 +30,7 @@ namespace op const float* getPoseGpuConstPtr() const; - private: + private: // PIMPL idiom 
// http://www.cppsamples.com/common-tasks/pimpl.html struct ImplPoseExtractorTensorRT; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 8894aeac3..416d0c752 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -1,310 +1,390 @@ -#ifdef USE_TENSORRT #include // std::accumulate +#ifdef USE_TENSORRT + #include + #include + #include + #include // google::InitGoogleLogging +#endif #include +#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "NvInfer.h" -#include "NvCaffeParser.h" +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include -using namespace nvinfer1; -using namespace nvcaffeparser1; +#ifdef USE_TENSORRT + #include "NvInfer.h" + #include "NvCaffeParser.h" -std::vector gInputs; -std::map gInputDimensions; + using namespace nvinfer1; + using namespace nvcaffeparser1; +//std::vector gInputs; +//std::map gInputDimensions; +#endif // USE_TENSORRT // Logger for GIE info/warning/errors class Logger : public ILogger { - void log(Severity severity, const char* msg) override - { - // if suppress info-level message: if (severity != Severity::kINFO) - std::cout << msg << std::endl; - } + void log(Severity severity, const char* msg) override + { + // if suppress info-level message: if (severity != Severity::kINFO) + std::cout << msg << std::endl; + } } gLogger; - namespace op { - NetTensorRT::NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName) : - mGpuId{gpuId}, - // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++ - mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]}, - mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies()) * sizeof(float)}, - mCaffeProto{caffeProto + "_" + std::to_string(mNetInputSize4D[2]) + "x" + std::to_string(mNetInputSize4D[3])}, - mCaffeTrainedModel{caffeTrainedModel}, - mLastBlobName{lastBlobName} - { - std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; - CUDA_CHECK(cudaStreamCreate(&stream)); - CUDA_CHECK(cudaEventCreate(&start)); - CUDA_CHECK(cudaEventCreate(&end)); - } - - NetTensorRT::~NetTensorRT() - { - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); - - if (cudaEngine) - cudaEngine->destroy(); - } - - - ICudaEngine* NetTensorRT::caffeToGIEModel() - { - // create the builder - IBuilder* builder = createInferBuilder(gLogger); - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), - mCaffeTrainedModel.c_str(), - *network, - DataType::kFLOAT); - - if (!blobNameToTensor) - return nullptr; - + std::mutex sMutexNetTensorRT; + std::atomic sGoogleLoggingInitialized{false}; - for (int i = 0, n = network->getNbInputs(); i < n; i++) + struct NetTensorRT::ImplNetTensorRT { - DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); - gInputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), 
dims)); - std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; - if( i > 0) - std::cerr << "Multiple output unsupported for now!"; - } + #ifdef USE_TENSORRT + // Init with constructor + const int mGpuId; + const std::string mCaffeProto; + const std::string mCaffeTrainedModel; + const std::string mLastBlobName; + std::vector mNetInputSize4D; + // Init with thread + boost::shared_ptr> spInputBlob; + boost::shared_ptr> spOutputBlob; + + // Init with constructor + //const std::array mNetInputSize4D; + //std::array mNetOutputSize4D; + //const unsigned long mNetInputMemory; + // Init with thread + + // TensorRT stuff + nvinfer1::ICudaEngine* cudaEngine; + nvinfer1::IExecutionContext* cudaContext; + nvinfer1::ICudaEngine* caffeToGIEModel(); + nvinfer1::ICudaEngine* createEngine(); + cudaStream_t stream; + cudaEvent_t start, end; - // Specify which tensor is output (multiple unsupported) - if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) - { - std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); + ImplNetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, + const bool enableGoogleLogging, const std::string& lastBlobName) : + mGpuId{gpuId}, + mCaffeProto{caffeProto}, // TODO, no size, how to proceed ? + mCaffeTrainedModel{caffeTrainedModel}, + mLastBlobName{lastBlobName} + { + const std::string message{".\nPossible causes:\n\t1. Not downloading the OpenPose trained models." + "\n\t2. Not running OpenPose from the same directory where the `model`" + " folder is located.\n\t3. Using paths with spaces."}; + if (!existFile(mCaffeProto)) + error("Prototxt file not found: " + mCaffeProto + message, __LINE__, __FUNCTION__, __FILE__); + if (!existFile(mCaffeTrainedModel)) + error("Caffe trained model file not found: " + mCaffeTrainedModel + message, + __LINE__, __FUNCTION__, __FILE__); + // Double if condition in order to speed up the program if it is called several times + if (enableGoogleLogging && !sGoogleLoggingInitialized) + { + std::lock_guard lock{sMutexNetTensorRT}; + if (enableGoogleLogging && !sGoogleLoggingInitialized) + { + google::InitGoogleLogging("OpenPose"); + sGoogleLoggingInitialized = true; + } + } + } + #endif + }; - for (int i = 0, n = network->getNbOutputs(); i < n; i++) +#ifdef USE_TENSORRT + ICudaEngine* NetTensorRT::caffeToGIEModel() { - DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); - std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + // create the builder + IBuilder* builder = createInferBuilder(gLogger); + + // parse the caffe model to populate the network, then set the outputs + INetworkDefinition* network = builder->createNetwork(); + ICaffeParser* parser = createCaffeParser(); + const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), + mCaffeTrainedModel.c_str(), + *network, + DataType::kFLOAT); + + if (!blobNameToTensor) + return nullptr; + + + for (int i = 0, n = network->getNbInputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getInput(i)->getDimensions()); + gInputs.push_back(network->getInput(i)->getName()); + gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); + std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" 
<< dims.h() << "x" << dims.w() << std::endl; + if( i > 0) + std::cerr << "Multiple output unsupported for now!"; + } + + // Specify which tensor is output (multiple unsupported) + if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) + { + std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; + return nullptr; + } + network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); + + + for (int i = 0, n = network->getNbOutputs(); i < n; i++) + { + DimsCHW dims = static_cast(network->getOutput(i)->getDimensions()); + std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl; + } + + // Build the engine + builder->setMaxBatchSize(1); + // 16 megabytes, default in giexec. No idea what's best for Jetson though, + // maybe check dusty_nv's code on github + builder->setMaxWorkspaceSize(32<<20); + builder->setHalf2Mode(false); + + ICudaEngine* engine = builder->buildCudaEngine(*network); + if (engine == nullptr) + std::cout << "could not build engine" << std::endl; + + parser->destroy(); + network->destroy(); + builder->destroy(); + shutdownProtobufLibrary(); + + return engine; } - - // Build the engine - builder->setMaxBatchSize(1); - // 16 megabytes, default in giexec. No idea what's best for Jetson though, - // maybe check dusty_nv's code on github - builder->setMaxWorkspaceSize(32<<20); - builder->setHalf2Mode(false); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - std::cout << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - shutdownProtobufLibrary(); - - return engine; - } - - inline bool file_exists(const std::string& file_path) { - struct stat buffer; - return (stat(file_path.c_str(), &buffer) == 0); - } - - ICudaEngine* NetTensorRT::createEngine() - { - ICudaEngine *engine; - - std::string serializedEnginePath = mCaffeProto + ".bin"; - std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl; - if (file_exists(serializedEnginePath)) + ICudaEngine* NetTensorRT::createEngine() { - std::cout << "Found serialized TensorRT engine, deserializing..." << std::endl; - char *gieModelStream{nullptr}; - size_t size{0}; - std::ifstream file(serializedEnginePath, std::ios::binary); - if (file.good()) - { - file.seekg(0, file.end); - size = file.tellg(); - file.seekg(0, file.beg); - gieModelStream = new char[size]; - assert(gieModelStream); - file.read(gieModelStream, size); - file.close(); - } - - IRuntime* infer = createInferRuntime(gLogger); - engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); - if (gieModelStream) delete [] gieModelStream; - - return engine; - } - else - { - engine = caffeToGIEModel(); - if (!engine) - { - std::cerr << "Engine could not be created" << std::endl; - return nullptr; - } - else // serialize engine - { - std::ofstream p(serializedEnginePath); - if (!p) + ICudaEngine *engine; + + std::string serializedEnginePath = mCaffeProto + ".bin"; + + std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl; + if (existFile(serializedEnginePath)) { - std::cerr << "could not serialize engine" << std::endl; + std::cout << "Found serialized TensorRT engine, deserializing..." 
<< std::endl; + char *gieModelStream{nullptr}; + size_t size{0}; + std::ifstream file(serializedEnginePath, std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + gieModelStream = new char[size]; + assert(gieModelStream); + file.read(gieModelStream, size); + file.close(); + } + + IRuntime* infer = createInferRuntime(gLogger); + engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr); + if (gieModelStream) delete [] gieModelStream; + + return engine; } - IHostMemory *ptr = engine->serialize(); - assert(ptr); - p.write(reinterpret_cast(ptr->data()), ptr->size()); - ptr->destroy(); - } + else + { + engine = caffeToGIEModel(); + if (!engine) + { + std::cerr << "Engine could not be created" << std::endl; + return nullptr; + } + else // serialize engine + { + std::ofstream p(serializedEnginePath); + if (!p) + { + std::cerr << "could not serialize engine" << std::endl; + } + IHostMemory *ptr = engine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + } + } + return engine; } - return engine; - } - - void NetTensorRT::initializationOnThread() - { - - std::cout << "InitializationOnThread : start" << std::endl; - try - { - - std::cout << "InitializationOnThread : setting device" << std::endl; - // Initialize net - cudaSetDevice(mGpuId); - - std::cout << "InitializationOnThread : creating engine" << std::endl; - - cudaEngine = createEngine(); - if (!cudaEngine) - { - std::cerr << "cudaEngine could not be created" << std::endl; - return; - } - - std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; - - cudaContext = cudaEngine->createExecutionContext(); - if (!cudaContext) - { - std::cerr << "cudaContext could not be created" << std::endl; - return; - } - - DimsCHW outputDims = static_cast(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); - mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; - - - std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; - - spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); - spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); - - std::cout << "InitializationOnThread : done" << std::endl; - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - } - catch (const std::exception& e) - { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - } - } - - float* NetTensorRT::getInputDataCpuPtr() const - { - try - { - return spInputBlob->mutable_cpu_data(); - } - catch (const std::exception& e) - { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return nullptr; - } - } - - float* NetTensorRT::getInputDataGpuPtr() const - { - try + inline void reshapeNetTensorRT(caffe::Net* caffeNet, const std::vector& dimensions) { - return spInputBlob->mutable_gpu_data(); + try + { + caffeNet->blobs()[0]->Reshape(dimensions); + caffeNet->Reshape(); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - catch (const std::exception& e) +#endif + + NetTensorRT::NetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, + const bool enableGoogleLogging, const std::string& lastBlobName) +#ifdef USE_TENSORRT + : upImpl{new 
ImplNetTensorRT{caffeProto, caffeTrainedModel, gpuId, enableGoogleLogging, + lastBlobName}} +#endif { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return nullptr; + try + { + #ifdef USE_TENSORRT + std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; + CUDA_CHECK(cudaStreamCreate(&stream)); + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&end)); + #else + UNUSED(netInputSize4D); + UNUSED(caffeProto); + UNUSED(caffeTrainedModel); + UNUSED(gpuId); + UNUSED(lastBlobName); + error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + " functionality.", __LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - } - - void NetTensorRT::forwardPass(const float* const inputData) const - { - try + + NetTensorRT::~NetTensorRT() { - const int batchSize = 1; - // Copy frame data to GPU memory - if (inputData != nullptr) - { - auto* gpuImagePtr = spInputBlob->mutable_gpu_data(); - CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice)); + cudaStreamDestroy(stream); + cudaEventDestroy(start); + cudaEventDestroy(end); - // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), - // of these, but in this case we know that there is exactly one input and one output. - std::vector buffers(2); - buffers[0] = spInputBlob->mutable_gpu_data(); - buffers[1] = spOutputBlob->mutable_gpu_data(); - - cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); - - //cudaCheck(__LINE__, __FUNCTION__, __FILE__); - } + if (cudaEngine) + cudaEngine->destroy(); } - catch (const std::exception& e) + + void NetTensorRT::initializationOnThread() { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); + std::cout << "InitializationOnThread : start" << std::endl; + try + { + #ifdef USE_TENSORRT + std::cout << "InitializationOnThread : setting device" << std::endl; + // Initialize net + cudaSetDevice(mGpuId); + + std::cout << "InitializationOnThread : creating engine" << std::endl; + + cudaEngine = createEngine(); + if (!cudaEngine) + { + std::cerr << "cudaEngine could not be created" << std::endl; + return; + } + + std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; + + cudaContext = cudaEngine->createExecutionContext(); + if (!cudaContext) + { + std::cerr << "cudaContext could not be created" << std::endl; + return; + } + + DimsCHW outputDims = static_cast(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); + mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; + + + std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; + + upImpl->spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]); + upImpl->spOutputBlob = boost::make_shared>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]); + + std::cout << "InitializationOnThread : done" << std::endl; + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - } - - boost::shared_ptr> NetTensorRT::getOutputBlob() const - { - std::cout << "Getting output blob." 
<< std::endl; - try + + void NetTensorRT::forwardPass(const Array& inputData) const { - return spOutputBlob; + try + { + #ifdef USE_TENSORRT + // Security checks + if (inputData.empty()) + error("The Array inputData cannot be empty.", __LINE__, __FUNCTION__, __FILE__); + if (inputData.getNumberDimensions() != 4 || inputData.getSize(1) != 3) + error("The Array inputData must have 4 dimensions: [batch size, 3 (RGB), height, width].", + __LINE__, __FUNCTION__, __FILE__); + // Reshape Caffe net if required + if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize())) + { + upImpl->mNetInputSize4D = inputData.getSize(); + reshapeNetTensorRT(upImpl->upCaffeNet.get(), inputData.getSize()); + } + + // Copy frame data to GPU memory + auto* gpuImagePtr = upImpl->spInputBlob->mutable_gpu_data(); + CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), mNetInputMemory, cudaMemcpyHostToDevice)); + + // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), + // of these, but in this case we know that there is exactly one input and one output. + std::vector buffers(2); + buffers[0] = upImpl->spInputBlob->mutable_gpu_data(); + buffers[1] = upImpl->spOutputBlob->mutable_gpu_data(); + + // Perform deep network forward pass + cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr); + + // Cuda checks + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } - catch (const std::exception& e) + + boost::shared_ptr> NetTensorRT::getOutputBlob() const { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return nullptr; + try + { + #ifdef USE_TENSORRT + return upImpl->spOutputBlob; + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } } - - std::cout << "Got something..." 
<< std::endl; - } } - -#endif // USE_TENSORRT + diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp index c9afbff08..9ab8b2b96 100644 --- a/src/openpose/pose/poseExtractorTensorRT.cpp +++ b/src/openpose/pose/poseExtractorTensorRT.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include typedef std::vector> OpTimings; @@ -40,39 +41,59 @@ namespace op const bool mEnableGoogleLogging; // General parameters std::vector> spTensorRTNets; - std::shared_ptr> spResizeAndMergeTensorRT; - std::shared_ptr> spNmsTensorRT; - std::shared_ptr> spBodyPartConnectorTensorRT; + std::shared_ptr> spResizeAndMergeCaffe; + std::shared_ptr> spNmsCaffe; + std::shared_ptr> spBodyPartConnectorCaffe; + std::vector> mNetInput4DSizes; + std::vector mScaleInputToNetInputs; // Init with thread std::vector>> spTensorRTNetOutputBlobs; std::shared_ptr> spHeatMapsBlob; std::shared_ptr> spPeaksBlob; std::shared_ptr> spPoseBlob; - ImplPoseExtractorTensorRT(const PoseModel poseModel, const int gpuId, const std::string& modelFolder, const bool enableGoogleLogging) : mPoseModel{poseModel}, mGpuId{gpuId}, mModelFolder{modelFolder}, - mEnableGoogleLoggin{enableGoogleLogging}, - spResizeAndMergeTensorRT{std::make_shared>()}, - spNmsTensorRT{std::make_shared>()}, - spBodyPartConnectorTensorRT{std::make_shared>()} + mEnableGoogleLogging{enableGoogleLogging}, + spResizeAndMergeCaffe{std::make_shared>()}, + spNmsCaffe{std::make_shared>()}, + spBodyPartConnectorCaffe{std::make_shared>()} { } #endif }; - inline void reshapePoseExtractorCaffe(std::shared_ptr>& resizeAndMergeCaffe, - std::shared_ptr>& nmsCaffe, - std::shared_ptr>& bodyPartConnectorCaffe, - std::vector>>& caffeNetOutputBlob, - std::shared_ptr>& heatMapsBlob, - std::shared_ptr>& peaksBlob, - std::shared_ptr>& poseBlob, - const float scaleInputToNetInput, - const PoseModel poseModel) + #ifdef USE_CAFFE + std::vector*> caffeNetSharedToPtr( + std::vector>>& caffeNetOutputBlob) + { + try + { + // Prepare spCaffeNetOutputBlobss + std::vector*> caffeNetOutputBlobs(caffeNetOutputBlob.size()); + for (auto i = 0u ; i < caffeNetOutputBlobs.size() ; i++) + caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get(); + return caffeNetOutputBlobs; + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return {}; + } + } + + inline void reshapePoseExtractorCaffe(std::shared_ptr>& resizeAndMergeCaffe, + std::shared_ptr>& nmsCaffe, + std::shared_ptr>& bodyPartConnectorCaffe, + std::vector>>& caffeNetOutputBlob, + std::shared_ptr>& heatMapsBlob, + std::shared_ptr>& peaksBlob, + std::shared_ptr>& poseBlob, + const float scaleInputToNetInput, + const PoseModel poseModel) { try { @@ -85,29 +106,29 @@ namespace op // Pose extractor blob and layer bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()}, {poseBlob.get()}); // Cuda check - #ifdef USE_CUDA - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - #endif + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { error(e.what(), __LINE__, __FUNCTION__, __FILE__); } } - - void addTensorRTNetOnThread(std::vector>& netTensorRT, - std::vector>>& caffeNetOutputBlob, - const PoseModel poseModel, const int gpuId, - const std::string& modelFolder, const bool enableGoogleLogging) + + void addTensorRTNetOnThread(std::vector>& netTensorRT, + std::vector>>& caffeNetOutputBlob, + const PoseModel poseModel, const int gpuId, + const std::string& modelFolder, const bool enableGoogleLogging) { try { // 
Add Caffe Net netTensorRT.emplace_back( - std::make_shared(modelFolder + POSE_PROTOTXT[(int)poseModel], - modelFolder + POSE_TRAINED_MODEL[(int)poseModel], - gpuId, enableGoogleLogging) - ); + std::make_shared(modelFolder + POSE_PROTOTXT[(int)poseModel], + modelFolder + POSE_TRAINED_MODEL[(int)poseModel], + gpuId, enableGoogleLogging) + ); // Initializing them on the thread netTensorRT.back()->initializationOnThread(); caffeNetOutputBlob.emplace_back(netTensorRT.back()->getOutputBlob()); @@ -115,9 +136,9 @@ namespace op if (netTensorRT.size() != caffeNetOutputBlob.size()) error("Weird error, this should not happen. Notify us.", __LINE__, __FUNCTION__, __FILE__); // Cuda check - #ifdef USE_CUDA - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - #endif + #ifdef USE_CUDA + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #endif } catch (const std::exception& e) { @@ -127,39 +148,27 @@ namespace op #endif - PoseExtractorTensorRT::PoseExtractorTensorRT(const Point& netInputSize, const Point& netOutputSize, - const Point& outputSize, const int scaleNumber, - const PoseModel poseModel, const std::string& modelFolder, + PoseExtractorTensorRT::PoseExtractorTensorRT(const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector& heatMapTypes, const ScaleMode heatMapScale, const bool enableGoogleLogging) : - PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale} + PoseExtractor{poseModel, heatMapTypes, heatMapScale} #ifdef USE_TENSORRT - , upImpl{new ImplPoseExtractorTensorRT{netInputSize, netOutputSize, scaleNumber, poseModel, - gpuId, modelFolder, enableGoogleLogging}} + , upImpl{new ImplPoseExtractorTensorRT{poseModel, gpuId, modelFolder, enableGoogleLogging}} #endif { try { #ifdef USE_TENSORRT - const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x; - const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y); - if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6) - error("Net input and output size must be proportional. 
resizeScaleCheck = " - + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__); - // Layers parameters - upImpl->spBodyPartConnectorCaffe->setPoseModel(mPoseModel); + // Layers parameters + upImpl->spBodyPartConnectorCaffe->setPoseModel(mPoseModel); #else - UNUSED(netInputSize); - UNUSED(netOutputSize); - UNUSED(outputSize); - UNUSED(scaleNumber); - UNUSED(poseModel); - UNUSED(modelFolder); - UNUSED(gpuId); - UNUSED(heatMapTypes); - UNUSED(heatMapScale); - error("OpenPose must be compiled with the `USE_TENSORRT` macro definition in order to use this" - " functionality.", __LINE__, __FUNCTION__, __FILE__); + UNUSED(poseModel); + UNUSED(modelFolder); + UNUSED(gpuId); + UNUSED(heatMapTypes); + UNUSED(heatMapScale); + error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" + " functionality.", __LINE__, __FUNCTION__, __FILE__); #endif } catch (const std::exception& e) @@ -176,30 +185,24 @@ namespace op { try { - log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); - #ifdef USE_TENSORRT - // TensorRT net - upImpl->spTensorRTNets->initializationOnThread(); - upImpl->spTensorRTNetOutputBlobs = ((NetTensorRT*)upImpl->spTensorRTNets.get())->getOutputBlob(); + + // Logging + log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); + // Initialize Caffe net + addTensorRTNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel, + upImpl->mGpuId, upImpl->mModelFolder, upImpl->mEnableGoogleLogging); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // HeatMaps extractor blob and layer + + // Initialize blobs upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spResizeAndMergeTensorRT->Reshape({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}, upImpl->mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // Pose extractor blob and layer upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spNmsTensorRT->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}, POSE_MAX_PEAKS[(int)mPoseModel]); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - - // Pose extractor blob and layer upImpl->spPoseBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spBodyPartConnectorTensorRT->setPoseModel(mPoseModel); - upImpl->spBodyPartConnectorTensorRT->Reshape({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); - + + // Logging log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); #endif } @@ -217,45 +220,105 @@ namespace op // Security checks if (inputNetData.empty()) error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + for (const auto& inputNetDataI : inputNetData) + if (inputNetDataI.empty()) + error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__); + if (inputNetData.size() != scaleInputToNetInputs.size()) + error("Size(inputNetData) must be same than size(scaleInputToNetInputs).", + __LINE__, __FUNCTION__, __FILE__); + timeNow("Start"); - // 1. 
TensorRT deep network - upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr()); - timeNow("TensorRT forward"); + + // Resize std::vectors if required + const auto numberScales = inputNetData.size(); + upImpl->mNetInput4DSizes.resize(numberScales); + while (upImpl->spCaffeNets.size() < numberScales) + addCaffeNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel, + upImpl->mGpuId, upImpl->mModelFolder, false); + + // Process each image + for (auto i = 0u ; i < inputNetData.size(); i++) + { + // 1. TensorRT deep network + upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr()); + + // Reshape blobs if required + // Note: In order to resize to input size to have same results as Matlab, uncomment the commented + // lines + if (!vectorsAreEqual(upImpl->mNetInput4DSizes.at(i), inputNetData[i].getSize())) + // || !vectorsAreEqual(upImpl->mScaleInputToNetInputs, scaleInputToNetInputs)) + { + upImpl->mNetInput4DSizes.at(i) = inputNetData[i].getSize(); + mNetOutputSize = Point{upImpl->mNetInput4DSizes[0][3], + upImpl->mNetInput4DSizes[0][2]}; + // upImpl->mScaleInputToNetInputs = scaleInputToNetInputs; + reshapePoseExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spNmsCaffe, + upImpl->spBodyPartConnectorCaffe, upImpl->spCaffeNetOutputBlobs, + upImpl->spHeatMapsBlob, upImpl->spPeaksBlob, upImpl->spPoseBlob, + 1.f, mPoseModel); + // scaleInputToNetInputs[i], mPoseModel); + } + } + + timeNow("TensorRT forwards"); + // 2. Resize heat maps + merge different scales - upImpl->spResizeAndMergeTensorRT->setScaleRatios(scaleRatios); - timeNow("SpResizeAndMergeTensorRT"); - #ifndef CPU_ONLY - upImpl->spResizeAndMergeTensorRT->Forward_gpu({upImpl->spTensorRTNetOutputBlobs.get()}, {upImpl->spHeatMapsBlob.get()}); // ~5ms - timeNow("RaM forward_gpu"); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); - timeNow("CudaCheck"); - #else - error("ResizeAndMergeTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs); + const std::vector floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end()); + upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios); + #ifdef USE_CUDA // Implied by tensorrt + upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, // ~5ms + {upImpl->spHeatMapsBlob.get()}); + cudaCheck(__LINE__, __FUNCTION__, __FILE__); + #else // Never reached, suppress ? + upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()}, + {upImpl->spHeatMapsBlob.get()}); #endif + timeNow("Resize heat Maps"); + // 3. Get peaks by Non-Maximum Suppression - upImpl->spNmsTensorRT->setThreshold((float)get(PoseProperty::NMSThreshold)); - #ifndef CPU_ONLY - upImpl->spNmsTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~2ms - cudaCheck(__LINE__, __FUNCTION__, __FILE__); + upImpl->spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold)); + #ifdef USE_CUDA + upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms + cudaCheck(__LINE__, __FUNCTION__, __FILE__); #else - error("NmsTensorRT CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + error("NmsCaffe CPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); #endif + timeNow("Peaks by nms"); - // Get scale net to output + + // Get scale net to output (i.e. 
image input) + // Note: In order to resize to input size, (un)comment the following lines const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize); - const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), intRound(scaleProducerToNetInput*inputDataSize.y)}; - mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, mOutputSize)}; + const Point netSize{intRound(scaleProducerToNetInput*inputDataSize.x), + intRound(scaleProducerToNetInput*inputDataSize.y)}; + mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, inputDataSize)}; + // mScaleNetToOutput = 1.f; + timeNow("Scale net to output"); + // 4. Connecting body parts - upImpl->spBodyPartConnectorTensorRT->setScaleNetToOutput(mScaleNetToOutput); - upImpl->spBodyPartConnectorTensorRT->setInterMinAboveThreshold((int)get(PoseProperty::ConnectInterMinAboveThreshold)); - upImpl->spBodyPartConnectorTensorRT->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); - upImpl->spBodyPartConnectorTensorRT->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + // Get scale net to output (i.e. image input) + upImpl->spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput); + upImpl->spBodyPartConnectorCaffe->setInterMinAboveThreshold( + (float)get(PoseProperty::ConnectInterMinAboveThreshold) + ); + upImpl->spBodyPartConnectorCaffe->setInterThreshold((float)get(PoseProperty::ConnectInterThreshold)); + upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); + upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); + // GPU version not implemented yet - upImpl->spBodyPartConnectorTensorRT->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, mPoseKeypoints); - // upImpl->spBodyPartConnectorTensorRT->Forward_gpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, {upImpl->spPoseBlob.get()}, mPoseKeypoints); + // #ifdef USE_CUDA + // upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(), + // upImpl->spPeaksBlob.get()}, + // {upImpl->spPoseBlob.get()}, mPoseKeypoints); + // #else + upImpl->spBodyPartConnectorCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get(), + upImpl->spPeaksBlob.get()}, + mPoseKeypoints, mPoseScores); + // #endif + timeNow("Connect Body Parts"); const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second); @@ -266,6 +329,10 @@ namespace op const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing-1)).second); op::log(log_time, op::Priority::High); } + #else + UNUSED(inputNetData); + UNUSED(inputDataSize); + UNUSED(scaleInputToNetInputs); #endif } catch (const std::exception& e) From 273a3519b2bc18074f9fe3166ddee5c222206c08 Mon Sep 17 00:00:00 2001 From: Florent Buisson Date: Wed, 15 Nov 2017 15:00:58 +0000 Subject: [PATCH 47/52] Fix source issues, example remains. 
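Note: this patch completes the PIMPL conversion begun in the previous one. With the
TensorRT state hidden behind the opaque pointer, every former direct member access is
rewritten as upImpl->..., the destructor must stay declared and defined out-of-line
(std::unique_ptr<Impl> cannot delete an incomplete type from the header), and copying
stays deleted because unique_ptr is move-only, which is what the DELETE_COPY macro in the
header expresses. A minimal, self-contained sketch of the same structure, with
illustrative stand-in names rather than OpenPose's actual API:

    #include <iostream>
    #include <memory>
    #include <string>

    class NetSketch
    {
    public:
        explicit NetSketch(const std::string& model);
        ~NetSketch();                          // declared here, defined where Impl is complete
        NetSketch(const NetSketch&) = delete;  // the moral equivalent of DELETE_COPY
        NetSketch& operator=(const NetSketch&) = delete;
        void forward() const;
    private:
        struct Impl;                           // engine, context and stream members hide here
        std::unique_ptr<Impl> upImpl;
    };

    // Normally in the .cpp, where heavy third-party headers can be included freely:
    struct NetSketch::Impl
    {
        std::string mModel;                    // stand-in for ICudaEngine*, IExecutionContext*, ...
    };

    NetSketch::NetSketch(const std::string& model) : upImpl{new Impl{model}} {}
    NetSketch::~NetSketch() = default;         // Impl is complete here, so deletion compiles
    void NetSketch::forward() const { std::cout << "forward pass on " << upImpl->mModel << '\n'; }

    int main()
    {
        NetSketch net{"pose_deploy.prototxt"};
        net.forward();
        return 0;
    }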
--- include/openpose/core/netTensorRT.hpp | 21 +++--- src/openpose/core/netTensorRT.cpp | 75 ++++++++++----------- src/openpose/pose/poseExtractorTensorRT.cpp | 18 ++--- 3 files changed, 58 insertions(+), 56 deletions(-) diff --git a/include/openpose/core/netTensorRT.hpp b/include/openpose/core/netTensorRT.hpp index 0c0ae02e1..96b588657 100644 --- a/include/openpose/core/netTensorRT.hpp +++ b/include/openpose/core/netTensorRT.hpp @@ -5,31 +5,32 @@ #include -#include "NvInfer.h" +#ifdef USE_TENSORRT + #include "NvInfer.h" +#endif namespace op { class OP_API NetTensorRT : public Net { public: - NetTensorRT(const std::array& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0, + NetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0, const bool enableGoogleLogging = true, const std::string& lastBlobName = "net_output"); virtual ~NetTensorRT(); void initializationOnThread(); - // Alternative a) getInputDataCpuPtr or getInputDataGpuPtr + forwardPass - float* getInputDataCpuPtr() const; - - float* getInputDataGpuPtr() const; - - // Alternative b) - void forwardPass(const float* const inputNetData = nullptr) const; + void forwardPass(const Array& inputNetData) const; boost::shared_ptr> getOutputBlob() const; - + private: +#ifdef USE_TENSORRT + nvinfer1::ICudaEngine* caffeToGIEModel(); + + nvinfer1::ICudaEngine* createEngine(); +#endif // PIMPL idiom // http://www.cppsamples.com/common-tasks/pimpl.html struct ImplNetTensorRT; diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp index 416d0c752..897087f00 100644 --- a/src/openpose/core/netTensorRT.cpp +++ b/src/openpose/core/netTensorRT.cpp @@ -24,7 +24,7 @@ //#include //#include //#include -//#include +#include #ifdef USE_TENSORRT @@ -34,8 +34,8 @@ using namespace nvinfer1; using namespace nvcaffeparser1; -//std::vector gInputs; -//std::map gInputDimensions; + std::vector gInputs; + std::map gInputDimensions; #endif // USE_TENSORRT // Logger for GIE info/warning/errors @@ -68,15 +68,14 @@ namespace op // Init with constructor //const std::array mNetInputSize4D; - //std::array mNetOutputSize4D; - //const unsigned long mNetInputMemory; + std::vector mNetOutputSize4D; // Init with thread // TensorRT stuff nvinfer1::ICudaEngine* cudaEngine; nvinfer1::IExecutionContext* cudaContext; - nvinfer1::ICudaEngine* caffeToGIEModel(); - nvinfer1::ICudaEngine* createEngine(); + //nvinfer1::ICudaEngine* caffeToGIEModel(); + //nvinfer1::ICudaEngine* createEngine(); cudaStream_t stream; cudaEvent_t start, end; @@ -119,8 +118,8 @@ namespace op // parse the caffe model to populate the network, then set the outputs INetworkDefinition* network = builder->createNetwork(); ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(mCaffeProto.c_str(), - mCaffeTrainedModel.c_str(), + const IBlobNameToTensor* blobNameToTensor = parser->parse(upImpl->mCaffeProto.c_str(), + upImpl->mCaffeTrainedModel.c_str(), *network, DataType::kFLOAT); @@ -139,12 +138,12 @@ namespace op } // Specify which tensor is output (multiple unsupported) - if (blobNameToTensor->find(mLastBlobName.c_str()) == nullptr) + if (blobNameToTensor->find(upImpl->mLastBlobName.c_str()) == nullptr) { - std::cout << "could not find output blob " << mLastBlobName.c_str() << std::endl; + std::cout << "could not find output blob " << upImpl->mLastBlobName.c_str() << std::endl; return nullptr; } - 
network->markOutput(*blobNameToTensor->find(mLastBlobName.c_str())); + network->markOutput(*blobNameToTensor->find(upImpl->mLastBlobName.c_str())); for (int i = 0, n = network->getNbOutputs(); i < n; i++) @@ -176,7 +175,7 @@ namespace op { ICudaEngine *engine; - std::string serializedEnginePath = mCaffeProto + ".bin"; + std::string serializedEnginePath = upImpl->mCaffeProto + ".bin"; std::cout << "Serialized engine path: " << serializedEnginePath.c_str() << std::endl; if (existFile(serializedEnginePath)) @@ -226,12 +225,12 @@ namespace op return engine; } - inline void reshapeNetTensorRT(caffe::Net* caffeNet, const std::vector& dimensions) + inline void reshapeNetTensorRT(boost::shared_ptr> inputBlob, const std::vector& dimensions) { try { - caffeNet->blobs()[0]->Reshape(dimensions); - caffeNet->Reshape(); + inputBlob->Reshape(dimensions); + //caffeNet->Reshape(); TODO find TensorRT equivalent cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) @@ -251,10 +250,10 @@ namespace op try { #ifdef USE_TENSORRT - std::cout << "Caffe file: " << mCaffeProto.c_str() << std::endl; - CUDA_CHECK(cudaStreamCreate(&stream)); - CUDA_CHECK(cudaEventCreate(&start)); - CUDA_CHECK(cudaEventCreate(&end)); + std::cout << "Caffe file: " << upImpl->mCaffeProto.c_str() << std::endl; + CUDA_CHECK(cudaStreamCreate(&upImpl->stream)); + CUDA_CHECK(cudaEventCreate(&upImpl->start)); + CUDA_CHECK(cudaEventCreate(&upImpl->end)); #else UNUSED(netInputSize4D); UNUSED(caffeProto); @@ -273,12 +272,12 @@ namespace op NetTensorRT::~NetTensorRT() { - cudaStreamDestroy(stream); - cudaEventDestroy(start); - cudaEventDestroy(end); + cudaStreamDestroy(upImpl->stream); + cudaEventDestroy(upImpl->start); + cudaEventDestroy(upImpl->end); - if (cudaEngine) - cudaEngine->destroy(); + if (upImpl->cudaEngine) + upImpl->cudaEngine->destroy(); } void NetTensorRT::initializationOnThread() @@ -289,12 +288,12 @@ namespace op #ifdef USE_TENSORRT std::cout << "InitializationOnThread : setting device" << std::endl; // Initialize net - cudaSetDevice(mGpuId); + cudaSetDevice(upImpl->mGpuId); std::cout << "InitializationOnThread : creating engine" << std::endl; - cudaEngine = createEngine(); - if (!cudaEngine) + upImpl->cudaEngine = createEngine(); + if (!upImpl->cudaEngine) { std::cerr << "cudaEngine could not be created" << std::endl; return; @@ -302,21 +301,21 @@ namespace op std::cout << "InitializationOnThread Pass : creating execution context" << std::endl; - cudaContext = cudaEngine->createExecutionContext(); - if (!cudaContext) + upImpl->cudaContext = upImpl->cudaEngine->createExecutionContext(); + if (!upImpl->cudaContext) { std::cerr << "cudaContext could not be created" << std::endl; return; } - DimsCHW outputDims = static_cast(cudaEngine->getBindingDimensions(cudaEngine->getNbBindings() - 1)); - mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; + DimsCHW outputDims = static_cast(upImpl->cudaEngine->getBindingDimensions(upImpl->cudaEngine->getNbBindings() - 1)); + upImpl->mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() }; - std::cout << "NetInputSize4D: " << mNetInputSize4D[0] << " " << mNetInputSize4D[1] << " " << mNetInputSize4D[2] << " " << mNetInputSize4D[3] << std::endl; + std::cout << "NetInputSize4D: " << upImpl->mNetInputSize4D[0] << " " << upImpl->mNetInputSize4D[1] << " " << upImpl->mNetInputSize4D[2] << " " << upImpl->mNetInputSize4D[3] << std::endl; - upImpl->spInputBlob = boost::make_shared>(mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], 
-                upImpl->spOutputBlob = boost::make_shared<caffe::Blob<float>>(mNetOutputSize4D[0], mNetOutputSize4D[1], mNetOutputSize4D[2], mNetOutputSize4D[3]);
+                upImpl->spInputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetInputSize4D[0], upImpl->mNetInputSize4D[1], upImpl->mNetInputSize4D[2], upImpl->mNetInputSize4D[3]);
+                upImpl->spOutputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetOutputSize4D[0], upImpl->mNetOutputSize4D[1], upImpl->mNetOutputSize4D[2], upImpl->mNetOutputSize4D[3]);
 
                 std::cout << "InitializationOnThread : done" << std::endl;
 
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -343,12 +342,12 @@ namespace op
             if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize()))
             {
                 upImpl->mNetInputSize4D = inputData.getSize();
-                reshapeNetTensorRT(upImpl->upCaffeNet.get(), inputData.getSize());
+                reshapeNetTensorRT(upImpl->spInputBlob, inputData.getSize());
             }
 
             // Copy frame data to GPU memory
             auto* gpuImagePtr = upImpl->spInputBlob->mutable_gpu_data();
-            CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), mNetInputMemory, cudaMemcpyHostToDevice));
+            CUDA_CHECK(cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), inputData.getVolume() * sizeof(float), cudaMemcpyHostToDevice));
 
             // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
             // of these, but in this case we know that there is exactly one input and one output.
@@ -357,7 +356,7 @@ namespace op
             buffers[1] = upImpl->spOutputBlob->mutable_gpu_data();
 
             // Perform deep network forward pass
-            cudaContext->enqueue(batchSize, &buffers[0], stream, nullptr);
+            upImpl->cudaContext->enqueue(1, &buffers[0], upImpl->stream, nullptr);
 
             // Cuda checks
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
 
diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index 9ab8b2b96..dafa0209c 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -72,7 +72,7 @@ namespace op
     {
         try
         {
-            // Prepare spCaffeNetOutputBlobss
+            // Prepare spTensorRTNetOutputBlobs
             std::vector<caffe::Blob<float>*> caffeNetOutputBlobs(caffeNetOutputBlob.size());
             for (auto i = 0u ; i < caffeNetOutputBlobs.size() ; i++)
                 caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get();
@@ -190,7 +190,7 @@ namespace op
             // Logging
             log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
             // Initialize Caffe net
-            addTensorRTNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel,
+            addTensorRTNetOnThread(upImpl->spTensorRTNets, upImpl->spTensorRTNetOutputBlobs, upImpl->mPoseModel,
                                    upImpl->mGpuId, upImpl->mModelFolder, upImpl->mEnableGoogleLogging);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -212,7 +212,9 @@ namespace op
         }
     }
 
-    void PoseExtractorTensorRT::forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios)
+    void PoseExtractorTensorRT::forwardPass(const std::vector<Array<float>>& inputNetData,
+                                            const Point<int>& inputDataSize,
+                                            const std::vector<double>& scaleInputToNetInputs)
     {
         try
         {
@@ -232,15 +234,15 @@ namespace op
             // Resize std::vectors if required
             const auto numberScales = inputNetData.size();
             upImpl->mNetInput4DSizes.resize(numberScales);
-            while (upImpl->spCaffeNets.size() < numberScales)
-                addCaffeNetOnThread(upImpl->spCaffeNets, upImpl->spCaffeNetOutputBlobs, upImpl->mPoseModel,
+            while (upImpl->spTensorRTNets.size() < numberScales)
+                addTensorRTNetOnThread(upImpl->spTensorRTNets, upImpl->spTensorRTNetOutputBlobs, upImpl->mPoseModel,
                                        upImpl->mGpuId, upImpl->mModelFolder, false);
             // Process each image
             for (auto i = 0u ; i < inputNetData.size(); i++)
             {
                 // 1. TensorRT deep network
-                upImpl->spTensorRTNets->forwardPass(inputNetData.getConstPtr());
+                upImpl->spTensorRTNets.at(i)->forwardPass(inputNetData[i]);
 
                 // Reshape blobs if required
                 // Note: In order to resize to input size to have same results as Matlab, uncomment the commented
@@ -253,7 +255,7 @@ namespace op
                                              upImpl->mNetInput4DSizes[0][2]};
                     // upImpl->mScaleInputToNetInputs = scaleInputToNetInputs;
                     reshapePoseExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spNmsCaffe,
-                                              upImpl->spBodyPartConnectorCaffe, upImpl->spCaffeNetOutputBlobs,
+                                              upImpl->spBodyPartConnectorCaffe, upImpl->spTensorRTNetOutputBlobs,
                                               upImpl->spHeatMapsBlob, upImpl->spPeaksBlob, upImpl->spPoseBlob,
                                               1.f, mPoseModel);
                     // scaleInputToNetInputs[i], mPoseModel);
@@ -263,7 +265,7 @@ namespace op
             timeNow("TensorRT forwards");
 
             // 2. Resize heat maps + merge different scales
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs);
+            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
             const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
             upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
             #ifdef USE_CUDA // Implied by tensorrt
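
The forwardPass rework in the patch above follows the standard TensorRT 2.x/3.x execution pattern: one device pointer per engine binding, an asynchronous enqueue on a CUDA stream, then a synchronize. A condensed sketch of that pattern, assuming exactly two bindings (input = 0, output = 1); the helper and its names are illustrative, not part of the patch:

    #include "NvInfer.h"       // TensorRT 2.x/3.x-era API, as used by the diff
    #include <cuda_runtime.h>

    // Assumes: context created from the engine, stream from cudaStreamCreate,
    // and dIn/dOut already cudaMalloc'd to the binding volumes.
    void infer(nvinfer1::IExecutionContext* context, cudaStream_t stream,
               float* dIn, float* dOut, const float* hostInput, size_t inputBytes)
    {
        // Host -> device copy of the preprocessed input tensor
        cudaMemcpyAsync(dIn, hostInput, inputBytes, cudaMemcpyHostToDevice, stream);
        // The engine expects one pointer per binding, in binding order
        void* buffers[2] = { dIn, dOut };
        // Batch size 1, as in the patch; results land in dOut
        context->enqueue(1, buffers, stream, nullptr);
        cudaStreamSynchronize(stream);
    }

Rather than raw cudaMalloc allocations, the patch reuses Caffe blobs (mutable_gpu_data()) as the device buffers, which is what lets the downstream resize/NMS Caffe layers consume the TensorRT output directly.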
From ca682c42903fd2ef2a8c15ef9d59ae52771e71b9 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 16:23:14 +0100
Subject: [PATCH 48/52] Fix samples

---
 .../tutorial_pose/1_extract_from_image.cpp |   2 +-
 .../3_extract_from_image_TensorRT.cpp      | 121 ++++++++++--------
 2 files changed, 66 insertions(+), 57 deletions(-)

diff --git a/examples/tutorial_pose/1_extract_from_image.cpp b/examples/tutorial_pose/1_extract_from_image.cpp
index b3dddd747..d975ee00d 100644
--- a/examples/tutorial_pose/1_extract_from_image.cpp
+++ b/examples/tutorial_pose/1_extract_from_image.cpp
@@ -99,7 +99,7 @@ int openPoseTutorialPose1()
     // Check no contradictory flags enabled
     if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.)
         op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
-    if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
+    if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1.)
         op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
 
     // Enabling Google Logging

diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
index 4a522fbc2..13f700f21 100644
--- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
+++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
@@ -1,16 +1,20 @@
 // ------------------------- OpenPose Library Tutorial - Pose - Example 3 - Extract from Image with TensorRT -------------------------
 // This first example shows the user how to:
-    // 1. Load an image (`filestream` module)
-    // 2. Extract the pose of that image (`pose` module)
-    // 3. Render the pose on a resized copy of the input image (`pose` module)
-    // 4. Display the rendered pose (`gui` module)
+// 1. Load an image (`filestream` module)
+// 2. Extract the pose of that image (`pose` module)
+// 3. Render the pose on a resized copy of the input image (`pose` module)
+// 4. Display the rendered pose (`gui` module)
 // In addition to the previous OpenPose modules, we also need to use:
-    // 1. `core` module: for the Array class that the `pose` module needs
-    // 2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively
+// 1. `core` module: for the Array class that the `pose` module needs
+// 2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively
 
 // 3rdparty dependencies
-#include // DEFINE_bool, DEFINE_int32, DEFINE_int64, DEFINE_uint64, DEFINE_double, DEFINE_string
-#include // google::InitGoogleLogging
+// GFlags: DEFINE_bool, _int32, _int64, _uint64, _double, _string
+#include
+// Allow Google Flags in Ubuntu 14
+#ifndef GFLAGS_GFLAGS_H_
+    namespace gflags = google;
+#endif
 // OpenPose dependencies
 #include
 #include
 #include
 #include
 #include
@@ -21,36 +25,40 @@
 // See all the available parameter options with the `--help` flag. E.g. `./build/examples/openpose/openpose.bin --help`.
 // Note: This command will show you flags for other unnecessary 3rdparty files. Check only the flags for the OpenPose
 // executable. E.g. for `openpose.bin`, look for `Flags from examples/openpose/openpose.cpp:`.
-// Debugging
+// Debugging/Other
 DEFINE_int32(logging_level,             3,              "The logging level. Integer in the range [0, 255]. 0 will output any log() message, while"
-                                                        " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for"
-                                                        " low priority messages and 4 for important ones.");
+                                        " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for"
+                                        " low priority messages and 4 for important ones.");
 // Producer
 DEFINE_string(image_path,               "examples/media/COCO_val2014_000000000192.jpg",     "Process the desired image.");
 // OpenPose
 DEFINE_string(model_pose,               "COCO",         "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), "
-                                                        "`MPI_4_layers` (15 keypoints, even faster but less accurate).");
+                                        "`MPI_4_layers` (15 keypoints, even faster but less accurate).");
 DEFINE_string(model_folder,             "models/",      "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
-DEFINE_string(net_resolution,           "128x96",       "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased,"
-                                                        " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect"
-                                                        " ratio possible to the images or videos to be processed. E.g. the default `128x96` is"
-                                                        " optimal for 16:9 videos, e.g. full HD (1920x1080) and HD (1280x720) videos.");
-DEFINE_string(resolution,               "1280x720",     "The image resolution (display and output). Use \"-1x-1\" to force the program to use the"
-                                                        " default images resolution.");
+DEFINE_string(net_resolution,           "-1x368",       "Multiples of 16. If it is increased, the accuracy potentially increases. If it is"
+                                        " decreased, the speed increases. For maximum speed-accuracy balance, it should keep the"
+                                        " closest aspect ratio possible to the images or videos to be processed. Using `-1` in"
+                                        " any of the dimensions, OP will choose the optimal aspect ratio depending on the user's"
+                                        " input value. E.g. the default `-1x368` is equivalent to `656x368` in 16:9 resolutions,"
+                                        " e.g. full HD (1920x1080) and HD (1280x720) resolutions.");
+DEFINE_string(output_resolution,        "-1x-1",        "The image resolution (display and output). Use \"-1x-1\" to force the program to use the"
+                                        " input image resolution.");
 DEFINE_int32(num_gpu_start,             0,              "GPU device start number.");
 DEFINE_double(scale_gap,                0.3,            "Scale gap between scales. No effect unless scale_number > 1. Initial scale is always 1."
-                                                        " If you want to change the initial scale, you actually want to multiply the"
-                                                        " `net_resolution` by your desired initial scale.");
+                                        " If you want to change the initial scale, you actually want to multiply the"
+                                        " `net_resolution` by your desired initial scale.");
 DEFINE_int32(scale_number,              1,              "Number of scales to average.");
 // OpenPose Rendering
-DEFINE_bool(disable_blending,           false,          "If blending is enabled, it will merge the results with the original frame. If disabled, it"
-                                                        " will only display the results on a black background.");
+DEFINE_bool(disable_blending,           false,          "If enabled, it will render the results (keypoint skeletons or heatmaps) on a black"
+                                        " background, instead of being rendered into the original image. Related: `part_to_show`,"
+                                        " `alpha_pose`, and `alpha_pose`.");
 DEFINE_double(render_threshold,         0.05,           "Only estimated keypoints whose score confidences are higher than this threshold will be"
-                                                        " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;"
-                                                        " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also"
-                                                        " more false positives (i.e. wrong detections).");
+                                        " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;"
+                                        " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also"
+                                        " more false positives (i.e. wrong detections).");
 DEFINE_double(alpha_pose,               0.6,            "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will"
-                                                        " hide it. Only valid for GPU rendering.");
+                                        " hide it. Only valid for GPU rendering.");
+
 
 typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;
 
@@ -84,11 +92,9 @@ int openPoseTutorialPose3()
     op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
     // Step 2 - Read Google flags (user defined configuration)
     // outputSize
-    const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720");
+    const auto outputSize = op::flagsToPoint(FLAGS_output_resolution, "-1x-1");
     // netInputSize
-    const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "128x96");
-    // netOutputSize
-    const auto netOutputSize = netInputSize;
+    const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "-1x368");
     // poseModel
     const auto poseModel = op::flagsToPoseModel(FLAGS_model_pose);
     // Check no contradictory flags enabled
@@ -96,18 +102,20 @@ int openPoseTutorialPose3()
         op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
     if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
         op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
+    // Enabling Google Logging
+    const bool enableGoogleLogging = true;
     // Logging
     op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
     // Step 3 - Initialize all required classes
-    op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap};
-    op::CvMatToOpOutput cvMatToOpOutput{outputSize};
-    op::PoseExtractorTensorRT poseExtractorTensorRT{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel,
-                                                    FLAGS_model_folder, FLAGS_num_gpu_start};
-    op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold,
-                                  !FLAGS_disable_blending, (float)FLAGS_alpha_pose};
-    op::OpOutputToCvMat opOutputToCvMat{outputSize};
-    const op::Point<int> windowedSize = outputSize;
-    op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 1"};
+    op::ScaleAndSizeExtractor scaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap);
+    op::CvMatToOpInput cvMatToOpInput;
+    op::CvMatToOpOutput cvMatToOpOutput;
+    op::PoseExtractorCaffe poseExtractorTensorRT{poseModel, FLAGS_model_folder,
+                                                 FLAGS_num_gpu_start, {}, op::ScaleMode::ZeroToOne, enableGoogleLogging};
+    op::PoseCpuRenderer poseRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending,
+                                     (float)FLAGS_alpha_pose};
+    op::OpOutputToCvMat opOutputToCvMat;
+    op::FrameDisplayer frameDisplayer{"OpenPose Tutorial - Example 3", outputSize};
     // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. we init resources here)
     poseExtractorTensorRT.initializationOnThread();
     poseRenderer.initializationOnThread();
@@ -116,26 +124,27 @@ int openPoseTutorialPose3()
 
     // ------------------------- POSE ESTIMATION AND RENDERING -------------------------
     // Step 1 - Read and load image, error if empty (possibly wrong path)
-    cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
+    // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
+    cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
     if(inputImage.empty())
         op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
-    timeNow("Step 1");
-    // Step 2 - Format input image to OpenPose input and output formats
-    op::Array<float> netInputArray;
-    std::vector<float> scaleRatios;
-    std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
+    const op::Point<int> imageSize{inputImage.cols, inputImage.rows};
+    // Step 2 - Get desired scale sizes
+    std::vector<double> scaleInputToNetInputs;
+    std::vector<op::Point<int>> netInputSizes;
     double scaleInputToOutput;
-    op::Array<float> outputArray;
-    std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
-    timeNow("Step 2");
-    // Step 3 - Estimate poseKeypoints
-    poseExtractorTensorRT.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
+    op::Point<int> outputResolution;
+    std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution)
+        = scaleAndSizeExtractor.extract(imageSize);
+    // Step 3 - Format input image to OpenPose input and output formats
+    const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes);
+    auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution);
+    // Step 4 - Estimate poseKeypoints
+    poseExtractorTensorRT.forwardPass(netInputArray, imageSize, scaleInputToNetInputs);
     const auto poseKeypoints = poseExtractorTensorRT.getPoseKeypoints();
-    timeNow("Step 3");
-    // Step 4 - Render poseKeypoints
-    poseRenderer.renderPose(outputArray, poseKeypoints);
-    timeNow("Step 4");
-    // Step 5 - OpenPose output format to cv::Mat
+    // Step 5 - Render poseKeypoints
+    poseRenderer.renderPose(outputArray, poseKeypoints, scaleInputToOutput);
+    // Step 6 - OpenPose output format to cv::Mat
     auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
     timeNow("Step 5");
 
@@ -143,7 +152,7 @@ int openPoseTutorialPose3()
     // Step 1 - Show results
     frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0)
     // Step 2 - Logging information message
-    op::log("Example 1 successfully finished.", op::Priority::High);
+    op::log("Example 3 successfully finished.", op::Priority::High);
 
     const auto totalTimeSec = timeDiffToString(timings.back().second, timings.front().second);
     const auto message = "Pose estimation successfully finished. Total time: " + totalTimeSec + " seconds.";
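
Patch 48 above migrates the sample from the removed cvMatToOpInput.format() API to the newer staged pipeline, in which scale extraction is its own step. Condensed from the hunks above, the per-frame data flow now reads as follows (template arguments reconstructed, as in the diff):

    // 1. Measure the input image
    const op::Point<int> imageSize{inputImage.cols, inputImage.rows};
    // 2. Derive scales, per-scale net input sizes, and the output resolution
    std::vector<double> scaleInputToNetInputs;
    std::vector<op::Point<int>> netInputSizes;
    double scaleInputToOutput;
    op::Point<int> outputResolution;
    std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution)
        = scaleAndSizeExtractor.extract(imageSize);
    // 3. Format to net input / output arrays
    const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes);
    auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution);
    // 4.-6. Forward pass, render, convert back to cv::Mat
    poseExtractorTensorRT.forwardPass(netInputArray, imageSize, scaleInputToNetInputs);
    poseRenderer.renderPose(outputArray, poseExtractorTensorRT.getPoseKeypoints(), scaleInputToOutput);
    auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);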
From cb0d440dbb763852dcaacb9564ad9776e2084deb Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 16:05:53 +0000
Subject: [PATCH 49/52] Compilation fixed, TensorRT net optimisation works,
 segfault on inference

---
 .../tutorial_pose/3_extract_from_image_TensorRT.cpp |  5 +----
 src/openpose/core/netTensorRT.cpp                   | 10 +++++-----
 src/openpose/pose/poseExtractorTensorRT.cpp         |  6 +++---
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
index 13f700f21..a855fa3da 100644
--- a/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
+++ b/examples/tutorial_pose/3_extract_from_image_TensorRT.cpp
@@ -110,7 +110,7 @@ int openPoseTutorialPose3()
     op::ScaleAndSizeExtractor scaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap);
     op::CvMatToOpInput cvMatToOpInput;
     op::CvMatToOpOutput cvMatToOpOutput;
-    op::PoseExtractorCaffe poseExtractorTensorRT{poseModel, FLAGS_model_folder,
+    op::PoseExtractorTensorRT poseExtractorTensorRT{poseModel, FLAGS_model_folder,
                                                  FLAGS_num_gpu_start, {}, op::ScaleMode::ZeroToOne, enableGoogleLogging};
     op::PoseCpuRenderer poseRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending,
                                      (float)FLAGS_alpha_pose};
@@ -171,9 +171,6 @@ int openPoseTutorialPose3()
 
 int main(int argc, char *argv[])
 {
-    // Initializing google logging (Caffe uses it for logging)
-    google::InitGoogleLogging("openPoseTutorialPose3");
-
     // Parsing command line flags
     gflags::ParseCommandLineFlags(&argc, &argv, true);
 
diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp
index 897087f00..b59d57a7d 100644
--- a/src/openpose/core/netTensorRT.cpp
+++ b/src/openpose/core/netTensorRT.cpp
@@ -51,7 +51,7 @@ class Logger : public ILogger
 namespace op
 {
     std::mutex sMutexNetTensorRT;
-    std::atomic<bool> sGoogleLoggingInitialized{false};
+    std::atomic<bool> sGoogleLoggingInitializedTensorRT{false}; // Already defined in netCaffe
 
     struct NetTensorRT::ImplNetTensorRT
     {
@@ -82,7 +82,7 @@ namespace op
         ImplNetTensorRT(const std::string& caffeProto, const std::string& caffeTrainedModel,
                         const int gpuId, const bool enableGoogleLogging, const std::string& lastBlobName) :
             mGpuId{gpuId},
-            mCaffeProto{caffeProto}, // TODO, no size, how to proceed ?
+            mCaffeProto{caffeProto + std::string("_368x656")}, // TODO, no size, how to proceed ?
             mCaffeTrainedModel{caffeTrainedModel},
             mLastBlobName{lastBlobName}
         {
@@ -95,13 +95,13 @@ namespace op
                 error("Caffe trained model file not found: " + mCaffeTrainedModel + message, __LINE__, __FUNCTION__, __FILE__);
 
             // Double if condition in order to speed up the program if it is called several times
-            if (enableGoogleLogging && !sGoogleLoggingInitialized)
+            if (enableGoogleLogging && !sGoogleLoggingInitializedTensorRT)
             {
                 std::lock_guard<std::mutex> lock{sMutexNetTensorRT};
-                if (enableGoogleLogging && !sGoogleLoggingInitialized)
+                if (enableGoogleLogging && !sGoogleLoggingInitializedTensorRT)
                 {
                     google::InitGoogleLogging("OpenPose");
-                    sGoogleLoggingInitialized = true;
+                    sGoogleLoggingInitializedTensorRT = true;
                 }
             }
         }
diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index dafa0209c..8dc981f30 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -67,7 +67,7 @@ namespace op
     };
 
     #ifdef USE_CAFFE
-        std::vector<caffe::Blob<float>*> caffeNetSharedToPtr(
+        std::vector<caffe::Blob<float>*> tensorRTNetSharedToPtr(
             std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob)
         {
             try
@@ -98,7 +98,7 @@ namespace op
         try
         {
             // HeatMaps extractor blob and layer
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
+            const auto caffeNetOutputBlobs = tensorRTNetSharedToPtr(caffeNetOutputBlob);
             resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
                                          POSE_CCN_DECREASE_FACTOR[(int)poseModel], 1.f/scaleInputToNetInput);
             // Pose extractor blob and layer
@@ -265,7 +265,7 @@ namespace op
             timeNow("TensorRT forwards");
 
             // 2. Resize heat maps + merge different scales
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
+            const auto caffeNetOutputBlobs = tensorRTNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
             const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
             upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
             #ifdef USE_CUDA // Implied by tensorrt
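
The sGoogleLoggingInitializedTensorRT rename above avoids a duplicate-symbol clash with the flag of the same name in netCaffe.cpp; the underlying pattern is a double-checked, mutex-guarded one-time initialization. A minimal generic sketch, with illustrative names:

    #include <atomic>
    #include <mutex>

    std::mutex sInitMutex;
    std::atomic<bool> sInitialized{false};

    void initOnce()
    {
        // First check is lock-free and handles the common already-initialized case
        if (!sInitialized)
        {
            std::lock_guard<std::mutex> lock{sInitMutex};
            // Second check: another thread may have finished while we waited
            if (!sInitialized)
            {
                // ... expensive one-time setup, e.g. google::InitGoogleLogging ...
                sInitialized = true;
            }
        }
    }

In modern C++ the same effect is usually obtained with std::call_once or a function-local static; the diff keeps the explicit flag to mirror the existing netCaffe.cpp convention.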
From 827510b1a71d3709375c46de3f10f49f622c84ca Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 16:58:38 +0000
Subject: [PATCH 50/52] Code kind of works; the incomplete pipeline leads to no
 shape being displayed; sizes hardcoded.

---
 src/openpose/core/netTensorRT.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp
index b59d57a7d..1b2b43151 100644
--- a/src/openpose/core/netTensorRT.cpp
+++ b/src/openpose/core/netTensorRT.cpp
@@ -307,14 +307,15 @@ namespace op
                     std::cerr << "cudaContext could not be created" << std::endl;
                     return;
                 }
 
                 DimsCHW outputDims = static_cast<DimsCHW&&>(upImpl->cudaEngine->getBindingDimensions(upImpl->cudaEngine->getNbBindings() - 1));
-                upImpl->mNetOutputSize4D = { 1, outputDims.c(), outputDims.h(), outputDims.w() };
-
+                upImpl->mNetOutputSize4D.push_back(1);
+                upImpl->mNetOutputSize4D.push_back(outputDims.c());
+                upImpl->mNetOutputSize4D.push_back(outputDims.h());
+                upImpl->mNetOutputSize4D.push_back(outputDims.w());
 
-                std::cout << "NetInputSize4D: " << upImpl->mNetInputSize4D[0] << " " << upImpl->mNetInputSize4D[1] << " " << upImpl->mNetInputSize4D[2] << " " << upImpl->mNetInputSize4D[3] << std::endl;
+                //std::cout << "NetInputSize4D: " << upImpl->mNetInputSize4D.at(0) << " " << upImpl->mNetInputSize4D.at(1) << " " << upImpl->mNetInputSize4D.at(2) << " " << upImpl->mNetInputSize4D.at(3) << std::endl;
 
-                upImpl->spInputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetInputSize4D[0], upImpl->mNetInputSize4D[1], upImpl->mNetInputSize4D[2], upImpl->mNetInputSize4D[3]);
+                upImpl->spInputBlob = boost::make_shared<caffe::Blob<float>>(1, 3, 368, 656);
                 upImpl->spOutputBlob = boost::make_shared<caffe::Blob<float>>(upImpl->mNetOutputSize4D[0], upImpl->mNetOutputSize4D[1], upImpl->mNetOutputSize4D[2], upImpl->mNetOutputSize4D[3]);
 
                 std::cout << "InitializationOnThread : done" << std::endl;
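
Patch 50 above hardcodes the input blob to 1x3x368x656 (matching the "_368x656" prototxt suffix introduced in patch 49) and sizes the output from the last engine binding. The general, non-hardcoded approach is to compute both volumes from the bindings, sketched here against the same TensorRT-era API the diff uses; the helper is illustrative, not part of the patch:

    // Assumes a built nvinfer1::ICudaEngine* with CHW bindings, as in the diff.
    size_t bindingVolume(const nvinfer1::ICudaEngine* engine, const int binding)
    {
        // DimsCHW and the rvalue cast are the TensorRT 2.x/3.x sample idiom
        const auto dims = static_cast<nvinfer1::DimsCHW&&>(engine->getBindingDimensions(binding));
        return static_cast<size_t>(dims.c()) * dims.h() * dims.w();
    }
    // inputVolume  = bindingVolume(engine, 0);                            // floats per image
    // outputVolume = bindingVolume(engine, engine->getNbBindings() - 1);

The fixed shape works here because the sample feeds the net a fixed net_resolution; a genuine runtime reshape would require rebuilding or re-deserializing the engine, which is why the reshape path is flagged as untested in the final patch below.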
From a1619fa3c8aa881568ed27d20a1efb76cfcc690a Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 22:02:37 +0100
Subject: [PATCH 51/52] Remove useless preprocessor macros

---
 src/openpose/pose/poseExtractorTensorRT.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/openpose/pose/poseExtractorTensorRT.cpp b/src/openpose/pose/poseExtractorTensorRT.cpp
index 8dc981f30..c61d17dd6 100644
--- a/src/openpose/pose/poseExtractorTensorRT.cpp
+++ b/src/openpose/pose/poseExtractorTensorRT.cpp
@@ -268,14 +268,10 @@ namespace op
             const auto caffeNetOutputBlobs = tensorRTNetSharedToPtr(upImpl->spTensorRTNetOutputBlobs);
             const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
             upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
-            #ifdef USE_CUDA // Implied by tensorrt
+
             upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, // ~5ms
                                                        {upImpl->spHeatMapsBlob.get()});
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
-            #else // Never reached, suppress ?
-                upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
-                                                           {upImpl->spHeatMapsBlob.get()});
-            #endif
 
             timeNow("Resize heat Maps");
 

From 344ab674b08e8a90fc79a55bfa08ae5b965d45b7 Mon Sep 17 00:00:00 2001
From: Florent Buisson
Date: Wed, 15 Nov 2017 22:06:03 +0100
Subject: [PATCH 52/52] NetTensorRT modifications

---
 src/openpose/core/netTensorRT.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/openpose/core/netTensorRT.cpp b/src/openpose/core/netTensorRT.cpp
index 1b2b43151..ed93b3662 100644
--- a/src/openpose/core/netTensorRT.cpp
+++ b/src/openpose/core/netTensorRT.cpp
@@ -339,9 +339,10 @@ namespace op
             if (inputData.getNumberDimensions() != 4 || inputData.getSize(1) != 3)
                 error("The Array inputData must have 4 dimensions: [batch size, 3 (RGB), height, width].",
                       __LINE__, __FUNCTION__, __FILE__);
-            // Reshape Caffe net if required
+            // Reshape TensorRT net if required
             if (!vectorsAreEqual(upImpl->mNetInputSize4D, inputData.getSize()))
             {
+                std::cout << "Reshaping TensorRT net: WARNING NOT TESTED, probably won't work" << std::endl;
                 upImpl->mNetInputSize4D = inputData.getSize();
                 reshapeNetTensorRT(upImpl->spInputBlob, inputData.getSize());
             }
@@ -357,7 +358,7 @@ namespace op
             buffers[1] = upImpl->spOutputBlob->mutable_gpu_data();
 
             // Perform deep network forward pass
-            upImpl->cudaContext->enqueue(1, &buffers[0], upImpl->stream, nullptr);
+            upImpl->cudaContext->enqueue(inputData.getSize(0), &buffers[0], upImpl->stream, nullptr);
 
             // Cuda checks
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
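
The final patch derives the enqueue batch size from inputData.getSize(0) instead of hardcoding 1. One caveat worth recording, since it is easy to trip over with this API: under TensorRT's implicit-batch model, the runtime batch passed to enqueue() must not exceed the maxBatchSize the engine was built with (builder call sketched below; the value shown is illustrative):

    builder->setMaxBatchSize(1);                 // at build time, in caffeToGIEModel()
    // ...
    const int batchSize = inputData.getSize(0);  // at run time, per forward pass
    // batchSize > maxBatchSize here would make enqueue() fail

And because patch 47 caches the serialized engine on disk (the mCaffeProto + ".bin" path), an engine built for one batch size keeps constraining later runs until that cache file is regenerated.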