diff --git a/arcface/CMakeLists.txt b/arcface/CMakeLists.txt
index b8e25fd8..00ad9b2b 100644
--- a/arcface/CMakeLists.txt
+++ b/arcface/CMakeLists.txt
@@ -35,5 +35,17 @@ target_link_libraries(arcface-r50 cudart)
 target_link_libraries(arcface-r50 myplugins)
 target_link_libraries(arcface-r50 ${OpenCV_LIBS})
+add_executable(arcface-mobilefacenet ${PROJECT_SOURCE_DIR}/arcface-mobilefacenet.cpp)
+target_link_libraries(arcface-mobilefacenet nvinfer)
+target_link_libraries(arcface-mobilefacenet cudart)
+target_link_libraries(arcface-mobilefacenet myplugins)
+target_link_libraries(arcface-mobilefacenet ${OpenCV_LIBS})
+
+add_executable(arcface-r100 ${PROJECT_SOURCE_DIR}/arcface-r100.cpp)
+target_link_libraries(arcface-r100 nvinfer)
+target_link_libraries(arcface-r100 cudart)
+target_link_libraries(arcface-r100 myplugins)
+target_link_libraries(arcface-r100 ${OpenCV_LIBS})
+
 add_definitions(-O2 -pthread)
diff --git a/arcface/README.md b/arcface/README.md
index 11b46a4f..5cfbab2f 100644
--- a/arcface/README.md
+++ b/arcface/README.md
@@ -2,7 +2,7 @@
 The mxnet implementation is from [deepinsight/insightface.](https://github.com/deepinsight/insightface)
-The pretrained model is [LResNet50E-IR,ArcFace@ms1m-refine-v1.](https://github.com/deepinsight/insightface/wiki/Model-Zoo#32-lresnet50e-irarcfacems1m-refine-v1)
+The pretrained models are [LResNet50E-IR,ArcFace@ms1m-refine-v1](https://github.com/deepinsight/insightface/wiki/Model-Zoo#32-lresnet50e-irarcfacems1m-refine-v1), [LResNet100E-IR,ArcFace@ms1m-refine-v2](https://github.com/deepinsight/insightface/wiki/Model-Zoo#31-lresnet100e-irarcfacems1m-refine-v2) and [MobileFaceNet,ArcFace@ms1m-refine-v1](https://github.com/deepinsight/insightface/wiki/Model-Zoo#34-mobilefacenetarcfacems1m-refine-v1).
 The two input images used in this project are joey0.ppm and joey1.ppm, download them from [Google Drive.](https://drive.google.com/drive/folders/1ctqpkRCRKyBZRCNwo9Uq4eUoMRLtFq1e). The input image is 112x112, and generated from `get_input()` in `insightface/deploy/face_model.py`, which is cropped and aligned face image.
@@ -12,13 +12,12 @@ The two input images used in this project are joey0.ppm and joey1.ppm, download
 ## Config
-- FP16/FP32 can be selected by the macro `USE_FP16` in arcface-r50.cpp
-- GPU id can be selected by the macro `DEVICE` in arcface-r50.cpp
+- FP16/FP32 can be selected by the macro `USE_FP16` in arcface-r50/r100/mobilefacenet.cpp
+- GPU id can be selected by the macro `DEVICE` in arcface-r50/r100/mobilefacenet.cpp
 ## Run
-1. generate arcface-r50.wts from mxnet implementation with LResNet50E-IR,ArcFace@ms1m-refine-v1 pretrained model
-
+1. Generate the .wts file from the mxnet implementation of a pretrained model. The following example describes how to generate arcface-r50.wts from the mxnet implementation of LResNet50E-IR,ArcFace@ms1m-refine-v1.
 ```
 git clone https://github.com/deepinsight/insightface
 cd insightface
@@ -29,9 +28,10 @@ cd deploy
 python gen_wts.py
 // a file 'arcface-r50.wts' will be generated.
 // the master branch of insightface should work, if not, you can checkout 94ad870abb3203d6f31b049b70dd080dc8f33fca
-```
+// arcface-r100.wts and arcface-mobilefacenet.wts can be generated in a similar way from the LResNet100E-IR,ArcFace@ms1m-refine-v2 and MobileFaceNet,ArcFace@ms1m-refine-v1 pretrained models.
-2. put arcface-r50.wts into tensorrtx/arcface, build and run
+```
+2. Put the generated .wts file into tensorrtx/arcface, then build and run
 ```
 cd tensorrtx/arcface
@@ -42,9 +42,20 @@ cmake ..
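+// note: 'make' builds arcface-r50 together with the arcface-r100 and arcface-mobilefacenet executables added in CMakeLists.txt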
 make
 sudo ./arcface-r50 -s  // serialize model to plan file i.e. 'arcface-r50.engine'
 sudo ./arcface-r50 -d  // deserialize plan file and run inference
+
+or
+
+sudo ./arcface-r100 -s  // serialize model to plan file i.e. 'arcface-r100.engine'
+sudo ./arcface-r100 -d  // deserialize plan file and run inference
+
+
+or
+
+sudo ./arcface-mobilefacenet -s  // serialize model to plan file i.e. 'arcface-mobilefacenet.engine'
+sudo ./arcface-mobilefacenet -d  // deserialize plan file and run inference
 ```
-3. check the output log, latency and similarity score.
+3. Check the output log, latency and similarity score.
 ## More Information
diff --git a/arcface/arcface-mobilefacenet.cpp b/arcface/arcface-mobilefacenet.cpp
new file mode 100644
index 00000000..1d9e81b8
--- /dev/null
+++ b/arcface/arcface-mobilefacenet.cpp
@@ -0,0 +1,451 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <chrono>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include "logging.h"
+
+#define CHECK(status) \
+    do\
+    {\
+        auto ret = (status);\
+        if (ret != 0)\
+        {\
+            std::cerr << "Cuda failure: " << ret << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+//#define USE_FP16  // comment out this if want to use FP32
+#define DEVICE 0  // GPU id
+#define BATCH_SIZE 1  // currently, only support BATCH=1
+
+using namespace nvinfer1;
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = 112;
+static const int INPUT_W = 112;
+static const int OUTPUT_SIZE = 128;
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_BLOB_NAME = "prob";
+static Logger gLogger;
+
+// TensorRT weight files have a simple space delimited format:
+// [type] [size] <data x size in hex>
+std::map<std::string, Weights> loadWeights(const std::string file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file.");
+
+    // Read number of weight blobs
+    int32_t count;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    while (count--)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size;
+
+        // Read name and type of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+        wt.type = DataType::kFLOAT;
+
+        // Load blob
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
+        for (uint32_t x = 0, y = size; x < y; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        wt.values = val;
+
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    float *gamma = (float*)weightMap[lname + "_gamma"].values;
+    float *beta = (float*)weightMap[lname + "_beta"].values;
+    float *mean = (float*)weightMap[lname + "_moving_mean"].values;
+    float *var = (float*)weightMap[lname + "_moving_var"].values;
+    int len = weightMap[lname + "_moving_var"].count;
+
+    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval[i] = gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights scale{DataType::kFLOAT, scval, len};
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights shift{DataType::kFLOAT, shval, len};
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{DataType::kFLOAT, pval, len};
+
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
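+    // ScaleMode::kCHANNEL applies y = (x * scale + shift)^power per channel; with power fixed to 1 this
+    // is inference-time batch norm folded into one layer: y = gamma * (x - mean) / sqrt(var + eps) + beta.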
+    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(scale_1);
+    return scale_1;
+}
+
+ILayer* addPRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
+    float *gamma = (float*)weightMap[lname + "_gamma"].values;
+    int len = weightMap[lname + "_gamma"].count;
+
+    float *scval_1 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    float *scval_2 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval_1[i] = -1.0;
+        scval_2[i] = -gamma[i];
+    }
+    Weights scale_1{ DataType::kFLOAT, scval_1, len };
+    Weights scale_2{ DataType::kFLOAT, scval_2, len };
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = 0.0;
+    }
+    Weights shift{ DataType::kFLOAT, shval, len };
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{ DataType::kFLOAT, pval, len };
+
+    // PReLU(x) = relu(x) - gamma * relu(-x): scale_1 (-1) negates the input, scale_2 multiplies by -gamma,
+    // and the two ReLU branches are summed below.
+    auto relu1 = network->addActivation(input, ActivationType::kRELU);
+    assert(relu1);
+    IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power);
+    assert(scale1);
+    auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU);
+    assert(relu2);
+    IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power);
+    assert(scale2);
+    IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM);
+    assert(ew1);
+    return ew1;
+}
+
+ILayer* conv_bn_relu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int k = 3, int p = 1, int s = 2, int groups=1) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{s, s});
+    conv1->setPaddingNd(DimsHW{p, p});
+    conv1->setNbGroups(groups);
+    auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_batchnorm", 2e-5);
+    assert(bn1);
+    auto act1 = addPRelu(network, weightMap, *bn1->getOutput(0), lname + "_relu");
+    assert(act1);
+    return act1;
+}
+
+ILayer* conv_bn(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int oup, int k = 3, int p = 1, int s = 1, int groups=1) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{s, s});
+    conv1->setPaddingNd(DimsHW{p, p});
+    conv1->setNbGroups(groups);
+    auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_batchnorm", 2e-5);
+    assert(bn1);
+    return bn1;
+}
+
+ILayer* DepthWise(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int inp, int oup, int groups, int s) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, groups, DimsHW{1, 1}, weightMap[lname + "_conv_sep_conv2d_weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{1, 1});
+    conv1->setPaddingNd(DimsHW{0, 0});
+    conv1->setNbGroups(1);
+    auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_conv_sep_batchnorm", 2e-5);
+    assert(bn1);
+    auto act1 = addPRelu(network, weightMap, *bn1->getOutput(0), lname + "_conv_sep_relu");
+    assert(act1);
+
+    IConvolutionLayer* conv2 = network->addConvolutionNd(*act1->getOutput(0), groups, DimsHW{3, 3}, weightMap[lname + "_conv_dw_conv2d_weight"], emptywts);
+    assert(conv2);
+    conv2->setStrideNd(DimsHW{s, s});
+    conv2->setPaddingNd(DimsHW{1, 1});
+    conv2->setNbGroups(groups);
+    auto bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "_conv_dw_batchnorm", 2e-5);
+    assert(bn2);
+    auto act2 = addPRelu(network, weightMap, *bn2->getOutput(0), lname + "_conv_dw_relu");
+    assert(act2);
+
+    IConvolutionLayer* conv3 = network->addConvolutionNd(*act2->getOutput(0), oup, DimsHW{1, 1}, weightMap[lname + "_conv_proj_conv2d_weight"], emptywts);
+    assert(conv3);
+    conv3->setStrideNd(DimsHW{1, 1});
+    conv3->setPaddingNd(DimsHW{0, 0});
+    conv3->setNbGroups(1);
+    auto bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "_conv_proj_batchnorm", 2e-5);
+    assert(bn3);
+    return bn3;
+}
+
+ILayer* DWResidual(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, int inp, int oup, int groups, int s) {
+    auto dw1 = DepthWise(network, weightMap, input, lname, inp, oup, groups, s);
+    IElementWiseLayer* ew1;
+    ew1 = network->addElementWise(input, *dw1->getOutput(0), ElementWiseOperation::kSUM);
+    assert(ew1);
+    return ew1;
+}
+
+// Create the engine using only the API and not any parser.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights("../arcface-mobilefacenet.wts");
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+
+    auto conv_1 = conv_bn_relu(network, weightMap, *data, "conv_1", 64, 3, 1, 2);
+    auto conv_2_dw = conv_bn_relu(network, weightMap, *conv_1->getOutput(0), "conv_2_dw", 64, 3, 1, 1, 64);
+    auto conv_23 = DepthWise(network, weightMap, *conv_2_dw->getOutput(0), "dconv_23", 64, 64, 128, 2);
+    auto res_3_block0 = DWResidual(network, weightMap, *conv_23->getOutput(0), "res_3_block0", 64, 64, 128, 1);
+    auto res_3_block1 = DWResidual(network, weightMap, *res_3_block0->getOutput(0), "res_3_block1", 64, 64, 128, 1);
+    auto res_3_block2 = DWResidual(network, weightMap, *res_3_block1->getOutput(0), "res_3_block2", 64, 64, 128, 1);
+    auto res_3_block3 = DWResidual(network, weightMap, *res_3_block2->getOutput(0), "res_3_block3", 64, 64, 128, 1);
+    auto conv_34 = DepthWise(network, weightMap, *res_3_block3->getOutput(0), "dconv_34", 64, 128, 256, 2);
+    auto res_4_block0 = DWResidual(network, weightMap, *conv_34->getOutput(0), "res_4_block0", 128, 128, 256, 1);
+    auto res_4_block1 = DWResidual(network, weightMap, *res_4_block0->getOutput(0), "res_4_block1", 128, 128, 256, 1);
+    auto res_4_block2 = DWResidual(network, weightMap, *res_4_block1->getOutput(0), "res_4_block2", 128, 128, 256, 1);
+    auto res_4_block3 = DWResidual(network, weightMap, *res_4_block2->getOutput(0), "res_4_block3", 128, 128, 256, 1);
+    auto res_4_block4 = DWResidual(network, weightMap, *res_4_block3->getOutput(0), "res_4_block4", 128, 128, 256, 1);
+    auto res_4_block5 = DWResidual(network, weightMap, *res_4_block4->getOutput(0), "res_4_block5", 128, 128, 256, 1);
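+    // dconv_* blocks use stride 2 in their depthwise stage to halve the spatial size, while the
+    // res_*_block* units run DepthWise at stride 1 and add a skip connection (see DWResidual above).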
+    auto conv_45 = DepthWise(network, weightMap, *res_4_block5->getOutput(0), "dconv_45", 128, 128, 512, 2);
+    auto res_5_block0 = DWResidual(network, weightMap, *conv_45->getOutput(0), "res_5_block0", 128, 128, 256, 1);
+    auto res_5_block1 = DWResidual(network, weightMap, *res_5_block0->getOutput(0), "res_5_block1", 128, 128, 256, 1);
+    auto conv_6_sep = conv_bn_relu(network, weightMap, *res_5_block1->getOutput(0), "conv_6sep", 512, 1, 0, 1);
+    auto conv_6dw7_7 = conv_bn(network, weightMap, *conv_6_sep->getOutput(0), "conv_6dw7_7", 512, 7, 0, 1, 512);
+    IFullyConnectedLayer* fc1 = network->addFullyConnected(*conv_6dw7_7->getOutput(0), 128, weightMap["fc1_weight"], weightMap["pre_fc1_bias"]);
+    assert(fc1);
+    auto bn1 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5);
+    assert(bn1);
+    bn1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
+    network->markOutput(*bn1->getOutput(0));
+
+    // Build engine
+    builder->setMaxBatchSize(maxBatchSize);
+    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
+#ifdef USE_FP16
+    config->setFlag(BuilderFlag::kFP16);
+#endif
+    std::cout << "Building engine, please wait for a while..." << std::endl;
+    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+    std::cout << "Build engine successfully!" << std::endl;
+
+    // Don't need the network any more
+    network->destroy();
+
+    // Release host memory
+    for (auto& mem : weightMap)
+    {
+        free((void*) (mem.second.values));
+    }
+
+    return engine;
+}
+
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
+    // Create builder
+    IBuilder* builder = createInferBuilder(gLogger);
+    IBuilderConfig* config = builder->createBuilderConfig();
+
+    // Create model to populate the network, then set the outputs and create an engine
+    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
+    assert(engine != nullptr);
+
+    // Serialize the engine
+    (*modelStream) = engine->serialize();
+
+    // Close everything down
+    engine->destroy();
+    builder->destroy();
+}
+
+void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
+    const ICudaEngine& engine = context.getEngine();
+
+    // Pointers to input and output device buffers to pass to engine.
+    // Engine requires exactly IEngine::getNbBindings() number of buffers.
+    assert(engine.getNbBindings() == 2);
+    void* buffers[2];
+
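+    // buffers[] is indexed by the engine's binding indices, so the device pointers are stored at the
+    // slots returned by getBindingIndex() below rather than assuming input is 0 and output is 1.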
+    // In order to bind the buffers, we need to know the names of the input and output tensors.
+    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
+    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
+
+    // Create GPU buffers on device
+    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
+
+    // Create stream
+    cudaStream_t stream;
+    CHECK(cudaStreamCreate(&stream));
+
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    cudaStreamSynchronize(stream);
+
+    // Release stream and buffers
+    cudaStreamDestroy(stream);
+    CHECK(cudaFree(buffers[inputIndex]));
+    CHECK(cudaFree(buffers[outputIndex]));
+}
+
+int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
+    DIR *p_dir = opendir(p_dir_name);
+    if (p_dir == nullptr) {
+        return -1;
+    }
+
+    struct dirent* p_file = nullptr;
+    while ((p_file = readdir(p_dir)) != nullptr) {
+        if (strcmp(p_file->d_name, ".") != 0 &&
+            strcmp(p_file->d_name, "..") != 0) {
+            //std::string cur_file_name(p_dir_name);
+            //cur_file_name += "/";
+            //cur_file_name += p_file->d_name;
+            std::string cur_file_name(p_file->d_name);
+            file_names.push_back(cur_file_name);
+        }
+    }
+
+    closedir(p_dir);
+    return 0;
+}
+
+int main(int argc, char** argv) {
+    cudaSetDevice(DEVICE);
+    // create a model using the API directly and serialize it to a stream
+    char *trtModelStream{nullptr};
+    size_t size{0};
+
+    if (argc == 2 && std::string(argv[1]) == "-s") {
+        IHostMemory* modelStream{nullptr};
+        APIToModel(BATCH_SIZE, &modelStream);
+        assert(modelStream != nullptr);
+        std::ofstream p("arcface-mobilefacenet.engine", std::ios::binary);
+        if (!p) {
+            std::cerr << "could not open plan output file" << std::endl;
+            return -1;
+        }
+        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+        modelStream->destroy();
+        return 0;
+    } else if (argc == 2 && std::string(argv[1]) == "-d") {
+        std::ifstream file("arcface-mobilefacenet.engine", std::ios::binary);
+        if (file.good()) {
+            file.seekg(0, file.end);
+            size = file.tellg();
+            file.seekg(0, file.beg);
+            trtModelStream = new char[size];
+            assert(trtModelStream);
+            file.read(trtModelStream, size);
+            file.close();
+        }
+    } else {
+        std::cerr << "arguments not right!" << std::endl;
+        std::cerr << "./arcface-mobilefacenet -s  // serialize model to plan file" << std::endl;
+        std::cerr << "./arcface-mobilefacenet -d  // deserialize plan file and run inference" << std::endl;
+        return -1;
+    }
+
+    // prepare input data ---------------------------
+    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
+    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
+    //    data[i] = 1.0;
+    static float prob[BATCH_SIZE * OUTPUT_SIZE];
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
+    assert(engine != nullptr);
+    IExecutionContext* context = engine->createExecutionContext();
+    assert(context != nullptr);
+    delete[] trtModelStream;
+
+    // BGR image -> normalized CHW input: (pixel - 127.5) * 0.0078125, matching insightface preprocessing
+    cv::Mat img = cv::imread("../joey0.ppm");
+    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
+        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
+        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
+    }
+
+    // Run inference
+    auto start = std::chrono::system_clock::now();
+    doInference(*context, data, prob, BATCH_SIZE);
+    auto end = std::chrono::system_clock::now();
+    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+
+    cv::Mat out(OUTPUT_SIZE, 1, CV_32FC1, prob);
+    cv::Mat out_norm;
+    cv::normalize(out, out_norm);
+
+    img = cv::imread("../joey1.ppm");
+    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
+        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
+        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
+    }
+
+    // Run inference
+    start = std::chrono::system_clock::now();
+    doInference(*context, data, prob, BATCH_SIZE);
+    end = std::chrono::system_clock::now();
+    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+
+    cv::Mat out1(1, OUTPUT_SIZE, CV_32FC1, prob);
+    cv::Mat out_norm1;
+    cv::normalize(out1, out_norm1);
+
+    // cosine similarity of the two L2 normalized embeddings (1xN * Nx1 -> 1x1)
+    cv::Mat res = out_norm1 * out_norm;
+
+    std::cout << "similarity score: " << *(float*)res.data << std::endl;
+
+    // Destroy the engine
+    context->destroy();
+    engine->destroy();
+    runtime->destroy();
+
+    //Print histogram of the output distribution
+    //std::cout << "\nOutput:\n\n";
+    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
+    //{
+    //    std::cout << p_out_norm[i] << ", ";
+    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
+    //}
+    //std::cout << std::endl;
+
+    return 0;
+}
diff --git a/arcface/arcface-r100.cpp b/arcface/arcface-r100.cpp
new file mode 100644
index 00000000..d94c2e96
--- /dev/null
+++ b/arcface/arcface-r100.cpp
@@ -0,0 +1,455 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <chrono>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include "logging.h"
+
+#define CHECK(status) \
+    do\
+    {\
+        auto ret = (status);\
+        if (ret != 0)\
+        {\
+            std::cerr << "Cuda failure: " << ret << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+//#define USE_FP16  // comment out this if want to use FP32
+#define DEVICE 0  // GPU id
+#define BATCH_SIZE 1  // currently, only support BATCH=1
+
+using namespace nvinfer1;
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = 112;
+static const int INPUT_W = 112;
+static const int OUTPUT_SIZE = 512;
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_BLOB_NAME = "prob";
+static Logger gLogger;
+
+// TensorRT weight files have a simple space delimited format:
+// [type] [size] <data x size in hex>
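+// e.g. "conv0_weight 1728 <hex> <hex> ..." : a blob name, a decimal element count, then that many
+// space separated hex encoded float values, which is what loadWeights() below parses.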
+std::map<std::string, Weights> loadWeights(const std::string file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file.");
+
+    // Read number of weight blobs
+    int32_t count;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    while (count--)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size;
+
+        // Read name and type of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+        wt.type = DataType::kFLOAT;
+
+        // Load blob
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
+        for (uint32_t x = 0, y = size; x < y; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        wt.values = val;
+
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    float *gamma = (float*)weightMap[lname + "_gamma"].values;
+    float *beta = (float*)weightMap[lname + "_beta"].values;
+    float *mean = (float*)weightMap[lname + "_moving_mean"].values;
+    float *var = (float*)weightMap[lname + "_moving_var"].values;
+    int len = weightMap[lname + "_moving_var"].count;
+
+    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval[i] = gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights scale{DataType::kFLOAT, scval, len};
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights shift{DataType::kFLOAT, shval, len};
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{DataType::kFLOAT, pval, len};
+
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
+    // Per channel y = x * scale + shift (power = 1), i.e. batch norm folded into a single scale layer.
+    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(scale_1);
+    return scale_1;
+}
+
+ILayer* addPRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
+    float *gamma = (float*)weightMap[lname + "_gamma"].values;
+    int len = weightMap[lname + "_gamma"].count;
+
+    float *scval_1 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    float *scval_2 = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval_1[i] = -1.0;
+        scval_2[i] = -gamma[i];
+    }
+    Weights scale_1{ DataType::kFLOAT, scval_1, len };
+    Weights scale_2{ DataType::kFLOAT, scval_2, len };
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = 0.0;
+    }
+    Weights shift{ DataType::kFLOAT, shval, len };
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{ DataType::kFLOAT, pval, len };
+
+    // PReLU(x) = relu(x) - gamma * relu(-x), built from two ReLUs and two per channel scale layers.
+    auto relu1 = network->addActivation(input, ActivationType::kRELU);
+    assert(relu1);
+    IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power);
+    assert(scale1);
+    auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU);
+    assert(relu2);
+    IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power);
+    assert(scale2);
+    IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM);
+    assert(ew1);
+    return ew1;
+}
+
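+// Residual unit of LResNet100E-IR: BN -> 3x3 conv -> BN -> PReLU -> 3x3 conv (stride s) -> BN, summed
+// with the input when dim_match is true, otherwise with a strided 1x1 conv + BN shortcut.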
+ILayer* resUnit(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int num_filters, int s, bool dim_match, std::string lname) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    auto bn1 = addBatchNorm2d(network, weightMap, input, lname + "_bn1", 2e-5);
+    IConvolutionLayer* conv1 = network->addConvolutionNd(*bn1->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv1_weight"], emptywts);
+    assert(conv1);
+    conv1->setPaddingNd(DimsHW{1, 1});
+    auto bn2 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_bn2", 2e-5);
+    auto act1 = addPRelu(network, weightMap, *bn2->getOutput(0), lname + "_relu1");
+    IConvolutionLayer* conv2 = network->addConvolutionNd(*act1->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv2_weight"], emptywts);
+    assert(conv2);
+    conv2->setStrideNd(DimsHW{s, s});
+    conv2->setPaddingNd(DimsHW{1, 1});
+    auto bn3 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "_bn3", 2e-5);
+
+    IElementWiseLayer* ew1;
+    if (dim_match) {
+        ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
+    } else {
+        IConvolutionLayer* conv1sc = network->addConvolutionNd(input, num_filters, DimsHW{1, 1}, weightMap[lname + "_conv1sc_weight"], emptywts);
+        assert(conv1sc);
+        conv1sc->setStrideNd(DimsHW{s, s});
+        auto bn1sc = addBatchNorm2d(network, weightMap, *conv1sc->getOutput(0), lname + "_sc", 2e-5);
+        ew1 = network->addElementWise(*bn1sc->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
+    }
+    assert(ew1);
+    return ew1;
+}
+
+// Create the engine using only the API and not any parser.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights("../arcface-r100.wts");
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+
+    IConvolutionLayer* conv0 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["conv0_weight"], emptywts);
+    assert(conv0);
+    conv0->setPaddingNd(DimsHW{1, 1});
+    auto bn0 = addBatchNorm2d(network, weightMap, *conv0->getOutput(0), "bn0", 2e-5);
+    auto relu0 = addPRelu(network, weightMap, *bn0->getOutput(0), "relu0");
+
+    auto s1u1 = resUnit(network, weightMap, *relu0->getOutput(0), 64, 2, false, "stage1_unit1");
+    auto s1u2 = resUnit(network, weightMap, *s1u1->getOutput(0), 64, 1, true, "stage1_unit2");
+    auto s1u3 = resUnit(network, weightMap, *s1u2->getOutput(0), 64, 1, true, "stage1_unit3");
+
+    auto s2u1 = resUnit(network, weightMap, *s1u3->getOutput(0), 128, 2, false, "stage2_unit1");
+    auto s2u2 = resUnit(network, weightMap, *s2u1->getOutput(0), 128, 1, true, "stage2_unit2");
+    auto s2u3 = resUnit(network, weightMap, *s2u2->getOutput(0), 128, 1, true, "stage2_unit3");
+    auto s2u4 = resUnit(network, weightMap, *s2u3->getOutput(0), 128, 1, true, "stage2_unit4");
+    auto s2u5 = resUnit(network, weightMap, *s2u4->getOutput(0), 128, 1, true, "stage2_unit5");
+    auto s2u6 = resUnit(network, weightMap, *s2u5->getOutput(0), 128, 1, true, "stage2_unit6");
+    auto s2u7 = resUnit(network, weightMap, *s2u6->getOutput(0), 128, 1, true, "stage2_unit7");
+    auto s2u8 = resUnit(network, weightMap, *s2u7->getOutput(0), 128, 1, true, "stage2_unit8");
+    auto s2u9 = resUnit(network, weightMap, *s2u8->getOutput(0), 128, 1, true, "stage2_unit9");
+    auto s2u10 = resUnit(network, weightMap, *s2u9->getOutput(0), 128, 1, true, "stage2_unit10");
+    auto s2u11 = resUnit(network, weightMap, *s2u10->getOutput(0), 128, 1, true, "stage2_unit11");
+    auto s2u12 = resUnit(network, weightMap, *s2u11->getOutput(0), 128, 1, true, "stage2_unit12");
+    auto s2u13 = resUnit(network, weightMap, *s2u12->getOutput(0), 128, 1, true, "stage2_unit13");
+
+    auto s3u1 = resUnit(network, weightMap, *s2u13->getOutput(0), 256, 2, false, "stage3_unit1");
+    auto s3u2 = resUnit(network, weightMap, *s3u1->getOutput(0), 256, 1, true, "stage3_unit2");
+    auto s3u3 = resUnit(network, weightMap, *s3u2->getOutput(0), 256, 1, true, "stage3_unit3");
+    auto s3u4 = resUnit(network, weightMap, *s3u3->getOutput(0), 256, 1, true, "stage3_unit4");
+    auto s3u5 = resUnit(network, weightMap, *s3u4->getOutput(0), 256, 1, true, "stage3_unit5");
+    auto s3u6 = resUnit(network, weightMap, *s3u5->getOutput(0), 256, 1, true, "stage3_unit6");
+    auto s3u7 = resUnit(network, weightMap, *s3u6->getOutput(0), 256, 1, true, "stage3_unit7");
+    auto s3u8 = resUnit(network, weightMap, *s3u7->getOutput(0), 256, 1, true, "stage3_unit8");
+    auto s3u9 = resUnit(network, weightMap, *s3u8->getOutput(0), 256, 1, true, "stage3_unit9");
+    auto s3u10 = resUnit(network, weightMap, *s3u9->getOutput(0), 256, 1, true, "stage3_unit10");
+    auto s3u11 = resUnit(network, weightMap, *s3u10->getOutput(0), 256, 1, true, "stage3_unit11");
+    auto s3u12 = resUnit(network, weightMap, *s3u11->getOutput(0), 256, 1, true, "stage3_unit12");
+    auto s3u13 = resUnit(network, weightMap, *s3u12->getOutput(0), 256, 1, true, "stage3_unit13");
+    auto s3u14 = resUnit(network, weightMap, *s3u13->getOutput(0), 256, 1, true, "stage3_unit14");
+    auto s3u15 = resUnit(network, weightMap, *s3u14->getOutput(0), 256, 1, true, "stage3_unit15");
+    auto s3u16 = resUnit(network, weightMap, *s3u15->getOutput(0), 256, 1, true, "stage3_unit16");
+    auto s3u17 = resUnit(network, weightMap, *s3u16->getOutput(0), 256, 1, true, "stage3_unit17");
+    auto s3u18 = resUnit(network, weightMap, *s3u17->getOutput(0), 256, 1, true, "stage3_unit18");
+    auto s3u19 = resUnit(network, weightMap, *s3u18->getOutput(0), 256, 1, true, "stage3_unit19");
+    auto s3u20 = resUnit(network, weightMap, *s3u19->getOutput(0), 256, 1, true, "stage3_unit20");
+    auto s3u21 = resUnit(network, weightMap, *s3u20->getOutput(0), 256, 1, true, "stage3_unit21");
+    auto s3u22 = resUnit(network, weightMap, *s3u21->getOutput(0), 256, 1, true, "stage3_unit22");
+    auto s3u23 = resUnit(network, weightMap, *s3u22->getOutput(0), 256, 1, true, "stage3_unit23");
+    auto s3u24 = resUnit(network, weightMap, *s3u23->getOutput(0), 256, 1, true, "stage3_unit24");
+    auto s3u25 = resUnit(network, weightMap, *s3u24->getOutput(0), 256, 1, true, "stage3_unit25");
+    auto s3u26 = resUnit(network, weightMap, *s3u25->getOutput(0), 256, 1, true, "stage3_unit26");
+    auto s3u27 = resUnit(network, weightMap, *s3u26->getOutput(0), 256, 1, true, "stage3_unit27");
+    auto s3u28 = resUnit(network, weightMap, *s3u27->getOutput(0), 256, 1, true, "stage3_unit28");
+    auto s3u29 = resUnit(network, weightMap, *s3u28->getOutput(0), 256, 1, true, "stage3_unit29");
+    auto s3u30 = resUnit(network, weightMap, *s3u29->getOutput(0), 256, 1, true, "stage3_unit30");
+
+    auto s4u1 = resUnit(network, weightMap, *s3u30->getOutput(0), 512, 2, false, "stage4_unit1");
+    auto s4u2 = resUnit(network, weightMap, *s4u1->getOutput(0), 512, 1, true, "stage4_unit2");
+    auto s4u3 = resUnit(network, weightMap, *s4u2->getOutput(0), 512, 1, true, "stage4_unit3");
+
+    auto bn1 = addBatchNorm2d(network, weightMap, *s4u3->getOutput(0), "bn1", 2e-5);
+    IFullyConnectedLayer* fc1 = network->addFullyConnected(*bn1->getOutput(0), 512, weightMap["pre_fc1_weight"], weightMap["pre_fc1_bias"]);
+    assert(fc1);
+    auto bn2 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5);
+
+    bn2->getOutput(0)->setName(OUTPUT_BLOB_NAME);
+    network->markOutput(*bn2->getOutput(0));
+
+    // Build engine
+    builder->setMaxBatchSize(maxBatchSize);
+    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
+#ifdef USE_FP16
+    config->setFlag(BuilderFlag::kFP16);
+#endif
+    std::cout << "Building engine, please wait for a while..." << std::endl;
+    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+    std::cout << "Build engine successfully!" << std::endl;
+
+    // Don't need the network any more
+    network->destroy();
+
+    // Release host memory
+    for (auto& mem : weightMap)
+    {
+        free((void*) (mem.second.values));
+    }
+
+    return engine;
+}
+
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
+    // Create builder
+    IBuilder* builder = createInferBuilder(gLogger);
+    IBuilderConfig* config = builder->createBuilderConfig();
+
+    // Create model to populate the network, then set the outputs and create an engine
+    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
+    assert(engine != nullptr);
+
+    // Serialize the engine
+    (*modelStream) = engine->serialize();
+
+    // Close everything down
+    engine->destroy();
+    builder->destroy();
+}
+
+void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
+    const ICudaEngine& engine = context.getEngine();
+
+    // Pointers to input and output device buffers to pass to engine.
+    // Engine requires exactly IEngine::getNbBindings() number of buffers.
+    assert(engine.getNbBindings() == 2);
+    void* buffers[2];
+
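+    // buffers[] is indexed by the engine's binding indices, so the device pointers are stored at the
+    // slots returned by getBindingIndex() below rather than assuming input is 0 and output is 1.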
+    // In order to bind the buffers, we need to know the names of the input and output tensors.
+    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
+    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
+
+    // Create GPU buffers on device
+    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
+
+    // Create stream
+    cudaStream_t stream;
+    CHECK(cudaStreamCreate(&stream));
+
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    cudaStreamSynchronize(stream);
+
+    // Release stream and buffers
+    cudaStreamDestroy(stream);
+    CHECK(cudaFree(buffers[inputIndex]));
+    CHECK(cudaFree(buffers[outputIndex]));
+}
+
+int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
+    DIR *p_dir = opendir(p_dir_name);
+    if (p_dir == nullptr) {
+        return -1;
+    }
+
+    struct dirent* p_file = nullptr;
+    while ((p_file = readdir(p_dir)) != nullptr) {
+        if (strcmp(p_file->d_name, ".") != 0 &&
+            strcmp(p_file->d_name, "..") != 0) {
+            //std::string cur_file_name(p_dir_name);
+            //cur_file_name += "/";
+            //cur_file_name += p_file->d_name;
+            std::string cur_file_name(p_file->d_name);
+            file_names.push_back(cur_file_name);
+        }
+    }
+
+    closedir(p_dir);
+    return 0;
+}
+
+int main(int argc, char** argv) {
+    cudaSetDevice(DEVICE);
+    // create a model using the API directly and serialize it to a stream
+    char *trtModelStream{nullptr};
+    size_t size{0};
+
+    if (argc == 2 && std::string(argv[1]) == "-s") {
+        IHostMemory* modelStream{nullptr};
+        APIToModel(256, &modelStream);
+        assert(modelStream != nullptr);
+        std::ofstream p("arcface-r100.engine", std::ios::binary);
+        if (!p) {
+            std::cerr << "could not open plan output file" << std::endl;
+            return -1;
+        }
+        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+        modelStream->destroy();
+        return 0;
+    } else if (argc == 2 && std::string(argv[1]) == "-d") {
+        std::ifstream file("arcface-r100.engine", std::ios::binary);
+        if (file.good()) {
+            file.seekg(0, file.end);
+            size = file.tellg();
+            file.seekg(0, file.beg);
+            trtModelStream = new char[size];
+            assert(trtModelStream);
+            file.read(trtModelStream, size);
+            file.close();
+        }
+    } else {
+        std::cerr << "arguments not right!" << std::endl;
+        std::cerr << "./arcface-r100 -s  // serialize model to plan file" << std::endl;
+        std::cerr << "./arcface-r100 -d  // deserialize plan file and run inference" << std::endl;
+        return -1;
+    }
+
+    // prepare input data ---------------------------
+    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
+    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
+    //    data[i] = 1.0;
+    static float prob[BATCH_SIZE * OUTPUT_SIZE];
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
+    assert(engine != nullptr);
+    IExecutionContext* context = engine->createExecutionContext();
+    assert(context != nullptr);
+    delete[] trtModelStream;
+
+    // BGR image -> normalized CHW input: (pixel - 127.5) * 0.0078125, matching insightface preprocessing
+    cv::Mat img = cv::imread("../joey0.ppm");
+    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
+        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
+        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
+    }
+
+    // Run inference
+    auto start = std::chrono::system_clock::now();
+    doInference(*context, data, prob, BATCH_SIZE);
+    auto end = std::chrono::system_clock::now();
+    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+
+    cv::Mat out(512, 1, CV_32FC1, prob);
+    cv::Mat out_norm;
+    cv::normalize(out, out_norm);
+
+    img = cv::imread("../joey1.ppm");
+    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+        data[i] = ((float)img.at<cv::Vec3b>(i)[2] - 127.5) * 0.0078125;
+        data[i + INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[1] - 127.5) * 0.0078125;
+        data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at<cv::Vec3b>(i)[0] - 127.5) * 0.0078125;
+    }
+
+    // Run inference
+    start = std::chrono::system_clock::now();
+    doInference(*context, data, prob, BATCH_SIZE);
+    end = std::chrono::system_clock::now();
+    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+
+    cv::Mat out1(1, 512, CV_32FC1, prob);
+    cv::Mat out_norm1;
+    cv::normalize(out1, out_norm1);
+
+    // cosine similarity of the two L2 normalized embeddings (1x512 * 512x1 -> 1x1)
+    cv::Mat res = out_norm1 * out_norm;
+
+    std::cout << "similarity score: " << *(float*)res.data << std::endl;
+
+    // Destroy the engine
+    context->destroy();
+    engine->destroy();
+    runtime->destroy();
+
+    //Print histogram of the output distribution
+    //std::cout << "\nOutput:\n\n";
+    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
+    //{
+    //    std::cout << p_out_norm[i] << ", ";
+    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
+    //}
+    //std::cout << std::endl;
+
+    return 0;
+}
\ No newline at end of file