TensorRT Construction
给出一个最简单的创建并运行的过程。
Procedure
- Create a global object
ILogger
ILogger
是个抽象类,需要派生后使用。官方 API 中给了例子。具体使用可以从 Code Sample 中拷贝。
- Create object of type
IBuilder
// Create the builder from the app-wide logger instance (Logger must derive from ILogger).
IBuilder* builder = createInferBuilder(Logger);
- Create object of type
IBuilderConfig
// Builder configuration: holds build-time options (workspace size, precision flags, ...).
IBuilderConfig* config = builder->createBuilderConfig();
- Create object of type
INetworkDefinition
// default creation (implicit-batch mode: flags = 0, batch dim is supplied at runtime)
INetworkDefinition* network = builder->createNetworkV2(0U);
// explicit batch (the batch size is part of the tensor dimensions; pick ONE of the two variants)
INetworkDefinition* network = builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
-
Network Definition
-
Create Engine
// Build the optimized engine from the network + config.
// NOTE(review): buildEngineWithConfig is deprecated in newer TensorRT (8.x suggests
// buildSerializedNetwork) — confirm against the TensorRT version in use.
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
- Create Execution Context
// An execution context holds per-invocation state; one engine can back several contexts.
IExecutionContext* context = engine->createExecutionContext();
- Do Inference in the context
下面给出的例子是异步的。(Asynchronous execution)
// Device-side input/output buffers, indexed by the engine's binding slots.
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than ICudaEngine::getNbBindings()
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// Create GPU buffers on device
CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
// Create stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context->enqueue(batchSize, buffers, stream, nullptr);
CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
// Block until all work queued on the stream has completed.
// FIX: check the return status — the original left this (and cudaStreamDestroy
// below) unchecked, inconsistent with every other CUDA call in this snippet,
// so an async failure surfaced here would be silently dropped.
CUDA_CHECK(cudaStreamSynchronize(stream));
// Release stream and buffers
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));
简单来说三个步骤:拷贝数据到 Device
、context->enqueue()
执行、拷贝数据回 Host
。
Network Definition
// set the input (implicit-batch mode: shape is {C, H, W}; the batch dimension
// is supplied per-call via enqueue(batchSize, ...)).
// FIX: Dims3 holds exactly three dimensions — the original passed four
// initializers ({-1, 1, INPUT_H, INPUT_W}), which does not compile. With
// implicit batch the leading -1 must not appear; use {1, INPUT_H, INPUT_W}.
auto data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
// add layers: conv(20 filters, 5x5) -> maxpool(2x2, stride 2) -> FC(500) -> ReLU -> softmax
auto conv1 = network->addConvolution(*data->getOutput(0), 20, DimsHW{5, 5}, weightMap["conv1filter"], weightMap["conv1bias"]);
conv1->setStride(DimsHW{1, 1});
auto pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
pool1->setStride(DimsHW{2, 2});
auto ip1 = network->addFullyConnected(*pool1->getOutput(0), 500, weightMap["ip1filter"], weightMap["ip1bias"]);
auto relu1 = network->addActivation(*ip1->getOutput(0), ActivationType::kRELU);
auto prob = network->addSoftMax(*relu1->getOutput(0));
// Name the output tensor so it can be looked up via getBindingIndex at inference time.
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
// set output
network->markOutput(*prob->getOutput(0));
- 输入的维数都是
Dims3
,可以隐式指定batchsize
。对于全连接层,若输入为 {C, H, W},则会变形成 {1, C*H*W} 后输入网络。 - 网络的输入和输出都是
ITensor
类。将ITensor
和字符串绑定,用于之后索引。
文中代码参考此处