TensorRT Construction
给出一个最简单的创建并运行的过程。
Procedure
- Create a global object
ILogger
ILogger
是个抽象类,需要派生后使用。官方 API 中给了例子。具体使用可以从 Code Sample 中拷贝。
- Create object of type
IBuilder
// Create the builder from the app-wide logger instance (Logger must derive from ILogger).
IBuilder* builder = createInferBuilder(Logger);
- Create object of type
IBuilderConfig
// Builder configuration: holds build-time options (workspace size, precision flags, ...).
IBuilderConfig* config = builder->createBuilderConfig();
- Create object of type
INetworkDefinition
// default creation (implicit-batch mode: flags = 0, batch dim is supplied at runtime)
INetworkDefinition* network = builder->createNetworkV2(0U);
// explicit batch (the batch size is part of the tensor dimensions; pick ONE of the two variants)
INetworkDefinition* network = builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
-
Network Definition
-
Create Engine
// Build the optimized engine from the network + config.
// NOTE(review): buildEngineWithConfig is deprecated in newer TensorRT (8.x suggests
// buildSerializedNetwork) — confirm against the TensorRT version in use.
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
- Create Execution Context
// An execution context holds per-invocation state; one engine can back several contexts.
IExecutionContext* context = engine->createExecutionContext();
- Do Inference in the context
下面给出的例子是异步的。(Asynchronous execution)
// Device-side input/output buffers, indexed by the engine's binding slots.
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than ICudaEngine::getNbBindings()
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// Create GPU buffers on device
CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
// Create stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context->enqueue(batchSize, buffers, stream, nullptr);
CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
// Block until all work queued on the stream has completed.
// FIX: check the return status — the original left this (and cudaStreamDestroy
// below) unchecked, inconsistent with every other CUDA call in this snippet,
// so an async failure surfaced here would be silently dropped.
CUDA_CHECK(cudaStreamSynchronize(stream));
// Release stream and buffers
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));
简单来说三个步骤:拷贝数据到 Device
、context->enqueue()
执行、拷贝数据回 Host
。
Network Definition
// set the input (implicit-batch mode: shape is {C, H, W}; the batch dimension
// is supplied per-call via enqueue(batchSize, ...)).
// FIX: Dims3 holds exactly three dimensions — the original passed four
// initializers ({-1, 1, INPUT_H, INPUT_W}), which does not compile. With
// implicit batch the leading -1 must not appear; use {1, INPUT_H, INPUT_W}.
auto data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
// add layers: conv(20 filters, 5x5) -> maxpool(2x2, stride 2) -> FC(500) -> ReLU -> softmax
auto conv1 = network->addConvolution(*data->getOutput(0), 20, DimsHW{5, 5}, weightMap["conv1filter"], weightMap["conv1bias"]);
conv1->setStride(DimsHW{1, 1});
auto pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
pool1->setStride(DimsHW{2, 2});
auto ip1 = network->addFullyConnected(*pool1->getOutput(0), 500, weightMap["ip1filter"], weightMap["ip1bias"]);
auto relu1 = network->addActivation(*ip1->getOutput(0), ActivationType::kRELU);
auto prob = network->addSoftMax(*relu1->getOutput(0));
// Name the output tensor so it can be looked up via getBindingIndex at inference time.
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
// set output
network->markOutput(*prob->getOutput(0));
- 输入的维数都是
Dims3
,可以隐式指定batchsize
。对于全连接层,若输入为 {C, H, W},则会变形成 {1, C*H*W} 后输入网络。 - 网络的输入和输出都是
ITensor
类。将ITensor
和字符串绑定,用于之后索引。
文中代码参考此处