利用Python实现KLT跟踪算法
NVIDIA 视觉编程接口(VPI: Vision Programming Interface)是 NVIDIA 的计算机视觉和图像处理软件库,使您能够实现在 NVIDIA Jetson 嵌入式设备和独立 GPU 上可用的不同硬件后端上加速的算法。
库中的一些算法包括过滤方法、透视扭曲、时间降噪、直方图均衡、立体视差和镜头失真校正。 VPI 提供易于使用的 Python 绑定以及 C++ API。
除了与 OpenCV 接口外,VPI 还能够与 PyTorch 和其他基于 Python 的库进行互操作。
下面的示例跟踪输入视频上的边界框,在每一帧上绘制它们并将结果保存在视频文件中。用户可以定义将用于处理的后端。
输出效果:
from __future__ import print_function
import sys
from argparse import ArgumentParser
import numpy as np
import cv2
import vpi
# Convert a colored input frame to grayscale (if needed)
# and then, if using PVA backend, convert it to 16-bit unsigned pixels;
# The converted frame is copied before wrapping it as a VPI image so
# later draws in the gray frame do not change the reference VPI image.
def convertFrameImage(inputFrame, backend):
    """Convert an OpenCV frame to gray and wrap it as a VPI image.

    A 3-channel frame is converted BGR->gray; anything else is assumed to
    already be single-channel.  For the PVA backend the pixels are widened
    to 16-bit unsigned (values stay in 0-255; no rescaling is performed).

    Returns a (gray ndarray, VPI image) pair.  The array is copied before
    wrapping, so drawing on the returned gray frame afterwards cannot
    mutate the VPI image.
    """
    isColor = inputFrame.ndim == 3 and inputFrame.shape[2] == 3
    grayFrame = cv2.cvtColor(inputFrame, cv2.COLOR_BGR2GRAY) if isColor else inputFrame
    if backend == vpi.Backend.PVA:
        # PVA accepts only 16-bit unsigned input; element range stays 0-255.
        grayFrame = grayFrame.astype(np.uint16)
    return grayFrame, vpi.asimage(grayFrame.copy())
# Write the input gray frame to output video with
# input bounding boxes and predictions
def writeOutput(outVideo, cvGray, inBoxes, inPreds, colors, backend):
    """Draw all non-lost bounding boxes on the gray frame and write it to outVideo.

    Parameters:
        outVideo: cv2.VideoWriter receiving the annotated BGR frame.
        cvGray:   grayscale frame; uint16 frames (PVA path) are narrowed to uint8.
        inBoxes:  VPI array of KLT tracked bounding boxes.
        inPreds:  VPI array of per-box prediction transforms (indexed like 3x3 matrices).
        colors:   per-box BGR colors, indexed as colors[0, i].
        backend:  selected vpi.Backend (not used inside this function).

    Exits the whole process with status 1 on any error.
    """
    try:
        # Narrow 16-bit gray (produced for the PVA backend) back to 8-bit
        if cvGray.dtype == np.uint16:
            cvGray = cvGray.astype(np.uint8)
        if cvGray.dtype != np.uint8:
            raise Exception('Input frame format must be grayscale, 8-bit unsigned')
        cvGrayBGR = cv2.cvtColor(cvGray, cv2.COLOR_GRAY2BGR)
        # Tracking the number of valid bounding boxes in the current frame
        numValidBoxes = 0
        # Draw the input bounding boxes considering the input predictions.
        # NOTE(review): only the inPreds lock binds a target (pred); inBoxes is
        # locked anonymously and its data fetched via .cpu() below — confirm
        # this matches the VPI rlock_cpu() contract.
        with inBoxes.rlock_cpu(), inPreds.rlock_cpu() as pred:
            # Array of bounding boxes (bbox) and predictions (pred)
            bbox = inBoxes.cpu().view(np.recarray)
            for i in range(inBoxes.size):
                if bbox[i].tracking_status == vpi.KLTTrackStatus.LOST:
                    # If the tracking status of the current bounding box is lost, skip it
                    continue
                # Gather information of the current (i) bounding box and prediction
                # Prediction scaling width, height and x, y
                predScaleWidth = pred[i][0, 0]
                predScaleHeight = pred[i][1, 1]
                predX = pred[i][0, 2]
                predY = pred[i][1, 2]
                # Bounding box scaling width, height and x, y and bbox width, height
                bboxScaleWidth = bbox[i].bbox.xform.mat3[0, 0]
                bboxScaleHeight = bbox[i].bbox.xform.mat3[1, 1]
                bboxX = bbox[i].bbox.xform.mat3[0, 2]
                bboxY = bbox[i].bbox.xform.mat3[1, 2]
                bboxWidth = bbox[i].bbox.width
                bboxHeight = bbox[i].bbox.height
                # Compute corrected x, y and width, height (w, h) by proper adding
                # bounding box and prediction x, y and by proper multiplying
                # bounding box w, h with its own scaling and prediction scaling
                x = bboxX + predX
                y = bboxY + predY
                w = bboxWidth * bboxScaleWidth * predScaleWidth
                h = bboxHeight * bboxScaleHeight * predScaleHeight
                # Start point and end point (truncated to int) for OpenCV drawing
                startPoint = tuple(np.array([x, y], dtype=int))
                endPoint = tuple(np.array([x, y], dtype=int) + np.array([w, h], dtype=int))
                # The color of the bounding box to be drawn
                bboxColor = tuple([ int(c) for c in colors[0, i] ])
                cv2.rectangle(cvGrayBGR, startPoint, endPoint, bboxColor, 2)
                # Incrementing the number of valid bounding boxes in the current frame
                numValidBoxes += 1
        print(' Valid: {:02d} boxes'.format(numValidBoxes))
        outVideo.write(cvGrayBGR)
    except Exception as e:
        print('Error while writing output video:\n', e, file=sys.stderr)
        exit(1)
# ----------------------------
# Parse command line arguments
# Build the CLI: positional backend choice, input video and box list file
parser = ArgumentParser()
parser.add_argument('backend', choices=['cpu','cuda','pva'],
                    help='Backend to be used for processing')
parser.add_argument('input',
                    help='Input video')
parser.add_argument('boxes',
                    help='Text file with bounding boxes description')
args = parser.parse_args()
# Map the command-line choice onto the matching VPI backend enum;
# argparse's `choices` already rejects anything outside this table.
backend = {
    'cpu': vpi.Backend.CPU,
    'cuda': vpi.Backend.CUDA,
    'pva': vpi.Backend.PVA,
}[args.backend]
# -----------------------------
# Open input and output videos
# Open the input video and create an output writer with matching size and fps
inVideo = cv2.VideoCapture(args.input)
frameWidth = int(inVideo.get(cv2.CAP_PROP_FRAME_WIDTH))
frameHeight = int(inVideo.get(cv2.CAP_PROP_FRAME_HEIGHT))
inSize = (frameWidth, frameHeight)
fps = inVideo.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*'MPEG')
# e.g. klt_python3_cuda.mp4 — same name the original expression built
outFileName = f'klt_python{sys.version_info[0]}_{args.backend}.mp4'
outVideo = cv2.VideoWriter(outFileName, fourcc, fps, inSize)
if not outVideo.isOpened():
    print("Error creating output video", file=sys.stderr)
    exit(1)
# -----------------------------
# Reading input bounding boxes
# All boxes is a dictionary of all bounding boxes to be tracked in the input video,
# where each value is a list of new bounding boxes to track at the frame indicated by its key
allBoxes = {
}
totalNumBoxes = 0
# Array capacity 0 means no restricted maximum number of bounding boxes
arrayCapacity = 0
if backend == vpi.Backend.PVA:
    # PVA requires 128 array capacity or maximum number of bounding boxes
    arrayCapacity = 128
with open(args.boxes) as f:
    # The input file (f) should have one bounding box per line as:
    # "startFrame bboxX bboxY bboxWidth bboxHeight"; e.g.: "61 547 337 14 11"
    for line in f:
        # split() (no argument) tolerates runs of spaces/tabs and strips the
        # trailing newline, unlike the fragile split(' ') + replace() combo
        fields = line.split()
        if not fields:
            # Skip blank lines instead of crashing on the 5-way unpack
            continue
        startFrame, x, y, w, h = [ float(v) for v in fields ]
        bb = (x, y, w, h)
        # Group boxes by the frame at which they start being tracked
        allBoxes.setdefault(startFrame, []).append(bb)
        totalNumBoxes += 1
        if totalNumBoxes == arrayCapacity:
            # Stop adding boxes if its total reached the array capacity
            break
curFrame = 0
# Use .get so a box list that starts after frame 0 doesn't raise KeyError
curNumBoxes = len(allBoxes.get(curFrame, []))
# ------------------------------------------------------------------------------
# Initialize VPI array with all input bounding boxes (same as C++ KLT sample)
# Unrestricted capacity (0) means the array holds exactly the boxes we read
if arrayCapacity == 0:
    arrayCapacity = totalNumBoxes
inBoxes = vpi.Array(arrayCapacity, vpi.Type.KLT_TRACKED_BOUNDING_BOX)
inBoxes.size = totalNumBoxes
with inBoxes.wlock_cpu():
    data = inBoxes.cpu().view(np.recarray)
    # Flatten every (x, y, w, h) tuple ordered by its starting frame and
    # fill one record per box while holding the CPU write lock
    orderedBoxes = (bb for frm in sorted(allBoxes.keys()) for bb in allBoxes[frm])
    for i, (x, y, w, h) in enumerate(orderedBoxes):
        # Identity in the scaling part (no scaling); the offset part of the
        # transform carries the box position x, y
        data[i].bbox.xform.mat3[0, 0] = 1
        data[i].bbox.xform.mat3[1, 1] = 1
        data[i].bbox.xform.mat3[2, 2] = 1
        data[i].bbox.xform.mat3[0, 2] = x
        data[i].bbox.xform.mat3[1, 2] = y
        # The record also stores the box width and height directly
        data[i].bbox.width = w
        data[i].bbox.height = h
        # Every box starts out tracked and in need of a template update
        data[i].tracking_status = vpi.KLTTrackStatus.TRACKED
        data[i].template_status = vpi.KLTTemplateStatus.UPDATE_NEEDED
#-------------------------------------------------------------------------------
# Generate random colors for bounding boxes equal to the C++ KLT sample
# One hue value per box; filled in by the OpenCV RNG below
hues = np.zeros((totalNumBoxes,), dtype=np.uint8)
if int(cv2.__version__.split('.')[0]) >= 3:
    # OpenCV >= 3: seed the global RNG and draw uniform hues in the 0-180 range
    cv2.setRNGSeed(1)
    hues = cv2.randu(hues, 0, 180)
else:
    # Random differs in OpenCV-2.4: go through the legacy cv2.cv RNG API,
    # which needs an old-style matrix (fromarray) instead of an ndarray
    rng = cv2.cv.RNG(1)
    hues = cv2.cv.fromarray(np.array([[ h for h in hues ]], dtype=np.uint8))
    cv2.cv.RandArr(rng, hues, cv2.cv.CV_RAND_UNI, 0, 180)
    hues = [ hues[0, i] for i in range(totalNumBoxes) ]
# Full-saturation, full-value HSV colors built from the hues, converted to BGR
# for OpenCV drawing; indexed later as colors[0, i]
colors = np.array([[ [int(h), 255, 255] for h in hues ]], dtype=np.uint8)
colors = cv2.cvtColor(colors, cv2.COLOR_HSV2BGR)
#-------------------------------------------------------------------------------
# Initialize the KLT Feature Tracker algorithm
# Load up first frame
# Load up first frame; it seeds the tracker's template image
validFrame, cvFrame = inVideo.read()
if not validFrame:
    print("Error reading first input frame", file=sys.stderr)
    exit(1)
# Convert OpenCV frame to gray returning also the VPI image for given backend
cvGray, imgTemplate = convertFrameImage(cvFrame, backend)
# Create the KLT Feature Tracker object using the backend specified by the user
klt = vpi.KLTFeatureTracker(imgTemplate, inBoxes, backend=backend)
#-------------------------------------------------------------------------------
# Main processing loop
# Per frame: write the annotated current frame first, then advance the video
# and run the tracker on the new frame.
while validFrame:
    print('Frame: {:04d} ; Total: {:02d} boxes ;'.format(curFrame, curNumBoxes), end='')
    # Adjust input boxes and predictions to the current number of boxes
    inPreds = klt.in_predictions()
    inPreds.size = curNumBoxes
    inBoxes.size = curNumBoxes
    # Write current frame to the output video
    writeOutput(outVideo, cvGray, inBoxes, inPreds, colors, backend)
    # Read next input frame
    curFrame += 1
    validFrame, cvFrame = inVideo.read()
    if not validFrame:
        break
    cvGray, imgReference = convertFrameImage(cvFrame, backend)
    # NOTE(review): outBoxes is never read; presumably calling the tracker also
    # updates inBoxes/inPreds in place — confirm against the VPI KLT docs.
    outBoxes = klt(imgReference)
    # Boxes whose start frame is the new current frame enlarge the working set.
    # (allBoxes keys are floats; `curFrame in allBoxes` still matches because
    # equal int/float values hash identically in Python.)
    if curFrame in allBoxes:
        curNumBoxes += len(allBoxes[curFrame])
outVideo.release()
# vim: ts=8:sw=4:sts=4:et:ai