Overview
1. Raspberry Pi environment setup
This won't be covered in detail: the system is Raspbian Buster with the default Python 3.7. Install numpy and opencv. If you want to use an Intel Neural Compute Stick for acceleration, you also need to install mvnc; tutorials for that are easy to find. The end goal is to run deep-learning person detection on the camera feed, use the detected bounding box to work out where the person is, and control the car accordingly.
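As a quick sanity check before going further, a minimal sketch like the following (assuming the packages were installed for the default Python 3.7) confirms that numpy, OpenCV and, if you use the stick, the mvnc bindings import correctly:

# environment check -- a minimal sketch, not part of the original project
import numpy as np
import cv2

print("numpy:", np.__version__)
print("opencv:", cv2.__version__)

try:
    from mvnc import mvncapi as mvnc  # only needed when using the Intel NCS
    print("NCS devices found:", len(mvnc.EnumerateDevices()))
except ImportError:
    print("mvnc not installed -- running without the Neural Compute Stick")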
2. About the car
The car used here is an omnidirectional three-wheeled platform controlled over a serial port: different serial commands make it move forward, move backward, or rotate clockwise/counter-clockwise. The Raspberry Pi acts as the controller and sends these serial commands to the car. In Python, simply import the serial package (pyserial) and you can send data over the serial port.
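A minimal sketch of opening the port and sending one command might look like this (the device name and baud rate are assumptions and must match your USB-serial adapter and car controller):

import serial

# /dev/ttyUSB0 and 115200 baud are assumptions -- use your adapter's settings
ser = serial.Serial("/dev/ttyUSB0", 115200, timeout=0.5)

# serial commands are raw bytes; this sends the "stop" frame used later in the article
ser.write(bytes.fromhex('ff fe 01 00 00 00 00 00 00 00'))
ser.close()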
3. Car control code
import serial

# open the serial port to the car controller (adjust port and baud rate as needed)
ser = serial.Serial("/dev/ttyUSB0", 115200, timeout=0.5)

def move_forward():
    forwardInput = 'ff fe 01 00 00 00 12 00 00 00'
    ser.write(bytes.fromhex(forwardInput))

def move_backward():
    backwardInput = 'ff fe 01 00 00 00 12 00 00 02'
    ser.write(bytes.fromhex(backwardInput))

def stop_car():
    stopInput = 'ff fe 01 00 00 00 00 00 00 00'
    ser.write(bytes.fromhex(stopInput))

def move_left():
    leftInput = 'ff fe 01 00 12 00 00 00 00 04'
    ser.write(bytes.fromhex(leftInput))

def move_right():
    rightInput = 'ff fe 01 00 12 00 00 00 00 00'
    ser.write(bytes.fromhex(rightInput))

def move_zpositive():
    # rotate about the vertical (z) axis, one direction
    zpositiveInput = 'ff fe 01 00 00 00 00 00 06 00'
    ser.write(bytes.fromhex(zpositiveInput))

def move_znegetive():
    # rotate about the vertical (z) axis, the opposite direction
    znegetiveInput = 'ff fe 01 00 00 00 00 00 06 01'
    ser.write(bytes.fromhex(znegetiveInput))
The exact serial commands will depend on the car you are using. Note that serial commands are generally sent as raw hexadecimal bytes, whereas Python works with strings by default, so a format conversion is required.
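For example, bytes.fromhex() converts the hexadecimal string into the raw bytes the port expects:

command = bytes.fromhex('ff fe 01 00 00 00 12 00 00 00')  # "forward" command from above
print(command)  # b'\xff\xfe\x01\x00\x00\x00\x12\x00\x00\x00'
ser.write(command)  # assumes `ser` was opened as shown earlier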
4. Full code
# USAGE
# python ncs_realtime_objectdetection.py --graph graphs/mobilenetgraph --display 1
# python ncs_realtime_objectdetection.py --graph graphs/mobilenetgraph --confidence 0.5 --display 1
# import the necessary packages
from mvnc import mvncapi as mvnc
from imutils.video import VideoStream
from imutils.video import FPS
import argparse
import numpy as np
import time
import cv2
import serial
# initialize the list of class labels our network was trained to
# detect, then generate a set of bounding box colors for each class
CLASSES = ("background", "aeroplane", "bicycle", "bird",
"boat", "bottle", "bus", "car", "cat", "chair", "cow",
"diningtable", "dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor")
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
# frame dimensions should be square
PREPROCESS_DIMS = (300, 300)
DISPLAY_DIMS = (900, 900)
# calculate the multiplier needed to scale the bounding boxes
DISP_MULTIPLIER = DISPLAY_DIMS[0] // PREPROCESS_DIMS[0]
ser = serial.Serial("/dev/ttyUSB0", 115200, timeout = 0.5)
def move_forward():
    forwardInput = 'ff fe 01 00 00 00 12 00 00 00'
    ser.write(bytes.fromhex(forwardInput))

def move_backward():
    backwardInput = 'ff fe 01 00 00 00 12 00 00 02'
    ser.write(bytes.fromhex(backwardInput))

def stop_car():
    stopInput = 'ff fe 01 00 00 00 00 00 00 00'
    ser.write(bytes.fromhex(stopInput))

def move_left():
    leftInput = 'ff fe 01 00 12 00 00 00 00 04'
    ser.write(bytes.fromhex(leftInput))

def move_right():
    rightInput = 'ff fe 01 00 12 00 00 00 00 00'
    ser.write(bytes.fromhex(rightInput))

def move_zpositive():
    zpositiveInput = 'ff fe 01 00 00 00 00 00 06 00'
    ser.write(bytes.fromhex(zpositiveInput))

def move_znegetive():
    znegetiveInput = 'ff fe 01 00 00 00 00 00 06 01'
    ser.write(bytes.fromhex(znegetiveInput))

def preprocess_image(input_image):
    # preprocess the image
    preprocessed = cv2.resize(input_image, PREPROCESS_DIMS)
    preprocessed = preprocessed - 127.5
    preprocessed = preprocessed * 0.007843
    preprocessed = preprocessed.astype(np.float16)

    # return the image to the calling function
    return preprocessed
def predict(image, graph):
    # preprocess the image
    image = preprocess_image(image)

    # send the image to the NCS and run a forward pass to grab the
    # network predictions
    graph.LoadTensor(image, None)
    (output, _) = graph.GetResult()

    # grab the number of valid object predictions from the output,
    # then initialize the list of predictions
    num_valid_boxes = output[0]
    predictions = []

    # loop over results
    for box_index in range(int(num_valid_boxes)):
        # calculate the base index into our array so we can extract
        # bounding box information
        base_index = 7 + box_index * 7

        # boxes with non-finite (inf, nan, etc) numbers must be ignored
        if (not np.isfinite(output[base_index]) or
                not np.isfinite(output[base_index + 1]) or
                not np.isfinite(output[base_index + 2]) or
                not np.isfinite(output[base_index + 3]) or
                not np.isfinite(output[base_index + 4]) or
                not np.isfinite(output[base_index + 5]) or
                not np.isfinite(output[base_index + 6])):
            continue

        # extract the image width and height and clip the boxes to the
        # image size in case network returns boxes outside of the image
        # boundaries
        (h, w) = image.shape[:2]
        x1 = max(0, int(output[base_index + 3] * w))
        y1 = max(0, int(output[base_index + 4] * h))
        x2 = min(w, int(output[base_index + 5] * w))
        y2 = min(h, int(output[base_index + 6] * h))

        # grab the prediction class label, confidence (i.e., probability),
        # and bounding box (x, y)-coordinates
        pred_class = int(output[base_index + 1])
        pred_conf = output[base_index + 2]
        pred_boxpts = ((x1, y1), (x2, y2))

        # create prediction tuple and append the prediction to the
        # predictions list
        prediction = (pred_class, pred_conf, pred_boxpts)
        predictions.append(prediction)

    # return the list of predictions to the calling function
    return predictions
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-g", "--graph", required=True,
help="path to input graph file")
ap.add_argument("-c", "--confidence", default=.5,
help="confidence threshold")
ap.add_argument("-d", "--display", type=int, default=0,
help="switch to display image on screen")
args = vars(ap.parse_args())
# grab a list of all NCS devices plugged in to USB
print("[INFO] finding NCS devices...")
devices = mvnc.EnumerateDevices()
# if no devices found, exit the script
if len(devices) == 0:
    print("[INFO] No devices found. Please plug in a NCS")
    quit()
# use the first device since this is a simple test script
# (you'll want to modify this if using multiple NCS devices)
print("[INFO] found {} devices. device0 will be used. "
    "opening device0...".format(len(devices)))
device = mvnc.Device(devices[0])
device.OpenDevice()
# open the CNN graph file
print("[INFO] loading the graph file into RPi memory...")
with open(args["graph"], mode="rb") as f:
graph_in_memory = f.read()
# load the graph into the NCS
print("[INFO] allocating the graph on the NCS...")
graph = device.AllocateGraph(graph_in_memory)
# open a pointer to the video stream thread and allow the buffer to
# start to fill, then start the FPS counter
print("[INFO] starting the video stream and FPS counter...")
vs = VideoStream(usePiCamera=False).start()
time.sleep(1)
fps = FPS().start()
# loop over frames from the video file stream
while True:
    try:
        # grab the frame from the threaded video stream
        # make a copy of the frame and resize it for display/video purposes
        frame = vs.read()
        image_for_result = frame.copy()
        image_for_result = cv2.resize(image_for_result, DISPLAY_DIMS)

        # use the NCS to acquire predictions
        predictions = predict(frame, graph)

        # loop over our predictions
        for (i, pred) in enumerate(predictions):
            # extract prediction data for readability
            (pred_class, pred_conf, pred_boxpts) = pred

            # filter out weak detections by ensuring the `confidence`
            # is greater than the minimum confidence
            #if pred_conf > args["confidence"]:
            if CLASSES[pred_class] == 'person':
                area = (pred_boxpts[1][0] - pred_boxpts[0][0]) * (pred_boxpts[1][1] - pred_boxpts[0][1])
                if ((pred_boxpts[0][0] + pred_boxpts[1][0]) // 2) - 150 > 20:
                    move_zpositive()
                elif ((pred_boxpts[0][0] + pred_boxpts[1][0]) // 2) - 150 < -20:
                    move_znegetive()
                elif area < 150 * 150:
                    move_forward()
                elif area > 250 * 250:
                    move_backward()
                else:
                    stop_car()
                #print(area)
                #print(((pred_boxpts[0][0] + pred_boxpts[1][0]) // 2) - 150)

                # print prediction to terminal
                #print("[INFO] Prediction #{}: class={}, confidence={}, "
                #    "boxpoints={}".format(i, CLASSES[pred_class], pred_conf,
                #    pred_boxpts))

                # check if we should show the prediction data
                # on the frame
                if args["display"] > 0:
                    # build a label consisting of the predicted class and
                    # associated probability
                    label = "{}: {:.2f}%".format(CLASSES[pred_class],
                        pred_conf * 100)

                    # extract information from the prediction boxpoints
                    (ptA, ptB) = (pred_boxpts[0], pred_boxpts[1])
                    ptA = (ptA[0] * DISP_MULTIPLIER, ptA[1] * DISP_MULTIPLIER)
                    ptB = (ptB[0] * DISP_MULTIPLIER, ptB[1] * DISP_MULTIPLIER)
                    (startX, startY) = (ptA[0], ptA[1])
                    y = startY - 15 if startY - 15 > 15 else startY + 15

                    # display the rectangle and label text
                    cv2.rectangle(image_for_result, ptA, ptB,
                        COLORS[pred_class], 2)
                    cv2.putText(image_for_result, label, (startX, y),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, COLORS[pred_class], 3)

        # check if we should display the frame on the screen
        # with prediction data (you can achieve faster FPS if you
        # do not output to the screen)
        if args["display"] > 0:
            # display the frame to the screen
            cv2.imshow("Output", image_for_result)
            key = cv2.waitKey(1) & 0xFF

            # if the `q` key was pressed, break from the loop
            if key == ord("q"):
                break

        # update the FPS counter
        fps.update()

    # if "ctrl+c" is pressed in the terminal, break from the loop
    except KeyboardInterrupt:
        break

    # if there's a problem reading a frame, break gracefully
    except AttributeError:
        break
# stop the FPS counter timer
fps.stop()
# destroy all windows if we are displaying them
if args["display"] > 0:
cv2.destroyAllWindows()
# stop the video stream
vs.stop()
# clean up the graph and device
graph.DeallocateGraph()
device.CloseDevice()
# display FPS information
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))
Loading and invoking the deep-learning model itself needs no further explanation. To follow a pedestrian, the key part is the detection-filtering logic inside the main loop: if the detected class is person, then from the bounding box pred_boxpts we can compute the horizontal offset between the box centre and the centre of the frame, which controls the car's rotation. For the forward/backward direction, we use the area of the bounding box: when the person moves away from the camera the box area necessarily shrinks, so the car should drive forward, and vice versa.
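That decision logic can be distilled into a small helper (a restatement of the loop above for clarity, not extra functionality; the 20-pixel dead band and the 150x150 / 250x250 area thresholds come from the script, and the constant 150 assumes the 300-pixel-wide preprocessed frame):

def decide_motion(box):
    # box = ((x1, y1), (x2, y2)) in the 300x300 preprocessed frame
    (x1, y1), (x2, y2) = box
    center_offset = (x1 + x2) // 2 - 150   # horizontal deviation from the frame centre
    area = (x2 - x1) * (y2 - y1)           # rough proxy for distance to the person
    if center_offset > 20:
        move_zpositive()                   # rotate towards the person (one direction)
    elif center_offset < -20:
        move_znegetive()                   # rotate the other way
    elif area < 150 * 150:
        move_forward()                     # person is far away -> drive closer
    elif area > 250 * 250:
        move_backward()                    # person is too close -> back off
    else:
        stop_car()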
Although this approach is simple, it works reasonably well in practice. The remaining parts of the code are heavily commented, so they won't be described further. Obstacle avoidance and other features may be added later.
Note: this project uses a first-generation Neural Compute Stick, but the stick is not required; you can run an equivalent TensorFlow example directly. The focus here is on how to control the car based on the detected person.
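If you skip the stick, one CPU-only option (a sketch, not part of the original project, and a different route from the TensorFlow example mentioned above) is to run the same Caffe MobileNet-SSD through OpenCV's dnn module; the prototxt/caffemodel file names below are assumptions:

import cv2

# hypothetical file names -- substitute your own MobileNet-SSD model files
net = cv2.dnn.readNetFromCaffe("MobileNetSSD_deploy.prototxt",
    "MobileNetSSD_deploy.caffemodel")

def predict_cpu(frame):
    # same preprocessing as the NCS path: 300x300, scale 0.007843, mean 127.5
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)),
        0.007843, (300, 300), 127.5)
    net.setInput(blob)
    # output shape (1, 1, N, 7): [_, class_id, confidence, x1, y1, x2, y2]
    return net.forward()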
Finally, the complete project:
Link: Baidu Netdisk
Extraction code: 1u4r