Most gesture-recognition implementations found online are based on skin-color detection and convex-hull detection. Those algorithms run fast, but their biggest weakness is that they are very demanding about the background behind the hand: as soon as the background contains any skin-like color, the hand becomes hard to segment. They also reject interference poorly, and other body parts (such as the head) strongly affect the result. For these reasons, this article instead uses the 21 hand keypoints (indices 0-20) detected by OpenPose and compares the positions of these keypoints to each other, yielding a gesture-recognition algorithm with higher accuracy and stronger robustness to interference.
The OpenPose hand-keypoint detection result is shown in the figure:
The algorithm principle, illustrated with gestures "1" and "5":
Step 1: Take the first joint of each finger (keypoints 3, 7, 11, 15, 19) and the base of the palm (keypoint 0) as a boundary, and fit a detection circle around these points with cv2.minEnclosingCircle() (the blue circle in the figure);
Step 2: Compute the distance from each fingertip (keypoints 4, 8, 12, 16, 20, i.e. the tips of the thumb, index, middle, ring, and little finger) to the detection circle (the blue circle in the figure);
Step 3: Classify the gesture from the combination of the five distances. If only keypoint 8 lies outside the circle (the index finger is extended) while keypoints 4, 12, 16, and 20 all lie inside it (the other four fingers are curled), the gesture is "1", and the top-right corner of Figure 1 shows the result "this is one". If keypoints 4, 8, 12, 16, and 20 all lie outside the circle (all five fingers are extended), the gesture is "5", and the top-right corner of Figure 2 shows the result "this is five".
By extending the same idea to other combinations of keypoint positions relative to the circle, eight further gestures are implemented in addition to the two described above; the results are shown in the figures below, and a minimal sketch of the decision rule follows the figure caption.
The "raised middle finger" gesture
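To make the three steps above concrete, here is a minimal, self-contained sketch of the decision rule for just gestures "1" and "5". It assumes the 21 keypoints are already available as NumPy (x, y) arrays; the function name classify_one_vs_five and the margin_ratio parameter are illustrative choices, and the complete ROS node further below is the actual implementation.
import numpy as np
import cv2

def classify_one_vs_five(points, margin_ratio=0.1):
    # points: list of 21 (x, y) hand keypoints as NumPy int arrays, indexed as in OpenPose
    # Step 1: fit a circle around the first finger joints and the palm base
    boundary = np.array([points[i] for i in (0, 3, 7, 11, 15, 19)], dtype=np.int32)
    (cx, cy), radius = cv2.minEnclosingCircle(boundary)
    margin = radius * margin_ratio  # tolerance band around the circle boundary
    # Step 2: check whether each fingertip (thumb .. little finger) lies outside the circle
    outside = [np.linalg.norm(points[i] - np.array([cx, cy])) - radius >= margin
               for i in (4, 8, 12, 16, 20)]
    # Step 3: map the combination of the five flags to a gesture label
    if outside == [False, True, False, False, False]:
        return "this is one"   # only the index finger is extended
    if all(outside):
        return "this is five"  # all five fingers are extended
    return "can't find"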
Setup (based on the ROS robot framework):
1. Start the camera through the ROS framework.
2. Download the Caffe hand-keypoint model files.
3. Run the source code below to perform detection and display the result.
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from __future__ import print_function
import sys
import rospy
import time
import cv2
import math
from std_msgs.msg import String
from sensor_msgs.msg import Image
from cv_bridge import CvBridge, CvBridgeError
import numpy as np
from sensor_msgs.msg import CompressedImage
from geometry_msgs.msg import Twist, Vector3
# from base import Event
protoFile = "/home/xtark/catkin_ws/src/xtark_tutorials/scripts/pose_deploy.prototxt"  # must be an absolute path
weightsFile = "/home/xtark/catkin_ws/src/xtark_tutorials/scripts/pose_iter_102000.caffemodel"
nPoints = 22  # 21 hand keypoints plus one background channel
# Keypoint index pairs used to draw the skeleton lines
POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ]
threshold = 0.2  # minimum confidence for a keypoint to count as detected
# Load the pretrained Caffe model
net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile)
flag = np.array([-1, -1])  # sentinel value for an undetected keypoint
class image_converter:
    def __init__(self):
        # Subscriber for the raw camera image
        self.image_sub = rospy.Subscriber("/usb_cam/image_raw", Image, self.callback)
        # Publisher for the processed image
        self.image_pub = rospy.Publisher("/image_topic_2", Image, queue_size=3)
        # Converter between ROS image messages and OpenCV images
        self.bridge = CvBridge()
    def callback(self, data):
        # Convert the ROS image message to an OpenCV image
        try:
            frame = self.bridge.imgmsg_to_cv2(data, "bgr8")
        except CvBridgeError as e:
            print(e)
            return
        # Initialization
        frameWidth = frame.shape[1]
        frameHeight = frame.shape[0]
        aspect_ratio = frameWidth / frameHeight
        inHeight = 368
        # Network input width, scaled to preserve the aspect ratio
        inWidth = int(((aspect_ratio * inHeight) * 8) // 8)
        t = time.time()
        # Copy the current frame for annotation
        frameCopy = np.copy(frame)
        # blobFromImage converts the image into a network input blob
        inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), (0, 0, 0), swapRB=False, crop=False)
        net.setInput(inpBlob)
        # forward() runs network inference
        # The model outputs 22 heatmaps: 21 hand keypoints plus one background channel
        output = net.forward()
        # Measure inference time
        print("forward = {}".format(time.time() - t))
        # print(output)
        # Lists to store the detected keypoints
        points = []
        limit_points = []
        for i in range(nPoints):
            probMap = output[0, i, :, :]
            probMap = cv2.resize(probMap, (frameWidth, frameHeight))
            # print(probMap)
            # Find the location with the highest confidence
            minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)
            # print(prob)
            if prob > threshold:
                cv2.circle(frameCopy, (int(point[0]), int(point[1])), 6, (0, 255, 255), thickness=-1, lineType=cv2.FILLED)
                cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, .8, (0, 0, 255), 2, lineType=cv2.LINE_AA)
                points.append(np.array([int(point[0]), int(point[1])]))
            else:
                points.append(flag)
        # Collect the points that bound the fitted circle
        limit = np.concatenate(([points[0]], [points[3]], [points[7]], [points[11]], [points[15]], [points[19]]), axis=0)
        # Discard undetected (sentinel) points
        for i in limit:
            if not (i == flag).all():
                limit_points.append(i)
        limit_points = np.array(limit_points)
        # print(limit_points)
        # print(tuple(points[8]))
        # Alternative: polygon-based inside/outside test
        # result_one = cv2.pointPolygonTest(limit_points, tuple(points[8]), False)
        # print(result_one)
        # epsilon = 0.02 * cv2.arcLength(limit_points, True)
        # approx = cv2.approxPolyDP(limit_points, epsilon, True)
        # frameCopy = cv2.drawContours(frameCopy, [approx], 0, (255, 0, 0), 2)
        # Guard against frames where no boundary point was detected
        if len(limit_points) == 0:
            return
        # Fit the minimum enclosing circle
        (x, y), radius = cv2.minEnclosingCircle(limit_points)
        # print(x, y, radius)
        center = (int(x), int(y))
        radius = int(radius)
        flag_distance = radius * 0.1  # tolerance margin: 10% of the radius
        # print(points[4], points[8], points[12], points[16], points[20])
        # Compute the distance from each fingertip to the circle boundary
        # (the distance is 0 when the fingertip was not detected)
        distances = []
        for idx in (4, 8, 12, 16, 20):
            if (points[idx] != flag).all():
                distances.append(np.linalg.norm(points[idx] - center) - radius)
            else:
                distances.append(0)
        distance_4, distance_8, distance_12, distance_16, distance_20 = distances
        # print(distance_4, distance_8, distance_12, distance_16, distance_20, flag_distance)
        cv2.circle(frameCopy, center, radius, (255, 0, 0), 2)
        # print([distance_4, distance_12, distance_16, distance_20])
        # Infer the gesture type from the fingertip-to-circle distances
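        # Finger-state table implemented by the branches below
        # (T = fingertip outside the circle, i.e. finger extended;
        #  ? = not checked by that branch):
        #   thumb index middle ring little -> gesture
        #     F     T     F     F     F   -> one
        #     F     T     T     F     F   -> two
        #     F     T     T     T     T   -> four
        #     T     T     T     T     T   -> five
        #     F     F     F     F     F   -> fist
        #     T     T     F     F     F   -> eight
        #     T     F     ?     F     F   -> good
        #     F     F     T     F     F   -> out
        #     F     ?     T     T     ?   -> OK
        #     F     ?     ?     T     T   -> love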
        if distance_8 >= flag_distance and (np.array([distance_4, distance_12, distance_16, distance_20]) < flag_distance).all():
            result = "this is one"
        elif (np.array([distance_8, distance_12]) >= flag_distance).all() and (np.array([distance_4, distance_16, distance_20]) < flag_distance).all():
            result = "this is two"
        elif (np.array([distance_8, distance_12, distance_16, distance_20]) >= flag_distance).all() and distance_4 < flag_distance:
            result = "this is four"
        elif (np.array([distance_4, distance_8, distance_12, distance_16, distance_20]) >= flag_distance).all():
            result = "this is five"
        elif (np.array([distance_4, distance_8, distance_12, distance_16, distance_20]) <= flag_distance).all():
            result = "this is fist"
        elif (np.array([distance_4, distance_8]) >= flag_distance).all() and (np.array([distance_12, distance_16, distance_20]) < flag_distance).all():
            result = "this is eight"
        elif distance_4 >= flag_distance and (np.array([distance_8, distance_16, distance_20]) < flag_distance).all():
            result = "this is good"
        elif distance_12 >= flag_distance and (np.array([distance_4, distance_8, distance_16, distance_20]) < flag_distance).all():
            result = "this is out"
        elif (np.array([distance_12, distance_16]) >= flag_distance).all() and distance_4 < flag_distance:
            result = "this is OK"
        elif (np.array([distance_16, distance_20]) >= flag_distance).all() and distance_4 < flag_distance:
            result = "this is love"
        else:
            result = "can't find"
        # Draw the gesture name on the image
        cv2.putText(frameCopy, result, (430, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2, lineType=cv2.LINE_AA)
        # Draw the skeleton, skipping pairs with an undetected keypoint
        for pair in POSE_PAIRS:
            partA = pair[0]
            partB = pair[1]
            if (points[partA] == flag).all() or (points[partB] == flag).all():
                continue
            cv2.line(frame, tuple(points[partA]), tuple(points[partB]), (0, 255, 255), 2, lineType=cv2.LINE_AA)
            cv2.circle(frame, tuple(points[partA]), 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
            cv2.circle(frame, tuple(points[partB]), 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
        try:
            # "bgr8" publishes a color image; "8UC1" would be grayscale
            self.image_pub.publish(self.bridge.cv2_to_imgmsg(frame, "bgr8"))
        except CvBridgeError as e:
            print(e)
if __name__ == '__main__':
    rospy.init_node('image_converter', anonymous=True)
    ic = image_converter()
    print('start')
    try:
        rospy.spin()
    except rospy.ROSInterruptException:
        print('exception')
Known issue:
Because this algorithm reads the camera on a Raspberry Pi and runs the core code in a Linux virtual machine under Windows 10, there is no GPU acceleration anywhere in the pipeline. Detection is therefore slow (roughly one frame every 3 seconds), so real-time detection is not achieved (a 3-second stutter per frame is rather painful). If someone runs this code with GPU acceleration, real-time detection should work quite well; a sketch of how to move the network onto the GPU follows.
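If an OpenCV build with CUDA-enabled DNN support (version 4.2 or newer) is available, the network can be moved onto the GPU with two calls right after the model is loaded. This is a sketch under that assumption and has not been tested in this project:
net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile)
# Assumes OpenCV >= 4.2 compiled with CUDA DNN support;
# other builds will raise an error or fall back to the CPU.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)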