2021-09-22

python基于yolov3实现的手势控制音乐播放器

效果演示

话不多说,先上最后的成品展示。

<iframe allowfullscreen="true" data-mediaembed="bilibili" id="wHRrGiiU-1632303429184" src="https://player.bilibili.com/player.html?aid=293127737"></iframe>

python基于yolov3实现的手势控制音乐播放器

总体框架

最先我设想的很简单,整个工程分为两个模块,手势识别模块和音乐播放器模块,于是就有了如下框架图。
2021-09-22
但是后来我发现GTX960上跑这个程序似乎有点力不从心,所以就改用两台电脑来跑程序。音乐播放器在一台电脑A上,手势识别模块在另一台电脑B上,两台电脑通过TCP连接:电脑A运行音乐播放器,并将采集到的手势图像数据传给B,电脑B识别出结果后发送给A,A上的音乐播放器再做出相应的响应。其实就相当于把B当作服务器。

手势识别模块

整个手势识别模块我是基于yolov3做的,一共做了数字1~5这5种手势的识别。
因为是做毕设,数据集的收集没有很严谨:用python基于opencv调用摄像头,自己在电脑前比划手势,每3秒保存一次录像中的图片,得到每种手势大概200+的图片,五种手势一共1200+张图片。
2021-09-22
在GTX960上的识别效果如下,还是蛮流畅的。
2021-09-22
识别代码

import cv2 #include <stdio.h>
import os
import shutil
import numpy as np #defind xx as xx
import tensorflow as tf
import imutils
from time import * #improt time
from PIL import Image
import core.utils as utils
from core.config import cfg
from core.yolov3 import YOLOV3


class YoloTest(object):
    """Inference wrapper around a YOLOv3 graph restored from a checkpoint.

    All settings are read from ``cfg`` (``core.config``). The graph and the
    TensorFlow session are created once in ``__init__`` and reused by every
    ``predict`` call.
    """

    def __init__(self):
        """Read the configuration, build the network and restore its weights."""
        self.input_size       = cfg.TEST.INPUT_SIZE
        self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE
        self.classes          = utils.read_class_names(cfg.YOLO.CLASSES)
        self.num_classes      = len(self.classes)
        self.anchors          = np.array(utils.get_anchors(cfg.YOLO.ANCHORS))
        self.score_threshold  = cfg.TEST.SCORE_THRESHOLD
        self.iou_threshold    = cfg.TEST.IOU_THRESHOLD
        self.moving_ave_decay = cfg.YOLO.MOVING_AVE_DECAY
        self.annotation_path  = cfg.TEST.ANNOT_PATH
        self.weight_file      = cfg.TEST.WEIGHT_FILE
        self.write_image      = cfg.TEST.WRITE_IMAGE
        self.write_image_path = cfg.TEST.WRITE_IMAGE_PATH
        self.show_label       = cfg.TEST.SHOW_LABEL

        # Placeholders fed on every predict() call.
        with tf.name_scope('input'):
            self.input_data = tf.placeholder(dtype=tf.float32, name='input_data')
            self.trainable  = tf.placeholder(dtype=tf.bool,    name='trainable')

        model = YOLOV3(self.input_data, self.trainable)
        self.pred_sbbox = model.pred_sbbox
        self.pred_mbbox = model.pred_mbbox
        self.pred_lbbox = model.pred_lbbox

        # The saver is built from the moving-average name mapping, so the
        # checkpoint's averaged values are what gets loaded into the graph.
        with tf.name_scope('ema'):
            ema_obj = tf.train.ExponentialMovingAverage(self.moving_ave_decay)

        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        self.saver = tf.train.Saver(ema_obj.variables_to_restore())
        self.saver.restore(self.sess, self.weight_file)

    def predict(self, image):
        """Run the detector on one frame and return the surviving boxes.

        Args:
            image: frame as an array whose shape is (height, width, channels).

        Returns:
            Whatever ``utils.nms`` returns for the post-processed predictions
            (presumably one row per detection - verify against core.utils).

        Raises:
            ValueError: if ``image`` is None or is not a 3-dimensional array.
        """
        if image is None:
            raise ValueError('predict() needs an image, got None')

        # Only the original size is needed later, so read it directly instead
        # of copying the whole frame.
        org_h, org_w, _ = image.shape

        # Resize/pad to the square network input, then add a leading batch axis.
        image_data = utils.image_preporcess(image, [self.input_size, self.input_size])
        image_data = image_data[np.newaxis, ...]

        pred_sbbox, pred_mbbox, pred_lbbox = self.sess.run(
            [self.pred_sbbox, self.pred_mbbox, self.pred_lbbox],
            feed_dict={
                self.input_data: image_data,
                self.trainable: False
            }
        )

        # Flatten the three output tensors to rows of (5 + num_classes) values
        # and stack them into one array.
        row_width = 5 + self.num_classes
        pred_bbox = np.concatenate(
            [np.reshape(pred, (-1, row_width))
             for pred in (pred_sbbox, pred_mbbox, pred_lbbox)],
            axis=0)
        bboxes = utils.postprocess_boxes(pred_bbox, (org_h, org_w), self.input_size, self.score_threshold)
        bboxes = utils.nms(bboxes, self.iou_threshold)

        return bboxes


# ----------------------------------------------------------------------------------------------------------------------


if __name__ == '__main__':
    # Live demo: detect on every webcam frame and overlay the average FPS.
    yolo = YoloTest()
    vs = cv2.VideoCapture(0)
    count = 0
    start_time = time()
    try:
        while True:
            grabbed, frame = vs.read()
            if not grabbed:
                # No frame delivered (camera missing or stream ended): stop
                # instead of crashing on a None frame.
                break
            frame = imutils.resize(frame, width=720)
            bboxes = yolo.predict(frame)
            frame = utils.draw_bbox(frame, bboxes)  # draw the detection boxes
            elapsed = time() - start_time
            count += 1

            # Average FPS since start-up; guard against a zero interval.
            fps = count / elapsed if elapsed > 0 else 0.0
            cv2.putText(frame, "fps: %0.2f" % fps, (20, 30), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 0), 1)
            cv2.imshow("", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always give the camera back and close the preview window.
        vs.release()
        cv2.destroyAllWindows()

在静态识别的基础上,我继续做了一个简单的手势移动的动态识别和轨迹跟踪。
2021-09-22
动态识别的代码

import cv2
import os
import shutil
import numpy as np
import tensorflow as tf
import core.utils as utils
from core.config import cfg
from core.yolov3 import YOLOV3
from PIL import Image
import imutils
from time import *
from collections import deque


class YoloTest(object):
    """Inference wrapper around a YOLOv3 graph restored from a checkpoint.

    All settings are read from ``cfg`` (``core.config``). The graph and the
    TensorFlow session are created once in ``__init__`` and reused by every
    ``predict`` call.
    """

    def __init__(self):
        """Read the configuration, build the network and restore its weights."""
        self.input_size = cfg.TEST.INPUT_SIZE
        self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE
        self.classes = utils.read_class_names(cfg.YOLO.CLASSES)
        self.num_classes = len(self.classes)
        self.anchors = np.array(utils.get_anchors(cfg.YOLO.ANCHORS))
        self.score_threshold = cfg.TEST.SCORE_THRESHOLD
        self.iou_threshold = cfg.TEST.IOU_THRESHOLD
        self.moving_ave_decay = cfg.YOLO.MOVING_AVE_DECAY
        self.annotation_path = cfg.TEST.ANNOT_PATH
        self.weight_file = cfg.TEST.WEIGHT_FILE
        self.write_image = cfg.TEST.WRITE_IMAGE
        self.write_image_path = cfg.TEST.WRITE_IMAGE_PATH
        self.show_label = cfg.TEST.SHOW_LABEL

        # Placeholders fed on every predict() call.
        with tf.name_scope('input'):
            self.input_data = tf.placeholder(dtype=tf.float32, name='input_data')
            self.trainable  = tf.placeholder(dtype=tf.bool,    name='trainable')

        model = YOLOV3(self.input_data, self.trainable)
        self.pred_sbbox = model.pred_sbbox
        self.pred_mbbox = model.pred_mbbox
        self.pred_lbbox = model.pred_lbbox

        # The saver is built from the moving-average name mapping, so the
        # checkpoint's averaged values are what gets loaded into the graph.
        with tf.name_scope('ema'):
            ema_obj = tf.train.ExponentialMovingAverage(self.moving_ave_decay)

        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        self.saver = tf.train.Saver(ema_obj.variables_to_restore())
        self.saver.restore(self.sess, self.weight_file)

    def predict(self, image):
        """Run the detector on one frame and return the surviving boxes.

        Args:
            image: frame as an array whose shape is (height, width, channels).

        Returns:
            Whatever ``utils.nms`` returns for the post-processed predictions
            (presumably one row per detection - verify against core.utils).

        Raises:
            ValueError: if ``image`` is None or is not a 3-dimensional array.
        """
        if image is None:
            raise ValueError('predict() needs an image, got None')

        # Only the original size is needed later, so read it directly instead
        # of copying the whole frame.
        org_h, org_w, _ = image.shape

        # Resize/pad to the square network input, then add a leading batch axis.
        image_data = utils.image_preporcess(image, [self.input_size, self.input_size])
        image_data = image_data[np.newaxis, ...]

        pred_sbbox, pred_mbbox, pred_lbbox = self.sess.run(
            [self.pred_sbbox, self.pred_mbbox, self.pred_lbbox],
            feed_dict={
                self.input_data: image_data,
                self.trainable: False
            }
        )

        # Flatten the three output tensors to rows of (5 + num_classes) values
        # and stack them into one array.
        row_width = 5 + self.num_classes
        pred_bbox = np.concatenate(
            [np.reshape(pred, (-1, row_width))
             for pred in (pred_sbbox, pred_mbbox, pred_lbbox)],
            axis=0)
        bboxes = utils.postprocess_boxes(pred_bbox, (org_h, org_w), self.input_size, self.score_threshold)
        bboxes = utils.nms(bboxes, self.iou_threshold)

        return bboxes


# ----------------------------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    # Live demo: track the centre of the first detection and label the
    # direction in which it has moved over the last few frames.

    yolo = YoloTest()
    vs = cv2.VideoCapture(0)
    # pts[0] holds the most recent box centres (at most 10) as (x, y) tuples.
    pts = [deque(maxlen=10) for _ in range(1)]

    try:
        while True:
            ret, frame = vs.read()
            if not ret:
                break
            # Any positive flip code mirrors around the vertical axis; 1 is
            # the conventional spelling of the original value 180.
            frame = cv2.flip(frame, 1)
            frame = imutils.resize(frame, width=720)
            frame = frame[70:500, 360:720]  # keep only the region of interest
            bboxes = yolo.predict(frame)

            # ----------------------------------------------------------------
            # Extend the trail while the first box has a score above 0.6;
            # otherwise start a fresh trail.
            if len(bboxes) > 0 and bboxes[0][4] > 0.6:
                cx = int((bboxes[0][0] + bboxes[0][2]) / 2)
                cy = int((bboxes[0][1] + bboxes[0][3]) / 2)
                cv2.circle(frame, (cx, cy), 1, (0, 0, 255), 5)
                pts[0].append((cx, cy))
            else:
                pts[0].clear()

            # ----------------------------------------------------------------
            track = pts[0]

            # Draw the trail as connected segments.
            for j in range(1, len(track)):
                cv2.line(frame, track[j - 1], track[j], (255, 0, 0), 2)

            # The direction depends only on the oldest and newest points, so
            # it is evaluated once per frame (with fewer than two points the
            # original loop never ran and nothing was labelled).
            if len(track) >= 2:
                dx = track[-1][0] - track[0][0]
                dy = track[-1][1] - track[0][1]

                if dy < -30:
                    cv2.putText(frame, 'up', (20, 40), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
                if dy > 30:
                    cv2.putText(frame, 'down', (20, 40), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
                # NOTE(review): increasing x is labelled 'left' and decreasing
                # x 'right', exactly as in the original - confirm this is the
                # intended convention for the mirrored frame.
                if dx > 30:
                    cv2.putText(frame, 'left', (100, 40), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 0), 1)
                if dx < -30:
                    cv2.putText(frame, 'right', (100, 40), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 0), 1)

            # ----------------------------------------------------------------

            frame = utils.draw_bbox(frame, bboxes)

            cv2.imshow("", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always give the camera back and close the preview window.
        vs.release()
        cv2.destroyAllWindows()

音乐播放器模块

完成手势识别模块之后,已经用了我半条命了。所以,一开始我设计的音乐播放器的UI是…emmm,用我老师的话说就是完全不能看那种。
2021-09-22
有一说一,这种播放器简洁明了,多好。
于是在老师的压迫下,才有了下面这个版本的音乐播放器。
2021-09-22
播放器的代码如下

import cv2
import os
from time import *

import sys
import time
import random
import configparser
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from PyQt5.QtMultimedia import *


'''音乐播放器'''
class musicPlayer(QWidget):
    """Frameless, always-on-top PyQt5 music player with a camera preview.

    The widget lists the audio files of one folder, plays them through a
    ``QMediaPlayer`` in sequential / repeat-one / shuffle mode, and can show
    the mirrored webcam image in a label.
    """

    # Stylesheets of the play button for the "paused" and "playing" states.
    _PLAY_ICON_STYLE = ("QPushButton{border-image: url(image/play_1.png)}"
                        "QPushButton:hover{border-image: url(image/play_2.png)}")
    _PAUSE_ICON_STYLE = "QPushButton{border-image: url(image/pause.png)}"

    def __init__(self):
        """Create the window, apply the background mask and build the UI."""
        super().__init__()

        self.timer_camera = QTimer()  # triggers show_camera() while the preview is on
        self.cap = cv2.VideoCapture()  # capture device, opened on demand
        self.CAM_NUM = 0  # device index handed to cap.open()

        self.pix = QPixmap('./image/back.png')  # background picture, also used as window mask
        self.resize(self.pix.size())
        self.setMask(self.pix.mask())
        # Borderless window that stays on top of other windows.
        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
        self.__initialize()

    def __initialize(self):
        """Create the player state and every widget, then load saved settings."""
        self.songs_list = []  # [file name, full path] per row of self.qlist
        self.song_formats = ['mp3', 'm4a', 'flac', 'wav', 'ogg']
        self.settingfilename = 'setting.ini'
        self.player = QMediaPlayer()
        # Set the start volume once; it used to be forced back to 50 on every
        # track change, discarding the user's choice.
        self.player.setVolume(50)
        self.cur_path = os.path.abspath(os.path.dirname(__file__))
        self.cur_playing_song = ''
        self.is_switching = False
        self.is_pause = True
        # UI elements
        # -- elapsed / total time labels
        self.label1 = QLabel('00:00', self)
        self.label1.setStyle(QStyleFactory.create('Fusion'))
        self.label1.setGeometry(50, 1000, 100, 100)
        self.label2 = QLabel('00:00', self)
        self.label2.setStyle(QStyleFactory.create('Fusion'))
        self.label2.setGeometry(900, 1000, 100, 100)
        # -- playback progress slider
        self.slider = QSlider(Qt.Horizontal, self)
        self.slider.sliderMoved[int].connect(lambda value: self.player.setPosition(value))
        self.slider.setStyle(QStyleFactory.create('Fusion'))
        self.slider.setGeometry(100, 1000, 800, 100)
        # -- volume icon
        self.vlabel = QLabel(self)
        v_img = QPixmap('./image/vuluem.png')
        self.vlabel.setPixmap(v_img)
        self.vlabel.move(60, 955)
        # -- volume slider, kept in step with the player's start volume
        self.vslider = QSlider(Qt.Horizontal, self)
        self.vslider.setRange(0, 100)
        self.vslider.setValue(50)
        self.vslider.sliderMoved[int].connect(lambda value: self.player.setVolume(value))
        self.vslider.setStyle(QStyleFactory.create('Fusion'))
        self.vslider.setGeometry(130, 930, 300, 100)
        # -- "previous" button
        self.preview_button = QPushButton('上一首', self)
        self.preview_button.clicked.connect(self.previewMusic)
        self.preview_button.setStyle(QStyleFactory.create('Fusion'))
        self.preview_button.setGeometry(550, 700, 100, 40)
        # -- "next" button
        self.next_button = QPushButton('下一首', self)
        self.next_button.clicked.connect(self.nextMusic)
        self.next_button.setStyle(QStyleFactory.create('Fusion'))
        self.next_button.setGeometry(800, 700, 100, 40)
        # -- "import music" (open folder) button
        self.open_button = QPushButton('导入音乐', self)
        self.open_button.setStyle(QStyleFactory.create('Fusion'))
        self.open_button.clicked.connect(self.openDir)
        self.open_button.setGeometry(550, 950, 100, 40)
        # -- song list
        self.qlist = QListWidget(self)
        self.qlist.setGeometry(100, 200, 400, 700)
        self.qlist.itemDoubleClicked.connect(self.doubleClicked)
        self.qlist.setStyle(QStyleFactory.create('windows'))
        # -- play / pause button
        self.play_button = QPushButton(self)
        self.play_button.setStyleSheet("QPushButton{border-image: url(image/play_1.png)}"
                                       "QPushButton:hover{border-image: url(image/play_2.png)}"
                                       "QPushButton:pressed{border-image: url(image/pause.png)}")
        self.play_button.clicked.connect(self.playMusic)
        self.play_button.setGeometry(975, 555, 110, 110)
        # -- gesture (camera preview) button
        self.gesture_button = QPushButton(self)
        self.gesture_button.setStyleSheet("QPushButton{border-image: url(image/gesture.png)}"
                                          "QPushButton:hover{border-image: url(image/gesture_1.png)}")
        self.gesture_button.clicked.connect(self.button_open_camera_clicked)
        self.timer_camera.timeout.connect(self.show_camera)
        self.gesture_button.setGeometry(680, 800, 105, 105)
        # -- restore the last used folder, if a settings file exists
        self.loadSetting()
        # -- playback mode: 0 sequential, 1 repeat one, 2 shuffle
        self.cmb = QComboBox(self)
        self.cmb.setStyle(QStyleFactory.create('Fusion'))
        self.cmb.addItem('顺序播放')
        self.cmb.addItem('单曲循环')
        self.cmb.addItem('随机播放')
        self.cmb.setGeometry(800, 950, 110, 40)
        # -- one-second housekeeping timer
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.playByMode)
        self.timer.start(1000)
        # -- quit button
        self.qbtn = QPushButton(self)
        self.qbtn.clicked.connect(QCoreApplication.instance().quit)
        self.qbtn.setStyleSheet("QPushButton{border-image: url(image/close_1.png)}"
                                "QPushButton:hover{border-image: url(image/close_2.png)}"
                                "QPushButton:pressed{border-image: url(image/close_2.png)}")
        self.qbtn.setGeometry(930, 120, 50, 50)
        # -- label that displays the camera frames
        self.label_show_camera = QLabel(self)
        self.label_show_camera.setFixedSize(301, 301)
        self.label_show_camera.move(600, 200)

    def paintEvent(self, event):
        """Paint the background picture over the whole window."""
        paint = QPainter(self)
        paint.drawPixmap(0, 0, self.pix.width(), self.pix.height(), self.pix)

    @staticmethod
    def _format_ms(milliseconds):
        """Return a millisecond count as 'MM:SS'.

        Plain arithmetic is used instead of time.localtime(), which applies
        the local UTC offset and shows wrong minutes in zones whose offset is
        not a whole number of hours.
        """
        minutes, seconds = divmod(max(int(milliseconds), 0) // 1000, 60)
        return '%02d:%02d' % (minutes, seconds)

    def _track_finished(self):
        """Return True once the current media has played to its end."""
        if self.player.mediaStatus() == QMediaPlayer.EndOfMedia:
            return True
        duration = self.player.duration()
        # duration is 0 while the media is still loading; a bare
        # position == duration test would fire immediately in that state.
        return duration > 0 and self.player.position() >= duration

    def _play_row(self, row):
        """Select list row `row` and start playing it from the beginning."""
        self.qlist.setCurrentRow(row)
        self.slider.setValue(0)
        self.is_switching = True
        try:
            self.setCurPlaying()
            self.playMusic()
        finally:
            self.is_switching = False

    def playByMode(self):
        """Timer slot: refresh the progress widgets and advance at track end."""
        active = (not self.is_pause) and (not self.is_switching)
        position = self.player.position()
        duration = self.player.duration()
        if active:
            self.slider.setMinimum(0)
            self.slider.setMaximum(duration)
            # Follow the real playback position (the fixed +1000 step drifted)
            # but do not fight the user while the handle is being dragged.
            if not self.slider.isSliderDown():
                self.slider.setValue(position)
        self.label1.setText(self._format_ms(position))
        self.label2.setText(self._format_ms(duration))

        if not active or self.qlist.count() == 0:
            return
        if not self._track_finished():
            return

        mode = self.cmb.currentIndex()
        if mode == 0:
            # sequential
            self.nextMusic()
        elif mode == 1:
            # repeat the current track
            self._play_row(self.qlist.currentRow())
        elif mode == 2:
            # shuffle
            self._play_row(random.randint(0, self.qlist.count() - 1))

    def openDir(self):
        """Let the user pick a music folder and load its songs."""
        chosen = QFileDialog.getExistingDirectory(self, "选取文件夹", self.cur_path)
        if not chosen:
            # Dialog cancelled: keep the current folder untouched.
            return
        self.cur_path = chosen
        # Stop the old track so the paused state set below is truthful even
        # when the new folder contains no songs.
        self.player.stop()
        self.showMusicList()
        self.cur_playing_song = ''
        self.setCurPlaying()
        self.label1.setText('00:00')
        self.label2.setText('00:00')
        self.slider.setSliderPosition(0)
        self.is_pause = True
        self.play_button.setStyleSheet(self._PLAY_ICON_STYLE)

    def loadSetting(self):
        """Restore the last music folder from the settings file, if usable."""
        if not os.path.isfile(self.settingfilename):
            return
        # interpolation=None keeps '%' characters in paths literal.
        config = configparser.ConfigParser(interpolation=None)
        try:
            config.read(self.settingfilename)
        except configparser.Error:
            # Malformed settings file: start with the default folder.
            return
        saved_path = config.get('MusicPlayer', 'PATH', fallback='')
        # The folder may have been moved or deleted since the last run.
        if saved_path and os.path.isdir(saved_path):
            self.cur_path = saved_path
            self.showMusicList()

    def updateSetting(self):
        """Write the current music folder to the settings file."""
        config = configparser.ConfigParser(interpolation=None)
        try:
            config.read(self.settingfilename)
        except configparser.Error:
            # Unreadable file: start from an empty configuration.
            config = configparser.ConfigParser(interpolation=None)
        # Test for the section itself: the file may exist without it.
        if not config.has_section('MusicPlayer'):
            config.add_section('MusicPlayer')
        config.set('MusicPlayer', 'PATH', self.cur_path)
        with open(self.settingfilename, 'w') as settings_file:
            config.write(settings_file)

    def showMusicList(self):
        """Fill the list widget with the supported audio files of cur_path."""
        self.qlist.clear()
        # Reset together with the widget so list rows and songs_list indices
        # stay aligned after changing folders.
        self.songs_list = []
        self.updateSetting()
        for song in os.listdir(self.cur_path):
            extension = os.path.splitext(song)[1][1:].lower()
            if extension in self.song_formats:
                full_path = os.path.join(self.cur_path, song).replace('\\', '/')
                self.songs_list.append([song, full_path])
                self.qlist.addItem(song)
        self.qlist.setCurrentRow(0)
        if self.songs_list:
            self.cur_playing_song = self.songs_list[self.qlist.currentRow()][-1]

    def doubleClicked(self):
        """Play the song that was double-clicked in the list."""
        self._play_row(self.qlist.currentRow())

    def setCurPlaying(self):
        """Load the song of the selected list row into the player."""
        row = self.qlist.currentRow()
        if not 0 <= row < len(self.songs_list):
            # Nothing selected or empty list: nothing to load.
            return
        self.cur_playing_song = self.songs_list[row][-1]
        # fromLocalFile builds a proper file URL; QUrl(path) would treat
        # '#', '?' or '%' inside a file name as URL syntax.
        self.player.setMedia(QMediaContent(QUrl.fromLocalFile(self.cur_playing_song)))

    def Tips(self, message):
        """Show `message` in a modal information box."""
        QMessageBox.about(self, "提示", message)

    def playMusic(self):
        """Toggle play/pause; always start playback while switching tracks."""
        if self.qlist.count() == 0:
            self.Tips('当前路径内无可播放的音乐文件')
            return
        if not self.player.isAudioAvailable():
            self.setCurPlaying()
        if self.is_pause or self.is_switching:
            self.player.play()
            self.is_pause = False
            self.play_button.setStyleSheet(self._PAUSE_ICON_STYLE)
        else:
            self.player.pause()
            self.is_pause = True
            self.play_button.setStyleSheet(self._PLAY_ICON_STYLE)

    def previewMusic(self):
        """Play the previous song, wrapping from the first to the last."""
        self.slider.setValue(0)
        total = self.qlist.count()
        if total == 0:
            self.Tips('当前路径内无可播放的音乐文件')
            return
        row = self.qlist.currentRow()
        self._play_row(row - 1 if row > 0 else total - 1)

    def nextMusic(self):
        """Play the next song, wrapping from the last to the first."""
        self.slider.setValue(0)
        total = self.qlist.count()
        if total == 0:
            self.Tips('当前路径内无可播放的音乐文件')
            return
        row = self.qlist.currentRow()
        self._play_row(row + 1 if row != total - 1 else 0)

    def button_open_camera_clicked(self):
        """Toggle the camera preview on or off."""
        if not self.timer_camera.isActive():
            if not self.cap.open(self.CAM_NUM):
                QMessageBox.warning(self, 'warning', "请检查相机于电脑是否连接正确", buttons=QMessageBox.Ok)
                # Leave the button in its idle look: the preview did not start.
                return
            # 30 ms between frames instead of a 1 ms timer that keeps the GUI
            # thread busy.
            self.timer_camera.start(30)
            self.gesture_button.setStyleSheet("QPushButton{border-image: url(image/gesture_2.png)}")
        else:
            self.timer_camera.stop()
            self.cap.release()
            self.label_show_camera.clear()
            self.gesture_button.setStyleSheet("QPushButton{border-image: url(image/gesture.png)}"
                                              "QPushButton:hover{border-image: url(image/gesture_1.png)}")

    def show_camera(self):
        """Grab one frame and display it mirrored in the preview label."""
        flag, image = self.cap.read()
        if not flag or image is None:
            # Skip this tick when the camera delivered no frame.
            return
        image = cv2.flip(image, 1)  # positive flip code: mirror horizontally
        show = cv2.resize(image, (300, 300))
        show = cv2.cvtColor(show, cv2.COLOR_BGR2RGB)  # OpenCV frames are BGR, QImage expects RGB
        # Pass the row stride explicitly so QImage does not have to assume it.
        showImage = QImage(show.data, show.shape[1], show.shape[0], show.strides[0],
                           QImage.Format_RGB888)
        self.label_show_camera.setPixmap(QPixmap.fromImage(showImage))

'''run'''
if __name__ == '__main__':
	# Create the Qt application, show the player window, and hand the event
	# loop's return code back to the shell.
	application = QApplication(sys.argv)
	window = musicPlayer()
	window.show()
	exit_code = application.exec_()
	sys.exit(exit_code)

一个小总结吧

毕设就这么弄完了,对我来说,写论文比写代码难多了,程序写完之后还得肝论文。然后就是要训练神经网络的小伙伴,自己没有配GPU环境的,可以去用谷歌云盘的colab,免费的GPU,训练还是快了很多的。

上一篇:【2017-05-02】winform弹出警告框是否进行增删改操作、记事本制作、对话框控件和输出输入流


下一篇:Python网络编程——编写一个简单的回显客户端/服务器应用