当前位置：首页 > news >正文

用手势操控现实：OpenCV 音量控制与 AI 换脸技术解析

news 2025/9/12 17:20:20

基于opencv的手势控制音量和ai换脸

HandTrackingModule.py

import cv2
import mediapipe as mp
import time


class handDetector():
    """Detect hands in BGR frames with MediaPipe Hands and expose landmark pixels.

    Typical use is to call ``findHands`` once per frame and then
    ``findPosition`` to read back the landmark coordinates of one hand.

    Args:
        mode: Forwarded to MediaPipe as ``static_image_mode``.
        maxHands: Forwarded as ``max_num_hands``.
        model_complexity: Forwarded as ``model_complexity``.
        detectionCon: Forwarded as ``min_detection_confidence``.
        trackCon: Forwarded as ``min_tracking_confidence``.
    """

    def __init__(self, mode=False, maxHands=2, model_complexity=1,
                 detectionCon=0.5, trackCon=0.5):
        self.mode = mode
        self.maxHands = maxHands
        self.model_complexity = model_complexity
        self.detectionCon = detectionCon
        self.trackCon = trackCon

        self.mpHands = mp.solutions.hands
        # Keyword arguments keep each value bound to the intended option even
        # if the positional order of Hands() differs between releases.
        self.hands = self.mpHands.Hands(
            static_image_mode=self.mode,
            max_num_hands=self.maxHands,
            model_complexity=self.model_complexity,
            min_detection_confidence=self.detectionCon,
            min_tracking_confidence=self.trackCon,
        )
        self.mpDraw = mp.solutions.drawing_utils
        # Style of the connection lines; built once instead of once per hand
        # per frame.
        self.connection_drawing_spec = self.mpDraw.DrawingSpec(
            color=(0, 255, 0), thickness=2)
        # Result of the most recent findHands() call; None until it has run.
        self.results = None

    def findHands(self, img, draw=True):
        """Run hand detection on ``img`` and optionally draw the skeletons.

        Args:
            img: Frame in BGR channel order (it is converted to RGB before
                being handed to MediaPipe).
            draw: When true, landmarks and connections of every detected hand
                are drawn onto ``img`` in place.

        Returns:
            The same ``img`` object that was passed in.
        """
        # The Hands object only accepts RGB images.
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.hands.process(imgRGB)

        if draw and self.results.multi_hand_landmarks:
            for handLms in self.results.multi_hand_landmarks:
                self.mpDraw.draw_landmarks(
                    img, handLms, self.mpHands.HAND_CONNECTIONS,
                    connection_drawing_spec=self.connection_drawing_spec)
        return img

    def findPosition(self, img, handNum=0, draw=True):
        """Return pixel coordinates of every landmark of one detected hand.

        Args:
            img: Frame the last ``findHands`` call was run on; its height and
                width scale the landmark coordinates to pixels.
            handNum: Index into the list of detected hands.
            draw: When true, a filled circle is drawn on ``img`` at each
                landmark.

        Returns:
            A list of ``[landmark_index, x_px, y_px]`` entries. The list is
            empty when no detection has been run yet, no hand was found, or
            ``handNum`` does not refer to a detected hand.
        """
        lmList = []
        results = getattr(self, "results", None)
        hands = results.multi_hand_landmarks if results is not None else None
        if not hands:
            return lmList

        try:
            myHand = hands[handNum]
        except IndexError:
            # Fewer hands are visible than the caller asked for.
            return lmList

        # Landmark x/y are fractions of the frame size, so the frame
        # dimensions are needed only once, not per landmark.
        h, w = img.shape[:2]
        for idx, lm in enumerate(myHand.landmark):
            cx, cy = int(lm.x * w), int(lm.y * h)
            lmList.append([idx, cx, cy])
            if draw:
                cv2.circle(img, (cx, cy), 7, (255, 0, 0), cv2.FILLED)
        return lmList

定义了一个名为 handDetector 的类，用于检测和跟踪手部。下面是代码的详细分析：

导入库

cv2: OpenCV 库，用于图像处理。
mediapipe as mp: 用于多媒体解决方案的库，在此用于手部检测。
time: 用于时间管理，但在给定的代码段中未使用。

`handDetector` 类

初始化方法 `__init__`

该方法用于初始化 handDetector 类的对象，并设置一些参数。

mode: 布尔值，控制 MediaPipe 手部解决方案的静态图像模式。默认值为 False。
maxHands: 最大手部数量，控制同时检测的手的数量。默认值为 2。
model_complexity: 手部关键点模型的复杂度，取值为 0 或 1（数值越大精度越高，但推理延迟也越大）。默认值为 1。
detectionCon: 检测置信度阈值。默认值为 0.5。
trackCon: 跟踪置信度阈值。默认值为 0.5。

此外，还创建了 MediaPipe 手部解决方案的实例，并初始化了绘图工具。

方法 `findHands`

该方法用于在给定图像中找到手，并根据需要绘制手部标记。

img: 输入图像。
draw: 布尔值，控制是否绘制手部标记。默认值为 True。

该方法首先将图像从 BGR 转换为 RGB，然后处理图像以找到手部标记。如果找到了手部标记，并且 draw 参数为 True，则会在图像上绘制手部标记和连接线。

方法 `findPosition`

该方法用于在给定图像中找到手部标记的位置，并返回一个包含每个标记位置的列表。

img: 输入图像。
handNum: 手的索引，用于选择多个检测到的手中的特定一只。默认值为 0。
draw: 布尔值，控制是否在图像上绘制每个标记的圆圈。默认值为 True。

该方法遍历给定手的每个标记，并计算其在图像中的位置。如果 draw 参数为 True，则在每个标记的位置上绘制一个圆圈。

总结

handDetector 类是一个用于检测和跟踪手部的工具。它使用了 MediaPipe 的手部解决方案，并提供了在图像上绘制手部标记和连接线的功能。通过调用这些方法，你可以在视频流或静态图像中跟踪手部，甚至找到特定手部标记的位置。

VolumeHandControl.py

import cv2
import time
import numpy as np
import HandTrackingModule as htm
import math
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
# Requested capture resolution in pixels.
wCam, hCam = 640, 480

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open camera 0")
# Named property constants instead of the bare ids 3 (width) and 4 (height).
cap.set(cv2.CAP_PROP_FRAME_WIDTH, wCam)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, hCam)

# Timestamp of the previous frame, used by the FPS counter.
pTime = 0

# Mask image pasted over detected faces. IMREAD_UNCHANGED keeps an alpha
# channel if the file has one.
tiga_img = cv2.imread("tiga.jpg", cv2.IMREAD_UNCHANGED)
if tiga_img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("Could not read overlay image 'tiga.jpg'")

detector = htm.handDetector(detectionCon=0.7)

face_Cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
if face_Cascade.empty():
    # A missing cascade file leaves an empty classifier that only fails later,
    # inside detectMultiScale.
    raise FileNotFoundError(
        "Could not load 'haarcascade_frontalface_default.xml'")

# Obtain the endpoint-volume interface of the default speakers via pycaw.
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
# Other queries available on the interface: volume.GetMute(),
# volume.GetMasterVolumeLevel().

# Volume range reported by the device; the first two entries are used as the
# minimum and maximum master volume level.
volRange = volume.GetVolumeRange()
print(volRange)
minVol = volRange[0]
maxVol = volRange[1]

# Current volume level, on-screen bar top (y pixel) and percentage shown
# until a hand is first seen.
vol = 0
volBar = 400
volPer = 0
def overlay_img(img, img_over, img_over_x, img_over_y, y_shift=-40):
    """Paste ``img_over`` onto ``img`` in place, skipping transparent pixels.

    Args:
        img: Background image, indexed as ``[row, column, channel]``; only its
            first three channels are written.
        img_over: Image to paste. With four channels, pixels whose fourth
            (alpha) channel is 0 are left untouched; with three channels every
            pixel is copied.
        img_over_x: Column of ``img`` where the left edge of ``img_over`` goes.
        img_over_y: Row of ``img`` where the top edge of ``img_over`` goes,
            before ``y_shift`` is applied.
        y_shift: Vertical offset in pixels added to ``img_over_y``. The default
            of -40 keeps the original behaviour of drawing the overlay 40
            pixels above the requested row.

    Returns:
        ``img`` (the same object), modified in place. Parts of the overlay
        that fall outside ``img`` are clipped instead of wrapping around or
        raising.
    """
    # shape is (rows, columns, ...), i.e. height first.
    img_h, img_w = img.shape[:2]
    over_h, over_w = img_over.shape[:2]

    left = img_over_x
    top = img_over_y + y_shift

    # Destination rectangle clipped to the background.
    dst_x0, dst_y0 = max(left, 0), max(top, 0)
    dst_x1, dst_y1 = min(left + over_w, img_w), min(top + over_h, img_h)
    if dst_x0 >= dst_x1 or dst_y0 >= dst_y1:
        return img  # overlay lies completely outside the frame

    # Matching rectangle inside the overlay.
    src_x0, src_y0 = dst_x0 - left, dst_y0 - top
    patch = img_over[src_y0:src_y0 + (dst_y1 - dst_y0),
                     src_x0:src_x0 + (dst_x1 - dst_x0)]
    region = img[dst_y0:dst_y1, dst_x0:dst_x1, :3]  # view into img

    if patch.shape[2] == 4:
        opaque = patch[:, :, 3] != 0
        region[opaque] = patch[:, :, :3][opaque]
    else:
        region[...] = patch[:, :, :3]
    return img


def main():
    """Run the webcam loop: face mask overlay plus pinch-to-volume control.

    Uses the module-level ``cap``, ``face_Cascade``, ``tiga_img``,
    ``detector``, ``volume``, ``minVol`` and ``maxVol`` objects. Press ``q``
    in the preview window to quit; the loop also stops when no frame can be
    read from the camera.
    """
    prev_time = pTime
    vol_bar, vol_per = volBar, volPer

    try:
        while True:
            success, img = cap.read()
            if not success or img is None:
                print("Failed to read a frame from the camera")
                break

            gray_frame = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            height, width = img.shape[:2]
            faces = face_Cascade.detectMultiScale(gray_frame, 1.15, 5)
            for (x, y, w, h) in faces:
                gw = w
                gh = int(height * w / width)
                # Always resize from the pristine mask image; resizing the
                # previous result again would accumulate resampling loss.
                mask_img = cv2.resize(tiga_img, (gw, gh + gh))
                print(gw, gh)
                if 0 <= x < width and 0 <= y < height:
                    overlay_img(img, mask_img, x, y)

            img = detector.findHands(img)
            lmList = detector.findPosition(img, draw=False)
            if lmList:
                # Entries 4 and 8 of the landmark list are the two points
                # whose distance drives the volume.
                x1, y1 = lmList[4][1], lmList[4][2]
                x2, y2 = lmList[8][1], lmList[8][2]
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2

                cv2.circle(img, (x1, y1), 15, (255, 0, 255), cv2.FILLED)
                cv2.circle(img, (x2, y2), 15, (255, 0, 255), cv2.FILLED)
                cv2.line(img, (x1, y1), (x2, y2), (255, 0, 255), 3)
                cv2.circle(img, (cx, cy), 15, (255, 0, 255), cv2.FILLED)

                length = math.hypot(x2 - x1, y2 - y1)
                print(length)

                # Distances of 25..175 px are mapped linearly onto the
                # device's volume range, the bar height and a percentage
                # (the author's original note gives a volume range of
                # about -65..0).
                level = np.interp(length, [25, 175], [minVol, maxVol])
                vol_bar = np.interp(length, [25, 175], [400, 150])
                vol_per = np.interp(length, [25, 175], [0, 100])
                print(int(length), level)
                volume.SetMasterVolumeLevel(level, None)

                if length < 25:
                    # Fingers touching: recolour the midpoint marker.
                    cv2.circle(img, (cx, cy), 15, (0, 255, 0), cv2.FILLED)

            # Volume bar outline, filled level and percentage label.
            cv2.rectangle(img, (50, 150), (85, 400), (255, 0, 0), 3)
            cv2.rectangle(img, (50, int(vol_bar)), (85, 400), (255, 0, 0),
                          cv2.FILLED)
            cv2.putText(img, f'{int(vol_per)} %', (40, 450),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 3)

            cTime = time.time()
            elapsed = cTime - prev_time
            # Two reads within the clock's resolution would divide by zero.
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_time = cTime
            cv2.putText(img, f'FPS:{int(fps)}', (40, 50),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 3)

            cv2.imshow("img", img)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Free the camera and close the preview window on any exit path.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()