Pose Estimation and Tracking
What Is Pose Estimation
Pose estimation is a core computer-vision technique for detecting and localizing key body parts (shoulders, elbows, knees, and so on) in images or video. From the coordinates of these keypoints we can infer a person's overall posture and motion.
Basic Concepts of Pose Estimation
Pose estimation typically involves (see the sketch after this list):
- Keypoint detection - finding predefined body parts in an image
- Skeleton connection - linking adjacent keypoints into a skeletal structure
- Confidence scoring - assessing the reliability of each keypoint detection
- Temporal tracking - following the same person's pose changes across video frames
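A minimal sketch of the data these concepts produce: a keypoint record plus the skeleton as a list of index pairs. The three-edge skeleton and the concrete values here are illustrative assumptions, not a standard definition:
python
from dataclasses import dataclass

@dataclass
class Keypoint:
    name: str
    x: float           # image coordinates (often normalized to [0, 1])
    y: float
    confidence: float  # reliability of this detection

# Skeleton as index pairs, e.g., shoulder -> elbow -> wrist
SKELETON_EDGES = [(0, 1), (1, 2)]

pose = [Keypoint('left_shoulder', 0.42, 0.31, 0.98),
        Keypoint('left_elbow',    0.45, 0.45, 0.95),
        Keypoint('left_wrist',    0.47, 0.58, 0.91)]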
Applications of Pose Estimation
- Sports analysis: evaluating an athlete's form and movement quality
- Fitness coaching: checking whether a user's exercise form is correct
- Video editing: generating motion data for animation and visual effects
- Human-computer interaction: interacting through gestures and body movement
- Robot learning: letting robots learn from human motion
- Security monitoring: detecting abnormal behavior or falls
2D Pose Estimation
Basic Principle
2D pose estimation detects the two-dimensional coordinates (x, y) of body keypoints in a single image or video frame. It is the most fundamental and widely used form of pose estimation.
2D Keypoint Definitions
A standard 2D human pose model typically defines 17-33 keypoints (the index map after these lists shows the MediaPipe indices used in the code below):
Head region (5 points):
- Nose
- Left eye, right eye
- Left ear, right ear
Torso region (4 points):
- Left shoulder, right shoulder
- Left hip, right hip
Limb region (8 points):
- Left elbow, right elbow
- Left knee, right knee
- Left ankle, right ankle
- Left wrist, right wrist
Extended keypoints (e.g., MediaPipe):
- Finger joints
- Toe joints
- More torso points
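For reference, these are the MediaPipe Pose landmark indices (a subset of its 33-landmark model) that the code in the rest of this section relies on:
python
# MediaPipe Pose landmark indices used throughout this section
MP_LANDMARKS = {
    0: 'nose',
    11: 'left_shoulder', 12: 'right_shoulder',
    13: 'left_elbow',    14: 'right_elbow',
    15: 'left_wrist',    16: 'right_wrist',
    23: 'left_hip',      24: 'right_hip',
    25: 'left_knee',     26: 'right_knee',
    27: 'left_ankle',    28: 'right_ankle',
}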
Implementing 2D Pose Estimation
2D pose estimation with MediaPipe:
python
import cv2
import mediapipe as mp
import numpy as np

class Pose2DEstimator:
    def __init__(self, model_complexity=1):
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=model_complexity,
            smooth_landmarks=True,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )

    def estimate_pose(self, frame):
        """
        Estimate the pose in an input frame.

        Args:
            frame: input image (BGR format)
        Returns:
            landmarks: list of keypoints
            frame: the image with the keypoints drawn on it
        """
        # Convert color space (MediaPipe expects RGB)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Run pose estimation
        results = self.pose.process(frame_rgb)
        # Extract the keypoints
        landmarks = results.pose_landmarks.landmark if results.pose_landmarks else None
        # Draw the result
        if landmarks:
            self.mp_drawing.draw_landmarks(
                frame,
                results.pose_landmarks,
                self.mp_pose.POSE_CONNECTIONS,
                self.mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
                self.mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=2)
            )
        return landmarks, frame

    def get_landmark_coordinates(self, landmarks, landmark_index):
        """Get the coordinates of a specific keypoint."""
        if landmarks is None:
            return None
        landmark = landmarks[landmark_index]
        return (landmark.x, landmark.y, landmark.z, landmark.visibility)

    def get_all_coordinates(self, landmarks):
        """Get the coordinates of all keypoints."""
        if landmarks is None:
            return None
        coords = []
        for i, landmark in enumerate(landmarks):
            coords.append({
                'index': i,
                'x': landmark.x,
                'y': landmark.y,
                'z': landmark.z,
                'visibility': landmark.visibility
            })
        return coords

    def close(self):
        """Release resources."""
        self.pose.close()

# Usage example
def main():
    estimator = Pose2DEstimator()
    cap = cv2.VideoCapture(0)
    while True:
        success, frame = cap.read()
        if not success:
            break
        landmarks, frame_with_pose = estimator.estimate_pose(frame)
        if landmarks:
            # Read specific keypoints (0 = nose, 11/12 = shoulders)
            nose = estimator.get_landmark_coordinates(landmarks, 0)
            left_shoulder = estimator.get_landmark_coordinates(landmarks, 11)
            right_shoulder = estimator.get_landmark_coordinates(landmarks, 12)
            print(f"Nose position: x={nose[0]:.3f}, y={nose[1]:.3f}, visibility={nose[3]:.3f}")
        cv2.imshow('2D Pose Estimation', frame_with_pose)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    estimator.close()

if __name__ == "__main__":
    main()
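Note that MediaPipe returns landmark coordinates normalized to [0, 1] relative to the image width and height. To draw with OpenCV or measure in pixel space you have to rescale them; a minimal helper sketch (the 640x480 frame size in the comment is an assumed example):
python
def to_pixel_coords(landmark, image_width, image_height):
    """Convert a normalized MediaPipe landmark to integer pixel coordinates."""
    return int(landmark.x * image_width), int(landmark.y * image_height)

# Example: pixel position of the nose (index 0) in an assumed 640x480 frame
# nose_px = to_pixel_coords(landmarks[0], 640, 480)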
3D Pose Estimation
Why 3D Pose Estimation Matters
3D pose estimation adds depth information on top of the 2D result, yielding three-dimensional coordinates (x, y, z) for each body keypoint. This is essential for more accurate motion analysis and for robot control.
From 2D to 3D
Method 1: Monocular Estimation
Estimate the 3D pose directly from a single camera's image. This requires a powerful deep-learning model.
python
import cv2
import mediapipe as mp

class Pose3DEstimator:
    def __init__(self, model_complexity=1):
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=model_complexity,
            smooth_landmarks=True
        )

    def estimate_3d_pose(self, frame):
        """
        Estimate the 3D pose.
        MediaPipe's output includes a z coordinate (relative depth).
        """
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(frame_rgb)
        if not results.pose_landmarks:
            return None
        landmarks_3d = []
        for landmark in results.pose_landmarks.landmark:
            landmarks_3d.append({
                'x': landmark.x,
                'y': landmark.y,
                'z': landmark.z,
                'visibility': landmark.visibility
            })
        return landmarks_3d

    def extract_body_model(self, landmarks_3d):
        """Extract the key measurements of a body model."""
        if not landmarks_3d:
            return None
        # Extract the key joints
        left_shoulder = landmarks_3d[11]
        right_shoulder = landmarks_3d[12]
        left_hip = landmarks_3d[23]
        right_hip = landmarks_3d[24]
        left_knee = landmarks_3d[25]
        right_knee = landmarks_3d[26]
        left_ankle = landmarks_3d[27]
        right_ankle = landmarks_3d[28]
        body_model = {
            'shoulders': {
                'left': left_shoulder,
                'right': right_shoulder
            },
            'hips': {
                'left': left_hip,
                'right': right_hip
            },
            'left_leg': {
                'knee': left_knee,
                'ankle': left_ankle
            },
            'right_leg': {
                'knee': right_knee,
                'ankle': right_ankle
            }
        }
        return body_model

    def calculate_3d_distance(self, point1, point2):
        """Compute the distance between two 3D points."""
        dx = point1['x'] - point2['x']
        dy = point1['y'] - point2['y']
        dz = point1['z'] - point2['z']
        return (dx**2 + dy**2 + dz**2) ** 0.5

    def close(self):
        self.pose.close()
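A short usage sketch for the class above. Keep in mind that MediaPipe's z is a relative depth on roughly the same scale as x, so distances come out in normalized units rather than meters unless you calibrate against a known body measurement; `frame` is assumed to be a BGR image from OpenCV:
python
# Usage sketch: normalized torso length from one frame
estimator = Pose3DEstimator()
landmarks_3d = estimator.estimate_3d_pose(frame)  # `frame`: BGR image, e.g., from cv2.VideoCapture
if landmarks_3d:
    # Left shoulder (11) to left hip (23)
    torso = estimator.calculate_3d_distance(landmarks_3d[11], landmarks_3d[23])
    print(f"Torso length (normalized units): {torso:.3f}")
estimator.close()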
Method 2: Stereo Vision
Use two cameras or a depth camera to improve the accuracy of the 3D estimate.
python
import numpy as np
import cv2

class StereoPose3DEstimator:
    def __init__(self, left_camera_matrix, right_camera_matrix,
                 left_dist_coeffs, right_dist_coeffs,
                 rotation_matrix, translation_vector):
        """
        Initialize the stereo pose estimator.

        Args:
            left_camera_matrix: intrinsic matrix of the left camera
            right_camera_matrix: intrinsic matrix of the right camera
            left_dist_coeffs: distortion coefficients of the left camera
            right_dist_coeffs: distortion coefficients of the right camera
            rotation_matrix: rotation between the two cameras (3x3)
            translation_vector: translation between the two cameras (3x1)
        """
        self.left_cam_matrix = left_camera_matrix
        self.right_cam_matrix = right_camera_matrix
        self.left_dist_coeffs = left_dist_coeffs
        self.right_dist_coeffs = right_dist_coeffs
        self.rotation_matrix = rotation_matrix
        self.translation_vector = translation_vector
        # Projection matrices: the left camera defines the world frame
        self.proj_left = left_camera_matrix @ np.hstack([np.eye(3), np.zeros((3, 1))])
        self.proj_right = right_camera_matrix @ np.hstack([rotation_matrix, translation_vector])

    def triangulate_point(self, left_point, right_point):
        """
        Triangulate a 3D point from its observations in both cameras.

        Args:
            left_point: 2D point (x, y) in the left image
            right_point: 2D point (x, y) in the right image
        Returns:
            3D point coordinates (x, y, z)
        """
        points_4d = cv2.triangulatePoints(
            self.proj_left, self.proj_right,
            np.array(left_point, dtype=np.float64).reshape(2, 1),
            np.array(right_point, dtype=np.float64).reshape(2, 1)
        )
        # Convert from homogeneous to Cartesian coordinates
        point_3d = points_4d[:3] / points_4d[3]
        return point_3d.flatten()
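A usage sketch under assumed calibration values. In practice the intrinsics, rotation, and baseline come from a real stereo calibration (e.g., cv2.stereoCalibrate); the numbers below are placeholders only:
python
# Placeholder calibration: identical intrinsics, parallel cameras, 6 cm baseline
K = np.array([[700.0, 0.0, 320.0],
              [0.0, 700.0, 240.0],
              [0.0,   0.0,   1.0]])
R = np.eye(3)
t = np.array([[-0.06], [0.0], [0.0]])
stereo = StereoPose3DEstimator(K, K, None, None, R, t)

# Triangulate the same keypoint (e.g., the nose) observed in both images,
# given in pixel coordinates
point_3d = stereo.triangulate_point((310.0, 200.0), (290.0, 200.0))
print(point_3d)  # (x, y, z) in the left camera's frame; meters if t is in meters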
Pose Tracking
What Is Pose Tracking
Pose tracking follows the pose of one or more people continuously across a video sequence. Unlike single-frame pose estimation, tracking must maintain each subject's identity from frame to frame.
Single-Person Pose Tracking
python
import cv2
import mediapipe as mp
from collections import deque

class SinglePersonPoseTracker:
    def __init__(self, max_history=30):
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.pose = self.mp_pose.Pose(
            smooth_landmarks=True,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )
        self.max_history = max_history
        self.pose_history = deque(maxlen=max_history)
        self.last_results = None  # raw MediaPipe results, kept for drawing

    def update_pose(self, frame):
        """Update the pose for the current frame."""
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(frame_rgb)
        self.last_results = results
        if results.pose_landmarks:
            landmarks = results.pose_landmarks.landmark
            self.pose_history.append(landmarks)
            return landmarks
        return None

    def get_smoothed_pose(self):
        """Return a smoothed pose (average over the stored history)."""
        if not self.pose_history:
            return None
        num_landmarks = len(self.pose_history[0])
        smoothed_landmarks = []
        for i in range(num_landmarks):
            x_vals = [lm[i].x for lm in self.pose_history]
            y_vals = [lm[i].y for lm in self.pose_history]
            z_vals = [lm[i].z for lm in self.pose_history]
            visibility_vals = [lm[i].visibility for lm in self.pose_history]
            smoothed_landmarks.append({
                'x': sum(x_vals) / len(x_vals),
                'y': sum(y_vals) / len(y_vals),
                'z': sum(z_vals) / len(z_vals),
                'visibility': sum(visibility_vals) / len(visibility_vals)
            })
        return smoothed_landmarks

    def get_pose_velocity(self, landmark_index):
        """Compute the per-frame velocity of a specific keypoint."""
        if len(self.pose_history) < 2:
            return None
        current = self.pose_history[-1][landmark_index]
        previous = self.pose_history[-2][landmark_index]
        vx = current.x - previous.x
        vy = current.y - previous.y
        vz = current.z - previous.z
        return {'vx': vx, 'vy': vy, 'vz': vz}

    def close(self):
        self.pose.close()

# Usage example
def track_single_person():
    tracker = SinglePersonPoseTracker()
    cap = cv2.VideoCapture(0)
    while True:
        success, frame = cap.read()
        if not success:
            break
        landmarks = tracker.update_pose(frame)
        smoothed_pose = tracker.get_smoothed_pose()
        # Draw the tracked pose
        if landmarks:
            tracker.mp_drawing.draw_landmarks(
                frame,
                tracker.last_results.pose_landmarks,
                tracker.mp_pose.POSE_CONNECTIONS
            )
            # Velocity of a specific keypoint (index 15 = left wrist)
            left_wrist_velocity = tracker.get_pose_velocity(15)
            if left_wrist_velocity:
                speed = (left_wrist_velocity['vx']**2 +
                         left_wrist_velocity['vy']**2 +
                         left_wrist_velocity['vz']**2) ** 0.5
                cv2.putText(frame, f"Left Wrist Speed: {speed:.4f}",
                            (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow('Single Person Pose Tracking', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    tracker.close()
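The windowed mean in get_smoothed_pose lags by roughly half the window length. An exponential moving average is a common lower-latency alternative; a minimal sketch operating on the per-landmark dicts returned by get_smoothed_pose (the alpha value is an assumption to tune):
python
def ema_update(smoothed, current, alpha=0.3):
    """Blend a new landmark dict into the running smoothed estimate."""
    if smoothed is None:
        return dict(current)
    # Higher alpha reacts faster; lower alpha suppresses more noise
    return {k: alpha * current[k] + (1 - alpha) * smoothed[k]
            for k in ('x', 'y', 'z', 'visibility')}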
Multi-Person Pose Tracking
python
import cv2
import mediapipe as mp
import numpy as np

class MultiPersonPoseTracker:
    def __init__(self, distance_threshold=0.5):
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.pose = self.mp_pose.Pose()
        self.tracked_persons = {}  # tracked person IDs -> tracking state
        self.next_person_id = 0
        self.distance_threshold = distance_threshold

    def detect_poses(self, frame):
        """Detect the poses of everyone in the frame."""
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(frame_rgb)
        detected_poses = []
        if results.pose_landmarks:
            # MediaPipe Pose detects only one person per inference, so real
            # multi-person detection needs an additional method (e.g., a person
            # detector, running Pose on each cropped detection)
            detected_poses.append(results.pose_landmarks.landmark)
        return detected_poses

    def calculate_pose_distance(self, pose1, pose2):
        """Compute the mean keypoint distance between two poses."""
        distance = 0
        for i in range(min(len(pose1), len(pose2))):
            dx = pose1[i].x - pose2[i].x
            dy = pose1[i].y - pose2[i].y
            dz = pose1[i].z - pose2[i].z
            distance += (dx**2 + dy**2 + dz**2) ** 0.5
        return distance / min(len(pose1), len(pose2))

    def match_poses(self, detected_poses, previous_poses):
        """Match poses detected in the current frame to those of the previous frame."""
        if not previous_poses:
            # First frame: every detection gets a new ID
            matches = [(i, None) for i in range(len(detected_poses))]
        else:
            # Build the distance matrix
            distance_matrix = np.zeros((len(detected_poses), len(previous_poses)))
            for i, detected in enumerate(detected_poses):
                for j, previous in enumerate(previous_poses):
                    distance_matrix[i, j] = self.calculate_pose_distance(detected, previous)
            # Match greedily (the Hungarian algorithm is the optimal
            # alternative; see the sketch after this block)
            matches = []
            for i in range(len(detected_poses)):
                best_match = np.argmin(distance_matrix[i])
                if distance_matrix[i, best_match] < self.distance_threshold:
                    matches.append((i, best_match))
                else:
                    matches.append((i, None))
        return matches

    def update_tracking(self, frame):
        """Update multi-person tracking for the current frame."""
        detected_poses = self.detect_poses(frame)
        previous_poses = [self.tracked_persons[pid]['pose']
                          for pid in self.tracked_persons]
        matches = self.match_poses(detected_poses, previous_poses)
        # Update the tracking state
        new_tracked_persons = {}
        for detected_idx, previous_idx in matches:
            if previous_idx is not None:
                # Continue an existing track
                person_id = list(self.tracked_persons.keys())[previous_idx]
                new_tracked_persons[person_id] = {
                    'pose': detected_poses[detected_idx],
                    'frame_count': self.tracked_persons[person_id]['frame_count'] + 1
                }
            else:
                # Start a new track
                person_id = self.next_person_id
                self.next_person_id += 1
                new_tracked_persons[person_id] = {
                    'pose': detected_poses[detected_idx],
                    'frame_count': 1
                }
        self.tracked_persons = new_tracked_persons
        return self.tracked_persons

    def draw_tracked_poses(self, frame):
        """Annotate the frame with every tracked person."""
        h, w, c = frame.shape
        colors = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0)]
        for person_id, data in self.tracked_persons.items():
            color = colors[person_id % len(colors)]
            # Draw the ID
            cv2.putText(frame, f"ID: {person_id}", (10, 30 + person_id * 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)
            # Drawing the keypoints and skeleton connections could be added here
        return frame

    def close(self):
        self.pose.close()
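The greedy matcher above can assign two detections to the same previous track. scipy.optimize.linear_sum_assignment implements the Hungarian algorithm and yields the optimal one-to-one assignment; a drop-in sketch:
python
from scipy.optimize import linear_sum_assignment

def hungarian_match(distance_matrix, threshold):
    """Optimal one-to-one matching; returns (detected_idx, previous_idx or None) pairs."""
    rows, cols = linear_sum_assignment(distance_matrix)  # minimizes total distance
    assigned = {r: c for r, c in zip(rows, cols)
                if distance_matrix[r, c] < threshold}    # reject overly distant matches
    return [(i, assigned.get(i)) for i in range(distance_matrix.shape[0])]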
Action Recognition
The Concept of Action Recognition
Pose-based action recognition means identifying the type of action a person is performing (walking, running, sitting, standing, and so on) from their estimated pose.
Pose-Based Action Classification
python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

class ActionRecognizer:
    def __init__(self):
        self.scaler = StandardScaler()
        self.classifier = RandomForestClassifier(n_estimators=100)
        self.action_classes = ['standing', 'walking', 'sitting', 'running']
        self.is_trained = False

    def extract_features(self, landmarks):
        """
        Extract features from pose keypoints:
        joint angles, distances, and the body's aspect ratio.
        """
        if not landmarks or len(landmarks) < 33:
            return None
        features = []
        # Joint angles (shoulder-elbow-wrist and hip-knee-ankle)
        left_arm_angle = self.calculate_angle(
            landmarks[11], landmarks[13], landmarks[15]
        )
        right_arm_angle = self.calculate_angle(
            landmarks[12], landmarks[14], landmarks[16]
        )
        left_leg_angle = self.calculate_angle(
            landmarks[23], landmarks[25], landmarks[27]
        )
        right_leg_angle = self.calculate_angle(
            landmarks[24], landmarks[26], landmarks[28]
        )
        # Distances between body parts
        shoulder_distance = self.calculate_distance(
            landmarks[11], landmarks[12]
        )
        hip_distance = self.calculate_distance(
            landmarks[23], landmarks[24]
        )
        # Aspect ratio of the pose (image y grows downward, so take the
        # absolute nose-to-ankle extent)
        height = abs(landmarks[27].y - landmarks[0].y)
        width = max(landmarks[i].x for i in range(33)) - min(landmarks[i].x for i in range(33))
        aspect_ratio = height / (width + 1e-6)
        features.extend([
            left_arm_angle, right_arm_angle,
            left_leg_angle, right_leg_angle,
            shoulder_distance, hip_distance,
            aspect_ratio
        ])
        return np.array(features).reshape(1, -1)

    def calculate_angle(self, point1, point2, point3):
        """Compute the angle at point2 formed by three points, in degrees."""
        a = np.array([point1.x, point1.y])
        b = np.array([point2.x, point2.y])
        c = np.array([point3.x, point3.y])
        ba = a - b
        bc = c - b
        cos_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc) + 1e-6)
        angle = np.arccos(np.clip(cos_angle, -1, 1))
        return np.degrees(angle)

    def calculate_distance(self, point1, point2):
        """Compute the distance between two points."""
        return np.sqrt((point1.x - point2.x)**2 + (point1.y - point2.y)**2)

    def train(self, X_train, y_train):
        """
        Train the action classifier.

        Args:
            X_train: training features (n_samples, n_features)
            y_train: training labels (n_samples,), integer indices into action_classes
        """
        X_scaled = self.scaler.fit_transform(X_train)
        self.classifier.fit(X_scaled, y_train)
        self.is_trained = True

    def predict(self, landmarks):
        """Predict the action corresponding to a pose."""
        if not self.is_trained:
            return None
        features = self.extract_features(landmarks)
        if features is None:
            return None
        X_scaled = self.scaler.transform(features)
        prediction = self.classifier.predict(X_scaled)[0]
        confidence = np.max(self.classifier.predict_proba(X_scaled))
        return {
            'action': self.action_classes[prediction],
            'confidence': confidence
        }

# Usage example
def recognize_actions():
    import mediapipe as mp
    import cv2
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose()
    recognizer = ActionRecognizer()
    # The model must be trained on labeled data first
    # recognizer.train(X_train, y_train)
    cap = cv2.VideoCapture(0)
    while True:
        success, frame = cap.read()
        if not success:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(frame_rgb)
        if results.pose_landmarks:
            prediction = recognizer.predict(results.pose_landmarks.landmark)
            if prediction:
                text = f"{prediction['action']}: {prediction['confidence']:.2f}"
                cv2.putText(frame, text, (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow('Action Recognition', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    pose.close()
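The usage example above leaves training as a commented placeholder. One way to build a training set is to run feature extraction over video clips labeled with an action; a sketch where the clip file names and label mapping are hypothetical placeholders:
python
import cv2
import mediapipe as mp
import numpy as np

def build_training_set(recognizer, labeled_clips):
    """labeled_clips: list of (video_path, action_index) pairs."""
    X, y = [], []
    pose = mp.solutions.pose.Pose(static_image_mode=False)
    for video_path, label in labeled_clips:
        cap = cv2.VideoCapture(video_path)
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            if results.pose_landmarks:
                feats = recognizer.extract_features(results.pose_landmarks.landmark)
                if feats is not None:
                    X.append(feats.flatten())
                    y.append(label)
        cap.release()
    pose.close()
    return np.array(X), np.array(y)

# clips = [('standing_01.mp4', 0), ('walking_01.mp4', 1)]  # hypothetical files
# X_train, y_train = build_training_set(recognizer, clips)
# recognizer.train(X_train, y_train)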
Applications of Pose Estimation in LeBot
Motion Mimicry
LeBot can detect a human's pose and translate it into robot joint commands, imitating the human's motion.
python
import numpy as np

class RobotMotionMimicry:
    def __init__(self, robot_controller):
        self.robot = robot_controller
        self.pose_estimator = Pose3DEstimator()
        # Mapping from human keypoints to robot joints
        self.mapping = {
            'left_shoulder': 'left_shoulder_joint',
            'left_elbow': 'left_elbow_joint',
            'left_wrist': 'left_wrist_joint',
            'right_shoulder': 'right_shoulder_joint',
            'right_elbow': 'right_elbow_joint',
            'right_wrist': 'right_wrist_joint',
            'left_hip': 'left_hip_joint',
            'left_knee': 'left_knee_joint',
            'left_ankle': 'left_ankle_joint',
            'right_hip': 'right_hip_joint',
            'right_knee': 'right_knee_joint',
            'right_ankle': 'right_ankle_joint',
        }

    def calculate_angle(self, point1, point2, point3):
        """Compute the angle at point2 (degrees) between three 3D keypoint dicts."""
        a = np.array([point1['x'], point1['y'], point1['z']])
        b = np.array([point2['x'], point2['y'], point2['z']])
        c = np.array([point3['x'], point3['y'], point3['z']])
        ba, bc = a - b, c - b
        cos_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc) + 1e-6)
        return np.degrees(np.arccos(np.clip(cos_angle, -1, 1)))

    def human_to_robot_angle(self, human_angle, joint_name):
        """Convert a human joint angle to a robot joint angle."""
        # This must be calibrated against the actual robot's structure;
        # the linear scale-and-offset here is only an example
        scaling_factor = self.robot.get_joint_scale(joint_name)
        offset = self.robot.get_joint_offset(joint_name)
        robot_angle = human_angle * scaling_factor + offset
        return robot_angle

    def mimic_human_motion(self, frame):
        """Imitate human motion in real time."""
        landmarks_3d = self.pose_estimator.estimate_3d_pose(frame)
        if not landmarks_3d:
            return
        # Extract the human's key joint angles (elbow angles here)
        left_arm_angle = self.calculate_angle(
            landmarks_3d[11], landmarks_3d[13], landmarks_3d[15]
        )
        right_arm_angle = self.calculate_angle(
            landmarks_3d[12], landmarks_3d[14], landmarks_3d[16]
        )
        # Convert to robot angles
        robot_left_arm = self.human_to_robot_angle(left_arm_angle, 'left_elbow_joint')
        robot_right_arm = self.human_to_robot_angle(right_arm_angle, 'right_elbow_joint')
        # Send the commands to the robot
        self.robot.set_joint_angle('left_elbow_joint', robot_left_arm)
        self.robot.set_joint_angle('right_elbow_joint', robot_right_arm)
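Mapped angles can exceed a robot joint's physical range, so clamping commands before sending them is a common safeguard. A sketch with a hypothetical limits table (the values below are placeholders, not LeBot's actual specifications):
python
# Safety sketch: clamp mapped angles to per-joint limits before sending.
# These limits are hypothetical placeholders, not LeBot's real joint specs.
JOINT_LIMITS = {
    'left_elbow_joint': (0.0, 150.0),   # degrees
    'right_elbow_joint': (0.0, 150.0),
}

def clamp_joint_angle(joint_name, angle):
    """Keep a commanded angle within the joint's allowed range."""
    lo, hi = JOINT_LIMITS.get(joint_name, (-180.0, 180.0))
    return max(lo, min(hi, angle))

# robot.set_joint_angle('left_elbow_joint',
#                       clamp_joint_angle('left_elbow_joint', robot_left_arm))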
Summary
Pose estimation and tracking are key computer-vision techniques with a broad range of applications. With these techniques you can:
- Recognize actions - automatically identify what a person is doing
- Analyze motion - assess the movement quality of athletes or patients in detail
- Control robots - have a robot imitate human motion
- Build interactive systems - new human-computer interfaces driven by body movement
In the LeBot project, pose estimation and tracking form the foundation for motion learning and autonomous decision-making.
Recommended Resources
- MediaPipe Pose documentation: https://mediapipe.dev/solutions/pose
- OpenPose project: https://github.com/CMU-Perceptual-Computing-Lab/openpose
- 3D pose estimation survey: https://arxiv.org/abs/1405.0495
- Recent progress in action recognition: https://github.com/topics/action-recognition