2020 AI & Machine Learning
WWDC20 · 24 min · AI & Machine Learning
Detect Body and Hand Pose with Vision
Explore how the Vision framework can help your app detect body and hand poses in photos and video. With pose detection, your app can analyze the poses, movements, and gestures of people to offer new video editing possibilities, or to perform action classification when paired with an action classifier built in Create ML. And we’ll show you how you can bring gesture recognition into your app through hand pose, delivering a whole new form of interaction. To understand more about how you might apply body pose for Action Classification, be sure to also watch the "Build an Action Classifier with Create ML" and "Explore the Action & Vision app" sessions. And to learn more about other great features in Vision, check out the "Explore Computer Vision APIs" session.
Watch at developer.apple.com ↗
Code shown on screen · 10 snippets
HandPoseCameraViewController
extension CameraViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
/// Runs the hand pose request on a captured frame and reports the thumb and index fingertip locations.
///
/// `processPoints(thumbTip:indexTip:)` is invoked synchronously on the main queue on every exit
/// path. Both arguments stay `nil` unless this frame produced an observation whose thumb tip and
/// index tip each have a confidence above 0.3.
public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
    var detectedThumbTip: CGPoint?
    var detectedIndexTip: CGPoint?
    defer {
        // Runs on early returns and after the catch block as well, so the UI is updated for every frame.
        DispatchQueue.main.sync {
            self.processPoints(thumbTip: detectedThumbTip, indexTip: detectedIndexTip)
        }
    }

    // Flips a point's y component (1 - y) to go from Vision coordinates to AVFoundation coordinates.
    func flippedVertically(_ location: CGPoint) -> CGPoint {
        return CGPoint(x: location.x, y: 1 - location.y)
    }

    let requestHandler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .up, options: [:])
    do {
        // Perform VNDetectHumanHandPoseRequest.
        try requestHandler.perform([handPoseRequest])

        // Stop here when no hand was found in the frame. Only the first observation is used;
        // the request is expected to be configured with maximumHandCount = 1 elsewhere.
        guard let handObservation = handPoseRequest.results?.first as? VNRecognizedPointsObservation else {
            return
        }

        // Fetch the point groups for the thumb and the index finger.
        let thumbGroup = try handObservation.recognizedPoints(forGroupKey: .handLandmarkRegionKeyThumb)
        let indexGroup = try handObservation.recognizedPoints(forGroupKey: .handLandmarkRegionKeyIndexFinger)

        // Both tips must be present, and both must clear the confidence threshold.
        guard
            let thumbTipPoint = thumbGroup[.handLandmarkKeyThumbTIP],
            let indexTipPoint = indexGroup[.handLandmarkKeyIndexTIP],
            thumbTipPoint.confidence > 0.3,
            indexTipPoint.confidence > 0.3
        else {
            return
        }

        detectedThumbTip = flippedVertically(thumbTipPoint.location)
        detectedIndexTip = flippedVertically(indexTipPoint.location)
    } catch {
        // A Vision failure stops the camera session and surfaces the error on the main queue.
        cameraFeedSession?.stopRunning()
        let visionError = AppError.visionError(error: error)
        DispatchQueue.main.async {
            visionError.displayInViewController(self)
        }
    }
}
} HandPoseProcessPointsPair
/// Creates a gesture processor.
///
/// - Parameters:
///   - pinchMaxDistance: Fingertip distance below which a points pair counts as pinch evidence. Defaults to 40.
///   - evidenceCounterStateTrigger: Evidence count at which a "possible" state is promoted to `.pinched` or `.apart`. Defaults to 3.
init(pinchMaxDistance: CGFloat = 40, evidenceCounterStateTrigger: Int = 3) {
    self.pinchMaxDistance = pinchMaxDistance
    self.evidenceCounterStateTrigger = evidenceCounterStateTrigger
}
/// Puts the processor back into the `.unknown` state and zeroes both evidence counters.
func reset() {
    state = .unknown
    pinchEvidenceCounter = 0
    apartEvidenceCounter = 0
}
/// Records the latest fingertip pair and updates the gesture state from the distance between the tips.
///
/// A distance below `pinchMaxDistance` counts as pinch evidence; anything else counts as apart
/// evidence. Each kind of evidence resets the other counter, and the state only becomes
/// `.pinched` / `.apart` once the matching counter reaches `evidenceCounterStateTrigger`.
///
/// - Parameter pointsPair: The thumb tip and index tip to evaluate.
func processPointsPair(_ pointsPair: PointsPair) {
    lastProcessedPointsPair = pointsPair

    let tipSeparation = pointsPair.indexTip.distance(from: pointsPair.thumbTip)
    let isPinchEvidence = tipSeparation < pinchMaxDistance

    if isPinchEvidence {
        // One more frame in favor of a pinch; any apart evidence is discarded.
        pinchEvidenceCounter += 1
        apartEvidenceCounter = 0
        if pinchEvidenceCounter >= evidenceCounterStateTrigger {
            state = .pinched
        } else {
            state = .possiblePinch
        }
    } else {
        // One more frame in favor of the tips being apart; any pinch evidence is discarded.
        apartEvidenceCounter += 1
        pinchEvidenceCounter = 0
        if apartEvidenceCounter >= evidenceCounterStateTrigger {
            state = .apart
        } else {
            state = .possibleApart
        }
    }
} HandPoseHandleGestureStateChange
/// Updates the drawing path and the fingertip overlay color for a new gesture state.
///
/// - Parameter state: The gesture state to react to.
private func handleGestureStateChange(state: HandGestureProcessor.State) {
    let latestPair = gestureProcessor.lastProcessedPointsPair
    let tipsColor: UIColor

    switch state {
    case .possiblePinch, .possibleApart:
        // A "possible" state means there is not enough evidence yet to decide whether to draw.
        // Hold the points in the evidence buffer so they can be added to the path if a pinch
        // is confirmed.
        evidenceBuffer.append(latestPair)
        tipsColor = .orange

    case .pinched:
        // Drawing is confirmed: first flush whatever was buffered while the state was undecided.
        for bufferedPair in evidenceBuffer {
            updatePath(with: bufferedPair, isLastPointsPair: false)
        }
        evidenceBuffer.removeAll()
        // Then add the current pair.
        updatePath(with: latestPair, isLastPointsPair: false)
        tipsColor = .green

    case .apart, .unknown:
        // Not drawing: drop any buffered points and close the current path segment.
        evidenceBuffer.removeAll()
        updatePath(with: latestPair, isLastPointsPair: true)
        tipsColor = .red
    }

    cameraView.showPoints([latestPair.thumbTip, latestPair.indexTip], color: tipsColor)
} HandPoseHandleGesture
/// Clears the evidence buffer and the drawn path once the tap gesture has ended.
@IBAction func handleGesture(_ gesture: UITapGestureRecognizer) {
    // Ignore every gesture phase except the final one.
    if gesture.state != .ended {
        return
    }
    evidenceBuffer.removeAll()
    drawPath.removeAllPoints()
    // Push the now-empty path to the overlay so the drawing disappears.
    drawOverlay.path = drawPath.cgPath
} ActionVisionGameViewController
extension GameViewController: CameraViewControllerOutputDelegate {
/// Processes one camera frame: runs the trajectory request while the game is in `TrackThrowsState`,
/// and runs the player detection request unless enough in-flight pose observations were already counted.
///
/// - Parameters:
///   - controller: The camera view controller; used here to convert Vision rects to view rects.
///   - buffer: The sample buffer for the frame to analyze.
///   - orientation: The image orientation passed on to the Vision request handler.
func cameraViewController(_ controller: CameraViewController, didReceiveBuffer buffer: CMSampleBuffer, orientation: CGImagePropertyOrientation) {
// The same request handler is used for both the trajectory request and the player request below.
let visionHandler = VNImageRequestHandler(cmSampleBuffer: buffer, orientation: orientation, options: [:])
if self.gameManager.stateMachine.currentState is GameManager.TrackThrowsState {
DispatchQueue.main.async {
// Get the frame of rendered view
// The full normalized rect (0,0,1,1) is converted to view coordinates and applied to both overlay views.
let normalizedFrame = CGRect(x: 0, y: 0, width: 1, height: 1)
self.jointSegmentView.frame = controller.viewRectForVisionRect(normalizedFrame)
self.trajectoryView.frame = controller.viewRectForVisionRect(normalizedFrame)
}
// Perform the trajectory request in a separate dispatch queue
trajectoryQueue.async {
self.setUpDetectTrajectoriesRequest()
do {
// Nothing is performed when detectTrajectoryRequest is nil.
if let trajectoryRequest = self.detectTrajectoryRequest {
try visionHandler.perform([trajectoryRequest])
}
} catch {
AppError.display(error, inViewController: self)
}
}
}
// Keep running the body pose request unless the trajectory is in flight and
// GameConstants.maxTrajectoryInFlightPoseObservations pose observations have already been counted.
if !(self.trajectoryView.inFlight && self.trajectoryInFlightPoseObservations >= GameConstants.maxTrajectoryInFlightPoseObservations) {
do {
try visionHandler.perform([detectPlayerRequest])
// Only the first result is used.
if let result = detectPlayerRequest.results?.first as? VNRecognizedPointsObservation {
let box = humanBoundingBox(for: result)
let boxView = playerBoundingBox
DispatchQueue.main.async {
// Negative insets grow the converted rect by 20 points on every side.
let horizontalInset = CGFloat(-20.0)
let verticalInset = CGFloat(-20.0)
let viewRect = controller.viewRectForVisionRect(box).insetBy(dx: horizontalInset, dy: verticalInset)
self.updateBoundingBox(boxView, withRect: viewRect)
// First time the player box becomes visible: hide the status label, reset the
// trajectory regions and move the state machine to DetectedPlayerState.
if !self.playerDetected && !boxView.isHidden {
self.gameStatusLabel.alpha = 0
self.resetTrajectoryRegions()
self.gameManager.stateMachine.enter(GameManager.DetectedPlayerState.self)
}
}
}
} catch {
AppError.display(error, inViewController: self)
}
} else {
// Hide player bounding box
DispatchQueue.main.async {
if !self.playerBoundingBox.isHidden {
self.playerBoundingBox.isHidden = true
self.jointSegmentView.resetView()
}
}
}
}
} ActionVisionHumanBoundingBox
/// Computes the bounding box around the reliably detected joints of a body pose observation.
///
/// Besides computing the box, this also schedules a joint overlay update on the main queue and,
/// while the game is in `TrackThrowsState`, stores the observation in `playerStats` (counting it
/// as an in-flight observation when the trajectory is in flight). None of that happens when the
/// method returns early.
///
/// - Parameter observation: The body pose observation to evaluate.
/// - Returns: The union of all recognized points with a confidence above 0.1, in the coordinates
///   reported by the observation. `CGRect.zero` is returned when the observation's confidence is
///   0.6 or lower, when its points cannot be read, or when no point clears the threshold.
func humanBoundingBox(for observation: VNRecognizedPointsObservation) -> CGRect {
    // Bail out for low-confidence observations or when the points cannot be fetched.
    guard observation.confidence > 0.6,
          let points = try? observation.recognizedPoints(forGroupKey: .all) else {
        return .zero
    }

    // Grow a rect, starting from the null rect, over every joint that was detected reliably.
    var jointsRect = CGRect.null
    for (_, point) in points where point.confidence > 0.1 {
        jointsRect = jointsRect.union(CGRect(origin: point.location, size: .zero))
    }
    let box = jointsRect.isNull ? CGRect.zero : jointsRect

    // Fetch the body joints from the observation and overlay them on the player.
    DispatchQueue.main.async {
        self.jointSegmentView.joints = getBodyJointsFor(observation: observation)
    }

    // While throws are being tracked, keep the observation so it can be used for action
    // classification once the throw is complete.
    if gameManager.stateMachine.currentState is GameManager.TrackThrowsState {
        playerStats.storeObservation(observation)
        if trajectoryView.inFlight {
            trajectoryInFlightPoseObservations += 1
        }
    }
    return box
} ActionVisionStoreObservation
/// Appends a pose observation, dropping the oldest stored one first when the window is already full.
///
/// - Parameter observation: The observation to add to `poseObservations`.
mutating func storeObservation(_ observation: VNRecognizedPointsObservation) {
    let windowIsFull = poseObservations.count >= GameConstants.maxPoseObservations
    if windowIsFull {
        poseObservations.removeFirst()
    }
    poseObservations.append(observation)
} ActionVisionGetLastThrowType
/// Runs the action classifier on the stored pose observations and returns the predicted throw type.
///
/// - Returns: The case of `ThrowType.allCases` at the position of the largest of the first three
///   output values. `ThrowType.none` is returned when the input cannot be prepared, the
///   prediction fails, or the "output" feature cannot be read as `Float32` values.
mutating func getLastThrowType() -> ThrowType {
    let classifierModel = PlayerActionClassifier().model
    guard let poseWindow = prepareInputWithObservations(poseObservations) else {
        return ThrowType.none
    }

    let classifierInput = PlayerActionClassifierInput(input: poseWindow)
    guard let prediction = try? classifierModel.prediction(from: classifierInput) else {
        return ThrowType.none
    }
    guard let outputArray = prediction.featureValue(for: "output")?.multiArrayValue else {
        return ThrowType.none
    }
    guard let outputBuffer = try? UnsafeBufferPointer<Float32>(outputArray) else {
        return ThrowType.none
    }

    // Only the first three values take part in the comparison.
    let scores = Array(outputBuffer)
    guard let bestScore = scores.prefix(3).max(), let bestIndex = scores.firstIndex(of: bestScore) else {
        return ThrowType.none
    }
    return ThrowType.allCases[bestIndex]
} ActionVisionPrepareInputWithObservations
/// Builds the classifier input by concatenating up to 60 per-frame keypoint arrays along axis 0.
///
/// At most the first 60 observations are used. Frames whose keypoints cannot be converted are
/// skipped. Zero-filled frames are then appended until the window holds 60 frames, so skipped
/// frames are padded as well, not only missing ones.
///
/// - Parameter observations: The pose observations to convert, in the order they should appear.
/// - Returns: The concatenated multi-array of type `.double`.
func prepareInputWithObservations(_ observations: [VNRecognizedPointsObservation]) -> MLMultiArray? {
    let observationsNeeded = 60
    var multiArrayBuffer = [MLMultiArray]()
    multiArrayBuffer.reserveCapacity(observationsNeeded)

    // NOTE(review): directive kept as-is; it may still be relied on by code after this function.
    // swiftlint:disable identifier_name
    for pose in observations.prefix(observationsNeeded) {
        do {
            let oneFrameMultiArray = try pose.keypointsMultiArray()
            multiArrayBuffer.append(oneFrameMultiArray)
        } catch {
            // Best effort: a frame that cannot be converted is left out and padded below.
            continue
        }
    }

    // Pad with zero frames up to the full window. The count is taken from the frames actually
    // collected, so frames skipped above no longer leave the window short.
    // `prefix` guarantees multiArrayBuffer.count <= observationsNeeded, so the range is valid.
    for _ in multiArrayBuffer.count ..< observationsNeeded {
        do {
            let oneFrameMultiArray = try MLMultiArray(shape: [1, 3, 18], dataType: .double)
            try resetMultiArray(oneFrameMultiArray)
            multiArrayBuffer.append(oneFrameMultiArray)
        } catch {
            continue
        }
    }

    return MLMultiArray(concatenating: multiArrayBuffer, axis: 0, dataType: .double)
} ActionVisionGetLastThrowType2
/// Runs the action classifier on the stored pose observations and returns the predicted throw type.
///
/// - Returns: The case of `ThrowType.allCases` at the position of the largest of the first three
///   output values. `ThrowType.none` is returned when the input cannot be prepared, the
///   prediction fails, or the "output" feature cannot be read as `Float32` values.
mutating func getLastThrowType() -> ThrowType {
    let classifierModel = PlayerActionClassifier().model
    guard let poseWindow = prepareInputWithObservations(poseObservations) else {
        return ThrowType.none
    }

    let classifierInput = PlayerActionClassifierInput(input: poseWindow)
    guard let prediction = try? classifierModel.prediction(from: classifierInput) else {
        return ThrowType.none
    }
    guard let outputArray = prediction.featureValue(for: "output")?.multiArrayValue else {
        return ThrowType.none
    }
    guard let outputBuffer = try? UnsafeBufferPointer<Float32>(outputArray) else {
        return ThrowType.none
    }

    // Only the first three values take part in the comparison.
    let scores = Array(outputBuffer)
    guard let bestScore = scores.prefix(3).max(), let bestIndex = scores.firstIndex(of: bestScore) else {
        return ThrowType.none
    }
    return ThrowType.allCases[bestIndex]
} Resources
Related sessions
-
18 min -
15 min -
27 min -
18 min -
26 min -
36 min -
25 min