Detect Body and Hand Pose with Vision

Explore how the Vision framework can help your app detect body and hand poses in photos and video. With pose detection, your app can analyze the poses, movements, and gestures of people to offer new video editing possibilities, or to perform action classification when paired with an action classifier built in Create ML. And we’ll show you how you can bring gesture recognition into your app through hand pose, delivering a whole new form of interaction. To understand more about how you might apply body pose for Action Classification, be sure to also watch the "Build an Action Classifier with Create ML" and "Explore the Action & Vision app" sessions. And to learn more about other great features in Vision, check out the "Explore Computer Vision APIs" session.

Watch at developer.apple.com ↗

Transcript all transcripts

Code shown on screen · 10 snippets

HandPoseCameraViewController swift · at 7:07 ↗

extension CameraViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        var thumbTip: CGPoint?
        var indexTip: CGPoint?
        
        defer {
            DispatchQueue.main.sync {
                self.processPoints(thumbTip: thumbTip, indexTip: indexTip)
            }
        }

        let handler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .up, options: [:])
        do {
            // Perform VNDetectHumanHandPoseRequest
            try handler.perform([handPoseRequest])
            // Continue only when a hand was detected in the frame.
            // Since we set the maximumHandCount property of the request to 1, there will be at most one observation.
            guard let observation = handPoseRequest.results?.first as? VNRecognizedPointsObservation else {
                return
            }
            // Get points for thumb and index finger.
            let thumbPoints = try observation.recognizedPoints(forGroupKey: .handLandmarkRegionKeyThumb)
            let indexFingerPoints = try observation.recognizedPoints(forGroupKey: .handLandmarkRegionKeyIndexFinger)
            // Look for tip points.
            guard let thumbTipPoint = thumbPoints[.handLandmarkKeyThumbTIP], let indexTipPoint = indexFingerPoints[.handLandmarkKeyIndexTIP] else {
                return
            }
            // Ignore low confidence points.
            guard thumbTipPoint.confidence > 0.3 && indexTipPoint.confidence > 0.3 else {
                return
            }
            // Convert points from Vision coordinates to AVFoundation coordinates.
            thumbTip = CGPoint(x: thumbTipPoint.location.x, y: 1 - thumbTipPoint.location.y)
            indexTip = CGPoint(x: indexTipPoint.location.x, y: 1 - indexTipPoint.location.y)
        } catch {
            cameraFeedSession?.stopRunning()
            let error = AppError.visionError(error: error)
            DispatchQueue.main.async {
                error.displayInViewController(self)
            }
        }
    }
}

HandPoseProcessPointsPair swift · at 8:29 ↗

init(pinchMaxDistance: CGFloat = 40, evidenceCounterStateTrigger: Int = 3) {
        self.pinchMaxDistance = pinchMaxDistance
        self.evidenceCounterStateTrigger = evidenceCounterStateTrigger
    }
    
    func reset() {
        state = .unknown
        pinchEvidenceCounter = 0
        apartEvidenceCounter = 0
    }
    
    func processPointsPair(_ pointsPair: PointsPair) {
        lastProcessedPointsPair = pointsPair
        let distance = pointsPair.indexTip.distance(from: pointsPair.thumbTip)
        if distance < pinchMaxDistance {
            // Keep accumulating evidence for pinch state.
            pinchEvidenceCounter += 1
            apartEvidenceCounter = 0
            // Set new state based on evidence amount.
            state = (pinchEvidenceCounter >= evidenceCounterStateTrigger) ? .pinched : .possiblePinch
        } else {
            // Keep accumulating evidence for apart state.
            apartEvidenceCounter += 1
            pinchEvidenceCounter = 0
            // Set new state based on evidence amount.
            state = (apartEvidenceCounter >= evidenceCounterStateTrigger) ? .apart : .possibleApart
        }
    }

HandPoseHandleGestureStateChange swift · at 9:25 ↗

private func handleGestureStateChange(state: HandGestureProcessor.State) {
        let pointsPair = gestureProcessor.lastProcessedPointsPair
        var tipsColor: UIColor
        switch state {
        case .possiblePinch, .possibleApart:
            // We are in one of the "possible": states, meaning there is not enough evidence yet to determine
            // if we want to draw or not. For now, collect points in the evidence buffer, so we can add them
            // to a drawing path when required.
            evidenceBuffer.append(pointsPair)
            tipsColor = .orange
        case .pinched:
            // We have enough evidence to draw. Draw the points collected in the evidence buffer, if any.
            for bufferedPoints in evidenceBuffer {
                updatePath(with: bufferedPoints, isLastPointsPair: false)
            }
            // Clear the evidence buffer.
            evidenceBuffer.removeAll()
            // Finally, draw current point
            updatePath(with: pointsPair, isLastPointsPair: false)
            tipsColor = .green
        case .apart, .unknown:
            // We have enough evidence to not draw. Discard any evidence buffer points.
            evidenceBuffer.removeAll()
            // And draw the last segment of our draw path.
            updatePath(with: pointsPair, isLastPointsPair: true)
            tipsColor = .red
        }
        cameraView.showPoints([pointsPair.thumbTip, pointsPair.indexTip], color: tipsColor)
    }

HandPoseHandleGesture swift · at 10:15 ↗

@IBAction func handleGesture(_ gesture: UITapGestureRecognizer) {
        guard gesture.state == .ended else {
            return
        }
        evidenceBuffer.removeAll()
        drawPath.removeAllPoints()
        drawOverlay.path = drawPath.cgPath
    }

ActionVisionGameViewController swift · at 20:48 ↗

extension GameViewController: CameraViewControllerOutputDelegate {
    func cameraViewController(_ controller: CameraViewController, didReceiveBuffer buffer: CMSampleBuffer, orientation: CGImagePropertyOrientation) {
        let visionHandler = VNImageRequestHandler(cmSampleBuffer: buffer, orientation: orientation, options: [:])
        if self.gameManager.stateMachine.currentState is GameManager.TrackThrowsState {
            DispatchQueue.main.async {
                // Get the frame of rendered view
                let normalizedFrame = CGRect(x: 0, y: 0, width: 1, height: 1)
                self.jointSegmentView.frame = controller.viewRectForVisionRect(normalizedFrame)
                self.trajectoryView.frame = controller.viewRectForVisionRect(normalizedFrame)
            }
            // Perform the trajectory request in a separate dispatch queue
            trajectoryQueue.async {
                self.setUpDetectTrajectoriesRequest()
                do {
                    if let trajectoryRequest = self.detectTrajectoryRequest {
                        try visionHandler.perform([trajectoryRequest])
                    }
                } catch {
                    AppError.display(error, inViewController: self)
                }
            }
        }
        // Run bodypose request for additional GameConstants.maxPostReleasePoseObservations frames after the first trajectory observation is detected
        if !(self.trajectoryView.inFlight && self.trajectoryInFlightPoseObservations >= GameConstants.maxTrajectoryInFlightPoseObservations) {
            do {
                try visionHandler.perform([detectPlayerRequest])
                if let result = detectPlayerRequest.results?.first as? VNRecognizedPointsObservation {
                    let box = humanBoundingBox(for: result)
                    let boxView = playerBoundingBox
                    DispatchQueue.main.async {
                        let horizontalInset = CGFloat(-20.0)
                        let verticalInset = CGFloat(-20.0)
                        let viewRect = controller.viewRectForVisionRect(box).insetBy(dx: horizontalInset, dy: verticalInset)
                        self.updateBoundingBox(boxView, withRect: viewRect)
                        if !self.playerDetected && !boxView.isHidden {
                            self.gameStatusLabel.alpha = 0
                            self.resetTrajectoryRegions()
                            self.gameManager.stateMachine.enter(GameManager.DetectedPlayerState.self)
                        }
                    }
                }
            } catch {
                AppError.display(error, inViewController: self)
            }
        } else {
            // Hide player bounding box
            DispatchQueue.main.async {
                if !self.playerBoundingBox.isHidden {
                    self.playerBoundingBox.isHidden = true
                    self.jointSegmentView.resetView()
                }
            }
        }
    }
}

ActionVisionHumanBoundingBox swift · at 21:19 ↗

func humanBoundingBox(for observation: VNRecognizedPointsObservation) -> CGRect {
        var box = CGRect.zero
        // Process body points only if the confidence is high
        guard observation.confidence > 0.6 else {
            return box
        }
        var normalizedBoundingBox = CGRect.null
        guard let points = try? observation.recognizedPoints(forGroupKey: .all) else {
            return box
        }
        for (_, point) in points {
            // Only use point if human pose joint was detected reliably
            guard point.confidence > 0.1 else { continue }
            normalizedBoundingBox = normalizedBoundingBox.union(CGRect(origin: point.location, size: .zero))
        }
        if !normalizedBoundingBox.isNull {
            box = normalizedBoundingBox
        }
        // Fetch body joints from the observation and overlay them on the player
        DispatchQueue.main.async {
            let joints = getBodyJointsFor(observation: observation)
            self.jointSegmentView.joints = joints
        }
        // Store the body pose observation in playerStats when the game is in TrackThrowsState
        // We will use these observations for action classification once the throw is complete
        if gameManager.stateMachine.currentState is GameManager.TrackThrowsState {
            playerStats.storeObservation(observation)
            if trajectoryView.inFlight {
                trajectoryInFlightPoseObservations += 1
            }
        }
        return box
    }

ActionVisionStoreObservation swift · at 21:58 ↗

mutating func storeObservation(_ observation: VNRecognizedPointsObservation) {
        if poseObservations.count >= GameConstants.maxPoseObservations {
            poseObservations.removeFirst()
        }
        poseObservations.append(observation)
    }

ActionVisionGetLastThrowType swift · at 22:21 ↗

mutating func getLastThrowType() -> ThrowType {
        let actionClassifier = PlayerActionClassifier().model
        guard let poseMultiArray = prepareInputWithObservations(poseObservations) else {
            return ThrowType.none
        }
        let input = PlayerActionClassifierInput(input: poseMultiArray)
        guard let predictions = try? actionClassifier.prediction(from: input),
            let output = predictions.featureValue(for: "output")?.multiArrayValue,
                let outputBuffer = try? UnsafeBufferPointer<Float32>(output) else {
            return ThrowType.none
        }
        let probabilities = Array(outputBuffer)
        guard let maxConfidence = probabilities.prefix(3).max(), let maxIndex = probabilities.firstIndex(of: maxConfidence) else {
            return ThrowType.none
        }
        let throwTypes = ThrowType.allCases
        return throwTypes[maxIndex]
    }

ActionVisionPrepareInputWithObservations swift · at 22:42 ↗

func prepareInputWithObservations(_ observations: [VNRecognizedPointsObservation]) -> MLMultiArray? {
    let numAvailableFrames = observations.count
    let observationsNeeded = 60
    var multiArrayBuffer = [MLMultiArray]()

    // swiftlint:disable identifier_name
    for f in 0 ..< min(numAvailableFrames, observationsNeeded) {
        let pose = observations[f]
        do {
            let oneFrameMultiArray = try pose.keypointsMultiArray()
            multiArrayBuffer.append(oneFrameMultiArray)
        } catch {
            continue
        }
    }
    
    // If poseWindow does not have enough frames (60) yet, we need to pad 0s
    if numAvailableFrames < observationsNeeded {
        for _ in 0 ..< (observationsNeeded - numAvailableFrames) {
            do {
                let oneFrameMultiArray = try MLMultiArray(shape: [1, 3, 18], dataType: .double)
                try resetMultiArray(oneFrameMultiArray)
                multiArrayBuffer.append(oneFrameMultiArray)
            } catch {
                continue
            }
        }
    }
    return MLMultiArray(concatenating: [MLMultiArray](multiArrayBuffer), axis: 0, dataType: MLMultiArrayDataType.double)
}

ActionVisionGetLastThrowType2 swift · at 23:19 ↗

mutating func getLastThrowType() -> ThrowType {
        let actionClassifier = PlayerActionClassifier().model
        guard let poseMultiArray = prepareInputWithObservations(poseObservations) else {
            return ThrowType.none
        }
        let input = PlayerActionClassifierInput(input: poseMultiArray)
        guard let predictions = try? actionClassifier.prediction(from: input),
            let output = predictions.featureValue(for: "output")?.multiArrayValue,
                let outputBuffer = try? UnsafeBufferPointer<Float32>(output) else {
            return ThrowType.none
        }
        let probabilities = Array(outputBuffer)
        guard let maxConfidence = probabilities.prefix(3).max(), let maxIndex = probabilities.firstIndex(of: maxConfidence) else {
            return ThrowType.none
        }
        let throwTypes = ThrowType.allCases
        return throwTypes[maxIndex]
    }

Resources

Sample code Detecting Hand Poses with Vision
Documentation Vision

Integrate with motorized iPhone stands using DockKit

WWDC23

18 min
Explore 3D body pose and person segmentation in Vision

WWDC23

15 min
Classify hand poses and actions with Create ML

WWDC21 · 5 snippets

27 min
Detect people, faces, and poses using Vision

WWDC21 · 6 snippets

18 min
Build an Action Classifier with Create ML

WWDC20 · 6 snippets

26 min
Explore the Action & Vision app

WWDC20

36 min
Explore Computer Vision APIs

WWDC20 · 4 snippets

25 min

Code shown on screen · 10 snippets

Resources

Related sessions

Integrate with motorized iPhone stands using DockKit

Explore 3D body pose and person segmentation in Vision

Classify hand poses and actions with Create ML

Detect people, faces, and poses using Vision

Build an Action Classifier with Create ML

Explore the Action & Vision app

Explore Computer Vision APIs