2026 · AI & Machine Learning / Audio & Video
WWDC26 · 17 min · AI & Machine Learning / Audio & Video
Meet the Music Understanding framework
Discover Music Understanding, a new framework that lets your app analyze audio across six dimensions, on device: key, rhythm, structure, pace, instrument activity, and loudness. And use the Music Understanding Lab sample app to visualize each result.
Watch at developer.apple.com ↗
Chapters
Code shown on screen · 18 snippets
Initialize the session
import MusicUnderstanding
// Presents a file importer restricted to audio content. When a file is picked, the
// closure builds an AVURLAsset from its URL, creates a MusicUnderstandingSession for
// that asset, and runs the analysis.
// NOTE(review): only the `.success` case is handled in this excerpt; an exhaustive
// `switch` over the result also needs a `.failure` case — confirm against the full sample.
// NOTE(review): the body uses `try await`, so it presumably runs inside an async,
// throwing context (e.g. a Task with error handling) that this excerpt omits — confirm.
.fileImporter(isPresented: $isPresented, allowedContentTypes: [.audio]) { result in
switch result {
case .success(let url):
// Ask for precise duration and timing when the asset is loaded.
let asset = AVURLAsset(url: url,
options: [AVURLAssetPreferPreciseDurationAndTimingKey : true])
let session = try await MusicUnderstandingSession(asset: asset)
// Runs the analysis without an explicit selection of analyses; the outcome is bound to `results`.
let results = try await session.analyze()
}
} Inside SessionResult
import MusicUnderstanding
/// Aggregate result type holding one optional result per analysis:
/// instrument activity, key, loudness, pace, rhythm, and structure.
///
/// Every property is optional.
/// NOTE(review): presumably a property is `nil` when that analysis was not requested
/// or produced no result — confirm against the framework documentation.
public struct SessionResult: Codable, Sendable {
public let instrumentActivity: InstrumentActivityResult?
public let key: KeyResult?
public let loudness: LoudnessResult?
public let pace: PaceResult?
public let rhythm: RhythmResult?
public let structure: StructureResult?
} TimedValue
import MusicUnderstanding
/// A generic pairing of a single point in time with a value.
///
/// `Value` must itself be `Codable`, `Equatable`, and `Sendable`.
/// Other snippets in this listing refer to this type as
/// `MusicUnderstandingSession.TimedValue`.
public struct TimedValue<Value>: Codable, Equatable, Sendable
where Value: Codable & Equatable & Sendable {
/// The point in time the value is associated with.
public let time: CMTime
/// The value associated with `time`.
public let value: Value
} RangedValue
import MusicUnderstanding
/// A generic pairing of a time range with a value.
///
/// `Value` must itself be `Codable`, `Equatable`, and `Sendable`.
/// Other snippets in this listing refer to this type as
/// `MusicUnderstandingSession.RangedValue`.
public struct RangedValue<Value>: Codable, Equatable, Sendable
where Value: Codable & Equatable & Sendable {
/// The time range the value is associated with.
public let range: CMTimeRange
/// The value associated with `range`.
public let value: Value
} Key analysis
/// Result of the key analysis: a list of time ranges, each paired with the
/// `KeySignature` reported for that range.
public struct KeyResult: Codable, Sendable {
    /// Time ranges and the key signature associated with each one.
    public let ranges: [MusicUnderstandingSession.RangedValue<KeySignature>]
} KeySignature
/// A key signature, expressed as a tonic combined with a mode.
public struct KeySignature: Codable, Hashable, Sendable {
/// The tonic of the key (see `Tonic`).
public let tonic: Tonic
/// The mode of the key (see `Mode`).
public let mode: Mode
} Using tonic
/// The tonic of a key signature, backed by a `String` raw value.
///
/// The cases cover the seven natural notes (`a` through `g`), five flat spellings
/// (`aFlat`, `bFlat`, `dFlat`, `eFlat`, `gFlat`) and five sharp spellings
/// (`aSharp`, `cSharp`, `dSharp`, `fSharp`, `gSharp`).
/// No explicit raw values are given, so each raw value is the case name.
/// The enum is declared `@frozen`.
@frozen public enum Tonic: String, Codable, Hashable, Sendable {
case aFlat, aSharp, a, bFlat, b, c, cSharp, d, dFlat, dSharp, eFlat, e, f, fSharp, g, gFlat, gSharp
} Using mode
/// The mode of a key signature: `major` or `minor`.
///
/// Backed by a `String` raw value; no explicit raw values are given, so each raw
/// value is the case name.
public enum Mode: String, Codable, Hashable, Sendable {
case major, minor
} Rhythm analysis
import MusicUnderstanding
/// Result of the rhythm analysis: two lists of time points and an optional tempo figure.
public struct RhythmResult: Codable, Sendable {
/// Time points for beats. NOTE(review): presumably one entry per detected beat — confirm.
public let beats: [CMTime]
/// Time points for bars. NOTE(review): presumably the start of each bar — confirm.
public let bars: [CMTime]
/// Beats per minute; optional, so it may be absent.
public let beatsPerMinute: Float?
} StructureResult
import MusicUnderstanding
/// Result of the structure analysis: three lists of time ranges named
/// `sections`, `segments`, and `phrases`.
/// NOTE(review): how the three levels relate to each other (e.g. nesting or
/// granularity) is not shown in this listing — confirm against the documentation.
public struct StructureResult: Codable, Sendable {
public let sections: [CMTimeRange]
public let segments: [CMTimeRange]
public let phrases: [CMTimeRange]
} Analyzing pace
import MusicUnderstanding
/// Result of the pace analysis: a list of time ranges, each paired with a `Double`.
/// NOTE(review): the unit and scale of the pace value are not shown in this
/// listing — confirm before using it in calculations.
public struct PaceResult: Codable, Sendable {
public let ranges: [MusicUnderstandingSession.RangedValue<Double>]
} InstrumentActivityResult
import MusicUnderstanding
/// Result of the instrument-activity analysis, keyed by `Instrument`
/// (a type declared outside this listing).
public struct InstrumentActivityResult: Codable, Sendable {
/// For each instrument, a list of time ranges.
/// NOTE(review): presumably the ranges in which the instrument is active — confirm.
public let ranges: [Instrument: [CMTimeRange]]
/// For each instrument, a list of timed `Float` values.
/// NOTE(review): the scale of the activity value is not shown here — confirm.
public let activity: [Instrument: [MusicUnderstandingSession.TimedValue<Float>]]
} LoudnessResult
import MusicUnderstanding
/// Result of the loudness analysis.
///
/// `integrated` and `peak` are each a single timed value; `momentary` and
/// `shortTerm` are arrays of timed values.
/// NOTE(review): the measurement units of the `Float` values are not shown in
/// this listing — confirm against the framework documentation.
public struct LoudnessResult: Codable, Sendable {
public let integrated: MusicUnderstandingSession.TimedValue<Float>
public let momentary: [MusicUnderstandingSession.TimedValue<Float>]
public let shortTerm: [MusicUnderstandingSession.TimedValue<Float>]
public let peak: MusicUnderstandingSession.TimedValue<Float>
} Streaming API for loudness
import MusicUnderstanding
/// An asynchronous sequence whose elements are `LoudnessResult` values and whose
/// failure type is `any Error`; the opaque sequence type is also `Sendable`.
/// The streaming example in this listing reads it from a session and consumes it
/// with `for try await`.
public var loudnessResults: some AsyncSequence<LoudnessResult, any Error> & Sendable Streaming API for loudness
import MusicUnderstanding
// Streams loudness while the analysis runs: one child task consumes
// `session.loudnessResults`, the other drives an analysis restricted to loudness.
let audioProvider = AudioProvider()
let session = MusicUnderstandingSession(audioProvider: audioProvider)
// NOTE(review): the group body never awaits its children explicitly, so an error
// thrown by either child task is not rethrown to the caller — confirm this is intended.
await withThrowingTaskGroup(of: Void.self) { group in
    // Consumer: forward loudness values to the UI as results arrive.
    group.addTask {
        for try await result in await session.loudnessResults {
            // `LoudnessResult.momentary` is an array of timed values, so forward
            // the value of every sample carried by this result.
            for sample in result.momentary {
                updateAudioLevel(sample.value)
            }
        }
    }
    // Producer: run the analysis, requesting loudness only.
    group.addTask {
        try await session.analyze(for: [.loudness])
    }
} Audio Provider
import MusicUnderstanding
/// Template for a custom audio source. The type serves as both the asynchronous
/// sequence and its own iterator, vending `AVReadOnlyAudioPCMBuffer` values.
struct AudioProvider: AsyncSequence, AsyncIteratorProtocol {
    /// The provider is its own iterator, so iteration starts from a copy of `self`.
    func makeAsyncIterator() -> Self { self }

    /// Supplies the next buffer of audio.
    /// Placeholder: the implementation is left to the adopter.
    mutating func next() async -> AVReadOnlyAudioPCMBuffer? {
        // Return the next audio buffer, or nil to signal completion
    }
} Encode to JSON
import MusicUnderstanding
// Analyzes the asset and serializes the results as JSON.
// `asset` is expected to be defined by the surrounding code; it is not declared in this snippet.
let session = try await MusicUnderstandingSession(asset: asset)
let results = try await session.analyze()
let encoder = JSONEncoder()
// Keep the encoded bytes: `encode(_:)` returns the JSON as `Data`, which would otherwise be discarded.
let jsonData = try encoder.encode(results) Suggestion for using pace
// Derives a per-clip duration by dividing 60 by the pace value.
// NOTE(review): presumably `paceValue` is a `value` taken from `PaceResult.ranges`
// (a `Double`); its units, and whether it can be zero, are not shown here — confirm.
let timePerClip = 60 / paceValue