2026 AI & Machine Learning
WWDC26 · 26 min · AI & Machine Learning
Meet the Evaluations framework
Learn how to evaluate model-driven experiences using the Evaluations framework. In a probabilistic world, unit tests alone won’t suffice. Discover how to define metrics, automatically grade outputs, and aggregate statistics to ensure your AI-powered features perform reliably across Apple’s platforms.
Watch at developer.apple.com ↗
Chapters
- 0:00 — Introduction
- 3:10 — Demo app Book Tracker: a manual evaluation
- 4:31 — Building your first evaluation
- 8:06 — Running the evaluation and reading the report
- 10:57 — Building robust datasets
- 14:20 — Refining metrics and evaluators
- 15:41 — Evaluation-driven development and hill-climbing
- 16:12 — Model judges: qualitative metrics
- 18:42 — Building a model judge
- 21:19 — Refining with score dimensions
- 23:45 — Reviewing dimension results
- 24:20 — Best practices
- 25:38 — Next steps
Code shown on screen · 13 snippets
Define an Evaluation
// Evaluations
import Evaluations
/// The book-tagging evaluation type. It conforms to `Evaluation`; its members
/// are left empty in this snippet.
struct BookTaggingEvaluation: Evaluation {}
Run with Swift Testing and an optimization target
// Optimization Target

/// Swift Testing entry point for the book-tagging evaluation.
///
/// The `.evaluates(_:info:)` trait attaches the evaluation to this test. The body
/// reads the result from the current `EvaluationContext` and asserts on the
/// aggregated `tagCount` metric.
@Test("Book Tag Evaluations", .evaluates(evaluation, info: evaluationInfo))
func evaluateBookTagging() async throws {
    let result = EvaluationContext.current.result
    let rangeMetric = BookTagEvaluationTests.evaluation.tagCount

    // Optimization target: the mean of the tag-count metric must be at least 0.8.
    #expect(result.aggregateValue(.mean(of: rangeMetric)) >= 0.8)
}
Constrain output with a Generable @Guide
// BookTags.swift
/// Structured output of the tagging feature: the list of tags generated for a book.
@Generable
struct BookTags: Codable {
    /// The generated tags. The guide describes what they should capture and
    /// constrains the array to between 3 and 8 elements.
    @Guide(description: "Descriptive tags capturing themes, genres, moods, and topics from the summary", .count(3...8))
    var tags: [String]
}
Define the dataset with ModelSample
// BookTaggingEvaluation
// Hand-written samples: each pairs a reader's review (the prompt) with the tags
// expected for that book.
var dataset = ArrayLoader(
    samples: [
        ModelSample(
            prompt: "okay I am OBSESSED and I need everyone to read this RIGHT NOW...",
            expected: BookTags(tags: ["classic", "romance", "wit", "regency"])
        ),
        ModelSample(
            prompt: "Read this in one sitting between midnight and 4am and I cannot...",
            expected: BookTags(tags: ["classic", "gothic", "horror", "vampire", "suspense"])
        ),
    ]
)
// Or load your whole library:
var dataset = ArrayLoader(
    samples: Book.sampleBooks.map {
        ModelSample(prompt: $0.review, expected: BookTags(tags: $0.tags))
    }
)
Synthesize more samples with a SampleGenerator
// Synthesizing more inputs
// Seed samples used as the starting point for synthesis. Declared `var` because
// the generated samples are appended to this same array below.
var samples: [ModelSample<String>] = [
    ModelSample(prompt: "The largest planet in our solar system...", expected: "Jupiter."),
    ModelSample(prompt: "The capital of Thailand...", expected: "Bangkok."),
    ModelSample(prompt: "Swift is...", expected: "a powerful programming language."),
    ModelSample(prompt: "All those moments will be lost in time...", expected: "Like tears in rain.")
]

// Iterate the async sequence returned by `makeSamples`, passing the generation
// prompt and a target count of 1000, and collect every produced sample.
for try await sample in samples.makeSamples(
    """
    Generate diverse sentence completions about the listed topics:
    - The Solar System
    - World Capitals
    """,
    targetCount: 1000
) {
    samples.append(sample)
}
More evaluators: word count and genre
let wordCount = Metric("WordCount")

// Fails on the first tag that contains a space; passes when no tag does.
Evaluator { _, subject in
    guard let tag = subject.value.tags.first(where: { $0.contains(" ") }) else {
        return wordCount.passing()
    }
    return wordCount.failing(rationale: "Tag \(tag) contains multiple words")
}

let hasGenreTag = Metric("HasGenreTag")

// Passes on the first lowercased tag found in the service's known genres;
// fails when none of the tags match.
Evaluator { _, subject in
    let tags = subject.value.tags.map { $0.lowercased() }
    let knownGenres = await BookTaggingService.knownGenres
    if let tag = tags.first(where: { knownGenres.contains($0) }) {
        return hasGenreTag.passing(rationale: "Matched \(tag)")
    }
    return hasGenreTag.failing()
}
Define a Metric and Evaluator
let tagCount = Metric("TagCount")

/// The evaluators for this evaluation; here a single tag-count check.
var evaluators: Evaluators {
    // Tag count is within the required 3–8 range
    Evaluator { _, subject in
        let count = subject.value.tags.count
        guard (3...8).contains(count) else {
            return tagCount.failing(rationale: "Got \(count) tags, expected 3–8")
        }
        return tagCount.passing(rationale: "\(count) tags")
    }
}
Aggregate metrics across samples
let tagCount = Metric("TagCount")
let tagTotal = Metric("TagTotal")

/// Registers the statistics to compute over the collected metrics.
///
/// - Parameter aggregator: The aggregator that receives the requested statistics.
func aggregateMetrics(using aggregator: inout MetricsAggregator) {
    // Mean of the tag-count metric, registered outside any group.
    aggregator.computeMean(of: tagCount)

    // Statistics over the tag-total metric, registered under one named group.
    aggregator.group("Distribution of Tag Totals") { group in
        group.computeStandardDeviation(of: tagTotal)
        group.computeMean(of: tagTotal)
        group.computeVariance(of: tagTotal)
    }
}
Iterate the feature's instructions (hill-climbing)
// BookTaggingService.swift
// Instructions for the tagging feature. The rules state the tag-count range,
// the tag format, and the genre requirement; the known-genre list is
// interpolated into the text.
let instructions = Instructions {
    """
    You are a librarian and literary analyst. Given a reader's
    freeform summary of a book they read — describing their
    thoughts, feelings, and what stood out — generate a set of
    descriptive tags reflected in the summary.
    Rules:
    - Return between 3 and 8 tags.
    - Tags should be lowercase, concise (single word or hyphenated), and descriptive.
    - Tags should include the book's genre, chosen from the included list of known genres.
    Known Genres:
    - \(Self.knownGenres.joined(separator: ", "))
    """
}
Build a model judge
// A model judge named "TagQuality" that scores on a 1–4 numeric scale, with a
// description for each score.
ModelJudgeEvaluator(
    "TagQuality",
    scale: .numeric([
        4: "Tags are relevant and helpful for browsing",
        3: "Mostly relevant, one tag too vague or generic",
        2: "Several tags are wrong or generic",
        1: "Unhelpful or irrelevant"
    ]),
    judge: PrivateCloudComputeLanguageModel()
)
Split into score dimensions
// BookTaggingEvaluation.swift
// A "Relevance" score dimension: a description of what is being judged plus a
// 1–4 numeric scale with a label for each score.
// NOTE(review): presumably bound to a `relevance` constant that the judge's
// `dimensions:` argument refers to — confirm against the full source file.
ScoreDimension(
"Relevance",
description: """
Whether each tag describes a quality, theme, or tone
of the book itself rather than incidental details or
the reader's personal reactions.
""",
scale: .numeric([
4: "Every tag describes the book itself",
3: "Most tags describe the book",
2: "Some tags describe personal reactions",
1: "Tags don't meaningfully describe the book"
])
)
// Define `usefulness` the same way as a second ScoreDimension.
Add dimensions to the judge
// BookTaggingEvaluation.swift
/// The evaluators for this evaluation: three evaluators whose bodies are left
/// empty in this snippet, followed by a model judge that scores the
/// `relevance` and `usefulness` dimensions.
var evaluators: Evaluators {
    Evaluator { }
    Evaluator { }
    Evaluator { }

    ModelJudgeEvaluator(
        judge: PrivateCloudComputeLanguageModel(),
        dimensions: [relevance, usefulness]
    )
}
Add app context with a ModelJudgePrompt
// BookTaggingEvaluation.swift
// Model judge with a custom prompt: app-specific instructions, a rendering of
// the generated tags, and the sample's expected tags as reference material.
ModelJudgeEvaluator(
    judge: PrivateCloudComputeLanguageModel(),
    dimensions: [relevance, usefulness],
    prompt: ModelJudgePrompt(
        instructions: """
            You are evaluating tags generated for a personal book-tracking app where users
            organize their library by browsing and filtering tags.
            """,
        // Renders the tag count followed by the comma-separated tags.
        evaluationTarget: { value in
            "\(value.tags.count) Generated tags: " + value.tags.joined(separator: ", ")
        },
        // Supplies the expected tags, or a fallback string when the sample has none.
        reference: { input, _ in
            ["Expected Tags": input.expected?.tags.joined(separator: ", ") ?? "No expected tags defined"]
        }
    )
)