2025 AI & Machine Learning
WWDC25 · 20 min · AI & Machine Learning
Explore large language models on Apple silicon with MLX
Discover MLX LM – designed specifically to make working with large language models simple and efficient on Apple silicon. We’ll cover how to fine-tune and run inference on state-of-the-art large language models on your Mac, and how to seamlessly integrate them into Swift-based applications and projects.
Watch at developer.apple.com ↗Chapters
Code shown on screen · 16 snippets
Running DeepSeek AI's model with MLX LM
mlx_lm.chat --model mlx-community/DeepSeek-V3-0324-4bit Text generation with MLX LM
mlx_lm.generate --model "mlx-community/Mistral-7B-Instruct-v0.3-4bit" \
--prompt "Write a quick sort in Swift" Changing the model's behavior with flags
mlx_lm.generate --model "mlx-community/Mistral-7B-Instruct-v0.3-4bit" \
--prompt "Write a quick sort in Swift" \
--top-p 0.5 \
--temp 0.2 \
--max-tokens 1024 Getting help for MLX LM
mlx_lm.generate --help MLX LM Python API
# Using MLX LM from Python
from mlx_lm import load, generate
# Load the model and tokenizer directly from HF
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
# Prepare the prompt for the model
prompt = "Write a quick sort in Swift"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
messages, add_generation_prompt=True
)
# Generate the text
text = generate(model, tokenizer, prompt=prompt, verbose=True) Inspecting model architecture
from mlx_lm import load, generate
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
print(model)
print(model.parameters())
print(model.layers[0].self_attn) Generation with KV cache
from mlx_lm import load, generate
from mlx_lm.models.cache import make_prompt_cache
# Load the model and tokenizer directly from HF
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
# Prepare the prompt for the model
prompt = "Write a quick sort in Swift"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
messages, add_generation_prompt=True
)
cache = make_prompt_cache(model)
# Generate the text
text = generate(model, tokenizer, prompt=prompt, prompt_cache=cache, verbose=True) Quantization
mlx_lm.convert --hf-path "mistralai/Mistral-7B-Instruct-v0.3" \
--mlx-path "./mistral-7b-v0.3-4bit" \
--dtype float16 \
--quantize --q-bits 4 --q-group-size 64 Model quantization with MLX LM in Python
from mlx_lm.convert import convert
# We can choose a different quantization per layer
def mixed_quantization(layer_path, layer, model_config):
    """Choose per-layer quantization settings for a model conversion.

    Args:
        layer_path: Name/path of the layer; tested for the substrings
            "lm_head" and "embed_tokens".
        layer: The layer object; tested for a `to_quantized` attribute.
        model_config: Model configuration; not consulted by this predicate.

    Returns:
        {"bits": 6, "group_size": 64} for lm_head / embed_tokens layers,
        {"bits": 4, "group_size": 64} for any other layer that exposes
        `to_quantized`, and False otherwise (presumably telling the caller
        to leave that layer unquantized -- confirm against mlx_lm.convert).
    """
    # The output head and the embeddings get the higher-precision setting.
    if "lm_head" in layer_path or "embed_tokens" in layer_path:
        return {"bits": 6, "group_size": 64}
    elif hasattr(layer, "to_quantized"):
        return {"bits": 4, "group_size": 64}
    else:
        return False
# Convert can be used to change precision, quantize and upload models to HF
convert(
hf_path="mistralai/Mistral-7B-Instruct-v0.3",
mlx_path="./mistral-7b-v0.3-mixed-4-6-bit",
quantize=True,
quant_predicate=mixed_quantization
) Model fine-tuning
mlx_lm.lora --model "mlx-community/Mistral-7B-Instruct-v0.3-4bit" \
--train \
--data /path/to/our/data/folder \
--iters 300 \
--batch-size 16 Prompting before fine-tuning
mlx_lm.generate --model "./mistral-7b-v0.3-4bit" \
--prompt "Who won the latest super bowl?" Fine-tuning to learn new knowledge
mlx_lm.lora --model "./mistral-7b-v0.3-4bit" \
--train \
--data ./data \
--iters 300 \
--batch-size 8 \
--mask-prompt \
--learning-rate 1e-5 Prompting after fine-tuning
mlx_lm.generate --model "mlx-community/Mistral-7B-Instruct-v0.3-4bit" \
--prompt "Who won the latest super bowl?" \
--adapter-path "adapters" Fusing models
mlx_lm.fuse --model "mlx-community/Mistral-7B-Instruct-v0.3-4bit" \
--adapter-path "path/to/trained/adapters" \
--save-path "fused-mistral-7b-v0.3-4bit" \
--upload-repo "my-name/fused-mistral-7b-v0.3-4bit"
# Fusing our fine-tuned model adapters
mlx_lm.fuse --model "./mistral-7b-v0.3-4bit" \
--adapter-path "adapters" \
--save-path "fused-mistral-7b-v0.3-4bit" LLMs in MLX Swift
import Foundation
import MLX
import MLXLMCommon
import MLXLLM
/// Command-line entry point that loads a model and prints one generated completion.
@main
struct LLM {
    /// Loads the model container, prepares a single prompt, and prints each
    /// generated chunk as it arrives from the stream.
    static func main() async throws {
        // Fetch the model and tokenizer directly from HF
        let modelID = "mlx-community/Mistral-7B-Instruct-v0.3-4bit"
        let container = try await LLMModelFactory.shared.loadContainer(
            configuration: ModelConfiguration(id: modelID)
        )

        try await container.perform { context in
            // Turn the user prompt into model input
            let userInput = UserInput(prompt: "Write a quicksort in Swift")
            let input = try await context.processor.prepare(input: userInput)

            // Stream the generated text
            let parameters = GenerateParameters(temperature: 0.0)
            let stream = try generate(input: input, parameters: parameters, context: context)
            for await part in stream {
                print(part.chunk ?? "", terminator: "")
            }
        }
    }
} Generation with KV cache in MLX Swift
import Foundation
import MLX
import MLXLMCommon
import MLXLLM
/// Command-line entry point that generates text through an explicitly created
/// key-value cache and a low-level token iterator.
@main
struct LLM {
    /// Loads the model container, builds a cache and `TokenIterator` for one
    /// prompt, and prints each generated chunk as it arrives from the stream.
    static func main() async throws {
        // Fetch the model and tokenizer directly from HF
        let modelID = "mlx-community/Mistral-7B-Instruct-v0.3-4bit"
        let container = try await LLMModelFactory.shared.loadContainer(
            configuration: ModelConfiguration(id: modelID)
        )

        try await container.perform { context in
            // Turn the user prompt into model input
            let userInput = UserInput(prompt: "Write a quicksort in Swift")
            let input = try await context.processor.prepare(input: userInput)

            // Build the key-value cache from the same parameters the iterator uses
            let parameters = GenerateParameters()
            let kvCache = context.model.newCache(parameters: parameters)

            // Low-level token iterator driving the generation
            let iterator = try TokenIterator(
                input: input,
                model: context.model,
                cache: kvCache,
                parameters: parameters
            )

            let stream = generate(input: input, context: context, iterator: iterator)
            for await part in stream {
                print(part.chunk ?? "", terminator: "")
            }
        }
    }
} Resources
Related sessions
-
19 min