Basic BootstrapFewShot Optimization
import dspy
from dspy.teleprompt import BootstrapFewShot
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Configure DSPy
lm = dspy.LM("openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
dspy.configure(lm=lm)
class BasicQA(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict("question -> answer")

    def forward(self, question):
        return self.generate_answer(question=question)

# Define exact match metric
def exact_match(example, pred, trace=None):
    """Check if prediction matches expected answer."""
    return example.answer.lower().strip() == pred.answer.lower().strip()

# Training data
trainset = [
    dspy.Example(question="What is 2+2?", answer="4").with_inputs("question"),
    dspy.Example(question="Capital of France?", answer="Paris").with_inputs("question"),
    dspy.Example(question="Who wrote Romeo and Juliet?", answer="Shakespeare").with_inputs("question"),
    dspy.Example(question="What is H2O?", answer="Water").with_inputs("question"),
    dspy.Example(question="How many continents?", answer="7").with_inputs("question"),
]

# Test data
testset = [
    dspy.Example(question="What is 3+3?", answer="6").with_inputs("question"),
    dspy.Example(question="Capital of Spain?", answer="Madrid").with_inputs("question"),
]

def bootstrap_optimize(program, trainset, max_demos=4):
    """Optimize the program using BootstrapFewShot."""
    optimizer = BootstrapFewShot(
        metric=exact_match,
        max_bootstrapped_demos=max_demos,
        max_labeled_demos=2
    )
    return optimizer.compile(program, trainset=trainset)

def evaluate(program, testset):
    """Evaluate program on test set."""
    correct = 0
    for example in testset:
        try:
            pred = program(question=example.question)
            if exact_match(example, pred):
                correct += 1
        except Exception as e:
            print(f"Error: {e}")
    return correct / len(testset) if testset else 0
# Run optimization
baseline = BasicQA()
optimized = bootstrap_optimize(BasicQA(), trainset, max_demos=4)
# Evaluate
baseline_score = evaluate(baseline, testset)
optimized_score = evaluate(optimized, testset)
print(f"Baseline accuracy: {baseline_score:.2%}")
print(f"Optimized accuracy: {optimized_score:.2%}")
print(f"Improvement: {optimized_score - baseline_score:+.2%}")
Key Concepts
The metric function receives example (ground truth) and pred (prediction). Normalize strings with .lower().strip() for robust matching.
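You can sanity-check a metric in isolation before handing it to an optimizer. A minimal sketch calling the exact_match function above directly on a hand-built prediction (the values are illustrative):

example = dspy.Example(question="What is 2+2?", answer="4").with_inputs("question")
pred = dspy.Prediction(answer=" 4 ")  # extra whitespace and casing are normalized away
print(exact_match(example, pred))  # True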
KNNFewShot for Context-Aware Selection
import dspy
from dspy.teleprompt import KNNFewShot
import os
from dotenv import load_dotenv
load_dotenv()
lm = dspy.LM("openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
dspy.configure(lm=lm)
class TopicClassifier(dspy.Module):
    def __init__(self):
        super().__init__()
        self.classify = dspy.Predict("text -> topic")

    def forward(self, text):
        return self.classify(text=text)
# Training data with diverse topics
trainset = [
    dspy.Example(text="Stock price increased after earnings", topic="finance").with_inputs("text"),
    dspy.Example(text="New study reveals vaccine effectiveness", topic="healthcare").with_inputs("text"),
    dspy.Example(text="Court ruled in favor of plaintiff", topic="legal").with_inputs("text"),
    dspy.Example(text="Quarterback threw touchdown pass", topic="sports").with_inputs("text"),
    dspy.Example(text="New iPhone features improved camera", topic="technology").with_inputs("text"),
    dspy.Example(text="Merger approved by shareholders", topic="finance").with_inputs("text"),
    dspy.Example(text="Patient recovered after surgery", topic="healthcare").with_inputs("text"),
    dspy.Example(text="Jury reached guilty verdict", topic="legal").with_inputs("text"),
]

# Test data
testset = [
    dspy.Example(text="Bitcoin price surged today", topic="finance").with_inputs("text"),
    dspy.Example(text="The team won the championship", topic="sports").with_inputs("text"),
]
def topic_match(example, pred, trace=None):
    """Check if predicted topic matches expected."""
    return example.topic.lower().strip() == pred.topic.lower().strip()

def create_knn_optimizer(k=3):
    """Create KNNFewShot optimizer."""
    # Note: some DSPy releases also expect an embedding function here,
    # e.g. vectorizer=dspy.Embedder("openai/text-embedding-3-small").
    return KNNFewShot(
        k=k,
        trainset=trainset
    )
def evaluate_classifier(classifier, testset):
    """Evaluate classifier accuracy."""
    correct = 0
    for example in testset:
        try:
            pred = classifier(text=example.text)
            if topic_match(example, pred):
                correct += 1
                print(f"✓ '{example.text[:40]}...' -> {pred.topic}")
            else:
                print(f"✗ '{example.text[:40]}...' -> {pred.topic} (expected: {example.topic})")
        except Exception as e:
            print(f"Error: {e}")
    return correct / len(testset) if testset else 0
# Create and compile
optimizer = create_knn_optimizer(k=3)
compiled = optimizer.compile(TopicClassifier(), trainset=trainset)
# Evaluate
print("=== KNNFewShot Results ===")
accuracy = evaluate_classifier(compiled, testset)
print(f"\nAccuracy: {accuracy:.2%}")
Key Concepts
KNNFewShot selects the k most similar examples for each query. For finance questions, it selects finance examples—making the prompts more relevant!
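To build intuition for what "most similar" means here, the sketch below implements the k-nearest-neighbor idea with a toy bag-of-words embedding and cosine similarity. DSPy's KNNFewShot uses a real text embedder internally, so treat this purely as an illustration of the selection step, not its actual implementation; toy_embed, cosine, and knn_demos are hypothetical helpers.

from collections import Counter
import math

def toy_embed(text):
    """Toy 'embedding': a bag-of-words count vector."""
    return Counter(text.lower().split())

def cosine(a, b):
    """Cosine similarity between two sparse count vectors."""
    dot = sum(a[w] * b[w] for w in a)
    norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0

def knn_demos(query, examples, k=3):
    """Pick the k training examples whose text is most similar to the query."""
    return sorted(examples, key=lambda ex: cosine(toy_embed(query), toy_embed(ex.text)), reverse=True)[:k]

# With this toy embedding, the finance example sharing the word "price" ranks first.
for demo in knn_demos("Bitcoin price surged today", trainset, k=3):
    print(demo.topic, "-", demo.text)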
MIPRO for Complex Reasoning
import dspy
from dspy.teleprompt import MIPROv2  # MIPROv2 supersedes the original MIPRO in current DSPy releases
import re
import os
from dotenv import load_dotenv

load_dotenv()
lm = dspy.LM("openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
dspy.configure(lm=lm)
class MathSolver(dspy.Module):
    def __init__(self):
        super().__init__()
        self.solve = dspy.ChainOfThought("problem -> answer")

    def forward(self, problem):
        result = self.solve(problem=problem)
        return dspy.Prediction(
            steps=result.reasoning,  # recent DSPy exposes the rationale as `reasoning` (older releases used `rationale`)
            answer=result.answer
        )
# Training data
trainset = [
    dspy.Example(
        problem="A rope is 12m long. Cut into 3 equal pieces. Length of each?",
        answer="4 meters"
    ).with_inputs("problem"),
    dspy.Example(
        problem="Train travels 60km in 1 hour. How far in 3 hours?",
        answer="180 km"
    ).with_inputs("problem"),
    dspy.Example(
        problem="Box has 8 rows of 5 apples each. Total apples?",
        answer="40 apples"
    ).with_inputs("problem"),
]

# Test data
testset = [
    dspy.Example(
        problem="John saves $200 per month. How much in 6 months?",
        answer="$1,200"
    ).with_inputs("problem"),
]
def extract_number(text):
    """Extract numerical value from text."""
    numbers = re.findall(r'[\d,]+(?:\.\d+)?', str(text))
    if numbers:
        return float(numbers[-1].replace(',', ''))
    return None

def math_metric(example, pred, trace=None):
    """Metric that compares numerical answers."""
    pred_num = extract_number(pred.answer)
    true_num = extract_number(example.answer)
    if pred_num is None or true_num is None:
        # Fall back to string comparison
        return example.answer.lower() in pred.answer.lower()
    # Allow small numerical tolerance
    return abs(pred_num - true_num) < 0.01
def mipro_optimize(program, trainset, num_candidates=10):
    """Optimize using MIPRO (the MIPROv2 class)."""
    optimizer = MIPROv2(
        metric=math_metric,
        auto=None,  # with auto disabled, num_candidates/num_trials set the search budget explicitly
        num_candidates=num_candidates,
        init_temperature=0.7,
        verbose=True
    )
    return optimizer.compile(
        program,
        trainset=trainset,
        num_trials=3,
        max_bootstrapped_demos=4
    )
# Optimize
print("=== MIPRO Optimization ===")
optimized = mipro_optimize(MathSolver(), trainset)
# Test
print("\n=== Testing ===")
for example in testset:
    result = optimized(problem=example.problem)
    print(f"Problem: {example.problem}")
    print(f"Reasoning: {result.steps}")
    print(f"Answer: {result.answer}")
    print(f"Expected: {example.answer}")
    print(f"Correct: {math_metric(example, result)}")
Key Concepts
MIPRO optimizes both instructions and demonstrations. The extract_number helper enables robust numerical comparison even when formats differ.
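For instance, the helper reduces differently formatted strings to the same number, so the model's phrasing does not decide correctness (inputs below are illustrative):

for text in ["$1,200", "1200 dollars", "about 1,200.00"]:
    print(text, "->", extract_number(text))  # each prints 1200.0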
Optimizer Comparison Benchmark
import dspy
from dspy.teleprompt import BootstrapFewShot, KNNFewShot, MIPROv2  # MIPROv2 replaces the older MIPRO
import time
import os
from dotenv import load_dotenv
load_dotenv()
lm = dspy.LM("openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
dspy.configure(lm=lm)
class SentimentAnalyzer(dspy.Module):
    def __init__(self):
        super().__init__()
        self.analyze = dspy.Predict("text -> sentiment")

    def forward(self, text):
        return self.analyze(text=text)

trainset = [
    dspy.Example(text="I love this product!", sentiment="positive").with_inputs("text"),
    dspy.Example(text="Terrible quality.", sentiment="negative").with_inputs("text"),
    dspy.Example(text="Works as expected.", sentiment="neutral").with_inputs("text"),
    dspy.Example(text="Outstanding service!", sentiment="positive").with_inputs("text"),
    dspy.Example(text="Would not recommend.", sentiment="negative").with_inputs("text"),
    dspy.Example(text="It's okay I guess.", sentiment="neutral").with_inputs("text"),
]

testset = [
    dspy.Example(text="Best purchase ever!", sentiment="positive").with_inputs("text"),
    dspy.Example(text="Complete waste of money.", sentiment="negative").with_inputs("text"),
    dspy.Example(text="Does the job.", sentiment="neutral").with_inputs("text"),
]

def sentiment_match(example, pred, trace=None):
    return example.sentiment.lower() == pred.sentiment.lower()

def evaluate(program, testset):
    correct = sum(1 for ex in testset
                  if sentiment_match(ex, program(text=ex.text)))
    return correct / len(testset)
def benchmark_optimizers(program_class, trainset, testset):
    """Compare multiple optimizers."""
    results = {}

    # Baseline
    print("Testing Baseline...")
    baseline = program_class()
    results["Baseline"] = {
        "accuracy": evaluate(baseline, testset),
        "time": 0
    }

    # BootstrapFewShot
    print("Testing BootstrapFewShot...")
    start = time.time()
    optimizer = BootstrapFewShot(metric=sentiment_match, max_bootstrapped_demos=4)
    compiled = optimizer.compile(program_class(), trainset=trainset)
    results["BootstrapFewShot"] = {
        "accuracy": evaluate(compiled, testset),
        "time": time.time() - start
    }

    # KNNFewShot
    print("Testing KNNFewShot...")
    start = time.time()
    optimizer = KNNFewShot(k=3, trainset=trainset)
    compiled = optimizer.compile(program_class(), trainset=trainset)
    results["KNNFewShot"] = {
        "accuracy": evaluate(compiled, testset),
        "time": time.time() - start
    }

    # MIPRO (auto="light" sets a small trial budget itself, so num_trials is not passed to compile)
    print("Testing MIPRO...")
    start = time.time()
    optimizer = MIPROv2(metric=sentiment_match, auto="light")
    compiled = optimizer.compile(program_class(), trainset=trainset)
    results["MIPRO"] = {
        "accuracy": evaluate(compiled, testset),
        "time": time.time() - start
    }

    return results
def print_comparison(results):
    """Print comparison report."""
    print("\n" + "=" * 50)
    print("OPTIMIZER COMPARISON REPORT")
    print("=" * 50)

    baseline_acc = results["Baseline"]["accuracy"]
    for name, data in results.items():
        print(f"\n{name}:")
        print(f"  Accuracy: {data['accuracy']:.1%}")
        print(f"  Time: {data['time']:.1f}s")
        if name != "Baseline":
            improvement = data['accuracy'] - baseline_acc
            print(f"  Improvement: {improvement:+.1%}")

    # Best performance
    best = max(results.items(), key=lambda x: x[1]['accuracy'])
    print(f"\n🏆 Best Performance: {best[0]}")
# Run benchmark
results = benchmark_optimizers(SentimentAnalyzer, trainset, testset)
print_comparison(results)
Key Concepts
This benchmark tracks both accuracy and compilation time. Note how MIPRO takes longer but often achieves the best results.
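Because compilation time is a one-time cost, the usual pattern is to save the winning compiled program and reload it at serving time instead of recompiling. A minimal sketch, assuming compiled holds a program returned by one of the optimizers' compile() calls (the file name is illustrative):

# Persist the optimized prompts/demos to disk.
compiled.save("sentiment_optimized.json")

# Later, rebuild the module and load the optimized state back in.
program = SentimentAnalyzer()
program.load("sentiment_optimized.json")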
Progressive Optimization Pipeline
import dspy
from dspy.teleprompt import BootstrapFewShot, KNNFewShot, MIPROv2  # MIPROv2 replaces the older MIPRO
import time
import os
from dotenv import load_dotenv
load_dotenv()
lm = dspy.LM("openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
dspy.configure(lm=lm)
def evaluate(program, valset, metric):
    """Evaluate program on validation set."""
    correct = sum(1 for ex in valset if metric(ex, program(**ex.inputs())))
    return correct / len(valset)
def progressive_optimize(program_class, trainset, valset, metric, target_accuracy=0.85):
    """
    Progressively optimize until target accuracy is reached.
    """
    stages = [
        {
            "name": "BootstrapFewShot",
            "optimizer": BootstrapFewShot(metric=metric, max_bootstrapped_demos=4),
            "config": {}
        },
        {
            "name": "KNNFewShot",
            "optimizer": KNNFewShot(k=3, trainset=trainset),
            "config": {}
        },
        {
            "name": "MIPRO",
            # auto="medium" chooses the trial budget itself, so no extra compile config is passed here.
            "optimizer": MIPROv2(metric=metric, auto="medium"),
            "config": {}
        }
    ]

    results = []
    best_program = None
    best_accuracy = 0

    # Baseline
    baseline = program_class()
    baseline_acc = evaluate(baseline, valset, metric)
    results.append({
        "stage": "Baseline",
        "accuracy": baseline_acc,
        "time": 0
    })
    print(f"Baseline: {baseline_acc:.1%}")

    for stage in stages:
        print(f"\nTrying {stage['name']}...")
        start = time.time()
        try:
            compiled = stage['optimizer'].compile(
                program_class(),
                trainset=trainset,
                **stage['config']
            )
            accuracy = evaluate(compiled, valset, metric)
            elapsed = time.time() - start
            results.append({
                "stage": stage['name'],
                "accuracy": accuracy,
                "time": elapsed
            })
            print(f"  Accuracy: {accuracy:.1%} (took {elapsed:.1f}s)")

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_program = compiled
                print("  ✓ New best!")

            # Early stopping if target reached
            if accuracy >= target_accuracy:
                print(f"\n🎯 Target accuracy {target_accuracy:.1%} reached!")
                break
        except Exception as e:
            print(f"  Error: {e}")

    return best_program, results
def analyze_optimization_roi(results):
    """Analyze return on investment."""
    print("\n" + "=" * 50)
    print("ROI ANALYSIS")
    print("=" * 50)

    baseline_acc = results[0]['accuracy']
    for i, result in enumerate(results[1:], 1):
        gain = result['accuracy'] - baseline_acc
        time_spent = result['time']
        if time_spent > 0:
            roi = gain / (time_spent / 60)  # Gain per minute
            print(f"\n{result['stage']}:")
            print(f"  Accuracy gain: {gain:+.1%}")
            print(f"  Time spent: {time_spent:.1f}s")
            print(f"  ROI: {roi:.2%} per minute")
# Example usage
class SimpleQA(dspy.Module):
    def __init__(self):
        super().__init__()
        self.answer = dspy.Predict("question -> answer")

    def forward(self, question):
        return self.answer(question=question)
trainset = [
    dspy.Example(question="What is 2+2?", answer="4").with_inputs("question"),
    dspy.Example(question="Capital of France?", answer="Paris").with_inputs("question"),
    # ... more examples
]

valset = [
    dspy.Example(question="What is 5+5?", answer="10").with_inputs("question"),
]

def metric(ex, pred, trace=None):
    return ex.answer.lower() in pred.answer.lower()
# Run progressive optimization
best, results = progressive_optimize(
SimpleQA, trainset, valset, metric, target_accuracy=0.80
)
# Analyze ROI
analyze_optimization_roi(results)
print("\n✅ Optimization complete!")
Key Concepts
Progressive optimization stops early when target is reached—saving time. ROI analysis helps identify which optimizer gives best "bang for buck."
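As a quick worked example of the ROI figure (numbers are illustrative): a gain of +10 accuracy points that took 120 seconds of compilation yields 0.10 / (120 / 60) = 0.05, which the report prints as "5.00% per minute".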
Congratulations! You've completed Chapter 5: Optimizers & Compilation. You now understand how to automatically optimize DSPy programs for maximum performance!