Exercise 1: Creating a Quality Dataset
This solution demonstrates how to create a balanced, diverse dataset with labeled edge cases and a reproducible 60/20/20 train/dev/test split.
import dspy
import random
def create_sentiment_dataset():
    examples = []

    # 1. POSITIVE EXAMPLES (10+)
    positive_texts = [
        "This product is amazing! Best purchase ever!",
        "Absolutely love the quality.",
        "Great customer service, very helpful.",
        "Fast shipping and item as described.",
        "Exceeded my expectations entirely.",
        "I would recommend this to everyone.",
        "Fantastic value for the money.",
        "Works perfectly out of the box.",
        "Simple to use and effective.",
        "Five stars, no complaints!"
    ]
    for text in positive_texts:
        examples.append(dspy.Example(text=text, sentiment="positive", confidence=0.9).with_inputs("text"))

    # 2. NEGATIVE EXAMPLES (10+)
    negative_texts = [
        "Terrible quality, broke immediately.",
        "Waste of money, do not buy.",
        "Customer service was rude and unhelpful.",
        "Shipping took forever and arrived damaged.",
        "Completely different from the description.",
        "Not worth the price tag.",
        "Stopped working after one day.",
        "Frustrating to set up.",
        "Poor design choices everywhere.",
        "I want a refund."
    ]
    for text in negative_texts:
        examples.append(dspy.Example(text=text, sentiment="negative", confidence=0.9).with_inputs("text"))

    # 3. NEUTRAL EXAMPLES (5+)
    neutral_texts = [
        "It's okay, nothing special.",
        "Average product for the price.",
        "Does the job, but has flaws.",
        "Delivered on time.",
        "It is what it is."
    ]
    for text in neutral_texts:
        examples.append(dspy.Example(text=text, sentiment="neutral", confidence=0.5).with_inputs("text"))

    # 4. EDGE CASES (5+)
    edge_cases = [
        ("Great, another broken item.", "negative", 0.8),     # Sarcasm
        ("Food was good, service was bad.", "neutral", 0.6),  # Mixed
        ("Is this worth it?", "neutral", 0.3),                # Question
        ("meh", "neutral", 0.4),                              # Short
        ("LOVE IT!!! ❤️", "positive", 0.95),                  # Emoji/formatting
    ]
    for text, sent, conf in edge_cases:
        examples.append(dspy.Example(text=text, sentiment=sent, confidence=conf).with_inputs("text"))

    # 5. SHUFFLE (fixed seed for reproducibility)
    random.seed(42)
    random.shuffle(examples)

    # 6. SPLIT (60/20/20)
    n = len(examples)
    train_end = int(n * 0.6)
    dev_end = int(n * 0.8)
    trainset = examples[:train_end]
    devset = examples[train_end:dev_end]
    testset = examples[dev_end:]

    return trainset, devset, testset
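A minimal usage sketch for the function above; the Counter summary is only there to inspect split sizes and label balance (it assumes the 30 examples defined above):

from collections import Counter

trainset, devset, testset = create_sentiment_dataset()

# Sanity-check split sizes and label distribution in each split
for name, split in [("train", trainset), ("dev", devset), ("test", testset)]:
    labels = Counter(ex.sentiment for ex in split)
    print(f"{name}: {len(split)} examples, labels: {dict(labels)}")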
Exercise 2: Designing a Custom Metric
This metric combines weighted sub-scores (correctness, completeness, conciseness, format) and handles the trace parameter so it can also serve as a pass/fail criterion during optimization.
def qa_quality_metric(example, pred, trace=None):
    # 1. Correctness (40%)
    # Simple inclusion check (could use semantic comparison in production)
    expected = example.answer.lower()
    actual = pred.answer.lower()
    correctness = 1.0 if expected in actual else 0.0

    # 2. Completeness (30%)
    # Check if key concepts from the expected answer are present
    key_terms = expected.split()
    found_terms = sum(1 for term in key_terms if term in actual)
    completeness = found_terms / len(key_terms) if key_terms else 1.0

    # 3. Conciseness (20%)
    # Ideal length between 10 and 100 words
    word_count = len(actual.split())
    if 10 <= word_count <= 100:
        conciseness = 1.0
    elif word_count < 5 or word_count > 200:
        conciseness = 0.0
    else:
        conciseness = 0.5

    # 4. Format (10%)
    # Check for basic punctuation
    format_score = 1.0 if actual.strip().endswith(('.', '!', '?')) else 0.0

    # Weighted sum
    final_score = (
        0.4 * correctness +
        0.3 * completeness +
        0.2 * conciseness +
        0.1 * format_score
    )

    # Trace handling
    if trace is not None:
        # Optimization goal: high quality (>= 0.7) counts as a pass
        return final_score >= 0.7

    return final_score
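A quick usage sketch for the metric; the example and the hand-built dspy.Prediction below are illustrative stand-ins, not part of the exercise:

import dspy

example = dspy.Example(
    question="What is the capital of France?",
    answer="Paris"
).with_inputs("question")

pred = dspy.Prediction(answer="The capital of France is Paris.")

print(qa_quality_metric(example, pred))             # evaluation mode: float in [0, 1]
print(qa_quality_metric(example, pred, trace=[]))   # optimization mode: bool (score >= 0.7)

During optimization (for example with dspy.BootstrapFewShot(metric=qa_quality_metric)), DSPy passes a non-None trace, so the metric returns a pass/fail bool rather than a float score.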
Exercise 3: Systematic Evaluation
A comprehensive evaluation function that reports the average score, categorizes errors, and surfaces the best and worst predictions for inspection.
def comprehensive_evaluation(module, devset, metric):
    results = {
        'total_score': 0.0,
        'count': 0,
        'scores': [],
        'errors': [],
        'best_examples': [],
        'worst_examples': []
    }
    detailed_results = []

    for idx, example in enumerate(devset):
        try:
            # Predict
            pred = module(**example.inputs())

            # Score
            score = metric(example, pred)

            # Store
            results['total_score'] += score
            results['count'] += 1
            results['scores'].append(score)
            detailed_results.append({
                'example': example,
                'pred': pred,
                'score': score
            })

            # Error analysis categories
            if score < 0.5:
                error_type = "low_quality"
                if len(pred.answer) == 0:
                    error_type = "empty_response"
                results['errors'].append({
                    'index': idx,
                    'type': error_type,
                    'input': example.question,
                    'expected': example.answer,
                    'actual': pred.answer
                })
        except Exception as e:
            results['errors'].append({
                'index': idx,
                'type': 'exception',
                'message': str(e)
            })

    # Averages
    if results['count'] > 0:
        results['average_score'] = results['total_score'] / results['count']
    else:
        results['average_score'] = 0.0

    # Sort for best/worst
    sorted_res = sorted(detailed_results, key=lambda x: x['score'], reverse=True)
    results['best_examples'] = sorted_res[:3]
    results['worst_examples'] = sorted_res[-3:]

    return results
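A usage sketch under stated assumptions: qa is a hypothetical QA module, an LM has already been configured with dspy.configure(lm=...), and devset is a list of dspy.Example objects with question/answer fields (not the sentiment splits from Exercise 1):

# Hypothetical QA module; assumes dspy.configure(lm=...) was called earlier
qa = dspy.Predict("question -> answer")

report = comprehensive_evaluation(qa, devset, qa_quality_metric)

print(f"Average score: {report['average_score']:.2f} over {report['count']} predictions")
print(f"Errors logged: {len(report['errors'])}")
for item in report['worst_examples']:
    print(f"  {item['score']:.2f} -> {item['example'].question}")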