Chapter 4 · Section 11

Solutions

Detailed implementations for the evaluation exercises.

Exercise 1: Creating a Quality Dataset

This solution demonstrates how to create a balanced, diverse dataset with edge cases and proper splitting.

import dspy
import random

def create_sentiment_dataset():
    examples = []
    
    # 1. POSITIVE EXAMPLES (10+)
    positive_texts = [
        "This product is amazing! Best purchase ever!",
        "Absolutely love the quality.",
        "Great customer service, very helpful.",
        "Fast shipping and item as described.",
        "Exceeded my expectations entirely.",
        "I would recommend this to everyone.",
        "Fantastic value for the money.",
        "Works perfectly out of the box.",
        "Simple to use and effective.",
        "Five stars, no complaints!"
    ]
    for text in positive_texts:
        examples.append(dspy.Example(text=text, sentiment="positive", confidence=0.9).with_inputs("text"))

    # 2. NEGATIVE EXAMPLES (10+)
    negative_texts = [
        "Terrible quality, broke immediately.",
        "Waste of money, do not buy.",
        "Customer service was rude and unhelpful.",
        "Shipping took forever and arrived damaged.",
        "Completely different from the description.",
        "Not worth the price tag.",
        "Stopped working after one day.",
        "Frustrating to set up.",
        "Poor design choices everywhere.",
        "I want a refund."
    ]
    for text in negative_texts:
        examples.append(dspy.Example(text=text, sentiment="negative", confidence=0.9).with_inputs("text"))

    # 3. NEUTRAL EXAMPLES (5+)
    neutral_texts = [
        "It's okay, nothing special.",
        "Average product for the price.",
        "Does the job, but has flaws.",
        "Delivered on time.",
        "It is what it is."
    ]
    for text in neutral_texts:
        examples.append(dspy.Example(text=text, sentiment="neutral", confidence=0.5).with_inputs("text"))

    # 4. EDGE CASES (5+)
    edge_cases = [
        ("Great, another broken item.", "negative", 0.8), # Sarcasm
        ("Food was good, service was bad.", "neutral", 0.6), # Mixed
        ("Is this worth it?", "neutral", 0.3), # Question
        ("meh", "neutral", 0.4), # Short
        ("LOVE IT!!! ๐Ÿ˜", "positive", 0.95), # Emoji/formatting
    ]
    for text, sent, conf in edge_cases:
        examples.append(dspy.Example(text=text, sentiment=sent, confidence=conf).with_inputs("text"))

    # 5. SHUFFLE
    random.seed(42)
    random.shuffle(examples)

    # 6. SPLIT (60/20/20)
    n = len(examples)
    train_end = int(n * 0.6)
    dev_end = int(n * 0.8)

    trainset = examples[:train_end]
    devset = examples[train_end:dev_end]
    testset = examples[dev_end:]

    return trainset, devset, testset
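
A quick sanity check confirms the split sizes and label balance. This is a minimal sketch that assumes create_sentiment_dataset from above is already defined in the session.

from collections import Counter

trainset, devset, testset = create_sentiment_dataset()

# 30 examples split 60/20/20 -> 18 / 6 / 6
print(len(trainset), len(devset), len(testset))

# Label distribution per split (the fixed seed keeps this reproducible)
for name, split in [("train", trainset), ("dev", devset), ("test", testset)]:
    print(name, Counter(ex.sentiment for ex in split))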

Exercise 2: Designing a Custom Metric

This metric combines weighted sub-scores into a single value and handles the trace parameter so the same function returns a float score during evaluation and a pass/fail boolean during optimization.

def qa_quality_metric(example, pred, trace=None):
    # 1. Correctness (40%)
    # Simple inclusion check (could use semantic comparison in production)
    expected = example.answer.lower()
    actual = pred.answer.lower()
    correctness = 1.0 if expected in actual else 0.0

    # 2. Completeness (30%)
    # Check if key concepts from answer are present
    key_terms = expected.split()
    found_terms = sum(1 for term in key_terms if term in actual)
    completeness = found_terms / len(key_terms) if key_terms else 1.0

    # 3. Conciseness (20%)
    # Ideal length between 10 and 100 words
    word_count = len(actual.split())
    if 10 <= word_count <= 100:
        conciseness = 1.0
    elif word_count < 5 or word_count > 200:
        conciseness = 0.0
    else:
        conciseness = 0.5

    # 4. Format (10%)
    # Check for basic punctuation
    format_score = 1.0 if actual.strip().endswith(('.', '!', '?')) else 0.0

    # Weighted Sum
    final_score = (
        0.4 * correctness + 
        0.3 * completeness + 
        0.2 * conciseness + 
        0.1 * format_score
    )

    # Trace Handling
    if trace is not None:
        # Optimization goal: high quality (score >= 0.7)
        return final_score >= 0.7
        
    return final_score
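
To exercise both return modes, the metric can be run against a hand-built example and prediction. The question/answer pair below is made up purely for illustration, and dspy.Prediction is used here only to simulate a module's output (this assumes dspy is imported as in Exercise 1).

example = dspy.Example(
    question="What is the capital of France?",
    answer="Paris"
).with_inputs("question")
pred = dspy.Prediction(answer="The capital of France is Paris.")

# Evaluation mode: returns the weighted float score (about 0.9 here, since
# only conciseness is penalized because the answer is under 10 words)
print(qa_quality_metric(example, pred))

# Optimization mode: a non-None trace triggers the boolean threshold check
print(qa_quality_metric(example, pred, trace=[]))  # True, since 0.9 >= 0.7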

Exercise 3: Systematic Evaluation

A comprehensive evaluation function that records per-example scores, categorizes failures, and surfaces the best and worst predictions for error analysis.

def comprehensive_evaluation(module, devset, metric):
    results = {
        'total_score': 0.0,
        'count': 0,
        'scores': [],
        'errors': [],
        'best_examples': [],
        'worst_examples': []
    }
    
    detailed_results = []

    for idx, example in enumerate(devset):
        try:
            # Predict
            pred = module(**example.inputs())
            
            # Score
            score = metric(example, pred)
            
            # Store
            results['total_score'] += score
            results['count'] += 1
            results['scores'].append(score)
            
            detailed_results.append({
                'example': example, 
                'pred': pred, 
                'score': score
            })

            # Error Analysis Categories
            if score < 0.5:
                error_type = "low_quality"
                if not pred.answer.strip():
                    error_type = "empty_response"
                results['errors'].append({
                    'index': idx,
                    'type': error_type,
                    'input': example.question,
                    'expected': example.answer,
                    'actual': pred.answer
                })

        except Exception as e:
            results['errors'].append({
                'index': idx,
                'type': 'exception',
                'message': str(e)
            })

    # Averages (guard against an empty devset)
    if results['count'] > 0:
        results['average_score'] = results['total_score'] / results['count']
    else:
        results['average_score'] = 0.0
    
    # Sort for best/worst
    sorted_res = sorted(detailed_results, key=lambda x: x['score'], reverse=True)
    results['best_examples'] = sorted_res[:3]
    results['worst_examples'] = sorted_res[-3:]

    return results
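
The helper can be wired up end to end as follows. This is a sketch rather than part of the exercise solution: qa_devset is a placeholder for a QA-style dev split whose examples expose question and answer fields (the sentiment dataset from Exercise 1 would not fit this metric), and an LM is assumed to have been configured already, e.g. via dspy.settings.configure(lm=...).

import dspy

# Build a simple QA module from a signature string
qa_module = dspy.Predict("question -> answer")

# qa_devset: placeholder list of dspy.Example(question=..., answer=...) items
report = comprehensive_evaluation(qa_module, qa_devset, qa_quality_metric)

print(f"Average score: {report['average_score']:.2f}")
print(f"Errors: {len(report['errors'])}")

# Inspect the weakest predictions to guide error analysis
for item in report['worst_examples']:
    print(f"{item['score']:.2f}  {item['pred'].answer}")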