1. Building a Mini-RAG System
Here is a complete implementation of the Mini-RAG system, including document chunking, vector storage with ChromaDB, and a DSPy-based answerer.
Python
import dspy
import chromadb
from typing import List, Dict
from sentence_transformers import SentenceTransformer
# 1. Document Processing
def chunk_document(text: str, chunk_size: int = 1000) -> List[str]:
    """Split ``text`` into consecutive, non-overlapping character chunks.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in document order. Every chunk has ``chunk_size``
        characters except possibly the last, which holds the remainder.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step silently yields no chunks at all (dropping the document),
    # so reject both up front with a clear error.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
# 2. Vector Storage
class SimpleRAG:
    """Minimal vector store: embeds text chunks and keeps them in ChromaDB.

    ``__init__`` creates three attributes: ``chroma_client``, the
    ``"documents"`` ``collection``, and the sentence-transformers
    ``embedder`` used for both indexing and querying.
    """

    def __init__(self):
        """Create the ChromaDB client, open the collection, load the embedder."""
        self.chroma_client = chromadb.Client()
        # get_or_create keeps a second SimpleRAG() in the same process from
        # failing because the "documents" collection already exists.
        self.collection = self.chroma_client.get_or_create_collection("documents")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    def add_documents(self, chunks: List[str], metadata: List[Dict]):
        """Embed ``chunks`` and store them together with their metadata.

        Args:
            chunks: Text chunks to index.
            metadata: One metadata dict per chunk, in the same order.

        Raises:
            ValueError: If ``chunks`` and ``metadata`` differ in length.
        """
        if len(chunks) != len(metadata):
            raise ValueError(
                f"chunks and metadata must have the same length "
                f"({len(chunks)} != {len(metadata)})"
            )
        if not chunks:
            # Nothing to index; skip the embedder and the store entirely.
            return

        embeddings = self.embedder.encode(chunks).tolist()
        # Continue numbering after what is already stored so that a second
        # call does not reuse the ids "0".."n-1" of an earlier batch.
        first_id = self.collection.count()
        ids = [str(first_id + offset) for offset in range(len(chunks))]
        self.collection.add(
            documents=chunks,
            embeddings=embeddings,
            metadatas=metadata,
            ids=ids,
        )

    def query(self, query: str, n_results: int = 3) -> Dict:
        """Return the collection's nearest matches for ``query``.

        Args:
            query: Free-text query to embed and search with.
            n_results: Number of matches requested from the collection.

        Returns:
            Whatever ``collection.query`` returns for the single embedded
            query.
        """
        # encode() takes a batch, so wrap the single query in a list.
        query_embedding = self.embedder.encode([query]).tolist()
        return self.collection.query(
            query_embeddings=query_embedding,
            n_results=n_results,
        )
# 3. DSPy Answerer
class RAGAnswerSignature(dspy.Signature):
    """Generate answer from retrieved context."""
    # NOTE: per the DSPy signature docs, the docstring above and the `desc`
    # strings below are used when building the prompt, so treat them as
    # runtime text rather than plain documentation.

    # Inputs passed in by the caller.
    context = dspy.InputField(desc="Retrieved document chunks")
    question = dspy.InputField(desc="User question")

    # Outputs filled in by the predictor.
    answer = dspy.OutputField(desc="Answer based on context")
    sources = dspy.OutputField(desc="Source information")
class RAGAnswerer(dspy.Module):
    """DSPy module that answers a question from retrieved document chunks."""

    def __init__(self):
        """Set up the predictor that fills in ``RAGAnswerSignature``."""
        super().__init__()
        self.generate = dspy.Predict(RAGAnswerSignature)

    def forward(self, question: str, retrieved_docs: Dict):
        """Answer ``question`` using the chunks in ``retrieved_docs``.

        Args:
            question: The user's question.
            retrieved_docs: A query-result dict whose ``'documents'`` entry
                is a list of lists of chunk texts (one inner list per
                query), as returned by a ChromaDB collection query.

        Returns:
            The prediction produced by ``self.generate``.
        """
        # Only one query is issued, so the first inner list holds every
        # retrieved chunk. Fall back to an empty result when 'documents' is
        # missing, None, or empty instead of failing on the index.
        per_query_docs = retrieved_docs.get('documents') or [[]]
        # Entries can be None when the store holds no text for a match;
        # join() would reject those.
        docs_content = [doc for doc in per_query_docs[0] if doc is not None]
        context = "\n\n".join(docs_content)
        return self.generate(
            context=context,
            question=question,
        )
2. STORM Writing Assistant
This solution implements a simplified STORM pipeline that generates a multi-section article with citations based on simulated research findings.
Python
import dspy
from typing import Dict, List
class ContentGenerator(dspy.Module):
    """Drafts one article section from research data, then adds citations."""

    def __init__(self):
        """Create the drafting predictor and the citation predictor."""
        super().__init__()
        self.generate_content = dspy.Predict(
            "section_title, research_data, word_count -> content"
        )
        self.add_citations = dspy.Predict(
            "content, research_data -> cited_content"
        )

    @staticmethod
    def _format_research(research_data: Dict) -> str:
        """Flatten per-perspective findings into one text block."""
        parts = []
        for perspective, data in research_data.items():
            # A perspective without findings contributes only its header
            # rather than aborting the whole section with a KeyError.
            findings = data.get('findings') or []
            parts.append(f"\nPerspective: {perspective}\n")
            # str() keeps non-string findings from breaking join().
            parts.append("\n".join(str(finding) for finding in findings))
        return "".join(parts)

    def generate_section(self, section_title: str, research_data: Dict, word_count: int = 300) -> Dict:
        """Generate a cited section for ``section_title``.

        Args:
            section_title: Title of the section to write.
            research_data: Mapping of perspective name to a dict whose
                ``'findings'`` entry is a list of finding strings.
            word_count: Requested length, passed to the predictor as text.

        Returns:
            A dict with ``'title'``, the cited ``'content'``, and
            ``'word_count'`` (whitespace-separated words in that content).
        """
        research_text = self._format_research(research_data)

        # First pass: draft the section body.
        content_result = self.generate_content(
            section_title=section_title,
            research_data=research_text,
            word_count=str(word_count),
        )

        # Second pass: rework the draft against the same research text so
        # that citations can be attached.
        cited_result = self.add_citations(
            content=content_result.content,
            research_data=research_text,
        )

        cited_content = cited_result.cited_content
        return {
            'title': section_title,
            'content': cited_content,
            'word_count': len(cited_content.split()),
        }