RAG Evals Examples
This page provides practical examples of using RAG Evals in different scenarios.
Basic Usage Example
This example shows how to evaluate a simple RAG system using faithfulness, context precision, and answer relevance metrics:
import instructor
from rag_evals import Faithfulness, ChunkPrecision, AnswerRelevance
# Initialize with LLM
client = instructor.from_provider("openai/gpt-4o-mini")
# Sample RAG output
question = "What are the benefits of exercise?"
answer = "Regular exercise improves cardiovascular health and increases strength."
context = [
    "Regular physical activity improves heart health and circulation.",
    "Weight training builds muscle strength and increases bone density.",
    "The earliest Olympic games were held in Ancient Greece."
]
# Evaluate faithfulness
faithfulness_result = Faithfulness.grade(
    question=question,
    answer=answer,
    context=context,
    client=client
)
# Evaluate context precision
precision_result = ChunkPrecision.grade(
    question=question,
    answer=answer,
    context=context,
    client=client
)
# Evaluate answer relevance
relevance_result = AnswerRelevance.grade(
    question=question,
    answer=answer,
    context=context,  # Context isn't used in relevance evaluation
    client=client
)
# Process and display results
print(f"Overall Faithfulness Score: {faithfulness_result.overall_faithfulness_score:.2f}")
print(f"Overall Context Precision: {precision_result.avg_score:.2f}")
print(f"Overall Answer Relevance: {relevance_result.overall_score:.2f}")
# Detailed analysis
print("\nFaithfulness Breakdown:")
for statement in faithfulness_result.statements:
    print(f"- {statement.statement}")
    print(f"  Supported: {statement.is_supported}")
    if statement.is_supported and statement.supporting_chunk_ids:
        print(f"  Supporting chunks: {statement.supporting_chunk_ids}")
print("\nContext Precision Breakdown:")
for chunk in precision_result.graded_chunks:
    print(f"- Chunk {chunk.id_chunk}: {'Relevant' if chunk.score else 'Not Relevant'}")
print("\nAnswer Relevance Breakdown:")
print(f"- Topical Match: {relevance_result.topical_match:.2f}")
print(f"- Completeness: {relevance_result.completeness:.2f}")
print(f"- Conciseness: {relevance_result.conciseness:.2f}")
print(f"- Reasoning: {relevance_result.reasoning}")
Parallel Evaluation Example
This example demonstrates how to run multiple evaluations in parallel for better performance:
import asyncio
import instructor
from rag_evals import Faithfulness, ChunkPrecision, AnswerRelevance
# Initialize with async client
async_client = instructor.from_provider("openai/gpt-4o-mini", async_client=True)
# Sample RAG outputs to evaluate
examples = [
    {
        "question": "What are the benefits of exercise?",
        "answer": "Regular exercise improves cardiovascular health and increases strength.",
        "context": [
            "Regular physical activity improves heart health and circulation.",
            "Weight training builds muscle strength and increases bone density.",
            "The earliest Olympic games were held in Ancient Greece."
        ]
    },
    {
        "question": "How does photosynthesis work?",
        "answer": "Photosynthesis is the process by which plants convert sunlight into energy.",
        "context": [
            "Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water.",
            "The process primarily happens in the plant's leaves through their chloroplasts.",
            "Solar panels are designed to convert sunlight into electricity."
        ]
    }
]
async def evaluate_example(example):
"""Evaluate a single example using multiple metrics in parallel"""
faithfulness_task = Faithfulness.agrade(
question=example["question"],
answer=example["answer"],
context=example["context"],
client=async_client
)
precision_task = ChunkPrecision.agrade(
question=example["question"],
answer=example["answer"],
context=example["context"],
client=async_client
)
relevance_task = AnswerRelevance.agrade(
question=example["question"],
answer=example["answer"],
context=example["context"],
client=async_client
)
# Run all evaluations in parallel
faithfulness_result, precision_result, relevance_result = await asyncio.gather(
faithfulness_task, precision_task, relevance_task
)
return {
"question": example["question"],
"faithfulness_score": faithfulness_result.overall_faithfulness_score,
"precision_score": precision_result.avg_score,
"relevance_score": relevance_result.overall_score
}
async def evaluate_all_examples():
"""Evaluate all examples in parallel"""
tasks = [evaluate_example(example) for example in examples]
results = await asyncio.gather(*tasks)
return results
# Run the evaluations
results = asyncio.run(evaluate_all_examples())
# Display results
for result in results:
print(f"\nQuestion: {result['question']}")
print(f"Faithfulness Score: {result['faithfulness_score']:.2f}")
print(f"Precision Score: {result['precision_score']:.2f}")
print(f"Relevance Score: {result['relevance_score']:.2f}")
Custom Prompt Example
This example shows how to customize the evaluation prompts:
from rag_evals import base, RelevanceScore, ChunkGradedBinary
# Define a custom answer relevance evaluator with a modified prompt
CustomRelevance = base.ContextEvaluation(
    prompt="""
    You are evaluating how well an answer addresses the question.
    Evaluate the answer along these dimensions:
    1. Topical relevance (0-1): Does the answer address the topic of the question?
    2. Completeness (0-1): Does the answer cover all parts of the question?
    3. Conciseness (0-1): Is the answer appropriately detailed without unnecessary information?
    Give each dimension a score and provide your reasoning.
    """,
    response_model=RelevanceScore
)
# Define a custom context precision evaluator
CustomPrecision = base.ContextEvaluation(
    prompt="""
    You are evaluating the relevance of retrieved context chunks to the original question.
    For each retrieved chunk:
    1. Determine if it contains information that would help answer the question
    2. Consider both direct relevance (explicitly addresses the question) and indirect relevance (provides background or related information)
    3. Be moderately strict - the chunk should contain actual helpful information, not just be on a vaguely related topic
    Output a binary judgment (relevant/not relevant) for the chunk.
    """,
    response_model=ChunkGradedBinary
)
# Use the custom evaluators (reusing question, answer, context, and client from the basic example)
relevance_result = CustomRelevance.grade(
    question=question,
    answer=answer,
    context=context,
    client=client
)

precision_result = CustomPrecision.grade(
    question=question,
    answer=answer,
    context=context,
    client=client
)
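To sanity-check a custom prompt, it can help to compare its judgments with the built-in evaluator on the same input. The sketch below compares the custom precision result against the default ChunkPrecision; it assumes both evaluators return graded_chunks in the same chunk order:
from rag_evals import ChunkPrecision

default_precision = ChunkPrecision.grade(
    question=question, answer=answer, context=context, client=client
)

# Compare per-chunk judgments (assumes both results list chunks in the same order)
for default_chunk, custom_chunk in zip(default_precision.graded_chunks, precision_result.graded_chunks):
    print(f"Chunk {default_chunk.id_chunk}: default={default_chunk.score}, custom={custom_chunk.score}")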
Comprehensive Evaluation Example
This example demonstrates a complete evaluation framework that assesses all key relationships:
import instructor
from rag_evals import Faithfulness, ChunkPrecision, AnswerRelevance
def evaluate_rag_system(question, context, answer, client):
"""Comprehensive RAG evaluation across all key dimensions"""
# 1. Context → Question: Context Relevance
precision_result = ChunkPrecision.grade(
question=question,
answer=answer,
context=context,
client=client
)
# 2. Context → Answer: Faithfulness
faithfulness_result = Faithfulness.grade(
question=question,
answer=answer,
context=context,
client=client
)
# 3. Question → Answer: Answer Relevance
relevance_result = AnswerRelevance.grade(
question=question,
answer=answer,
context=context,
client=client
)
# Calculate aggregate scores
retriever_score = precision_result.avg_score
generator_score = faithfulness_result.overall_faithfulness_score
end_to_end_score = relevance_result.overall_score
# Overall system score (simple average)
overall_score = (retriever_score + generator_score + end_to_end_score) / 3
return {
"overall_score": overall_score,
"retriever_score": retriever_score,
"generator_score": generator_score,
"end_to_end_score": end_to_end_score,
"precision_details": precision_result,
"faithfulness_details": faithfulness_result,
"relevance_details": relevance_result
}
# Example usage
client = instructor.from_provider("openai/gpt-4o-mini")
question = "What are the health benefits of meditation?"
answer = "Meditation reduces stress and improves mental focus. It also helps with depression."
context = [
    "Regular meditation reduces stress and anxiety.",
    "Meditation can improve focus and attention span.",
    "The history of meditation dates back to ancient civilizations."
]
results = evaluate_rag_system(question, context, answer, client)
# Display results
print(f"Overall System Score: {results['overall_score']:.2f}")
print(f"Retriever Score (Context Relevance): {results['retriever_score']:.2f}")
print(f"Generator Score (Faithfulness): {results['generator_score']:.2f}")
print(f"End-to-End Score (Answer Relevance): {results['end_to_end_score']:.2f}")
# Detailed analysis (example)
print("\nUnfaithful statements:")
for stmt in results['faithfulness_details'].statements:
    if not stmt.is_supported:
        print(f"- {stmt.statement}")
print("\nIrrelevant context chunks:")
for chunk in results['precision_details'].graded_chunks:
    if not chunk.score:
        print(f"- Chunk {chunk.id_chunk}: {context[chunk.id_chunk][:50]}...")
These examples demonstrate the flexibility and power of RAG Evals for different evaluation scenarios. You can adapt these patterns to fit your specific evaluation needs.