# API Reference

This section provides detailed API documentation for the RAG Evals library.

## Core Classes

### ContextEvaluation

`ContextEvaluation` is the base class for all context-based evaluations in RAG Evals.
```python
class ContextEvaluation(BaseModel):
    """Base class for context-based evaluations that handles common patterns,
    including grading questions and optional answers against a context that is
    enumerated with an id.

    This class is designed to be used as a base class for specific evaluation classes.
    It provides a common interface for evaluating questions and answers against a context.
    """

    prompt: str
    examples: list[Any] | None = None
    response_model: type[BaseModel]
    chunk_template: str = dedent("""
        <evaluation>
        {% if examples is not none %}
        <examples>
        {% for example in examples %}
        <example>
        {{ example }}
        </example>
        {% endfor %}
        </examples>
        {% endif %}
        <question>{{ question }}</question>
        {% if answer is not none %}
        <answer>{{ answer }}</answer>
        {% endif %}
        <context>
        {% for chunk in context %}
        <chunk id="{{ chunk.id }}">
        {{ chunk.chunk }}
        </chunk>
        {% endfor %}
        </context>
        </evaluation>
    """)
```
#### Methods

##### grade

```python
def grade(
    self,
    question: str,
    answer: str | None,
    context: list[Any],
    client: Instructor,
) -> BaseModel:
    """Run an evaluation of a question and optional answer against provided context chunks.

    Args:
        question (str): The question being evaluated.
        answer (Optional[str]): The answer to evaluate, if available. Can be None.
        context (List[Any]): List of context chunks to evaluate against.
        client (Instructor): An initialized Instructor client instance.

    Returns:
        BaseModel: An instance of the response_model containing the structured evaluation results.
    """
```
##### agrade

```python
async def agrade(
    self,
    question: str,
    answer: str | None,
    context: list[Any],
    client: AsyncInstructor,
) -> BaseModel:
    """Run an evaluation of a question and optional answer against provided context chunks asynchronously.

    Args:
        question (str): The question being evaluated.
        answer (Optional[str]): The answer to evaluate, if available. Can be None.
        context (List[Any]): List of context chunks to evaluate against.
        client (AsyncInstructor): An initialized AsyncInstructor client instance.

    Returns:
        BaseModel: An instance of the response_model containing the structured evaluation results.
    """
```
### ContextValidationMixin

A mixin class that ensures the integrity of chunk references in RAG evaluations by validating that all chunk IDs correspond to actual context chunks.

```python
class ContextValidationMixin:
    """Mixin class that ensures the integrity of chunk references in RAG evaluations
    by validating that all chunk IDs correspond to actual context chunks."""

    @field_validator('graded_chunks')
    @classmethod
    def validate_chunks_against_context(cls, chunks: list[Any], info: ValidationInfo) -> list[Any]:
        """Validate and process chunk IDs against context chunks."""
```
### ChunkScore

Represents a score for a single context chunk.

```python
class ChunkScore(BaseModel):
    id_chunk: int
    score: float = Field(ge=0.0, le=1.0, description="Score from 0-1 indicating the precision of the chunk, lower is worse")
```

### ChunkBinaryScore

Represents a binary (pass/fail) score for a single context chunk.

```python
class ChunkBinaryScore(BaseModel):
    id_chunk: int
    score: bool = Field(description="Whether the chunk is passed or failed")
```
### ChunkGraded

Container for a list of graded chunks with a continuous score.

```python
class ChunkGraded(BaseModel, ContextValidationMixin):
    graded_chunks: list[ChunkScore]

    @property
    def avg_score(self) -> float:
        return sum(chunk.score for chunk in self.graded_chunks) / len(self.graded_chunks)
```

### ChunkGradedBinary

Container for a list of graded chunks with a binary score.

```python
class ChunkGradedBinary(BaseModel, ContextValidationMixin):
    graded_chunks: list[ChunkBinaryScore]

    @property
    def avg_score(self) -> float:
        return sum(chunk.score for chunk in self.graded_chunks) / len(self.graded_chunks)
```
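In practice these containers are produced by an evaluator and validated against the originating context, but the arithmetic behind `avg_score` is just the mean of the per-chunk scores; for `ChunkGradedBinary` it is the fraction of chunks marked `True`. For example:

```python
# Illustration of the avg_score arithmetic using the ChunkScore model above.
scores = [
    ChunkScore(id_chunk=0, score=0.9),
    ChunkScore(id_chunk=1, score=0.4),
]
print(sum(c.score for c in scores) / len(scores))  # 0.65 -- what avg_score would return
```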
## Faithfulness Module

### FaithfulnessResult

The result of a faithfulness evaluation.

```python
class FaithfulnessResult(BaseModel):
    statements: list[StatementEvaluation] = Field(description="A list of all statements extracted from the answer and their evaluation.")

    @property
    def overall_faithfulness_score(self) -> float:
        if not self.statements:
            return 0.0
        supported_statements = sum(s.is_supported for s in self.statements)
        return supported_statements / len(self.statements)
```
### StatementEvaluation

Represents the evaluation of a single statement extracted from an answer.

```python
class StatementEvaluation(BaseModel):
    statement: str = Field(description="An individual claim extracted from the generated answer.")
    is_supported: bool = Field(description="Is this statement supported by the provided context chunks?")
    supporting_chunk_ids: Optional[list[int]] = Field(
        default=None,
        description="A list of chunk IDs (0-indexed integers) from the provided context that support this statement."
    )
```
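These models are normally produced by the `Faithfulness` evaluator below, but constructing one by hand shows how the overall score is derived:

```python
result = FaithfulnessResult(
    statements=[
        StatementEvaluation(
            statement="Paris is the capital of France.",
            is_supported=True,
            supporting_chunk_ids=[0],
        ),
        StatementEvaluation(
            statement="Paris has a population of 10 million.",
            is_supported=False,
        ),
    ]
)
print(result.overall_faithfulness_score)  # 0.5 -- one of two statements is supported
```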
### Faithfulness

The main faithfulness evaluator.

```python
Faithfulness = base.ContextEvaluation(
    prompt="""
    You are an expert evaluator tasked with assessing the factual faithfulness of a generated answer to its provided context...
    """,
    response_model=FaithfulnessResult
)
```
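A hedged usage sketch, reusing the `instructor` client from the `ContextEvaluation` example above (the accepted context type is an assumption):

```python
result = Faithfulness.grade(
    question="What is the capital of France?",
    answer="Paris is the capital of France and has 10 million residents.",
    context=[
        "Paris is the capital and largest city of France.",
        "The Eiffel Tower is located in Paris.",
    ],
    client=client,  # instructor client configured as in the earlier example
)

print(result.overall_faithfulness_score)
for s in result.statements:
    print(s.is_supported, s.statement, s.supporting_chunk_ids)
```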
## Precision Module

### ChunkPrecision

The main context precision evaluator.

```python
ChunkPrecision = base.ContextEvaluation(
    prompt="""
    You are an expert evaluator assessing if a specific retrieved context chunk was utilized in generating a given answer...
    """,
    response_model=base.ChunkGradedBinary
)
```
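Grading follows the same pattern; because the response model is `ChunkGradedBinary`, `avg_score` gives the fraction of retrieved chunks judged to have been used in the answer (same assumptions as the sketches above):

```python
precision = ChunkPrecision.grade(
    question="What is the capital of France?",
    answer="Paris is the capital of France.",
    context=[
        "Paris is the capital and largest city of France.",
        "French cuisine is famous worldwide.",
    ],
    client=client,  # instructor client configured as in the earlier example
)

print(precision.avg_score)  # fraction of chunks marked as used
for chunk in precision.graded_chunks:
    print(chunk.id_chunk, chunk.score)
```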
## Usage Examples

For complete usage examples, see: