# evaluate()

The `evaluate()` function is the core API for defining and running LLM evaluations using Vitest.
## Import

```ts
import { evaluate } from 'viteval';
```
## Signature

```ts
function evaluate<
  DATA_ITEM extends DataItem,
  DATA extends Data<DATA_ITEM>,
>(
  name: string,
  {
    data,
    aggregation = 'mean',
    task,
    scorers,
    threshold = 1.0,
    timeout,
  }: Eval<DATA>
)
```
## Parameters

### name

- Type: `string`
- Required: Yes
- Description: A human-readable name for the evaluation

### options

- Type: `Eval<DATA>`
- Required: Yes
- Description: Configuration object for the evaluation
## Eval Interface

```ts
interface Eval<DATA extends Data> {
  /**
   * The description of the evaluation.
   */
  description?: string;
  /**
   * The data to use for the evaluation.
   */
  data: DATA;
  /**
   * The task to evaluate.
   */
  task: Task<InferDataInput<DATA>, InferDataOutput<DATA>, InferDataExtra<DATA>>;
  /**
   * The scorers to use for the evaluation.
   */
  scorers: Scorer<InferDataOutput<DATA>, InferDataExtra<DATA>>[];
  /**
   * The aggregation type for the evaluation.
   *
   * @default 'mean'
   */
  aggregation?: ScorerAggregationType;
  /**
   * The threshold for the evaluation.
   *
   * @default 1.0
   */
  threshold?: number;
  /**
   * The timeout for the evaluation, in milliseconds.
   *
   * @default The runtime configuration value (`config.eval?.timeout`), or 25000ms if neither is set.
   */
  timeout?: number;
}
```
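Putting it together, a minimal call looks like the sketch below. The task here is a stub standing in for a real model call; only APIs shown on this page (`evaluate`, `scorers.exactMatch`) are assumed.

```ts
import { evaluate, scorers } from 'viteval';

evaluate('basic math', {
  description: 'Checks simple arithmetic answers',
  data: [
    { input: 'What is 2+2?', expected: '4' },
    { input: 'What is 3+3?', expected: '6' },
  ],
  // Stub task for illustration; replace with a real model call.
  task: async ({ input }) => (input.includes('2+2') ? '4' : '6'),
  scorers: [scorers.exactMatch],
  threshold: 1.0,
});
```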
### data

The data to use for the evaluation. Can be an array, function, or dataset.

Type: `Data<DATA_ITEM>`

```ts
type Data<DATA_ITEM extends DataItem> =
  | DATA_ITEM[]
  | DataGenerator<DATA_ITEM>
  | Dataset<DataGenerator<DATA_ITEM>>;
```

```ts
// Inline data
data: [
  { input: "What is 2+2?", expected: "4" },
  { input: "What is 3+3?", expected: "6" },
]

// From a function
data: async () => {
  const response = await fetch('/api/test-cases');
  return response.json();
}

// From a dataset
data: mathDataset
```
### task

The function that processes each input and returns the model's output.

Type: `Task<InferDataInput<DATA>, InferDataOutput<DATA>, InferDataExtra<DATA>>`

```ts
type Task<INPUT, OUTPUT, EXTRA extends Extra> = (
  args: TaskArgs<INPUT, EXTRA>
) => Promise<OUTPUT> | OUTPUT;

type TaskArgs<INPUT, EXTRA extends Extra> = TF.Merge<
  EXTRA,
  {
    input: INPUT;
  }
>;
```
```ts
// Simple text generation
task: async ({ input }) => {
  const result = await generateText({
    model: 'gpt-4',
    prompt: input,
  });
  return result.text;
}

// With chat messages
task: async ({ input }) => {
  const result = await generateText({
    model: 'gpt-4',
    messages: [
      { role: 'system', content: 'You are a math tutor.' },
      { role: 'user', content: input },
    ],
  });
  return result.text;
}

// Structured input
task: async ({ input, context }) => {
  return await answerQuestion(input.question, context);
}
```
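The examples above are all async, but because the return type is `Promise<OUTPUT> | OUTPUT`, a synchronous task is equally valid:

```ts
// Synchronous task: Task may return OUTPUT directly, no Promise needed
task: ({ input }) => input.toUpperCase()
```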
### scorers

An array of scorer functions used to evaluate output quality.

Type: `Scorer<InferDataOutput<DATA>, InferDataExtra<DATA>>[]`

```ts
import { scorers, createScorer } from 'viteval';

// Built-in scorers
scorers: [scorers.exactMatch]
scorers: [scorers.levenshtein, scorers.factual]

// Custom scorer
const customScorer = createScorer({
  name: 'length-check',
  score: ({ output, expected }) => {
    return {
      score: output.length === expected?.length ? 1 : 0,
      metadata: { method: 'length_comparison' },
    };
  },
});

scorers: [customScorer, scorers.answerSimilarity]
```
### threshold (optional)

Minimum average score required for the evaluation to pass.

Type: `number`
Default: `1.0`
Range: `0.0` to `1.0`

```ts
// Require a 90% average score
threshold: 0.9

// More lenient threshold
threshold: 0.6

// Perfect scores only (default)
threshold: 1.0
```
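For example, if a run's average score works out to 0.85, it passes with `threshold: 0.8` but fails with `threshold: 0.9`.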
### timeout (optional)

Maximum time (in milliseconds) allowed for each test case.

Type: `number`
Default: The runtime configuration value (`config.eval?.timeout`), or 25000ms if neither is set.

```ts
// 1 minute timeout
timeout: 60000

// Quick timeout for fast models
timeout: 5000

// No timeout (not recommended)
timeout: 0
```
### aggregation (optional)

How to aggregate scores across multiple scorers.

Type: `ScorerAggregationType`
Default: `'mean'`
Options: `'mean' | 'median' | 'sum'`

```ts
// Use the mean score (default)
aggregation: 'mean'

// Use the median score
aggregation: 'median'

// Use the sum of scores
aggregation: 'sum'
```
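As a worked example: with three scorers returning 0.5, 0.9, and 1.0, `'mean'` yields 0.8, `'median'` yields 0.9, and `'sum'` yields 2.4. Note that a sum can exceed the 0.0 to 1.0 range documented for `threshold`.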
### description (optional)

A human-readable description of the evaluation.

Type: `string`

```ts
description: 'Evaluates math problem solving capabilities'
```
## DataItem Interface

The data should contain `DataItem` objects:

```ts
type DataItem<
  INPUT = unknown,
  OUTPUT = unknown,
  EXTRA extends Extra = Extra,
> = TF.Merge<
  EXTRA,
  {
    name?: string;
    input: INPUT;
    expected?: OUTPUT;
  }
>;
```

where `Extra` is:

```ts
type Extra = Record<string, unknown>;
```
### Simple Test Cases

```ts
{ input: "Hello", expected: "Hi there!" }
```

### Complex Input Types

```ts
// Object input
{
  input: {
    question: "What's the weather?",
    location: "San Francisco"
  },
  expected: "Sunny, 72°F"
}

// Array input
{
  input: ["apple", "banana", "cherry"],
  expected: "apple, banana, cherry"
}
```
### Test Case Metadata

```ts
{
  input: "Translate: Hello",
  expected: "Hola",
  difficulty: "easy",
  language: "spanish"
}
```
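Because `TaskArgs` merges a data item's extra fields into the task's arguments (see the `Task` type above), this metadata is available directly in the task. A small sketch, where `translate` is a hypothetical helper rather than part of viteval:

```ts
// Extra fields on the data item ("difficulty", "language") are merged into the task args.
task: async ({ input, language }) => {
  return translate(input, language); // `translate` is a placeholder
}
```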
### Named Test Cases

```ts
{
  name: "Basic addition",
  input: "What is 2+2?",
  expected: "4"
}
```