Evals
How to evaluate prompts, tools, and wallet flows safely.
Evaluation Framework
The AOMI evaluation framework enables testing agent behavior with deterministic assertions. Run tests against forked mainnet state to verify that agents correctly execute blockchain operations.
Overview
EvalCase
Building Test Cases
use aomi_eval::{EvalCase, BalanceChange, BalanceAsset};
// Simple test case
let case = EvalCase::new("What is the ETH balance of vitalik.eth?");
// With expected behavior
let case = EvalCase::new("Transfer 0.1 ETH to alice.eth")
.with_expectation("Transaction should be simulated and queued");
// With balance assertions
let case = EvalCase::new("Transfer 0.1 ETH to alice.eth")
.with_balance_change(BalanceChange {
holder: "alice.eth".into(),
asset: BalanceAsset::eth(),
expected_delta: parse_units("0.1", 18),
label: "Alice receives ETH".into(),
});
Assertion Types
use aomi_eval::{AssertionPlan, BalanceChange, BalanceCheck, BalanceAsset};
// Balance change (delta)
let change = AssertionPlan::BalanceChange(BalanceChange {
holder: "0xAlice...".into(),
asset: BalanceAsset::eth(),
expected_delta: 100_000_000_000_000_000, // +0.1 ETH
label: "Alice receives ETH".into(),
});
// Exact balance check
let check = AssertionPlan::BalanceCheck(BalanceCheck {
holder: "0xBob...".into(),
asset: BalanceAsset::usdc("0xA0b86991c...")?,
expected_value: 1_000_000_000, // 1000 USDC
label: "Bob has 1000 USDC".into(),
});
// Minimum balance
let at_least = AssertionPlan::BalanceAtLeast {
holder: "0xCharlie...".into(),
asset: BalanceAsset::eth(),
min_value: 1_000_000_000_000_000_000, // >= 1 ETH
label: "Charlie has at least 1 ETH".into(),
};
BalanceAsset
use aomi_eval::BalanceAsset;
// Native ETH
let eth = BalanceAsset::eth();
// Generic native token
let matic = BalanceAsset::native("MATIC", 18);
// ERC20 tokens
let usdc = BalanceAsset::usdc("0xA0b86991c6218b36c1d19D4a2e9Eb0cE3606eB48")?;
let usdt = BalanceAsset::usdt("0xdAC17F958D2ee523a2206206994597C13D831ec7")?;
// Custom ERC20
let weth = BalanceAsset::erc20(
"WETH",
"0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2",
18,
)?;
EvalHarness
Setup
use aomi_eval::{EvalHarness, EvalCase};
/// End-to-end eval: ask the agent to send 0.1 ETH and confirm it queued a wallet transaction.
#[tokio::test]
async fn test_eth_transfer() -> anyhow::Result<()> {
    // Without an API key there is nothing to evaluate, so the test exits early as a pass.
    let missing_key = skip_if_missing_anthropic_key()?;
    if missing_key {
        return Ok(());
    }

    // Standing up the harness initializes the fork and funds the accounts.
    let eval_harness = EvalHarness::new().await?;

    // Prompt first, then attach the balance delta the recipient should see.
    let prompt_only = EvalCase::new("Transfer 0.1 ETH to 0xAlice...");
    let transfer_case = prompt_only.with_balance_change(BalanceChange {
        holder: "0xAlice...".into(),
        asset: BalanceAsset::eth(),
        expected_delta: parse_units("0.1", 18),
        label: "Alice receives ETH".into(),
    });

    // Drive the agent through the case.
    let outcome = eval_harness.run_case(transfer_case).await?;

    // The agent must have acted, and one of its actions must be the wallet hand-off.
    assert!(!outcome.is_empty(), "Agent should produce actions");
    assert!(outcome.has_tool_call("send_transaction_to_wallet"));
    Ok(())
}
Test Accounts
// Pre-configured eval accounts (addresses are elided placeholders in this example).
// NOTE(review): the per-account notes say only account 1 holds USDC, while the funding
// summary says every account gets USDC and the swap example spends USDC from account 0.
// Confirm which is correct.
pub const EVAL_ACCOUNTS: &[&str] = &[
"0x1234...", // Account 0 - funded with ETH
"0x5678...", // Account 1 - funded with ETH + USDC
// ...
];
// Accounts are pre-funded at harness initialization:
// - 100 ETH each
// - 2000 USDC each (from whale impersonation)
TestResult
Structure
use aomi_eval::{TestResult, RoundResult, AgentAction, ToolCall};
let result: TestResult = harness.run_case(case).await?;
// Inspect results
println!("Test: {}", result.intent);
println!("Rounds: {}", result.total_rounds());
println!("Tool calls: {}", result.total_tool_calls());
println!("Responses: {}", result.total_responses());
// Check for specific tool calls
if result.has_tool_call("get_account_info") {
println!("Agent checked account info");
}
// Get all calls to a tool
let transfer_calls = result.get_tool_calls("send_transaction_to_wallet");
for call in transfer_calls {
println!("Transfer: {} => {}", call.topic, call.content);
}
// Iterate rounds
for (i, round) in result.rounds.iter().enumerate() {
println!("\nRound {}:", i + 1);
println!(" Input: {}", round.input);
for action in &round.actions {
match action {
AgentAction::ToolCall(tc) => println!(" [tool] {}", tc.topic),
AgentAction::Response(text) => println!(" [response] {}", text),
AgentAction::System(msg) => println!(" [system] {}", msg),
}
}
}
Printing Results
// TestResult implements Display
println!("{}", result);
// Output:
// Test #1: Transfer 0.1 ETH to alice.eth
//
// Round 1:
// >> Transfer 0.1 ETH to alice.eth
// [00] [tool] get_account_info => {"address": "0x...", "balance": "10.5"}
// [01] [tool] send_transaction_to_wallet => {"to": "0x...", "value": "0.1"}
// [02] [response] I've queued a transaction to transfer 0.1 ETH...
Assertions
Running Assertions
use aomi_eval::{Assertion, AssertionResult};
// Assertions run after all rounds complete
let case = EvalCase::new("Transfer 1 USDC to bob.eth")
.with_balance_change(BalanceChange {
holder: "bob.eth".into(),
asset: BalanceAsset::usdc(USDC_CONTRACT)?,
expected_delta: 1_000_000, // 1 USDC
label: "Bob receives USDC".into(),
});
let result = harness.run_case(case).await?;
// Assertions are verified automatically
// Failures are included in TestResult
Custom Assertions
use aomi_eval::Assertion;
use async_trait::async_trait;
/// Custom assertion data: a contract address and the hash its bytecode must have.
pub struct ContractDeployedAssertion {
/// Address whose code is fetched during verification.
/// NOTE(review): the intended meaning of `None` is not shown in this example — confirm.
pub expected_address: Option<Address>,
/// Value compared against `keccak256` of the code found at the address.
pub bytecode_hash: B256,
}
#[async_trait]
impl Assertion for ContractDeployedAssertion {
    /// Checks that code exists at the expected address and hashes to `bytecode_hash`.
    ///
    /// Returns `AssertionResult::Failed` when no expected address is set, when the code
    /// lookup errors, when the address holds no code, or when the code hash differs from
    /// `bytecode_hash`. Returns `AssertionResult::Passed` otherwise.
    async fn verify(&self, provider: &impl Provider) -> AssertionResult {
        // The return type is `AssertionResult`, not `Option`/`Result`, so `?` is not
        // available here; every early exit is an explicit `Failed` value instead.
        let address = match self.expected_address {
            Some(address) => address,
            None => {
                return AssertionResult::Failed {
                    label: "Contract not deployed".into(),
                    reason: "No expected address configured".into(),
                };
            }
        };

        // Check if contract exists
        let code = match provider.get_code(address, None).await {
            Ok(code) => code,
            Err(err) => {
                return AssertionResult::Failed {
                    label: "Contract not deployed".into(),
                    reason: format!("Failed to fetch bytecode: {:?}", err),
                };
            }
        };
        if code.is_empty() {
            return AssertionResult::Failed {
                label: "Contract not deployed".into(),
                reason: "No bytecode at expected address".into(),
            };
        }

        // Code exists; make sure it is the contract we expected.
        let hash = keccak256(&code);
        if hash != self.bytecode_hash {
            return AssertionResult::Failed {
                label: "Wrong contract".into(),
                reason: format!("Expected hash {:?}, got {:?}", self.bytecode_hash, hash),
            };
        }

        AssertionResult::Passed {
            label: "Contract deployed correctly".into(),
        }
    }
}
EvaluationApp
Specialized Agent
use aomi_eval::eval_app::EvaluationApp;
// Create headless agent for evaluation
let app = EvaluationApp::new().await?;
// Process a prompt
let messages = app.process_prompt("Check ETH balance of vitalik.eth").await?;
// Messages contain all agent actions
for msg in messages {
if let Some((tool_name, result)) = &msg.tool_stream {
println!("Tool: {} => {}", tool_name, result);
}
}
Auto-signing
For evaluation, wallet transactions can be auto-signed:
// Set environment variable to enable auto-signing
std::env::set_var("LOCAL_TEST_WALLET_AUTOSIGN", "true");
// Transactions will be automatically approved in eval mode
let result = harness.run_case(transfer_case).await?;
Test Patterns
Basic Query Test
/// Read-only eval: a balance question should trigger an account lookup and at least one reply.
#[tokio::test]
async fn test_balance_query() -> anyhow::Result<()> {
    let eval_harness = EvalHarness::new().await?;

    // No balance assertions here: the case is just the prompt.
    let outcome = eval_harness
        .run_case(EvalCase::new("What is the ETH balance of vitalik.eth?"))
        .await?;

    assert!(outcome.has_tool_call("get_account_info"));
    assert!(outcome.total_responses() >= 1);
    Ok(())
}
Transaction Test
/// Transfer eval: send 0.5 ETH to a pre-configured account and expect a wallet transaction.
#[tokio::test]
async fn test_eth_transfer() -> anyhow::Result<()> {
    let harness = EvalHarness::new().await?;
    let alice = EVAL_ACCOUNTS[1];

    // No manual "before" balance is read here: the value was never used, and the
    // expected delta is declared on the case through `with_balance_change`.
    let case = EvalCase::new(format!("Transfer 0.5 ETH to {}", alice))
        .with_balance_change(BalanceChange {
            holder: alice.into(),
            asset: BalanceAsset::eth(),
            expected_delta: parse_units("0.5", 18),
            label: "Alice receives 0.5 ETH".into(),
        });

    let result = harness.run_case(case).await?;

    assert!(result.has_tool_call("send_transaction_to_wallet"));
    Ok(())
}
Multi-Step Test
/// Multi-step eval: a USDC -> ETH swap should take several tool calls and spend 100 USDC.
#[tokio::test]
async fn test_swap_tokens() -> anyhow::Result<()> {
    let eval_harness = EvalHarness::new().await?;

    // Build the case in stages: prompt, natural-language expectation, balance delta.
    let prompt_only = EvalCase::new("Swap 100 USDC for ETH using Uniswap");
    let with_goal = prompt_only.with_expectation("Should approve USDC and execute swap");
    let swap_case = with_goal.with_balance_change(BalanceChange {
        holder: EVAL_ACCOUNTS[0].into(),
        asset: BalanceAsset::usdc(USDC_CONTRACT)?,
        expected_delta: -100_000_000, // -100 USDC
        label: "USDC spent".into(),
    });

    let outcome = eval_harness.run_case(swap_case).await?;

    // A swap cannot be done in one step, so expect at least two tool calls,
    // including the router ABI fetch and the wallet hand-off.
    assert!(outcome.total_tool_calls() >= 2);
    assert!(outcome.has_tool_call("get_contract_abi"));
    assert!(outcome.has_tool_call("send_transaction_to_wallet"));
    Ok(())
}
Negative Test
/// Negative eval: an impossible transfer should make the agent talk about missing funds.
#[tokio::test]
async fn test_insufficient_funds() -> anyhow::Result<()> {
    let eval_harness = EvalHarness::new().await?;

    // Ask for far more ETH than the account could hold.
    let oversized_case = EvalCase::new("Transfer 1000000 ETH to alice.eth")
        .with_expectation("Should fail or warn about insufficient funds");
    let outcome = eval_harness.run_case(oversized_case).await?;

    // Scan every text response, case-insensitively, for wording about insufficient funds.
    let mentions_insufficient = outcome
        .rounds
        .iter()
        .flat_map(|round| round.actions.iter())
        .filter_map(|action| {
            if let AgentAction::Response(text) = action {
                Some(text.to_lowercase())
            } else {
                None
            }
        })
        .any(|reply| reply.contains("insufficient") || reply.contains("not enough"));

    assert!(mentions_insufficient, "Agent should mention insufficient funds");
    Ok(())
}
Running Evaluations
CLI
# Run all eval tests
cargo test --package aomi-eval --features eval-test
# Run specific test
cargo test --package aomi-eval test_eth_transfer --features eval-test
# With debug output
DEBUG=1 cargo test --package aomi-eval --features eval-test -- --nocapture
Environment Setup
# Required
export ANTHROPIC_API_KEY="sk-ant-..."
# Optional - for BAML expectations
export BAML_SERVER_URL="http://localhost:2024"
# Enable auto-signing for eval
export LOCAL_TEST_WALLET_AUTOSIGN="true"
CI Configuration
# .github/workflows/eval.yml
# Runs the eval test suite on every push to main and once a day on a schedule.
name: Evaluations
on:
  push:
    branches: [main]
  schedule:
    - cron: '0 0 * * *' # Daily
jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Rust
        # NOTE(review): actions-rs/toolchain is archived upstream and is given no
        # `toolchain` input here — confirm the repo has a rust-toolchain file,
        # or switch to a maintained action.
        uses: actions-rs/toolchain@v1
      - name: Run Evaluations
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          ALCHEMY_API_KEY: ${{ secrets.ALCHEMY_API_KEY }}
        run: |
          cargo test --package aomi-eval --features eval-test
Best Practices
Test Isolation
Guidelines
| Practice | Description |
|---|---|
| Fork at specific block | Ensure reproducible state |
| Pre-fund accounts | Avoid flaky balance issues |
| Reset after each test | Isolate test state |
| Use deterministic inputs | Avoid random addresses |
| Check tool calls | Verify agent behavior |
| Validate outputs | Assert on final state |