import asyncio
from calibrate.llm import tests
# Define your tools
# Parameters accepted by the `plan_next_question` tool; both are marked
# required. (Schema shape is presumably dictated by calibrate -- confirm
# against the library's documentation.)
_plan_next_question_parameters = [
    {
        "id": "next_unanswered_question_index",
        "type": "integer",
        "description": "Index of next question",
        "required": True,
    },
    {
        "id": "questions_answered",
        "type": "array",
        "description": "List of answered question indices",
        "items": {"type": "integer"},
        "required": True,
    },
]

# The full tool list: a single tool definition.
tools = [
    {
        "type": "client",
        "name": "plan_next_question",
        "description": "Plan the next question to ask",
        "parameters": _plan_next_question_parameters,
    },
]
# Define your test cases
# Case 1: after the user gives their name, the evaluation lists one
# `plan_next_question` tool call with specific arguments.
_tool_call_case = {
    "history": [
        {"role": "assistant", "content": "Hello! What is your name?"},
        {"role": "user", "content": "Aman Dalmia"},
    ],
    "evaluation": {
        "type": "tool_call",
        "tool_calls": [
            {
                "tool": "plan_next_question",
                "arguments": {
                    "next_unanswered_question_index": 2,
                    "questions_answered": [1],
                },
            },
        ],
    },
    "settings": {"language": "english"},
}

# Case 2: the user asks to skip a question; the evaluation is a
# free-text criterion on the response rather than a tool call.
# This case carries no "settings" entry.
_skip_question_case = {
    "history": [
        {"role": "assistant", "content": "What is your phone number?"},
        {"role": "user", "content": "Can I skip this question?"},
    ],
    "evaluation": {
        "type": "response",
        "criteria": "The assistant should allow the user to skip giving their phone number.",
    },
}

test_cases = [_tool_call_case, _skip_question_case]
# Run tests
# Keyword arguments for `tests.run`, gathered in one place so the call
# below stays short. `tools` and `test_cases` are the module-level lists
# defined earlier in this script.
_run_options = {
    "system_prompt": "You are a helpful assistant filling out a form...",
    "tools": tools,
    "test_cases": test_cases,
    "output_dir": "./out",
    "model": "openai/gpt-4.1",
    "provider": "openrouter",
    "run_name": "my_test_run",
}

# `tests.run` is awaited via asyncio.run, which drives it to completion
# and hands back its return value.
# NOTE(review): the shape of `result` and what gets written under
# "./out" are defined by calibrate -- confirm against its docs.
result = asyncio.run(tests.run(**_run_options))