{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4c2a6fa7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. Please move to the pypdf library instead.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from evoagentx.agents.agent_manager import AgentManager\n",
"from evoagentx.benchmark import HotPotQA\n",
"from evoagentx.core.callbacks import suppress_logger_info\n",
"from evoagentx.core.logging import logger\n",
"from evoagentx.evaluators import Evaluator\n",
"from evoagentx.models import OpenAILLM, OpenAILLMConfig\n",
"from evoagentx.optimizers import TextGradOptimizer\n",
"from evoagentx.prompts import StringTemplate\n",
"from evoagentx.workflow import SequentialWorkFlowGraph\n",
"from dotenv import load_dotenv\n",
"\n",
"from evoagentx.agents.agent_manager import AgentManager\n",
"from evoagentx.benchmark import MBPP\n",
"from evoagentx.core.callbacks import suppress_logger_info\n",
"from evoagentx.core.logging import logger\n",
"from evoagentx.evaluators import Evaluator\n",
"from evoagentx.models import OpenAILLM, OpenAILLMConfig\n",
"from evoagentx.optimizers import TextGradOptimizer\n",
"from evoagentx.prompts import StringTemplate\n",
"from evoagentx.workflow import SequentialWorkFlowGraph\n",
"\n",
"from evoagentx.models import OpenAILLMConfig, OpenAILLM\n",
"from evoagentx.workflow import SEWWorkFlowGraph, STRUCTUREWorkFlowGraph\n",
"from evoagentx.agents import AgentManager\n",
"from evoagentx.benchmark import HumanEval,AFlowMBPP\n",
"from evoagentx.evaluators import Evaluator \n",
"from evoagentx.optimizers import SEWOptimizer, STRUCTUREOptimizer\n",
"from evoagentx.optimizers.structure_optimizer import STRUCTUREWorkFlowScheme\n",
"from evoagentx.core.callbacks import suppress_logger_info\n",
"\n",
"from evoagentx.models import OpenAILLMConfig, OpenAILLM,AzureOpenAIConfig,LiteLLMConfig,LiteLLM\n",
"from evoagentx.workflow import SEWWorkFlowGraph \n",
"from evoagentx.agents import AgentManager\n",
"from evoagentx.benchmark import MBPPPLUS, AFlowMBPPPLUS\n",
"from evoagentx.evaluators import Evaluator \n",
"from evoagentx.optimizers import SEWOptimizer \n",
"from evoagentx.core.callbacks import suppress_logger_info\n",
"from evoagentx.benchmark import HumanEvalPLUS\n",
"from evoagentx.benchmark import SciCode\n",
"from copy import deepcopy\n",
"from evoagentx.benchmark import PertQA\n",
"from copy import deepcopy\n",
"\n",
"import nest_asyncio\n",
"nest_asyncio.apply()\n",
"\n",
"class HotPotQASplits(HotPotQA):\n",
"    \"\"\"HotPotQA benchmark whose train/dev/test splits are re-drawn from the loaded dev data.\"\"\"\n",
"\n",
"    def _load_data(self):\n",
"        \"\"\"Load the stock data, then overwrite the three splits with seeded random subsets.\"\"\"\n",
"        super()._load_data()\n",
"\n",
"        import numpy as np\n",
"\n",
"        # Fixed seed so the shuffled order is the same on every run.\n",
"        np.random.seed(42)\n",
"        pool = self._dev_data\n",
"        shuffled = np.random.permutation(len(pool))\n",
"\n",
"        def take(indices):\n",
"            return [pool[i] for i in indices]\n",
"\n",
"        # The first 50 shuffled examples feed BOTH train and dev; the next 500 form test.\n",
"        # NOTE(review): train and dev are therefore identical sets -- confirm the overlap is intended.\n",
"        self._train_data = take(shuffled[:50])\n",
"        self._dev_data = take(shuffled[:50])\n",
"        self._test_data = take(shuffled[50:550])\n",
"        # Keep a handle on the unsplit pool as well.\n",
"        self._fulldata = pool\n",
"\n",
"\n",
"def collate_func(example: dict) -> dict:\n",
"    \"\"\"Turn one benchmark example into the workflow's ``problem`` input.\n",
"\n",
"    Args:\n",
"        example: Record whose ``question_new`` entry holds the question text.\n",
"\n",
"    Returns:\n",
"        A one-key dict mapping ``problem`` to the formatted question prompt.\n",
"    \"\"\"\n",
"    question = example[\"question_new\"]\n",
"    return {\"problem\": f\"Question: {question}\\n\\nAnswer:\"}\n",
"\n",
"\n",
"# Single-task workflow definition: one task takes `problem` as input and declares `answer` as output.\n",
"hotpotqa_graph_data = {\n",
"    \"goal\": \"Answer the question based on the context. The answer should be a direct response to the question, without including explanations or reasoning.\",\n",
"    \"tasks\": [\n",
"        {\n",
"            \"name\": \"answer_generate\",\n",
"            \"description\": \"Answer the question based on the context.\",\n",
"            \"inputs\": [\n",
"                {\"name\": \"problem\", \"type\": \"str\", \"required\": True, \"description\": \"The problem to solve.\"}\n",
"            ],\n",
"            \"outputs\": [\n",
"                {\"name\": \"answer\", \"type\": \"str\", \"required\": True, \"description\": \"The answer to the problem.\"}\n",
"            ],\n",
"            # parse_mode is \"xml\", so the example in the instruction spells out the literal\n",
"            # <thought>/<answer> tags instead of a bare \"xxx and xxx\".\n",
"            \"prompt_template\": StringTemplate(instruction=\"Think step by step to answer the question. You should explain your thinking process in the 'thought' field, and provide the final answer in the 'answer' field.\\nFormat your output in xml format, such as <thought>xxx</thought> and <answer>xxx</answer>.\"),\n",
"            \"parse_mode\": \"xml\"\n",
"        }\n",
"    ]\n",
"}\n",
"# Azure OpenAI settings are read from the environment (optionally via a local .env file)\n",
"# so that no credential is stored in the notebook.\n",
"# NOTE: an API key was previously committed in this cell; treat it as leaked and rotate it.\n",
"load_dotenv()\n",
"os.environ.setdefault(\"AZURE_OPENAI_DEPLOYMENT_NAME\", \"gpt-4o-mini\")\n",
"os.environ.setdefault(\"AZURE_OPENAI_ENDPOINT\", \"https://tianyuliu-hua-raredisea-resource.cognitiveservices.azure.com/\")\n",
"os.environ.setdefault(\"AZURE_OPENAI_API_VERSION\", \"2025-01-01-preview\")\n",
"\n",
"# Fail fast with a clear message rather than surfacing an authentication error mid-run.\n",
"if not os.getenv(\"AZURE_OPENAI_KEY\"):\n",
"    raise RuntimeError(\"AZURE_OPENAI_KEY is not set; export it or add it to a .env file before running this notebook.\")\n",
"\n",
"llm_config = LiteLLMConfig(\n",
"    model=\"azure/\" + os.environ[\"AZURE_OPENAI_DEPLOYMENT_NAME\"],  # Azure model format\n",
"    azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n",
"    azure_key=os.environ[\"AZURE_OPENAI_KEY\"],\n",
"    api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n",
"    top_p=0.85,\n",
"    temperature=0.2,\n",
"    frequency_penalty=0.0,\n",
"    presence_penalty=0.0,\n",
")\n",
"\n",
"# Executor and optimizer share one configuration; `llm` is an alias for the executor.\n",
"executor_llm = LiteLLM(config=llm_config)\n",
"optimizer_llm = LiteLLM(config=llm_config)\n",
"llm = executor_llm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ad0efa03",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"evoagentx.optimizers.sew_optimizer.SEWOptimizer"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the imported class as a quick check that the optimizer import resolved.\n",
"SEWOptimizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ad4b2024",
"metadata": {},
"outputs": [],
"source": [
"# difficult easy "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c95059f0",
"metadata": {},
"outputs": [],
"source": [
"from evoagentx.benchmark import HotPotQA"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "84efabfa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:04:54.121\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.pertqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mloading HotPotQA data from /home/tl688/pitl688/selfevolve/EvoAgentX/examples/pertqa/reploge_train.json ...\u001b[0m\n",
"\u001b[32m2026-01-04 11:04:54.197\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.pertqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mloading HotPotQA data from /home/tl688/pitl688/selfevolve/EvoAgentX/examples/pertqa/reploge_train.json ...\u001b[0m\n",
"\u001b[32m2026-01-04 11:04:54.266\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.pertqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m52\u001b[0m - \u001b[1mloading HotPotQA data from /home/tl688/pitl688/selfevolve/EvoAgentX/examples/pertqa/reploge_test.json ...\u001b[0m\n"
]
}
],
"source": [
"# The evaluator runs on the same model as the executor.\n",
"llm = executor_llm\n",
"\n",
"# Build the workflow graph from the task definition and register its agents.\n",
"sew_graph = SEWWorkFlowGraph.from_dict(hotpotqa_graph_data)\n",
"agent_manager = AgentManager()\n",
"agent_manager.add_agents_from_workflow(sew_graph, executor_llm.config)\n",
"\n",
"# Benchmark instance for the 'reploge' PertQA data.\n",
"benchmark = PertQA(pertdata='reploge')\n",
"\n",
"# Evaluator that executes the workflow on benchmark examples via collate_func.\n",
"evaluator = Evaluator(\n",
"    llm=llm,\n",
"    agent_manager=agent_manager,\n",
"    collate_func=collate_func,\n",
"    num_workers=20,\n",
"    verbose=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d2bba683",
"metadata": {},
"outputs": [],
"source": [
"# import json\n",
"# # with open(\"../../MaAS/maas/ext/maas/data/humaneval_train.jsonl\", 'w') as f:\n",
"# # json.dump(humaneval._dev_data, f, indent=2) # indent=4 makes the JSON output more readable\n",
"\n",
"\n",
"# # with open(\"../../MaAS/maas/ext/maas/data/humaneval_test.jsonl\", 'w') as f:\n",
"# # json.dump(humaneval._test_data, f, indent=2) # indent=4 makes the JSON output more readable\n",
"\n",
"# with open(\"../../MaAS/maas/ext/maas/data/humaneval_train.jsonl\", 'w') as f:\n",
"# for obj in humaneval._dev_data:\n",
"# json_line = json.dumps(obj)\n",
"# f.write(json_line + '\\n')\n",
" \n",
"# with open(\"../../MaAS/maas/ext/maas/data/humaneval_test.jsonl\", 'w') as f:\n",
"# for obj in humaneval._test_data:\n",
"# json_line = json.dumps(obj)\n",
"# f.write(json_line + '\\n')\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8598151b",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of nodes in the initial workflow graph.\n",
"len(sew_graph.to_dict()[\"nodes\"])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b1f7fc18",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of edges in the initial workflow graph.\n",
"len(sew_graph.edges)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "33859fa8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the edge list itself.\n",
"sew_graph.edges"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3c048529",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Set up the SEW optimizer around the workflow graph and its evaluator.\n",
"optimizer = SEWOptimizer(\n",
"    graph=sew_graph,\n",
"    evaluator=evaluator,\n",
"    llm=llm,\n",
"    max_steps=20,\n",
"    eval_rounds=3,\n",
"    repr_scheme=\"python\",\n",
"    optimize_mode=\"all\",\n",
"    order=\"zero-order\",\n",
"    max_rounds=20,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9cbdd001",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the first 50 examples of the train and dev splits.\n",
"benchmark._train_data = benchmark._train_data[:50]\n",
"benchmark._dev_data = benchmark._dev_data[:50]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8b05058e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:05:05.377\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m678\u001b[0m - \u001b[1mOptimizing the SEWWorkFlowGraph workflow with python representation.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:05.378\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m682\u001b[0m - \u001b[1mRun initial evaluation on the original workflow ...\u001b[0m\n",
"Evaluating workflow: 2%|▏ | 1/50 [00:01<01:14, 1.52s/it]Task exception was never retrieved\n",
"future: exception=RuntimeError('Event loop is closed')>\n",
"Traceback (most recent call last):\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/tasks.py\", line 277, in __step\n",
" result = coro.send(None)\n",
" ^^^^^^^^^^^^^^^\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/utils.py\", line 873, in _client_async_logging_helper\n",
" GLOBAL_LOGGING_WORKER.ensure_initialized_and_enqueue(\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/litellm_core_utils/logging_worker.py\", line 322, in ensure_initialized_and_enqueue\n",
" self.enqueue(async_coroutine)\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/litellm_core_utils/logging_worker.py\", line 131, in enqueue\n",
" self._queue.put_nowait(task)\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/queues.py\", line 147, in put_nowait\n",
" self._wakeup_next(self._getters)\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/queues.py\", line 63, in _wakeup_next\n",
" waiter.set_result(None)\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/futures.py\", line 263, in set_result\n",
" self.__schedule_callbacks()\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/futures.py\", line 173, in __schedule_callbacks\n",
" self._loop.call_soon(callback, self, context=ctx)\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/base_events.py\", line 762, in call_soon\n",
" self._check_closed()\n",
" File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/base_events.py\", line 520, in _check_closed\n",
" raise RuntimeError('Event loop is closed')\n",
"RuntimeError: Event loop is closed\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 8%|▊ | 4/50 [00:01<00:15, 2.97it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 20%|██ | 10/50 [00:02<00:04, 9.01it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 34%|███▍ | 17/50 [00:02<00:02, 15.34it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 40%|████ | 20/50 [00:02<00:01, 16.27it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 50%|█████ | 25/50 [00:03<00:03, 6.94it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 54%|█████▍ | 27/50 [00:03<00:02, 7.67it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 70%|███████ | 35/50 [00:04<00:01, 12.84it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 78%|███████▊ | 39/50 [00:04<00:00, 14.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 82%|████████▏ | 41/50 [00:05<00:01, 6.47it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 86%|████████▌ | 43/50 [00:05<00:00, 7.11it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 94%|█████████▍| 47/50 [00:06<00:00, 8.02it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 98%|█████████▊| 49/50 [00:06<00:00, 7.83it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 7.82it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:05:11.833\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m685\u001b[0m - \u001b[1mInitial metrics: {'f1': 0.0, 'em': 0.0, 'acc': 0.5}\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:05:12.299\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.010 | Total tokens: 39366 | Current cost: $0.000 | Current tokens: 111\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:13.041\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.010 | Total tokens: 39504 | Current cost: $0.000 | Current tokens: 138\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:13.043\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 0: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:13.607\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.010 | Total tokens: 39608 | Current cost: $0.000 | Current tokens: 104\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:14.163\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.010 | Total tokens: 39745 | Current cost: $0.000 | Current tokens: 137\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:14.164\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 1: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:14.564\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.010 | Total tokens: 39856 | Current cost: $0.000 | Current tokens: 111\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:15.140\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 39994 | Current cost: $0.000 | Current tokens: 138\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:15.141\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 2: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:15.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 40057 | Current cost: $0.000 | Current tokens: 63\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:16.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 40184 | Current cost: $0.000 | Current tokens: 127\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:16.197\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 3: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:17.001\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 40281 | Current cost: $0.000 | Current tokens: 97\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:18.395\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 40501 | Current cost: $0.000 | Current tokens: 220\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:18.396\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:18.397\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 4: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:20.641\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 40762 | Current cost: $0.000 | Current tokens: 261\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:21.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41077 | Current cost: $0.000 | Current tokens: 315\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:21.198\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 5: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:21.819\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41176 | Current cost: $0.000 | Current tokens: 99\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:22.472\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41338 | Current cost: $0.000 | Current tokens: 162\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:22.473\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:22.474\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 6: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:22.879\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41405 | Current cost: $0.000 | Current tokens: 67\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:23.352\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41543 | Current cost: $0.000 | Current tokens: 138\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:23.353\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 7: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:23.795\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41621 | Current cost: $0.000 | Current tokens: 78\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:24.365\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41762 | Current cost: $0.000 | Current tokens: 141\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:24.366\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 8: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:25.021\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 41862 | Current cost: $0.000 | Current tokens: 100\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:25.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42026 | Current cost: $0.000 | Current tokens: 164\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:25.465\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 9: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:26.127\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42140 | Current cost: $0.000 | Current tokens: 114\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:26.734\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42317 | Current cost: $0.000 | Current tokens: 177\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:26.736\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:26.736\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 10: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:27.029\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42392 | Current cost: $0.000 | Current tokens: 75\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:05:27.621\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42528 | Current cost: $0.000 | Current tokens: 136\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:27.622\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:27.622\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 11: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:29.335\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42724 | Current cost: $0.000 | Current tokens: 196\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:29.887\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.011 | Total tokens: 42988 | Current cost: $0.000 | Current tokens: 264\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:29.888\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 12: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:31.604\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 43239 | Current cost: $0.000 | Current tokens: 251\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:32.175\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 43553 | Current cost: $0.000 | Current tokens: 314\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:32.176\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 13: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:34.782\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 43914 | Current cost: $0.000 | Current tokens: 361\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:35.278\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 44330 | Current cost: $0.000 | Current tokens: 416\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:35.280\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 14: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:35.853\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 44428 | Current cost: $0.000 | Current tokens: 98\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:36.369\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 44592 | Current cost: $0.000 | Current tokens: 164\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:36.370\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 15: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:37.525\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 44768 | Current cost: $0.000 | Current tokens: 176\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:38.148\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45012 | Current cost: $0.000 | Current tokens: 244\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:38.149\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 16: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:38.474\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45116 | Current cost: $0.000 | Current tokens: 104\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:38.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45253 | Current cost: $0.000 | Current tokens: 137\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:38.901\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 17: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:39.651\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45372 | Current cost: $0.000 | Current tokens: 119\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:40.276\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45559 | Current cost: $0.000 | Current tokens: 187\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:40.277\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 18: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:45.335\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45698 | Current cost: $0.000 | Current tokens: 139\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:46.105\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.012 | Total tokens: 45924 | Current cost: $0.000 | Current tokens: 226\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:46.106\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:46.107\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 19: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:46.107\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m707\u001b[0m - \u001b[1mReach the maximum number of steps 20. Stop the optimization.\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:46.108\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m710\u001b[0m - \u001b[1mRestore the best graph from the snapshot ...\u001b[0m\n",
"\u001b[32m2026-01-04 11:05:46.108\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mrestore_best_graph\u001b[0m:\u001b[36m814\u001b[0m - \u001b[1mRestore the best graph from snapshot with metrics {'f1': 0.0, 'em': 0.0, 'acc': 0.5} ...\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 0%| | 2/3000 [00:01<37:22, 1.34it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 0%| | 7/3000 [00:01<07:22, 6.76it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 1%| | 17/3000 [00:02<02:51, 17.44it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 1%| | 20/3000 [00:02<02:53, 17.13it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 1%| | 23/3000 [00:03<08:24, 5.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 1%| | 25/3000 [00:04<07:58, 6.21it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 1%| | 34/3000 [00:04<03:43, 13.25it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 1%| | 37/3000 [00:04<04:34, 10.79it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 1%|▏ | 40/3000 [00:05<04:39, 10.61it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 1%|▏ | 42/3000 [00:05<07:43, 6.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 2%|▏ | 46/3000 [00:06<06:20, 7.76it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 2%|▏ | 51/3000 [00:06<04:21, 11.29it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 2%|▏ | 54/3000 [00:06<03:32, 13.88it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 2%|▏ | 58/3000 [00:07<04:23, 11.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 2%|▏ | 60/3000 [00:07<05:06, 9.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 2%|▏ | 65/3000 [00:08<06:32, 7.47it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 2%|▏ | 69/3000 [00:08<04:22, 11.17it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 2%|▏ | 72/3000 [00:08<04:31, 10.78it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 2%|▎ | 75/3000 [00:08<04:35, 10.63it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 3%|▎ | 77/3000 [00:09<05:18, 9.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 3%|▎ | 79/3000 [00:09<06:04, 8.00it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 3%|▎ | 81/3000 [00:09<06:39, 7.31it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 3%|▎ | 87/3000 [00:10<04:23, 11.04it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 3%|▎ | 89/3000 [00:10<03:58, 12.21it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 3%|▎ | 95/3000 [00:10<02:51, 16.94it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 3%|▎ | 100/3000 [00:11<06:33, 7.37it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 3%|▎ | 102/3000 [00:12<05:51, 8.25it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 4%|▎ | 107/3000 [00:12<04:28, 10.78it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▎ | 112/3000 [00:12<03:15, 14.80it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▍ | 114/3000 [00:12<03:20, 14.37it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▍ | 116/3000 [00:13<07:00, 6.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 4%|▍ | 120/3000 [00:13<05:16, 9.10it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▍ | 122/3000 [00:13<04:49, 9.93it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 4%|▍ | 127/3000 [00:14<03:53, 12.33it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▍ | 129/3000 [00:14<03:42, 12.90it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▍ | 131/3000 [00:14<05:08, 9.30it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 4%|▍ | 134/3000 [00:15<04:40, 10.21it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 5%|▍ | 136/3000 [00:15<06:51, 6.96it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 5%|▍ | 138/3000 [00:15<05:52, 8.11it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 5%|▍ | 141/3000 [00:16<07:23, 6.45it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 5%|▍ | 146/3000 [00:16<03:56, 12.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 5%|▌ | 151/3000 [00:16<04:19, 10.99it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 5%|▌ | 153/3000 [00:17<04:20, 10.92it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 5%|▌ | 156/3000 [00:17<04:16, 11.09it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 5%|▌ | 160/3000 [00:17<05:05, 9.29it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 5%|▌ | 164/3000 [00:18<05:02, 9.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 166/3000 [00:18<05:29, 8.61it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 170/3000 [00:18<03:34, 13.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 172/3000 [00:19<05:05, 9.26it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 175/3000 [00:19<03:53, 12.11it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 177/3000 [00:19<05:18, 8.87it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 6%|▌ | 181/3000 [00:20<05:39, 8.31it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 185/3000 [00:20<04:26, 10.55it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▌ | 187/3000 [00:20<04:47, 9.79it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▋ | 189/3000 [00:20<04:18, 10.89it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▋ | 191/3000 [00:21<04:43, 9.90it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 6%|▋ | 193/3000 [00:21<04:55, 9.49it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 7%|▋ | 197/3000 [00:21<05:16, 8.87it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 199/3000 [00:22<05:04, 9.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 200/3000 [00:22<05:01, 9.27it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 203/3000 [00:22<04:23, 10.63it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 205/3000 [00:22<05:17, 8.79it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 208/3000 [00:22<04:02, 11.53it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 7%|▋ | 212/3000 [00:23<04:55, 9.44it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 214/3000 [00:23<04:18, 10.79it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 7%|▋ | 218/3000 [00:23<04:20, 10.67it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 220/3000 [00:24<03:46, 12.29it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 222/3000 [00:24<04:22, 10.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 7%|▋ | 224/3000 [00:24<04:40, 9.90it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 226/3000 [00:24<05:26, 8.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 227/3000 [00:25<05:40, 8.13it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 229/3000 [00:25<05:26, 8.50it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 8%|▊ | 234/3000 [00:25<04:22, 10.54it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 236/3000 [00:25<04:31, 10.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 8%|▊ | 241/3000 [00:26<04:09, 11.07it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 243/3000 [00:26<04:37, 9.92it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 245/3000 [00:26<04:36, 9.97it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 8%|▊ | 249/3000 [00:27<03:39, 12.54it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 251/3000 [00:27<04:25, 10.37it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 8%|▊ | 253/3000 [00:27<04:42, 9.71it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▊ | 256/3000 [00:27<04:42, 9.73it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▊ | 258/3000 [00:31<23:59, 1.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"\u001b[32m2026-01-04 11:06:17.406\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in single-cell biology and functional genomics. In K562 cells, LIMS1 is perturbed and the expression of PLAT is measured. Does this perturbation cause a significant change in PLAT expression? Answer strictly in the format 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'No', 'question_new': \"You are an expert in single-cell biology and functional genomics. In K562 cells, LIMS1 is perturbed and the expression of PLAT is measured. Does this perturbation cause a significant change in PLAT expression? Answer strictly in the format 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 23890}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:17.434\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to AURKA and then measure expression of ERCC6L. Does this perturbation cause a significant change in ERCC6L expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to AURKA and then measure expression of ERCC6L. Does this perturbation cause a significant change in ERCC6L expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 3174}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▊ | 261/3000 [00:31<16:02, 2.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:26.299\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, EXOSC7 is perturbed and the expression of FANCA is measured. Determine whether FANCA shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, EXOSC7 is perturbed and the expression of FANCA is measured. Determine whether FANCA shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 16166}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 263/3000 [00:40<1:03:37, 1.39s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:26.411\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of SASS6 is associated with a significant change in CDKN1A expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of SASS6 is associated with a significant change in CDKN1A expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 46855}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 264/3000 [00:40<54:29, 1.19s/it] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:26.422\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert analyst of perturbation datasets. For K562 cells subjected to perturbation of RCL1, does the expression profile of SPAG5 indicate a significant change relative to control conditions? Reply strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert analyst of perturbation datasets. For K562 cells subjected to perturbation of RCL1, does the expression profile of SPAG5 indicate a significant change relative to control conditions? Reply strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 42057}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.439\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to TUBB2A and then measure expression of RRAS. Does this perturbation cause a significant change in RRAS expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to TUBB2A and then measure expression of RRAS. Does this perturbation cause a significant change in RRAS expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 56454}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.489\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb TMSB10 and examine the expression of S100A11. Does perturbing TMSB10 lead to a significant change in S100A11 expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb TMSB10 and examine the expression of S100A11. Does perturbing TMSB10 lead to a significant change in S100A11 expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 54791}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.543\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 268/3000 [00:40<29:29, 1.54it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:26.552\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a functional genomics specialist. In K562 cells, we perturb EIF1AD and monitor MND1 expression. Decide whether this perturbation leads to a significant alteration in MND1 expression. Answer only in the format 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a functional genomics specialist. In K562 cells, we perturb EIF1AD and monitor MND1 expression. Decide whether this perturbation leads to a significant alteration in MND1 expression. Answer only in the format 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 14038}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.567\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:26.607\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb GEMIN5 and examine the expression of GEMIN5. Does perturbing GEMIN5 lead to a significant change in GEMIN5 expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb GEMIN5 and examine the expression of GEMIN5. Does perturbing GEMIN5 lead to a significant change in GEMIN5 expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 17491}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.648\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 272/3000 [00:40<18:05, 2.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:26.716\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of RPL31 is associated with a significant change in CDKN1A expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of RPL31 is associated with a significant change in CDKN1A expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 44015}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.726\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:26.792\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, TAF6 is perturbed and the expression of TAF6 is measured. Determine whether TAF6 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, TAF6 is perturbed and the expression of TAF6 is measured. Determine whether TAF6 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 52806}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 275/3000 [00:40<13:16, 3.42it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:26.825\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, EIF4A3 is perturbed and EPB41 expression is measured. Determine whether EPB41 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'No', 'question_new': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, EIF4A3 is perturbed and EPB41 expression is measured. Determine whether EPB41 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 14659}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.852\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of OPA1 is associated with a significant change in OPA1 expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of OPA1 is associated with a significant change in OPA1 expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 33635}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.879\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to TWSG1 and then measure expression of HAUS1. Does this perturbation cause a significant change in HAUS1 expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to TWSG1 and then measure expression of HAUS1. Does this perturbation cause a significant change in HAUS1 expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 56774}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:26.911\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb TRRAP and examine the expression of ADM. Does perturbing TRRAP lead to a significant change in ADM expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb TRRAP and examine the expression of ADM. Does perturbing TRRAP lead to a significant change in ADM expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 55901}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 279/3000 [00:40<08:55, 5.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:27.015\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of CDC37 is associated with a significant change in RAC3 expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of CDC37 is associated with a significant change in RAC3 expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 6165}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:27.024\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, DPY19L4 is perturbed and DBNDD2 expression is measured. Determine whether DBNDD2 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, DPY19L4 is perturbed and DBNDD2 expression is measured. Determine whether DBNDD2 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 13109}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:31.710\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 282/3000 [00:45<26:46, 1.69it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:31.862\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:31.862\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 9%|▉ | 284/3000 [00:45<21:50, 2.07it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:31.938\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:31.960\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:31.996\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|▉ | 287/3000 [00:45<15:37, 2.89it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.024\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.050\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.226\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of PSMA4 is associated with a significant change in GABARAPL1 expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene expression analysis. For K562 cells, assess whether perturbation of PSMA4 is associated with a significant change in GABARAPL1 expression compared with unperturbed controls. Answer strictly as 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 39195}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|▉ | 290/3000 [00:46<11:52, 3.80it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:32.546\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.559\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|▉ | 292/3000 [00:46<10:52, 4.15it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.584\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.631\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.690\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|▉ | 295/3000 [00:46<07:57, 5.67it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:32.734\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in regulatory genomics. Consider data from K562 cells in which ATP6V1F is perturbed and ATP6V1F expression is observed. Does this perturbation lead to a significant difference in ATP6V1F expression relative to control conditions? Answer strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in regulatory genomics. Consider data from K562 cells in which ATP6V1F is perturbed and ATP6V1F expression is observed. Does this perturbation lead to a significant difference in ATP6V1F expression relative to control conditions? Answer strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 3003}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:36.081\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|▉ | 297/3000 [00:49<24:12, 1.86it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:36.097\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:36.413\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, BMS1 is perturbed and BMS1 expression is measured. Determine whether BMS1 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, BMS1 is perturbed and BMS1 expression is measured. Determine whether BMS1 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 3689}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|▉ | 299/3000 [00:50<19:56, 2.26it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:36.940\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, EXOC5 is perturbed and the expression of CLIC1 is measured. Determine whether CLIC1 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, EXOC5 is perturbed and the expression of CLIC1 is measured. Determine whether CLIC1 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 15876}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 300/3000 [00:50<20:30, 2.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:37.773\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 301/3000 [00:51<23:34, 1.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:37.785\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:37.828\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:37.852\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:37.870\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:38.137\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 306/3000 [00:51<12:08, 3.70it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:38.157\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:38.185\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:38.360\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 309/3000 [00:52<09:15, 4.84it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:38.725\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb CDIPT and examine the expression of CAV2. Does perturbing CDIPT lead to a significant change in CAV2 expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a domain expert in functional genomics. For experiments carried out in K562 cells, we perturb CDIPT and examine the expression of CAV2. Does perturbing CDIPT lead to a significant change in CAV2 expression? Reply only with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 6411}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 310/3000 [00:52<10:12, 4.39it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:38.774\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, DICER1 is perturbed and CDK6 expression is measured. Determine whether CDK6 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, DICER1 is perturbed and CDK6 expression is measured. Determine whether CDK6 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 12319}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:39.231\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:39.231\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, CENPW is perturbed and the expression of XRN2 is measured. Determine whether XRN2 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, CENPW is perturbed and the expression of XRN2 is measured. Determine whether XRN2 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 6986}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 312/3000 [00:53<10:32, 4.25it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:39.248\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:39.546\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 10%|█ | 315/3000 [00:53<08:21, 5.36it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.497\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 316/3000 [00:54<13:28, 3.32it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.568\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in regulatory genomics. Consider data from K562 cells in which PNO1 is perturbed and TACC3 expression is observed. Does this perturbation lead to a significant difference in TACC3 expression relative to control conditions? Answer strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in regulatory genomics. Consider data from K562 cells in which PNO1 is perturbed and TACC3 expression is observed. Does this perturbation lead to a significant difference in TACC3 expression relative to control conditions? Answer strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 36073}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:06:40.740\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to MSTO1 and then measure expression of VHL. Does this perturbation cause a significant change in VHL expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'No', 'question_new': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to MSTO1 and then measure expression of VHL. Does this perturbation cause a significant change in VHL expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 29024}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 318/3000 [00:54<11:03, 4.04it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:06:40.859\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 319/3000 [00:54<10:02, 4.45it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.904\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.914\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.930\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.964\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 323/3000 [00:54<05:34, 7.99it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:40.986\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:41.011\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:41.278\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 326/3000 [00:55<05:15, 8.48it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:06:41.324\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 328/3000 [01:00<33:49, 1.32it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 330/3000 [01:00<26:51, 1.66it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█ | 331/3000 [01:01<24:17, 1.83it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"\u001b[32m2026-01-04 11:06:47.310\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a functional genomics specialist. In K562 cells, we perturb NDUFS5 and monitor TBL1X expression. Decide whether this perturbation leads to a significant alteration in TBL1X expression. Answer only in the format 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a functional genomics specialist. In K562 cells, we perturb NDUFS5 and monitor TBL1X expression. Decide whether this perturbation leads to a significant alteration in TBL1X expression. Answer only in the format 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 30948}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 11%|█ | 334/3000 [01:02<21:33, 2.06it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 11%|█▏ | 340/3000 [01:02<08:01, 5.52it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█▏ | 342/3000 [01:02<06:38, 6.67it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 11%|█▏ | 344/3000 [01:03<07:05, 6.24it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 12%|█▏ | 346/3000 [01:03<08:30, 5.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 12%|█▏ | 348/3000 [01:04<08:46, 5.04it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 12%|█▏ | 354/3000 [01:04<05:06, 8.64it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 12%|█▏ | 358/3000 [01:04<04:07, 10.65it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 12%|█▏ | 360/3000 [01:04<04:30, 9.77it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 12%|█▏ | 362/3000 [01:05<05:16, 8.34it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 12%|█▏ | 364/3000 [01:05<05:05, 8.63it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 12%|█▏ | 368/3000 [01:06<06:00, 7.29it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 12%|█▏ | 373/3000 [01:06<04:12, 10.41it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 12%|█▎ | 375/3000 [01:06<04:07, 10.62it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 13%|█▎ | 379/3000 [01:06<03:54, 11.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 13%|█▎ | 382/3000 [01:07<03:29, 12.48it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 13%|█▎ | 384/3000 [01:07<04:36, 9.48it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 13%|█▎ | 386/3000 [01:07<05:43, 7.61it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 13%|█▎ | 387/3000 [01:08<06:26, 6.76it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 13%|█▎ | 393/3000 [01:08<04:27, 9.74it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 13%|█▎ | 400/3000 [01:08<02:41, 16.10it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 13%|█▎ | 403/3000 [01:09<02:33, 16.89it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▎ | 405/3000 [01:09<05:56, 7.27it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▎ | 407/3000 [01:10<05:41, 7.60it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 14%|█▎ | 412/3000 [01:10<04:21, 9.89it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▍ | 414/3000 [01:10<04:28, 9.62it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▍ | 417/3000 [01:10<03:34, 12.06it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▍ | 419/3000 [01:11<04:13, 10.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▍ | 421/3000 [01:11<04:32, 9.45it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 14%|█▍ | 424/3000 [01:11<05:42, 7.53it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 14%|█▍ | 429/3000 [01:12<04:15, 10.05it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▍ | 432/3000 [01:12<03:25, 12.52it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 14%|█▍ | 434/3000 [01:12<03:59, 10.70it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▍ | 436/3000 [01:12<03:38, 11.72it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▍ | 438/3000 [01:13<03:54, 10.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 15%|█▍ | 441/3000 [01:13<05:48, 7.34it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▍ | 442/3000 [01:13<06:13, 6.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 15%|█▍ | 448/3000 [01:14<04:14, 10.02it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▌ | 450/3000 [01:14<04:13, 10.07it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▌ | 452/3000 [01:14<04:16, 9.94it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▌ | 454/3000 [01:15<04:39, 9.13it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 15%|█▌ | 459/3000 [01:15<03:54, 10.86it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▌ | 461/3000 [01:15<03:35, 11.76it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 15%|█▌ | 463/3000 [01:15<04:42, 8.99it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▌ | 465/3000 [01:16<05:14, 8.05it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▌ | 468/3000 [01:16<03:51, 10.94it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▌ | 470/3000 [01:16<04:50, 8.72it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 16%|█▌ | 476/3000 [01:17<04:03, 10.36it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▌ | 479/3000 [01:17<03:39, 11.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▌ | 481/3000 [01:17<03:23, 12.41it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 16%|█▌ | 485/3000 [01:18<04:16, 9.82it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▌ | 487/3000 [01:18<04:56, 8.49it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 16%|█▋ | 490/3000 [01:18<04:20, 9.64it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▋ | 492/3000 [01:18<04:39, 8.97it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 16%|█▋ | 495/3000 [01:19<03:48, 10.98it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 17%|█▋ | 497/3000 [01:19<04:08, 10.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 17%|█▋ | 499/3000 [01:19<03:53, 10.69it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 17%|█▋ | 503/3000 [01:19<04:06, 10.13it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 17%|█▋ | 505/3000 [01:20<04:17, 9.68it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 17%|█▋ | 508/3000 [01:20<04:52, 8.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 17%|█▋ | 510/3000 [01:20<04:41, 8.86it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 17%|█▋ | 513/3000 [01:21<04:40, 8.87it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 17%|█▋ | 516/3000 [01:21<04:16, 9.68it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 17%|█▋ | 519/3000 [01:21<04:17, 9.62it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 17%|█▋ | 523/3000 [01:22<03:31, 11.69it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 525/3000 [01:22<04:57, 8.32it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 18%|█▊ | 531/3000 [01:22<03:39, 11.24it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 533/3000 [01:23<03:28, 11.83it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 535/3000 [01:23<04:21, 9.41it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 18%|█▊ | 539/3000 [01:23<04:13, 9.73it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 541/3000 [01:23<03:56, 10.40it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 543/3000 [01:24<04:40, 8.77it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 18%|█▊ | 548/3000 [01:24<04:21, 9.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 550/3000 [01:25<03:51, 10.58it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 552/3000 [01:25<04:19, 9.43it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 18%|█▊ | 554/3000 [01:25<04:25, 9.22it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 19%|█▊ | 556/3000 [01:25<04:14, 9.61it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 19%|█▊ | 560/3000 [01:26<04:03, 10.04it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 19%|█▊ | 562/3000 [01:26<04:37, 8.78it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 19%|█▉ | 565/3000 [01:26<05:20, 7.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 19%|█▉ | 570/3000 [01:27<03:11, 12.69it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 19%|█▉ | 572/3000 [01:27<03:49, 10.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 19%|█▉ | 574/3000 [01:27<03:50, 10.54it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 19%|█▉ | 576/3000 [01:27<04:17, 9.41it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 19%|█▉ | 581/3000 [01:28<03:17, 12.24it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 19%|█▉ | 583/3000 [01:28<03:26, 11.70it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 20%|█▉ | 585/3000 [01:28<06:02, 6.66it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"Evaluating workflow: 20%|█▉ | 587/3000 [01:29<05:17, 7.61it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 20%|█▉ | 592/3000 [01:31<10:38, 3.77it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:07:27.417\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, WDR43 is perturbed and the expression of LMNB1 is measured. Determine whether LMNB1 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, WDR43 is perturbed and the expression of LMNB1 is measured. Determine whether LMNB1 shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 59806}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:07:31.993\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in single-cell biology and functional genomics. In K562 cells, MRPL2 is perturbed and the expression of MRPL2 is measured. Does this perturbation cause a significant change in MRPL2 expression? Answer strictly in the format 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in single-cell biology and functional genomics. In K562 cells, MRPL2 is perturbed and the expression of MRPL2 is measured. Does this perturbation cause a significant change in MRPL2 expression? Answer strictly in the format 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 27210}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating workflow: 20%|█▉ | 594/3000 [01:45<1:24:30, 2.11s/it]Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.068567108)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.704426785)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.468535935)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.75796186)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.630263251)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.47683879)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.5055185)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.804223335)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.545936991)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.674084593)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.509688697)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.556698489)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.713792279)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.748664991)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.317082633)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556066.018006141)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.859226761)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.29137632)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556065.879643574)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556067.009974304)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556067.78460822)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556067.591595393)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556068.140438927)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556069.086527261)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556069.874236782)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556069.604221863)])']\n",
"connector: \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed connector\n",
"connections: ['deque([(, 1556067.372303514)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556068.073945957)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556068.953597037)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556068.699285816)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556069.155374525)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556067.81179197)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.532364606)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:07:32.450\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a functional genomics specialist. In K562 cells, we perturb WAC and monitor PLOD2 expression. Decide whether this perturbation leads to a significant alteration in PLOD2 expression. Answer only in the format 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a functional genomics specialist. In K562 cells, we perturb WAC and monitor PLOD2 expression. Decide whether this perturbation leads to a significant alteration in PLOD2 expression. Answer only in the format 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 59338}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed client session\n",
"client_session: \n",
"Evaluating workflow: 20%|█▉ | 595/3000 [01:46<1:13:55, 1.84s/it]Unclosed client session\n",
"client_session: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:07:32.472\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, PPP1R15B is perturbed and SRP72 expression is measured. Determine whether SRP72 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert in gene regulation studies. For experiments performed in K562 cells, PPP1R15B is perturbed and SRP72 expression is measured. Determine whether SRP72 exhibits a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 37859}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.4637648)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:07:32.488\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are an expert analyst of perturbation datasets. For K562 cells subjected to perturbation of DARS, does the expression profile of CBR3 indicate a significant change relative to control conditions? Reply strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are an expert analyst of perturbation datasets. For K562 cells subjected to perturbation of DARS, does the expression profile of CBR3 indicate a significant change relative to control conditions? Reply strictly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 10667}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:07:32.489\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to SMUG1 and then measure expression of FBH1. Does this perturbation cause a significant change in FBH1 expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to SMUG1 and then measure expression of FBH1. Does this perturbation cause a significant change in FBH1 expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 49504}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:07:32.493\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to MED4 and then measure expression of DMKN. Does this perturbation cause a significant change in DMKN expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'No', 'question_new': \"You are a specialist in gene perturbation experiments. In K562 cells, we introduce a perturbation to MED4 and then measure expression of DMKN. Does this perturbation cause a significant change in DMKN expression? Reply only in the form 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 25754}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n",
"\u001b[32m2026-01-04 11:07:32.501\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.587330725)])']\n",
"connector: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2026-01-04 11:07:32.504\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n",
"Example: {'question': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, HNRNPR is perturbed and the expression of HNRNPR is measured. Determine whether HNRNPR shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", 'answer': 'Yes', 'question_new': \"You are a genomics expert evaluating perturbation experiments. In K562 cells, HNRNPR is perturbed and the expression of HNRNPR is measured. Determine whether HNRNPR shows a significant expression change under this perturbation. Respond exactly with 'Final Answer: Yes' or 'Final Answer: No'.\", '_id': 20556}\n",
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed connector\n",
"connections: ['deque([(, 1556106.211922575)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.360632778)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.241473948)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.511990587)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.325841442)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.383114963)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.414055342)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556105.988093819)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.714495805)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.194453581)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556106.319740423)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556108.365590929)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556108.525661851)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556108.600545375)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556109.097169432)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556110.493030311)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556111.272711614)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556111.621540795)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556110.559458895)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556110.734145775)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556110.806248243)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556112.505568483)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556112.608747868)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556112.896133511)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556114.45453075)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556114.541714774)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556114.613551364)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556116.745499441)])']\n",
"connector: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:07:32.542\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556116.569785187)])']\n",
"connector: \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n",
"\u001b[32m2026-01-04 11:07:32.547\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US 2 have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556116.902401341)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556116.383799014)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556115.850771736)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556116.416547595)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556117.853690574)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556118.627459094)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556118.571596409)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556119.133989453)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556119.929369086)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556119.820079831)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.06630069)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556119.938016696)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.720838844)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.792143208)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.501614404)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.238519401)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.696463016)])']\n",
"connector: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed client session\n",
"client_session: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.683630116)])']\n",
"connector: \n",
"Unclosed connector\n",
"connections: ['deque([(, 1556120.75195234)])']\n",
"connector: