import os
import random
from collections import defaultdict
from dotenv import load_dotenv
from openai import OpenAI
from tool_use_env.client import ToolUseEnv
from tool_use_env.models import ToolUseAction
# --- Load env ---
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
HF_MODEL = os.getenv("HF_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
# --- HF client ---
hf_client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
)
# --- Reproducibility ---
random.seed(42)
# --- Global flag ---
HF_AVAILABLE = True
# --- Rule-based policy (correct logic) ---
def rule_based_policy(query: str):
    q = query.lower()
    if any(op in q for op in ["+", "-", "*", "/"]):
        return "use_calculator"
    if "capital" in q or "who is" in q or "ceo" in q:
        return "use_search"
    # Unrecognized queries also default to search
    return "use_search"
# --- Noisy fallback (simulate LLM mistakes) ---
def noisy_rule_policy(query: str):
    correct = rule_based_policy(query)
    if random.random() < 0.08:  # 8% noise
        # Replace the correct action with a random one to simulate mistakes
        return random.choice([
            "use_calculator",
            "use_search",
            "answer_directly",
        ])
    return correct
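# Illustrative: roughly 8% of calls swap the rule-based choice for a uniformly
# random action, so the fallback agrees with rule_based_policy on the
# remaining ~92% (plus whenever the random pick happens to coincide).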
# --- LLM + fallback policy ---
def llm_policy(query: str):
    global HF_AVAILABLE
    prompt = f"""
You are an AI agent.
Choose EXACTLY one action:
- use_calculator
- use_search
- answer_directly
Query: {query}
ONLY output one action.
"""
    # --- Try HF only if still available ---
    if HF_AVAILABLE:
        try:
            response = hf_client.chat.completions.create(
                model=HF_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            action = response.choices[0].message.content.strip()
            # Inject 8% noise to simulate LLM mistakes
            if random.random() < 0.08:
                action = random.choice([
                    "use_calculator",
                    "use_search",
                    "answer_directly",
                ])
            if action in ["use_calculator", "use_search", "answer_directly"]:
                print("[HF] Used")
                return action
        except Exception as e:
            print(f"[HF FAILED -> switching to fallback permanently] {e}")
            HF_AVAILABLE = False
    # --- Fallback ---
    return noisy_rule_policy(query)
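# Optional hardening (a sketch, not part of the original flow): models often
# wrap the chosen action in extra text, so a small normalizer can rescue
# near-miss outputs before falling back. `extract_action` is a hypothetical
# helper, not used by llm_policy above.
def extract_action(raw: str):
    lowered = raw.strip().lower()
    for candidate in ["use_calculator", "use_search", "answer_directly"]:
        if candidate in lowered:
            return candidate
    return None  # caller would fall back to noisy_rule_policy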
# --- Evaluation ---
def run_evaluation(num_episodes=50):
    results = []
    total_score = 0
    difficulty_scores = defaultdict(list)
    with ToolUseEnv(base_url="https://clove25-tool-use-openenv.hf.space").sync() as env:
        for _ in range(num_episodes):
            result = env.reset()
            obs = result.observation
            query = obs.query
            state = env.state()
            difficulty = state.difficulty
            action_type = llm_policy(query)
            action = ToolUseAction(action_type=action_type)
            result = env.step(action)
            obs = result.observation
            score = result.reward
            total_score += score
            difficulty_scores[difficulty].append(score)
            results.append({
                "query": query,
                "difficulty": difficulty,
                "action": action_type,
                "score": score,
                "message": obs.message,
            })
            print(f"Score: {score:.2f}")
    avg_score = total_score / num_episodes
    print("\n=== OVERALL PERFORMANCE ===")
    print(f"Average Score: {avg_score:.2f}")
    print("\n=== DIFFICULTY BREAKDOWN ===")
    for level in ["easy", "medium", "hard"]:
        if difficulty_scores[level]:
            avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
            print(f"{level.capitalize()}: {avg:.2f}")
    print("\n=== SAMPLE CASES ===")
    for r in results[:5]:
        print(f"\nQuery: {r['query']}")
        print(f"Action: {r['action']}")
        print(f"Score: {r['score']:.2f}")
        print(f"Details: {r['message']}")
    return results
# --- Failure analysis (fixed version) ---
def analyze_failures(results):
    total = len(results)
    tool_failures = 0
    wrong_decisions = 0
    for r in results:
        score = r["score"]
        action = r["action"]
        if score < 0.5:
            if "use_" in action:
                # A tool was chosen but the episode still scored poorly
                tool_failures += 1
            else:
                # answer_directly was chosen and the episode scored poorly
                wrong_decisions += 1
    print("\n=== FAILURE ANALYSIS ===")
    print(f"Tool failures: {tool_failures}/{total} ({(tool_failures / total) * 100:.1f}%)")
    print(f"Wrong decisions: {wrong_decisions}/{total} ({(wrong_decisions / total) * 100:.1f}%)")
# --- Run ---
if __name__ == "__main__":
    results = run_evaluation(50)
    analyze_failures(results)