Spaces:

ctuning
/

FlexBoard

Sleeping

FlexBoard / cost_calculator.py

Grigori Fursin

first commit

4d2d78e unverified 11 months ago

3.98 kB

	"""
	Cost calculation module for MLPerf configurations.
	"""

	import logging

	import pandas as pd

	logger = logging.getLogger(__name__)

	# Hourly rate used for any accelerator family that has no dedicated entry.
	DEFAULT_HOURLY_COST: float = 1.0

	# Built-in hourly rate per accelerator family. The keys are the family
	# labels that normalize_gpu_name() returns.
	# NOTE(review): the currency is not stated anywhere in this module - confirm.
	DEFAULT_DEVICE_COSTS: dict[str, float] = {
	    "NVIDIA H100": 3.0,
	    "NVIDIA H200": 4.0,
	    "NVIDIA GH200": 5.0,
	    "NVIDIA B200/GB200": 7.0,
	    "AMD MI300X": 3.5,
	    "AMD MI325X": 4.5,
	    "NVIDIA RTX 4090": 1.2,
	    "NVIDIA L40S": 1.8,
	    "NVIDIA Jetson AGX": 0.3,
	}

	# Active cost table (family label -> hourly rate). It starts empty and is
	# rebuilt by initialize_device_costs() and amended by update_device_costs().
	device_costs: dict[str, float] = {}


	def normalize_gpu_name(name: str) -> str:
	    """Map a raw accelerator name onto its canonical device-family label.

	    Matching is a case-insensitive substring search. Jetson boards are
	    recognised first (the name must contain "JETSON" together with "ORIN"
	    or "THOR"); otherwise the first matching family keyword wins.

	    Args:
	        name: Accelerator name as it appears in the dataset.

	    Returns:
	        The canonical family label, or ``name`` unchanged when it is empty,
	        ``None``, or matches no known family.
	    """
	    if not name:
	        return name

	    name_upper = name.upper()

	    if "JETSON" in name_upper and ("ORIN" in name_upper or "THOR" in name_upper):
	        return "NVIDIA Jetson AGX"

	    # Order matters: the first keyword found in the name wins, so a keyword
	    # that contains another one ("GH200" contains "H200") must be tested
	    # before the shorter one. Otherwise GH200 systems are reported as H200.
	    gpu_families = (
	        ("GH200", "NVIDIA GH200"),
	        ("GRACE HOPPER", "NVIDIA GH200"),
	        ("H100", "NVIDIA H100"),
	        ("H200", "NVIDIA H200"),
	        ("GB200", "NVIDIA B200/GB200"),
	        ("B200", "NVIDIA B200/GB200"),
	        ("MI300X", "AMD MI300X"),
	        ("MI325X", "AMD MI325X"),
	        ("RTX 4090", "NVIDIA RTX 4090"),
	        ("L40S", "NVIDIA L40S"),
	    )

	    for keyword, normalized_name in gpu_families:
	        if keyword in name_upper:
	            return normalized_name

	    return name


	def initialize_device_costs(df: pd.DataFrame) -> None:
	    """Rebuild the module-level cost table from the accelerators in ``df``.

	    Every non-null value of the "system.accelerator.name" column is mapped
	    to its device family. Each family receives its rate from
	    ``DEFAULT_DEVICE_COSTS`` or, when it has no entry there,
	    ``DEFAULT_HOURLY_COST``. The previous table is discarded; a ``None`` or
	    empty frame, or one without the column, leaves the table empty.

	    Args:
	        df: Dataset to scan for accelerator names.
	    """
	    global device_costs

	    column = "system.accelerator.name"
	    families = set()
	    if df is not None and not df.empty and column in df.columns:
	        families = {normalize_gpu_name(raw) for raw in df[column].dropna().unique()}

	    device_costs = {
	        family: DEFAULT_DEVICE_COSTS.get(family, DEFAULT_HOURLY_COST)
	        for family in families
	    }

	    logger.info(f"Initialized costs for {len(device_costs)} unique device families")


	def get_device_costs() -> dict[str, float]:
	    """Return a shallow copy of the active cost table.

	    Returns:
	        A new dict mapping device family to hourly rate; changing it does
	        not affect the module-level table.
	    """
	    return dict(device_costs)


	def update_device_costs(new_costs: dict[str, float]) -> None:
	    """Merge ``new_costs`` into the active cost table in place.

	    Existing entries with the same key are overwritten; all other entries
	    are kept.

	    Args:
	        new_costs: Hourly rates to add or override, keyed by device name.
	    """
	    global device_costs
	    # In-place union mutates the existing dict, exactly like dict.update().
	    device_costs |= new_costs
	    logger.info(f"Updated costs for {len(new_costs)} devices")


	def calculate_costs(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a copy of ``df`` extended with cost metrics.

	    Two columns are added (or replaced), both of object dtype with ``None``
	    marking "not available":

	    * "hourly_cost" - the value of ``estimate_hourly_cost`` for the row.
	    * "cost_per_million_tokens" - ``hourly_cost / (metrics.result * 3600)
	      * 1_000_000``, i.e. "metrics.result" is treated as a per-second
	      rate. It is filled only when both the hourly cost and a positive
	      "metrics.result" are available.

	    Args:
	        df: Benchmark results, one configuration per row.

	    Returns:
	        The augmented copy, or ``df`` itself when it is ``None`` or empty.
	    """
	    if df is None or df.empty:
	        return df

	    result_df = df.copy()

	    # Results are gathered positionally and assigned as whole columns.
	    # Writing cell by cell with ``.at[label, column]`` is wrong when the
	    # index has duplicate labels (e.g. after a concat): pandas then writes
	    # the value to every row that shares the label.
	    hourly_costs = []
	    costs_per_million = []

	    for _, row in result_df.iterrows():
	        hourly_cost = estimate_hourly_cost(row)
	        cost_per_million = None

	        throughput = row.get("metrics.result")
	        if hourly_cost and throughput:
	            tokens_per_hour = throughput * 3600
	            if tokens_per_hour > 0:
	                cost_per_million = (hourly_cost / tokens_per_hour) * 1000000

	        hourly_costs.append(hourly_cost)
	        costs_per_million.append(cost_per_million)

	    # dtype=object keeps missing values as None rather than coercing to NaN.
	    result_df["hourly_cost"] = pd.Series(
	        hourly_costs, index=result_df.index, dtype=object
	    )
	    result_df["cost_per_million_tokens"] = pd.Series(
	        costs_per_million, index=result_df.index, dtype=object
	    )

	    return result_df


	def estimate_hourly_cost(row: pd.Series) -> float:
	"""Estimate hourly cost for a single configuration.

	The result is the per-accelerator rate multiplied by the value of
	"system.accelerator.total_count". The rate starts at DEFAULT_HOURLY_COST
	and is replaced by the ``device_costs`` entry for the normalized
	accelerator name or for the vendor when such an entry exists.

	Args:
	    row: One configuration; the fields "system.accelerator.name",
	        "system.accelerator.vendor" and "system.accelerator.total_count"
	        are read with ``row.get``.

	Returns:
	    The estimated hourly cost, or None when the accelerator count is
	    missing or zero, or when any exception occurs (a warning is logged).
	"""
	# Best-effort: any failure below is logged and reported as None.
	try:
	acc_name = row.get("system.accelerator.name")
	acc_vendor = row.get("system.accelerator.vendor")
	acc_count = row.get("system.accelerator.total_count")

	# Without a count there is nothing to multiply by.
	# NOTE(review): a NaN count is truthy, so it passes this check and yields NaN.
	if not acc_count:
	return None

	base_cost = DEFAULT_HOURLY_COST

	# NOTE(review): indentation is lost in this copy of the file, so it is
	# unclear whether the `elif` below pairs with `if acc_name:` or with the
	# inner `if normalized_name in device_costs:` - confirm against the repo.
	if acc_name:
	normalized_name = normalize_gpu_name(acc_name)
	if normalized_name in device_costs:
	base_cost = device_costs[normalized_name]
	elif acc_vendor and acc_vendor in device_costs:
	base_cost = device_costs[acc_vendor]

	return base_cost * acc_count

	except Exception as e:
	logger.warning(f"Error calculating cost: {e}")
	return None