| """ |
| RL Yield Optimizer β PPO-based portfolio allocation agent |
| ========================================================== |
| Uses Proximal Policy Optimization to learn optimal allocation weights |
| across USDY, mETH, and MI4 given market conditions. |
| |
| Architecture: |
| - Custom Gymnasium environment simulating the RWA yield landscape |
| - Actor-Critic network with LSTM for temporal features |
| - Risk-adjusted reward: Sharpe-like ratio with drawdown penalty |
| """ |
|
|
| import logging |
| import os |
| from dataclasses import dataclass |
| from typing import Dict, List, Optional, Tuple |
|
|
| import gymnasium as gym |
| import numpy as np |
| from gymnasium import spaces |
|
|
| logger = logging.getLogger("rl_optimizer") |
|
|
|
|
| |
|
|
| class RWAYieldEnv(gym.Env): |
| """ |
| Gymnasium environment for RWA portfolio yield optimization. |
| |
| State (15-dim): [usdy_apy, meth_apy, mi4_apy, eth_price, btc_price, |
| btc_dominance, fed_rate, usdy_peg, meth_peg, |
| eth_vol, usdy_vol, aave_usdy_apy, aave_meth_apy, |
| gas_price, mnt_price] |
| |
| Action (3-dim): Continuous weights for [USDY, mETH, MI4], softmaxed to sum=1. |
| |
| Reward: Risk-adjusted yield with drawdown penalty. |
| """ |
| |
| metadata = {"render_modes": ["human"]} |
| |
| def __init__( |
| self, |
| historical_data: Optional[Dict] = None, |
| episode_length: int = 720, |
| initial_capital: float = 100_000.0, |
| rebalance_cost_bps: int = 10, |
| risk_free_rate: float = 4.25, |
| reward_scaling: float = 100.0, |
| ): |
| super().__init__() |
| |
| self.episode_length = episode_length |
| self.initial_capital = initial_capital |
| self.rebalance_cost_bps = rebalance_cost_bps |
| self.risk_free_rate = risk_free_rate / 100.0 / 365 / 24 |
| self.reward_scaling = reward_scaling |
| |
| |
| self.observation_space = spaces.Box( |
| low=-np.inf, high=np.inf, shape=(18,), dtype=np.float32 |
| ) |
| |
| self.action_space = spaces.Box( |
| low=-1.0, high=1.0, shape=(3,), dtype=np.float32 |
| ) |
| |
| |
| self._load_historical_data(historical_data) |
| |
| |
| self.current_step = 0 |
| self.portfolio_value = initial_capital |
| self.peak_value = initial_capital |
| self.weights = np.array([0.4, 0.35, 0.25]) |
| self.returns_history: List[float] = [] |
| |
| def _load_historical_data(self, data: Optional[Dict]): |
| """Load historical data or generate synthetic data for training.""" |
| if data is not None: |
| self.yield_series = data |
| return |
| |
| |
| np.random.seed(42) |
| n = 50000 |
| |
| |
| def ou_process(mean, sigma, theta, n): |
| x = np.zeros(n) |
| x[0] = mean |
| for i in range(1, n): |
| dx = theta * (mean - x[i-1]) + sigma * np.random.normal() |
| x[i] = max(x[i-1] + dx, 0.1) |
| return x |
| |
| self.yield_series = { |
| "usdy_apy": ou_process(4.25, 0.05, 0.01, n), |
| "meth_apy": ou_process(3.8, 0.15, 0.005, n), |
| "mi4_apy": ou_process(5.0, 0.2, 0.003, n), |
| "eth_price": self._generate_gbm(3000, 0.6, n), |
| "btc_price": self._generate_gbm(60000, 0.5, n), |
| "btc_dominance": ou_process(50, 2.0, 0.01, n), |
| "fed_rate": ou_process(5.25, 0.1, 0.02, n), |
| "usdy_peg": np.clip(np.random.normal(1.0, 0.001, n), 0.99, 1.01), |
| "meth_peg": np.clip(np.random.normal(1.0, 0.005, n), 0.95, 1.05), |
| "eth_vol": ou_process(0.5, 0.05, 0.02, n), |
| "usdy_vol": np.full(n, 0.001), |
| "aave_usdy_apy": ou_process(2.0, 0.1, 0.01, n), |
| "aave_meth_apy": ou_process(1.5, 0.15, 0.01, n), |
| "gas_price": ou_process(0.02, 0.005, 0.1, n), |
| "mnt_price": self._generate_gbm(0.7, 0.8, n), |
| } |
| |
| def _generate_gbm(self, s0: float, vol: float, n: int) -> np.ndarray: |
| """Geometric Brownian Motion for price simulation.""" |
| dt = 1 / (365 * 24) |
| mu = 0.0 |
| returns = np.random.normal((mu - 0.5 * vol**2) * dt, vol * np.sqrt(dt), n) |
| prices = s0 * np.exp(np.cumsum(returns)) |
| return prices |
|
|
| def _get_obs(self) -> np.ndarray: |
| """Get current observation at self.current_step.""" |
| idx = self.start_idx + self.current_step |
| idx = min(idx, len(self.yield_series["usdy_apy"]) - 1) |
| |
| obs = np.array([ |
| self.yield_series["usdy_apy"][idx] / 10.0, |
| self.yield_series["meth_apy"][idx] / 10.0, |
| self.yield_series["mi4_apy"][idx] / 10.0, |
| self.yield_series["eth_price"][idx] / 10000.0, |
| self.yield_series["btc_price"][idx] / 100000.0, |
| self.yield_series["btc_dominance"][idx] / 100.0, |
| self.yield_series["fed_rate"][idx] / 10.0, |
| self.yield_series["usdy_peg"][idx], |
| self.yield_series["meth_peg"][idx], |
| self.yield_series["eth_vol"][idx], |
| self.yield_series["usdy_vol"][idx], |
| self.yield_series["aave_usdy_apy"][idx] / 10.0, |
| self.yield_series["aave_meth_apy"][idx] / 10.0, |
| self.yield_series["gas_price"][idx] / 1.0, |
| self.yield_series["mnt_price"][idx] / 10.0, |
| |
| self.weights[0], |
| self.weights[1], |
| self.weights[2], |
| ], dtype=np.float32) |
| return obs |
|
|
| def reset(self, seed=None, options=None): |
| """Reset environment for new episode.""" |
| super().reset(seed=seed) |
| |
| max_start = len(self.yield_series["usdy_apy"]) - self.episode_length - 10 |
| self.start_idx = self.np_random.integers(0, max(max_start, 1)) |
| |
| self.current_step = 0 |
| self.portfolio_value = self.initial_capital |
| self.peak_value = self.initial_capital |
| self.weights = np.array([0.4, 0.35, 0.25]) |
| self.returns_history = [] |
| |
| return self._get_obs(), {} |
|
|
| def step(self, action: np.ndarray): |
| """Execute one step: apply new weights, compute yield, advance.""" |
| |
| exp_a = np.exp(action - np.max(action)) |
| new_weights = exp_a / exp_a.sum() |
| |
| |
| new_weights = np.clip(new_weights, 0.05, 0.60) |
| new_weights /= new_weights.sum() |
| |
| |
| turnover = np.sum(np.abs(new_weights - self.weights)) |
| rebalance_cost = turnover * self.rebalance_cost_bps / 10000.0 |
| |
| |
| self.weights = new_weights |
| |
| |
| idx = min(self.start_idx + self.current_step, len(self.yield_series["usdy_apy"]) - 1) |
| hourly_yields = np.array([ |
| self.yield_series["usdy_apy"][idx] / 100.0 / 365 / 24, |
| self.yield_series["meth_apy"][idx] / 100.0 / 365 / 24, |
| self.yield_series["mi4_apy"][idx] / 100.0 / 365 / 24, |
| ]) |
| |
| |
| if idx > 0: |
| eth_return = (self.yield_series["eth_price"][idx] / self.yield_series["eth_price"][idx-1]) - 1 |
| mi4_return = (self.yield_series["mi4_apy"][idx] / self.yield_series["mi4_apy"][max(idx-1, 0)]) - 1 |
| else: |
| eth_return = 0 |
| mi4_return = 0 |
| |
| |
| asset_returns = np.array([ |
| hourly_yields[0], |
| hourly_yields[1] + 0.3 * eth_return, |
| hourly_yields[2] + 0.1 * mi4_return, |
| ]) |
| |
| |
| portfolio_return = np.dot(self.weights, asset_returns) - rebalance_cost |
| self.portfolio_value *= (1 + portfolio_return) |
| self.returns_history.append(portfolio_return) |
| |
| |
| self.peak_value = max(self.peak_value, self.portfolio_value) |
| drawdown = (self.peak_value - self.portfolio_value) / self.peak_value |
| |
| |
| |
| excess_return = portfolio_return - self.risk_free_rate |
| |
| if len(self.returns_history) > 1: |
| vol = np.std(self.returns_history[-min(168, len(self.returns_history)):]) |
| sharpe = excess_return / (vol + 1e-8) |
| else: |
| sharpe = excess_return * 100 |
| |
| |
| depeg_penalty = 0.0 |
| if self.yield_series["usdy_peg"][idx] < 0.995: |
| depeg_penalty += self.weights[0] * 10.0 |
| if self.yield_series["meth_peg"][idx] < 0.98: |
| depeg_penalty += self.weights[1] * 10.0 |
| |
| reward = ( |
| sharpe * self.reward_scaling |
| - drawdown * 50.0 |
| - depeg_penalty |
| - rebalance_cost * 1000.0 |
| ) |
| |
| self.current_step += 1 |
| terminated = self.current_step >= self.episode_length |
| truncated = drawdown > 0.15 |
| |
| info = { |
| "portfolio_value": self.portfolio_value, |
| "weights": self.weights.copy(), |
| "drawdown": drawdown, |
| "hourly_return": portfolio_return, |
| "sharpe": sharpe, |
| "turnover": turnover, |
| } |
| |
| return self._get_obs(), float(reward), terminated, truncated, info |
|
|
| def render(self): |
| """Print current portfolio state.""" |
| print( |
| f"Step {self.current_step:4d} | " |
| f"Value: ${self.portfolio_value:,.2f} | " |
| f"Weights: USDY={self.weights[0]:.2f} mETH={self.weights[1]:.2f} MI4={self.weights[2]:.2f} | " |
| f"DD: {(self.peak_value - self.portfolio_value) / self.peak_value:.4f}" |
| ) |
|
|
|
|
| |
|
|
| class PPOYieldOptimizer: |
| """ |
| PPO-based RL agent for yield optimization. |
| |
| Uses stable-baselines3 PPO with a custom MLP+LSTM policy. |
| Falls back to a simpler numpy-only actor-critic if SB3 is not available. |
| """ |
| |
| def __init__( |
| self, |
| state_dim: int = 18, |
| action_dim: int = 3, |
| learning_rate: float = 3e-4, |
| gamma: float = 0.99, |
| gae_lambda: float = 0.95, |
| clip_range: float = 0.2, |
| entropy_coef: float = 0.01, |
| n_steps: int = 2048, |
| batch_size: int = 64, |
| n_epochs: int = 10, |
| total_timesteps: int = 500_000, |
| model_path: Optional[str] = None, |
| ): |
| self.state_dim = state_dim |
| self.action_dim = action_dim |
| self.lr = learning_rate |
| self.gamma = gamma |
| self.gae_lambda = gae_lambda |
| self.clip_range = clip_range |
| self.entropy_coef = entropy_coef |
| self.n_steps = n_steps |
| self.batch_size = batch_size |
| self.n_epochs = n_epochs |
| self.total_timesteps = total_timesteps |
| self.model_path = model_path or "models/ppo_yield_router" |
| |
| self.model = None |
| self._use_sb3 = False |
| |
| self._init_model() |
| |
| def _init_model(self): |
| """Initialize PPO model (SB3 if available, else numpy fallback).""" |
| try: |
| from stable_baselines3 import PPO |
| from stable_baselines3.common.vec_env import DummyVecEnv |
| |
| self._use_sb3 = True |
| env = DummyVecEnv([lambda: RWAYieldEnv()]) |
| |
| self.model = PPO( |
| "MlpPolicy", |
| env, |
| learning_rate=self.lr, |
| gamma=self.gamma, |
| gae_lambda=self.gae_lambda, |
| clip_range=self.clip_range, |
| ent_coef=self.entropy_coef, |
| n_steps=self.n_steps, |
| batch_size=self.batch_size, |
| n_epochs=self.n_epochs, |
| verbose=1, |
| tensorboard_log="./logs/ppo_yield/", |
| policy_kwargs={ |
| "net_arch": [dict(pi=[256, 128, 64], vf=[256, 128, 64])], |
| }, |
| ) |
| logger.info("Initialized PPO agent with stable-baselines3") |
| |
| except ImportError: |
| logger.warning("stable-baselines3 not available, using numpy PPO fallback") |
| self._use_sb3 = False |
| self._init_numpy_model() |
| |
| def _init_numpy_model(self): |
| """Simple numpy-based actor-critic for environments without SB3.""" |
| self._actor_weights = { |
| "W1": np.random.randn(self.state_dim, 128) * 0.01, |
| "b1": np.zeros(128), |
| "W2": np.random.randn(128, 64) * 0.01, |
| "b2": np.zeros(64), |
| "W_mu": np.random.randn(64, self.action_dim) * 0.01, |
| "b_mu": np.zeros(self.action_dim), |
| "log_std": np.zeros(self.action_dim), |
| } |
| self._critic_weights = { |
| "W1": np.random.randn(self.state_dim, 128) * 0.01, |
| "b1": np.zeros(128), |
| "W2": np.random.randn(128, 64) * 0.01, |
| "b2": np.zeros(64), |
| "W_out": np.random.randn(64, 1) * 0.01, |
| "b_out": np.zeros(1), |
| } |
| |
| def _numpy_forward(self, obs: np.ndarray) -> np.ndarray: |
| """Forward pass through numpy actor network.""" |
| w = self._actor_weights |
| h = np.tanh(obs @ w["W1"] + w["b1"]) |
| h = np.tanh(h @ w["W2"] + w["b2"]) |
| mu = h @ w["W_mu"] + w["b_mu"] |
| return mu |
| |
| def train(self, total_timesteps: Optional[int] = None): |
| """Train the PPO agent.""" |
| ts = total_timesteps or self.total_timesteps |
| |
| if self._use_sb3: |
| logger.info(f"Training PPO for {ts} timesteps...") |
| self.model.learn(total_timesteps=ts) |
| self.save() |
| logger.info("Training complete.") |
| else: |
| logger.info(f"Training numpy PPO for {ts} timesteps (simplified)...") |
| env = RWAYieldEnv() |
| obs, _ = env.reset() |
| |
| best_reward = -np.inf |
| total_reward = 0 |
| episode_rewards = [] |
| |
| for step in range(ts): |
| action = self._numpy_forward(obs) |
| |
| noise = np.random.normal(0, 0.3, self.action_dim) |
| action = action + noise |
| |
| obs, reward, terminated, truncated, info = env.step(action) |
| total_reward += reward |
| |
| if terminated or truncated: |
| episode_rewards.append(total_reward) |
| if total_reward > best_reward: |
| best_reward = total_reward |
| |
| if len(episode_rewards) % 100 == 0: |
| avg = np.mean(episode_rewards[-100:]) |
| logger.info( |
| f"Step {step}/{ts} | " |
| f"Avg Reward (100ep): {avg:.2f} | " |
| f"Best: {best_reward:.2f}" |
| ) |
| |
| total_reward = 0 |
| obs, _ = env.reset() |
| |
| self.save() |
| logger.info(f"Training complete. Best episode reward: {best_reward:.2f}") |
| |
| def predict(self, state: np.ndarray) -> np.ndarray: |
| """ |
| Predict optimal portfolio weights given current state. |
| |
| Returns: np.ndarray of shape (3,) summing to 1.0 |
| """ |
| if self._use_sb3 and self.model: |
| action, _ = self.model.predict(state, deterministic=True) |
| else: |
| action = self._numpy_forward(state) |
| |
| |
| exp_a = np.exp(action - np.max(action)) |
| weights = exp_a / exp_a.sum() |
| |
| |
| weights = np.clip(weights, 0.05, 0.60) |
| weights /= weights.sum() |
| |
| return weights |
| |
| def save(self, path: Optional[str] = None): |
| """Save model to disk.""" |
| save_path = path or self.model_path |
| os.makedirs(os.path.dirname(save_path) if os.path.dirname(save_path) else ".", exist_ok=True) |
| |
| if self._use_sb3 and self.model: |
| self.model.save(save_path) |
| logger.info(f"SB3 model saved to {save_path}") |
| else: |
| np.savez( |
| f"{save_path}_numpy.npz", |
| **{f"actor_{k}": v for k, v in self._actor_weights.items()}, |
| **{f"critic_{k}": v for k, v in self._critic_weights.items()}, |
| ) |
| logger.info(f"Numpy model saved to {save_path}_numpy.npz") |
| |
| def load(self, path: Optional[str] = None): |
| """Load model from disk.""" |
| load_path = path or self.model_path |
| |
| if self._use_sb3: |
| try: |
| from stable_baselines3 import PPO |
| self.model = PPO.load(load_path) |
| logger.info(f"SB3 model loaded from {load_path}") |
| return |
| except Exception as e: |
| logger.warning(f"SB3 load failed: {e}") |
| |
| |
| try: |
| data = np.load(f"{load_path}_numpy.npz") |
| for key in self._actor_weights: |
| self._actor_weights[key] = data[f"actor_{key}"] |
| for key in self._critic_weights: |
| self._critic_weights[key] = data[f"critic_{key}"] |
| logger.info(f"Numpy model loaded from {load_path}_numpy.npz") |
| except Exception as e: |
| logger.warning(f"Model load failed: {e}") |
|
|
|
|
| |
|
|
| class Backtester: |
| """Backtest the RL agent against baseline strategies.""" |
| |
| def __init__(self, agent: PPOYieldOptimizer, env: RWAYieldEnv): |
| self.agent = agent |
| self.env = env |
| |
| def run_backtest(self, n_episodes: int = 10) -> Dict: |
| """Run backtest and return performance metrics.""" |
| results = { |
| "rl_agent": [], |
| "equal_weight": [], |
| "usdy_only": [], |
| "meth_only": [], |
| } |
| |
| for ep in range(n_episodes): |
| |
| obs, _ = self.env.reset() |
| rl_values = [self.env.initial_capital] |
| |
| while True: |
| weights = self.agent.predict(obs) |
| obs, reward, terminated, truncated, info = self.env.step(weights - 0.33) |
| rl_values.append(info["portfolio_value"]) |
| if terminated or truncated: |
| break |
| |
| results["rl_agent"].append({ |
| "final_value": rl_values[-1], |
| "total_return": (rl_values[-1] / self.env.initial_capital - 1) * 100, |
| "max_drawdown": self._compute_max_drawdown(rl_values), |
| "sharpe": self._compute_sharpe(rl_values), |
| }) |
| |
| return results |
| |
| def _compute_max_drawdown(self, values: List[float]) -> float: |
| peak = values[0] |
| max_dd = 0 |
| for v in values: |
| peak = max(peak, v) |
| dd = (peak - v) / peak |
| max_dd = max(max_dd, dd) |
| return max_dd |
| |
| def _compute_sharpe(self, values: List[float], rf_hourly: float = 4.25/100/365/24) -> float: |
| returns = np.diff(values) / values[:-1] |
| excess = returns - rf_hourly |
| if np.std(excess) == 0: |
| return 0.0 |
| return float(np.mean(excess) / np.std(excess) * np.sqrt(365 * 24)) |
|
|