E-Rong committed on
Commit 7b2f944 · verified · 1 Parent(s): 0e66bf6

Add train_in_space.py for running training inside the Space

Files changed (1)
  1. train_in_space.py +366 -0
train_in_space.py ADDED
@@ -0,0 +1,366 @@
+ #!/usr/bin/env python3
+ """
+ TIL-26-AE Bomberman Agent Training - runs inside the Space.
+ Uses the local til_environment (already in the repo) and pushes checkpoints to the Hub model repo.
+ """
+
+ import os
+ import sys
+
+ import numpy as np
+ import gymnasium as gym
+ from gymnasium.spaces import Box, Discrete
+ import torch
+
+ # In the Space, til-26-ae is at the repo root; in the sandbox it's elsewhere.
+ # Try multiple paths.
+ for path in [
+     "/home/user/app/til-26-ae",                                   # typical HF Space path
+     "/app/til-26-ae",                                             # sandbox path
+     os.path.join(os.path.dirname(__file__), "..", "til-26-ae"),  # relative to this file
+     "til-26-ae",                                                  # current dir
+ ]:
+     if os.path.isdir(path):
+         sys.path.insert(0, path)
+         print(f"Using til_environment from: {path}")
+         break
+
+ from til_environment.bomberman_env import Bomberman
+ from til_environment.config import default_config
+ from pettingzoo.utils.conversions import aec_to_parallel
+ from sb3_contrib import MaskablePPO
+ from sb3_contrib.common.wrappers import ActionMasker
+ from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
+ from stable_baselines3.common.monitor import Monitor
+ from huggingface_hub import HfApi
+
+ # ---------------------------------------------------------------------------
+ # Environment wrappers
+ # ---------------------------------------------------------------------------
+
+
+ class BombermanSingleAgentEnv(gym.Env):
+     """Single-agent view of the multi-agent Bomberman env; other agents act randomly."""
+
+     def __init__(self, cfg=None, seed=None, opponent_policy="random"):
+         super().__init__()
+         self.cfg = cfg or default_config()
+         self.cfg.env.render_mode = None
+         raw = Bomberman(self.cfg)
+         self._parallel_env = aec_to_parallel(raw)
+         self.agent_id = "agent_0"
+         self._episode_seed = seed
+         self._episode_count = 0
+         self.action_space = Discrete(6)
+         self._last_action_mask = None
+         self._obs_size = None
+         self._last_obs_dict = None
+         self._compute_obs_space()
+
+     def _compute_obs_space(self):
+         cfg = self.cfg
+         viewcone_l = int(cfg.dynamics.vision.behind) + int(cfg.dynamics.vision.ahead) + 1
+         viewcone_w = int(cfg.dynamics.vision.left) + int(cfg.dynamics.vision.right) + 1
+         agent_viewcone_size = viewcone_l * viewcone_w * 25
+         base_r = int(cfg.entities.base.vision_radius)
+         base_side = 2 * base_r + 1
+         base_viewcone_size = base_side * base_side * 25
+         scalar_size = 11
+         self._obs_size = agent_viewcone_size + base_viewcone_size + scalar_size
+         self.observation_space = Box(low=-np.inf, high=np.inf, shape=(self._obs_size,), dtype=np.float32)
+
+     def reset(self, seed=None, options=None):
+         self._episode_seed = self._episode_count if seed is None else seed
+         self._episode_count += 1
+         obs_dict, info_dict = self._parallel_env.reset(seed=self._episode_seed, options=options)
+         self._last_obs_dict = obs_dict
+         self._last_action_mask = obs_dict[self.agent_id]["action_mask"].astype(bool)
+         return self._flatten_obs(obs_dict[self.agent_id]), {}
+
+     def step(self, action):
+         actions = {self.agent_id: action}
+         # All other agents pick a uniformly random valid action.
+         for aid, obs in self._last_obs_dict.items():
+             if aid != self.agent_id:
+                 valid = np.where(obs["action_mask"] == 1)[0]
+                 actions[aid] = int(np.random.choice(valid)) if len(valid) > 0 else 0
+         obs_dict, rewards, terminations, truncations, infos = self._parallel_env.step(actions)
+         self._last_obs_dict = obs_dict
+         if self.agent_id not in obs_dict:  # agent was eliminated this step
+             return np.zeros(self._obs_size, dtype=np.float32), 0.0, True, False, {}
+         self._last_action_mask = obs_dict[self.agent_id]["action_mask"].astype(bool)
+         obs = self._flatten_obs(obs_dict[self.agent_id])
+         r = float(rewards.get(self.agent_id, 0.0))
+         done = terminations.get(self.agent_id, False) or truncations.get(self.agent_id, False)
+         return obs, r, done, False, infos.get(self.agent_id, {})
+
+     def action_masks(self):
+         return self._last_action_mask
+
+     def _flatten_obs(self, od):
+         return np.concatenate([
+             od["agent_viewcone"].flatten(), od["base_viewcone"].flatten(),
+             np.array([od["direction"]], dtype=np.float32),
+             od["location"].flatten().astype(np.float32),
+             od["base_location"].flatten().astype(np.float32),
+             od["health"].flatten().astype(np.float32),
+             np.array([od["frozen_ticks"]], dtype=np.float32),
+             od["base_health"].flatten().astype(np.float32),
+             od["team_resources"].flatten().astype(np.float32),
+             np.array([od["team_bombs"]], dtype=np.float32),
+             np.array([od["step"]], dtype=np.float32),
+         ], dtype=np.float32)
+
+     def close(self):
+         self._parallel_env.close()
+
+
+ class RewardShapingWrapper(gym.Wrapper):
+     """Adds a count-based exploration bonus whose weight anneals over training."""
+
+     def __init__(self, env, adaptive_k=1.2, base_explore_weight=0.5):
+         super().__init__(env)
+         self.adaptive_k = adaptive_k
+         self.base_explore_weight = base_explore_weight
+         self._visit_counts = None
+         self._grid_size = 16
+         self._avg_enemy_deaths = 0.0  # placeholder; never updated, so alpha stays 1.0 until wired to env stats
+         self._episode_count = 0
+         self._explore_weight = base_explore_weight
+
+     def reset(self, **kwargs):
+         self._visit_counts = np.zeros((self._grid_size, self._grid_size), dtype=np.int32)
+         return self.env.reset(**kwargs)
+
+     def step(self, action):
+         obs, reward, done, truncated, info = self.env.step(action)
+         pos = info.get("location", None)
+         bonus = 0.0
+         if pos is not None:
+             x, y = int(pos[0]), int(pos[1])
+             if 0 <= x < self._grid_size and 0 <= y < self._grid_size:
+                 bonus = 1.0 / (1.0 + self._visit_counts[x, y])  # novelty decays with revisits
+                 self._visit_counts[x, y] += 1
+         if done:
+             self._episode_count += 1
+             # Shrink the exploration weight as average enemy deaths rise.
+             alpha = 1.0 - np.tanh(self.adaptive_k * self._avg_enemy_deaths)
+             self._explore_weight = self.base_explore_weight * max(0.1, alpha)
+         return obs, reward + self._explore_weight * bonus, done, truncated, info
+
+     def action_masks(self):
+         return self.env.action_masks()
+
+
+ class RuleBasedOpponent:
+     """Scripted opponent. Action 4 is the stay/no-op action, action 5 drops a bomb."""
+
+     def __init__(self, difficulty="simple"):
+         self.difficulty = difficulty
+
+     def act(self, od):
+         valid = np.where(od["action_mask"] == 1)[0]
+         if len(valid) == 0:
+             return 4
+         if self.difficulty == "static":
+             return 4
+         if self.difficulty == "simple":
+             vc = od["agent_viewcone"]
+             # Bomb if anything shows up in viewcone channels 10 or 12 and bombing is legal.
+             if (np.any(vc[..., 10] > 0) or np.any(vc[..., 12] > 0)) and 5 in valid:
+                 return 5
+             mv = [a for a in valid if a < 4]
+             return int(np.random.choice(mv)) if mv else 4
+         # Difficulties other than "static"/"simple" (e.g. "smart", "mixed")
+         # currently fall back to the no-op action.
+         return 4
+
+
+ class CurriculumEnv(gym.Env):
+     """Single-agent env whose scripted opponents get harder as the win rate improves."""
+
+     STAGES = ["static", "simple", "smart", "mixed"]
+     WIN_RATE = 0.55       # win-rate threshold to advance a stage
+     EPS_PER_STAGE = 500   # minimum episodes before advancement is considered
+
+     def __init__(self, cfg=None, seed=None):
+         super().__init__()
+         self.cfg = cfg or default_config()
+         self.cfg.env.render_mode = None
+         self._parallel_env = aec_to_parallel(Bomberman(self.cfg))
+         self.agent_id = "agent_0"
+         self._episode_count = 0
+         self.action_space = Discrete(6)
+         self._last_action_mask = None
+         self._obs_size = None
+         self._last_obs_dict = None
+         self._compute_obs_space()
+         self.stage_idx = 0
+         self.stage_eps = 0
+         self.stage_wins = 0
+         self.stage_rewards = []
+         self.opponents = {}
+         self._init_opponents()
+
+     def _compute_obs_space(self):
+         cfg = self.cfg
+         vl = int(cfg.dynamics.vision.behind) + int(cfg.dynamics.vision.ahead) + 1
+         vw = int(cfg.dynamics.vision.left) + int(cfg.dynamics.vision.right) + 1
+         av = vl * vw * 25
+         br = int(cfg.entities.base.vision_radius)
+         bs = 2 * br + 1
+         bv = bs * bs * 25
+         self._obs_size = av + bv + 11
+         self.observation_space = Box(low=-np.inf, high=np.inf, shape=(self._obs_size,), dtype=np.float32)
+
+     def _init_opponents(self):
+         for i in range(1, self.cfg.env.num_teams):
+             self.opponents[f"agent_{i}"] = RuleBasedOpponent(difficulty="static")
+
+     def _update_difficulty(self):
+         stage = self.STAGES[self.stage_idx]
+         # In the "mixed" stage, even-numbered opponents play "smart"; the rest keep the stage name.
+         for oid, opp in self.opponents.items():
+             opp.difficulty = "smart" if (stage == "mixed" and int(oid.split("_")[1]) % 2 == 0) else stage
+
+     def _check_advance(self):
+         if self.stage_idx >= len(self.STAGES) - 1:
+             return False
+         if len(self.stage_rewards) >= self.EPS_PER_STAGE:
+             wr = self.stage_wins / max(1, len(self.stage_rewards))
+             if wr >= self.WIN_RATE:
+                 print(f"Stage {self.STAGES[self.stage_idx]} done (wr={wr:.1%}). Advancing.")
+                 self.stage_idx += 1
+                 self.stage_eps = self.stage_wins = 0
+                 self.stage_rewards = []
+                 self._update_difficulty()
+                 return True
+         return False
+
+     def reset(self, seed=None, options=None):
+         self._episode_count += 1
+         obs_dict, info_dict = self._parallel_env.reset(seed=self._episode_count, options=options)
+         self._last_obs_dict = obs_dict
+         self._last_action_mask = obs_dict[self.agent_id]["action_mask"].astype(bool)
+         return self._flatten(obs_dict[self.agent_id]), {}
+
+     def step(self, action):
+         actions = {self.agent_id: action}
+         for aid, obs in self._last_obs_dict.items():
+             if aid != self.agent_id:
+                 opp = self.opponents.get(aid)
+                 actions[aid] = opp.act(obs) if opp else 4
+         obs_dict, rewards, terminations, truncations, infos = self._parallel_env.step(actions)
+         self._last_obs_dict = obs_dict
+         if self.agent_id not in obs_dict:  # eliminated: counts as an episode but not a win
+             self.stage_eps += 1
+             return np.zeros(self._obs_size, dtype=np.float32), 0.0, True, False, {}
+         self._last_action_mask = obs_dict[self.agent_id]["action_mask"].astype(bool)
+         obs = self._flatten(obs_dict[self.agent_id])
+         r = float(rewards.get(self.agent_id, 0.0))
+         done = terminations.get(self.agent_id, False) or truncations.get(self.agent_id, False)
+         if done:
+             self.stage_eps += 1
+             self.stage_rewards.append(r)
+             if r > 10.0:  # a final-step reward above 10 is counted as a win
+                 self.stage_wins += 1
+             self._check_advance()
+         return obs, r, done, False, {"stage": self.stage_idx, "stage_name": self.STAGES[self.stage_idx]}
+
+     def action_masks(self):
+         return self._last_action_mask
+
+     def _flatten(self, od):
+         return np.concatenate([
+             od["agent_viewcone"].flatten(), od["base_viewcone"].flatten(),
+             np.array([od["direction"]], dtype=np.float32),
+             od["location"].flatten().astype(np.float32),
+             od["base_location"].flatten().astype(np.float32),
+             od["health"].flatten().astype(np.float32),
+             np.array([od["frozen_ticks"]], dtype=np.float32),
+             od["base_health"].flatten().astype(np.float32),
+             od["team_resources"].flatten().astype(np.float32),
+             np.array([od["team_bombs"]], dtype=np.float32),
+             np.array([od["step"]], dtype=np.float32),
+         ], dtype=np.float32)
+
+     def close(self):
+         self._parallel_env.close()
+
+
+ # ---------------------------------------------------------------------------
+ # Training
+ # ---------------------------------------------------------------------------
+
+ HUB_REPO = os.environ.get("HUB_MODEL_ID", "E-Rong/til-26-ae-agent")
+
+
+ def hub_push(path_in_local, path_in_repo, repo_id=HUB_REPO):
+     """Push a file to the Hub model repo."""
+     try:
+         api = HfApi()
+         api.upload_file(path_or_fileobj=path_in_local, path_in_repo=path_in_repo,
+                         repo_id=repo_id, repo_type="model")
+         print(f" -> pushed {path_in_repo}")
+     except Exception as e:
+         print(f" -> push failed: {e}")
+
+
+ class HubCheckpointCallback(BaseCallback):
+     """Pushes .zip checkpoints to the Hub every N steps."""
+
+     def __init__(self, save_freq=50000, repo_id=HUB_REPO, verbose=0):
+         super().__init__(verbose)
+         self.save_freq = save_freq
+         self.repo_id = repo_id
+
+     def _on_step(self) -> bool:
+         if self.num_timesteps % self.save_freq == 0:
+             path = f"/tmp/checkpoint_{self.num_timesteps}.zip"
+             self.model.save(path)
+             hub_push(path, f"checkpoint_{self.num_timesteps}.zip", self.repo_id)
+         return True
+
+
+ def train_phase(phase, total_timesteps, model=None):
+     cfg = default_config()
+     cfg.env.render_mode = None
+
+     if phase == 1:
+         print("=== Phase 1: vs Random ===")
+         base = BombermanSingleAgentEnv(cfg=cfg)
+         env = ActionMasker(Monitor(base), lambda e: e.action_masks())
+     elif phase == 2:
+         print("=== Phase 2: + Exploration Shaping ===")
+         base = BombermanSingleAgentEnv(cfg=cfg)
+         base = RewardShapingWrapper(base)
+         env = ActionMasker(Monitor(base), lambda e: e.action_masks())
+     elif phase == 3:
+         print("=== Phase 3: Curriculum Self-Play ===")
+         cfg.env.num_teams = 3
+         base = CurriculumEnv(cfg=cfg)
+         env = ActionMasker(Monitor(base), lambda e: e.action_masks())
+     else:
+         raise ValueError(f"Unknown phase: {phase}")
+
+     if model is None:
+         print("Creating MaskablePPO...")
+         model = MaskablePPO(
+             "MlpPolicy", env,
+             learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10,
+             gamma=0.99, gae_lambda=0.95, clip_range=0.2,
+             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5,
+             verbose=1,
+             device="cuda" if torch.cuda.is_available() else "cpu",
+         )
+     else:
+         model.set_env(env)
+
+     ckpt_cb = CheckpointCallback(save_freq=50000, save_path="./ckpts", name_prefix=f"p{phase}")
+     hub_cb = HubCheckpointCallback(save_freq=50000, repo_id=HUB_REPO)
+
+     model.learn(total_timesteps=total_timesteps, callback=[ckpt_cb, hub_cb], progress_bar=False)
+     final = f"phase{phase}_final.zip"
+     model.save(final)
+     hub_push(final, final, HUB_REPO)
+     env.close()
+     print(f"Phase {phase} complete.")
+     return model
+
+
+ def main():
+     # "A:B:C" -> timesteps for phases 1..3; underscores may be used as digit separators.
+     ts = os.environ.get("TOTAL_TIMESTEPS", "500000:500000:1000000")
+     phase_ts = [int(x.replace("_", "")) for x in ts.split(":")]
+     print(f"Phase timesteps: {phase_ts}")
+
+     model = None
+     for i, t in enumerate(phase_ts[:3], 1):
+         model = train_phase(i, t, model)
+
+     print("\n=== All phases complete ===")
+
+
+ if __name__ == "__main__":
+     main()
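
For context, a minimal sketch of how this script might be launched from the Space's entry point. The subprocess launcher itself is an assumption (not part of this commit); `TOTAL_TIMESTEPS` and `HUB_MODEL_ID` are the two environment variables the script actually reads, with the defaults shown below, and a write token (e.g. `HF_TOKEN`) is assumed to be available in the Space so the `HfApi` pushes can authenticate:

# Hypothetical launcher sketch -- not part of this commit.
import os
import subprocess

env = dict(os.environ)
env.setdefault("TOTAL_TIMESTEPS", "500000:500000:1000000")  # colon-separated per-phase budgets
env.setdefault("HUB_MODEL_ID", "E-Rong/til-26-ae-agent")    # checkpoint destination repo
subprocess.run(["python", "train_in_space.py"], env=env, check=True)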