zirobtc commited on
Commit
db0a14e
·
1 Parent(s): 9dd732c

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,776 +1,42 @@
1
- # =========================================
2
- # Entity Encoders
3
- # =========================================
4
- # These are generated offline/streaming and are the "vocabulary" for the model.
5
 
6
- <WalletEmbedding> # Embedding of a wallet's relationships, behavior, and history.
7
- <WalletEmbedding> = [
8
- // Data from the 'wallet_profiles' table (Wallet-level lifetime and daily/weekly stats)
9
- wallet_profiles_row: [
10
- // Core Info & Timestamps
11
- age, // No Contextual
12
- wallet_address, // Primary wallet identifier
13
-
14
 
15
- // 7. NEW: Deployed Token Aggregates (8 Features)
16
- deployed_tokens_count, // Total tokens created
17
- deployed_tokens_migrated_pct, // % that migrated
18
- deployed_tokens_avg_lifetime_sec, // Avg duration before dev selling
19
- deployed_tokens_avg_peak_mc_usd, // Avg peak marketcap
20
- deployed_tokens_median_peak_mc_usd,
21
-
22
- // Metadata & Balances
23
- balance, // Current SOL balance
24
-
25
- // Lifetime Transaction Counts (Total history)
26
- transfers_in_count, // Total native transfers received
27
- transfers_out_count, // Total native transfers sent
28
- spl_transfers_in_count, // Total SPL token transfers received
29
- spl_transfers_out_count,// Total SPL token transfers sent
30
-
31
- // Lifetime Trading Stats (Total history)
32
- total_buys_count, // Total buys across all tokens
33
- total_sells_count, // Total sells across all tokens
34
- total_winrate, // Overall trading winrate
35
-
36
- // 1-Day Stats (Realized P&L, Counts, Averages, Volume, Fees, Winrate)
37
- stats_1d_realized_profit_sol,
38
- stats_1d_realized_profit_pnl,
39
- stats_1d_buy_count,
40
- stats_1d_sell_count,
41
- stats_1d_transfer_in_count,
42
- stats_1d_transfer_out_count,
43
- stats_1d_avg_holding_period,
44
- stats_1d_total_bought_cost_sol,
45
- stats_1d_total_sold_income_sol,
46
- stats_1d_total_fee,
47
- stats_1d_winrate,
48
- stats_1d_tokens_traded,
49
-
50
- // 7-Day Stats (Realized P&L, Counts, Averages, Volume, Fees, Winrate)
51
- stats_7d_realized_profit_sol,
52
- stats_7d_realized_profit_pnl,
53
- stats_7d_buy_count,
54
- stats_7d_sell_count,
55
- stats_7d_transfer_in_count,
56
- stats_7d_transfer_out_count,
57
- stats_7d_avg_holding_period,
58
- stats_7d_total_bought_cost_sol,
59
- stats_7d_total_sold_income_sol,
60
- stats_7d_total_fee,
61
- stats_7d_winrate,
62
- stats_7d_tokens_traded,
63
 
64
- // 30-day stats are not useful in this context
65
- ],
66
 
67
- // Data from the 'wallet_socials' table (Social media and profile info)
68
- wallet_socials_row: [
69
- has_pf_profile,
70
- has_twitter,
71
- has_telegram,
72
- is_exchange_wallet,
73
- username,
74
- ],
75
- // Data from the 'wallet_holdings' table (Token-level statistics for held tokens)
76
- wallet_holdings_pool: [
77
- <TokenVibeEmbedding>,
78
- holding_time, // How long the wallet held the token (we only check tokens it currently holds or recently traded)
79
-
80
- balance_pct_to_supply, // Current quantity of the token held
81
-
82
- // History (Amounts & Costs)
83
- history_bought_amount_sol, // Total amount of token bought
84
- bought_amount_sol_pct_to_native_balance // Whether the wallet traded a large share of its balance
85
-
86
- // History (Counts)
87
- history_total_buys, // Total number of buy transactions
88
- history_total_sells, // Total number of sell transactions
89
-
90
- // Profit and Loss
91
- realized_profit_pnl, // Realized P&L as a percentage
92
- realized_profit_sol,
93
-
94
- // Transfers (Non-trade movements)
95
- history_transfer_in,
96
- history_transfer_out,
97
-
98
- avarage_trade_gap_seconds,
99
- total_priority_fees, // Total tips + Priority Fees
100
- ]
101
- ]
102
 
103
- <TokenVibeEmbedding> # Multimodal embedding of a token's identity
104
- <TokenVibeEmbedding> = [<TokenAddressEmbedding>, <NameEmbedding>, <SymbolEmbedding>, <ImageEmbedding>, protocol_id]
105
 
106
- <TextEmbedding> # Text embedding MultiModal processor.
107
- <MediaEmbedding> # Multimodal VIT encoder.
108
-
109
- # -----------------------------------------
110
- # 1. TradeEncoder
111
- # -----------------------------------------
112
-
113
- # Captures large-size trades from any wallet.
114
- [timestamp, 'LargeTrade', relative_ts, <WalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
115
-
116
- # Captures the high-signal "Dev Sold or Bought" event.
117
- [timestamp, 'Deployer_Trade', relative_ts, <CreatorWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
118
-
119
- # Captures *all* trades from pre-defined high-P&L/win-rate, KOL, and known wallets.
120
- [timestamp, 'SmartWallet_Trade', relative_ts, <TraderWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
121
-
122
- # Raw trades. Loaded in H/B/H Prefix (first ~10k) and Suffix (last ~5k).
123
- [timestamp, 'Trade', relative_ts, <TraderWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
124
-
125
- # -----------------------------------------
126
- # 2. TransferEncoder
127
- # -----------------------------------------
128
-
129
- # Raw transfers. Loaded in H/B/H Prefix (all in first ~10k trade window) and Suffix (all in last ~5k trade window).
130
- [timestamp, 'Transfer', relative_ts, <SourceWalletEmbedding>, <DestinationWalletEmbedding>, token_amount, transfer_pct_of_total_supply, transfer_pct_of_holding, priority_fee]
131
-
132
- # Captures scarce, large transfers *after* the initial launch window.
133
- [timestamp, 'LargeTransfer', relative_ts, <FromWalletEmbedding>, <ToWalletEmbedding>, token_amount, transfer_pct_of_total_supply, transfer_pct_of_holding, priority_fee]
134
-
135
- # -----------------------------------------
136
- # 3. LifecycleEncoder
137
- # -----------------------------------------
138
-
139
- # The T0 event.
140
- [timestamp, 'Mint', 0, <CreatorWalletEmbedding>, <TokenVibeEmbedding>]
141
-
142
- # -----------------------------------------
143
- # 4. PoolEncoder
144
- # -----------------------------------------
145
-
146
- # Signals migration from launchpad to a real pool.
147
- [timestamp, 'PoolCreated', relative_ts, <ProviderWalletEmbedding>, protocol_id, <QuoteTokenVibeEmbedding>, base_amount, quote_amount, quote_pct_to_main_pool_balance, base_pct_to_main_pool_balance]
148
-
149
- # Signals LP addition or removal.
150
- [timestamp, 'LiquidityChange', relative_ts, <ProviderWalletEmbedding>, <QuoteTokenVibeEmbedding>, change_type_id, quote_amount, quote_pct_to_current_pool_balance]
151
-
152
- # Signals creator/dev taking platform fees.
153
- [timestamp, 'FeeCollected', relative_ts, <RecipientWalletEmbedding>, sol_amount, token_amount]
154
-
155
-
156
- # -----------------------------------------
157
- # SupplyEncoder
158
- # -----------------------------------------
159
-
160
- # Signals a supply reduction.
161
- [timestamp, 'TokenBurn', relative_ts, <BurnerWalletEmbedding>, amount_pct_of_total_supply, amount_tokens_burned]
162
-
163
- # Signals locked supply, e.g., for team/marketing.
164
- [timestamp, 'SupplyLock', relative_ts, <LockerWalletEmbedding>, amount_pct_of_total_supply, lock_duration]
165
-
166
- # -----------------------------------------
167
- # ChartEncoder
168
- # -----------------------------------------
169
-
170
- # (The "Sliding Window") This is the new chart event.
171
- [timestamp, 'Chart_Segment', relative_ts, OHLC_segment, chart_interval_id]
172
-
173
- # -----------------------------------------
174
- # PulseEncoder
175
- # -----------------------------------------
176
-
177
- # It is a low-frequency event (Dynamic Interval: 5min, 15min, or 1hr based on token age).
178
- [timestamp, 'OnChain_Snapshot', relative_ts, total_holders, smart_traders, kols, holder_growth_rate, top_10_holder_pct, sniper_holding_pct, rat_wallets_holding_pct, bundle_holding_pct, current_market_cap, liquidity, volume, buy_count, sell_count, total_txns, global_fees_paid]
179
-
180
- # -----------------------------------------
181
- # HoldersListEncoder
182
- # -----------------------------------------
183
-
184
- <HolderDistributionEmbedding> # Transformer-based embedding of the top holders (WalletEmbeddings + Pct).
185
-
186
- # Token-specific holder analysis.
187
- [timestamp, 'HolderSnapshot', relative_ts, <HolderDistributionEmbedding>]
188
-
189
-
190
- # -----------------------------------------
191
- # ChainSnapshotEncoder
192
- # -----------------------------------------
193
-
194
- # Broad chain-level market conditions.
195
- [timestamp, 'ChainSnapshot', relative_ts, native_token_price_usd, gas_fee]
196
-
197
- # Launchpad market regime (using absolute, log-normalized values).
198
- [timestamp, 'Lighthouse_Snapshot', relative_ts, protocol_id, timeframe_id, total_volume, total_transactions, total_traders, total_tokens_created, total_migrations]
199
-
200
- # -----------------------------------------
201
- # TokenTrendingListEncoder
202
- # -----------------------------------------
203
-
204
- # Fires *per token* on a trending list. The high-attention "meta" signal.
205
- [timestamp, 'TrendingToken', relative_ts, <TokenVibeEmbedding_of_trending_token>, list_source_id, timeframe_id, rank]
206
-
207
- # Fires *per token* on the boosted list.
208
- [timestamp, 'BoostedToken', relative_ts, <TokenVibeEmbedding_of_boosted_token>, total_boost_amount, rank]
209
-
210
- # -----------------------------------------
211
- # LaunchpadThreadEncoder
212
- # -----------------------------------------
213
-
214
- # On-platform social signal (Pump.fun comments).
215
- [timestamp, 'PumpReply', relative_ts, <UserWalletEmbedding>, <ReplyTextEmbedding>]
216
-
217
- # -----------------------------------------
218
- # CTEncoder
219
- # -----------------------------------------
220
-
221
- # Off-platform social signal (Twitter).
222
- [timestamp, 'XPost', relative_ts, <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>]
223
- [timestamp, 'XRetweet', relative_ts, <RetweeterWalletEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding>]
224
- [timestamp, 'XReply', relative_ts, <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>, <MainTweetEmbedding>]
225
- [timestamp, 'XQuoteTweet', relative_ts, <QuoterWalletEmbedding>, <QuoterTextEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding>]
226
-
227
- # -----------------------------------------
228
- # GlobalTrendingEncoder
229
- # -----------------------------------------
230
-
231
- # Broader cultural trend signal (TikTok).
232
- [timestamp, 'TikTok_Trending_Hashtag', relative_ts, <HashtagNameEmbedding>, rank]
233
-
234
- # Broader cultural trend signal (Twitter).
235
- [timestamp, 'XTrending_Hashtag', relative_ts, <HashtagNameEmbedding>, rank]
236
-
237
- # -----------------------------------------
238
- # TrackerEncoder
239
- # -----------------------------------------
240
-
241
- # Retail marketing signal (Paid groups).
242
- [timestamp, 'AlphaGroup_Call', relative_ts, group_id]
243
-
244
- [timestamp, 'Call_Channel', relative_ts, channel_id]
245
-
246
- # High-impact catalyst event.
247
- [timestamp, 'CexListing', relative_ts, exchange_id]
248
-
249
- # High-impact catalyst event.
250
- [timestamp, 'Migrated', relative_ts, protocol_id]
251
-
252
- # -----------------------------------------
253
- # Dex Encoder
254
- # -----------------------------------------
255
-
256
- [timestamp, 'DexBoost_Paid', relative_ts, amount, total_amount_on_token]
257
-
258
- [timestamp, 'DexProfile_Updated', relative_ts, has_changed_website_flag, has_changed_twitter_flag, has_changed_telegram_flag, has_changed_description_flag, <WebsiteEmbedding>, <TwitterLinkEmbedding>, <NewDescriptionEmbedding>]
259
-
260
- ### **Global Context Injection**
261
-
262
- <PRELAUNCH> <LAUNCH> <Middle> <RECENT>
263
-
264
- ### **Token Role Embedding**
265
-
266
- <TokenVibeEmbedding_of_Token_A> + Subject_Token_Role
267
-
268
- <TokenVibeEmbedding_of_Token_B> + Trending_Token_Role
269
-
270
- <QuoteTokenVibeEmbedding_of_USDC> + Quote_Token_Role
271
-
272
-
273
- # **Links**
274
-
275
- ### `TransferLink`
276
-
277
- ```
278
- ['signature', 'source', 'destination', 'mint', 'timestamp']
279
  ```
280
 
281
- -----
282
-
283
- ### `BundleTradeLink`
284
-
285
- ```
286
- ['signatures', 'wallet_a', 'wallet_b', 'mint', 'slot', 'timestamp']
287
- ```
288
-
289
- -----
290
-
291
- ### `CopiedTradeLink`
292
-
293
- ```
294
- ['leader_buy_sig', 'leader_sell_sig', 'follower_buy_sig', 'follower_sell_sig', 'follower', 'leader', 'mint', 'time_gap_on_buy_sec', 'time_gap_on_sell_sec', 'leader_pnl', 'follower_pnl', 'leader_buy_total', 'leader_sell_total', 'follower_buy_total', 'follower_sell_total', 'follower_buy_slippage', 'follower_sell_slippage']
295
- ```
296
-
297
- -----
298
-
299
- ### `CoordinatedActivityLink`
300
-
301
- ```
302
- ['leader_first_sig', 'leader_second_sig', 'follower_first_sig', 'follower_second_sig', 'follower', 'leader', 'mint', 'time_gap_on_first_sec', 'time_gap_on_second_sec']
303
- ```
304
-
305
- -----
306
-
307
- ### `MintedLink`
308
-
309
- ```
310
- ['signature', 'timestamp', 'buy_amount']
311
- ```
312
-
313
- -----
314
-
315
- ### `SnipedLink`
316
-
317
- ```
318
- ['signature', 'rank', 'sniped_amount']
319
- ```
320
-
321
- -----
322
-
323
- ### `LockedSupplyLink`
324
-
325
- ```
326
- ['signature', 'amount', 'unlock_timestamp']
327
- ```
328
-
329
- -----
330
-
331
- ### `BurnedLink`
332
-
333
- ```
334
- ['signature', 'amount', 'timestamp']
335
- ```
336
-
337
- -----
338
-
339
- ### `ProvidedLiquidityLink`
340
-
341
  ```
342
- ['signature', 'wallet', 'token', 'pool_address', 'amount_base', 'amount_quote', 'timestamp']
343
- ```
344
-
345
- -----
346
-
347
- ### `WhaleOfLink`
348
-
349
- ```
350
- ['wallet', 'token', 'holding_pct_at_creation', 'ath_usd_at_creation']
351
- ```
352
-
353
- -----
354
-
355
- ### `TopTraderOfLink`
356
-
357
- ```
358
- ['wallet', 'token', 'pnl_at_creation', 'ath_usd_at_creation']
359
- ```
360
-
361
-
362
-
363
-
364
- /////
365
-
366
- def __gettestitem__(self, idx: int) -> Dict[str, Any]:
367
- """
368
- Generates a single complex data item, structured for the MemecoinCollator.
369
- NOTE: This currently returns the same mock data regardless of `idx`.
370
- """
371
- # --- 1. Setup Pooler and Define Raw Data ---
372
- pooler = EmbeddingPooler()
373
-
374
- # --- 5. Create Mock Raw Batch Data (FIXED) ---
375
- print("Creating mock raw batch...")
376
-
377
- # (Wallet profiles, socials, holdings definitions are unchanged)
378
- profile1 = {
379
- 'wallet_address': 'addrW1', 'age': 1.5e7, 'balance': 10.5,
380
- 'deployed_tokens_count': 2, 'deployed_tokens_migrated_pct': 0.5, 'deployed_tokens_avg_lifetime_sec': 36000.0, 'deployed_tokens_avg_peak_mc_usd': 100000.0, 'deployed_tokens_median_peak_mc_usd': 50000.0,
381
- 'transfers_in_count': 10, 'transfers_out_count': 5, 'spl_transfers_in_count': 20, 'spl_transfers_out_count': 15,
382
- 'total_buys_count': 50, 'total_sells_count': 40, 'total_winrate': 0.6,
383
- 'stats_1d_realized_profit_sol': 1.2, 'stats_1d_realized_profit_pnl': 0.1, 'stats_1d_buy_count': 5, 'stats_1d_sell_count': 3, 'stats_1d_transfer_in_count': 2, 'stats_1d_transfer_out_count': 1, 'stats_1d_avg_holding_period': 3600, 'stats_1d_total_bought_cost_sol': 10.0, 'stats_1d_total_sold_income_sol': 11.2, 'stats_1d_total_fee': 0.1, 'stats_1d_winrate': 0.7, 'stats_1d_tokens_traded': 4,
384
- 'stats_7d_realized_profit_sol': 5.0, 'stats_7d_realized_profit_pnl': 0.2, 'stats_7d_buy_count': 20, 'stats_7d_sell_count': 15, 'stats_7d_transfer_in_count': 8, 'stats_7d_transfer_out_count': 4, 'stats_7d_avg_holding_period': 7200, 'stats_7d_total_bought_cost_sol': 40.0, 'stats_7d_total_sold_income_sol': 45.0, 'stats_7d_total_fee': 0.5, 'stats_7d_winrate': 0.65, 'stats_7d_tokens_traded': 10,
385
- }
386
- social1 = {'has_pf_profile': True, 'has_twitter': True, 'has_telegram': False, 'is_exchange_wallet': False, 'username': 'trader_one'}
387
- holdings1 = [
388
- {'mint_address': 'tknA', 'holding_time': 3600.0, 'realized_profit_sol': 5.2, 'total_priority_fees': 0.05, 'balance_pct_to_supply': 0.01, 'history_bought_amount_sol': 10, 'bought_amount_sol_pct_to_native_balance': 0.5, 'history_total_buys': 5, 'history_total_sells': 2, 'realized_profit_pnl': 0.52, 'history_transfer_in': 1, 'history_transfer_out': 0, 'avarage_trade_gap_seconds': 300},
389
- ]
390
- profile2 = {
391
- 'wallet_address': 'addrW2', 'age': 1e6, 'balance': 1.0,
392
- 'deployed_tokens_count': 0, 'deployed_tokens_migrated_pct': 0.0, 'deployed_tokens_avg_lifetime_sec': 0.0, 'deployed_tokens_avg_peak_mc_usd': 0.0, 'deployed_tokens_median_peak_mc_usd': 0.0,
393
- 'transfers_in_count': 1, 'transfers_out_count': 0, 'spl_transfers_in_count': 0, 'spl_transfers_out_count': 0,
394
- 'total_buys_count': 0, 'total_sells_count': 0, 'total_winrate': 0.0,
395
- 'stats_1d_realized_profit_sol': 0.0, 'stats_1d_realized_profit_pnl': 0.0, 'stats_1d_buy_count': 0, 'stats_1d_sell_count': 0, 'stats_1d_transfer_in_count': 0, 'stats_1d_transfer_out_count': 0, 'stats_1d_avg_holding_period': 0, 'stats_1d_total_bought_cost_sol': 0.0, 'stats_1d_total_sold_income_sol': 0.0, 'stats_1d_total_fee': 0.0, 'stats_1d_winrate': 0.0, 'stats_1d_tokens_traded': 0,
396
- 'stats_7d_realized_profit_sol': 0.0, 'stats_7d_realized_profit_pnl': 0.0, 'stats_7d_buy_count': 0, 'stats_7d_sell_count': 0, 'stats_7d_transfer_in_count': 0, 'stats_7d_transfer_out_count': 0, 'stats_7d_avg_holding_period': 0, 'stats_7d_total_bought_cost_sol': 0.0, 'stats_7d_total_sold_income_sol': 0.0, 'stats_7d_total_fee': 0.0, 'stats_7d_winrate': 0.0, 'stats_7d_tokens_traded': 0,
397
- }
398
- social2 = {'has_pf_profile': False, 'has_twitter': False, 'has_telegram': False, 'is_exchange_wallet': True, 'username': 'cex_wallet'}
399
- holdings2 = []
400
-
401
-
402
- # Define raw data and get their indices
403
- tokenA_data = {
404
- 'address_emb_idx': pooler.get_idx('tknA'),
405
- 'name_emb_idx': pooler.get_idx('Token A'),
406
- 'symbol_emb_idx': pooler.get_idx('TKA'),
407
- 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
408
- 'protocol': 1
409
- }
410
- # Add wallet usernames to the pool
411
- wallet1_user_idx = pooler.get_idx(social1['username'])
412
- wallet2_user_idx = pooler.get_idx(social2['username'])
413
- social1['username_emb_idx'] = wallet1_user_idx
414
- social2['username_emb_idx'] = wallet2_user_idx
415
- # --- NEW: Add a third wallet for social tests ---
416
- social3 = {'has_pf_profile': False, 'has_twitter': True, 'has_telegram': True, 'is_exchange_wallet': False, 'username': 'social_butterfly'}
417
- wallet3_user_idx = pooler.get_idx(social3['username'])
418
- social3['username_emb_idx'] = wallet3_user_idx
419
-
420
- # Create the final pre-computed data structures
421
- tokenB_data = {
422
- 'address_emb_idx': pooler.get_idx('tknA'),
423
- 'name_emb_idx': pooler.get_idx('Token A'),
424
- 'symbol_emb_idx': pooler.get_idx('TKA'),
425
- 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
426
- 'protocol': 1
427
- }
428
-
429
- tokenC_data = {
430
- 'address_emb_idx': pooler.get_idx('tknA'),
431
- 'name_emb_idx': pooler.get_idx('Token A'),
432
- 'symbol_emb_idx': pooler.get_idx('TKA'),
433
- 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
434
- 'protocol': 1
435
- }
436
-
437
- tokenD_data = {
438
- 'address_emb_idx': pooler.get_idx('tknA'),
439
- 'name_emb_idx': pooler.get_idx('Token A'),
440
- 'symbol_emb_idx': pooler.get_idx('TKA'),
441
- 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
442
- 'protocol': 1
443
- }
444
-
445
- item = {
446
- 'event_sequence': [
447
- {'event_type': 'XPost', # NEW
448
- 'timestamp': 1729711350,
449
- 'relative_ts': -25,
450
- 'wallet_address': 'addrW1', # Author
451
- 'text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
452
- 'media_emb_idx': pooler.get_idx(Image.new('RGB', (100,100), color='cyan'))
453
- },
454
- {'event_type': 'XReply', # NEW
455
- 'timestamp': 1729711360,
456
- 'relative_ts': -35,
457
- 'wallet_address': 'addrW2', # Replier
458
- 'text_emb_idx': pooler.get_idx('This is a reply to the main tweet'),
459
- 'media_emb_idx': pooler.get_idx(None), # No media in reply
460
- 'main_tweet_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA')
461
- },
462
- {'event_type': 'XRetweet', # NEW
463
- 'timestamp': 1729711370,
464
- 'relative_ts': -40,
465
- 'wallet_address': 'addrW3', # The retweeter
466
- 'original_author_wallet_address': 'addrW1', # The original author
467
- 'original_post_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
468
- 'original_post_media_emb_idx': pooler.get_idx(Image.new('RGB', (100,100), color='cyan'))
469
- },
470
- # --- CORRECTED: Test a pre-launch event with negative relative_ts ---
471
- {'event_type': 'Transfer',
472
- 'timestamp': 1729711180,
473
- 'relative_ts': -10, # Negative relative_ts indicates pre-launch
474
- 'wallet_address': 'addrW2',
475
- 'destination_wallet_address': 'addrW1',
476
- 'token_address': 'tknA',
477
- 'token_amount': 1000.0, 'transfer_pct_of_total_supply': 0.0, 'transfer_pct_of_holding': 0.0, 'priority_fee': 0.0
478
- },
479
- {'event_type': 'Mint', 'timestamp': 1729711190, 'relative_ts': 0, 'wallet_address': 'addrW1', 'token_address': 'tknA'},
480
- {'event_type': 'Chart_Segment', 'timestamp': 1729711200, 'relative_ts': 60, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'}, # This is high-def (segment 0) by default
481
- {'event_type': 'Chart_Segment', 'timestamp': 1729711260, 'relative_ts': 120, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'}, # You can mark this as blurry
482
- {'event_type': 'Transfer',
483
- 'timestamp': 1729711210,
484
- 'relative_ts': 20,
485
- 'wallet_address': 'addrW1', # Source
486
- 'destination_wallet_address': 'addrW2', # Destination
487
- 'token_address': 'tknA', # Need token for context? (Optional, depends on design)
488
- 'token_amount': 500.0,
489
- 'transfer_pct_of_total_supply': 0.005,
490
- 'transfer_pct_of_holding': 0.1,
491
- 'priority_fee': 0.0001
492
- },
493
- {'event_type': 'Trade',
494
- 'timestamp': 1729711220,
495
- 'relative_ts': 30,
496
- 'wallet_address': 'addrW1',
497
- 'token_address': 'tknA',
498
- 'trade_direction': 0,
499
- 'sol_amount': 0.5,
500
- # --- FIXED: Pass the integer ID directly ---
501
- 'dex_platform_id': vocab.DEX_TO_ID['Axiom'],
502
- 'priority_fee': 0.0002,
503
- 'mev_protection': False,
504
- 'token_amount_pct_of_holding': 0.05, 'quote_amount_pct_of_holding': 0.02,
505
- 'slippage': 0.01, 'price_impact': 0.005, 'success': True, 'is_bundle': False, 'total_usd': 75.0
506
- },
507
- {'event_type': 'Deployer_Trade', # NEW: Testing a trade variant
508
- 'timestamp': 1729711230,
509
- 'relative_ts': 40,
510
- 'wallet_address': 'addrW1', # The creator wallet
511
- 'token_address': 'tknA',
512
- 'trade_direction': 1, 'sol_amount': 0.2,
513
- # --- FIXED: Pass the integer ID directly ---
514
- 'dex_platform_id': vocab.DEX_TO_ID['Trojan'],
515
- 'priority_fee': 0.0005,
516
- 'mev_protection': True,
517
- 'token_amount_pct_of_holding': 0.1, 'quote_amount_pct_of_holding': 0.0,
518
- 'slippage': 0.02, 'price_impact': 0.01, 'success': True, 'is_bundle': False, 'total_usd': 30.0
519
- },
520
- {'event_type': 'SmartWallet_Trade', # NEW
521
- 'timestamp': 1729711240,
522
- 'relative_ts': 50,
523
- 'wallet_address': 'addrW1', # A known smart wallet
524
- 'token_address': 'tknA',
525
- 'trade_direction': 0, 'sol_amount': 1.5,
526
- # --- FIXED: Pass the integer ID directly ---
527
- 'dex_platform_id': vocab.DEX_TO_ID['Axiom'],
528
- 'priority_fee': 0.001,
529
- 'mev_protection': True,
530
- 'token_amount_pct_of_holding': 0.2, 'quote_amount_pct_of_holding': 0.1,
531
- 'slippage': 0.01, 'price_impact': 0.008, 'success': True, 'is_bundle': False, 'total_usd': 225.0
532
- },
533
- {'event_type': 'LargeTrade', # NEW
534
- 'timestamp': 1729711250,
535
- 'relative_ts': 60,
536
- 'wallet_address': 'addrW2', # Some other wallet
537
- 'token_address': 'tknA',
538
- 'trade_direction': 0, 'sol_amount': 10.0,
539
- # --- FIXED: Pass the integer ID directly ---
540
- 'dex_platform_id': vocab.DEX_TO_ID['OXK'],
541
- 'priority_fee': 0.002,
542
- 'mev_protection': False,
543
- 'token_amount_pct_of_holding': 0.8, 'quote_amount_pct_of_holding': 0.5,
544
- 'slippage': 0.03, 'price_impact': 0.05, 'success': True, 'is_bundle': False, 'total_usd': 1500.0
545
- },
546
- {'event_type': 'Chart_Segment', 'timestamp': 1729711260, 'relative_ts': 70, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'},
547
- {'event_type': 'PoolCreated', # NEW
548
- 'timestamp': 1729711270,
549
- 'relative_ts': 80,
550
- 'wallet_address': 'addrW1',
551
- 'protocol_id': vocab.PROTOCOL_TO_ID['Raydium CPMM'],
552
- 'quote_token_address': 'tknB',
553
- 'base_amount': 1000000.0,
554
- 'quote_amount': 10.0
555
- },
556
- {'event_type': 'LiquidityChange', # NEW
557
- 'timestamp': 1729711280,
558
- 'relative_ts': 90,
559
- 'wallet_address': 'addrW2',
560
- 'quote_token_address': 'tknB',
561
- 'change_type_id': 0, # 0 for 'add'
562
- 'quote_amount': 2.0
563
- },
564
- {'event_type': 'FeeCollected', # NEW
565
- 'timestamp': 1729711290,
566
- 'relative_ts': 100,
567
- 'wallet_address': 'addrW1', # The recipient (e.g., dev wallet)
568
- 'sol_amount': 0.1
569
- },
570
- {'event_type': 'TokenBurn', # NEW
571
- 'timestamp': 1729711300,
572
- 'relative_ts': 110,
573
- 'wallet_address': 'addrW2', # The burner wallet
574
- 'amount_pct_of_total_supply': 0.01, # 1% of supply
575
- 'amount_tokens_burned': 10000000.0
576
- },
577
- {'event_type': 'SupplyLock', # NEW
578
- 'timestamp': 1729711310,
579
- 'relative_ts': 120,
580
- 'wallet_address': 'addrW1', # The locker wallet
581
- 'amount_pct_of_total_supply': 0.10, # 10% of supply
582
- 'lock_duration': 2592000 # 30 days in seconds
583
- },
584
- {'event_type': 'HolderSnapshot', # NEW
585
- 'timestamp': 1729711320,
586
- 'relative_ts': 130,
587
- # This is a pointer to the pre-computed embedding
588
- # In a real system, this would be the index of the embedding
589
- 'holders': [ # Raw holder data
590
- {'wallet': 'addrW1', 'holding_pct': 0.15},
591
- {'wallet': 'addrW2', 'holding_pct': 0.05},
592
- # Add more mock holders if needed
593
- ]
594
- },
595
- {'event_type': 'OnChain_Snapshot', # NEW
596
- 'timestamp': 1729711320,
597
- 'relative_ts': 130,
598
- 'total_holders': 500,
599
- 'smart_traders': 25,
600
- 'kols': 3,
601
- 'holder_growth_rate': 0.15,
602
- 'top_10_holder_pct': 0.22,
603
- 'sniper_holding_pct': 0.05,
604
- 'rat_wallets_holding_pct': 0.02,
605
- 'bundle_holding_pct': 0.01,
606
- 'current_market_cap': 150000.0,
607
- 'volume': 50000.0,
608
- 'buy_count': 120,
609
- 'sell_count': 80,
610
- 'total_txns': 200,
611
- 'global_fees_paid': 1.5
612
- },
613
- {'event_type': 'TrendingToken', # NEW
614
- 'timestamp': 1729711330,
615
- 'relative_ts': 140,
616
- 'token_address': 'tknC', # The token that is trending
617
- 'list_source_id': vocab.TRENDING_LIST_SOURCE_TO_ID['Phantom'],
618
- 'timeframe_id': vocab.TRENDING_LIST_TIMEFRAME_TO_ID['1h'],
619
- 'rank': 3
620
- },
621
- {'event_type': 'BoostedToken', # NEW
622
- 'timestamp': 1729711340,
623
- 'relative_ts': 150,
624
- 'token_address': 'tknD', # The token that is boosted
625
- 'total_boost_amount': 5000.0,
626
- 'rank': 1
627
- },
628
- {'event_type': 'XQuoteTweet', # NEW
629
- 'timestamp': 1729711380,
630
- 'relative_ts': 190,
631
- 'wallet_address': 'addrW3', # The quoter
632
- 'quoter_text_emb_idx': pooler.get_idx('Wow, look at this! $TKA'),
633
- 'original_author_wallet_address': 'addrW1', # The original author
634
- 'original_post_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
635
- 'original_post_media_emb_idx': pooler.get_idx(Image.new('RGB', (100,100), color='cyan'))
636
- },
637
- # --- NEW: Add special context tokens ---
638
- {'event_type': 'MIDDLE', 'timestamp': 1729711500, 'relative_ts': 195},
639
- {'event_type': 'PumpReply', # NEW
640
- 'timestamp': 1729711390,
641
- 'relative_ts': 200,
642
- 'wallet_address': 'addrW2', # The user who replied
643
- 'reply_text_emb_idx': pooler.get_idx('to the moon!')
644
- },
645
- {'event_type': 'DexBoost_Paid', # NEW
646
- 'timestamp': 1729711400,
647
- 'relative_ts': 210,
648
- 'amount': 5.0, # e.g., 5 Boost
649
- 'total_amount_on_token': 25.0 # 25 Boost Points
650
- },
651
- {'event_type': 'DexProfile_Updated', # NEW
652
- 'timestamp': 1729711410,
653
- 'relative_ts': 220,
654
- 'has_changed_website_flag': True,
655
- 'has_changed_twitter_flag': False,
656
- 'has_changed_telegram_flag': True,
657
- 'has_changed_description_flag': True,
658
- # Pre-computed text embeddings
659
- 'website_emb_idx': pooler.get_idx('new-token-website.com'),
660
- 'twitter_link_emb_idx': pooler.get_idx('old_handle'), # No change, so old link
661
- 'telegram_link_emb_idx': pooler.get_idx('new_tg_group'),
662
- 'description_emb_idx': pooler.get_idx('This is the new and improved token description.')
663
- },
664
- {'event_type': 'AlphaGroup_Call', # NEW
665
- 'timestamp': 1729711420,
666
- 'relative_ts': 230,
667
- 'group_id': vocab.ALPHA_GROUPS_TO_ID['Potion']
668
- },
669
- {'event_type': 'Channel_Call', # NEW
670
- 'timestamp': 1729711430,
671
- 'relative_ts': 240,
672
- 'channel_id': vocab.CALL_CHANNELS_TO_ID['MarcosCalls']
673
- },
674
- {'event_type': 'RECENT', 'timestamp': 1729711510, 'relative_ts': 245},
675
- {'event_type': 'CexListing', # NEW
676
- 'timestamp': 1729711440,
677
- 'relative_ts': 250,
678
- 'exchange_id': vocab.EXCHANGES_TO_ID['mexc']
679
- },
680
- {'event_type': 'TikTok_Trending_Hashtag', # NEW
681
- 'timestamp': 1729711450,
682
- 'relative_ts': 260,
683
- 'hashtag_name_emb_idx': pooler.get_idx('CryptoTok'),
684
- 'rank': 5
685
- },
686
- {'event_type': 'XTrending_Hashtag', # NEW
687
- 'timestamp': 1729711460,
688
- 'relative_ts': 270,
689
- 'hashtag_name_emb_idx': pooler.get_idx('SolanaMemes'),
690
- 'rank': 2
691
- },
692
- {'event_type': 'ChainSnapshot', # NEW
693
- 'timestamp': 1729711470,
694
- 'relative_ts': 280,
695
- 'native_token_price_usd': 150.75,
696
- 'gas_fee': 0.00015 # Example gas fee
697
- },
698
- {'event_type': 'Lighthouse_Snapshot', # NEW
699
- 'timestamp': 1729711480,
700
- 'relative_ts': 290,
701
- 'protocol_id': vocab.PROTOCOL_TO_ID['Pump V1'],
702
- 'timeframe_id': vocab.LIGHTHOUSE_TIMEFRAME_TO_ID['1h'],
703
- 'total_volume': 1.2e6,
704
- 'total_transactions': 5000,
705
- 'total_traders': 1200,
706
- 'total_tokens_created': 85,
707
- 'total_migrations': 70
708
- },
709
- {'event_type': 'Migrated', # NEW
710
- 'timestamp': 1729711490,
711
- 'relative_ts': 300,
712
- 'protocol_id': vocab.PROTOCOL_TO_ID['Raydium CPMM']
713
- },
714
-
715
- ],
716
- 'wallets': {
717
- 'addrW1': {'profile': profile1, 'socials': social1, 'holdings': holdings1},
718
- 'addrW2': {'profile': profile2, 'socials': social2, 'holdings': holdings2},
719
- # --- NEW: Add wallet 3 data ---
720
- 'addrW3': {
721
- 'profile': {**profile2, 'wallet_address': 'addrW3'}, # Reuse profile2 but change address
722
- 'socials': social3,
723
- 'holdings': []
724
- }
725
- },
726
- 'tokens': {
727
- 'tknA': tokenA_data, # Main token
728
- 'tknB': tokenB_data, # Quote token
729
- 'tknC': tokenC_data, # Trending token
730
- 'tknD': tokenD_data # Boosted token
731
- },
732
- # --- NEW: The pre-computed embedding pool is generated after collecting all items
733
- 'embedding_pooler': pooler, # Pass the pooler to generate the tensor later
734
-
735
- # --- NEW: Expanded graph_links to test all encoders ---
736
- # --- FIXED: Removed useless logging fields as per user request ---
737
- 'graph_links': {
738
- 'TransferLink': {'links': [{'timestamp': 1729711205}], 'edges': [('addrW1', 'addrW2')]}, # Keep timestamp
739
- 'BundleTradeLink': {'links': [{'timestamp': 1729711215}], 'edges': [('addrW1', 'addrW2')]}, # Keep timestamp
740
- 'CopiedTradeLink': {'links': [
741
- {'time_gap_on_buy_sec': 10, 'time_gap_on_sell_sec': 120, 'leader_pnl': 5.0, 'follower_pnl': 4.0, 'follower_buy_total': 100, 'follower_sell_total': 120}
742
- ], 'edges': [('addrW1', 'addrW2')]},
743
- 'CoordinatedActivityLink': {'links': [
744
- {'time_gap_on_first_sec': 5, 'time_gap_on_second_sec': 8}
745
- ], 'edges': [('addrW1', 'addrW2')]},
746
- 'MintedLink': {'links': [
747
- {'timestamp': 1729711200, 'buy_amount': 1e9}
748
- ], 'edges': [('addrW1', 'tknA')]},
749
- 'SnipedLink': {'links': [
750
- {'rank': 1, 'sniped_amount': 5e8}
751
- ], 'edges': [('addrW1', 'tknA')]},
752
- 'LockedSupplyLink': {'links': [
753
- {'amount': 1e10} # Only amount is needed
754
- ], 'edges': [('addrW1', 'tknA')]},
755
- 'BurnedLink': {'links': [
756
- {'timestamp': 1729711300} # Only timestamp is needed
757
- ], 'edges': [('addrW2', 'tknA')]},
758
- 'ProvidedLiquidityLink': {'links': [
759
- {'timestamp': 1729711250} # Only timestamp is needed
760
- ], 'edges': [('addrW1', 'tknA')]},
761
- 'WhaleOfLink': {'links': [
762
- {} # Just the existence of the link is the feature
763
- ], 'edges': [('addrW1', 'tknA')]},
764
- 'TopTraderOfLink': {'links': [
765
- {'pnl_at_creation': 50000.0} # Only PnL is needed
766
- ], 'edges': [('addrW2', 'tknA')]}
767
- },
768
 
769
- # --- FIXED: Removed chart_segments dictionary ---
770
- 'labels': torch.randn(self.num_outputs) if self.num_outputs > 0 else torch.zeros(0),
771
- 'labels_mask': torch.ones(self.num_outputs) if self.num_outputs > 0 else torch.zeros(0)
772
- }
773
-
774
- print("Mock raw batch created.")
775
-
776
- return item
 
1
+ # Apollo: Oracle Model
 
 
 
2
 
3
+ ## Project Status
4
+ **Phase:** Hyperparameter Optimization & Dataset Preparation.
 
 
 
 
 
 
5
 
6
+ ### Recent Updates (Jan 2026)
7
+ * **Hyperparameter Tuning**: Analyzed token trade distribution to determine optimal model parameters.
8
+ * **Max Sequence Length**: Set to **8192**. This covers >2 hours of high-frequency trading activity for high-volume tokens (verified against `HWVY...`) and the full lifecycle for 99% of tokens.
9
+ * **Prediction Horizons**: Set to **60s, 3m, 5m, 10m, 30m, 1h, 2h**.
10
+ * **Min Horizon (60s)**: Chosen to accommodate ~20s inference latency while capturing the "meat" of aggressive breakout movers.
11
+ * **Max Horizon (2h)**: Covers the timeframe where 99% of tokens hit their All-Time High.
12
+ * **Infrastructure**:
13
+ * Updated `train.sh` to use these new hyperparameters.
14
+ * Updated `scripts/cache_dataset.py` to ensure cached datasets are labeled with these horizons.
15
+ * Verified `DataFetcher` retrieves full trade histories (no hidden limits).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ ## Configuration Summary
 
18
 
19
+ | Parameter | Value | Rationale |
20
+ | :--- | :--- | :--- |
21
+ | **Max Seq Len** | `8192` | Captures >2h of intense pump activity or full rug lifecycle. |
22
+ | **Horizons** | `60, 180, 300, 600, 1800, 3600, 7200` | From "Scalp/Breakout" (1m) to "Runner/ATH" (2h). |
23
+ | **Inference Latency** | ~20s | Dictates the 60s minimum horizon. |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ ## Usage
 
26
 
27
+ ### 1. Cache Dataset
28
+ Pre-process data into `.pt` files with correct labels.
29
+ ```bash
30
+ ./pre_cache.sh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ```
32
 
33
+ ### 2. Train Model
34
+ Launch training with updated hyperparameters.
35
+ ```bash
36
+ ./train.sh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ ## TODOs
40
+ * [ ] **Re-run Caching**: Since horizons changed, the existing cache (if any) is stale. Expected to run `pre_cache.sh`.
41
+ * [ ] **Verify Inference**: Ensure `inference.py` handles the 20s latency constraints gracefully (e.g. timestamp checks).
42
+ * [ ] **Model Architecture**: Confirm `8192` context length fits in VRAM with current model config (Attention implementation).
 
 
 
 
data/data_loader.py CHANGED
@@ -10,6 +10,7 @@ from typing import List, Dict, Any, Optional, Union, Tuple
10
  from pathlib import Path
11
  import numpy as np
12
  from bisect import bisect_left, bisect_right
 
13
 
14
  # We need the vocabulary for IDs and the processor for the pooler
15
  import models.vocabulary as vocab
@@ -136,12 +137,59 @@ class OracleDataset(Dataset):
136
  self.cached_files = sorted(self.cache_dir.glob("sample_*.pt"), key=lambda p: int(p.stem.split('_')[1]))
137
  if not self.cached_files:
138
  raise RuntimeError(f"Cache directory '{self.cache_dir}' provided but contains no 'sample_*.pt' files.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
 
140
  self.num_samples = len(self.cached_files)
 
141
  if max_samples is not None:
142
  self.num_samples = min(max_samples, self.num_samples)
143
  self.cached_files = self.cached_files[:self.num_samples]
144
- print(f"INFO: Found {self.num_samples} cached samples to use.")
 
 
145
  self.sampled_mints = [] # Not needed in cached mode
146
  self.available_mints = []
147
 
@@ -201,6 +249,12 @@ class OracleDataset(Dataset):
201
  def __len__(self) -> int:
202
  return self.num_samples
203
 
 
 
 
 
 
 
204
  def _normalize_price_series(self, values: List[float]) -> List[float]:
205
  if not values:
206
  return values
@@ -874,26 +928,27 @@ class OracleDataset(Dataset):
874
  if not trade_ts_values:
875
  return None
876
 
877
- first_trade_ts = min(trade_ts_values)
878
- last_trade_ts = max(trade_ts_values)
879
- available_duration = last_trade_ts - mint_ts_value
880
- if available_duration <= 0:
881
- return None
882
- if available_duration < (min_window + min_label):
883
- return None
884
-
885
- required_horizon = preferred_horizon if available_duration >= (min_window + preferred_horizon) else min_label
886
- upper_bound = max(0.0, available_duration - required_horizon)
887
- lower_bound = max(min_window, int(max(0.0, first_trade_ts - mint_ts_value)))
888
 
889
- if upper_bound < lower_bound:
890
- return None
891
- if upper_bound == lower_bound:
892
- sample_offset = lower_bound
 
 
 
 
 
 
 
893
  else:
894
- sample_offset = random.randint(lower_bound, int(upper_bound))
 
895
 
896
- T_cutoff = mint_timestamp + datetime.timedelta(seconds=int(sample_offset))
897
 
898
  token_address = raw_data['token_address']
899
  creator_address = raw_data['creator_address']
@@ -1031,7 +1086,7 @@ class OracleDataset(Dataset):
1031
  max_horizon_seconds=self.max_cache_horizon_seconds,
1032
  include_wallet_data=False,
1033
  include_graph=False,
1034
- min_trades=50,
1035
  full_history=True, # Bypass H/B/H limits
1036
  prune_failed=True, # Drop failed trades
1037
  prune_transfers=True # Drop transfers (captured in snapshots)
@@ -1436,20 +1491,76 @@ class OracleDataset(Dataset):
1436
  event_sequence = [entry[1] for entry in event_sequence_entries]
1437
 
1438
  # 8. Compute Labels using future data
1439
- labels = torch.zeros(0)
1440
- labels_mask = torch.zeros(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1441
 
1442
- # NEED TO IMPORT OR REFIND future_trades_for_labels LOGIC
1443
- # We need logic to compute future returns
1444
- # For now, placeholder or port the logic
1445
 
1446
- # 9. Return Item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1447
  return {
1448
  'event_sequence': event_sequence,
1449
  'wallets': wallet_data,
1450
  'tokens': all_token_data,
1451
  'graph_links': graph_links,
1452
  'embedding_pooler': pooler,
1453
- 'labels': labels,
1454
- 'labels_mask': labels_mask
1455
  }
 
10
  from pathlib import Path
11
  import numpy as np
12
  from bisect import bisect_left, bisect_right
13
+ import json
14
 
15
  # We need the vocabulary for IDs and the processor for the pooler
16
  import models.vocabulary as vocab
 
137
  self.cached_files = sorted(self.cache_dir.glob("sample_*.pt"), key=lambda p: int(p.stem.split('_')[1]))
138
  if not self.cached_files:
139
  raise RuntimeError(f"Cache directory '{self.cache_dir}' provided but contains no 'sample_*.pt' files.")
140
+
141
+ # --- NEW: Strict Metadata & Weighting ---
142
+ metadata_path = self.cache_dir / "metadata.jsonl"
143
+ if not metadata_path.exists():
144
+ raise RuntimeError(f"FATAL: metadata.jsonl not found in {self.cache_dir}. Cannot train without class-balanced sampling.")
145
+
146
+ print(f"INFO: Loading metadata from {metadata_path}...")
147
+ file_class_map = {}
148
+ class_counts = defaultdict(int)
149
+
150
+ with open(metadata_path, 'r') as f:
151
+ for line in f:
152
+ try:
153
+ entry = json.loads(line)
154
+ fname = entry['file']
155
+ cid = entry['class_id']
156
+ file_class_map[fname] = cid
157
+ class_counts[cid] += 1
158
+ except Exception as e:
159
+ print(f"WARN: Failed to parse metadata line: {e}")
160
+
161
+ print(f"INFO: Class Distribution: {dict(class_counts)}")
162
+
163
+ # Compute Weights
164
+ self.weights_list = []
165
+ valid_files = []
166
+
167
+ # We iterate properly sorted cached files to align with __getitem__ index
168
+ for p in self.cached_files:
169
+ fname = p.name
170
+ if fname not in file_class_map:
171
+ # Should be fatal if strict, but maybe some files were skipped?
172
+ # If file exists but no metadata, we can't weight it properly.
173
+ # Current pipeline writes metadata only for successful caches.
174
+ # So if it's in cached_files but not metadata, it might be a stale file.
175
+ print(f"WARN: File {fname} found in cache but missing metadata. Skipping.")
176
+ continue
177
+
178
+ cid = file_class_map[fname]
179
+ count = class_counts[cid]
180
+ weight = 1.0 / count if count > 0 else 0.0
181
+ self.weights_list.append(weight)
182
+ valid_files.append(p)
183
 
184
+ self.cached_files = valid_files
185
  self.num_samples = len(self.cached_files)
186
+
187
  if max_samples is not None:
188
  self.num_samples = min(max_samples, self.num_samples)
189
  self.cached_files = self.cached_files[:self.num_samples]
190
+ self.weights_list = self.weights_list[:self.num_samples]
191
+
192
+ print(f"INFO: Weighted Dataset Ready. {self.num_samples} samples.")
193
  self.sampled_mints = [] # Not needed in cached mode
194
  self.available_mints = []
195
 
 
249
  def __len__(self) -> int:
250
  return self.num_samples
251
 
252
+ def get_weights(self) -> torch.DoubleTensor:
253
+ """Returns the sampling weights for the dataset."""
254
+ if hasattr(self, 'weights_list') and self.weights_list:
255
+ return torch.as_tensor(self.weights_list, dtype=torch.double)
256
+ return None
257
+
258
  def _normalize_price_series(self, values: List[float]) -> List[float]:
259
  if not values:
260
  return values
 
928
  if not trade_ts_values:
929
  return None
930
 
931
+ # Cache guarantees min_trades=25, so we proceed assuming valid data.
932
+ # But for safety in dynamic sampling:
933
+ if not trade_ts_values:
934
+ return None
 
 
 
 
 
 
 
935
 
936
+ # Sort trades to find the 24th trade timestamp
937
+ sorted_trades_ts = sorted(trade_ts_values)
938
+
939
+ # T_start = Timestamp of the 25th trade (index 24)
940
+ # If somehow we have fewer than 25 trades (cache mismatch?), fallback to last.
941
+ safe_idx = min(24, len(sorted_trades_ts) - 1)
942
+ min_cutoff_ts = sorted_trades_ts[safe_idx]
943
+ max_cutoff_ts = sorted_trades_ts[-1]
944
+
945
+ if max_cutoff_ts <= min_cutoff_ts:
946
+ sample_offset_ts = min_cutoff_ts
947
  else:
948
+ # Standard case: sample uniformly between [Trade[24], LastTrade]
949
+ sample_offset_ts = random.uniform(min_cutoff_ts, max_cutoff_ts)
950
 
951
+ T_cutoff = datetime.datetime.fromtimestamp(sample_offset_ts, tz=datetime.timezone.utc)
952
 
953
  token_address = raw_data['token_address']
954
  creator_address = raw_data['creator_address']
 
1086
  max_horizon_seconds=self.max_cache_horizon_seconds,
1087
  include_wallet_data=False,
1088
  include_graph=False,
1089
+ min_trades=25,
1090
  full_history=True, # Bypass H/B/H limits
1091
  prune_failed=True, # Drop failed trades
1092
  prune_transfers=True # Drop transfers (captured in snapshots)
 
1491
  event_sequence = [entry[1] for entry in event_sequence_entries]
1492
 
1493
  # 8. Compute Labels using future data
1494
+ # Define horizons (e.g., [60, 120, ...])
1495
+ horizons = sorted(self.horizons_seconds)
1496
+
1497
+ # Pre-sort future trades for efficient searching
1498
+ # Note: future_trades_for_labels contains ALL trades (past & future relative to T_cutoff)
1499
+ # We need to find the price at T_cutoff and at T_cutoff + h
1500
+
1501
+ all_trades = future_trades_for_labels
1502
+ # Ensure sorted
1503
+ all_trades.sort(key=lambda x: _timestamp_to_order_value(x['timestamp']))
1504
+
1505
+ # Find price at T_cutoff (Current Price)
1506
+ # It's the last trade before or at T_cutoff
1507
+ current_price = 0.0
1508
+ cutoff_ts_val = T_cutoff.timestamp()
1509
+ last_trade_ts_val = _timestamp_to_order_value(all_trades[-1]['timestamp'])
1510
+
1511
+ # Find index of last trade <= T_cutoff
1512
+ # We can use binary search or simple iteration since we are building dataset
1513
+ # Iterating is safer for complex logic
1514
+ current_price_idx = -1
1515
+ for i, t in enumerate(all_trades):
1516
+ if _timestamp_to_order_value(t['timestamp']) <= cutoff_ts_val:
1517
+ current_price = float(t['price_usd'])
1518
+ current_price_idx = i
1519
+ else:
1520
+ break
1521
 
1522
+ label_values = []
1523
+ mask_values = []
 
1524
 
1525
+ for h in horizons:
1526
+ target_ts = cutoff_ts_val + h
1527
+
1528
+ if target_ts > last_trade_ts_val:
1529
+ # Horizon extends beyond known history
1530
+ # We MASK this label. We do NOT guess 0.
1531
+ label_values.append(0.0) # Dummy value
1532
+ mask_values.append(0.0) # Mask = 0 (Ignore)
1533
+ else:
1534
+ # Find price at target_ts
1535
+ # It is the last trade strictly before or at target_ts
1536
+ future_price = current_price # Default to current if no trades found in window? Unlikely if checked range.
1537
+
1538
+ # Check trades between current_idx and target
1539
+ # Optimization: start search from current_price_idx
1540
+ found_future = False
1541
+ for j in range(current_price_idx, len(all_trades)):
1542
+ t = all_trades[j]
1543
+ t_ts = _timestamp_to_order_value(t['timestamp'])
1544
+ if t_ts <= target_ts:
1545
+ future_price = float(t['price_usd'])
1546
+ found_future = True
1547
+ else:
1548
+ break # Optimization: surpassed target_ts
1549
+
1550
+ if current_price > 0:
1551
+ ret = (future_price - current_price) / current_price
1552
+ else:
1553
+ ret = 0.0
1554
+
1555
+ label_values.append(ret)
1556
+ mask_values.append(1.0) # Mask = 1 (Valid)
1557
+
1558
  return {
1559
  'event_sequence': event_sequence,
1560
  'wallets': wallet_data,
1561
  'tokens': all_token_data,
1562
  'graph_links': graph_links,
1563
  'embedding_pooler': pooler,
1564
+ 'labels': torch.tensor(label_values, dtype=torch.float32),
1565
+ 'labels_mask': torch.tensor(mask_values, dtype=torch.float32)
1566
  }
events.md ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================
2
+ # Entity Encoders
3
+ # =========================================
4
+ # These are generated offline/streaming and are the "vocabulary" for the model.
5
+
6
+ <WalletEmbedding> # Embedding of a wallet's relationships, behavior, and history.
7
+ <WalletEmbedding> = [
8
+ // Data from the 'wallet_profiles' table (Wallet-level lifetime and daily/weekly stats)
9
+ wallet_profiles_row: [
10
+ // Core Info & Timestamps
11
+ age, // No Contextual
12
+ wallet_address, // Primary wallet identifier
13
+
14
+
15
+ // 7. NEW: Deployed Token Aggregates (8 Features)
16
+ deployed_tokens_count, // Total tokens created
17
+ deployed_tokens_migrated_pct, // % that migrated
18
+ deployed_tokens_avg_lifetime_sec, // Avg duration before dev selling
19
+ deployed_tokens_avg_peak_mc_usd, // Avg peak marketcap
20
+ deployed_tokens_median_peak_mc_usd,
21
+
22
+ // Metadata & Balances
23
+ balance, // Current SOL balance
24
+
25
+ // Lifetime Transaction Counts (Total history)
26
+ transfers_in_count, // Total native transfers received
27
+ transfers_out_count, // Total native transfers sent
28
+ spl_transfers_in_count, // Total SPL token transfers received
29
+ spl_transfers_out_count,// Total SPL token transfers sent
30
+
31
+ // Lifetime Trading Stats (Total history)
32
+ total_buys_count, // Total buys across all tokens
33
+ total_sells_count, // Total sells across all tokens
34
+ total_winrate, // Overall trading winrate
35
+
36
+ // 1-Day Stats (Realized P&L, Counts, Averages, Volume, Fees, Winrate)
37
+ stats_1d_realized_profit_sol,
38
+ stats_1d_realized_profit_pnl,
39
+ stats_1d_buy_count,
40
+ stats_1d_sell_count,
41
+ stats_1d_transfer_in_count,
42
+ stats_1d_transfer_out_count,
43
+ stats_1d_avg_holding_period,
44
+ stats_1d_total_bought_cost_sol,
45
+ stats_1d_total_sold_income_sol,
46
+ stats_1d_total_fee,
47
+ stats_1d_winrate,
48
+ stats_1d_tokens_traded,
49
+
50
+ // 7-Day Stats (Realized P&L, Counts, Averages, Volume, Fees, Winrate)
51
+ stats_7d_realized_profit_sol,
52
+ stats_7d_realized_profit_pnl,
53
+ stats_7d_buy_count,
54
+ stats_7d_sell_count,
55
+ stats_7d_transfer_in_count,
56
+ stats_7d_transfer_out_count,
57
+ stats_7d_avg_holding_period,
58
+ stats_7d_total_bought_cost_sol,
59
+ stats_7d_total_sold_income_sol,
60
+ stats_7d_total_fee,
61
+ stats_7d_winrate,
62
+ stats_7d_tokens_traded,
63
+
64
+ // 30 Days is to useless in the context
65
+ ],
66
+
67
+ // Data from the 'wallet_socials' table (Social media and profile info)
68
+ wallet_socials_row: [
69
+ has_pf_profile,
70
+ has_twitter,
71
+ has_telegram,
72
+ is_exchange_wallet,
73
+ username,
74
+ ],
75
+ // Data from the 'wallet_holdings' table (Token-level statistics for held tokens)
76
+ wallet_holdings_pool: [
77
+ <TokenVibeEmbedding>,
78
+ holding_time, // How much he held the token (We check only tokens that currently is holding, or recently traded)
79
+
80
+ balance_pct_to_supply, // Current quantity of the token held
81
+
82
+ // History (Amounts & Costs)
83
+ history_bought_amount_sol, // Total amount of token bought
84
+ bought_amount_sol_pct_to_native_balance // Is he traded a lot of his wallet size
85
+
86
+ // History (Counts)
87
+ history_total_buys, // Total number of buy transactions
88
+ history_total_sells, // Total number of sell transactions
89
+
90
+ // Profit and Loss
91
+ realized_profit_pnl, // Realized P&L as a percentage
92
+ realized_profit_sol,
93
+
94
+ // Transfers (Non-trade movements)
95
+ history_transfer_in,
96
+ history_transfer_out,
97
+
98
+ avarage_trade_gap_seconds,
99
+ total_priority_fees, // Total tips + Priority Fees
100
+ ]
101
+ ]
102
+
103
+ <TokenVibeEmbedding> # Multimodal embedding of a token's identity
104
+ <TokenVibeEmbedding> = [<TokenAddressEmbedding>, <NameEmbedding>, <SymbolEmbedding>, <ImageEmbedding>, protocol_id]
105
+
106
+ <TextEmbedding> # Text embedding MultiModal processor.
107
+ <MediaEmbedding> # Multimodal VIT encoder.
108
+
109
+ # -----------------------------------------
110
+ # 1. TradeEncoder
111
+ # -----------------------------------------
112
+
113
+ # Captures large-size trades from any wallet.
114
+ [timestamp, 'LargeTrade', relative_ts, <WalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
115
+
116
+ # Captures the high-signal "Dev Sold or Bought" event.
117
+ [timestamp, 'Deployer_Trade', relative_ts, <CreatorWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
118
+
119
+ # Captures *all* trades from pre-defined high-P&L/win-rate, kol and known wallets.
120
+ [timestamp, 'SmartWallet_Trade', relative_ts, <TraderWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
121
+
122
+ # Raw trades. Loaded in H/B/H Prefix (first ~10k) and Suffix (last ~5k).
123
+ [timestamp, 'Trade', relative_ts, <TraderWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
124
+
125
+ # -----------------------------------------
126
+ # 2. TransferEncoder
127
+ # -----------------------------------------
128
+
129
+ # Raw transfers. Loaded in H/B/H Prefix (all in first ~10k trade window) and Suffix (all in last ~5k trade window).
130
+ [timestamp, 'Transfer', relative_ts, <SourceWalletEmbedding>, <DestinationWalletEmbedding>, token_amount, transfer_pct_of_total_supply, transfer_pct_of_holding, priority_fee]
131
+
132
+ # Captures scarce, large transfers *after* the initial launch window.
133
+ [timestamp, 'LargeTransfer', relative_ts, <FromWalletEmbedding>, <ToWalletEmbedding>, token_amount, transfer_pct_of_total_supply, transfer_pct_of_holding, priority_fee]
134
+
135
+ # -----------------------------------------
136
+ # 3. LifecycleEncoder
137
+ # -----------------------------------------
138
+
139
+ # The T0 event.
140
+ [timestamp, 'Mint', 0, <CreatorWalletEmbedding>, <TokenVibeEmbedding>]
141
+
142
+ # -----------------------------------------
143
+ # 3. PoolEncoder
144
+ # -----------------------------------------
145
+
146
+ # Signals migration from launchpad to a real pool.
147
+ [timestamp, 'PoolCreated', relative_ts, <ProviderWalletEmbedding>, protocol_id, <QuoteTokenVibeEmbedding>, base_amount, quote_amount, quote_pct_to_main_pool_balance, base_pct_to_main_pool_balance]
148
+
149
+ # Signals LP addition or removal.
150
+ [timestamp, 'LiquidityChange', relative_ts, <ProviderWalletEmbedding>, <QuoteTokenVibeEmbedding>, change_type_id, quote_amount, quote_pct_to_current_pool_balance]
151
+
152
+ # Signals creator/dev taking platform fees.
153
+ [timestamp, 'FeeCollected', relative_ts, <RecipientWalletEmbedding>, sol_amount, token_amount]
154
+
155
+
156
+ # -----------------------------------------
157
+ # SupplyEncoder
158
+ # -----------------------------------------
159
+
160
+ # Signals a supply reduction.
161
+ [timestamp, 'TokenBurn', relative_ts, <BurnerWalletEmbedding>, amount_pct_of_total_supply, amount_tokens_burned]
162
+
163
+ # Signals locked supply, e.g., for team/marketing.
164
+ [timestamp, 'SupplyLock', relative_ts, <LockerWalletEmbedding>, amount_pct_of_total_supply, lock_duration]
165
+
166
+ # -----------------------------------------
167
+ # ChartEncoder
168
+ # -----------------------------------------
169
+
170
+ # (The "Sliding Window") This is the new chart event.
171
+ [timestamp, 'Chart_Segment', relative_ts, OHLC_segment, chart_interval_id]
172
+
173
+ # -----------------------------------------
174
+ # PulseEncoder
175
+ # -----------------------------------------
176
+
177
+ # It is a low-frequency event (Dynamic Interval: 5min, 15min, or 1hr based on token age).
178
+ [timestamp, 'OnChain_Snapshot', relative_ts, total_holders, smart_traders, kols, holder_growth_rate, top_10_holder_pct, sniper_holding_pct, rat_wallets_holding_pct, bundle_holding_pct, current_market_cap, liquidity, volume, buy_count, sell_count, total_txns, global_fees_paid]
179
+
180
+ # -----------------------------------------
181
+ # HoldersListEncoder
182
+ # -----------------------------------------
183
+
184
+ <HolderDistributionEmbedding> # Transformer-based embedding of the top holders (WalletEmbeddings + Pct).
185
+
186
+ # Token-specific holder analysis.
187
+ [timestamp, 'HolderSnapshot', relative_ts, <HolderDistributionEmbedding>]
188
+
189
+
190
+ # -----------------------------------------
191
+ # ChainSnapshotEncoder
192
+ # -----------------------------------------
193
+
194
+ # Broad chain-level market conditions.
195
+ [timestamp, 'ChainSnapshot', relative_ts, native_token_price_usd, gas_fee]
196
+
197
+ # Launchpad market regime (using absolute, log-normalized values).
198
+ [timestamp, 'Lighthouse_Snapshot', relative_ts, protocol_id, timeframe_id, total_volume, total_transactions, total_traders, total_tokens_created, total_migrations]
199
+
200
+ # -----------------------------------------
201
+ # TokenTrendingListEncoder
202
+ # -----------------------------------------
203
+
204
+ # Fires *per token* on a trending list. The high-attention "meta" signal.
205
+ [timestamp, 'TrendingToken', relative_ts, <TokenVibeEmbedding_of_trending_token>, list_source_id, timeframe_id, rank]
206
+
207
+ # Fires *per token* on the boosted list.
208
+ [timestamp, 'BoostedToken', relative_ts, <TokenVibeEmbedding_of_boosted_token>, total_boost_amount, rank]
209
+
210
+ # -----------------------------------------
211
+ # LaunchpadTheadEncoder
212
+ # -----------------------------------------
213
+
214
+ # On-platform social signal (Pump.fun comments).
215
+ [timestamp, 'PumpReply', relative_ts, <UserWalletEmbedding>, <ReplyTextEmbedding>]
216
+
217
+ # -----------------------------------------
218
+ # CTEncoder
219
+ # -----------------------------------------
220
+
221
+ # Off-platform social signal (Twitter).
222
+ [timestamp, 'XPost', relative_ts, <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>]
223
+ [timestamp, 'XRetweet', relative_ts, <RetweeterWalletEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding>]
224
+ [timestamp, 'XReply', relative_ts, <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>, <MainTweetEmbedding>]
225
+ [timestamp, 'XQuoteTweet', relative_ts, <QuoterWalletEmbedding>, <QuoterTextEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding>]
226
+
227
+ # -----------------------------------------
228
+ # GlobalTrendingEncoder
229
+ # -----------------------------------------
230
+
231
+ # Broader cultural trend signal (TikTok).
232
+ [timestamp, 'TikTok_Trending_Hashtag', relative_ts, <HashtagNameEmbedding>, rank]
233
+
234
+ # Broader cultural trend signal (Twitter).
235
+ [timestamp, 'XTrending_Hashtag', relative_ts, <HashtagNameEmbedding>, rank]
236
+
237
+ # -----------------------------------------
238
+ # TrackerEncoder
239
+ # -----------------------------------------
240
+
241
+ # Retail marketing signal (Paid groups).
242
+ [timestamp, 'AlphaGroup_Call', relative_ts, group_id]
243
+
244
+ [timestamp, 'Call_Channel', relative_ts, channel_id]
245
+
246
+ # High-impact catalyst event.
247
+ [timestamp, 'CexListing', relative_ts, exchange_id]
248
+
249
+ # High-impact catalyst event.
250
+ [timestamp, 'Migrated', relative_ts, protocol_id]
251
+
252
+ # -----------------------------------------
253
+ # Dex Encoder
254
+ # -----------------------------------------
255
+
256
+ [timestamp, 'DexBoost_Paid', relative_ts, amount, total_amount_on_token]
257
+
258
+ [timestamp, 'DexProfile_Updated', relative_ts, has_changed_website_flag, has_changed_twitter_flag, has_changed_telegram_flag, has_changed_description_flag, <WebsiteEmbedding>, <TwitterLinkEmbedding>, <NewDescriptionEmbeeded>]
259
+
260
+ ### **Global Context Injection**
261
+
262
+ <PRELAUNCH> <LAUNCH> <Middle> <RECENT>
263
+
264
+ ### **Token Role Embedding**
265
+
266
+ <TokenVibeEmbedding_of_Token_A> + Subject_Token_Role
267
+
268
+ <TokenVibeEmbedding_of_Token_B> + Trending_Token_Role
269
+
270
+ <QuoteTokenVibeEmbedding_of_USDC> + Quote_Token_Role
271
+
272
+
273
+ # **Links**
274
+
275
+ ### `TransferLink`
276
+
277
+ ```
278
+ ['signature', 'source', 'destination', 'mint', 'timestamp']
279
+ ```
280
+
281
+ -----
282
+
283
+ ### `BundleTradeLink`
284
+
285
+ ```
286
+ ['signatures', 'wallet_a', 'wallet_b', 'mint', 'slot', 'timestamp']
287
+ ```
288
+
289
+ -----
290
+
291
+ ### `CopiedTradeLink`
292
+
293
+ ```
294
+ ['leader_buy_sig', 'leader_sell_sig', 'follower_buy_sig', 'follower_sell_sig', 'follower', 'leader', 'mint', 'time_gap_on_buy_sec', 'time_gap_on_sell_sec', 'leader_pnl', 'follower_pnl', 'leader_buy_total', 'leader_sell_total', 'follower_buy_total', 'follower_sell_total', 'follower_buy_slippage', 'follower_sell_slippage']
295
+ ```
296
+
297
+ -----
298
+
299
+ ### `CoordinatedActivityLink`
300
+
301
+ ```
302
+ ['leader_first_sig', 'leader_second_sig', 'follower_first_sig', 'follower_second_sig', 'follower', 'leader', 'mint', 'time_gap_on_first_sec', 'time_gap_on_second_sec']
303
+ ```
304
+
305
+ -----
306
+
307
+ ### `MintedLink`
308
+
309
+ ```
310
+ ['signature', 'timestamp', 'buy_amount']
311
+ ```
312
+
313
+ -----
314
+
315
+ ### `SnipedLink`
316
+
317
+ ```
318
+ ['signature', 'rank', 'sniped_amount']
319
+ ```
320
+
321
+ -----
322
+
323
+ ### `LockedSupplyLink`
324
+
325
+ ```
326
+ ['signature', 'amount', 'unlock_timestamp']
327
+ ```
328
+
329
+ -----
330
+
331
+ ### `BurnedLink`
332
+
333
+ ```
334
+ ['signature', 'amount', 'timestamp']
335
+ ```
336
+
337
+ -----
338
+
339
+ ### `ProvidedLiquidityLink`
340
+
341
+ ```
342
+ ['signature', 'wallet', 'token', 'pool_address', 'amount_base', 'amount_quote', 'timestamp']
343
+ ```
344
+
345
+ -----
346
+
347
+ ### `WhaleOfLink`
348
+
349
+ ```
350
+ ['wallet', 'token', 'holding_pct_at_creation', 'ath_usd_at_creation']
351
+ ```
352
+
353
+ -----
354
+
355
+ ### `TopTraderOfLink`
356
+
357
+ ```
358
+ ['wallet', 'token', 'pnl_at_creation', 'ath_usd_at_creation']
359
+ ```
360
+
361
+
362
+
363
+
364
+ /////
365
+
366
+ def __gettestitem__(self, idx: int) -> Dict[str, Any]:
367
+ """
368
+ Generates a single complex data item, structured for the MemecoinCollator.
369
+ NOTE: This currently returns the same mock data regardless of `idx`.
370
+ """
371
+ # --- 1. Setup Pooler and Define Raw Data ---
372
+ pooler = EmbeddingPooler()
373
+
374
+ # --- 5. Create Mock Raw Batch Data (FIXED) ---
375
+ print("Creating mock raw batch...")
376
+
377
+ # (Wallet profiles, socials, holdings definitions are unchanged)
378
+ profile1 = {
379
+ 'wallet_address': 'addrW1', 'age': 1.5e7, 'balance': 10.5,
380
+ 'deployed_tokens_count': 2, 'deployed_tokens_migrated_pct': 0.5, 'deployed_tokens_avg_lifetime_sec': 36000.0, 'deployed_tokens_avg_peak_mc_usd': 100000.0, 'deployed_tokens_median_peak_mc_usd': 50000.0,
381
+ 'transfers_in_count': 10, 'transfers_out_count': 5, 'spl_transfers_in_count': 20, 'spl_transfers_out_count': 15,
382
+ 'total_buys_count': 50, 'total_sells_count': 40, 'total_winrate': 0.6,
383
+ 'stats_1d_realized_profit_sol': 1.2, 'stats_1d_realized_profit_pnl': 0.1, 'stats_1d_buy_count': 5, 'stats_1d_sell_count': 3, 'stats_1d_transfer_in_count': 2, 'stats_1d_transfer_out_count': 1, 'stats_1d_avg_holding_period': 3600, 'stats_1d_total_bought_cost_sol': 10.0, 'stats_1d_total_sold_income_sol': 11.2, 'stats_1d_total_fee': 0.1, 'stats_1d_winrate': 0.7, 'stats_1d_tokens_traded': 4,
384
+ 'stats_7d_realized_profit_sol': 5.0, 'stats_7d_realized_profit_pnl': 0.2, 'stats_7d_buy_count': 20, 'stats_7d_sell_count': 15, 'stats_7d_transfer_in_count': 8, 'stats_7d_transfer_out_count': 4, 'stats_7d_avg_holding_period': 7200, 'stats_7d_total_bought_cost_sol': 40.0, 'stats_7d_total_sold_income_sol': 45.0, 'stats_7d_total_fee': 0.5, 'stats_7d_winrate': 0.65, 'stats_7d_tokens_traded': 10,
385
+ }
386
+ social1 = {'has_pf_profile': True, 'has_twitter': True, 'has_telegram': False, 'is_exchange_wallet': False, 'username': 'trader_one'}
387
+ holdings1 = [
388
+ {'mint_address': 'tknA', 'holding_time': 3600.0, 'realized_profit_sol': 5.2, 'total_priority_fees': 0.05, 'balance_pct_to_supply': 0.01, 'history_bought_amount_sol': 10, 'bought_amount_sol_pct_to_native_balance': 0.5, 'history_total_buys': 5, 'history_total_sells': 2, 'realized_profit_pnl': 0.52, 'history_transfer_in': 1, 'history_transfer_out': 0, 'avarage_trade_gap_seconds': 300},
389
+ ]
390
+ profile2 = {
391
+ 'wallet_address': 'addrW2', 'age': 1e6, 'balance': 1.0,
392
+ 'deployed_tokens_count': 0, 'deployed_tokens_migrated_pct': 0.0, 'deployed_tokens_avg_lifetime_sec': 0.0, 'deployed_tokens_avg_peak_mc_usd': 0.0, 'deployed_tokens_median_peak_mc_usd': 0.0,
393
+ 'transfers_in_count': 1, 'transfers_out_count': 0, 'spl_transfers_in_count': 0, 'spl_transfers_out_count': 0,
394
+ 'total_buys_count': 0, 'total_sells_count': 0, 'total_winrate': 0.0,
395
+ 'stats_1d_realized_profit_sol': 0.0, 'stats_1d_realized_profit_pnl': 0.0, 'stats_1d_buy_count': 0, 'stats_1d_sell_count': 0, 'stats_1d_transfer_in_count': 0, 'stats_1d_transfer_out_count': 0, 'stats_1d_avg_holding_period': 0, 'stats_1d_total_bought_cost_sol': 0.0, 'stats_1d_total_sold_income_sol': 0.0, 'stats_1d_total_fee': 0.0, 'stats_1d_winrate': 0.0, 'stats_1d_tokens_traded': 0,
396
+ 'stats_7d_realized_profit_sol': 0.0, 'stats_7d_realized_profit_pnl': 0.0, 'stats_7d_buy_count': 0, 'stats_7d_sell_count': 0, 'stats_7d_transfer_in_count': 0, 'stats_7d_transfer_out_count': 0, 'stats_7d_avg_holding_period': 0, 'stats_7d_total_bought_cost_sol': 0.0, 'stats_7d_total_sold_income_sol': 0.0, 'stats_7d_total_fee': 0.0, 'stats_7d_winrate': 0.0, 'stats_7d_tokens_traded': 0,
397
+ }
398
+ social2 = {'has_pf_profile': False, 'has_twitter': False, 'has_telegram': False, 'is_exchange_wallet': True, 'username': 'cex_wallet'}
399
+ holdings2 = []
400
+
401
+
402
+ # Define raw data and get their indices
403
+ tokenA_data = {
404
+ 'address_emb_idx': pooler.get_idx('tknA'),
405
+ 'name_emb_idx': pooler.get_idx('Token A'),
406
+ 'symbol_emb_idx': pooler.get_idx('TKA'),
407
+ 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
408
+ 'protocol': 1
409
+ }
410
+ # Add wallet usernames to the pool
411
+ wallet1_user_idx = pooler.get_idx(social1['username'])
412
+ wallet2_user_idx = pooler.get_idx(social2['username'])
413
+ social1['username_emb_idx'] = wallet1_user_idx
414
+ social2['username_emb_idx'] = wallet2_user_idx
415
+ # --- NEW: Add a third wallet for social tests ---
416
+ social3 = {'has_pf_profile': False, 'has_twitter': True, 'has_telegram': True, 'is_exchange_wallet': False, 'username': 'social_butterfly'}
417
+ wallet3_user_idx = pooler.get_idx(social3['username'])
418
+ social3['username_emb_idx'] = wallet3_user_idx
419
+
420
+ # Create the final pre-computed data structures
421
+ tokenB_data = {
422
+ 'address_emb_idx': pooler.get_idx('tknA'),
423
+ 'name_emb_idx': pooler.get_idx('Token A'),
424
+ 'symbol_emb_idx': pooler.get_idx('TKA'),
425
+ 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
426
+ 'protocol': 1
427
+ }
428
+
429
+ tokenC_data = {
430
+ 'address_emb_idx': pooler.get_idx('tknA'),
431
+ 'name_emb_idx': pooler.get_idx('Token A'),
432
+ 'symbol_emb_idx': pooler.get_idx('TKA'),
433
+ 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
434
+ 'protocol': 1
435
+ }
436
+
437
+ tokenD_data = {
438
+ 'address_emb_idx': pooler.get_idx('tknA'),
439
+ 'name_emb_idx': pooler.get_idx('Token A'),
440
+ 'symbol_emb_idx': pooler.get_idx('TKA'),
441
+ 'image_emb_idx': pooler.get_idx(Image.new('RGB',(256,256), color='blue')),
442
+ 'protocol': 1
443
+ }
444
+
445
+ item = {
446
+ 'event_sequence': [
447
+ {'event_type': 'XPost', # NEW
448
+ 'timestamp': 1729711350,
449
+ 'relative_ts': -25,
450
+ 'wallet_address': 'addrW1', # Author
451
+ 'text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
452
+ 'media_emb_idx': pooler.get_idx(Image.new('RGB', (100,100), color='cyan'))
453
+ },
454
+ {'event_type': 'XReply', # NEW
455
+ 'timestamp': 1729711360,
456
+ 'relative_ts': -35,
457
+ 'wallet_address': 'addrW2', # Replier
458
+ 'text_emb_idx': pooler.get_idx('This is a reply to the main tweet'),
459
+ 'media_emb_idx': pooler.get_idx(None), # No media in reply
460
+ 'main_tweet_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA')
461
+ },
462
+ {'event_type': 'XRetweet', # NEW
463
+ 'timestamp': 1729711370,
464
+ 'relative_ts': -40,
465
+ 'wallet_address': 'addrW3', # The retweeter
466
+ 'original_author_wallet_address': 'addrW1', # The original author
467
+ 'original_post_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
468
+ 'original_post_media_emb_idx': pooler.get_idx(Image.new('RGB', (100,100), color='cyan'))
469
+ },
470
+ # --- CORRECTED: Test a pre-launch event with negative relative_ts ---
471
+ {'event_type': 'Transfer',
472
+ 'timestamp': 1729711180,
473
+ 'relative_ts': -10, # Negative relative_ts indicates pre-launch
474
+ 'wallet_address': 'addrW2',
475
+ 'destination_wallet_address': 'addrW1',
476
+ 'token_address': 'tknA',
477
+ 'token_amount': 1000.0, 'transfer_pct_of_total_supply': 0.0, 'transfer_pct_of_holding': 0.0, 'priority_fee': 0.0
478
+ },
479
+ {'event_type': 'Mint', 'timestamp': 1729711190, 'relative_ts': 0, 'wallet_address': 'addrW1', 'token_address': 'tknA'},
480
+ {'event_type': 'Chart_Segment', 'timestamp': 1729711200, 'relative_ts': 60, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'}, # This is high-def (segment 0) by default
481
+ {'event_type': 'Chart_Segment', 'timestamp': 1729711260, 'relative_ts': 120, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'}, # You can mark this as blurry
482
+ {'event_type': 'Transfer',
483
+ 'timestamp': 1729711210,
484
+ 'relative_ts': 20,
485
+ 'wallet_address': 'addrW1', # Source
486
+ 'destination_wallet_address': 'addrW2', # Destination
487
+ 'token_address': 'tknA', # Need token for context? (Optional, depends on design)
488
+ 'token_amount': 500.0,
489
+ 'transfer_pct_of_total_supply': 0.005,
490
+ 'transfer_pct_of_holding': 0.1,
491
+ 'priority_fee': 0.0001
492
+ },
493
+ {'event_type': 'Trade',
494
+ 'timestamp': 1729711220,
495
+ 'relative_ts': 30,
496
+ 'wallet_address': 'addrW1',
497
+ 'token_address': 'tknA',
498
+ 'trade_direction': 0,
499
+ 'sol_amount': 0.5,
500
+ # --- FIXED: Pass the integer ID directly ---
501
+ 'dex_platform_id': vocab.DEX_TO_ID['Axiom'],
502
+ 'priority_fee': 0.0002,
503
+ 'mev_protection': False,
504
+ 'token_amount_pct_of_holding': 0.05, 'quote_amount_pct_of_holding': 0.02,
505
+ 'slippage': 0.01, 'price_impact': 0.005, 'success': True, 'is_bundle': False, 'total_usd': 75.0
506
+ },
507
+ {'event_type': 'Deployer_Trade', # NEW: Testing a trade variant
508
+ 'timestamp': 1729711230,
509
+ 'relative_ts': 40,
510
+ 'wallet_address': 'addrW1', # The creator wallet
511
+ 'token_address': 'tknA',
512
+ 'trade_direction': 1, 'sol_amount': 0.2,
513
+ # --- FIXED: Pass the integer ID directly ---
514
+ 'dex_platform_id': vocab.DEX_TO_ID['Trojan'],
515
+ 'priority_fee': 0.0005,
516
+ 'mev_protection': True,
517
+ 'token_amount_pct_of_holding': 0.1, 'quote_amount_pct_of_holding': 0.0,
518
+ 'slippage': 0.02, 'price_impact': 0.01, 'success': True, 'is_bundle': False, 'total_usd': 30.0
519
+ },
520
+ {'event_type': 'SmartWallet_Trade', # NEW
521
+ 'timestamp': 1729711240,
522
+ 'relative_ts': 50,
523
+ 'wallet_address': 'addrW1', # A known smart wallet
524
+ 'token_address': 'tknA',
525
+ 'trade_direction': 0, 'sol_amount': 1.5,
526
+ # --- FIXED: Pass the integer ID directly ---
527
+ 'dex_platform_id': vocab.DEX_TO_ID['Axiom'],
528
+ 'priority_fee': 0.001,
529
+ 'mev_protection': True,
530
+ 'token_amount_pct_of_holding': 0.2, 'quote_amount_pct_of_holding': 0.1,
531
+ 'slippage': 0.01, 'price_impact': 0.008, 'success': True, 'is_bundle': False, 'total_usd': 225.0
532
+ },
533
+ {'event_type': 'LargeTrade', # NEW
534
+ 'timestamp': 1729711250,
535
+ 'relative_ts': 60,
536
+ 'wallet_address': 'addrW2', # Some other wallet
537
+ 'token_address': 'tknA',
538
+ 'trade_direction': 0, 'sol_amount': 10.0,
539
+ # --- FIXED: Pass the integer ID directly ---
540
+ 'dex_platform_id': vocab.DEX_TO_ID['OXK'],
541
+ 'priority_fee': 0.002,
542
+ 'mev_protection': False,
543
+ 'token_amount_pct_of_holding': 0.8, 'quote_amount_pct_of_holding': 0.5,
544
+ 'slippage': 0.03, 'price_impact': 0.05, 'success': True, 'is_bundle': False, 'total_usd': 1500.0
545
+ },
546
+ {'event_type': 'Chart_Segment', 'timestamp': 1729711260, 'relative_ts': 70, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'},
547
+ {'event_type': 'PoolCreated', # NEW
548
+ 'timestamp': 1729711270,
549
+ 'relative_ts': 80,
550
+ 'wallet_address': 'addrW1',
551
+ 'protocol_id': vocab.PROTOCOL_TO_ID['Raydium CPMM'],
552
+ 'quote_token_address': 'tknB',
553
+ 'base_amount': 1000000.0,
554
+ 'quote_amount': 10.0
555
+ },
556
+ {'event_type': 'LiquidityChange', # NEW
557
+ 'timestamp': 1729711280,
558
+ 'relative_ts': 90,
559
+ 'wallet_address': 'addrW2',
560
+ 'quote_token_address': 'tknB',
561
+ 'change_type_id': 0, # 0 for 'add'
562
+ 'quote_amount': 2.0
563
+ },
564
+ {'event_type': 'FeeCollected', # NEW
565
+ 'timestamp': 1729711290,
566
+ 'relative_ts': 100,
567
+ 'wallet_address': 'addrW1', # The recipient (e.g., dev wallet)
568
+ 'sol_amount': 0.1
569
+ },
570
+ {'event_type': 'TokenBurn', # NEW
571
+ 'timestamp': 1729711300,
572
+ 'relative_ts': 110,
573
+ 'wallet_address': 'addrW2', # The burner wallet
574
+ 'amount_pct_of_total_supply': 0.01, # 1% of supply
575
+ 'amount_tokens_burned': 10000000.0
576
+ },
577
+ {'event_type': 'SupplyLock', # NEW
578
+ 'timestamp': 1729711310,
579
+ 'relative_ts': 120,
580
+ 'wallet_address': 'addrW1', # The locker wallet
581
+ 'amount_pct_of_total_supply': 0.10, # 10% of supply
582
+ 'lock_duration': 2592000 # 30 days in seconds
583
+ },
584
+ {'event_type': 'HolderSnapshot', # NEW
585
+ 'timestamp': 1729711320,
586
+ 'relative_ts': 130,
587
+ # This is a pointer to the pre-computed embedding
588
+ # In a real system, this would be the index of the embedding
589
+ 'holders': [ # Raw holder data
590
+ {'wallet': 'addrW1', 'holding_pct': 0.15},
591
+ {'wallet': 'addrW2', 'holding_pct': 0.05},
592
+ # Add more mock holders if needed
593
+ ]
594
+ },
595
+ {'event_type': 'OnChain_Snapshot', # NEW
596
+ 'timestamp': 1729711320,
597
+ 'relative_ts': 130,
598
+ 'total_holders': 500,
599
+ 'smart_traders': 25,
600
+ 'kols': 3,
601
+ 'holder_growth_rate': 0.15,
602
+ 'top_10_holder_pct': 0.22,
603
+ 'sniper_holding_pct': 0.05,
604
+ 'rat_wallets_holding_pct': 0.02,
605
+ 'bundle_holding_pct': 0.01,
606
+ 'current_market_cap': 150000.0,
607
+ 'volume': 50000.0,
608
+ 'buy_count': 120,
609
+ 'sell_count': 80,
610
+ 'total_txns': 200,
611
+ 'global_fees_paid': 1.5
612
+ },
613
+ {'event_type': 'TrendingToken', # NEW
614
+ 'timestamp': 1729711330,
615
+ 'relative_ts': 140,
616
+ 'token_address': 'tknC', # The token that is trending
617
+ 'list_source_id': vocab.TRENDING_LIST_SOURCE_TO_ID['Phantom'],
618
+ 'timeframe_id': vocab.TRENDING_LIST_TIMEFRAME_TO_ID['1h'],
619
+ 'rank': 3
620
+ },
621
+ {'event_type': 'BoostedToken', # NEW
622
+ 'timestamp': 1729711340,
623
+ 'relative_ts': 150,
624
+ 'token_address': 'tknD', # The token that is boosted
625
+ 'total_boost_amount': 5000.0,
626
+ 'rank': 1
627
+ },
628
+ {'event_type': 'XQuoteTweet', # NEW
629
+ 'timestamp': 1729711380,
630
+ 'relative_ts': 190,
631
+ 'wallet_address': 'addrW3', # The quoter
632
+ 'quoter_text_emb_idx': pooler.get_idx('Wow, look at this! $TKA'),
633
+ 'original_author_wallet_address': 'addrW1', # The original author
634
+ 'original_post_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
635
+ 'original_post_media_emb_idx': pooler.get_idx(Image.new('RGB', (100,100), color='cyan'))
636
+ },
637
+ # --- NEW: Add special context tokens ---
638
+ {'event_type': 'MIDDLE', 'timestamp': 1729711500, 'relative_ts': 195},
639
+ {'event_type': 'PumpReply', # NEW
640
+ 'timestamp': 1729711390,
641
+ 'relative_ts': 200,
642
+ 'wallet_address': 'addrW2', # The user who replied
643
+ 'reply_text_emb_idx': pooler.get_idx('to the moon!')
644
+ },
645
+ {'event_type': 'DexBoost_Paid', # NEW
646
+ 'timestamp': 1729711400,
647
+ 'relative_ts': 210,
648
+ 'amount': 5.0, # e.g., 5 Boost
649
+ 'total_amount_on_token': 25.0 # 25 Boost Points
650
+ },
651
+ {'event_type': 'DexProfile_Updated', # NEW
652
+ 'timestamp': 1729711410,
653
+ 'relative_ts': 220,
654
+ 'has_changed_website_flag': True,
655
+ 'has_changed_twitter_flag': False,
656
+ 'has_changed_telegram_flag': True,
657
+ 'has_changed_description_flag': True,
658
+ # Pre-computed text embeddings
659
+ 'website_emb_idx': pooler.get_idx('new-token-website.com'),
660
+ 'twitter_link_emb_idx': pooler.get_idx('old_handle'), # No change, so old link
661
+ 'telegram_link_emb_idx': pooler.get_idx('new_tg_group'),
662
+ 'description_emb_idx': pooler.get_idx('This is the new and improved token description.')
663
+ },
664
+ {'event_type': 'AlphaGroup_Call', # NEW
665
+ 'timestamp': 1729711420,
666
+ 'relative_ts': 230,
667
+ 'group_id': vocab.ALPHA_GROUPS_TO_ID['Potion']
668
+ },
669
+ {'event_type': 'Channel_Call', # NEW
670
+ 'timestamp': 1729711430,
671
+ 'relative_ts': 240,
672
+ 'channel_id': vocab.CALL_CHANNELS_TO_ID['MarcosCalls']
673
+ },
674
+ {'event_type': 'RECENT', 'timestamp': 1729711510, 'relative_ts': 245},
675
+ {'event_type': 'CexListing', # NEW
676
+ 'timestamp': 1729711440,
677
+ 'relative_ts': 250,
678
+ 'exchange_id': vocab.EXCHANGES_TO_ID['mexc']
679
+ },
680
+ {'event_type': 'TikTok_Trending_Hashtag', # NEW
681
+ 'timestamp': 1729711450,
682
+ 'relative_ts': 260,
683
+ 'hashtag_name_emb_idx': pooler.get_idx('CryptoTok'),
684
+ 'rank': 5
685
+ },
686
+ {'event_type': 'XTrending_Hashtag', # NEW
687
+ 'timestamp': 1729711460,
688
+ 'relative_ts': 270,
689
+ 'hashtag_name_emb_idx': pooler.get_idx('SolanaMemes'),
690
+ 'rank': 2
691
+ },
692
+ {'event_type': 'ChainSnapshot', # NEW
693
+ 'timestamp': 1729711470,
694
+ 'relative_ts': 280,
695
+ 'native_token_price_usd': 150.75,
696
+ 'gas_fee': 0.00015 # Example gas fee
697
+ },
698
+ {'event_type': 'Lighthouse_Snapshot', # NEW
699
+ 'timestamp': 1729711480,
700
+ 'relative_ts': 290,
701
+ 'protocol_id': vocab.PROTOCOL_TO_ID['Pump V1'],
702
+ 'timeframe_id': vocab.LIGHTHOUSE_TIMEFRAME_TO_ID['1h'],
703
+ 'total_volume': 1.2e6,
704
+ 'total_transactions': 5000,
705
+ 'total_traders': 1200,
706
+ 'total_tokens_created': 85,
707
+ 'total_migrations': 70
708
+ },
709
+ {'event_type': 'Migrated', # NEW
710
+ 'timestamp': 1729711490,
711
+ 'relative_ts': 300,
712
+ 'protocol_id': vocab.PROTOCOL_TO_ID['Raydium CPMM']
713
+ },
714
+
715
+ ],
716
+ 'wallets': {
717
+ 'addrW1': {'profile': profile1, 'socials': social1, 'holdings': holdings1},
718
+ 'addrW2': {'profile': profile2, 'socials': social2, 'holdings': holdings2},
719
+ # --- NEW: Add wallet 3 data ---
720
+ 'addrW3': {
721
+ 'profile': {**profile2, 'wallet_address': 'addrW3'}, # Reuse profile2 but change address
722
+ 'socials': social3,
723
+ 'holdings': []
724
+ }
725
+ },
726
+ 'tokens': {
727
+ 'tknA': tokenA_data, # Main token
728
+ 'tknB': tokenB_data, # Quote token
729
+ 'tknC': tokenC_data, # Trending token
730
+ 'tknD': tokenD_data # Boosted token
731
+ },
732
+ # --- NEW: The pre-computed embedding pool is generated after collecting all items
733
+ 'embedding_pooler': pooler, # Pass the pooler to generate the tensor later
734
+
735
+ # --- NEW: Expanded graph_links to test all encoders ---
736
+ # --- FIXED: Removed useless logging fields as per user request ---
737
+ 'graph_links': {
738
+ 'TransferLink': {'links': [{'timestamp': 1729711205}], 'edges': [('addrW1', 'addrW2')]}, # Keep timestamp
739
+ 'BundleTradeLink': {'links': [{'timestamp': 1729711215}], 'edges': [('addrW1', 'addrW2')]}, # Keep timestamp
740
+ 'CopiedTradeLink': {'links': [
741
+ {'time_gap_on_buy_sec': 10, 'time_gap_on_sell_sec': 120, 'leader_pnl': 5.0, 'follower_pnl': 4.0, 'follower_buy_total': 100, 'follower_sell_total': 120}
742
+ ], 'edges': [('addrW1', 'addrW2')]},
743
+ 'CoordinatedActivityLink': {'links': [
744
+ {'time_gap_on_first_sec': 5, 'time_gap_on_second_sec': 8}
745
+ ], 'edges': [('addrW1', 'addrW2')]},
746
+ 'MintedLink': {'links': [
747
+ {'timestamp': 1729711200, 'buy_amount': 1e9}
748
+ ], 'edges': [('addrW1', 'tknA')]},
749
+ 'SnipedLink': {'links': [
750
+ {'rank': 1, 'sniped_amount': 5e8}
751
+ ], 'edges': [('addrW1', 'tknA')]},
752
+ 'LockedSupplyLink': {'links': [
753
+ {'amount': 1e10} # Only amount is needed
754
+ ], 'edges': [('addrW1', 'tknA')]},
755
+ 'BurnedLink': {'links': [
756
+ {'timestamp': 1729711300} # Only timestamp is needed
757
+ ], 'edges': [('addrW2', 'tknA')]},
758
+ 'ProvidedLiquidityLink': {'links': [
759
+ {'timestamp': 1729711250} # Only timestamp is needed
760
+ ], 'edges': [('addrW1', 'tknA')]},
761
+ 'WhaleOfLink': {'links': [
762
+ {} # Just the existence of the link is the feature
763
+ ], 'edges': [('addrW1', 'tknA')]},
764
+ 'TopTraderOfLink': {'links': [
765
+ {'pnl_at_creation': 50000.0} # Only PnL is needed
766
+ ], 'edges': [('addrW2', 'tknA')]}
767
+ },
768
+
769
+ # --- FIXED: Removed chart_segments dictionary ---
770
+ 'labels': torch.randn(self.num_outputs) if self.num_outputs > 0 else torch.zeros(0),
771
+ 'labels_mask': torch.ones(self.num_outputs) if self.num_outputs > 0 else torch.zeros(0)
772
+ }
773
+
774
+ print("Mock raw batch created.")
775
+
776
+ return item
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ce8d085fbecbf5108090a954e61db882a7ba0e7fddf4a57223d72e8ebf7713d
3
- size 1378
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df6cd6a1404a931ba4869d7eaf6e6a564e98b0a87f04d8edf8f6189aebfdeab4
3
+ size 20694
models/vocabulary.py CHANGED
@@ -186,3 +186,14 @@ EXCHANGES = [
186
  EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
187
  ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
  NUM_EXCHANGES = len(EXCHANGES)
 
 
 
 
 
 
 
 
 
 
 
 
186
  EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
187
  ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
  NUM_EXCHANGES = len(EXCHANGES)
189
+
190
+ # --- NEW: Return Class Thresholds ---
191
+ # Class 0: 0 - 3x
192
+ # Class 1: 3 - 10x
193
+ # Class 2: 10 - 20x
194
+ # Class 3: 20 - 100x
195
+ # Class 4: 100 - 10,000x
196
+ RETURN_THRESHOLDS = [0, 3, 10, 20, 100, 10000]
197
+
198
+ # Class 5: Manipulated (High return but suspicious metrics)
199
+ MANIPULATED_CLASS_ID = 5
pre_cache.sh CHANGED
@@ -3,6 +3,7 @@
3
 
4
  echo "Starting dataset caching..."
5
  python3 scripts/cache_dataset.py \
6
- --ohlc_stats_path "/workspace/apollo/data/ohlc_stats.npz"
 
7
 
8
  echo "Done!"
 
3
 
4
  echo "Starting dataset caching..."
5
  python3 scripts/cache_dataset.py \
6
+ --ohlc_stats_path "/workspace/apollo/data/ohlc_stats.npz" \
7
+ --max_samples 500
8
 
9
  echo "Done!"
scripts/analyze_distribution.py CHANGED
@@ -2,117 +2,536 @@
2
  import os
3
  import sys
4
  import datetime
5
- from dotenv import load_dotenv
6
  from clickhouse_driver import Client as ClickHouseClient
7
 
8
  # Add parent to path
9
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10
 
11
- load_dotenv()
 
12
 
13
  CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
14
  CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", 9000))
15
- CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER", "default")
 
16
  CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "")
17
  CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
18
 
19
- def analyze():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
- client = ClickHouseClient(
22
- host=CLICKHOUSE_HOST,
23
- port=CLICKHOUSE_PORT,
24
- user=CLICKHOUSE_USER,
25
- password=CLICKHOUSE_PASSWORD,
26
- database=CLICKHOUSE_DATABASE
27
- )
28
 
29
- print("--- Database Stats Analysis ---")
 
 
 
 
 
30
 
31
- # 1. Total Mints
32
- total_mints = client.execute("SELECT count() FROM mints")[0][0]
33
- print(f"Total Mints: {total_mints}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- if total_mints == 0:
36
- print("No data found.")
37
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # 2. Migrated Count (Proxy: launchpad != protocol OR check if in raydium pairs)
40
- # Assuming we can infer success or use token_metrics
41
- # Let's look at ATH Price distribution from token_metrics which is populated by the indexer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Check coverage of token_metrics
44
- total_metrics = client.execute("SELECT count() FROM token_metrics")[0][0]
45
- print(f"Tokens with Metrics: {total_metrics} (Coverage: {total_metrics/total_mints*100:.1f}%)")
46
-
47
- # 3. ATH Price Stats
48
- # We need to know what a '5x' looks like.
49
- # Since we don't have 'opening price' easily indexed for all, let's assume standard pump.fun open price ranges
50
- # or just look at Market Cap distribution if available, or just raw ATH price.
51
- # Pump.fun launch MC is usually ~$4-5k.
52
- # 5x = $25k MC.
53
- # 10x = $50k MC (Migration).
54
 
55
- # Let's check distribution of ath_price_usd * total_supply (Approx ATH Market Cap)
56
- # We need total_supply from tokens table.
 
 
 
57
 
58
- print("\n--- ATH Market Cap Distribution (Approx) ---")
59
- query_mc_buckets = """
60
- SELECT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  case
62
- when mc < 5000 then '1. < $5k (Fail)'
63
- when mc >= 5000 AND mc < 20000 then '2. $5k - $20k (2x-4x)'
64
- when mc >= 20000 AND mc < 60000 then '3. $20k - $60k (4x-12x)'
65
- when mc >= 60000 AND mc < 150000 then '4. $60k - $150k (12x-30x)'
66
- when mc >= 150000 then '5. > $150k (Mooners)'
67
  else 'Unknown'
68
- end as bucket,
69
- count() as cnt
70
- FROM (
71
- SELECT
72
- tm.ath_price_usd * (t.total_supply / pow(10, t.decimals)) as mc
73
- FROM token_metrics tm
74
- JOIN tokens t ON tm.token_address = t.token_address
75
- )
76
- GROUP BY bucket
77
- ORDER BY bucket
78
  """
79
- rows = client.execute(query_mc_buckets)
80
- for r in rows:
81
- print(f"{r[0]}: {r[1]} tokens")
82
 
83
- # 4. Volume Distribution
84
- # Helps define "High Volume Losers" vs "Garbage"
85
- print("\n--- Volume Distribution (Total USD) ---")
86
- # Aggregating all trades is heavy, let's do a sample or use token_metrics if it has volume (it doesn't seem to have volume sum in snippet)
87
- # We'll use a subquery on trades for a subset or just a heavy query if local
88
-
89
- query_vol_buckets = """
90
- SELECT
91
  case
92
- when vol < 100 then '1. < $100 (Dead)'
93
- when vol >= 100 AND vol < 1000 then '2. $100 - $1k (Tiny)'
94
- when vol >= 1000 AND vol < 10000 then '3. $1k - $10k (Noise)'
95
- when vol >= 10000 AND vol < 100000 then '4. $10k - $100k (Active)'
96
- when vol >= 100000 then '5. > $100k (High)'
97
  else 'Unknown'
98
- end as bucket,
99
- count() as cnt
100
- FROM (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  SELECT
102
- base_address, sum(price_usd * amount_decimal) as vol
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  FROM trades
104
  GROUP BY base_address
105
- )
106
- GROUP BY bucket
107
- ORDER BY bucket
108
  """
109
- # This might be slow on huge datasets.
110
- rows_vol = client.execute(query_vol_buckets)
111
- for r in rows_vol:
112
- print(f"{r[0]}: {r[1]} tokens")
 
 
 
 
 
 
 
 
113
 
114
- except Exception as e:
115
- print(f"Error: {e}")
116
 
117
  if __name__ == "__main__":
118
  analyze()
 
2
  import os
3
  import sys
4
  import datetime
 
5
  from clickhouse_driver import Client as ClickHouseClient
6
 
7
  # Add parent to path
8
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9
 
10
+ # removed dotenv
11
+ # load_dotenv()
12
 
13
  CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
14
  CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", 9000))
15
+ # .env shows empty user/pass, which implies 'default' user and empty password for ClickHouse
16
+ CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER", "default")
17
  CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "")
18
  CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
19
 
20
+ def get_client():
21
+ return ClickHouseClient(
22
+ host=CLICKHOUSE_HOST,
23
+ port=CLICKHOUSE_PORT,
24
+ user=CLICKHOUSE_USER,
25
+ password=CLICKHOUSE_PASSWORD,
26
+ database=CLICKHOUSE_DATABASE
27
+ )
28
+
29
+ def print_distribution_stats(client, metric_name, subquery, bucket_case_sql):
30
+ print(f"\n -> {metric_name}")
31
+
32
+ # 1. Print Basic Stats (Mean, Quantiles)
33
+ stats_query = f"""
34
+ SELECT
35
+ avg(val),
36
+ quantiles(0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99)(val),
37
+ min(val),
38
+ max(val),
39
+ count()
40
+ FROM (
41
+ {subquery}
42
+ )
43
+ """
44
  try:
45
+ stats = client.execute(stats_query)[0]
46
+ avg_val = stats[0]
47
+ qs = stats[1]
48
+ min_val = stats[2]
49
+ max_val = stats[3]
50
+ count_val = stats[4]
 
51
 
52
+ if count_val == 0:
53
+ print(" No data for this segment.")
54
+ return
55
+
56
+ print(f" Mean: {avg_val:.4f} | Min: {min_val:.4f} | Max: {max_val:.4f}")
57
+ print(f" Q: p10={qs[0]:.2f} p50={qs[2]:.2f} p90={qs[4]:.2f} p99={qs[6]:.2f}")
58
 
59
+ except Exception as e:
60
+ print(f" Error calculating stats: {e}")
61
+ return
62
+
63
+ # 2. Print Buckets
64
+ query = f"""
65
+ SELECT
66
+ {bucket_case_sql} as bucket,
67
+ count() as cnt
68
+ FROM (
69
+ {subquery}
70
+ )
71
+ GROUP BY bucket
72
+ ORDER BY bucket
73
+ """
74
+ try:
75
+ rows = client.execute(query)
76
+ # total_count used for pct is the count_val from stats
77
+ print(" Buckets:")
78
+ for r in rows:
79
+ pct = (r[1] / count_val * 100) if count_val > 0 else 0
80
+ print(f" {r[0]}: {r[1]} ({pct:.1f}%)")
81
+ except Exception as e:
82
+ print(f" Error calculating buckets: {e}")
83
+
84
+ def get_filtered_metric_query(inner_query, cohort_sql):
85
+ """
86
+ Wraps the inner metric query to only include tokens in the cohort.
87
+ Assumes inner_query returns 'base_address' (or aliased) and 'val'.
88
+ If the inner query returns 'token_address', it should be handled.
89
+ Most of our queries return 'base_address' (from trades) or 'token_address' (from token_metrics).
90
+ We will normalize to use 'base_address' via subquery alias if needed, but simplest is
91
+ to filter on the outer Select.
92
+ """
93
+ # We need to know if the inner query produces 'base_address' or 'token_address'
94
+ # Currently our queries produce 'base_address' mostly, except token_metrics ones.
95
+ # Let's standardize inner queries in the main loop to alias the key column to 'join_key'
96
+
97
+ return f"""
98
+ SELECT * FROM (
99
+ {inner_query}
100
+ ) WHERE join_key IN ({cohort_sql})
101
+ """
102
+
103
+ import numpy as np
104
+ from models.vocabulary import RETURN_THRESHOLDS, MANIPULATED_CLASS_ID
105
+
106
def get_return_class_map(client, return_thresholds=None, manipulated_class_id=None):
    """
    Build a mapping of token_address -> return-class id.

    Classifies every token by its ATH return multiple
    (ret = ath_price_usd / 0.000004) into the buckets defined by
    ``return_thresholds``, then applies Dynamic Outlier Detection:

      - For each "successful" class (1..N-1) compute the median fees,
        volume and holder count of its members.
      - Any member whose fees OR volume OR holders fall below 50% of its
        class median is downgraded to the "manipulated" class.

    Tokens returning more than 10,000x are excluded by the SQL HAVING
    clause and therefore absent from the returned map.

    Args:
        client: ClickHouse client exposing ``execute(sql) -> list[tuple]``.
        return_thresholds: Optional ascending list of bucket edges
            (defaults to module-level RETURN_THRESHOLDS). A list of
            length K defines K-1 classes; class i covers
            [edges[i], edges[i+1]).
        manipulated_class_id: Optional id assigned to downgraded outliers
            (defaults to module-level MANIPULATED_CLASS_ID).

    Returns:
        (final_map, thresholds):
            final_map: dict of token_address -> class id (int).
            thresholds: dict of class_id -> {'fees','vol','holders'}
                outlier cutoffs (50% of the class median).
    """
    if return_thresholds is None:
        return_thresholds = RETURN_THRESHOLDS
    if manipulated_class_id is None:
        manipulated_class_id = MANIPULATED_CLASS_ID

    print("  -> Fetching metrics for classification...")
    # SQL OPTIMIZATION:
    # 1. Use token_metrics for Volume/Holders (pre-computed snapshots).
    # 2. Pre-aggregate trades for Fees in a subquery to avoid a massive
    #    JOIN explosion.
    query = """
    SELECT
        tm.token_address,
        (argMax(tm.ath_price_usd, tm.updated_at) / 0.000004) as ret,
        any(tr.fees) as fees,
        argMax(tm.total_volume_usd, tm.updated_at) as vol,
        argMax(tm.unique_holders, tm.updated_at) as holders
    FROM token_metrics tm
    LEFT JOIN (
        SELECT
            base_address,
            sum(priority_fee + coin_creator_fee) as fees
        FROM trades
        GROUP BY base_address
    ) tr ON tm.token_address = tr.base_address
    GROUP BY tm.token_address
    HAVING ret <= 10000
    """
    rows = client.execute(query)

    # 1. Initial classification; also collect per-class stats for the
    #    median computation below.
    temp_map = {}  # token -> {'id', 'fees', 'vol', 'holders'}
    class_stats = {i: {'fees': [], 'vol': [], 'holders': []}
                   for i in range(len(return_thresholds) - 1)}

    print(f"  -> Initial classification of {len(rows)} tokens...")
    for token_addr, ret_val, fees, vol, holders in rows:
        fees = fees or 0.0
        vol = vol or 0.0
        holders = holders or 0

        class_id = -1
        for i in range(len(return_thresholds) - 1):
            if return_thresholds[i] <= ret_val < return_thresholds[i + 1]:
                class_id = i
                break

        if class_id != -1:
            temp_map[token_addr] = {'id': class_id, 'fees': fees, 'vol': vol, 'holders': holders}
            class_stats[class_id]['fees'].append(fees)
            class_stats[class_id]['vol'].append(vol)
            class_stats[class_id]['holders'].append(holders)

    # 2. Per-class medians -> outlier thresholds.
    # NOTE(review): earlier comments claimed "< 10% of median" but the code
    # has always used a 0.5 multiplier; documentation now reflects the
    # actual 50% behavior.
    thresholds = {}
    print("  -> Calculating Class Medians & Thresholds (< 50% of Median)...")
    # Class 0 (Garbage) is never checked/filtered.
    for i in range(1, len(return_thresholds) - 1):
        if class_stats[i]['fees']:
            med_fees = np.median(class_stats[i]['fees'])
            med_vol = np.median(class_stats[i]['vol'])
            med_holders = np.median(class_stats[i]['holders'])

            thresholds[i] = {
                'fees': med_fees * 0.5,
                'vol': med_vol * 0.5,
                'holders': med_holders * 0.5
            }
            print(f"     [Class {i}] Median Fees: {med_fees:.4f} (Thresh: {thresholds[i]['fees']:.4f}) | Median Vol: ${med_vol:.0f} (Thresh: ${thresholds[i]['vol']:.0f}) | Median Holders: {med_holders:.0f} (Thresh: {thresholds[i]['holders']:.0f})")
        else:
            thresholds[i] = {'fees': 0, 'vol': 0, 'holders': 0}

    # 3. Reclassification: downgrade suspicious members of classes > 0.
    print("  -> Detecting Manipulated Outliers...")
    final_map = {}
    manipulated_count = 0

    for token, data in temp_map.items():
        cid = data['id']
        # Only "successful" classes (id > 0) are candidates for downgrade.
        if cid > 0 and cid in thresholds:
            t = thresholds[cid]
            # If ANY metric is suspiciously low relative to the class median.
            is_manipulated = (data['fees'] < t['fees']) or (data['vol'] < t['vol']) or (data['holders'] < t['holders'])

            if is_manipulated:
                final_map[token] = manipulated_class_id
                manipulated_count += 1
            else:
                final_map[token] = cid
        else:
            final_map[token] = cid

    print(f"  -> Reclassification Complete. Identified {manipulated_count} manipulated tokens.")
    return final_map, thresholds
211
+
212
def analyze():
    """Print bucketed distribution statistics for each return-class segment.

    Pipeline:
      1. Classify every token via ``get_return_class_map`` (also yields the
         per-class outlier thresholds).
      2. For each segment, express its membership as a SQL cohort subquery
         (instead of an enormous ``IN (...)`` literal list) and print
         bucketed distributions for fees, volume, holders, sniper/bundle/
         dev/insider supply shares and time-to-ATH.
    """
    client = get_client()

    print("=== SEGMENTED DISTRIBUTION ANALYSIS ===")

    # 1. Classified map AND the thresholds used for outlier downgrades.
    class_map, thresholds = get_return_class_map(client)

    # 2. Invert the map (class_id -> tokens) for per-segment counts.
    segments_tokens = {}
    for token, cls in class_map.items():
        segments_tokens.setdefault(cls, []).append(token)

    labels = {
        0: "0. Garbage (< 3x)",
        1: "1. Profitable (3x-10x)",
        2: "2. Good (10x-20x)",
        3: "3. Hyped (20x-100x)",
        4: "4. PVE (100x-10kx)",
        MANIPULATED_CLASS_ID: "5. MANIPULATED (Fake Metrics)"
    }

    # Base cohort: compute (ret, fees, vol, holders) once for EVERY token;
    # each segment is then a WHERE clause over this source.
    # fees come from trades (sum); ret/vol/holders from token_metrics (argMax).
    base_cohort_source = """
        SELECT
            tm.token_address as join_key,
            (argMax(tm.ath_price_usd, tm.updated_at) / 0.000004) as ret,
            any(tr.fees) as fees,
            argMax(tm.total_volume_usd, tm.updated_at) as vol,
            argMax(tm.unique_holders, tm.updated_at) as holders
        FROM token_metrics tm
        LEFT JOIN (
            SELECT base_address, sum(priority_fee + coin_creator_fee) as fees
            FROM trades
            GROUP BY base_address
        ) tr ON tm.token_address = tr.base_address
        GROUP BY tm.token_address
    """

    # Wrap a per-metric inner query so only cohort members survive.
    # Hoisted out of the segment loop: it does not depend on the segment.
    def make_query(inner, cohort_subquery):
        return f"""
        SELECT * FROM (
            {inner}
        ) WHERE join_key IN (
            {cohort_subquery}
        )
        """

    # Shared bucket definition for all "% of supply" metrics.
    pct_buckets = """
        case
            when val < 1 then '1. < 1%'
            when val >= 1 AND val < 5 then '2. 1% - 5%'
            when val >= 5 AND val < 10 then '3. 5% - 10%'
            when val >= 10 AND val < 20 then '4. 10% - 20%'
            when val >= 20 AND val < 50 then '5. 20% - 50%'
            when val >= 50 then '6. > 50%'
            else 'Unknown'
        end
    """

    for cid in sorted(labels.keys()):
        label = labels[cid]
        tokens = segments_tokens.get(cid, [])
        count = len(tokens)

        print(f"\n\n==================================================")
        print(f"SEGMENT: {label}")
        print(f"==================================================")
        print(f"Tokens in segment: {count}")

        if count == 0:
            continue

        # Build the SQL condition matching this class id.
        condition = "1=0"  # default: match nothing

        if cid == 0:
            # Garbage: only tokens below the first "profitable" edge can be
            # class 0 (downgraded tokens go to the manipulated class instead).
            # Use RETURN_THRESHOLDS[1] rather than a hard-coded 3 so the SQL
            # stays consistent with the Python classification.
            condition = f"ret < {RETURN_THRESHOLDS[1]}"

        elif cid == MANIPULATED_CLASS_ID:
            # Manipulated: union of (class-k range AND outlier) for k = 1..4.
            sub_conds = []
            for k in range(1, 5):
                if k in thresholds:
                    t = thresholds[k]
                    lower = RETURN_THRESHOLDS[k]
                    upper = RETURN_THRESHOLDS[k + 1]
                    sub_conds.append(f"(ret >= {lower} AND ret < {upper} AND (fees < {t['fees']} OR vol < {t['vol']} OR holders < {t['holders']}))")
            if sub_conds:
                condition = " OR ".join(sub_conds)

        else:
            # Normal classes 1-4: in range AND NOT an outlier.
            if cid in thresholds:
                t = thresholds[cid]
                lower = RETURN_THRESHOLDS[cid]
                upper = RETURN_THRESHOLDS[cid + 1]
                condition = f"(ret >= {lower} AND ret < {upper} AND fees >= {t['fees']} AND vol >= {t['vol']} AND holders >= {t['holders']})"

        # Cohort: token keys satisfying the segment condition.
        cohort_sql = f"""
        SELECT join_key FROM (
            {base_cohort_source}
        ) WHERE {condition}
        """

        # --- Metric distributions ---

        # 1. Fees (SOL)
        fees_inner = """
            SELECT base_address as join_key, sum(priority_fee + coin_creator_fee) as val
            FROM trades
            GROUP BY base_address
        """
        fees_buckets = """
            case
                when val < 0.001 then '1. < 0.001 SOL'
                when val >= 0.001 AND val < 0.01 then '2. 0.001 - 0.01'
                when val >= 0.01 AND val < 0.1 then '3. 0.01 - 0.1'
                when val >= 0.1 AND val < 1 then '4. 0.1 - 1'
                when val >= 1 then '5. > 1 SOL'
                else 'Unknown'
            end
        """
        print_distribution_stats(client, "Total Fees (SOL)", make_query(fees_inner, cohort_sql), fees_buckets)

        # 2. Volume (USD)
        vol_inner = """
            SELECT base_address as join_key, sum(total_usd) as val
            FROM trades
            GROUP BY base_address
        """
        vol_buckets = """
            case
                when val < 1000 then '1. < $1k'
                when val >= 1000 AND val < 10000 then '2. $1k - $10k'
                when val >= 10000 AND val < 100000 then '3. $10k - $100k'
                when val >= 100000 AND val < 1000000 then '4. $100k - $1M'
                when val >= 1000000 then '5. > $1M'
                else 'Unknown'
            end
        """
        print_distribution_stats(client, "Total Volume (USD)", make_query(vol_inner, cohort_sql), vol_buckets)

        # 3. Unique Holders (latest token_metrics snapshot)
        holders_inner = """
            SELECT token_address as join_key, argMax(unique_holders, updated_at) as val
            FROM token_metrics
            GROUP BY token_address
        """
        holders_buckets = """
            case
                when val < 10 then '1. < 10'
                when val >= 10 AND val < 50 then '2. 10 - 50'
                when val >= 50 AND val < 100 then '3. 50 - 100'
                when val >= 100 AND val < 500 then '4. 100 - 500'
                when val >= 500 then '5. > 500'
                else 'Unknown'
            end
        """
        print_distribution_stats(client, "Unique Holders", make_query(holders_inner, cohort_sql), holders_buckets)

        # 4. Snipers % Supply — share bought by the first 70 unique buyers,
        #    ranked by (slot, tx index) of their first buy.
        snipers_inner = """
            SELECT
                m.base_address as join_key,
                (m.val / t.total_supply * 100) as val
            FROM (
                SELECT
                    base_address,
                    sumIf(base_amount, buyer_rank <= 70) as val
                FROM (
                    SELECT
                        base_address,
                        base_amount,
                        dense_rank() OVER (PARTITION BY base_address ORDER BY min_slot, min_idx) as buyer_rank
                    FROM (
                        SELECT
                            base_address,
                            maker,
                            min(slot) as min_slot,
                            min(transaction_index) as min_idx,
                            sum(base_amount) as base_amount
                        FROM trades
                        WHERE trade_type = 0
                        GROUP BY base_address, maker
                    )
                )
                GROUP BY base_address
            ) m
            JOIN (
                SELECT token_address, argMax(total_supply, updated_at) as total_supply
                FROM tokens
                GROUP BY token_address
            ) t ON m.base_address = t.token_address
            WHERE t.total_supply > 0
        """
        print_distribution_stats(client, "Snipers % Supply (Top 70)", make_query(snipers_inner, cohort_sql), pct_buckets)

        # 5. Bundled % Supply — buys landing in the token's very first slot.
        bundled_inner = """
            SELECT
                m.base_address as join_key,
                (m.val / t.total_supply * 100) as val
            FROM (
                SELECT
                    t.base_address,
                    sum(t.base_amount) as val
                FROM trades t
                JOIN (
                    SELECT base_address, min(slot) as min_slot
                    FROM trades
                    GROUP BY base_address
                ) m ON t.base_address = m.base_address AND t.slot = m.min_slot
                WHERE t.trade_type = 0
                GROUP BY t.base_address
            ) m
            JOIN (
                SELECT token_address, argMax(total_supply, updated_at) as total_supply
                FROM tokens
                GROUP BY token_address
            ) t ON m.base_address = t.token_address
            WHERE t.total_supply > 0
        """
        print_distribution_stats(client, "Bundled % Supply", make_query(bundled_inner, cohort_sql), pct_buckets)

        # 6. Dev Holding % Supply — creator wallet's current balance.
        dev_inner = """
            SELECT
                t.token_address as join_key,
                (wh.current_balance / (t.total_supply / pow(10, t.decimals)) * 100) as val
            FROM (
                SELECT token_address, argMax(creator_address, updated_at) as creator_address, argMax(total_supply, updated_at) as total_supply, argMax(decimals, updated_at) as decimals
                FROM tokens
                GROUP BY token_address
            ) t
            JOIN (
                SELECT mint_address, wallet_address, argMax(current_balance, updated_at) as current_balance
                FROM wallet_holdings
                GROUP BY mint_address, wallet_address
            ) wh ON t.token_address = wh.mint_address AND t.creator_address = wh.wallet_address
            WHERE t.total_supply > 0
        """
        print_distribution_stats(client, "Dev Holding % Supply", make_query(dev_inner, cohort_sql), pct_buckets)

        # 7. Insiders % Supply — wallets that never bought but received
        #    transfers (buys = 0 AND transfers_in > 0).
        insiders_inner = """
            SELECT
                wh.mint_address as join_key,
                (sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) as val
            FROM (
                SELECT mint_address, wallet_address, argMax(current_balance, updated_at) as current_balance
                FROM wallet_holdings
                GROUP BY mint_address, wallet_address
            ) wh
            JOIN (
                SELECT wallet_address,
                       argMax(total_buys_count, updated_at) as buys,
                       argMax(transfers_in_count, updated_at) as transfers,
                       argMax(spl_transfers_in_count, updated_at) as spl_transfers
                FROM wallet_profile_metrics
                GROUP BY wallet_address
            ) wpm ON wh.wallet_address = wpm.wallet_address
            JOIN (
                SELECT token_address, argMax(total_supply, updated_at) as total_supply, argMax(decimals, updated_at) as decimals
                FROM tokens
                GROUP BY token_address
            ) t ON wh.mint_address = t.token_address
            WHERE wpm.buys = 0 AND (wpm.transfers > 0 OR wpm.spl_transfers > 0) AND t.total_supply > 0
            GROUP BY wh.mint_address, t.total_supply, t.decimals
        """
        print_distribution_stats(client, "Insiders % Supply", make_query(insiders_inner, cohort_sql), pct_buckets)

        # 8. Time to ATH (seconds) — timestamp of the highest-priced trade
        #    minus the first trade's timestamp.
        time_ath_inner = """
            SELECT
                base_address as join_key,
                (argMax(timestamp, price_usd) - min(timestamp)) as val
            FROM trades
            GROUP BY base_address
        """
        time_ath_buckets = """
            case
                when val < 5 then '1. < 5s'
                when val >= 5 AND val < 30 then '2. 5s - 30s'
                when val >= 30 AND val < 60 then '3. 30s - 1m'
                when val >= 60 AND val < 300 then '4. 1m - 5m'
                when val >= 300 AND val < 3600 then '5. 5m - 1h'
                when val >= 3600 then '6. > 1h'
                else 'Unknown'
            end
        """
        print_distribution_stats(client, "Time to ATH (Seconds)", make_query(time_ath_inner, cohort_sql), time_ath_buckets)
534
 
 
 
535
 
536
  if __name__ == "__main__":
537
  analyze()
scripts/analyze_hyperparams.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import numpy as np
5
+ import argparse
6
+ from tqdm import tqdm
7
+ from datetime import datetime, timezone
8
+ from collections import defaultdict
9
+
10
+
11
+ # Add project root to path
12
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
13
+ from data.data_loader import OracleDataset, DataFetcher
14
+
15
+ import os
16
+ import sys
17
+ import numpy as np
18
+ import argparse
19
+ from tqdm import tqdm
20
+ from datetime import datetime, timezone
21
+ import collections
22
+
23
+ # Add project root to path
24
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
25
+ from data.data_loader import DataFetcher
26
+
27
+ import os
28
+ import sys
29
+ import numpy as np
30
+ import argparse
31
+ from tqdm import tqdm
32
+ from datetime import datetime, timezone
33
+ import collections
34
+ from dotenv import load_dotenv
35
+ from clickhouse_driver import Client as ClickHouseClient
36
+ from neo4j import GraphDatabase
37
+
38
+ # Add project root to path
39
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
40
+ from data.data_loader import DataFetcher
41
+
42
def parse_args(argv=None):
    """Parse CLI options for the hyperparameter calibration analysis.

    Args:
        argv: Optional list of argument strings; defaults to
            ``sys.argv[1:]``. Passing a list makes the function testable
            without touching the real command line.

    Returns:
        argparse.Namespace with ``max_samples`` (int, default 5000) and
        ``token_address`` (str or None).
    """
    parser = argparse.ArgumentParser(description="Analyze dataset to tune hyperparameters (Horizons, Seq Len)")
    parser.add_argument("--max_samples", type=int, default=5000, help="Max samples to analyze")
    parser.add_argument("--token_address", type=str, default=None, help="Specific token address to analyze")
    return parser.parse_args(argv)
47
+
48
def main():
    """Run the hyperparameter calibration analysis.

    Connects to ClickHouse/Neo4j, samples tokens (or one specific token),
    pulls each token's full trade history and prints percentile stats for:
      - token lifespan (mint -> last trade),
      - time to ATH (mint -> highest price print),
      - trade counts in the first 5/10/30/60 minutes (seq-len calibration),
    both for all tokens and for a "high activity" subset.
    """
    load_dotenv()
    args = parse_args()

    print("--- Hyperparameter Calibration Analysis (SQL) ---")

    # DB connections (env-driven with local defaults).
    ch_host = os.getenv("CLICKHOUSE_HOST", "localhost")
    ch_port = int(os.getenv("CLICKHOUSE_NATIVE_PORT", 9000))
    neo_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    neo_user = os.getenv("NEO4J_USER", "neo4j")
    neo_pass = os.getenv("NEO4J_PASSWORD", "password")

    print(f"Connecting to ClickHouse at {ch_host}:{ch_port}...")
    clickhouse_client = ClickHouseClient(host=ch_host, port=ch_port)

    print(f"Connecting to Neo4j at {neo_uri}...")
    neo4j_driver = GraphDatabase.driver(neo_uri, auth=(neo_user, neo_pass))

    # 1. Initialize DataFetcher
    fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
    print("DataFetcher initialized.")

    # 2. Fetch sample mints
    if args.token_address:
        print(f"Analyzing specific token: {args.token_address}")
        # Try to find the mint timestamp first.
        query = f"SELECT mint_address, timestamp FROM mints WHERE mint_address = '{args.token_address}'"
        mints = fetcher.db_client.execute(query)
        if not mints:
            print("Token not found in mints table. Trying to use first trade timestamp...")
            # Fallback if not in the mints table.
            q2 = f"SELECT base_address, min(timestamp) FROM trades WHERE base_address = '{args.token_address}' GROUP BY base_address"
            mints = fetcher.db_client.execute(q2)

        if not mints:
            print("Token not found in trades either (or no trades). Exiting.")
            return
    else:
        print(f"Fetching {args.max_samples} sample tokens...")
        query = f"""
        SELECT mint_address, timestamp FROM mints
        ORDER BY rand()
        LIMIT {args.max_samples}
        """
        mints = fetcher.db_client.execute(query)
        print(f"Fetched {len(mints)} tokens.")

    windows_to_test = [5, 10, 30, 60]  # minutes

    # BUGFIX: the original kept parallel lists (lifespans / time_to_ath /
    # counts) whose lengths could diverge — time_to_ath was appended only
    # when a positive price existed — which made subset indexing misaligned
    # (a risk the original comments acknowledged). One record per token
    # keeps every metric aligned.
    records = []  # {'lifespan_min', 'ath_min' (None if no priced trade), 'n_trades', 'counts': {w: int}}

    print(f"Analyzing trades for {len(mints)} tokens...")

    for mint_addr, mint_ts in tqdm(mints):
        try:
            if isinstance(mint_ts, datetime) and mint_ts.tzinfo is None:
                mint_ts = mint_ts.replace(tzinfo=timezone.utc)
            t0 = mint_ts.timestamp()

            # Fetch ALL trades for this token; only timestamp and price are
            # needed, so enrichment limits are disabled (0, 0, 0).
            now_ts = datetime.now(timezone.utc)
            trades, _, _ = fetcher.fetch_trades_for_token(mint_addr, now_ts, 0, 0, 0, full_history=True)

            if not trades:
                continue

            # Trades are usually sorted already, but make sure.
            trades.sort(key=lambda x: x['timestamp'])

            lifespan_min = (trades[-1]['timestamp'].timestamp() - t0) / 60.0

            # Time to ATH over trades with a positive (non-garbage) price.
            max_price = -1.0
            ath_ts = 0.0
            valid_trades = []
            for tr in trades:
                p = float(tr.get('price_usd', 0.0))
                if p > 0:
                    valid_trades.append(tr)
                    if p > max_price:
                        max_price = p
                        ath_ts = tr['timestamp'].timestamp()

            ath_min = ((ath_ts - t0) / 60.0) if max_price > 0 else None

            # Trade counts within each early window.
            counts_in_window = {w: 0 for w in windows_to_test}
            for tr in valid_trades:
                elapsed_min = (tr['timestamp'].timestamp() - t0) / 60.0
                for w in windows_to_test:
                    if elapsed_min <= w:
                        counts_in_window[w] += 1

            records.append({
                'lifespan_min': lifespan_min,
                'ath_min': ath_min,
                'n_trades': len(valid_trades),
                'counts': counts_in_window,
            })

        except Exception as e:
            print(f"Error processing {mint_addr}: {e}")
            import traceback
            traceback.print_exc()

    def print_stats(name, data):
        # Percentile summary for a list of numbers.
        if not data:
            print(f"{name}: No Data")
            return
        arr = np.array(data)
        p25, p50, p75, p90, p95, p99 = np.percentile(arr, [25, 50, 75, 90, 95, 99])
        print(f"[{name}]")
        print(f"  Mean: {np.mean(arr):.2f} | Median: {p50:.2f} | Max: {np.max(arr):.2f}")
        print(f"  25%: {p25:.2f} | 75%: {p75:.2f} | 90%: {p90:.2f} | 95%: {p95:.2f} | 99%: {p99:.2f}")

    print("\n" + "="*40)
    print("RESULTS (ALL TOKENS)")
    print("="*40)

    lifespans_min = [r['lifespan_min'] for r in records]
    time_to_ath_min = [r['ath_min'] for r in records if r['ath_min'] is not None]
    full_history_counts = [r['n_trades'] for r in records]

    print_stats("Token Lifespan (Minutes)", lifespans_min)
    print("\n")
    print_stats("Time to ATH (Minutes)", time_to_ath_min)

    print("\n" + "-"*20)
    print("SEQUENCE LENGTHS (Trades Only)")
    print("-"*20)

    print_stats("Full History Length", full_history_counts)

    for w in windows_to_test:
        print("\n")
        print_stats(f"Trades in First {w} Minutes", [r['counts'][w] for r in records])

    # --- High Activity Subset (correctly aligned via per-token records) ---
    print("\n" + "="*40)
    print("RESULTS (HIGH ACTIVITY SUBSET)")
    print("Filter: > 50 trades AND > 5 min lifespan")
    print("="*40)

    subset = [r for r in records if r['n_trades'] > 50 and r['lifespan_min'] > 5.0]

    if not subset:
        print("No high activity tokens found.")
    else:
        print(f"Found {len(subset)} high activity tokens out of {len(records)}.")

        print_stats("Subset: Token Lifespan (Minutes)", [r['lifespan_min'] for r in subset])
        print("\n")
        print_stats("Subset: Time to ATH (Minutes)", [r['ath_min'] for r in subset if r['ath_min'] is not None])
        print("\n")
        print_stats("Subset: Full History Length", [r['n_trades'] for r in subset])

        for w in windows_to_test:
            print("\n")
            print_stats(f"Subset: Trades in First {w} Min", [r['counts'][w] for r in subset])

    print("\nRecommendation Logic:")
    print("1. Horizons: Look at 'Time to ATH' p90 (or p90 of Subset).")
    print("2. Max Seq Len: Look at 'Trades in First X Minutes' (X ~= Max Horizon).")
253
+
254
+ if __name__ == "__main__":
255
+ main()
scripts/cache_dataset.py CHANGED
@@ -14,104 +14,130 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14
 
15
  from data.data_loader import OracleDataset
16
  from data.data_fetcher import DataFetcher
 
 
17
  from clickhouse_driver import Client as ClickHouseClient
18
  from neo4j import GraphDatabase
19
 
20
- # Load environment variables
21
- load_dotenv()
22
-
23
- # --- Configuration ---
24
- CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
25
- CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", 9000))
26
- CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER") or "default"
27
- CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD") or ""
28
- CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
29
-
30
- NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
31
- NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
32
- NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
33
-
34
- CACHE_DIR = os.getenv("CACHE_DIR", "/workspace/apollo/data/cache")
35
-
36
  def main():
37
- parser = argparse.ArgumentParser(description="Pre-cache dataset samples.")
38
- parser.add_argument("--max_samples", type=int, default=-1, help="Number of samples to cache. Set to -1 to process all available.")
39
-
40
- parser.add_argument("--start_date", type=str, default=None, help="Start date for filtering mints (YYYY-MM-DD).")
41
- parser.add_argument("--ohlc_stats_path", type=str, default=None, help="Path to OHLC stats JSON.")
42
- parser.add_argument("--min_trade_usd", type=float, default=0.0, help="Minimum trade USD value.")
 
 
 
 
 
 
 
 
 
43
 
44
  args = parser.parse_args()
45
 
46
- # Handle -1 as unlimited (None)
47
- max_samples = args.max_samples if args.max_samples != -1 else None
48
-
49
- # Create cache directory if it doesn't exist
50
- output_dir = Path(CACHE_DIR)
51
  output_dir.mkdir(parents=True, exist_ok=True)
52
 
53
  start_date_dt = None
54
  if args.start_date:
55
- start_date_dt = datetime.datetime.strptime(args.start_date, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
56
-
57
- # --- 1. Set up database connections ---
 
 
 
58
  try:
59
- print("INFO: Connecting to ClickHouse...")
60
- clickhouse_client = ClickHouseClient(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT, user=CLICKHOUSE_USER, password=CLICKHOUSE_PASSWORD, database=CLICKHOUSE_DATABASE)
61
- print("INFO: Connecting to Neo4j...")
62
- neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
63
- except Exception as e:
64
- print(f"ERROR: Failed to connect to databases: {e}", file=sys.stderr)
65
- sys.exit(1)
66
-
67
- # --- 2. Initialize DataFetcher and OracleDataset ---
68
- data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
69
 
70
- dataset = OracleDataset(
71
- data_fetcher=data_fetcher,
72
- max_samples=max_samples,
73
- start_date=start_date_dt,
74
-
75
- ohlc_stats_path=args.ohlc_stats_path,
76
- horizons_seconds=[60, 300, 900, 1800, 3600],
77
- quantiles=[0.5],
78
- min_trade_usd=args.min_trade_usd
79
- )
80
-
81
- if len(dataset) == 0:
82
- print("WARNING: Dataset initialization resulted in 0 samples. Nothing to cache.")
83
- return
84
-
85
- # --- 3. Iterate and cache each item ---
86
- print(f"INFO: Starting to generate and cache {len(dataset)} samples...")
87
- skipped_count = 0
88
- for i in tqdm(range(len(dataset)), desc="Caching samples"):
89
- try:
90
- item = dataset.__cacheitem__(i)
91
- if item is None:
92
- skipped_count += 1
93
- continue
94
- output_path = output_dir / f"sample_{i}.pt"
95
- torch.save(item, output_path)
96
- except Exception as e:
97
- error_msg = str(e)
98
- # If a FATAL error occurs (e.g. persistent DB auth failure), stop the script immediately.
99
- if "FATAL" in error_msg or "AuthenticationRateLimit" in error_msg:
100
- print(f"\nCRITICAL: Fatal error encountered processing sample {i}. Stopping execution.\nError: {e}", file=sys.stderr)
101
- sys.exit(1)
102
-
103
- print(f"\nERROR: Failed to generate or save sample {i} for mint '{dataset.sampled_mints[i]['mint_address']}'. Error: {e}", file=sys.stderr)
104
- # print trackback
105
- import traceback
106
- traceback.print_exc()
107
- skipped_count += 1
108
- continue
109
 
110
- print(f"\n--- Caching Complete ---\nSuccessfully cached: {len(dataset) - skipped_count} items.\nSkipped: {skipped_count} items.\nCache location: {output_dir.resolve()}")
111
-
112
- # --- 4. Close connections ---
113
- clickhouse_client.disconnect()
114
- neo4j_driver.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  if __name__ == "__main__":
117
  main()
 
14
 
15
  from data.data_loader import OracleDataset
16
  from data.data_fetcher import DataFetcher
17
+ from scripts.analyze_distribution import get_return_class_map
18
+
19
  from clickhouse_driver import Client as ClickHouseClient
20
  from neo4j import GraphDatabase
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
def main():
    """Generate and cache OracleDataset samples to disk.

    Connects to ClickHouse + Neo4j, classifies every token via
    ``get_return_class_map`` (tokens missing from the map — e.g. > 10,000x
    returns or missing metrics — are skipped), serializes each valid sample
    with ``torch.save`` and appends a ``{"file", "class_id"}`` line to
    ``metadata.jsonl``. Connections are always closed on exit.
    """
    # NOTE(review): json is used below but a top-of-file `import json` is
    # not visible in this module's import block — keep a local import so
    # the script cannot NameError at runtime.
    import json

    load_dotenv()

    parser = argparse.ArgumentParser(description="Cache dataset samples for training.")
    parser.add_argument("--output_dir", type=str, default="data/cache", help="Directory to save cached samples")
    parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to generate")
    parser.add_argument("--start_date", type=str, default=None, help="Start date (YYYY-MM-DD) for fetching new mints")
    parser.add_argument("--ohlc_stats_path", type=str, default="data/ohlc_stats.npz")
    parser.add_argument("--min_trade_usd", type=float, default=0.0)

    # DB Args (env-driven defaults)
    parser.add_argument("--clickhouse_host", type=str, default=os.getenv("CLICKHOUSE_HOST", "localhost"))
    parser.add_argument("--clickhouse_port", type=int, default=int(os.getenv("CLICKHOUSE_PORT", 9000)))
    parser.add_argument("--neo4j_uri", type=str, default=os.getenv("NEO4J_URI", "bolt://localhost:7687"))
    parser.add_argument("--neo4j_user", type=str, default=os.getenv("NEO4J_USER", "neo4j"))
    parser.add_argument("--neo4j_password", type=str, default=os.getenv("NEO4J_PASSWORD", "password"))

    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    start_date_dt = None
    if args.start_date:
        start_date_dt = datetime.datetime.strptime(args.start_date, "%Y-%m-%d")

    print(f"INFO: Initializing DB Connections...")
    clickhouse_client = ClickHouseClient(host=args.clickhouse_host, port=args.clickhouse_port)
    neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))

    try:
        # --- 2. Initialize DataFetcher and OracleDataset ---
        data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)

        # Pre-fetch the return-class map. Tokens absent from the map
        # (e.g. > 10,000x return) are INVALID and will be skipped below.
        print("INFO: Fetching Return Classification Map...")
        return_class_map, thresholds = get_return_class_map(clickhouse_client)
        print(f"INFO: Loaded {len(return_class_map)} valid classified tokens.")

        dataset = OracleDataset(
            data_fetcher=data_fetcher,
            max_samples=args.max_samples,
            start_date=start_date_dt,
            ohlc_stats_path=args.ohlc_stats_path,
            horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200],
            quantiles=[0.5],
            min_trade_usd=args.min_trade_usd
        )

        if len(dataset) == 0:
            print("WARNING: Dataset initialization resulted in 0 samples. Nothing to cache.")
            return

        # --- 3. Iterate and cache each item ---
        print(f"INFO: Starting to generate and cache {len(dataset)} samples...")

        metadata_path = output_dir / "metadata.jsonl"
        print(f"INFO: Writing metadata to {metadata_path}")

        skipped_count = 0
        filtered_count = 0
        cached_count = 0

        # Append mode: interrupted runs can resume without losing metadata.
        with open(metadata_path, 'a') as meta_f:
            for i in tqdm(range(len(dataset)), desc="Caching samples"):
                mint_addr = dataset.sampled_mints[i]['mint_address']

                # 1. Filter: tokens with no class (high return / missing
                #    metrics) are not cached at all.
                if mint_addr not in return_class_map:
                    filtered_count += 1
                    continue

                class_id = return_class_map[mint_addr]

                try:
                    item = dataset.__cacheitem__(i)
                    if item is None:
                        skipped_count += 1
                        continue

                    filename = f"sample_{i}.pt"
                    torch.save(item, output_dir / filename)

                    # One short JSON line per sample keeps IO overhead low.
                    meta_f.write(json.dumps({"file": filename, "class_id": class_id}) + "\n")

                    cached_count += 1

                except Exception as e:
                    error_msg = str(e)
                    # A FATAL error (e.g. persistent DB auth failure) must
                    # stop the whole run immediately.
                    if "FATAL" in error_msg or "AuthenticationRateLimit" in error_msg:
                        print(f"\nCRITICAL: Fatal error encountered processing sample {i}. Stopping execution.\nError: {e}", file=sys.stderr)
                        sys.exit(1)

                    print(f"\nERROR: Failed to generate or save sample {i} for mint '{mint_addr}'. Error: {e}", file=sys.stderr)
                    import traceback
                    traceback.print_exc()
                    skipped_count += 1
                    continue

        print(f"\n--- Caching Complete ---")
        print(f"Successfully cached: {cached_count} items.")
        print(f"Filtered (Invalid/High Return): {filtered_count} items.")
        print(f"Skipped (Errors/Empty): {skipped_count} items.")
        print(f"Cache location: {output_dir.resolve()}")
        print(f"Metadata location: {metadata_path.resolve()}")

    finally:
        # --- 4. Close connections ---
        clickhouse_client.disconnect()
        neo4j_driver.close()
141
 
142
  if __name__ == "__main__":
143
  main()
scripts/debug_db_counts.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2

import os
import sys

import clickhouse_connect
from dotenv import load_dotenv

# Load CLICKHOUSE_* settings from a local .env file, if present.
load_dotenv()


def check_max_trades(limit=5):
    """Print the top ``limit`` tokens by trade count from the ``trades`` table.

    Connection settings come from the CLICKHOUSE_HOST / CLICKHOUSE_HTTP_PORT
    environment variables. This is a non-fatal diagnostic: all failures are
    reported to stderr instead of raising.

    Args:
        limit: Number of top tokens to display (default 5, matching the
            original hard-coded behavior).
    """
    try:
        host = os.getenv("CLICKHOUSE_HOST")
        port = os.getenv("CLICKHOUSE_HTTP_PORT")
        # Fail with a clear message instead of int(None) raising a cryptic TypeError.
        if not host or not port:
            print(
                "Error: CLICKHOUSE_HOST and CLICKHOUSE_HTTP_PORT must be set "
                "(e.g. in .env).",
                file=sys.stderr,
            )
            return

        client = clickhouse_connect.get_client(
            host=host,
            port=int(port),
            secure=False,
        )

        print("Connected to ClickHouse.")

        # 1. Find the tokens with the most trades.
        # Full GROUP BY over the trades table; can be slow on large datasets.
        print("Querying max trade count per token (this might take a moment)...")
        # int() coercion keeps the interpolated LIMIT injection-safe.
        query = f"""
        SELECT base_address, count(*) as c
        FROM trades
        GROUP BY base_address
        ORDER BY c DESC
        LIMIT {int(limit)}
        """
        result = client.query(query)

        print(f"Top {int(limit)} Tokens by Trade Count:")
        for row in result.result_rows:
            print(f"Token: {row[0]}, Count: {row[1]}")

    except Exception as e:
        # Diagnostic script: report the failure to stderr rather than crash.
        print(f"Error: {e}", file=sys.stderr)


if __name__ == "__main__":
    check_max_trades()
t.json CHANGED
@@ -1,47 +1,11 @@
1
- "newPairs": {
2
 
3
- "fees": {
4
- "max": null,
5
- "min": null
6
- },
7
- "txns": {
8
- "max": null,
9
- "min": null
10
- },
11
- "bundle": {
12
- "max": null,
13
- "min": null
14
- },
15
- "volume": {
16
- "max": null,
17
- "min": null
18
- },
19
- "holders": {
20
- "max": null,
21
- "min": null
22
- },
23
- "numBuys": {
24
- "max": null,
25
- "min": null
26
- },
27
- "snipers": {
28
- "max": null,
29
- "min": null
30
- },
31
- "insiders": {
32
- "max": null,
33
- "min": null
34
- },
35
- "numSells": {
36
- "max": null,
37
- "min": null
38
- },
39
- "devHolding": {
40
- "max": null,
41
- "min": null
42
- },
43
- "top10Holders": {
44
- "max": null,
45
- "min": null
46
- },
47
- },
 
 
1
 
2
+ "TotalAggregetedFees"
3
+ "TotalSupplyBoughtByBundledTxns"
4
+ "TotalVolume"
5
+ "TotalUniqueHolders"
6
+ "TotalnumBuys"
7
+ "TotalSupplyBoughtBySnipers (first 70 unique wallets)"
8
+ "TotalSupplyHeldByInsiders"
9
+ "TotalnumSells"
10
+ "TotalDevHoldingSupply"
11
+ "totalSupplyHeldByTop10Holders"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hyperparameter Analysis & Recommendations
2
+ Objective
3
+ Determine data-driven values for --max_seq_len and --horizons_seconds to optimize model training.
4
+
5
+ Analysis Findings
6
+ 1. Trade Volume Distribution
7
+ General Population (Bias towards Rugs): 99% of tokens have fewer than 1,300 trades in their entire lifetime.
8
+ High-Activity Tokens (Successful Launches): Verified against token HWVY....
9
+ Total Trades: ~300,000.
10
+ First 60 Minutes: ~3,720 trades.
11
+ Rate: Approx. 60-100 trades/minute during the initial pump.
12
+ 2. Time-to-ATH (All-Time High)
13
+ Median: ~3 seconds (Immediate dump/failure).
14
+ 90th Percentile: ~2.6 minutes.
15
+ 99th Percentile: ~90 minutes.
16
+ Conclusion: A model needs to observe at least the first 90 minutes to capture the "peak" behavior of the most successful 1% of tokens.
17
+ Recommendations
18
+ Max Sequence Length (--max_seq_len)
19
+ Recommendation: 8192
20
+
21
+ Logic:
22
+ High-volume tokens generate ~3,700 trades in the first hour.
23
+ To cover the critical 90-minute window (Time-to-ATH 99th percentile) for a high-volume token: 3700 * 1.5 = 5550 trades.
24
+ Adding buffer for liquidity events and higher-intensity bursts: 8192 (next power of two above 5,550).
25
+ This length is sufficient to capture:
26
+ 2+ hours of data for high-activity tokens.
27
+ The entire lifecycle for >99% of all tokens.
28
+ Prediction Horizons (--horizons_seconds)
29
+ Recommendation: 30, 60, 300, 600, 1800, 3600, 7200 (30s, 1m, 5m, 10m, 30m, 1h, 2h)
30
+
31
+ Logic:
32
+ Short-term (30s - 5m): Crucial for immediate volatility and scalping predictions, especially given the median lifespan is extremely short.
33
+ Medium-term (10m - 30m): Captures the trend development for "standard" rugs (90th percentile < 3 min, but tails extend).
34
+ Long-term (1h - 2h): Essential for the 1% of successful tokens where ATH occurs around 90 minutes.
train.py CHANGED
@@ -22,7 +22,7 @@ except RuntimeError:
22
 
23
  import torch
24
  import torch.nn as nn
25
- from torch.utils.data import DataLoader
26
  from torch.optim import AdamW
27
 
28
  # --- Accelerate & Transformers ---
@@ -248,10 +248,28 @@ def main() -> None:
248
  if len(dataset) == 0:
249
  raise RuntimeError("Dataset is empty.")
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  dataloader = DataLoader(
252
  dataset,
253
  batch_size=batch_size,
254
- shuffle=bool(args.shuffle),
 
255
  num_workers=int(args.num_workers),
256
  pin_memory=bool(args.pin_memory),
257
  collate_fn=functools.partial(filtered_collate, collator)
 
22
 
23
  import torch
24
  import torch.nn as nn
25
+ from torch.utils.data import DataLoader, WeightedRandomSampler
26
  from torch.optim import AdamW
27
 
28
  # --- Accelerate & Transformers ---
 
248
  if len(dataset) == 0:
249
  raise RuntimeError("Dataset is empty.")
250
 
251
+ # --- NEW: Weighted Sampling Logic ---
252
+ sampler = None
253
+ shuffle = bool(args.shuffle)
254
+
255
+ # Check if dataset provides weights (from metadata.jsonl)
256
+ if hasattr(dataset, 'get_weights'):
257
+ weights = dataset.get_weights()
258
+ if weights is not None:
259
+ if shuffle:
260
+ logger.info("INFO: Class weights found. Using WeightedRandomSampler for balanced training.")
261
+ # Note: WeightedRandomSampler requires shuffle=False in DataLoader
262
+ # It draws samples with replacement by default.
263
+ sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)
264
+ shuffle = False
265
+ else:
266
+ logger.info("INFO: Weights found but shuffle=False. Ignoring weights (sequential mode).")
267
+
268
  dataloader = DataLoader(
269
  dataset,
270
  batch_size=batch_size,
271
+ shuffle=shuffle,
272
+ sampler=sampler,
273
  num_workers=int(args.num_workers),
274
  pin_memory=bool(args.pin_memory),
275
  collate_fn=functools.partial(filtered_collate, collator)
train.sh CHANGED
@@ -1,4 +1,4 @@
1
- /venv/main/bin/accelerate launch train.py \
2
  --epochs 10 \
3
  --batch_size 1 \
4
  --learning_rate 1e-4 \
@@ -11,8 +11,8 @@
11
  --tensorboard_dir runs/oracle \
12
  --checkpoint_dir checkpoints \
13
  --mixed_precision bf16 \
14
- --max_seq_len 4096 \
15
- --horizons_seconds 30 60 120 240 420 \
16
  --quantiles 0.1 0.5 0.9 \
17
  --ohlc_stats_path ./data/ohlc_stats.npz \
18
  --num_workers 4 \
 
1
+ accelerate launch train.py \
2
  --epochs 10 \
3
  --batch_size 1 \
4
  --learning_rate 1e-4 \
 
11
  --tensorboard_dir runs/oracle \
12
  --checkpoint_dir checkpoints \
13
  --mixed_precision bf16 \
14
+ --max_seq_len 8192 \
15
+ --horizons_seconds 60 180 300 600 1800 3600 7200 \
16
  --quantiles 0.1 0.5 0.9 \
17
  --ohlc_stats_path ./data/ohlc_stats.npz \
18
  --num_workers 4 \