# vocabulary.py

"""
Defines the vocabulary and mappings for categorical features.

Each vocabulary is an ordered list of names. The integer ID of a name is its
position in that list, so the order of every list below is significant:
reordering or inserting entries in the middle changes the IDs of the entries
that follow. For each vocabulary the module exposes the name list, a
name -> ID dict, an ID -> name dict, and the vocabulary size.
"""

# --- Event Type Mappings ---
# '__PAD__' is first and therefore maps to ID 0.
EVENT_NAMES = [
    "__PAD__",
    "Chart_Segment",
    "Mint",
    "Transfer",
    "LargeTransfer",
    "Trade",
    "Deployer_Trade",
    "SmartWallet_Trade",
    "LargeTrade",
    "PoolCreated",
    "LiquidityChange",
    "FeeCollected",
    "TokenBurn",
    "SupplyLock",
    "OnChain_Snapshot",
    "HolderSnapshot",
    "TrendingToken",
    "BoostedToken",
    "XPost",
    "XRetweet",
    "XReply",
    "XQuoteTweet",
    "PumpReply",
    "DexBoost_Paid",
    "DexProfile_Updated",
    "AlphaGroup_Call",
    "Channel_Call",
    "CexListing",
    "TikTok_Trending_Hashtag",
    "XTrending_Hashtag",
    "ChainSnapshot",
    "Lighthouse_Snapshot",
    "Migrated",
    "MIDDLE",
    "RECENT",
]
EVENT_TO_ID = {name: i for i, name in enumerate(EVENT_NAMES)}
ID_TO_EVENT = {i: name for i, name in enumerate(EVENT_NAMES)}
NUM_EVENT_TYPES = len(EVENT_NAMES)

# --- Protocol Mappings ---
# The canonical list of protocol names ("Unknown" is ID 0).
PROTOCOL_NAMES = [
    "Unknown",
    "Pump V1",
    "Pump AMM",
    "Bonk",
    "Raydium CPMM",
]
PROTOCOL_TO_ID = {name: i for i, name in enumerate(PROTOCOL_NAMES)}
ID_TO_PROTOCOL = {i: name for i, name in enumerate(PROTOCOL_NAMES)}
NUM_PROTOCOLS = len(PROTOCOL_NAMES)

# --- Neo4j Link Type Mappings ---
LINK_TYPES = [
    "TransferLink",
    "TransferLinkToken",
    "BundleTradeLink",
    "CopiedTradeLink",
    "CoordinatedActivityLink",
    "MintedLink",
    "SnipedLink",
    "LockedSupplyLink",
    "BurnedLink",
    "ProvidedLiquidityLink",
    "WhaleOfLink",
    "TopTraderOfLink",
]
LINK_TYPE_TO_ID = {name: i for i, name in enumerate(LINK_TYPES)}
ID_TO_LINK_TYPE = {i: name for i, name in enumerate(LINK_TYPES)}
NUM_LINK_TYPES = len(LINK_TYPES)

# Maps each link name to a (source node type, link name, target node type)
# triplet. Every entry of LINK_TYPES has exactly one triplet here; keep the
# two in sync when adding a link type.
LINK_NAME_TO_TRIPLET = {
    # Wallet <-> Wallet Links
    "TransferLink": ("wallet", "TransferLink", "wallet"),
    "BundleTradeLink": ("wallet", "BundleTradeLink", "wallet"),
    "CopiedTradeLink": ("wallet", "CopiedTradeLink", "wallet"),
    "CoordinatedActivityLink": ("wallet", "CoordinatedActivityLink", "wallet"),
    # Wallet -> Token Links
    "TransferLinkToken": ("wallet", "TransferLinkToken", "token"),
    "MintedLink": ("wallet", "MintedLink", "token"),
    "SnipedLink": ("wallet", "SnipedLink", "token"),
    "LockedSupplyLink": ("wallet", "LockedSupplyLink", "token"),
    "BurnedLink": ("wallet", "BurnedLink", "token"),
    "ProvidedLiquidityLink": ("wallet", "ProvidedLiquidityLink", "token"),
    "WhaleOfLink": ("wallet", "WhaleOfLink", "token"),
    "TopTraderOfLink": ("wallet", "TopTraderOfLink", "token"),
}

# --- OHLC Interval Mappings ---
OHLC_INTERVALS = [
    "Unknown",  # ID 0
    "1s",  # ID 1
    "30s",  # ID 2
]
INTERVAL_TO_ID = {name: i for i, name in enumerate(OHLC_INTERVALS)}
ID_TO_INTERVAL = {i: name for i, name in enumerate(OHLC_INTERVALS)}
NUM_OHLC_INTERVALS = len(OHLC_INTERVALS)

# --- DEX Platform Mappings ---
# NOTE(review): "OXK" and "Jupyter" look like misspellings of "OKX" and
# "Jupiter". They are left unchanged because these strings are lookup keys;
# confirm against the upstream data before renaming.
DEX_NAMES = [
    "Unknown",
    "Axiom",
    "Bullx",
    "OXK",
    "Trojan",
    "Jupyter",
]
DEX_TO_ID = {name: i for i, name in enumerate(DEX_NAMES)}
ID_TO_DEX = {i: name for i, name in enumerate(DEX_NAMES)}
NUM_DEX_PLATFORMS = len(DEX_NAMES)

# --- Trending List Source Mappings ---
TRENDING_LIST_SOURCES = [
    "Unknown",
    "Phantom",
    "Dexscreener",
]
TRENDING_LIST_SOURCE_TO_ID = {
    name: i for i, name in enumerate(TRENDING_LIST_SOURCES)
}
ID_TO_TRENDING_LIST_SOURCE = {
    i: name for i, name in enumerate(TRENDING_LIST_SOURCES)
}
NUM_TRENDING_LIST_SOURCES = len(TRENDING_LIST_SOURCES)

# --- Trending List Timeframe Mappings ---
TRENDING_LIST_TIMEFRAMES = [
    "Unknown",
    "5m",
    "1h",
    "24h",
]
TRENDING_LIST_TIMEFRAME_TO_ID = {
    name: i for i, name in enumerate(TRENDING_LIST_TIMEFRAMES)
}
ID_TO_TRENDING_LIST_TIMEFRAME = {
    i: name for i, name in enumerate(TRENDING_LIST_TIMEFRAMES)
}
NUM_TRENDING_LIST_TIMEFRAMES = len(TRENDING_LIST_TIMEFRAMES)

# --- Lighthouse Snapshot Timeframe Mappings ---
LIGHTHOUSE_TIMEFRAMES = [
    "Unknown",
    "5m",
    "1h",
    "6h",
    "24h",
]
LIGHTHOUSE_TIMEFRAME_TO_ID = {
    name: i for i, name in enumerate(LIGHTHOUSE_TIMEFRAMES)
}
# Reverse mapping, provided for parity with every other vocabulary here.
ID_TO_LIGHTHOUSE_TIMEFRAME = {
    i: name for i, name in enumerate(LIGHTHOUSE_TIMEFRAMES)
}
NUM_LIGHTHOUSE_TIMEFRAMES = len(LIGHTHOUSE_TIMEFRAMES)

# --- TrackerEncoder Vocabularies ---
# NOTE: the placeholder entry in the three lists below is lowercase
# "unknown", unlike the capitalised "Unknown" used by the vocabularies above.

# Alpha Groups (Discord)
ALPHA_GROUPS = [
    "unknown",
    "Potion",
    "Serenity",
    "Digi World",
]
ALPHA_GROUPS_TO_ID = {name: i for i, name in enumerate(ALPHA_GROUPS)}
ID_TO_ALPHA_GROUPS = {i: name for i, name in enumerate(ALPHA_GROUPS)}
NUM_ALPHA_GROUPS = len(ALPHA_GROUPS)

# Call Channels (Telegram)
CALL_CHANNELS = [
    "unknown",
    "MarcosCalls",
    "kobecalls",
    "DEGEMSCALLS",
]
CALL_CHANNELS_TO_ID = {name: i for i, name in enumerate(CALL_CHANNELS)}
ID_TO_CALL_CHANNELS = {i: name for i, name in enumerate(CALL_CHANNELS)}
NUM_CALL_CHANNELS = len(CALL_CHANNELS)

# CEX Exchanges
EXCHANGES = [
    "unknown",
    "mexc",
    "weex",
    "binance",
    "kraken",
]
EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
NUM_EXCHANGES = len(EXCHANGES)

# --- Return Buckets ---
# Return buckets used across analysis/scoring scripts.
# Split 3x-10x into 3x-5x and 5x-10x to reduce within-bucket heterogeneity.
# The thresholds are bucket edges, so N edges define N - 1 classes.
RETURN_THRESHOLDS = [0, 3, 5, 10, 20, 100, 10000]
NUM_RETURN_CLASSES = len(RETURN_THRESHOLDS) - 1

# Manipulated (High return but suspicious metrics).
# Keep this as "after all return buckets": it takes the first ID past the
# regular return classes (0 .. NUM_RETURN_CLASSES - 1).
MANIPULATED_CLASS_ID = NUM_RETURN_CLASSES