# oracle/models/vocabulary.py
# zirobtc's picture
# Upload folder using huggingface_hub
# 7189688
# vocabulary.py
"""
Defines the vocabulary and mappings for categorical features.
"""
# --- Event Type Mappings ---
# Ordered event-type vocabulary. An event's integer ID is its position in
# this list, so inserting or reordering entries shifts the IDs that follow.
EVENT_NAMES = [
    "__PAD__",  # ID 0
    "Chart_Segment",
    "Mint",
    "Transfer",
    "LargeTransfer",
    "Trade",
    "Deployer_Trade",
    "SmartWallet_Trade",
    "LargeTrade",
    "PoolCreated",
    "LiquidityChange",
    "FeeCollected",
    "TokenBurn",
    "SupplyLock",
    "OnChain_Snapshot",
    "HolderSnapshot",
    "TrendingToken",
    "BoostedToken",
    "XPost",
    "XRetweet",
    "XReply",
    "XQuoteTweet",
    "PumpReply",
    "DexBoost_Paid",
    "DexProfile_Updated",
    "AlphaGroup_Call",
    "Channel_Call",
    "CexListing",
    "TikTok_Trending_Hashtag",
    "XTrending_Hashtag",
    "ChainSnapshot",
    "Lighthouse_Snapshot",
    "Migrated",
    "MIDDLE",
    "RECENT",
]
# Event name -> integer ID.
EVENT_TO_ID = {label: idx for idx, label in enumerate(EVENT_NAMES)}
# Integer ID -> event name (inverse of EVENT_TO_ID).
ID_TO_EVENT = dict(enumerate(EVENT_NAMES))
# Total number of event types, counting the "__PAD__" entry.
NUM_EVENT_TYPES = len(ID_TO_EVENT)
# --- Protocol Mappings ---
# Canonical protocol names; a protocol's ID is its index in this list,
# with "Unknown" occupying ID 0.
PROTOCOL_NAMES = [
    "Unknown",  # ID 0
    "Pump V1",  # ID 1
    "Pump AMM",  # ID 2
    "Bonk",  # ID 3
    "Raydium CPMM",  # ID 4
]
# Protocol name -> integer ID.
PROTOCOL_TO_ID = {label: idx for idx, label in enumerate(PROTOCOL_NAMES)}
# Integer ID -> protocol name.
ID_TO_PROTOCOL = dict(enumerate(PROTOCOL_NAMES))
# Number of protocol entries, "Unknown" included.
NUM_PROTOCOLS = len(ID_TO_PROTOCOL)
# --- Neo4J Link Type Mappings ---
# Link types taken from the Neo4j schema. A link type's ID is its index in
# this list; note there is no padding/unknown entry, so ID 0 is a real type.
LINK_TYPES = [
    "TransferLink",  # ID 0
    "TransferLinkToken",  # ID 1
    "BundleTradeLink",  # ID 2
    "CopiedTradeLink",  # ID 3
    "CoordinatedActivityLink",  # ID 4
    "MintedLink",  # ID 5
    "SnipedLink",  # ID 6
    "LockedSupplyLink",  # ID 7
    "BurnedLink",  # ID 8
    "ProvidedLiquidityLink",  # ID 9
    "WhaleOfLink",  # ID 10
    "TopTraderOfLink",  # ID 11
]
# Link type name -> integer ID.
LINK_TYPE_TO_ID = {label: idx for idx, label in enumerate(LINK_TYPES)}
# Integer ID -> link type name.
ID_TO_LINK_TYPE = dict(enumerate(LINK_TYPES))
# Number of distinct link types.
NUM_LINK_TYPES = len(ID_TO_LINK_TYPE)
# Maps each link name to a (node type, link name, node type) triplet.
# The link name is repeated as the middle element of its own triplet.
LINK_NAME_TO_TRIPLET = {
    # Wallet <-> Wallet Links
    **{
        link: ("wallet", link, "wallet")
        for link in (
            "TransferLink",
            "BundleTradeLink",
            "CopiedTradeLink",
            "CoordinatedActivityLink",
        )
    },
    # Wallet -> Token Links
    **{
        link: ("wallet", link, "token")
        for link in (
            "TransferLinkToken",
            "MintedLink",
            "SnipedLink",
            "LockedSupplyLink",
            "BurnedLink",
            "ProvidedLiquidityLink",
            "WhaleOfLink",
            "TopTraderOfLink",
        )
    },
}
# --- OHLC Interval Mappings ---
# Supported OHLC interval labels; the list index is the interval's ID.
OHLC_INTERVALS = ["Unknown", "1s", "30s"]  # IDs 0, 1, 2
# Interval label -> integer ID.
INTERVAL_TO_ID = {label: idx for idx, label in enumerate(OHLC_INTERVALS)}
# Integer ID -> interval label.
ID_TO_INTERVAL = dict(enumerate(OHLC_INTERVALS))
# Number of interval entries, "Unknown" included.
NUM_OHLC_INTERVALS = len(ID_TO_INTERVAL)
# --- DEX Platform Mappings ---
# Platform names; the list index is the platform's ID, "Unknown" is ID 0.
# NOTE(review): "OXK" and "Jupyter" look like possible misspellings of
# "OKX" and "Jupiter". They are kept verbatim because lookups depend on the
# exact strings -- confirm against the upstream data before renaming.
DEX_NAMES = [
    "Unknown",  # ID 0
    "Axiom",  # ID 1
    "Bullx",  # ID 2
    "OXK",  # ID 3
    "Trojan",  # ID 4
    "Jupyter",  # ID 5
]
# Platform name -> integer ID.
DEX_TO_ID = {label: idx for idx, label in enumerate(DEX_NAMES)}
# Integer ID -> platform name.
ID_TO_DEX = dict(enumerate(DEX_NAMES))
# Number of platform entries, "Unknown" included.
NUM_DEX_PLATFORMS = len(ID_TO_DEX)
# --- Trending List Source Mappings ---
# Sources of trending lists; the list index is the source's ID.
TRENDING_LIST_SOURCES = ["Unknown", "Phantom", "Dexscreener"]  # IDs 0, 1, 2
# Source name -> integer ID.
TRENDING_LIST_SOURCE_TO_ID = {
    label: idx for idx, label in enumerate(TRENDING_LIST_SOURCES)
}
# Integer ID -> source name.
ID_TO_TRENDING_LIST_SOURCE = dict(enumerate(TRENDING_LIST_SOURCES))
# Number of source entries, "Unknown" included.
NUM_TRENDING_LIST_SOURCES = len(ID_TO_TRENDING_LIST_SOURCE)
# --- Trending List Timeframe Mappings ---
# Timeframe labels for trending lists; the list index is the timeframe's ID.
TRENDING_LIST_TIMEFRAMES = ["Unknown", "5m", "1h", "24h"]  # IDs 0..3
# Timeframe label -> integer ID.
TRENDING_LIST_TIMEFRAME_TO_ID = {
    label: idx for idx, label in enumerate(TRENDING_LIST_TIMEFRAMES)
}
# Integer ID -> timeframe label.
ID_TO_TRENDING_LIST_TIMEFRAME = dict(enumerate(TRENDING_LIST_TIMEFRAMES))
# Number of timeframe entries, "Unknown" included.
NUM_TRENDING_LIST_TIMEFRAMES = len(ID_TO_TRENDING_LIST_TIMEFRAME)
# --- Lighthouse Snapshot Timeframe Mappings ---
# Timeframe labels for Lighthouse snapshots; the list index is the
# timeframe's ID, with "Unknown" at ID 0.
LIGHTHOUSE_TIMEFRAMES = [
    "Unknown",  # ID 0
    "5m",  # ID 1
    "1h",  # ID 2
    "6h",  # ID 3
    "24h",  # ID 4
]
# Timeframe label -> integer ID.
LIGHTHOUSE_TIMEFRAME_TO_ID = {name: i for i, name in enumerate(LIGHTHOUSE_TIMEFRAMES)}
# Integer ID -> timeframe label. Added so this vocabulary exposes the same
# forward/reverse/count trio as every other vocabulary in this module.
ID_TO_LIGHTHOUSE_TIMEFRAME = {i: name for i, name in enumerate(LIGHTHOUSE_TIMEFRAMES)}
# Number of timeframe entries, "Unknown" included.
NUM_LIGHTHOUSE_TIMEFRAMES = len(LIGHTHOUSE_TIMEFRAMES)
# --- TrackerEncoder Vocabularies ---
# Alpha Groups (Discord); the list index is the group's ID.
# NOTE: the fallback entry here is lowercase "unknown", unlike the
# capitalised "Unknown" used by the protocol/interval/DEX vocabularies.
ALPHA_GROUPS = ["unknown", "Potion", "Serenity", "Digi World"]  # IDs 0..3
# Group name -> integer ID.
ALPHA_GROUPS_TO_ID = {label: idx for idx, label in enumerate(ALPHA_GROUPS)}
# Integer ID -> group name.
ID_TO_ALPHA_GROUPS = dict(enumerate(ALPHA_GROUPS))
# Number of group entries, "unknown" included.
NUM_ALPHA_GROUPS = len(ID_TO_ALPHA_GROUPS)
# Call Channels (Telegram); the list index is the channel's ID.
CALL_CHANNELS = ["unknown", "MarcosCalls", "kobecalls", "DEGEMSCALLS"]  # IDs 0..3
# Channel name -> integer ID.
CALL_CHANNELS_TO_ID = {label: idx for idx, label in enumerate(CALL_CHANNELS)}
# Integer ID -> channel name.
ID_TO_CALL_CHANNELS = dict(enumerate(CALL_CHANNELS))
# Number of channel entries, "unknown" included.
NUM_CALL_CHANNELS = len(ID_TO_CALL_CHANNELS)
# CEX Exchanges; the list index is the exchange's ID.
EXCHANGES = [
    "unknown",  # ID 0
    "mexc",  # ID 1
    "weex",  # ID 2
    "binance",  # ID 3
    "kraken",  # ID 4
]
# Exchange name -> integer ID.
EXCHANGES_TO_ID = {label: idx for idx, label in enumerate(EXCHANGES)}
# Integer ID -> exchange name.
ID_TO_EXCHANGES = dict(enumerate(EXCHANGES))
# Number of exchange entries, "unknown" included.
NUM_EXCHANGES = len(ID_TO_EXCHANGES)
# Return-multiple bucket edges shared by the analysis/scoring scripts.
# The former 3x-10x bucket is split into 3x-5x and 5x-10x so that each
# bucket is less heterogeneous.
RETURN_THRESHOLDS = [0, 3, 5, 10, 20, 100, 10_000]
# One class per pair of adjacent edges, hence one fewer than the edge count.
NUM_RETURN_CLASSES = len(RETURN_THRESHOLDS) - 1
# "Manipulated" class (high return but suspicious metrics). Its ID is placed
# directly after the last return bucket; keep it after all return buckets.
MANIPULATED_CLASS_ID = NUM_RETURN_CLASSES