zirobtc committed
Commit 9dd732c · Parent: 0e3516b

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +1 -0
  2. .ipynb_checkpoints/install-checkpoint.sh +26 -0
  3. log.log +2 -2
  4. metadata/INFORMATION_SCHEMA.sql +2 -0
  5. metadata/default.sql +2 -0
  6. metadata/information_schema.sql +2 -0
  7. metadata/system.sql +2 -0
  8. pre_cache.sh +0 -4
  9. preprocessed_configs/config.xml +70 -0
  10. scripts/analyze_data_distribution.py +207 -0
  11. scripts/analyze_distribution.py +118 -0
  12. scripts/cache_dataset.py +7 -4
  13. scripts/download_epoch_artifacts.py +1 -1
  14. status +3 -0
  15. store/0e3/0e3eb1d8-7d84-4fb4-9205-69aed69777eb/format_version.txt +1 -0
  16. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount.bin +3 -0
  17. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount.cmrk2 +0 -0
  18. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount_decimal.bin +3 -0
  19. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount_decimal.cmrk2 +0 -0
  20. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/checksums.txt +0 -0
  21. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/columns.txt +13 -0
  22. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/columns_substreams.txt +29 -0
  23. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/count.txt +1 -0
  24. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/default_compression_codec.txt +1 -0
  25. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.bin +3 -0
  26. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.cmrk2 +0 -0
  27. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.null.bin +3 -0
  28. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.null.cmrk2 +0 -0
  29. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/metadata_version.txt +1 -0
  30. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.bin +3 -0
  31. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.cmrk2 +0 -0
  32. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.size.bin +3 -0
  33. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.size.cmrk2 +0 -0
  34. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/primary.cidx +0 -0
  35. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/priority_fee.bin +3 -0
  36. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/priority_fee.cmrk2 +0 -0
  37. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/serialization.json +1 -0
  38. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/signature.cmrk2 +0 -0
  39. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/signature.size.bin +3 -0
  40. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/signature.size.cmrk2 +0 -0
  41. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/slot.bin +3 -0
  42. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/slot.cmrk2 +0 -0
  43. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source.cmrk2 +0 -0
  44. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source.size.bin +3 -0
  45. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source.size.cmrk2 +0 -0
  46. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.bin +3 -0
  47. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.cmrk2 +0 -0
  48. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.sparse.idx.bin +3 -0
  49. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.sparse.idx.cmrk2 +0 -0
  50. store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/success.bin +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 log.log filter=lfs diff=lfs merge=lfs -text
+store/74c/74c70007-cccd-4669-bfd4-e25f8348ad8c/all_1_35_2/primary.cidx filter=lfs diff=lfs merge=lfs -text
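The added line routes the new ClickHouse primary-index file through Git LFS, so only a pointer is stored in Git. A rough way to check which paths a set of LFS patterns would catch (a sketch; Python's fnmatch only approximates gitattributes matching, which has its own rules for '**' and precedence):

# Sketch: approximate .gitattributes LFS pattern matching with fnmatch.
# gitattributes semantics differ in detail; treat results as indicative only.
from fnmatch import fnmatch

lfs_patterns = [
    "*.zst",
    "*tfevents*",
    "log.log",
    "store/74c/74c70007-cccd-4669-bfd4-e25f8348ad8c/all_1_35_2/primary.cidx",
]

def tracked_by_lfs(path: str) -> bool:
    return any(fnmatch(path, pattern) for pattern in lfs_patterns)

print(tracked_by_lfs("store/74c/74c70007-cccd-4669-bfd4-e25f8348ad8c/all_1_35_2/primary.cidx"))  # True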
.ipynb_checkpoints/install-checkpoint.sh ADDED
@@ -0,0 +1,26 @@
+sudo apt update
+sudo apt install -y curl wget gnupg apt-transport-https ca-certificates dirmngr
+
+sudo apt update
+sudo apt install -y pkg-config libudev-dev
+
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source $HOME/.cargo/env
+
+# ClickHouse (add repo and install)
+sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
+echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
+sudo apt update
+sudo apt install -y clickhouse-server clickhouse-client
+
+# Neo4j (add repo and install)
+sudo wget -O - https://debian.neo4j.com/neotechnology.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/neo4j.gpg
+echo "deb [signed-by=/usr/share/keyrings/neo4j.gpg] https://debian.neo4j.com stable latest" | sudo tee -a /etc/apt/sources.list.d/neo4j.list
+sudo apt update
+sudo apt install -y neo4j
+
+# Start Neo4j (runs on bolt://localhost:7687)
+sudo neo4j-admin dbms set-initial-password neo4j123
+neo4j start
+
+clickhouse-server
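Once the script finishes, ClickHouse should be listening on its native port 9000 and Neo4j on bolt://localhost:7687. A minimal connectivity check (a sketch; it assumes the clickhouse-driver and neo4j Python packages are installed and the script's defaults are unchanged):

# Sketch: confirm both services installed above accept local connections.
# Assumes `pip install clickhouse-driver neo4j` and the install script's
# defaults (passwordless ClickHouse default user, Neo4j password "neo4j123").
from clickhouse_driver import Client
from neo4j import GraphDatabase

ch = Client(host="localhost", port=9000)
print("ClickHouse:", ch.execute("SELECT version()")[0][0])

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "neo4j123"))
with driver.session() as session:
    print("Neo4j:", session.run("RETURN 1 AS ok").single()["ok"])
driver.close()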
log.log CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2bfaace3cf2aadc0acf9e9714d8df00c44bc545db23c87e7497a7844ba3c98a9
-size 6115919
+oid sha256:9ce8d085fbecbf5108090a954e61db882a7ba0e7fddf4a57223d72e8ebf7713d
+size 1378
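log.log is tracked by Git LFS, so the diff above shows the pointer file (oid and size), not the log contents; the new pointer references a much smaller object (1,378 bytes versus ~6 MB). Reading such a pointer takes only a few lines (a sketch, standard library only):

# Sketch: parse a Git LFS pointer file into a dict of its key/value lines.
# Pointer files are tiny text files of the form shown in the diff above.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("log.log")
print(ptr["oid"], ptr["size"])  # e.g. sha256:9ce8d0... 1378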
metadata/INFORMATION_SCHEMA.sql ADDED
@@ -0,0 +1,2 @@
+ATTACH DATABASE INFORMATION_SCHEMA
+ENGINE = Memory
metadata/default.sql ADDED
@@ -0,0 +1,2 @@
+ATTACH DATABASE _ UUID '0f383396-21e2-451d-b7c5-287e41ad186e'
+ENGINE = Atomic
metadata/information_schema.sql ADDED
@@ -0,0 +1,2 @@
+ATTACH DATABASE information_schema
+ENGINE = Memory
metadata/system.sql ADDED
@@ -0,0 +1,2 @@
+ATTACH DATABASE _ UUID '248514d1-a5e3-431c-8452-15429f2d2c8c'
+ENGINE = Atomic
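These four metadata/*.sql files are ClickHouse's own on-disk records of its attached databases: two Memory-engine schema databases plus two Atomic databases whose names come from the file names (the `_` placeholder) and whose data lives under store/ keyed by UUID. To see the same mapping from a running server (a sketch using clickhouse-driver):

# Sketch: list attached databases with their engines and UUIDs,
# mirroring the ATTACH DATABASE statements in metadata/*.sql.
from clickhouse_driver import Client

client = Client(host="localhost", port=9000)
for name, engine, uuid in client.execute(
    "SELECT name, engine, uuid FROM system.databases"
):
    print(f"{name}: engine={engine} uuid={uuid}")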
pre_cache.sh CHANGED
@@ -1,12 +1,8 @@
 #!/bin/bash
 # Pre-caches the dataset for training
-# Usage: ./pre_cache.sh [max_samples]
-
-MAX_SAMPLES=${1:-100}
 
 echo "Starting dataset caching..."
 python3 scripts/cache_dataset.py \
-    --max_samples $MAX_SAMPLES \
     --ohlc_stats_path "/workspace/apollo/data/ohlc_stats.npz"
 
 echo "Done!"
preprocessed_configs/config.xml ADDED
@@ -0,0 +1,70 @@
+<!-- This file was generated automatically.
+     Do not edit it: it is likely to be discarded and generated again before it's read next time.
+     Files used to generate this file:
+       config.xml -->
+
+<!-- Config that is used when server is run without config file. -->
+<clickhouse>
+    <logger>
+        <level>trace</level>
+        <console>true</console>
+    </logger>
+
+    <http_port>8123</http_port>
+    <tcp_port>9000</tcp_port>
+    <mysql_port>9004</mysql_port>
+    <postgresql_port>9005</postgresql_port>
+
+    <path>./</path>
+
+    <mlock_executable>true</mlock_executable>
+
+    <send_crash_reports>
+        <enabled>true</enabled>
+        <send_logical_errors>true</send_logical_errors>
+        <endpoint>https://crash.clickhouse.com/</endpoint>
+    </send_crash_reports>
+
+    <http_options_response>
+        <header>
+            <name>Access-Control-Allow-Origin</name>
+            <value>*</value>
+        </header>
+        <header>
+            <name>Access-Control-Allow-Headers</name>
+            <value>origin, x-requested-with, x-clickhouse-format, x-clickhouse-user, x-clickhouse-key, Authorization</value>
+        </header>
+        <header>
+            <name>Access-Control-Allow-Methods</name>
+            <value>POST, GET, OPTIONS</value>
+        </header>
+        <header>
+            <name>Access-Control-Max-Age</name>
+            <value>86400</value>
+        </header>
+    </http_options_response>
+
+    <users>
+        <default>
+            <password/>
+
+            <networks>
+                <ip>::/0</ip>
+            </networks>
+
+            <profile>default</profile>
+            <quota>default</quota>
+
+            <access_management>1</access_management>
+            <named_collection_control>1</named_collection_control>
+        </default>
+    </users>
+
+    <profiles>
+        <default/>
+    </profiles>
+
+    <quotas>
+        <default/>
+    </quotas>
+</clickhouse>
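This is the stock configuration ClickHouse emits when started without a config file: HTTP on 8123, native TCP on 9000, MySQL/PostgreSQL wire protocols on 9004/9005, permissive CORS, and a passwordless default user open to all networks. A one-line liveness probe against the HTTP port (a sketch, standard library only):

# Sketch: ping the ClickHouse HTTP interface configured above (port 8123).
# /ping returns "Ok." when the server is up; no credentials are needed for
# the passwordless default user in this config.
from urllib.request import urlopen

with urlopen("http://localhost:8123/ping", timeout=5) as resp:
    print(resp.read().decode().strip())  # "Ok."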
scripts/analyze_data_distribution.py ADDED
@@ -0,0 +1,207 @@
+
+import os
+import sys
+import datetime
+import math
+from collections import defaultdict
+import statistics
+
+# Add parent directory to path to import modules
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from data.data_fetcher import DataFetcher
+from clickhouse_driver import Client as ClickHouseClient
+from neo4j import GraphDatabase
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# --- Configuration ---
+CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
+CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", 9000))
+CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER") or "default"
+CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD") or ""
+CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
+
+NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
+NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
+NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
+
+def get_percentile(data, p):
+    if not data:
+        return 0.0
+    data.sort()
+    k = (len(data) - 1) * p
+    f = math.floor(k)
+    c = math.ceil(k)
+    if f == c:
+        return data[int(k)]
+    d0 = data[int(f)]
+    d1 = data[int(c)]
+    return d0 + (d1 - d0) * (k - f)
+
+def main():
+    print("INFO: Connecting to Databases...")
+    try:
+        clickhouse_client = ClickHouseClient(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT, user=CLICKHOUSE_USER, password=CLICKHOUSE_PASSWORD, database=CLICKHOUSE_DATABASE)
+        # We don't strictly need Neo4j for these aggregate stats, but initializing DataFetcher might require it
+        # Actually we can just run raw SQL queries on Clickhouse as that's where the metrics are.
+    except Exception as e:
+        print(f"ERROR: Failed to connect to Clickhouse: {e}")
+        sys.exit(1)
+
+    print("INFO: Fetching Aggregate Statistics from ClickHouse...")
+    print("      This may take a moment depending on dataset size...")
+
+    # 1. Migration Statistics
+    print("INFO: Counting Total Tokens (Mints table)...")
+    res_total = clickhouse_client.execute("SELECT count() FROM mints")
+    total_tokens = res_total[0][0]
+
+    print("INFO: Counting Migrated Tokens (Migrations table)...")
+    res_migrated = clickhouse_client.execute("SELECT count() FROM migrations")
+    migrated_count = res_migrated[0][0]
+
+    print(f"\n--- General Population ---")
+    print(f"Total Tokens: {total_tokens}")
+    if total_tokens > 0:
+        print(f"Migrated: {migrated_count} ({migrated_count/total_tokens*100:.2f}%)")
+        print(f"Not Migrated: {total_tokens - migrated_count} ({(total_tokens - migrated_count)/total_tokens*100:.2f}%)")
+    else:
+        print("Migrated: 0 (0.00%)")
+
+
+    # 2. Volume & Market Cap Distribution (Peak)
+    # We'll fetch the ATH stats for all tokens to build histograms
+    print("\nINFO: Fetching metrics per token...")
+    query_metrics = """
+    SELECT
+        t.token_address,
+        max(tm.ath_price_usd) as peak_price,
+        max(tm.ath_price_usd * t.total_supply / pow(10, t.decimals)) as peak_mc_usd
+    FROM token_metrics tm
+    JOIN tokens t ON tm.token_address = t.token_address
+    GROUP BY t.token_address, t.total_supply, t.decimals
+    """
+    # Note: If token_metrics is huge, we might want to sample or do percentiles in SQL.
+    # For now, let's try SQL percentiles first to be efficient.
+
+    query_percentiles = """
+    SELECT
+        quantiles(0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99)(ath_price_usd * total_supply / pow(10, decimals)) as mc_quantiles
+    FROM (
+        SELECT
+            tm.token_address,
+            avg(tm.ath_price_usd) as ath_price_usd,
+            any(t.total_supply) as total_supply,
+            any(t.decimals) as decimals
+        FROM token_metrics tm
+        JOIN tokens t ON tm.token_address = t.token_address
+        GROUP BY tm.token_address
+    )
+    """
+    # Simplified query if the join is expensive or complex, but let's assume we can get Peak MC.
+    # Actually, simpler proxy: Let's look at `trades` to see volume per token.
+
+    print("INFO: Calculating Volume & ATH Distribution from token_metrics_latest...")
+    query_metrics = """
+    SELECT
+        total_volume_usd,
+        ath_price_usd
+    FROM token_metrics_latest
+    WHERE total_volume_usd > 0
+    """
+    rows = clickhouse_client.execute(query_metrics)
+
+    volumes = []
+    ath_prices = []
+
+    for r in rows:
+        volumes.append(float(r[0]))
+        ath_prices.append(float(r[1]))
+
+    if not volumes:
+        print("WARN: No metric data found in token_metrics_latest. Trying token_metrics...")
+        # Fallback to aggregation on token_metrics if latest is empty
+        query_fallback = """
+        SELECT
+            argMax(total_volume_usd, updated_at),
+            argMax(ath_price_usd, updated_at)
+        FROM token_metrics
+        GROUP BY token_address
+        """
+        rows = clickhouse_client.execute(query_fallback)
+        volumes = []
+        ath_prices = []
+        for r in rows:
+            volumes.append(float(r[0]))
+            ath_prices.append(float(r[1]))
+
+    if not volumes:
+        print("WARN: No metric data found. Exiting.")
+        return
+
+    volumes.sort()
+    ath_prices.sort()
+    n = len(volumes)
+
+
+    print("\n--- Volume USD Distribution (Per Token) ---")
+    print(f"Min: ${volumes[0]:.2f}")
+    print(f"10th %ile: ${get_percentile(volumes, 0.1):.2f}")
+    print(f"25th %ile: ${get_percentile(volumes, 0.25):.2f}")
+    print(f"50th %ile: ${get_percentile(volumes, 0.5):.2f} (Median)")
+    print(f"75th %ile: ${get_percentile(volumes, 0.75):.2f}")
+    print(f"90th %ile: ${get_percentile(volumes, 0.9):.2f}")
+    print(f"95th %ile: ${get_percentile(volumes, 0.95):.2f}")
+    print(f"99th %ile: ${get_percentile(volumes, 0.99):.2f}")
+    print(f"Max: ${volumes[-1]:.2f}")
+
+    # --- 3. Fees Distribution (Priority + Bribe) ---
+    print("\nINFO: Calculating Aggregated Fees per Token (Priority + Bribe)...")
+    query_fees = """
+    SELECT
+        base_address,
+        sum(priority_fee) + sum(bribe_fee) as total_fees_sol
+    FROM trades
+    GROUP BY base_address
+    HAVING total_fees_sol > 0
+    """
+    rows_fees = clickhouse_client.execute(query_fees)
+    fees = []
+    for r in rows_fees:
+        fees.append(float(r[1]))
+
+    if fees:
+        fees.sort()
+        print(f"\n--- Total Fees Spent (SOL) Distribution (Per Token) ---")
+        print(f"Min: {fees[0]:.4f} SOL")
+        print(f"50th %ile: {get_percentile(fees, 0.5):.4f} SOL")
+        print(f"75th %ile: {get_percentile(fees, 0.75):.4f} SOL")
+        print(f"90th %ile: {get_percentile(fees, 0.9):.4f} SOL")
+        print(f"95th %ile: {get_percentile(fees, 0.95):.4f} SOL")
+        print(f"Max: {fees[-1]:.4f} SOL")
+
+        count_low_fees = sum(1 for f in fees if f < 0.1)
+        count_mid_fees = sum(1 for f in fees if f >= 0.1 and f < 1.0)
+        count_high_fees = sum(1 for f in fees if f >= 1.0)
+
+        print(f"\n--- Fee Thresholds Analysis ---")
+        print(f"Tokens < 0.1 SOL Fees: {count_low_fees} ({count_low_fees/len(fees)*100:.1f}%)")
+        print(f"Tokens 0.1 - 1.0 SOL: {count_mid_fees} ({count_mid_fees/len(fees)*100:.1f}%)")
+        print(f"Tokens > 1.0 SOL Fees: {count_high_fees} ({count_high_fees/len(fees)*100:.1f}%)")
+    else:
+        print("WARN: No fee data found.")
+
+    print(f"\n--- Potential Thresholds Analysis ---")
+    count_under_1k = sum(1 for v in volumes if v < 1000)
+    count_over_20k = sum(1 for v in volumes if v > 20000)
+    count_over_500k = sum(1 for v in volumes if v > 500000)
+
+    print(f"Tokens < $1k Vol ('Instant Garbage'?): {count_under_1k} ({count_under_1k/n*100:.1f}%)")
+    print(f"Tokens > $20k Vol ('Contenders'?): {count_over_20k} ({count_over_20k/n*100:.1f}%)")
+    print(f"Tokens > $500k Vol ('Alpha'?): {count_over_500k} ({count_over_500k/n*100:.1f}%)")
+
+if __name__ == "__main__":
+    main()
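get_percentile interpolates linearly between the two nearest order statistics and sorts its input in place as a side effect. A standalone check of the arithmetic (values are made up):

# Sketch: sanity-check the linear-interpolation percentile used above.
# Mirrors get_percentile from the script; data values are hypothetical.
import math

def get_percentile(data, p):
    if not data:
        return 0.0
    data.sort()
    k = (len(data) - 1) * p
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return data[int(k)]
    return data[int(f)] + (data[int(c)] - data[int(f)]) * (k - f)

vals = [10.0, 20.0, 30.0, 40.0]
print(get_percentile(vals, 0.5))   # 25.0: midway between 20 and 30
print(get_percentile(vals, 0.25))  # 17.5: k = 0.75, between 10 and 20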
scripts/analyze_distribution.py ADDED
@@ -0,0 +1,118 @@
+
+import os
+import sys
+import datetime
+from dotenv import load_dotenv
+from clickhouse_driver import Client as ClickHouseClient
+
+# Add parent to path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+load_dotenv()
+
+CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
+CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", 9000))
+CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER", "default")
+CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "")
+CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
+
+def analyze():
+    try:
+        client = ClickHouseClient(
+            host=CLICKHOUSE_HOST,
+            port=CLICKHOUSE_PORT,
+            user=CLICKHOUSE_USER,
+            password=CLICKHOUSE_PASSWORD,
+            database=CLICKHOUSE_DATABASE
+        )
+
+        print("--- Database Stats Analysis ---")
+
+        # 1. Total Mints
+        total_mints = client.execute("SELECT count() FROM mints")[0][0]
+        print(f"Total Mints: {total_mints}")
+
+        if total_mints == 0:
+            print("No data found.")
+            return
+
+        # 2. Migrated Count (Proxy: launchpad != protocol OR check if in raydium pairs)
+        # Assuming we can infer success or use token_metrics
+        # Let's look at ATH Price distribution from token_metrics which is populated by the indexer
+
+        # Check coverage of token_metrics
+        total_metrics = client.execute("SELECT count() FROM token_metrics")[0][0]
+        print(f"Tokens with Metrics: {total_metrics} (Coverage: {total_metrics/total_mints*100:.1f}%)")
+
+        # 3. ATH Price Stats
+        # We need to know what a '5x' looks like.
+        # Since we don't have 'opening price' easily indexed for all, let's assume standard pump.fun open price ranges
+        # or just look at Market Cap distribution if available, or just raw ATH price.
+        # Pump.fun launch MC is usually ~$4-5k.
+        # 5x = $25k MC.
+        # 10x = $50k MC (Migration).
+
+        # Let's check distribution of ath_price_usd * total_supply (Approx ATH Market Cap)
+        # We need total_supply from tokens table.
+
+        print("\n--- ATH Market Cap Distribution (Approx) ---")
+        query_mc_buckets = """
+        SELECT
+            case
+                when mc < 5000 then '1. < $5k (Fail)'
+                when mc >= 5000 AND mc < 20000 then '2. $5k - $20k (2x-4x)'
+                when mc >= 20000 AND mc < 60000 then '3. $20k - $60k (4x-12x)'
+                when mc >= 60000 AND mc < 150000 then '4. $60k - $150k (12x-30x)'
+                when mc >= 150000 then '5. > $150k (Mooners)'
+                else 'Unknown'
+            end as bucket,
+            count() as cnt
+        FROM (
+            SELECT
+                tm.ath_price_usd * (t.total_supply / pow(10, t.decimals)) as mc
+            FROM token_metrics tm
+            JOIN tokens t ON tm.token_address = t.token_address
+        )
+        GROUP BY bucket
+        ORDER BY bucket
+        """
+        rows = client.execute(query_mc_buckets)
+        for r in rows:
+            print(f"{r[0]}: {r[1]} tokens")
+
+        # 4. Volume Distribution
+        # Helps define "High Volume Losers" vs "Garbage"
+        print("\n--- Volume Distribution (Total USD) ---")
+        # Aggregating all trades is heavy, let's do a sample or use token_metrics if it has volume (it doesn't seem to have volume sum in snippet)
+        # We'll use a subquery on trades for a subset or just a heavy query if local
+
+        query_vol_buckets = """
+        SELECT
+            case
+                when vol < 100 then '1. < $100 (Dead)'
+                when vol >= 100 AND vol < 1000 then '2. $100 - $1k (Tiny)'
+                when vol >= 1000 AND vol < 10000 then '3. $1k - $10k (Noise)'
+                when vol >= 10000 AND vol < 100000 then '4. $10k - $100k (Active)'
+                when vol >= 100000 then '5. > $100k (High)'
+                else 'Unknown'
+            end as bucket,
+            count() as cnt
+        FROM (
+            SELECT
+                base_address, sum(price_usd * amount_decimal) as vol
+            FROM trades
+            GROUP BY base_address
+        )
+        GROUP BY bucket
+        ORDER BY bucket
+        """
+        # This might be slow on huge datasets.
+        rows_vol = client.execute(query_vol_buckets)
+        for r in rows_vol:
+            print(f"{r[0]}: {r[1]} tokens")
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    analyze()
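The script's own comments flag the full-table trades aggregation as potentially slow. One hedged mitigation is to bound the inner scan to a recent window before bucketing. The sketch below assumes trades has a DateTime column named timestamp (hypothetical here; substitute the table's real time column) and reuses the client from analyze():

# Sketch: cut the heavy full-table trades scan down to a recent window.
# The `timestamp` column on trades is an assumption, not confirmed by this
# commit; ClickHouse's multiIf replaces the CASE chain for brevity.
query_vol_recent = """
SELECT
    multiIf(vol < 100, '1. < $100 (Dead)',
            vol < 1000, '2. $100 - $1k (Tiny)',
            vol < 10000, '3. $1k - $10k (Noise)',
            vol < 100000, '4. $10k - $100k (Active)',
            '5. > $100k (High)') AS bucket,
    count() AS cnt
FROM (
    SELECT base_address, sum(price_usd * amount_decimal) AS vol
    FROM trades
    WHERE timestamp >= now() - INTERVAL 7 DAY
    GROUP BY base_address
)
GROUP BY bucket
ORDER BY bucket
"""
for bucket, cnt in client.execute(query_vol_recent):
    print(f"{bucket}: {cnt} tokens")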
scripts/cache_dataset.py CHANGED
@@ -35,13 +35,16 @@ CACHE_DIR = os.getenv("CACHE_DIR", "/workspace/apollo/data/cache")
 
 def main():
     parser = argparse.ArgumentParser(description="Pre-cache dataset samples.")
-    parser.add_argument("--max_samples", type=int, default=100, help="Number of samples to cache.")
-    parser.add_argument("--t_cutoff_seconds", type=int, default=60, help="Deprecated; cutoff is randomized at training time.")
+    parser.add_argument("--max_samples", type=int, default=-1, help="Number of samples to cache. Set to -1 to process all available.")
+
     parser.add_argument("--start_date", type=str, default=None, help="Start date for filtering mints (YYYY-MM-DD).")
     parser.add_argument("--ohlc_stats_path", type=str, default=None, help="Path to OHLC stats JSON.")
     parser.add_argument("--min_trade_usd", type=float, default=0.0, help="Minimum trade USD value.")
 
     args = parser.parse_args()
+
+    # Handle -1 as unlimited (None)
+    max_samples = args.max_samples if args.max_samples != -1 else None
 
     # Create cache directory if it doesn't exist
     output_dir = Path(CACHE_DIR)
@@ -66,9 +69,9 @@ def main():
 
     dataset = OracleDataset(
         data_fetcher=data_fetcher,
-        max_samples=args.max_samples,
+        max_samples=max_samples,
         start_date=start_date_dt,
-        t_cutoff_seconds=args.t_cutoff_seconds,
+
         ohlc_stats_path=args.ohlc_stats_path,
         horizons_seconds=[60, 300, 900, 1800, 3600],
         quantiles=[0.5],
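The net effect of this change: --max_samples now defaults to -1, which main() maps to None so OracleDataset sees no limit, and the deprecated --t_cutoff_seconds flag is removed (matching its removal from pre_cache.sh above). The sentinel pattern in isolation (a standalone sketch, not the project's code):

# Sketch: the -1 -> None "unlimited" sentinel used by cache_dataset.py.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--max_samples", type=int, default=-1,
                    help="Number of samples to cache; -1 means all available.")
args = parser.parse_args(["--max_samples", "-1"])

# Downstream code can then treat None as "no limit".
max_samples = None if args.max_samples == -1 else args.max_samples
print(max_samples)  # None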
scripts/download_epoch_artifacts.py CHANGED
@@ -54,7 +54,7 @@ def build_patterns(epoch: int, skip_clickhouse: bool = False) -> List[str]:
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Download epoch artifacts from Hugging Face.")
-    parser.add_argument("--epoch", type=int, required=False, help="Epoch number to download (e.g., 851)", default=851)
+    parser.add_argument("--epoch", type=int, required=False, help="Epoch number to download (e.g., 844)", default=844)
     parser.add_argument("-c", "--skip-clickhouse", action="store_true", help="Download only the Neo4j dump")
     parser.add_argument(
         "--token",
status ADDED
@@ -0,0 +1,3 @@
+PID: 7085
+Started at: 2026-01-26 07:54:26
+Revision: 54508
store/0e3/0e3eb1d8-7d84-4fb4-9205-69aed69777eb/format_version.txt ADDED
@@ -0,0 +1 @@
+1
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78741c5d6f64855c8216af8f40e1c753e5164c9a8c641f083dbd715edb01c8fe
+size 7420410
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount.cmrk2 ADDED
Binary file (481 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount_decimal.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd99b3e6239e67cde4e32b2341f0ce61e08b8b3a29dfaa278cdf6b14b2f6478e
+size 8334598
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/amount_decimal.cmrk2 ADDED
Binary file (485 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/checksums.txt ADDED
Binary file (1.65 kB).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/columns.txt ADDED
@@ -0,0 +1,13 @@
+columns format version: 1
+11 columns:
+`timestamp` DateTime('UTC')
+`signature` String
+`slot` UInt64
+`success` Bool
+`error` Nullable(String)
+`priority_fee` Float64
+`mint_address` String
+`source` String
+`amount` UInt64
+`amount_decimal` Float64
+`source_balance` Float64
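columns.txt is a plain-text manifest of the part's schema: a version line, a column count, then one backtick-quoted name and type per line. A parsing sketch based only on the format visible above (other ClickHouse versions may differ):

# Sketch: parse a MergeTree part's columns.txt into (name, type) pairs.
# Based only on the layout shown in this diff.
def parse_columns_txt(path: str):
    with open(path) as f:
        lines = f.read().splitlines()
    # lines[0]: "columns format version: 1"; lines[1]: "11 columns:"
    cols = []
    for line in lines[2:]:
        name, _, ctype = line.partition(" ")
        cols.append((name.strip("`"), ctype))
    return cols

for name, ctype in parse_columns_txt("columns.txt"):
    print(f"{name}: {ctype}")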
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/columns_substreams.txt ADDED
@@ -0,0 +1,29 @@
+columns substreams version: 1
+11 columns:
+1 substreams for column `timestamp`:
+timestamp
+2 substreams for column `signature`:
+signature.size
+signature
+1 substreams for column `slot`:
+slot
+1 substreams for column `success`:
+success
+2 substreams for column `error`:
+error.null
+error
+1 substreams for column `priority_fee`:
+priority_fee
+2 substreams for column `mint_address`:
+mint_address.size
+mint_address
+2 substreams for column `source`:
+source.size
+source
+1 substreams for column `amount`:
+amount
+1 substreams for column `amount_decimal`:
+amount_decimal
+2 substreams for column `source_balance`:
+source_balance.sparse.idx
+source_balance
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/count.txt ADDED
@@ -0,0 +1 @@
+1100993
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/default_compression_codec.txt ADDED
@@ -0,0 +1 @@
+CODEC(LZ4)
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5a412041fb3275348b17f4629e30874227e064b7a99ee7f75443272cf57146b
+size 19714
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.cmrk2 ADDED
Binary file (404 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.null.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29b4b54b17151eab8c63c60abc03f6d36e4db9a99fb65adec928a44f5385a538
+size 12763
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/error.null.cmrk2 ADDED
Binary file (251 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/metadata_version.txt ADDED
@@ -0,0 +1 @@
+0
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3005ec831972d8b5735427bb3639961a32cdf98d653d99493498723f9bda84cf
+size 2541548
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.cmrk2 ADDED
Binary file (413 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.size.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ce10c4fce08c143dfc9dac8c83239708a415e01b451b773935ba3957b6949eb
+size 46913
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/mint_address.size.cmrk2 ADDED
Binary file (346 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/primary.cidx ADDED
Binary file (9.68 kB).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/priority_fee.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d254e8e5ef97247f7d542374e73cc0b94a8df3c949812ab6874cbb3a50875600
+size 472876
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/priority_fee.cmrk2 ADDED
Binary file (364 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/serialization.json ADDED
@@ -0,0 +1 @@
+{"columns":[{"kind":"Default","name":"amount","num_defaults":12050,"num_rows":1100993},{"kind":"Default","name":"amount_decimal","num_defaults":12050,"num_rows":1100993},{"kind":"Default","name":"mint_address","num_defaults":0,"num_rows":1100993},{"kind":"Default","name":"priority_fee","num_defaults":603011,"num_rows":1100993},{"kind":"Default","name":"signature","num_defaults":0,"num_rows":1100993},{"kind":"Default","name":"slot","num_defaults":0,"num_rows":1100993},{"kind":"Default","name":"source","num_defaults":0,"num_rows":1100993},{"kind":"Sparse","name":"source_balance","num_defaults":1100972,"num_rows":1100993},{"kind":"Default","name":"success","num_defaults":2430,"num_rows":1100993},{"kind":"Default","name":"timestamp","num_defaults":0,"num_rows":1100993}],"types_serialization_versions":{"string":1},"version":1}
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/signature.cmrk2 ADDED
Binary file (497 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/signature.size.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c49e6bc87990c8c5ab96fd121d5ae48c81543e428bc5dc2279a46844c938730
+size 1139668
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/signature.size.cmrk2 ADDED
Binary file (382 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/slot.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae9876c194dc904c59decde7de4308e13a9ae3a5e9590c287051a2d1374b583d
+size 2918244
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/slot.cmrk2 ADDED
Binary file (420 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source.cmrk2 ADDED
Binary file (492 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source.size.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98f071e87f6c59312c867a8b715573fd7e5f3dc5ec1ff9afb30d6279f386421f
+size 235336
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source.size.cmrk2 ADDED
Binary file (357 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac17d2da6ced6aa0c59138dea492b9e46fafb8c3c91943f6e7133b4b2f6fdb43
+size 137
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.cmrk2 ADDED
Binary file (60 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.sparse.idx.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7dc6efc4326db4329ea2ee99ff93188c17504ad8f8f9604f53ea859a96828e2c
+size 66
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/source_balance.sparse.idx.cmrk2 ADDED
Binary file (222 Bytes).
store/0ee/0ee2fde6-369e-4923-bdf9-fe7f0c901703/all_1_1_0/success.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29b4b54b17151eab8c63c60abc03f6d36e4db9a99fb65adec928a44f5385a538
+size 12763