zirobtc commited on
Commit
c471f42
·
verified ·
1 Parent(s): 7064310

Upload folder using huggingface_hub

Browse files
Files changed (8) hide show
  1. TASK_LIST.md +87 -0
  2. cache_dataset.py +225 -58
  3. data/data_loader.py +154 -16
  4. log.log +2 -2
  5. resume.md +190 -0
  6. scripts/evaluate_sample.py +77 -3
  7. scripts/rebuild_metadata.py +15 -1
  8. train.py +77 -45
TASK_LIST.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Task List
2
+
3
+ Fix validation splitting by token identity
4
+ Replace class-only split logic in train.py (line 235)
5
+ Group cached context samples by source_token or token_address
6
+ Ensure one token can only exist in train or val, never both
7
+ Keep class balance as a secondary constraint, not the primary identity rule
8
+ Stop using current validation as decision-grade signal
9
+ Treat old val curves/checkpoints as contaminated
10
+ Re-evaluate only after token-grouped split is in place
11
+ Audit cache metadata and make token identity explicit
12
+ Ensure every cached sample has stable token identity fields
13
+ Required minimum: source_token, class_id
14
+ Prefer also storing lightweight cache-planning metadata for later analysis
15
+ Redesign cache generation around fixed budgets
16
+ Define total cache budget first
17
+ Allocate exact sample counts per token class before writing files
18
+ Do not let raw source distribution decide cache composition
19
+ Remove destructive dependence on token class map filtering alone
20
+ Token class should guide budget allocation
21
+ It should not be the only logic determining whether cache is useful
22
+ Add cache-time context-level balancing
23
+ After sampling a candidate context, evaluate realized future labels for that context
24
+ Use realized context outcome to decide whether to keep or skip it
25
+ Do this before saving to disk
26
+ Start with binary polarity, not movement-threshold balancing
27
+ Positive if max valid horizon return > 0
28
+ Negative otherwise
29
+ Use this only as cache-selection metadata first
30
+ Make polarity quotas class-conditional
31
+ For stronger classes, target positive/negative ratios
32
+ For garbage classes, do not force positives
33
+ Keep class 0 mostly natural/negative
34
+ Keep T_cutoff random during cache generation
35
+ Do not freeze a single deterministic cutoff per token
36
+ Determinism should be in the planning/budget logic, not in removing context diversity
37
+ Add exact acceptance accounting during cache build
38
+ Track how many samples have already been accepted per class
39
+ Track polarity counts per class
40
+ Stop accepting once quotas are filled
41
+ Avoid cache waste from duplicate low-value contexts
42
+ Add retry/attempt limits per token
43
+ If a token cannot satisfy desired quota type, stop oversampling it endlessly
44
+ Move on to other tokens instead of filling disk with junk
45
+ Keep label derivation in the data pipeline, not in training logic
46
+ Loader should produce final labels and masks
47
+ Collator should only stack/batch them
48
+ Model should only consume them
49
+ Reduce or remove train-time class reweighting after cache is fixed
50
+ Revisit WeightedRandomSampler
51
+ Revisit class_loss_weights
52
+ If cache is balanced upstream, training should not need heavy rescue weighting
53
+ Revisit movement head only after split and cache are fixed
54
+ Keep it auxiliary
55
+ Do not let movement-label threshold debates block the more important data fixes
56
+ Later simplify naming/threshold assumptions if needed
57
+ Add cache audit tooling
58
+ Report counts by class_id
59
+ Report counts by class x polarity
60
+ Report unique tokens by class
61
+ Report acceptance/rejection reasons
62
+ Report train/val token overlap check
63
+ Add validation integrity checks
64
+ Assert zero token overlap between train and val
65
+ Print per-class token counts, not just sample counts
66
+ Print per-class sample counts too
67
+ Rebuild cache after the new policy is implemented
68
+ Old cache is shaped by the wrong distribution
69
+ Old validation split is not trustworthy
70
+ New training should start from the rebuilt corpus
71
+ Retrain and re-baseline from scratch
72
+ New split
73
+ New cache
74
+ Minimal train-time rescue weighting
75
+ Recompare backbone behavior only after that
76
+ Recommended implementation order
77
+
78
+ Token-grouped validation split
79
+ Validation overlap checks
80
+ Cache metadata cleanup
81
+ Exact class quotas in cache generation
82
+ Class-conditional polarity quotas
83
+ Cache audit reports
84
+ Remove/reduce train-time weighting
85
+ Rebuild cache
86
+ Retrain
87
+ Reassess movement head
cache_dataset.py CHANGED
@@ -23,6 +23,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
 
24
  from scripts.analyze_distribution import get_return_class_map
25
  from scripts.compute_quality_score import get_token_quality_scores, fetch_token_metrics, _bucket_id, _midrank_percentiles, EPS
 
26
 
27
  from clickhouse_driver import Client as ClickHouseClient
28
  from neo4j import GraphDatabase
@@ -32,6 +33,61 @@ _worker_return_class_map = None
32
  _worker_quality_scores_map = None
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
36
  global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
37
  from data.data_loader import OracleDataset
@@ -43,7 +99,7 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
43
 
44
  _worker_dataset = OracleDataset(
45
  data_fetcher=data_fetcher,
46
- max_samples=dataset_config['max_samples'],
47
  start_date=dataset_config['start_date'],
48
  horizons_seconds=dataset_config['horizons_seconds'],
49
  quantiles=dataset_config['quantiles'],
@@ -68,42 +124,15 @@ def _process_single_token_context(args):
68
  q_score = _worker_quality_scores_map.get(mint_addr)
69
  if q_score is None:
70
  return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
71
- saved_files = []
72
- for ctx_idx, ctx in enumerate(contexts):
73
- ctx["quality_score"] = q_score
74
- ctx["class_id"] = class_id
75
- ctx["source_token"] = mint_addr
76
- ctx["cache_mode"] = "context"
77
- filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
78
- output_path = Path(output_dir) / filename
79
- torch.save(ctx, output_path)
80
- saved_files.append(filename)
81
- return {'status': 'success', 'mint': mint_addr, 'class_id': class_id, 'q_score': q_score, 'n_contexts': len(contexts), 'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0, 'files': saved_files}
82
- except Exception as e:
83
- import traceback
84
- return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
85
-
86
-
87
- def _process_single_token_raw(args):
88
- idx, mint_addr, output_dir = args
89
- global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
90
- try:
91
- class_id = _worker_return_class_map.get(mint_addr)
92
- if class_id is None:
93
- return {'status': 'skipped', 'reason': 'not in class map', 'mint': mint_addr}
94
- item = _worker_dataset.__cacheitem__(idx)
95
- if item is None:
96
- return {'status': 'skipped', 'reason': 'cacheitem returned None', 'mint': mint_addr}
97
- q_score = _worker_quality_scores_map.get(mint_addr)
98
- if q_score is None:
99
- return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
100
- item["quality_score"] = q_score
101
- item["class_id"] = class_id
102
- item["cache_mode"] = "raw"
103
- filename = f"sample_{mint_addr[:16]}.pt"
104
- output_path = Path(output_dir) / filename
105
- torch.save(item, output_path)
106
- return {'status': 'success', 'mint': mint_addr, 'class_id': class_id, 'q_score': q_score, 'n_trades': len(item.get('trades', [])), 'files': [filename]}
107
  except Exception as e:
108
  import traceback
109
  return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
@@ -122,14 +151,16 @@ def main():
122
 
123
  parser = argparse.ArgumentParser()
124
  parser.add_argument("--output_dir", type=str, default="data/cache")
125
- parser.add_argument("--max_samples", type=int, default=None)
126
  parser.add_argument("--start_date", type=str, default=None)
127
 
128
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
129
- parser.add_argument("--cache_mode", type=str, default="raw", choices=["raw", "context"])
130
- parser.add_argument("--context_length", type=int, default=8192)
131
  parser.add_argument("--min_trades", type=int, default=10)
 
132
  parser.add_argument("--samples_per_token", type=int, default=1)
 
 
 
 
133
  parser.add_argument("--num_workers", type=int, default=1)
134
  parser.add_argument("--clickhouse_host", type=str, default=os.getenv("CLICKHOUSE_HOST", "localhost"))
135
  parser.add_argument("--clickhouse_port", type=int, default=int(os.getenv("CLICKHOUSE_PORT", 9000)))
@@ -138,6 +169,11 @@ def main():
138
  parser.add_argument("--neo4j_password", type=str, default=os.getenv("NEO4J_PASSWORD", "password"))
139
  args = parser.parse_args()
140
 
 
 
 
 
 
141
  if args.num_workers == 0:
142
  args.num_workers = max(1, mp.cpu_count() - 4)
143
 
@@ -163,7 +199,15 @@ def main():
163
  quality_scores_map = get_token_quality_scores(clickhouse_client)
164
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
165
 
166
- dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200], quantiles=[0.5], min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length)
 
 
 
 
 
 
 
 
167
 
168
  if len(dataset) == 0:
169
  print("WARNING: No samples. Exiting.")
@@ -178,25 +222,52 @@ def main():
178
  print("WARNING: No tokens after filtering.")
179
  return
180
 
181
- print(f"INFO: Cache mode: {args.cache_mode}, Workers: {args.num_workers}")
 
 
 
 
 
 
 
 
182
 
183
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
184
- dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'horizons_seconds': [60, 180, 300, 600, 1800, 3600, 7200], 'quantiles': [0.5], 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints}
185
 
186
  # Build tasks from filtered_mints directly
187
  tasks = []
188
  for i, mint_record in enumerate(filtered_mints):
189
  mint_addr = mint_record['mint_address']
190
- if args.cache_mode == "context":
191
- tasks.append((i, mint_addr, args.samples_per_token, str(output_dir)))
192
- else:
193
- tasks.append((i, mint_addr, str(output_dir)))
194
 
195
  print(f"INFO: Starting to cache {len(tasks)} tokens...")
196
 
197
  success_count, skipped_count, error_count = 0, 0, 0
198
  class_distribution = {}
199
- process_fn = _process_single_token_context if args.cache_mode == "context" else _process_single_token_raw
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  if args.num_workers == 1:
202
  print("INFO: Single-threaded mode...")
@@ -204,8 +275,61 @@ def main():
204
  for task in tqdm(tasks, desc="Caching"):
205
  result = process_fn(task)
206
  if result['status'] == 'success':
207
- success_count += 1
208
- class_distribution[result['class_id']] = class_distribution.get(result['class_id'], 0) + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  elif result['status'] == 'skipped':
210
  skipped_count += 1
211
  else:
@@ -219,8 +343,26 @@ def main():
219
  try:
220
  result = future.result(timeout=300)
221
  if result['status'] == 'success':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  success_count += 1
223
- class_distribution[result['class_id']] = class_distribution.get(result['class_id'], 0) + 1
224
  elif result['status'] == 'skipped':
225
  skipped_count += 1
226
  else:
@@ -230,15 +372,40 @@ def main():
230
  tqdm.write(f"WORKER ERROR: {e}")
231
 
232
  print("INFO: Building metadata...")
233
- file_class_map = {}
234
- for f in sorted(output_dir.glob("sample_*.pt")):
235
- try:
236
- file_class_map[f.name] = torch.load(f, map_location="cpu", weights_only=False).get("class_id", 0)
237
- except:
238
- pass
 
 
 
 
 
239
 
240
  with open(output_dir / "class_metadata.json", 'w') as f:
241
- json.dump({'file_class_map': file_class_map, 'class_distribution': {str(k): v for k, v in class_distribution.items()}, 'cache_mode': args.cache_mode, 'num_workers': args.num_workers}, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  print(f"\n--- Done ---\nSuccess: {success_count}, Skipped: {skipped_count}, Errors: {error_count}\nFiles: {len(file_class_map)}\nLocation: {output_dir.resolve()}")
244
 
 
23
 
24
  from scripts.analyze_distribution import get_return_class_map
25
  from scripts.compute_quality_score import get_token_quality_scores, fetch_token_metrics, _bucket_id, _midrank_percentiles, EPS
26
+ from data.data_loader import summarize_context_window
27
 
28
  from clickhouse_driver import Client as ClickHouseClient
29
  from neo4j import GraphDatabase
 
33
  _worker_quality_scores_map = None
34
 
35
 
36
+ def _build_context_quota_plan(
37
+ class_ids,
38
+ target_contexts_per_class,
39
+ target_contexts_total,
40
+ good_ratio_nonzero,
41
+ good_ratio_class0,
42
+ ):
43
+ unique_class_ids = sorted(set(int(cid) for cid in class_ids))
44
+ if not unique_class_ids:
45
+ return {}
46
+
47
+ if target_contexts_per_class is not None:
48
+ per_class_target = int(target_contexts_per_class)
49
+ elif target_contexts_total is not None:
50
+ per_class_target = max(1, int(target_contexts_total) // len(unique_class_ids))
51
+ else:
52
+ return {}
53
+
54
+ if per_class_target <= 0:
55
+ raise RuntimeError("Context quota target must be positive.")
56
+
57
+ plan = {}
58
+ for class_id in unique_class_ids:
59
+ ratio = float(good_ratio_class0 if class_id == 0 else good_ratio_nonzero)
60
+ ratio = max(0.0, min(1.0, ratio))
61
+ good_target = int(round(per_class_target * ratio))
62
+ bad_target = per_class_target - good_target
63
+ plan[class_id] = {
64
+ "total_target": per_class_target,
65
+ "good_target": good_target,
66
+ "bad_target": bad_target,
67
+ }
68
+ return plan
69
+
70
+
71
+ def _should_accept_context(class_id, context_bucket, accepted_counts, quota_plan):
72
+ if not quota_plan:
73
+ return True
74
+
75
+ if class_id not in quota_plan:
76
+ return False
77
+
78
+ class_plan = quota_plan[class_id]
79
+ class_counts = accepted_counts[class_id]
80
+ if class_counts["total"] >= class_plan["total_target"]:
81
+ return False
82
+
83
+ bucket_key = "good" if context_bucket == "good" else "bad"
84
+ target_key = f"{bucket_key}_target"
85
+ if class_counts[bucket_key] >= class_plan[target_key]:
86
+ return False
87
+
88
+ return True
89
+
90
+
91
  def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
92
  global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
93
  from data.data_loader import OracleDataset
 
99
 
100
  _worker_dataset = OracleDataset(
101
  data_fetcher=data_fetcher,
102
+ min_trades=dataset_config['min_trades'],
103
  start_date=dataset_config['start_date'],
104
  horizons_seconds=dataset_config['horizons_seconds'],
105
  quantiles=dataset_config['quantiles'],
 
124
  q_score = _worker_quality_scores_map.get(mint_addr)
125
  if q_score is None:
126
  return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
127
+ return {
128
+ 'status': 'success',
129
+ 'mint': mint_addr,
130
+ 'class_id': class_id,
131
+ 'q_score': q_score,
132
+ 'n_contexts': len(contexts),
133
+ 'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0,
134
+ 'contexts': contexts,
135
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  except Exception as e:
137
  import traceback
138
  return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
 
151
 
152
  parser = argparse.ArgumentParser()
153
  parser.add_argument("--output_dir", type=str, default="data/cache")
 
154
  parser.add_argument("--start_date", type=str, default=None)
155
 
156
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
 
 
157
  parser.add_argument("--min_trades", type=int, default=10)
158
+ parser.add_argument("--context_length", type=int, default=8192)
159
  parser.add_argument("--samples_per_token", type=int, default=1)
160
+ parser.add_argument("--target_contexts_per_class", type=int, default=None)
161
+ parser.add_argument("--target_contexts_total", type=int, default=None)
162
+ parser.add_argument("--good_ratio_nonzero", type=float, default=0.5)
163
+ parser.add_argument("--good_ratio_class0", type=float, default=0.0)
164
  parser.add_argument("--num_workers", type=int, default=1)
165
  parser.add_argument("--clickhouse_host", type=str, default=os.getenv("CLICKHOUSE_HOST", "localhost"))
166
  parser.add_argument("--clickhouse_port", type=int, default=int(os.getenv("CLICKHOUSE_PORT", 9000)))
 
169
  parser.add_argument("--neo4j_password", type=str, default=os.getenv("NEO4J_PASSWORD", "password"))
170
  args = parser.parse_args()
171
 
172
+ if args.target_contexts_per_class is not None and args.target_contexts_total is not None:
173
+ raise RuntimeError(
174
+ "Choose exactly one cache budget: either --target_contexts_per_class or --target_contexts_total."
175
+ )
176
+
177
  if args.num_workers == 0:
178
  args.num_workers = max(1, mp.cpu_count() - 4)
179
 
 
199
  quality_scores_map = get_token_quality_scores(clickhouse_client)
200
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
201
 
202
+ dataset = OracleDataset(
203
+ data_fetcher=data_fetcher,
204
+ min_trades=args.min_trades,
205
+ start_date=start_date_dt,
206
+ horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200],
207
+ quantiles=[0.5],
208
+ min_trade_usd=args.min_trade_usd,
209
+ max_seq_len=args.context_length,
210
+ )
211
 
212
  if len(dataset) == 0:
213
  print("WARNING: No samples. Exiting.")
 
222
  print("WARNING: No tokens after filtering.")
223
  return
224
 
225
+ print(f"INFO: Building canonical context cache | Workers: {args.num_workers}")
226
+
227
+ if args.num_workers != 1 and (
228
+ args.target_contexts_per_class is not None or args.target_contexts_total is not None
229
+ ):
230
+ raise RuntimeError(
231
+ "Quota-driven context caching currently requires --num_workers 1 so accepted contexts "
232
+ "can be planned and written deterministically in one process."
233
+ )
234
 
235
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
236
+ dataset_config = {'start_date': start_date_dt, 'min_trades': args.min_trades, 'horizons_seconds': [60, 180, 300, 600, 1800, 3600, 7200], 'quantiles': [0.5], 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints}
237
 
238
  # Build tasks from filtered_mints directly
239
  tasks = []
240
  for i, mint_record in enumerate(filtered_mints):
241
  mint_addr = mint_record['mint_address']
242
+ tasks.append((i, mint_addr, args.samples_per_token, str(output_dir)))
 
 
 
243
 
244
  print(f"INFO: Starting to cache {len(tasks)} tokens...")
245
 
246
  success_count, skipped_count, error_count = 0, 0, 0
247
  class_distribution = {}
248
+ context_distribution = defaultdict(lambda: defaultdict(int))
249
+ file_class_map = {}
250
+ file_context_bucket_map = {}
251
+ file_context_summary_map = {}
252
+ process_fn = _process_single_token_context
253
+ quota_plan = {}
254
+ accepted_counts = defaultdict(lambda: {"total": 0, "good": 0, "bad": 0})
255
+ accepted_per_token = defaultdict(int)
256
+
257
+ quota_plan = _build_context_quota_plan(
258
+ class_ids=[return_class_map[m['mint_address']] for m in filtered_mints if m['mint_address'] in return_class_map],
259
+ target_contexts_per_class=args.target_contexts_per_class,
260
+ target_contexts_total=args.target_contexts_total,
261
+ good_ratio_nonzero=args.good_ratio_nonzero,
262
+ good_ratio_class0=args.good_ratio_class0,
263
+ )
264
+ if quota_plan:
265
+ print("INFO: Context quota plan:")
266
+ for class_id, plan in sorted(quota_plan.items()):
267
+ print(
268
+ f" Class {class_id}: total={plan['total_target']} "
269
+ f"(good={plan['good_target']}, bad={plan['bad_target']})"
270
+ )
271
 
272
  if args.num_workers == 1:
273
  print("INFO: Single-threaded mode...")
 
275
  for task in tqdm(tasks, desc="Caching"):
276
  result = process_fn(task)
277
  if result['status'] == 'success':
278
+ if quota_plan:
279
+ class_id = result['class_id']
280
+ mint_addr = result['mint']
281
+ q_score = result['q_score']
282
+ saved_any = False
283
+ for ctx in result.get("contexts", []):
284
+ context_summary = summarize_context_window(ctx.get("labels"), ctx.get("labels_mask"))
285
+ context_bucket = context_summary["context_bucket"]
286
+ if not _should_accept_context(class_id, context_bucket, accepted_counts, quota_plan):
287
+ continue
288
+
289
+ ctx["quality_score"] = q_score
290
+ ctx["class_id"] = class_id
291
+ ctx["source_token"] = mint_addr
292
+ ctx["context_bucket"] = context_bucket
293
+ ctx["context_score"] = context_summary["context_score"]
294
+
295
+ file_idx = accepted_per_token[mint_addr]
296
+ filename = f"sample_{mint_addr[:16]}_{file_idx}.pt"
297
+ output_path = Path(output_dir) / filename
298
+ torch.save(ctx, output_path)
299
+
300
+ accepted_per_token[mint_addr] += 1
301
+ accepted_counts[class_id]["total"] += 1
302
+ accepted_counts[class_id][context_bucket] += 1
303
+ class_distribution[class_id] = class_distribution.get(class_id, 0) + 1
304
+ context_distribution[class_id][context_bucket] += 1
305
+ file_class_map[filename] = class_id
306
+ file_context_bucket_map[filename] = context_bucket
307
+ file_context_summary_map[filename] = context_summary
308
+ saved_any = True
309
+
310
+ if saved_any:
311
+ success_count += 1
312
+ else:
313
+ class_id = result['class_id']
314
+ mint_addr = result['mint']
315
+ q_score = result['q_score']
316
+ for ctx_idx, ctx in enumerate(result.get("contexts", [])):
317
+ context_summary = summarize_context_window(ctx.get("labels"), ctx.get("labels_mask"))
318
+ context_bucket = context_summary["context_bucket"]
319
+ ctx["quality_score"] = q_score
320
+ ctx["class_id"] = class_id
321
+ ctx["source_token"] = mint_addr
322
+ ctx["context_bucket"] = context_bucket
323
+ ctx["context_score"] = context_summary["context_score"]
324
+ filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
325
+ output_path = Path(output_dir) / filename
326
+ torch.save(ctx, output_path)
327
+ file_class_map[filename] = class_id
328
+ file_context_bucket_map[filename] = context_bucket
329
+ file_context_summary_map[filename] = context_summary
330
+ class_distribution[class_id] = class_distribution.get(class_id, 0) + 1
331
+ context_distribution[class_id][context_bucket] += 1
332
+ success_count += 1
333
  elif result['status'] == 'skipped':
334
  skipped_count += 1
335
  else:
 
343
  try:
344
  result = future.result(timeout=300)
345
  if result['status'] == 'success':
346
+ class_id = result['class_id']
347
+ mint_addr = result['mint']
348
+ q_score = result['q_score']
349
+ for ctx_idx, ctx in enumerate(result.get("contexts", [])):
350
+ context_summary = summarize_context_window(ctx.get("labels"), ctx.get("labels_mask"))
351
+ context_bucket = context_summary["context_bucket"]
352
+ ctx["quality_score"] = q_score
353
+ ctx["class_id"] = class_id
354
+ ctx["source_token"] = mint_addr
355
+ ctx["context_bucket"] = context_bucket
356
+ ctx["context_score"] = context_summary["context_score"]
357
+ filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
358
+ output_path = Path(output_dir) / filename
359
+ torch.save(ctx, output_path)
360
+ file_class_map[filename] = class_id
361
+ file_context_bucket_map[filename] = context_bucket
362
+ file_context_summary_map[filename] = context_summary
363
+ class_distribution[class_id] = class_distribution.get(class_id, 0) + 1
364
+ context_distribution[class_id][context_bucket] += 1
365
  success_count += 1
 
366
  elif result['status'] == 'skipped':
367
  skipped_count += 1
368
  else:
 
372
  tqdm.write(f"WORKER ERROR: {e}")
373
 
374
  print("INFO: Building metadata...")
375
+ if not file_class_map:
376
+ for f in sorted(output_dir.glob("sample_*.pt")):
377
+ try:
378
+ cached = torch.load(f, map_location="cpu", weights_only=False)
379
+ file_class_map[f.name] = cached.get("class_id", 0)
380
+ if "labels" in cached and "labels_mask" in cached:
381
+ context_summary = summarize_context_window(cached.get("labels"), cached.get("labels_mask"))
382
+ file_context_bucket_map[f.name] = context_summary["context_bucket"]
383
+ file_context_summary_map[f.name] = context_summary
384
+ except Exception:
385
+ pass
386
 
387
  with open(output_dir / "class_metadata.json", 'w') as f:
388
+ json.dump({
389
+ 'file_class_map': file_class_map,
390
+ 'file_context_bucket_map': file_context_bucket_map,
391
+ 'file_context_summary_map': file_context_summary_map,
392
+ 'class_distribution': {str(k): v for k, v in class_distribution.items()},
393
+ 'context_distribution': {
394
+ str(k): {bucket: count for bucket, count in bucket_counts.items()}
395
+ for k, bucket_counts in context_distribution.items()
396
+ },
397
+ 'quota_plan': {str(k): v for k, v in quota_plan.items()},
398
+ 'accepted_counts': {str(k): v for k, v in accepted_counts.items()},
399
+ 'num_workers': args.num_workers,
400
+ }, f, indent=2)
401
+
402
+ if quota_plan:
403
+ print("INFO: Accepted context counts:")
404
+ for class_id, counts in sorted(accepted_counts.items()):
405
+ print(
406
+ f" Class {class_id}: total={counts['total']} "
407
+ f"good={counts['good']} bad={counts['bad']}"
408
+ )
409
 
410
  print(f"\n--- Done ---\nSuccess: {success_count}, Skipped: {skipped_count}, Errors: {error_count}\nFiles: {len(file_class_map)}\nLocation: {output_dir.resolve()}")
411
 
data/data_loader.py CHANGED
@@ -64,6 +64,63 @@ MIN_AMOUNT_TRANSFER_SUPPLY = 0.0 # 1.0% of total supply
64
  HOLDER_SNAPSHOT_INTERVAL_SEC = 300
65
  HOLDER_SNAPSHOT_TOP_K = 200
66
  DEAD_URI_RETRY_LIMIT = 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  class EmbeddingPooler:
@@ -123,6 +180,7 @@ class OracleDataset(Dataset):
123
  horizons_seconds: List[int] = [],
124
  quantiles: List[float] = [],
125
  max_samples: Optional[int] = None,
 
126
 
127
  token_allowlist: Optional[List[str]] = None,
128
  cache_dir: Optional[Union[str, Path]] = None,
@@ -131,8 +189,11 @@ class OracleDataset(Dataset):
131
  max_seq_len: int = 8192,
132
  p99_clamps: Optional[Dict[str, float]] = None,
133
  movement_label_config: Optional[Dict[str, float]] = None):
134
-
135
  self.max_seq_len = max_seq_len
 
 
 
136
 
137
  # --- P99 data-driven clamp values (replace hardcoded min/max) ---
138
  self.p99_clamps = {
@@ -187,9 +248,12 @@ class OracleDataset(Dataset):
187
  if not self.cached_files:
188
  raise RuntimeError(f"Cache directory '{self.cache_dir}' provided but contains no 'sample_*.pt' files.")
189
 
190
- # --- OPTIMIZED: Load class_ids from metadata cache file ---
191
  file_class_map = {}
 
 
192
  class_counts = defaultdict(int)
 
193
  metadata_path = self.cache_dir / "class_metadata.json"
194
 
195
  if metadata_path.exists():
@@ -199,20 +263,29 @@ class OracleDataset(Dataset):
199
  with open(metadata_path, 'r') as f:
200
  cached_metadata = json.load(f)
201
  file_class_map = cached_metadata.get('file_class_map', {})
 
 
202
  # Validate that cached files match metadata
203
  cached_file_names = {p.name for p in self.cached_files}
204
  metadata_file_names = set(file_class_map.keys())
205
  if cached_file_names != metadata_file_names:
206
  print(f"WARN: Metadata cache mismatch ({len(cached_file_names)} files vs {len(metadata_file_names)} in metadata). Rebuilding...")
207
  file_class_map = {}
 
 
208
  else:
209
  # Rebuild class_counts from loaded map
210
- for cid in file_class_map.values():
211
  class_counts[cid] += 1
 
 
 
212
  print(f"INFO: Loaded metadata for {len(file_class_map)} samples in <1s")
213
  except Exception as e:
214
  print(f"WARN: Failed to load metadata cache: {e}. Rebuilding...")
215
  file_class_map = {}
 
 
216
 
217
  # Slow path: scan all files and build metadata cache
218
  if not file_class_map:
@@ -229,23 +302,41 @@ class OracleDataset(Dataset):
229
  if cid is None:
230
  print(f"WARN: File {p.name} missing class_id. Skipping.")
231
  continue
 
 
 
 
 
232
  file_class_map[p.name] = cid
 
 
233
  class_counts[cid] += 1
 
234
  except Exception as e:
235
  print(f"WARN: Failed to read cached sample {p.name}: {e}")
236
 
237
  # Save metadata cache for future runs
238
  try:
239
  with open(metadata_path, 'w') as f:
240
- json.dump({'file_class_map': file_class_map}, f)
 
 
 
 
241
  print(f"INFO: Saved class metadata cache to {metadata_path}")
242
  except Exception as e:
243
  print(f"WARN: Failed to save metadata cache: {e}")
244
 
245
  print(f"INFO: Class Distribution: {dict(class_counts)}")
 
 
 
 
246
 
247
  # Store file_class_map for fast lookup by train.py's create_balanced_split
248
  self.file_class_map = {p: cid for p, cid in file_class_map.items()}
 
 
249
 
250
  # Compute Weights
251
  self.weights_list = []
@@ -260,8 +351,24 @@ class OracleDataset(Dataset):
260
  continue
261
 
262
  cid = file_class_map[fname]
263
- count = class_counts[cid]
264
- weight = 1.0 / (count ** 0.5) if count > 0 else 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  self.weights_list.append(weight)
266
  valid_files.append(p)
267
 
@@ -273,6 +380,25 @@ class OracleDataset(Dataset):
273
  self.cached_files = self.cached_files[:self.num_samples]
274
  self.weights_list = self.weights_list[:self.num_samples]
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  print(f"INFO: Weighted Dataset Ready. {self.num_samples} samples.")
277
  self.sampled_mints = [] # Not needed in cached mode
278
  self.available_mints = []
@@ -1297,7 +1423,7 @@ class OracleDataset(Dataset):
1297
  key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
1298
  )
1299
 
1300
- min_context_trades = 10
1301
  if len(all_trades_sorted) < (min_context_trades + 1): # context + 1 trade after cutoff
1302
  return None
1303
 
@@ -1626,7 +1752,7 @@ class OracleDataset(Dataset):
1626
  max_horizon_seconds=self.max_cache_horizon_seconds,
1627
  include_wallet_data=False,
1628
  include_graph=False,
1629
- min_trades=10, # Enforce min trades for context
1630
  full_history=True, # Bypass H/B/H limits
1631
  prune_failed=False, # Keep failed trades for realistic simulation
1632
  prune_transfers=False # Keep transfers for snapshot reconstruction
@@ -1641,8 +1767,14 @@ class OracleDataset(Dataset):
1641
  raw_data['name'] = initial_mint_record.get('token_name', '')
1642
  raw_data['symbol'] = initial_mint_record.get('token_symbol', '')
1643
  raw_data['token_uri'] = initial_mint_record.get('token_uri', '')
1644
- raw_data['total_supply'] = initial_mint_record.get('total_supply', 0)
1645
- raw_data['decimals'] = initial_mint_record.get('token_decimals', 6)
 
 
 
 
 
 
1646
  raw_data['protocol'] = initial_mint_record.get('protocol', 1)
1647
 
1648
  def _timestamp_to_order_value(ts_value: Any) -> float:
@@ -2686,7 +2818,7 @@ class OracleDataset(Dataset):
2686
  max_horizon_seconds=self.max_cache_horizon_seconds,
2687
  include_wallet_data=False,
2688
  include_graph=False,
2689
- min_trades=10,
2690
  full_history=True,
2691
  prune_failed=False,
2692
  prune_transfers=False
@@ -2704,8 +2836,14 @@ class OracleDataset(Dataset):
2704
  raw_data['name'] = initial_mint_record.get('token_name', '')
2705
  raw_data['symbol'] = initial_mint_record.get('token_symbol', '')
2706
  raw_data['token_uri'] = initial_mint_record.get('token_uri', '')
2707
- raw_data['total_supply'] = initial_mint_record.get('total_supply', 0)
2708
- raw_data['decimals'] = initial_mint_record.get('token_decimals', 6)
 
 
 
 
 
 
2709
  raw_data['protocol'] = initial_mint_record.get('protocol', 1)
2710
 
2711
  def _timestamp_to_order_value(ts_value) -> float:
@@ -2734,7 +2872,7 @@ class OracleDataset(Dataset):
2734
  key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
2735
  )
2736
 
2737
- min_context_trades = 10
2738
  if len(all_trades_sorted) < (min_context_trades + 1):
2739
  print(f" SKIP: Not enough trades ({len(all_trades_sorted)}) for {token_address}")
2740
  return []
@@ -2821,9 +2959,9 @@ class OracleDataset(Dataset):
2821
  total_supply_raw = int(raw_total_supply)
2822
  token_decimals = int(raw_decimals)
2823
  if total_supply_raw <= 0:
2824
- raise RuntimeError(f"Invalid total_supply for {token_address}: {total_supply_raw}")
2825
  if token_decimals < 0:
2826
- raise RuntimeError(f"Invalid decimals for {token_address}: {token_decimals}")
2827
  token_scale = 10 ** token_decimals
2828
 
2829
  def _strict_int(v: Any, field_name: str) -> int:
 
64
  HOLDER_SNAPSHOT_INTERVAL_SEC = 300
65
  HOLDER_SNAPSHOT_TOP_K = 200
66
  DEAD_URI_RETRY_LIMIT = 2
67
DEFAULT_TOTAL_SUPPLY_RAW = 1_000_000_000_000_000
DEFAULT_TOKEN_DECIMALS = 6

CONTEXT_BUCKET_NEGATIVE = "bad"
CONTEXT_BUCKET_POSITIVE = "good"


def summarize_context_window(
    labels: Any,
    labels_mask: Any,
) -> Dict[str, Any]:
    """
    Classify a realized context window from its valid future returns.

    Each masked-in horizon contributes the signed, log1p-compressed terminal
    PnL of buying at the cutoff. The window is labelled
    ``CONTEXT_BUCKET_POSITIVE`` only when the mean signed contribution is
    strictly positive; a window with zero valid horizons scores 0.0 and
    therefore falls into ``CONTEXT_BUCKET_NEGATIVE``.

    Raises:
        RuntimeError: if either ``labels`` or ``labels_mask`` is None.
    """
    if labels is None or labels_mask is None:
        raise RuntimeError("Context weighting requires both 'labels' and 'labels_mask'.")

    def _to_plain_list(values: Any) -> List[float]:
        # Accept either torch tensors or generic iterables.
        return values.tolist() if isinstance(values, torch.Tensor) else list(values)

    # Keep only horizons whose mask entry is strictly positive.
    realized = [
        float(ret)
        for ret, keep in zip(_to_plain_list(labels), _to_plain_list(labels_mask))
        if float(keep) > 0.0
    ]

    # Compress magnitude with log1p but keep the sign; ret == 0 counts as non-positive.
    contributions = [
        np.log1p(abs(ret)) if ret > 0.0 else -np.log1p(abs(ret))
        for ret in realized
    ]

    winners = sum(1 for ret in realized if ret > 0.0)
    score = float(sum(contributions) / len(contributions)) if contributions else 0.0
    bucket = CONTEXT_BUCKET_POSITIVE if score > 0.0 else CONTEXT_BUCKET_NEGATIVE
    return {
        "context_bucket": bucket,
        "context_score": score,
        "positive_horizons": winners,
        "negative_horizons": len(realized) - winners,
        "valid_horizons": len(realized),
    }
124
 
125
 
126
  class EmbeddingPooler:
 
180
  horizons_seconds: List[int] = [],
181
  quantiles: List[float] = [],
182
  max_samples: Optional[int] = None,
183
+ min_trades: int = 10,
184
 
185
  token_allowlist: Optional[List[str]] = None,
186
  cache_dir: Optional[Union[str, Path]] = None,
 
189
  max_seq_len: int = 8192,
190
  p99_clamps: Optional[Dict[str, float]] = None,
191
  movement_label_config: Optional[Dict[str, float]] = None):
192
+
193
  self.max_seq_len = max_seq_len
194
+ self.min_trades = int(min_trades)
195
+ if self.min_trades < 1:
196
+ raise RuntimeError(f"min_trades must be >= 1, got {self.min_trades}")
197
 
198
  # --- P99 data-driven clamp values (replace hardcoded min/max) ---
199
  self.p99_clamps = {
 
248
  if not self.cached_files:
249
  raise RuntimeError(f"Cache directory '{self.cache_dir}' provided but contains no 'sample_*.pt' files.")
250
 
251
+ # --- OPTIMIZED: Load cached metadata if available ---
252
  file_class_map = {}
253
+ file_context_bucket_map = {}
254
+ file_context_summary_map = {}
255
  class_counts = defaultdict(int)
256
+ class_context_counts = defaultdict(lambda: defaultdict(int))
257
  metadata_path = self.cache_dir / "class_metadata.json"
258
 
259
  if metadata_path.exists():
 
263
  with open(metadata_path, 'r') as f:
264
  cached_metadata = json.load(f)
265
  file_class_map = cached_metadata.get('file_class_map', {})
266
+ file_context_bucket_map = cached_metadata.get('file_context_bucket_map', {})
267
+ file_context_summary_map = cached_metadata.get('file_context_summary_map', {})
268
  # Validate that cached files match metadata
269
  cached_file_names = {p.name for p in self.cached_files}
270
  metadata_file_names = set(file_class_map.keys())
271
  if cached_file_names != metadata_file_names:
272
  print(f"WARN: Metadata cache mismatch ({len(cached_file_names)} files vs {len(metadata_file_names)} in metadata). Rebuilding...")
273
  file_class_map = {}
274
+ file_context_bucket_map = {}
275
+ file_context_summary_map = {}
276
  else:
277
  # Rebuild class_counts from loaded map
278
+ for fname, cid in file_class_map.items():
279
  class_counts[cid] += 1
280
+ bucket = file_context_bucket_map.get(fname)
281
+ if bucket is not None:
282
+ class_context_counts[cid][bucket] += 1
283
  print(f"INFO: Loaded metadata for {len(file_class_map)} samples in <1s")
284
  except Exception as e:
285
  print(f"WARN: Failed to load metadata cache: {e}. Rebuilding...")
286
  file_class_map = {}
287
+ file_context_bucket_map = {}
288
+ file_context_summary_map = {}
289
 
290
  # Slow path: scan all files and build metadata cache
291
  if not file_class_map:
 
302
  if cid is None:
303
  print(f"WARN: File {p.name} missing class_id. Skipping.")
304
  continue
305
+ context_summary = summarize_context_window(
306
+ cached_item.get("labels"),
307
+ cached_item.get("labels_mask"),
308
+ )
309
+ bucket = context_summary["context_bucket"]
310
  file_class_map[p.name] = cid
311
+ file_context_bucket_map[p.name] = bucket
312
+ file_context_summary_map[p.name] = context_summary
313
  class_counts[cid] += 1
314
+ class_context_counts[cid][bucket] += 1
315
  except Exception as e:
316
  print(f"WARN: Failed to read cached sample {p.name}: {e}")
317
 
318
  # Save metadata cache for future runs
319
  try:
320
  with open(metadata_path, 'w') as f:
321
+ json.dump({
322
+ 'file_class_map': file_class_map,
323
+ 'file_context_bucket_map': file_context_bucket_map,
324
+ 'file_context_summary_map': file_context_summary_map,
325
+ }, f)
326
  print(f"INFO: Saved class metadata cache to {metadata_path}")
327
  except Exception as e:
328
  print(f"WARN: Failed to save metadata cache: {e}")
329
 
330
  print(f"INFO: Class Distribution: {dict(class_counts)}")
331
+ print(
332
+ "INFO: Context Distribution by Class: "
333
+ f"{ {cid: dict(bucket_counts) for cid, bucket_counts in class_context_counts.items()} }"
334
+ )
335
 
336
  # Store file_class_map for fast lookup by train.py's create_balanced_split
337
  self.file_class_map = {p: cid for p, cid in file_class_map.items()}
338
+ self.file_context_bucket_map = {p: bucket for p, bucket in file_context_bucket_map.items()}
339
+ self.file_context_summary_map = {p: summary for p, summary in file_context_summary_map.items()}
340
 
341
  # Compute Weights
342
  self.weights_list = []
 
351
  continue
352
 
353
  cid = file_class_map[fname]
354
+ bucket = file_context_bucket_map.get(fname)
355
+ if bucket is None:
356
+ raise RuntimeError(
357
+ f"Cached sample '{fname}' is missing a context bucket. "
358
+ "Rebuild metadata or cache before training."
359
+ )
360
+ class_bucket_counts = class_context_counts[cid]
361
+ present_buckets = [name for name, cnt in class_bucket_counts.items() if cnt > 0]
362
+ if not present_buckets:
363
+ raise RuntimeError(
364
+ f"Class {cid} has no valid context buckets recorded. Cannot compute sampler weights."
365
+ )
366
+ bucket_count = class_bucket_counts[bucket]
367
+ if bucket_count <= 0:
368
+ raise RuntimeError(
369
+ f"Class {cid} bucket '{bucket}' has invalid count {bucket_count} for sample '{fname}'."
370
+ )
371
+ weight = 1.0 / (len(present_buckets) * bucket_count)
372
  self.weights_list.append(weight)
373
  valid_files.append(p)
374
 
 
380
  self.cached_files = self.cached_files[:self.num_samples]
381
  self.weights_list = self.weights_list[:self.num_samples]
382
 
383
+ # Recompute sampler weights against the active cached file subset so the
384
+ # class/context balancing reflects the actual dataset seen by training.
385
+ active_class_context_counts = defaultdict(lambda: defaultdict(int))
386
+ for p in self.cached_files:
387
+ fname = p.name
388
+ cid = file_class_map[fname]
389
+ bucket = file_context_bucket_map[fname]
390
+ active_class_context_counts[cid][bucket] += 1
391
+
392
+ self.weights_list = []
393
+ for p in self.cached_files:
394
+ fname = p.name
395
+ cid = file_class_map[fname]
396
+ bucket = file_context_bucket_map[fname]
397
+ class_bucket_counts = active_class_context_counts[cid]
398
+ present_buckets = [name for name, cnt in class_bucket_counts.items() if cnt > 0]
399
+ bucket_count = class_bucket_counts[bucket]
400
+ self.weights_list.append(1.0 / (len(present_buckets) * bucket_count))
401
+
402
  print(f"INFO: Weighted Dataset Ready. {self.num_samples} samples.")
403
  self.sampled_mints = [] # Not needed in cached mode
404
  self.available_mints = []
 
1423
  key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
1424
  )
1425
 
1426
+ min_context_trades = self.min_trades
1427
  if len(all_trades_sorted) < (min_context_trades + 1): # context + 1 trade after cutoff
1428
  return None
1429
 
 
1752
  max_horizon_seconds=self.max_cache_horizon_seconds,
1753
  include_wallet_data=False,
1754
  include_graph=False,
1755
+ min_trades=self.min_trades,
1756
  full_history=True, # Bypass H/B/H limits
1757
  prune_failed=False, # Keep failed trades for realistic simulation
1758
  prune_transfers=False # Keep transfers for snapshot reconstruction
 
1767
  raw_data['name'] = initial_mint_record.get('token_name', '')
1768
  raw_data['symbol'] = initial_mint_record.get('token_symbol', '')
1769
  raw_data['token_uri'] = initial_mint_record.get('token_uri', '')
1770
+ raw_total_supply = initial_mint_record.get('total_supply', DEFAULT_TOTAL_SUPPLY_RAW)
1771
+ raw_token_decimals = initial_mint_record.get('token_decimals', DEFAULT_TOKEN_DECIMALS)
1772
+ raw_data['total_supply'] = (
1773
+ int(raw_total_supply) if raw_total_supply and int(raw_total_supply) > 0 else DEFAULT_TOTAL_SUPPLY_RAW
1774
+ )
1775
+ raw_data['decimals'] = (
1776
+ int(raw_token_decimals) if raw_token_decimals is not None and int(raw_token_decimals) >= 0 else DEFAULT_TOKEN_DECIMALS
1777
+ )
1778
  raw_data['protocol'] = initial_mint_record.get('protocol', 1)
1779
 
1780
  def _timestamp_to_order_value(ts_value: Any) -> float:
 
2818
  max_horizon_seconds=self.max_cache_horizon_seconds,
2819
  include_wallet_data=False,
2820
  include_graph=False,
2821
+ min_trades=self.min_trades,
2822
  full_history=True,
2823
  prune_failed=False,
2824
  prune_transfers=False
 
2836
  raw_data['name'] = initial_mint_record.get('token_name', '')
2837
  raw_data['symbol'] = initial_mint_record.get('token_symbol', '')
2838
  raw_data['token_uri'] = initial_mint_record.get('token_uri', '')
2839
+ raw_total_supply = initial_mint_record.get('total_supply', DEFAULT_TOTAL_SUPPLY_RAW)
2840
+ raw_token_decimals = initial_mint_record.get('token_decimals', DEFAULT_TOKEN_DECIMALS)
2841
+ raw_data['total_supply'] = (
2842
+ int(raw_total_supply) if raw_total_supply and int(raw_total_supply) > 0 else DEFAULT_TOTAL_SUPPLY_RAW
2843
+ )
2844
+ raw_data['decimals'] = (
2845
+ int(raw_token_decimals) if raw_token_decimals is not None and int(raw_token_decimals) >= 0 else DEFAULT_TOKEN_DECIMALS
2846
+ )
2847
  raw_data['protocol'] = initial_mint_record.get('protocol', 1)
2848
 
2849
  def _timestamp_to_order_value(ts_value) -> float:
 
2872
  key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
2873
  )
2874
 
2875
+ min_context_trades = self.min_trades
2876
  if len(all_trades_sorted) < (min_context_trades + 1):
2877
  print(f" SKIP: Not enough trades ({len(all_trades_sorted)}) for {token_address}")
2878
  return []
 
2959
  total_supply_raw = int(raw_total_supply)
2960
  token_decimals = int(raw_decimals)
2961
  if total_supply_raw <= 0:
2962
+ total_supply_raw = DEFAULT_TOTAL_SUPPLY_RAW
2963
  if token_decimals < 0:
2964
+ token_decimals = DEFAULT_TOKEN_DECIMALS
2965
  token_scale = 10 ** token_decimals
2966
 
2967
  def _strict_int(v: Any, field_name: str) -> int:
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:935233e4d7669b2a25173d7ae164317e85f1a5e8b0fc1d8d1832ab0893fca471
3
- size 19258
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df78e2b44dd97a148be762f91e3b00f397651f8e7e43ee21f938492291fdfa3a
3
+ size 83447
resume.md ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Resume
2
+
3
+ ## Main conclusions from this chat
4
+
5
+ 1. The main issue is data/sample construction, not just checkpoint choice.
6
+ - `class_id` is token-level.
7
+ - labels are context-level and depend on sampled `T_cutoff`.
8
+ - balanced token classes do not imply balanced future outcomes.
9
+ - the model can easily learn an over-negative prior if the cache is built naively.
10
+
11
+ 2. Cache balancing must happen at cache generation time.
12
+ - train-time weighting is too late to fix disk waste or missing context diversity.
13
+ - the cache should control:
14
+ - class balance
15
+ - good/bad context balance within class
16
+
17
+ 3. Validation needed token isolation, not just class balancing.
18
+ - same token appearing in train and val through different contexts makes validation misleading.
19
+
20
+ 4. Movement-threshold circularity is not the main blocker anymore.
21
+ - movement labels are downstream of realized returns.
22
+ - movement thresholds should not control cache construction.
23
+
24
+ 5. OHLC is important, but current usage looks like trend/regime summary, not true pattern detection.
25
+ - the chart branch is being used.
26
+ - but probe tests suggest it is mostly acting as continuation/trend context.
27
+ - not as breakout / support-resistance / head-and-shoulders intelligence.
28
+
29
+ 6. The current code is still regression-first.
30
+ - main target is multi-horizon return prediction.
31
+ - there is also:
32
+ - quality head
33
+ - movement head
34
+ - calling movement auxiliary only matters if its loss contribution is actually secondary.
35
+
36
+ ## What was implemented in code
37
+
38
+ ### 1. Validation split
39
+ - `train.py`
40
+ - validation split was changed to group by token identity (`source_token` / `token_address`) instead of only `class_id`.
41
+
42
+ ### 2. Task tracker
43
+ - `TASK_LIST.md`
44
+ - created as the running checklist for this work.
45
+
46
+ ### 3. Context weighting signal
47
+ - `data/data_loader.py`
48
+ - added a context-quality summary derived from realized multi-horizon returns.
49
+ - current code computes:
50
+ - `context_score`
51
+ - `context_bucket` (`good` / `bad`)
52
+ - this is used for weighting and cache metadata.
53
+
54
+ ### 4. Cached dataset weighting
55
+ - `data/data_loader.py`
56
+ - sampler weights now account for:
57
+ - class balance
58
+ - good/bad context balance inside class
59
+
60
+ ### 5. Weighted cache generation
61
+ - `cache_dataset.py`
62
+ - canonical builder now writes context cache only.
63
+ - no `cache_mode`.
64
+ - no `max_samples`.
65
+ - cache budget is context-based:
66
+ - `--target_contexts_total`
67
+ - or `--target_contexts_per_class`
68
+ - quota-driven acceptance is implemented before save:
69
+ - level 1: class quotas
70
+ - level 2: good/bad quotas within class
71
+
72
+ ### 6. Metadata rebuild
73
+ - `scripts/rebuild_metadata.py`
74
+ - now rebuilds:
75
+ - `file_class_map`
76
+ - `file_context_bucket_map`
77
+ - `file_context_summary_map`
78
+
79
+ ### 7. `min_trades`
80
+ - re-added properly
81
+ - no longer a dead CLI arg
82
+ - now actually controls dataset/context eligibility thresholds
83
+
84
+ ### 8. Evaluate script OHLC probes
85
+ - `scripts/evaluate_sample.py`
86
+ - added these OHLC probe modes:
87
+ - `ohlc_reverse`
88
+ - `ohlc_shuffle_chunks`
89
+ - `ohlc_mask_recent`
90
+ - `ohlc_trend_only`
91
+ - `ohlc_summary_shuffle`
92
+ - `ohlc_detrend`
93
+ - `ohlc_smooth`
94
+
95
+ ### 9. Bad mint metadata fallback
96
+ - `data/data_loader.py`
97
+ - if mint metadata has invalid or zero supply/decimals, it now defaults to:
98
+ - `total_supply = 1000000000000000`
99
+ - `token_decimals = 6`
100
+
101
+ ## Current cache-builder interface
102
+
103
+ `cache_dataset.py` now uses a canonical context-cache path only.
104
+
105
+ Relevant args:
106
+ - `--output_dir`
107
+ - `--start_date`
108
+ - `--min_trade_usd`
109
+ - `--min_trades`
110
+ - `--context_length`
111
+ - `--samples_per_token`
112
+ - `--target_contexts_per_class`
113
+ - `--target_contexts_total`
114
+ - `--good_ratio_nonzero`
115
+ - `--good_ratio_class0`
116
+ - `--num_workers`
117
+ - DB connection args
118
+
119
+ Rules:
120
+ - choose exactly one of:
121
+ - `--target_contexts_per_class`
122
+ - `--target_contexts_total`
123
+ - if quota-driven caching is used, current implementation expects:
124
+ - `--num_workers 1`
125
+
126
+ `--samples_per_token` is still present.
127
+ - It is not a cache budget.
128
+ - It is a candidate-generation knob.
129
+ - It may still be removable later if a better attempt-based planner replaces it.
130
+
131
+ ## Important conceptual corrections from this chat
132
+
133
+ 1. Binary context type matters operationally, but a naive binary rule is dangerous.
134
+ - examples like `+1%` then `-20%` show why simplistic rules fail.
135
+ - context typing should come from realized multi-horizon behavior, not one crude shortcut.
136
+
137
+ 2. Patching alone does not force pattern learning.
138
+ - it only makes local pattern use possible.
139
+ - the model can still rely on trend shortcuts unless the representation and training setup make that harder.
140
+
141
+ 3. Support/resistance may be inferable from current chart inputs in principle.
142
+ - but current encoder likely compresses too early and learns an easier shortcut instead.
143
+
144
+ 4. For this question, the bottleneck is not just “what loss?”
145
+ - but also:
146
+ - chart input representation
147
+ - encoder compression
148
+ - whether the model can preserve local 1s structure
149
+
150
+ ## OHLC probe findings from this chat
151
+
152
+ The key probe results, repeated across runs, were:
153
+ - `ohlc_detrend` had the largest impact
154
+ - `ohlc_trend_only` had the second-largest impact
155
+ - `ohlc_smooth`, `shuffle`, `summary_shuffle`, `reverse`, and `mask_recent` had very small impact
156
+
157
+ Interpretation:
158
+ - the model is using OHLC mostly as:
159
+ - broad trend/regime context
160
+ - continuation-style directional signal
161
+ - not as:
162
+ - local chart pattern detector
163
+ - support/resistance-aware trader logic
164
+ - breakout/rejection/fair-value-gap style reasoning
165
+
166
+ This strongly suggests many of the model’s bearish/bad predictions are driven by trend continuation behavior from the chart branch.
167
+
168
+ ## What was explicitly rejected or corrected
169
+
170
+ - do not keep treating `cache_mode` as a real concept
171
+ - do not let `max_samples` and context-budget args overlap
172
+ - do not call something “auxiliary” if its weight can dominate optimization
173
+ - do not assume OHLC importance means chart-pattern understanding
174
+ - do not answer architecture questions by guessing from intention; inspect actual code
175
+
176
+ ## What still matters next
177
+
178
+ 1. verify token-overlap assertions are enforced in train/val, not just token-grouped split logic
179
+ 2. rebuild cache with the new quota-based builder and inspect actual distributions
180
+ 3. update cache audit tooling
181
+ 4. decide whether `samples_per_token` should be replaced by a better attempt planner
182
+ 5. decide how the chart branch should evolve if the goal is trader-like 1s pattern reasoning rather than trend summary
183
+
184
+ ## Short current state
185
+
186
+ - cache construction is now much closer to the intended design
187
+ - validation logic is less broken than before
188
+ - OHLC is confirmed important
189
+ - but OHLC is currently behaving more like a trend/summary branch than a technical-pattern branch
190
+ - the codebase is still primarily training on return regression, with extra heads layered on top
scripts/evaluate_sample.py CHANGED
@@ -49,6 +49,10 @@ OHLC_PROBE_MODES = [
49
  "ohlc_reverse",
50
  "ohlc_shuffle_chunks",
51
  "ohlc_mask_recent",
 
 
 
 
52
  ]
53
 
54
  def unlog_transform(tensor):
@@ -221,6 +225,56 @@ def _chunk_permutation_indices(length, chunk_size):
221
  return out
222
 
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  def apply_ohlc_probe(batch, mode):
225
  probed = clone_batch(batch)
226
  if "ohlc_price_tensors" not in probed or probed["ohlc_price_tensors"].numel() == 0:
@@ -243,6 +297,26 @@ def apply_ohlc_probe(batch, mode):
243
  elif keep == 0:
244
  ohlc.zero_()
245
  probed["ohlc_price_tensors"] = ohlc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  return probed
248
 
@@ -683,7 +757,7 @@ def main():
683
  batch=batch,
684
  preds=full_preds,
685
  quality_pred=full_quality,
686
- direction_pred=full_direction,
687
  gt_labels=gt_labels,
688
  gt_mask=gt_mask,
689
  gt_quality=gt_quality,
@@ -701,7 +775,7 @@ def main():
701
  batch=probe_batch,
702
  preds=probe_preds,
703
  quality_pred=probe_quality,
704
- direction_pred=probe_direction,
705
  gt_labels=gt_labels,
706
  gt_mask=gt_mask,
707
  gt_quality=gt_quality,
@@ -717,7 +791,7 @@ def main():
717
  batch=ablated_batch,
718
  preds=ablated_preds,
719
  quality_pred=ablated_quality,
720
- direction_pred=ablated_direction,
721
  gt_labels=gt_labels,
722
  gt_mask=gt_mask,
723
  gt_quality=gt_quality,
 
49
  "ohlc_reverse",
50
  "ohlc_shuffle_chunks",
51
  "ohlc_mask_recent",
52
+ "ohlc_trend_only",
53
+ "ohlc_summary_shuffle",
54
+ "ohlc_detrend",
55
+ "ohlc_smooth",
56
  ]
57
 
58
  def unlog_transform(tensor):
 
225
  return out
226
 
227
 
228
+ def _moving_average_1d(series, kernel_size):
229
+ if kernel_size <= 1 or series.numel() == 0:
230
+ return series
231
+ pad = kernel_size // 2
232
+ kernel = torch.ones(1, 1, kernel_size, device=series.device, dtype=series.dtype) / float(kernel_size)
233
+ x = series.view(1, 1, -1)
234
+ x = torch.nn.functional.pad(x, (pad, pad), mode="replicate")
235
+ smoothed = torch.nn.functional.conv1d(x, kernel)
236
+ return smoothed.view(-1)[: series.numel()]
237
+
238
+
239
+ def _linear_trend(series):
240
+ if series.numel() <= 1:
241
+ return series.clone()
242
+ start = series[0]
243
+ end = series[-1]
244
+ steps = torch.linspace(0.0, 1.0, series.numel(), device=series.device, dtype=series.dtype)
245
+ return start + (end - start) * steps
246
+
247
+
248
+ def _summary_preserving_shuffle(series, chunk_size=20):
249
+ length = series.numel()
250
+ if length <= 2:
251
+ return series
252
+ chunks = []
253
+ interior_start = 1
254
+ interior_end = length - 1
255
+ for i in range(interior_start, interior_end, chunk_size):
256
+ chunks.append(series[i:min(i + chunk_size, interior_end)].clone())
257
+ if len(chunks) <= 1:
258
+ return series
259
+ reordered = list(reversed(chunks))
260
+ out = series.clone()
261
+ cursor = 1
262
+ for chunk in reordered:
263
+ out[cursor:cursor + chunk.numel()] = chunk
264
+ cursor += chunk.numel()
265
+ out[0] = series[0]
266
+ out[-1] = series[-1]
267
+ return out
268
+
269
+
270
+ def _apply_per_series(ohlc, transform_fn):
271
+ out = ohlc.clone()
272
+ for batch_idx in range(out.shape[0]):
273
+ for channel_idx in range(out.shape[1]):
274
+ out[batch_idx, channel_idx] = transform_fn(out[batch_idx, channel_idx])
275
+ return out
276
+
277
+
278
  def apply_ohlc_probe(batch, mode):
279
  probed = clone_batch(batch)
280
  if "ohlc_price_tensors" not in probed or probed["ohlc_price_tensors"].numel() == 0:
 
297
  elif keep == 0:
298
  ohlc.zero_()
299
  probed["ohlc_price_tensors"] = ohlc
300
+ elif mode == "ohlc_trend_only":
301
+ probed["ohlc_price_tensors"] = _apply_per_series(ohlc, _linear_trend)
302
+ elif mode == "ohlc_summary_shuffle":
303
+ probed["ohlc_price_tensors"] = _apply_per_series(
304
+ ohlc,
305
+ lambda series: _summary_preserving_shuffle(series, chunk_size=20),
306
+ )
307
+ elif mode == "ohlc_detrend":
308
+ def detrend(series):
309
+ trend = _linear_trend(series)
310
+ detrended = series - trend + series[0]
311
+ detrended[0] = series[0]
312
+ detrended[-1] = series[0]
313
+ return detrended
314
+ probed["ohlc_price_tensors"] = _apply_per_series(ohlc, detrend)
315
+ elif mode == "ohlc_smooth":
316
+ probed["ohlc_price_tensors"] = _apply_per_series(
317
+ ohlc,
318
+ lambda series: _moving_average_1d(series, kernel_size=11),
319
+ )
320
 
321
  return probed
322
 
 
757
  batch=batch,
758
  preds=full_preds,
759
  quality_pred=full_quality,
760
+ movement_pred=full_direction,
761
  gt_labels=gt_labels,
762
  gt_mask=gt_mask,
763
  gt_quality=gt_quality,
 
775
  batch=probe_batch,
776
  preds=probe_preds,
777
  quality_pred=probe_quality,
778
+ movement_pred=probe_direction,
779
  gt_labels=gt_labels,
780
  gt_mask=gt_mask,
781
  gt_quality=gt_quality,
 
791
  batch=ablated_batch,
792
  preds=ablated_preds,
793
  quality_pred=ablated_quality,
794
+ movement_pred=ablated_direction,
795
  gt_labels=gt_labels,
796
  gt_mask=gt_mask,
797
  gt_quality=gt_quality,
scripts/rebuild_metadata.py CHANGED
@@ -5,6 +5,7 @@ import json
5
  from pathlib import Path
6
  from tqdm import tqdm
7
  from collections import defaultdict
 
8
 
9
  def rebuild_metadata(cache_dir="data/cache"):
10
  cache_path = Path(cache_dir)
@@ -15,10 +16,13 @@ def rebuild_metadata(cache_dir="data/cache"):
15
  print("No .pt files found!")
16
  return
17
 
18
- print(f"Found {len(files)} files. Reading class IDs...")
19
 
20
  file_class_map = {}
 
 
21
  class_distribution = defaultdict(int)
 
22
 
23
  for f in tqdm(files):
24
  try:
@@ -26,14 +30,24 @@ def rebuild_metadata(cache_dir="data/cache"):
26
  # But torch.load loads everything. To be safe/fast, we just load on CPU.
27
  data = torch.load(f, map_location="cpu", weights_only=False)
28
  cid = data.get("class_id", 0)
 
29
  file_class_map[f.name] = cid
 
 
30
  class_distribution[cid] += 1
 
31
  except Exception as e:
32
  print(f"Error reading {f.name}: {e}")
33
 
34
  output_data = {
35
  'file_class_map': file_class_map,
 
 
36
  'class_distribution': {str(k): v for k, v in class_distribution.items()},
 
 
 
 
37
  # These are informational, setting defaults to avoid breaking if loader checks them
38
  'num_workers': 1,
39
  'horizons_seconds': [300, 900, 1800, 3600, 7200], # From user's pre_cache.sh
 
5
  from pathlib import Path
6
  from tqdm import tqdm
7
  from collections import defaultdict
8
+ from data.data_loader import summarize_context_window
9
 
10
  def rebuild_metadata(cache_dir="data/cache"):
11
  cache_path = Path(cache_dir)
 
16
  print("No .pt files found!")
17
  return
18
 
19
+ print(f"Found {len(files)} files. Reading class IDs and context summaries...")
20
 
21
  file_class_map = {}
22
+ file_context_bucket_map = {}
23
+ file_context_summary_map = {}
24
  class_distribution = defaultdict(int)
25
+ context_distribution = defaultdict(lambda: defaultdict(int))
26
 
27
  for f in tqdm(files):
28
  try:
 
30
  # But torch.load loads everything. To be safe/fast, we just load on CPU.
31
  data = torch.load(f, map_location="cpu", weights_only=False)
32
  cid = data.get("class_id", 0)
33
+ context_summary = summarize_context_window(data.get("labels"), data.get("labels_mask"))
34
  file_class_map[f.name] = cid
35
+ file_context_bucket_map[f.name] = context_summary["context_bucket"]
36
+ file_context_summary_map[f.name] = context_summary
37
  class_distribution[cid] += 1
38
+ context_distribution[cid][context_summary["context_bucket"]] += 1
39
  except Exception as e:
40
  print(f"Error reading {f.name}: {e}")
41
 
42
  output_data = {
43
  'file_class_map': file_class_map,
44
+ 'file_context_bucket_map': file_context_bucket_map,
45
+ 'file_context_summary_map': file_context_summary_map,
46
  'class_distribution': {str(k): v for k, v in class_distribution.items()},
47
+ 'context_distribution': {
48
+ str(k): {bucket: count for bucket, count in bucket_counts.items()}
49
+ for k, bucket_counts in context_distribution.items()
50
+ },
51
  # These are informational, setting defaults to avoid breaking if loader checks them
52
  'num_workers': 1,
53
  'horizons_seconds': [300, 900, 1800, 3600, 7200], # From user's pre_cache.sh
train.py CHANGED
@@ -234,54 +234,77 @@ def collator_like_targets(labels, labels_mask, movement_label_config: Optional[D
234
 
235
  def create_balanced_split(dataset, n_val_per_class: int = 1, seed: int = 42):
236
  """
237
- Create train/val split with balanced classes in validation set.
238
- Uses dataset's internal file_class_map for speed (no file loading).
239
- Returns (train_indices, val_indices, class_to_indices).
 
 
 
 
 
240
  """
241
  import random
242
- random.seed(seed)
243
 
244
- # Group indices by class_id - use dataset's existing map if available
245
  class_to_indices = defaultdict(list)
 
246
 
247
- # Fast path: use dataset's sample_labels (aligned with __getitem__)
248
- if hasattr(dataset, 'sample_labels') and dataset.sample_labels:
249
- for idx, class_id in enumerate(dataset.sample_labels):
250
- class_to_indices[class_id].append(idx)
251
- # Legacy path: use dataset's file_class_map (for 1-file-1-sample datasets)
252
- elif hasattr(dataset, 'file_class_map') and dataset.file_class_map:
253
- for idx, cached_file in enumerate(dataset.cached_files):
254
- # file_class_map uses filename strings as keys, cached_files are Path objects
255
- fname = cached_file.name if hasattr(cached_file, 'name') else str(cached_file)
256
- class_id = dataset.file_class_map.get(fname, 0)
257
- class_to_indices[class_id].append(idx)
258
- else:
259
- # Fallback: load from files (slow but works)
260
- logger.info("No file_class_map found, loading class IDs from files (this may take a while)...")
261
- import torch
262
- for idx in range(len(dataset.cached_files)):
263
- try:
264
- cached_item = torch.load(dataset.cached_files[idx], map_location="cpu", weights_only=False)
265
- class_id = cached_item.get("class_id", 0)
266
- class_to_indices[class_id].append(idx)
267
- except Exception:
268
- class_to_indices[0].append(idx)
269
 
270
  train_indices = []
271
  val_indices = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
- # For each class, take n_val_per_class samples for validation
274
- for class_id, indices in class_to_indices.items():
275
- random.shuffle(indices)
276
- n_val = min(len(indices), n_val_per_class) # Ensure we don't take more than we have
277
- val_indices.extend(indices[:n_val])
278
- train_indices.extend(indices[n_val:])
279
-
280
- # Shuffle both sets
281
- random.shuffle(train_indices)
282
- random.shuffle(val_indices)
283
 
284
- return train_indices, val_indices, class_to_indices
285
 
286
 
287
  def run_validation(model, val_dataloader, accelerator, quantiles, quality_loss_fn, vocab):
@@ -688,18 +711,27 @@ def main() -> None:
688
  raise RuntimeError("Dataset is empty.")
689
 
690
  # --- NEW: Create balanced train/val split ---
691
- logger.info(f"Creating balanced split with {args.val_samples_per_class} validation samples per class...")
692
- train_indices, val_indices, class_distribution = create_balanced_split(
 
 
693
  dataset, n_val_per_class=args.val_samples_per_class, seed=seed
694
  )
695
 
696
- # Log class distribution (use set for O(1) lookup)
697
- train_set = set(train_indices)
698
  logger.info(f"Total samples: {len(dataset)}, Train: {len(train_indices)}, Val: {len(val_indices)}")
699
  for class_id, indices in sorted(class_distribution.items()):
700
- n_val = min(len(indices), args.val_samples_per_class)
701
- n_train = len(indices) - n_val
702
- logger.info(f" Class {class_id}: {len(indices)} total (~{n_train} train, {n_val} val)")
 
 
 
 
 
 
 
 
 
703
 
704
  # --- Compute class weights for loss weighting ---
705
  num_classes = max(class_distribution.keys()) + 1 if class_distribution else 7
 
234
 
235
def create_balanced_split(dataset, n_val_per_class: int = 1, seed: int = 42):
    """
    Create a token-grouped train/val split.

    Validation is selected by token identity first, then balanced by class:
    for each class we hold out whole tokens, and every cached window that
    belongs to a held-out token goes to validation. This prevents the same
    token from appearing in both splits through different cached windows.

    Args:
        dataset: Cached dataset exposing a ``cached_files`` sequence of paths
            to torch-saved sample dicts carrying ``class_id`` and
            ``source_token`` (or legacy ``token_address``) metadata.
        n_val_per_class: Number of validation *tokens* (not samples) to hold
            out per class. If a class has fewer tokens, all of them go to
            validation (leaving that class with no training samples).
        seed: Seed for a private RNG so the split is reproducible.

    Returns:
        Tuple ``(train_indices, val_indices, class_to_indices, split_stats)``
        where ``class_to_indices`` maps class_id -> all sample indices of that
        class, and ``split_stats[class_id]`` reports per-split sample/token
        counts.

    Raises:
        RuntimeError: If the dataset has no ``cached_files``, a cached sample
            cannot be read, or a sample lacks token-identity metadata.
    """
    import random

    rng = random.Random(seed)
    class_to_indices = defaultdict(list)
    # class_id -> token -> [sample indices]: groups cached windows by source
    # token so a whole token moves to exactly one split.
    class_to_token_groups = defaultdict(lambda: defaultdict(list))

    if not hasattr(dataset, "cached_files"):
        raise RuntimeError("Token-grouped split requires a cached dataset with cached_files.")

    for idx, cached_file in enumerate(dataset.cached_files):
        class_id, source_token = _load_sample_identity(cached_file)
        class_to_indices[class_id].append(idx)
        class_to_token_groups[class_id][source_token].append(idx)

    train_indices = []
    val_indices = []
    split_stats = {}

    for class_id, token_groups in class_to_token_groups.items():
        token_items = list(token_groups.items())
        rng.shuffle(token_items)

        # Hold out whole tokens, capped at what the class actually has.
        n_val_tokens = min(len(token_items), n_val_per_class)
        val_token_items = token_items[:n_val_tokens]
        train_token_items = token_items[n_val_tokens:]

        class_val_indices = [i for _, indices in val_token_items for i in indices]
        class_train_indices = [i for _, indices in train_token_items for i in indices]

        val_indices.extend(class_val_indices)
        train_indices.extend(class_train_indices)
        split_stats[class_id] = {
            "total_samples": len(class_to_indices[class_id]),
            "total_tokens": len(token_items),
            "val_samples": len(class_val_indices),
            "val_tokens": len(val_token_items),
            "train_samples": len(class_train_indices),
            "train_tokens": len(train_token_items),
        }

    rng.shuffle(train_indices)
    rng.shuffle(val_indices)

    return train_indices, val_indices, class_to_indices, split_stats


def _load_sample_identity(cached_file):
    """Read ``(class_id, source_token)`` from one cached sample for split planning.

    Raises RuntimeError when the file cannot be read or when both
    ``source_token`` and ``token_address`` are missing, since a token-isolated
    split cannot be built without a stable token identity.
    """
    fname = cached_file.name if hasattr(cached_file, "name") else str(cached_file)
    try:
        # weights_only=False is required because cached items hold plain
        # metadata dicts, not just tensors. NOTE(review): this deserializes
        # arbitrary pickles — only load caches produced by this project.
        cached_item = torch.load(cached_file, map_location="cpu", weights_only=False)
    except Exception as exc:
        raise RuntimeError(f"Failed to read cached sample '{fname}' for split planning: {exc}") from exc

    class_id = int(cached_item.get("class_id", 0))
    source_token = cached_item.get("source_token") or cached_item.get("token_address")
    if not source_token:
        raise RuntimeError(
            f"Cached sample '{fname}' is missing both 'source_token' and 'token_address'; "
            "cannot build token-isolated validation split."
        )
    return class_id, str(source_token)
308
 
309
 
310
  def run_validation(model, val_dataloader, accelerator, quantiles, quality_loss_fn, vocab):
 
711
  raise RuntimeError("Dataset is empty.")
712
 
713
  # --- NEW: Create balanced train/val split ---
714
+ logger.info(
715
+ f"Creating token-grouped split with {args.val_samples_per_class} validation tokens per class..."
716
+ )
717
+ train_indices, val_indices, class_distribution, split_stats = create_balanced_split(
718
  dataset, n_val_per_class=args.val_samples_per_class, seed=seed
719
  )
720
 
 
 
721
  logger.info(f"Total samples: {len(dataset)}, Train: {len(train_indices)}, Val: {len(val_indices)}")
722
  for class_id, indices in sorted(class_distribution.items()):
723
+ stats = split_stats.get(class_id, {})
724
+ logger.info(
725
+ " Class %s: %s samples across %s tokens (~%s train samples / %s val samples, "
726
+ "%s train tokens / %s val tokens)",
727
+ class_id,
728
+ len(indices),
729
+ stats.get("total_tokens", 0),
730
+ stats.get("train_samples", 0),
731
+ stats.get("val_samples", 0),
732
+ stats.get("train_tokens", 0),
733
+ stats.get("val_tokens", 0),
734
+ )
735
 
736
  # --- Compute class weights for loss weighting ---
737
  num_classes = max(class_distribution.keys()) + 1 if class_distribution else 7