Upload 31 files
Browse files- .gitattributes +25 -0
- Vector-HaSH-Simple-Paper.tex +155 -0
- Vector-HaSH_for-6yr-old.pdf +3 -0
- XAUUSDc_M3_data.csv +0 -0
- chart.png +3 -0
- data_fetcher.py +324 -0
- image-png-pages_research-paper/page_01.png +3 -0
- image-png-pages_research-paper/page_02.png +3 -0
- image-png-pages_research-paper/page_03.png +3 -0
- image-png-pages_research-paper/page_04.png +3 -0
- image-png-pages_research-paper/page_05.png +3 -0
- image-png-pages_research-paper/page_06.png +3 -0
- image-png-pages_research-paper/page_07.png +3 -0
- image-png-pages_research-paper/page_08.png +3 -0
- image-png-pages_research-paper/page_09.png +3 -0
- image-png-pages_research-paper/page_10.png +3 -0
- image-png-pages_research-paper/page_11.png +3 -0
- image-png-pages_research-paper/page_12.png +3 -0
- image-png-pages_research-paper/page_13.png +3 -0
- image-png-pages_research-paper/page_14.png +3 -0
- image-png-pages_research-paper/page_15.png +3 -0
- image-png-pages_research-paper/page_16.png +3 -0
- image-png-pages_research-paper/page_17.png +3 -0
- image-png-pages_research-paper/page_18.png +3 -0
- image-png-pages_research-paper/page_19.png +3 -0
- image-png-pages_research-paper/page_20.png +3 -0
- image-png-pages_research-paper/page_21.png +3 -0
- image-png-pages_research-paper/page_22.png +3 -0
- implementation_plan.md +37 -0
- long_process.png +0 -0
- reference.pdf +3 -0
- vector_hash_trader_colab.py +380 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
chart.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
image-png-pages_research-paper/page_01.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
image-png-pages_research-paper/page_02.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
image-png-pages_research-paper/page_03.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
image-png-pages_research-paper/page_04.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
image-png-pages_research-paper/page_05.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
image-png-pages_research-paper/page_06.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
image-png-pages_research-paper/page_07.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
image-png-pages_research-paper/page_08.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
image-png-pages_research-paper/page_09.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
image-png-pages_research-paper/page_10.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
image-png-pages_research-paper/page_11.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
image-png-pages_research-paper/page_12.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
image-png-pages_research-paper/page_13.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
image-png-pages_research-paper/page_14.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
image-png-pages_research-paper/page_15.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
image-png-pages_research-paper/page_16.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
image-png-pages_research-paper/page_17.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
image-png-pages_research-paper/page_18.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
image-png-pages_research-paper/page_19.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
image-png-pages_research-paper/page_20.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
image-png-pages_research-paper/page_21.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
image-png-pages_research-paper/page_22.png filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
reference.pdf filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
Vector-HaSH_for-6yr-old.pdf filter=lfs diff=lfs merge=lfs -text
|
Vector-HaSH-Simple-Paper.tex
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
\documentclass[conference]{IEEEtran}
|
| 2 |
+
\usepackage{cite}
|
| 3 |
+
\usepackage{amsmath,amssymb,amsfonts}
|
| 4 |
+
\usepackage{algorithmic}
|
| 5 |
+
\usepackage{graphicx}
|
| 6 |
+
\usepackage{textcomp}
|
| 7 |
+
\usepackage{xcolor}
|
| 8 |
+
\usepackage{listings}
|
| 9 |
+
\usepackage{hyperref}
|
| 10 |
+
|
| 11 |
+
% Python code listing style
|
| 12 |
+
\lstset{
|
| 13 |
+
language=Python,
|
| 14 |
+
basicstyle=\ttfamily\small,
|
| 15 |
+
keywordstyle=\color{blue},
|
| 16 |
+
commentstyle=\color{green!50!black},
|
| 17 |
+
stringstyle=\color{red},
|
| 18 |
+
showstringspaces=false,
|
| 19 |
+
numbers=left,
|
| 20 |
+
numberstyle=\tiny\color{gray},
|
| 21 |
+
frame=single,
|
| 22 |
+
breaklines=true
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
|
| 26 |
+
T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
|
| 27 |
+
|
| 28 |
+
\begin{document}
|
| 29 |
+
|
| 30 |
+
\title{Vector-HaSH: A Magical Memory Palace for the Brain\\
|
| 31 |
+
\large Explained for Smart 6-Year-Olds!}
|
| 32 |
+
|
| 33 |
+
\author{\IEEEauthorblockN{Agent-Self Swarm Intelligence}}
|
| 34 |
+
|
| 35 |
+
\maketitle
|
| 36 |
+
|
| 37 |
+
\begin{abstract}
|
| 38 |
+
Imagine your brain is a giant Lego castle. How does it remember a supersized recipe for baking 10,000 cookies without forgetting the first step? Older models, like the Hopfield network, try to squish every single cookie recipe into one box, and eventually, the box explodes (we call this the ``memory cliff''). This paper talks about Vector-HaSH, a shiny new tool that fixes this problem! It splits memory into two jobs: a ``scaffold'' (like a treasure map of empty boxes) and the ``content'' (the actual treasure inside the boxes). By placing memories on this map using a simple 2D steering wheel (velocity), the brain can remember tens of thousands of things in a row without breaking a sweat!
|
| 39 |
+
\end{abstract}
|
| 40 |
+
|
| 41 |
+
\section{Introduction}
|
| 42 |
+
Have you ever tried to memorize a very long grocery list? If you put milk, eggs, carrots, and 50 other things in your pocket all at once, your pocket might rip. In neuroscience (the study of brains), scientists noticed that computer memory models (like Hopfield networks) do exactly this. After seeing too many patterns, they suddenly forget EVERYTHING. This catastrophic failure is known as the \textbf{memory cliff}.
|
| 43 |
+
|
| 44 |
+
But your real brain does not do this! Your brain uses two magic helpers:
|
| 45 |
+
1. \textbf{Grid Cells:} These are like special GPS trackers in your brain. They make a map of invisible tiles so you always know where you are standing.
|
| 46 |
+
2. \textbf{Hippocampus (HPC):} This is the memory vault. It stores the rich, colorful pictures of what you see (like a giant chocolate cake).
|
| 47 |
+
|
| 48 |
+
\textbf{Vector-HaSH} (which stands for Vector-Hippocampal and Scaffold Hypothesis) is a clever system that lets these two helpers hold hands. Instead of memorizing the whole cake at once, the Grid Cells create a path (a scaffold) and the Hippocampus attaches the cake to one of the steps on the path. To get to the next memory, you just turn the steering wheel (velocity vector) and move to the next tile!
|
| 49 |
+
|
| 50 |
+
\section{Related Work}
|
| 51 |
+
Before Vector-HaSH, scientists believed in the classic Hopfield Network. Think of it as a magical rubber band ball. You stretch it with new memories. But if you stretch it too many times, SNAP! The rubber bands break.
|
| 52 |
+
|
| 53 |
+
Other researchers tried to fix it by using ``sparse'' inputs (putting only tiny rubber bands). But even then, the capacity scaling was limited. You could only store $O(N)$ memories, where $N$ is the number of neurons. If you wanted to remember $10,000$ steps of a dance routine, you needed millions of brain cells. Vector-HaSH changes the game entirely by using grid cell networks as a sequence scaffold, escaping the dreaded memory cliff.
|
| 54 |
+
|
| 55 |
+
\section{Proposed Method}
|
| 56 |
+
Imagine making a long train out of toy cars.
|
| 57 |
+
In the old way, every toy car had to carry the heavy load of remembering exactly which car came next by staring at the whole car.
|
| 58 |
+
|
| 59 |
+
In Vector-HaSH, the train tracks themselves (Grid Cells) tell you where to go next. All you need is a tiny steering wheel (a 2-dimensional velocity) to move forward!
|
| 60 |
+
|
| 61 |
+
\subsection{The Three Big Steps}
|
| 62 |
+
1. \textbf{The Grid Space (The Map):} Think of it like a giant chessboard. You are a knight jumping across it. The board is made of a few tiny, connected circles (modules).
|
| 63 |
+
2. \textbf{The Hippocampus (The Polaroid Camera):} For every square on the chessboard, the camera takes a snapshot and remembers the sensory details.
|
| 64 |
+
3. \textbf{Velocity Shift (The Steering Wheel):} To remember the next scene in the movie, a very tiny, simple system (a Multi-Layer Perceptron or MLP) just gives a ``push'' (velocity vector) to the Grid Cells. The Grid Cells step forward, and the Hippocampus wakes up the next memory!
|
| 65 |
+
|
| 66 |
+
By doing this, the memory capacity goes UP exponentially! It can remember 14,000 steps easily, whereas the old model failed at 30 steps!
|
| 67 |
+
|
| 68 |
+
\section{Code Examples: Tiny and Pythonic}
|
| 69 |
+
Let's look at the real code logic for Vector-HaSH. We will make tiny, runnable scripts so you can build your own mini-brain at home!
|
| 70 |
+
|
| 71 |
+
\subsection{Example 1: Moving the Grid Cells}
|
| 72 |
+
How does the brain know where to go next? It uses a "velocity" to shift the grid. Here is a tiny Python example:
|
| 73 |
+
|
| 74 |
+
\begin{lstlisting}[language=Python]
|
| 75 |
+
import numpy as np
|
| 76 |
+
|
| 77 |
+
# Line 1: Imagine our grid map has 5 spots (0 to 4).
|
| 78 |
+
grid_map_size = 5
|
| 79 |
+
|
| 80 |
+
# Line 2: You are currently sitting at spot number 2.
|
| 81 |
+
current_grid_state = 2
|
| 82 |
+
|
| 83 |
+
# Line 3: The steering wheel tells us to move forward by 1 step!
|
| 84 |
+
velocity_shift = 1
|
| 85 |
+
|
| 86 |
+
# Line 4: We calculate the new spot! We use the modulo operator (%),
|
| 87 |
+
# which acts like a circle. If you step past 4, you go back to 0!
|
| 88 |
+
next_grid_state = (current_grid_state + velocity_shift) % grid_map_size
|
| 89 |
+
|
| 90 |
+
# Line 5: Print the result! The car moved to spot 3!
|
| 91 |
+
print(f"We drove to spot: {next_grid_state}")
|
| 92 |
+
\end{lstlisting}
|
| 93 |
+
|
| 94 |
+
\emph{Explanation for a 6-year old:}
|
| 95 |
+
\begin{itemize}
|
| 96 |
+
\item \textbf{Line 1:} We build a tiny race track with 5 spaces.
|
| 97 |
+
\item \textbf{Line 2:} We put our toy car on space number 2.
|
| 98 |
+
\item \textbf{Line 3:} We press the gas pedal to move 1 space.
|
| 99 |
+
\item \textbf{Line 4:} We calculate where the car lands. Because the track is a circle, if we go past the end, we warp back to the start!
|
| 100 |
+
\item \textbf{Line 5:} We tell the world where our car parked!
|
| 101 |
+
\end{itemize}
|
| 102 |
+
|
| 103 |
+
\subsection{Example 2: Hippocampus Remembering the Cake}
|
| 104 |
+
Now that we are on a new grid spot, the Hippocampus needs to hook a memory onto it. We use a matrix multiplication (which is just a fancy way of giving high-fives).
|
| 105 |
+
|
| 106 |
+
\begin{lstlisting}[language=Python]
|
| 107 |
+
import numpy as np
|
| 108 |
+
|
| 109 |
+
# Line 1: This is our grid spot (Spot 3). It is turned ON (1).
|
| 110 |
+
grid_activity = np.array([0, 0, 0, 1, 0])
|
| 111 |
+
|
| 112 |
+
# Line 2: These are the memory weights.
|
| 113 |
+
# They decide what picture appears when a spot is ON.
|
| 114 |
+
hippocampus_weights = np.array([
|
| 115 |
+
[0.1, 0.2], # Spot 0 -> sees an apple
|
| 116 |
+
[0.5, 0.9], # Spot 1 -> sees a dog
|
| 117 |
+
[0.8, 0.1], # Spot 2 -> sees a car
|
| 118 |
+
[0.9, 0.9], # Spot 3 -> sees a GIANT CAKE!
|
| 119 |
+
[0.3, 0.4] # Spot 4 -> sees a tree
|
| 120 |
+
])
|
| 121 |
+
|
| 122 |
+
# Line 3: We multiply our current spot by the weights.
|
| 123 |
+
# It acts like a magic flashlight revealing the picture.
|
| 124 |
+
recalled_memory = grid_activity.dot(hippocampus_weights)
|
| 125 |
+
|
| 126 |
+
# Line 4: Boom! We see the numbers [0.9, 0.9] which means CAKE!
|
| 127 |
+
print(f"I remember: {recalled_memory}")
|
| 128 |
+
\end{lstlisting}
|
| 129 |
+
|
| 130 |
+
\emph{Explanation for a 6-year old:}
|
| 131 |
+
\begin{itemize}
|
| 132 |
+
\item \textbf{Line 1:} We have a row of light switches. Only the switch for Spot 3 is turned ON.
|
| 133 |
+
\item \textbf{Line 2:} We have a magical book of secrets (weights). Each switch is glued to a different secret picture.
|
| 134 |
+
\item \textbf{Line 3:} We use `.dot()`, which is a robot taking the ON switch and pulling its secret picture out of the book.
|
| 135 |
+
\item \textbf{Line 4:} The robot shows us the picture. Yummy cake!
|
| 136 |
+
\end{itemize}
|
| 137 |
+
|
| 138 |
+
\section{Experiments}
|
| 139 |
+
The smart scientists put Vector-HaSH through a tough obstacle course:
|
| 140 |
+
1. \textbf{The Dark Room Test:} Can the grid cells still work if you turn off the lights? Yes! Even if you can't see the colorful walls (no sensory input), the steering wheel (velocity) still drives the car around the invisible grid map.
|
| 141 |
+
2. \textbf{The Mega-Marathon Test:} Can Vector-HaSH run for 14,000 steps without stumbling over its shoelaces? Yes! Even a tiny network recalled the exact sequence of 14,000 turns without making a mistake!
|
| 142 |
+
|
| 143 |
+
\section{Results}
|
| 144 |
+
Vector-HaSH scored an A+! The results showed that biological brains use a \textbf{Sequence Scaffold}.
|
| 145 |
+
If you learn a new song, you don't build a new piano. You use the same piano keys (the grid cells scaffold) and just play them in a different order! Because the brain reuses the grid cells, it saves a MASSIVE amount of energy and avoids the memory cliff. This is exactly how ``Memory Athletes'' (people who can memorize a whole deck of cards in 20 seconds) use the ``Memory Palace'' trick. They walk through a familiar house in their mind (the grid) and drop off memories in every room!
|
| 146 |
+
|
| 147 |
+
\section{Conclusion}
|
| 148 |
+
The brain is the coolest computer in the world. Instead of getting overwhelmed by remembering everything at once, it uses Grid Cells to build a map, and Hippocampus cells to take pictures along the way. Vector-HaSH proves that with a tiny 2D steering wheel (velocity), we can navigate super-long memories flawlessly. Next time you play with your Lego sets, remember: your brain is snapping together a track and placing memories on it, block by block!
|
| 149 |
+
|
| 150 |
+
\begin{thebibliography}{00}
|
| 151 |
+
\bibitem{b1} Vector-HaSH Authors. "Episodic and associative memory through grid-like scaffolds." Nature, 2024.
|
| 152 |
+
\bibitem{b2} Hopfield, J. J. "Neural networks and physical systems with emergent collective computational abilities." PNAS, 1982.
|
| 153 |
+
\end{thebibliography}
|
| 154 |
+
|
| 155 |
+
\end{document}
|
Vector-HaSH_for-6yr-old.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf0c1d472b727cca18821076bed5b85f27ca1260d5c6125abd6e8f89f5e19a77
|
| 3 |
+
size 112508
|
XAUUSDc_M3_data.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
chart.png
ADDED
|
Git LFS Details
|
data_fetcher.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
β data_fetcher.py β MT5 XAUUSDc M3 Data Fetcher β
|
| 5 |
+
β Fetches 1-year OHLCV + spread from MetaTrader5 (3-min candles) β
|
| 6 |
+
β Saves CSV + symbol_info.json. Run locally with MT5 terminal open. β
|
| 7 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import sys, time, json
|
| 11 |
+
import numpy as np
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from datetime import datetime, timedelta, timezone
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
import MetaTrader5 as mt5
|
| 18 |
+
except ImportError:
|
| 19 |
+
print("ERROR: MetaTrader5 package not installed. Run: pip install MetaTrader5")
|
| 20 |
+
sys.exit(1)
|
| 21 |
+
|
| 22 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
+
# CONFIGURATION
|
| 24 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
+
SYMBOL = "XAUUSDc"
|
| 26 |
+
TIMEFRAME_M1 = mt5.TIMEFRAME_M1 # Fetch M1, resample to M3
|
| 27 |
+
TF_LABEL = "M3"
|
| 28 |
+
RESAMPLE_MINS = 3 # 3-minute candles
|
| 29 |
+
LOOKBACK_DAYS = 365 # 1 year
|
| 30 |
+
OUTPUT_DIR = Path(__file__).resolve().parent
|
| 31 |
+
OUTPUT_CSV = OUTPUT_DIR / f"{SYMBOL}_{TF_LABEL}_data.csv"
|
| 32 |
+
OUTPUT_JSON = OUTPUT_DIR / f"{SYMBOL}_symbol_info.json"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
# MT5 CONNECTION
|
| 37 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
def init_mt5() -> None:
    """Connect to the MetaTrader 5 terminal, retrying up to three times.

    On success prints the terminal build/company and returns. Each failed
    attempt is followed by a 2-second pause; after the third failure the
    last MT5 error is printed and the process exits with status 1.
    """
    max_tries = 3
    tries = 0
    while tries < max_tries:
        if mt5.initialize():
            term = mt5.terminal_info()
            print(f"β MT5 connected β Build {term.build}, Company: {term.company}")
            return
        tries += 1
        print(f" Attempt {tries}/3 failed, retrying in 2s...")
        time.sleep(2)
    print(f"β MT5 initialization failed: {mt5.last_error()}")
    sys.exit(1)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def validate_symbol(symbol: str) -> dict:
    """Confirm *symbol* exists in the terminal and return its key properties.

    When the symbol is unknown, prints a hint listing the broker's
    gold-like symbols and exits(1). Ensures the symbol is selected in
    Market Watch (required before rates can be copied), then returns a
    plain dict of the trading properties used downstream.
    """
    info = mt5.symbol_info(symbol)
    if info is None:
        all_syms = mt5.symbols_get()
        gold_like = [s.name for s in all_syms if "XAU" in s.name or "GOLD" in s.name.upper()]
        print(f"β Symbol '{symbol}' not found.")
        if gold_like:
            print(f" Available gold symbols: {gold_like}")
        else:
            print(f" No gold symbols found. Check your broker.")
        sys.exit(1)

    # Symbol must be visible in Market Watch before bar/tick requests work.
    if not info.visible:
        mt5.symbol_select(symbol, True)
        time.sleep(0.5)

    # Attribute names double as the JSON keys written by the caller.
    prop_names = (
        "name", "digits", "point", "spread", "trade_mode",
        "volume_min", "volume_max", "volume_step",
        "trade_contract_size", "trade_tick_value", "trade_tick_size",
        "currency_profit",
    )
    props = {attr: getattr(info, attr) for attr in prop_names}

    print(f"β Symbol validated: {info.name}")
    print(f" Digits: {info.digits} | Point: {info.point} | "
          f"Spread: {info.spread} | Min Lot: {info.volume_min} | "
          f"Max Lot: {info.volume_max} | Contract: {info.trade_contract_size}")
    return props
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
# DATA FETCHING (M1 β resample to M3)
|
| 91 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 92 |
+
def fetch_ohlcv(symbol: str, days: int) -> pd.DataFrame:
    """Download M1 bars from MT5 in 30-day chunks and resample to M3 candles.

    Fetches `days` of history with copy_rates_range in monthly windows
    (keeps each request under MT5's per-call bar limit), de-duplicates and
    sorts the bars, then aggregates them into RESAMPLE_MINS-minute OHLCV
    candles. Exits(1) if no chunk returns any data.
    """
    now_utc = datetime.now(timezone.utc)
    start_utc = now_utc - timedelta(days=days)

    print(f"\nβ Fetching M1 bars: {start_utc.date()} to {now_utc.date()} β¦")
    print(f" (Will resample M1 β M{RESAMPLE_MINS} after fetching)")

    # Chunked fetch: MT5 caps the bar count per request (~100k).
    window = timedelta(days=30)
    frames = []
    cursor = start_utc
    while cursor < now_utc:
        window_end = min(cursor + window, now_utc)
        bars = mt5.copy_rates_range(symbol, TIMEFRAME_M1, cursor, window_end)
        if bars is None or len(bars) == 0:
            print(f" Chunk {cursor.date()} β {window_end.date()}: no data ({mt5.last_error()})")
        else:
            frame = pd.DataFrame(bars)
            frame["time"] = pd.to_datetime(frame["time"], unit="s", utc=True)
            frames.append(frame)
            print(f" Chunk {cursor.date()} β {window_end.date()}: {len(frame):,} M1 bars")
        cursor = window_end

    if not frames:
        print(f"β No M1 data returned from any chunk")
        sys.exit(1)

    df = pd.concat(frames, ignore_index=True)
    df = df.drop_duplicates(subset="time").sort_values("time").reset_index(drop=True)
    df.rename(columns={"real_volume": "volume"}, inplace=True, errors="ignore")

    print(f"β Total M1 bars: {len(df):,}")
    print(f" M1 range: {df['time'].iloc[0]} β {df['time'].iloc[-1]}")

    # ββ Resample M1 β M3 ββ
    print(f"\nβ Resampling M1 β M{RESAMPLE_MINS} β¦")
    df.set_index("time", inplace=True)

    rule = f"{RESAMPLE_MINS}min"
    ohlc_agg = {
        "open": "first",
        "high": "max",
        "low": "min",
        "close": "last",
        "tick_volume": "sum",
        "spread": "last",
    }
    m3 = df.resample(rule, label="right", closed="right").agg(ohlc_agg).dropna(subset=["open"])

    # Real volume is optional (broker-dependent); resample it only if present.
    if "volume" in df.columns:
        m3["volume"] = df["volume"].resample(rule, label="right", closed="right").sum()

    m3.reset_index(inplace=True)

    # Guarantee the downstream-required columns exist even if MT5 omitted them.
    for needed, fill in (("spread", 0), ("tick_volume", 0)):
        if needed not in m3.columns:
            m3[needed] = fill

    print(f"β Resampled to {len(m3):,} M{RESAMPLE_MINS} bars")
    print(f" M3 range: {m3['time'].iloc[0]} β {m3['time'].iloc[-1]}")
    return m3
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def fetch_spread_from_ticks(symbol: str, days: int) -> float | None:
    """Estimate the typical spread from recent tick data.

    Samples up to the last 30 days of info ticks and returns the median
    bid/ask spread expressed in points, or None when ticks (or the symbol
    metadata needed to convert to points) are unavailable, so the caller
    can fall back to the bar `spread` column.
    """
    print(f"\nβ Computing spread from tick data (sampling last 30 days) β¦")

    utc_now = datetime.now(timezone.utc)
    tick_start = utc_now - timedelta(days=min(days, 30))

    ticks = mt5.copy_ticks_range(symbol, tick_start, utc_now, mt5.COPY_TICKS_INFO)

    if ticks is None or len(ticks) == 0:
        print(f" β No tick data available, using bar spread column")
        return None

    # Fix: query symbol_info once and guard the result. The original did
    # `mt5.symbol_info(symbol).point` inline, which raises AttributeError if
    # the terminal returns None (symbol_info is None-checked elsewhere in
    # this file), and a zero point would divide by zero.
    info = mt5.symbol_info(symbol)
    if info is None or not info.point:
        print(f" β Symbol info unavailable, using bar spread column")
        return None

    tick_df = pd.DataFrame(ticks)
    tick_df["time"] = pd.to_datetime(tick_df["time"], unit="s", utc=True)
    tick_df["spread_pts"] = (tick_df["ask"] - tick_df["bid"]) / info.point

    avg_spread = tick_df["spread_pts"].mean()
    median_spread = tick_df["spread_pts"].median()
    max_spread = tick_df["spread_pts"].quantile(0.99)  # 99th pctl ~ worst-case spread

    print(f"β Processed {len(tick_df):,} ticks")
    print(f" Avg spread: {avg_spread:.1f} pts | "
          f"Median: {median_spread:.1f} pts | "
          f"99th pctl: {max_spread:.1f} pts")
    return median_spread
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 194 |
+
# DATA VALIDATION & CLEANING
|
| 195 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
def validate_data(df: pd.DataFrame) -> pd.DataFrame:
    """Validate and clean OHLCV data.

    In order: forward-fill NaN prices, swap bars where high < low, clamp
    open/close into the high-low range, drop duplicate timestamps (keeping
    the last), sort chronologically, report gaps larger than 5 days, and
    remove weekend bars. Returns the cleaned frame; the input frame's
    columns may be mutated in place.
    """
    print(f"\nβ Validating data quality β¦")
    issues = []

    # 1. NaN in price columns β forward-fill
    price_cols = ["open", "high", "low", "close"]
    nan_count = df[price_cols].isnull().sum().sum()
    if nan_count > 0:
        issues.append(f" β {nan_count} NaN values in OHLCV β forward-filling")
        df[price_cols] = df[price_cols].ffill()

    # 2. OHLC integrity
    mask = df["high"] < df["low"]
    bad_hl = mask.sum()
    if bad_hl > 0:
        issues.append(f" β {bad_hl} bars where high < low β swapping")
        df.loc[mask, ["high", "low"]] = df.loc[mask, ["low", "high"]].values

    bad_range = ((df["open"] > df["high"]) | (df["open"] < df["low"]) |
                 (df["close"] > df["high"]) | (df["close"] < df["low"])).sum()
    if bad_range > 0:
        issues.append(f" β {bad_range} bars where open/close outside H-L β clamping")
        df["open"] = df["open"].clip(lower=df["low"], upper=df["high"])
        df["close"] = df["close"].clip(lower=df["low"], upper=df["high"])

    # 3. Duplicate timestamps β keep the most recent bar
    dups = df["time"].duplicated().sum()
    if dups > 0:
        issues.append(f" β {dups} duplicate timestamps β keeping last")
        df = df.drop_duplicates(subset="time", keep="last")

    # 4. Sort BEFORE measuring gaps. Fix: the original computed diff() on
    #    the unsorted frame and then used the label index from
    #    drop_duplicates (which has holes) as a *position* in .iloc,
    #    misreporting rows or raising IndexError when rows were dropped.
    df = df.sort_values("time").reset_index(drop=True)

    # 5. Report large gaps (> 5 days). The index is now a clean RangeIndex,
    #    so `idx - 1` really is the previous bar.
    time_diff = df["time"].diff()
    large_gaps = time_diff[time_diff > pd.Timedelta(days=5)]
    for idx in large_gaps.index:
        gap = time_diff.loc[idx]
        issues.append(f" β Large gap: {df['time'].iloc[idx-1]} β {df['time'].iloc[idx]} ({gap})")

    # 6. Remove weekend bars (Sat=5, Sun=6)
    weekend_mask = df["time"].dt.dayofweek.isin([5, 6])
    weekend_count = weekend_mask.sum()
    if weekend_count > 0:
        issues.append(f" βΉ Removed {weekend_count} weekend bars")
        df = df[~weekend_mask].reset_index(drop=True)

    if issues:
        for issue in issues:
            print(issue)
    else:
        print(" β Data quality: PASS (no issues found)")

    print(f"\n Final dataset: {len(df):,} bars")
    print(f" Price range: {df['close'].min():.2f} β {df['close'].max():.2f}")
    print(f" Avg spread: {df['spread'].mean():.1f} pts")
    print(f" Date range: {df['time'].iloc[0].date()} β {df['time'].iloc[-1].date()}")
    return df
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 258 |
+
# MAIN
|
| 259 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 260 |
+
def main():
    """Fetch one year of OHLCV bars from MT5, clean them, and export CSV + JSON.

    Pipeline: connect -> validate symbol -> dump symbol metadata -> fetch bars
    -> patch zero spreads from tick data -> quality-check -> add derived
    columns -> write CSV. The MT5 session is always closed on exit.
    """
    banner = "=" * 68
    print(banner)
    print(f" MT5 Data Fetcher β {SYMBOL} {TF_LABEL} (1 Year)")
    print(banner)

    # 1. Connect to the terminal.
    init_mt5()

    try:
        # 2. Validate the symbol and persist its properties for Colab / EA use.
        symbol_info = validate_symbol(SYMBOL)

        # Save symbol info JSON for Colab / EA consumption
        with open(OUTPUT_JSON, "w") as fh:
            json.dump(symbol_info, fh, indent=2, default=str)
        print(f"\nβ Symbol info saved: {OUTPUT_JSON}")

        # 3. Download the raw OHLCV history.
        bars = fetch_ohlcv(SYMBOL, LOOKBACK_DAYS)

        # 4. Backfill zero spreads with the tick-derived median, when available.
        tick_median = fetch_spread_from_ticks(SYMBOL, LOOKBACK_DAYS)
        if tick_median is not None:
            no_spread = bars["spread"] == 0
            if no_spread.sum() > 0:
                bars.loc[no_spread, "spread"] = int(tick_median)
                print(f"   Filled {no_spread.sum()} zero-spread bars with median: {tick_median:.0f}")

        # 5. Run the data-quality pass (clamping, dedup, gap report, ...).
        bars = validate_data(bars)

        # 6. Derived columns used downstream by the model.
        bars["hour"] = bars["time"].dt.hour
        bars["dayofweek"] = bars["time"].dt.dayofweek
        bars["returns"] = np.log(bars["close"] / bars["close"].shift(1))

        # 7. Export the selected columns to CSV.
        export_cols = [
            "time", "open", "high", "low", "close",
            "tick_volume", "spread", "hour", "dayofweek", "returns",
        ]
        if "volume" in bars.columns and "volume" not in export_cols:
            export_cols.insert(5, "volume")  # keep real volume right after OHLC

        export_df = bars[[c for c in export_cols if c in bars.columns]]
        export_df.to_csv(OUTPUT_CSV, index=False)

        print(f"\n{banner}")
        print(f" β SAVED: {OUTPUT_CSV}")
        print(f" β Rows: {len(export_df):,} | Columns: {len(export_df.columns)}")
        print(f" β File size: {OUTPUT_CSV.stat().st_size / 1024:.0f} KB")
        print(banner)

        print("\nSample (first 3 rows):")
        print(export_df.head(3).to_string(index=False))
        print("\nSample (last 3 rows):")
        print(export_df.tail(3).to_string(index=False))

    finally:
        # Always release the terminal connection, even when a step above fails.
        mt5.shutdown()
        print("\nβ MT5 connection closed.")


if __name__ == "__main__":
    main()
|
image-png-pages_research-paper/page_01.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_02.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_03.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_04.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_05.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_06.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_07.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_08.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_09.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_10.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_11.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_12.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_13.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_14.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_15.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_16.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_17.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_18.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_19.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_20.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_21.png
ADDED
|
Git LFS Details
|
image-png-pages_research-paper/page_22.png
ADDED
|
Git LFS Details
|
implementation_plan.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Implementation Plan - Vector-HaSH Financial Trader
|
| 2 |
+
|
| 3 |
+
## Objective
|
| 4 |
+
Implement the Vector-HaSH algorithm for predicting pure financial prices (XAUUSD 3-minute timeframe) inside Google Colab (T4 GPU). Evaluate strategy via strict anchored Walk-Forward Optimization (WFO) to eliminate forward-looking bias.
|
| 5 |
+
|
| 6 |
+
## Proposed Strategy Architecture
|
| 7 |
+
|
| 8 |
+
### 1. Feature Engineering
|
| 9 |
+
We will rely **ONLY** on pure price transformations.
|
| 10 |
+
- Compute rolling features: Log returns, rolling volatility, and sequence windows of size $W$ (e.g. 15 bars). Let the state at time $t$ be $\mathbf{x}_t \in \mathbb{R}^{W}$.
|
| 11 |
+
- **Discrete Quantization**: To map continuous prices into discrete elements analogous to the sensory codebook (the "s-book") in Vector-HaSH, we will use `flash-kmeans` (with $K$ clusters) to quantize the historical $\mathbf{x}_t$ vectors into discrete sensory classes $\mathbf{s}_t$.
|
| 12 |
+
|
| 13 |
+
### 2. Vector-HaSH Memory Scaffold
|
| 14 |
+
Instead of a 2D spatial grid, we will use a **1D Continuous Track** (approximating time).
|
| 15 |
+
- **Grid Scaffold ($\mathbf{g}_t$)**: Synthesize multiscale 1D grid cell representations (using sine/cosine waves or cyclic shifts).
|
| 16 |
+
- **Place Cells ($\mathbf{p}_t$)**: Project Grid cells into a sparse higher-dimensional space: $\mathbf{p}_t = \sigma(\mathbf{W}_{pg} \mathbf{g}_t)$.
|
| 17 |
+
- **Hetero-associative Memory**: Train the sensory-to-place map $\mathbf{W}_{sp}$ dynamically using Recursive Least Squares (RLS), mimicking the [pseudotrain_2d_iterative_step](file:///C:/Users/User/Desktop/debugrem/Vector-HaSH-agent-trader/VectorHaSH-main/MTT.py#133-140) seen in [MTT.py](file:///C:/Users/User/Desktop/debugrem/Vector-HaSH-agent-trader/VectorHaSH-main/MTT.py).
|
| 18 |
+
|
| 19 |
+
### 3. Machine Learning Wrapper (XGBoost)
|
| 20 |
+
- At time $t$, extract the *Memory Recall Error* ($\mathbf{s}_t - \hat{\mathbf{s}}_t$) and the *Place Cell Activations* ($\mathbf{p}_t$).
|
| 21 |
+
- Feed these VectorHaSH embeddings into an XGBoost Classifier/Regressor.
|
| 22 |
+
- Target: Next bar log return $r_{t+1}$ or direction $\text{sign}(r_{t+1})$.
|
| 23 |
+
|
| 24 |
+
### 4. Anchored Walk-Forward Optimization
|
| 25 |
+
To avoid cheating:
|
| 26 |
+
- Train/Test splits expand over time.
|
| 27 |
+
- Fold 1: Train $[0, T]$, Test $[T, T+H]$.
|
| 28 |
+
- Fold 2: Train $[0, T+H]$, Test $[T+H, T+2H]$.
|
| 29 |
+
- `flash-kmeans`, Vector-HaSH memory construction, and XGBoost fitting will occur **ONLY** on the Training slice of each fold, and act out-of-sample on the Test slice.
|
| 30 |
+
|
| 31 |
+
### 5. Mono-Script Colab Implementation (`vector_hash_trader.py`)
|
| 32 |
+
- Vectorized using PyTorch (`device='cuda'`) or NumPy (`cuml`/`cupy`/XGBoost-GPU).
|
| 33 |
+
- Plotting module included: cumulative returns, drawdown, WFO heatmaps, and memory collapse analysis.
|
| 34 |
+
|
| 35 |
+
## Verification
|
| 36 |
+
- Assert that no feature computed at time $t$ ever indexes data from $t+1$ or later — i.e., no forward-looking leakage into the inputs before the target is defined.
|
| 37 |
+
- Verify standard performance metrics: Sharpe Ratio, Sortino Ratio, Max Drawdown.
|
long_process.png
ADDED
|
reference.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:236a8b2612efdb18b213e349062c81db7be02eab115cfb141cf001d928e73b53
|
| 3 |
+
size 6404425
|
vector_hash_trader_colab.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# ==============================================================================
|
| 3 |
+
# β RUN THESE IN A GOOGLE COLAB CELL BEFORE EXECUTING THE SCRIPT:
|
| 4 |
+
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 5 |
+
# !pip install xgboost pandas numpy matplotlib seaborn tqdm
|
| 6 |
+
# !pip install git+https://github.com/svg-project/flash-kmeans.git
|
| 7 |
+
# ==============================================================================
|
| 8 |
+
"""
|
| 9 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 10 |
+
β vector_hash_trader_colab.py β Vector-HaSH Financial Time-Series Trader β
|
| 11 |
+
β Highly optimized monolithic GPU/Vectorized script for Google Colab. β
|
| 12 |
+
β Predicts pure prices via Anchored Walk-Forward Optimization (No Peeking)β
|
| 13 |
+
β Uses Vector-HaSH biologically plausible Scaffold representations + XGB. β
|
| 14 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
+
"""
|
| 16 |
+
import os
|
| 17 |
+
import sys
|
| 18 |
+
import gc
|
| 19 |
+
import time
|
| 20 |
+
import numpy as np
|
| 21 |
+
import pandas as pd
|
| 22 |
+
import matplotlib.pyplot as plt
|
| 23 |
+
import seaborn as sns
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from tqdm import tqdm
|
| 26 |
+
|
| 27 |
+
import torch
|
| 28 |
+
import torch.nn as nn
|
| 29 |
+
import torch.nn.functional as F
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
import xgboost as xgb
|
| 33 |
+
except ImportError:
|
| 34 |
+
print("Running pip install xgboost...")
|
| 35 |
+
os.system("pip install xgboost")
|
| 36 |
+
import xgboost as xgb
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
|
| 40 |
+
except ImportError:
|
| 41 |
+
pass
|
| 42 |
+
|
| 43 |
+
# Try to import flash_kmeans if installed, else fallback to PyTorch custom KMeans
|
| 44 |
+
try:
|
| 45 |
+
from flash_kmeans import batch_kmeans_Euclid
|
| 46 |
+
FLASH_KMEANS_AVAILABLE = True
|
| 47 |
+
print("[INFO] flash_kmeans is available. We will use Triton-accelerated K-Means!")
|
| 48 |
+
except ImportError:
|
| 49 |
+
FLASH_KMEANS_AVAILABLE = False
|
| 50 |
+
print("[WARN] flash_kmeans not installed. Using PyTorch fallback.")
|
| 51 |
+
|
| 52 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
# PyTorch Fallback KMeans (if flash_kmeans not installed)
|
| 54 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 55 |
+
def fast_pytorch_kmeans(x, n_clusters, max_iter=100, tol=1e-4, device='cuda'):
    """Lloyd's K-Means in pure PyTorch (fallback when flash_kmeans is absent).

    Parameters
    ----------
    x : torch.Tensor of shape (N, D) — points to cluster (any device).
    n_clusters : int — number of centroids K.
    max_iter : int — maximum Lloyd iterations.
    tol : float — stop when the total L2 centre shift falls below this.
    device : str — kept for backward compatibility only; the computation now
        follows ``x.device``, so a CPU tensor works even with the default
        ``'cuda'`` (previously that combination crashed with a device
        mismatch when indexing ``x`` with CUDA indices).

    Returns
    -------
    (cluster_ids, centers) : (N,) int64 assignments and (K, D) centroids.
    """
    N, D = x.shape
    # Fix: derive the working device from the data itself rather than trusting
    # the `device` argument, so mismatched callers no longer crash.
    dev = x.device
    # Randomly initialize centers from data points
    indices = torch.randperm(N, device=dev)[:n_clusters]
    centers = x[indices].clone()

    for _ in range(max_iter):
        # Pairwise L2 distances (N, K) and nearest-centroid assignment.
        dists = torch.cdist(x, centers, p=2)
        cluster_ids = torch.argmin(dists, dim=1)

        # Recompute centroids as per-cluster means via scatter_add.
        new_centers = torch.zeros_like(centers)
        counts = torch.bincount(cluster_ids, minlength=n_clusters).float().unsqueeze(1)
        new_centers.scatter_add_(0, cluster_ids.unsqueeze(1).expand(-1, D), x)

        # clamp(min=1) keeps empty clusters at the zero vector instead of NaN.
        new_centers = new_centers / counts.clamp(min=1)

        # Converged when the centroids barely move.
        center_shift = torch.norm(centers - new_centers, p=2)
        centers = new_centers
        if center_shift < tol:
            break

    return cluster_ids, centers
|
| 83 |
+
|
| 84 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 85 |
+
# Vector-HaSH Scaffold Engine
|
| 86 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 87 |
+
class VectorHashMemory(nn.Module):
    """
    Hippocampal/entorhinal memory scaffold for 1-D financial sequences.

    Three coupled populations:
      * grid cells  g_t — fixed multi-scale temporal code (time representation),
      * place cells p_t — sparse random projection of the grid code,
      * sensory cells s_t — discretised pure-price states.

    ``W_pg`` is a fixed sparse random projection (grid -> place); ``W_sp`` is
    the hetero-associative read-out (place -> sensory) fitted by pseudo-inverse.
    """

    def __init__(self, N_grid=30, N_place=400, N_sensory=64, sparsity=0.1, device='cuda'):
        super().__init__()
        self.device = device
        self.Ng = N_grid
        self.Np = N_place
        self.Ns = N_sensory

        # Fixed (non-learned) grid->place projection, sparsified so that only
        # roughly `sparsity` of the connections survive.
        dense = torch.randn(self.Np, self.Ng, device=device, dtype=torch.float32)
        keep = (torch.rand(self.Np, self.Ng, device=device) < sparsity).float()
        self.W_pg = dense * keep

        # Place->sensory associative weights; populated by `memorize`.
        self.W_sp = torch.zeros(self.Ns, self.Np, device=device, dtype=torch.float32)

    def generate_grid_scaffold(self, T):
        """Return a (T, Ng) multi-scale sin/cos code — a 1-D cyclic ring attractor."""
        steps = torch.arange(T, device=self.device, dtype=torch.float32)
        columns = []
        for scale in range(self.Ng // 2):
            # Geometrically spaced frequencies across the grid modules.
            freq = 1.0 / (2.0 ** (scale * 0.1))
            phase = steps * freq
            columns.append(torch.sin(phase))
            columns.append(torch.cos(phase))
        if len(columns) < self.Ng:
            # Odd Ng: pad the final column with zeros.
            columns.append(torch.zeros_like(steps))
        return torch.stack(columns, dim=1)  # (T, Ng)

    def generate_place_cells(self, g_t):
        """Sparse place code: ReLU of the fixed random projection of g_t."""
        # (T, Ng) @ (Ng, Np) -> (T, Np)
        return F.relu(g_t @ self.W_pg.T)

    def memorize(self, p_t, s_t):
        """
        Store the hetero-association W_sp = S^T · pinv(P^T).

        p_t: (T, Np) place activations; s_t: (T, Ns) sensory targets.
        """
        # pinv of (Np, T) has shape (T, Np); (Ns, T) @ (T, Np) -> (Ns, Np).
        self.W_sp = s_t.T @ torch.linalg.pinv(p_t.T)

    def recall(self, p_t):
        """
        Reconstruct sensory states from place activity.
        \\hat{S} = P @ W_sp^T
        """
        return p_t @ self.W_sp.T
|
| 152 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 153 |
+
# DATA PROCESSING MODULE
|
| 154 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
def load_and_prepare_data(csv_path, window_size=16):
    """Load pure XAUUSD M3 prices and build rolling return-window matrices.

    Returns ``(df_aligned, X_seq)`` where row ``i`` of ``X_seq`` holds the
    last ``window_size`` log-returns ending at row ``i`` of ``df_aligned``,
    whose ``target_return``/``target_class`` columns describe the *next* bar.
    """
    print(f"β Loading {csv_path} ...")
    frame = pd.read_csv(csv_path)

    # Pure price only: derive log-returns from closes when the CSV lacks them.
    if 'returns' not in frame.columns:
        frame['returns'] = np.log(frame['close'] / frame['close'].shift(1))

    frame = frame.dropna().reset_index(drop=True)

    # Supervised targets: next-bar return and its direction (1 = up, 0 = down).
    frame['target_return'] = frame['returns'].shift(-1)
    frame['target_class'] = (frame['target_return'] > 0).astype(int)

    frame = frame.dropna().reset_index(drop=True)

    # Rolling windows over the return series via a strided view, then float32.
    rets = frame['returns'].values
    N_samples = len(rets) - window_size + 1
    X_seq = np.lib.stride_tricks.sliding_window_view(rets, window_size).astype(np.float32)

    # Window i ends at original row i + window_size - 1, so dropping the first
    # window_size - 1 rows aligns each target with its window.
    df_aligned = frame.iloc[window_size - 1:].reset_index(drop=True)

    print(f"β Data constructed! {N_samples} sequences of shape {window_size}.")
    return df_aligned, X_seq
|
| 184 |
+
|
| 185 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
+
# ANCHORED WALK-FORWARD OPTIMIZATION STRATEGY
|
| 187 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 188 |
+
def execute_wfo_strategy(df, X_seq, n_splits=5, device='cuda'):
    """Run the anchored walk-forward back-test of the Vector-HaSH strategy.

    For each fold the training window is anchored at t=0 and grows by one
    fold-size; K-Means quantisation, the Vector-HaSH memory and the XGBoost
    model are fitted ONLY on the training slice and evaluated on the
    immediately following out-of-sample slice (no look-ahead).

    Parameters
    ----------
    df : DataFrame aligned with X_seq; must carry 'target_class',
        'target_return' and 'time' columns.
    X_seq : (N, W) float32 array of rolling return windows.
    n_splits : number of walk-forward folds.
    device : torch device string for the scaffold / K-Means stage.

    Side effects: prints per-fold and overall metrics and saves
    ``vector_hash_equity_report.png``.

    Bug fix vs. the original: ``X_tr_exp`` only exists on the flash-kmeans
    branch, yet was listed in the fold-end ``del`` — on the PyTorch fallback
    path that raised ``NameError``. The ``del`` is now done inside the branch.
    """
    print(f"\n{'='*68}")
    print(f" STARTING ANCHORED WALK-FORWARD OPTIMIZATION ({n_splits} folds)")
    print(f"{'='*68}")

    N = len(df)
    # First fold trains on one slice and tests on the next, hence n_splits + 1.
    fold_size = N // (n_splits + 1)

    all_predictions = []
    all_targets = []
    all_returns = []

    equity_timestamps = []
    equity_curve = [1.0]  # Starts at 1.0 multiplier

    for fold in range(n_splits):
        train_end = fold_size * (fold + 1)
        test_end = train_end + fold_size
        if fold == n_splits - 1:
            test_end = N  # Take the rest for the last fold

        print(f"\nβΊ Fold {fold+1}/{n_splits} | Train: [0 : {train_end}] | Test: [{train_end} : {test_end}]")

        # 1. Split Data (anchored: train always starts at 0)
        X_train_np = X_seq[:train_end]
        y_train_np = df['target_class'].iloc[:train_end].values

        X_test_np = X_seq[train_end:test_end]
        y_test_np = df['target_class'].iloc[train_end:test_end].values
        returns_test_np = df['target_return'].iloc[train_end:test_end].values
        timestamps_test = df['time'].iloc[train_end:test_end].values

        # Send to Device
        X_train = torch.tensor(X_train_np, dtype=torch.float32, device=device)
        X_test = torch.tensor(X_test_np, dtype=torch.float32, device=device)

        # 2. K-Means quantization (sensory encoding): map each window to one
        # of K centroids fitted ONLY on the training slice.
        K_clusters = 64

        if FLASH_KMEANS_AVAILABLE:
            # flash-kmeans expects input (Batch, N, Dim), so we add a batch dim.
            X_tr_exp = X_train.unsqueeze(0)
            cluster_ids, centers, _ = batch_kmeans_Euclid(X_tr_exp, n_clusters=K_clusters, tol=1e-4, verbose=False)
            centers = centers.squeeze(0)  # (K, D)

            # Assign both splits to the train-fitted centroids.
            dists_tr = torch.cdist(X_train, centers, p=2)
            c_ids_tr = torch.argmin(dists_tr, dim=1)
            dists_te = torch.cdist(X_test, centers, p=2)
            c_ids_te = torch.argmin(dists_te, dim=1)
            # Free the batch-expanded copy here: this name does not exist on
            # the fallback path, so it must not appear in the shared `del`.
            del X_tr_exp
        else:
            c_ids_tr, centers = fast_pytorch_kmeans(X_train, n_clusters=K_clusters, device=device)
            dists_te = torch.cdist(X_test, centers, p=2)
            c_ids_te = torch.argmin(dists_te, dim=1)

        # One-hot encode the sensory states: (T, K)
        S_train = F.one_hot(c_ids_tr, num_classes=K_clusters).float()
        S_test = F.one_hot(c_ids_te, num_classes=K_clusters).float()

        # 3. Vector-HaSH memorization on the training slice only.
        print("   β Initializing Vector-HaSH Scaffold & Memorizing...")
        VH = VectorHashMemory(N_grid=32, N_place=512, N_sensory=K_clusters, sparsity=0.15, device=device)

        G_train = VH.generate_grid_scaffold(T=train_end)
        P_train = VH.generate_place_cells(G_train)

        # Hetero-association (Place -> Sensory) via pseudo-inverse.
        VH.memorize(P_train, S_train)

        # Reconstruction-error features on the training slice.
        S_hat_train = VH.recall(P_train)
        error_train = (S_train - S_hat_train).detach()

        # 4. Out-of-sample memory simulation: map test-range time indices
        # through the same (fixed) grid -> place -> recall pipeline.
        G_test_full = VH.generate_grid_scaffold(T=test_end)
        G_test = G_test_full[train_end:test_end]
        P_test = VH.generate_place_cells(G_test)

        S_hat_test = VH.recall(P_test)
        error_test = (S_test - S_hat_test).detach()

        # 5. XGBoost on [raw window | place cells | recall error] features.
        print("   β Training highly-optimized GPU XGBoost Model...")
        F_train = torch.cat([X_train, P_train, error_train], dim=1).cpu().numpy()
        F_test = torch.cat([X_test, P_test, error_test], dim=1).cpu().numpy()

        dtrain = xgb.DMatrix(F_train, label=y_train_np)
        dtest = xgb.DMatrix(F_test, label=y_test_np)

        params = {
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'device': 'cuda',  # T4 GPU Acceleration
            'eval_metric': 'logloss',
            'learning_rate': 0.05,
            'max_depth': 4,
            'subsample': 0.8,
            'colsample_bytree': 0.8
        }

        evallist = [(dtrain, 'train'), (dtest, 'eval')]
        bst = xgb.train(params, dtrain, num_boost_round=100, evals=evallist, verbose_eval=False)

        # Predict on the held-out test split only.
        preds_prob = bst.predict(dtest)
        preds_class = (preds_prob > 0.5).astype(int)

        acc = accuracy_score(y_test_np, preds_class)
        print(f"   β Fold {fold+1} completed! Out-of-Sample Accuracy: {acc:.4f}")

        # Simple strategy: long when pred=1, short when pred=0.
        trade_signals = np.where(preds_class == 1, 1, -1)
        strategy_returns = trade_signals * returns_test_np

        for ret in strategy_returns:
            equity_curve.append(equity_curve[-1] * (1 + ret))

        equity_timestamps.extend(timestamps_test)
        all_predictions.extend(preds_class)
        all_targets.extend(y_test_np)
        all_returns.extend(strategy_returns)

        # Clear CUDA memory (X_tr_exp already freed inside its branch above).
        del X_train, X_test, G_train, P_train, S_train, S_hat_train, error_train
        del G_test_full, G_test, P_test, S_test, S_hat_test, error_test, VH
        torch.cuda.empty_cache()
        gc.collect()

    print(f"\n{'='*68}")

    # 6. Evaluation & Plotting
    overall_acc = accuracy_score(all_targets, all_predictions)
    print(f"OVERALL OUT-OF-SAMPLE ACCURACY: {overall_acc:.4f}")

    cum_ret = np.prod([1+r for r in all_returns])
    print(f"OVERALL CUMULATIVE RETURN (Multiplier): {cum_ret:.4f}x")

    # Max drawdown from the running equity peak.
    eq_array = np.array(equity_curve)
    peaks = np.maximum.accumulate(eq_array)
    drawdowns = (eq_array - peaks) / peaks
    max_dd = np.min(drawdowns) * 100
    print(f"MAX DRAWDOWN: {max_dd:.2f}%")

    # Matplotlib Graph Generation
    plt.style.use('dark_background')
    fig, axs = plt.subplots(2, 1, figsize=(14, 10), gridspec_kw={'height_ratios': [3, 1]})

    # Equity Curve
    axs[0].plot(eq_array, color='cyan', linewidth=1.5, label=f"Strategy Equity (Return: {cum_ret:.2f}x)")
    axs[0].set_title(f"XAUUSD Vector-HaSH Strategy - Anchored Walking-Forward Equity", fontsize=16, color='white')
    axs[0].set_ylabel("Portfolio Multiplier", fontsize=12)
    axs[0].grid(axis='y', linestyle='--', alpha=0.3)
    axs[0].legend(loc="upper left")

    # Drawdown Curve
    axs[1].fill_between(range(len(drawdowns)), drawdowns*100, 0, color='red', alpha=0.5, label="Drawdown (%)")
    axs[1].set_title(f"Drawdown Profile (Max DD: {max_dd:.2f}%)", fontsize=14, color='white')
    axs[1].set_ylabel("Drawdown %", fontsize=12)
    axs[1].set_xlabel("Out-Of-Sample Chronological Steps", fontsize=12)
    axs[1].grid(axis='y', linestyle='--', alpha=0.3)
    axs[1].legend(loc="lower left")

    plt.tight_layout()
    output_png = "vector_hash_equity_report.png"
    plt.savefig(output_png, dpi=300, bbox_inches='tight')
    print(f"β Strategy report chart saved to {output_png}!")
| 360 |
+
|
| 361 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 362 |
+
# EXECUTION SCRIPT
|
| 363 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 364 |
+
if __name__ == "__main__":
    # Pick the fastest available runtime for the scaffold / K-Means stage.
    run_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Runtime Device: {run_device.upper()}")

    # The fetcher script must have produced this CSV beforehand.
    data_path = Path("XAUUSDc_M3_data.csv")
    if not data_path.exists():
        print(f"ERROR: {data_path} not found in the current directory.")
        sys.exit(1)

    aligned_frame, window_matrix = load_and_prepare_data(data_path, window_size=16)

    # Optional: subset for extremely rapid testing (just uncomment to run faster)
    # aligned_frame = aligned_frame.iloc[-10000:].reset_index(drop=True)
    # window_matrix = window_matrix[-10000:]

    execute_wfo_strategy(aligned_frame, window_matrix, n_splits=5, device=run_device)