kauabarros-24 committed on
Commit ·
43056e4
1
Parent(s): 651b3a9
CHORE: Generate most datas
Browse files- Untitled-1 +1 -0
- data/PETR4_SA.csv +0 -0
- data/features_10m.csv +1 -0
- data/features_120m.csv +1 -0
- data/features_24m.csv +1 -0
- data/features_60m.csv +1 -0
- data/raw_PETR4_SA.csv +0 -0
- data/target_10m.csv +1 -0
- data/target_120m.csv +1 -0
- data/target_24m.csv +1 -0
- data/target_60m.csv +1 -0
- data/teste.csv +0 -0
- pyproject.toml +4 -20
- scripts/download_data.py +16 -0
- src/features.py → scripts/generate_features.py +4 -3
- scripts/load_datas.py +0 -0
- scripts/train_all.py +5 -11
- src/backtest.py +0 -0
- src/data_collection.py +0 -24
- src/{features_engineering.py → feature_engineering.py} +15 -19
- src/model_training.py +31 -28
- src/models.py +0 -0
Untitled-1
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
data/PETR4_SA.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/features_10m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
|
data/features_120m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
|
data/features_24m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
|
data/features_60m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
|
data/raw_PETR4_SA.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/target_10m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,target
|
data/target_120m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,target
|
data/target_24m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,target
|
data/target_60m.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
,target
|
data/teste.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -12,14 +12,9 @@ dependencies = [
|
|
| 12 |
"statsmodels>=0.14.0",
|
| 13 |
"scikit-learn>=1.3.0",
|
| 14 |
"yfinance>=0.2.0",
|
| 15 |
-
"pandas-datareader>=0.10.0",
|
| 16 |
"matplotlib>=3.7.0",
|
| 17 |
"seaborn>=0.12.0",
|
| 18 |
-
"
|
| 19 |
-
"jupyter>=1.0.0",
|
| 20 |
-
"jupyterlab>=4.0.0",
|
| 21 |
-
"ipykernel>=6.0.0",
|
| 22 |
-
"joblib>=1.5.3"
|
| 23 |
]
|
| 24 |
requires-python = ">=3.12"
|
| 25 |
readme = "README.md"
|
|
@@ -32,19 +27,8 @@ build-backend = "pdm.backend"
|
|
| 32 |
[tool.pdm]
|
| 33 |
distribution = false
|
| 34 |
|
| 35 |
-
[dependency-groups]
|
| 36 |
-
dev = [
|
| 37 |
-
"black>=23.0.0",
|
| 38 |
-
"flake8>=6.0.0",
|
| 39 |
-
"pytest>=7.0.0"
|
| 40 |
-
]
|
| 41 |
-
|
| 42 |
[tool.pdm.scripts]
|
| 43 |
-
download-data = "python3
|
| 44 |
-
generate-features = "python3
|
| 45 |
-
features = "python3 src/features.py"
|
| 46 |
train-all = "python3 scripts/train_all.py"
|
| 47 |
-
|
| 48 |
-
train-logistic = "python -c 'from src.model_training import train_model; train_model(10, \"logistic\"); train_model(24, \"logistic\"); train_model(60, \"logistic\"); train_model(120, \"logistic\")'"
|
| 49 |
-
backtest = "python src/backtesting.py"
|
| 50 |
-
pipeline = "pdm run download-data && pdm run generate-features && pdm run train-all"
|
|
|
|
| 12 |
"statsmodels>=0.14.0",
|
| 13 |
"scikit-learn>=1.3.0",
|
| 14 |
"yfinance>=0.2.0",
|
|
|
|
| 15 |
"matplotlib>=3.7.0",
|
| 16 |
"seaborn>=0.12.0",
|
| 17 |
+
"joblib>=1.2.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
]
|
| 19 |
requires-python = ">=3.12"
|
| 20 |
readme = "README.md"
|
|
|
|
| 27 |
[tool.pdm]
|
| 28 |
distribution = false
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
[tool.pdm.scripts]
|
| 31 |
+
download-data = "python3 scripts/download_data.py"
|
| 32 |
+
generate-features = "python3 scripts/generate_features.py"
|
|
|
|
| 33 |
train-all = "python3 scripts/train_all.py"
|
| 34 |
+
pipeline = "python3 scripts/download_data.py && python3 scripts/generate_features.py && python3 scripts/train_all.py"
|
|
|
|
|
|
|
|
|
scripts/download_data.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yfinance as yf
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
def download_stock_data(ticker, start, end, save_path):
    """Download price history for ``ticker`` and save it as a CSV file.

    Args:
        ticker: Symbol passed to ``yf.download`` (e.g. ``"PETR4.SA"``).
        start: Start date passed to ``yf.download``.
        end: End date passed to ``yf.download``.
        save_path: Destination path of the CSV file.

    Returns:
        The downloaded DataFrame, with single-level column names.
    """
    # Create the directory that will actually receive the file instead of a
    # hard-coded 'data' folder, so any save_path works.
    target_dir = os.path.dirname(save_path) or '.'
    os.makedirs(target_dir, exist_ok=True)
    print(f"Baixando dados de {ticker}...")
    df = yf.download(ticker, start=start, end=end, progress=False)
    # yfinance may return (field, ticker) MultiIndex columns. Written as-is,
    # the CSV gets several header rows that a plain pd.read_csv cannot parse.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
        df.columns.name = None
    df.to_csv(save_path)
    print(f"Dados salvos em {save_path}")
    print(f"Shape: {df.shape}")
    print(f"Colunas: {list(df.columns)}")
    return df
|
| 14 |
+
|
| 15 |
+
if __name__ == "__main__":
    # Download the PETR4.SA price history into data/raw_PETR4_SA.csv.
    download_stock_data(
        ticker="PETR4.SA",
        start="2000-01-01",
        end="2025-01-01",
        save_path="data/raw_PETR4_SA.csv",
    )
|
src/features.py → scripts/generate_features.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
|
|
|
|
|
| 1 |
from src.feature_engineering import prepare_data_for_all_horizons
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
print("Arquivos gerados com sucesso na pasta data/")
|
|
|
|
| 1 |
+
import sys
from pathlib import Path

# Make the repository root (the parent of scripts/) importable regardless of
# the current working directory; '.' only works when launched from the root.
# NOTE: the data/ paths used by the feature code are still relative to the
# working directory.
sys.path.append(str(Path(__file__).resolve().parent.parent))
from src.feature_engineering import prepare_data_for_all_horizons

if __name__ == "__main__":
    # Build the feature/target CSV files for PETR4.SA.
    prepare_data_for_all_horizons("PETR4.SA")
    print("Arquivos gerados com sucesso")
|
|
|
scripts/load_datas.py
DELETED
|
File without changes
|
scripts/train_all.py
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
print("=== Treinando com Random Forest ===")
|
| 6 |
-
for h in horizons:
|
| 7 |
-
train_model(h, model_type='random_forest')
|
| 8 |
-
|
| 9 |
-
"""print("\n=== Treinando com Regressão Logística ===")
|
| 10 |
-
for h in horizons:
|
| 11 |
-
train_model(h, model_type='logistic')
|
| 12 |
-
"""
|
|
|
|
| 1 |
+
import sys
from pathlib import Path

# Make the repository root (the parent of scripts/) importable regardless of
# the current working directory; '.' only works when launched from the root.
# NOTE: the data/ and models/ paths used by the training code are still
# relative to the working directory.
sys.path.append(str(Path(__file__).resolve().parent.parent))
from src.model_training import train_all_horizons

if __name__ == "__main__":
    # Train every configured model for every horizon.
    train_all_horizons()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backtest.py
DELETED
|
File without changes
|
src/data_collection.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
import yfinance as yf
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import os
|
| 4 |
-
|
| 5 |
-
def download_stock_data(ticker: str, start: str, end: str, interval="1d") -> pd.DataFrame:
    """Fetch price history for ``ticker`` through ``yf.download``.

    When the result has MultiIndex columns, the second level is dropped so
    the returned frame has flat column names.
    """
    prices = yf.download(
        ticker,
        start=start,
        end=end,
        interval=interval,
        progress=False,
    )

    has_multi_level_columns = isinstance(prices.columns, pd.MultiIndex)
    if has_multi_level_columns:
        prices.columns = prices.columns.droplevel(1)
    return prices
|
| 11 |
-
|
| 12 |
-
def save_data(df: pd.DataFrame, ticker: str, data_dir='data'):
    """Write ``df`` to ``<data_dir>/<ticker with '.' replaced by '_'>.csv``.

    The directory is created when missing. Returns the path of the file
    that was written.
    """
    os.makedirs(data_dir, exist_ok=True)
    safe_name = ticker.replace('.', '_')
    path = os.path.join(data_dir, f"{safe_name}.csv")
    df.to_csv(path)
    print(f'Dados salvos em {path}')
    return path
|
| 19 |
-
|
| 20 |
-
if __name__ == "__main__":
    # Download PETR4.SA, show the first rows, then persist the frame.
    symbol = "PETR4.SA"
    history = download_stock_data(symbol, start="2010-01-01", end="2025-01-01")
    preview = history.head()
    print(preview)
    save_data(history, symbol)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{features_engineering.py → feature_engineering.py}
RENAMED
|
@@ -2,25 +2,24 @@ import pandas as pd
|
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
|
| 5 |
-
def load_data(ticker
|
| 6 |
ticker_clean = ticker.replace('.', '_')
|
| 7 |
filename = f"raw_{ticker_clean}.csv"
|
| 8 |
path = os.path.join(data_dir, filename)
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
df =
|
| 16 |
return df
|
| 17 |
|
| 18 |
-
def resample_monthly(df_daily
|
| 19 |
-
|
| 20 |
-
monthly = df_daily.resample('M').last()
|
| 21 |
return monthly
|
| 22 |
|
| 23 |
-
def create_features_and_target(df_monthly
|
| 24 |
df = df_monthly.copy()
|
| 25 |
close = df["Close"]
|
| 26 |
for lag in [1, 2, 3, 4, 5]:
|
|
@@ -34,20 +33,17 @@ def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
|
|
| 34 |
df.dropna(inplace=True)
|
| 35 |
return df
|
| 36 |
|
| 37 |
-
def prepare_data_for_all_horizons(ticker
|
| 38 |
os.makedirs('data', exist_ok=True)
|
| 39 |
df_daily = load_data(ticker)
|
|
|
|
| 40 |
df_monthly = resample_monthly(df_daily)
|
|
|
|
| 41 |
for h in horizons:
|
| 42 |
df_h = create_features_and_target(df_monthly, h)
|
| 43 |
-
feature_cols = [col for col in df_h.columns
|
| 44 |
-
if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
|
| 45 |
X = df_h[feature_cols]
|
| 46 |
y = df_h['target']
|
| 47 |
X.to_csv(f"data/features_{h}m.csv")
|
| 48 |
y.to_csv(f"data/target_{h}m.csv", header=['target'])
|
| 49 |
print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
|
| 50 |
-
|
| 51 |
-
if __name__ == "__main__":
|
| 52 |
-
ticker = "PETR4.SA"
|
| 53 |
-
prepare_data_for_all_horizons(ticker)
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
|
| 5 |
+
def load_data(ticker, data_dir='data'):
    """Load the raw price CSV for ``ticker`` as a date-indexed DataFrame.

    The file is ``<data_dir>/raw_<ticker with '.' replaced by '_'>.csv``.
    Both a plain single-header CSV and the multi-row header layout written
    from MultiIndex columns (``Price`` / ``Ticker`` / ``Date`` rows) are
    accepted.

    Args:
        ticker: Ticker symbol, e.g. ``"PETR4.SA"``.
        data_dir: Directory that contains the raw CSV file.

    Returns:
        DataFrame indexed by a ``DatetimeIndex`` named ``Date``, with numeric
        columns and no rows containing missing values.
    """
    ticker_clean = ticker.replace('.', '_')
    filename = f"raw_{ticker_clean}.csv"
    path = os.path.join(data_dir, filename)
    df = pd.read_csv(path)
    # Prefer an explicit 'Date' column; otherwise the first column holds the
    # dates ('Unnamed: 0', or 'Price' for the multi-row header layout).
    date_col = 'Date' if 'Date' in df.columns else df.columns[0]
    df = df.set_index(date_col)
    df.index = pd.to_datetime(df.index, errors='coerce')
    df.index.name = 'Date'
    # Rows whose label is not a date (leftover header rows) become NaT and
    # must go, otherwise they would corrupt the later resampling.
    df = df[df.index.notna()]
    # Leftover header rows make pandas read the price columns as strings.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna()
    return df
|
| 17 |
|
| 18 |
+
def resample_monthly(df_daily):
|
| 19 |
+
monthly = df_daily.resample('ME').last()
|
|
|
|
| 20 |
return monthly
|
| 21 |
|
| 22 |
+
def create_features_and_target(df_monthly, horizon_months):
|
| 23 |
df = df_monthly.copy()
|
| 24 |
close = df["Close"]
|
| 25 |
for lag in [1, 2, 3, 4, 5]:
|
|
|
|
| 33 |
df.dropna(inplace=True)
|
| 34 |
return df
|
| 35 |
|
| 36 |
+
def prepare_data_for_all_horizons(ticker, horizons=(10, 24, 60, 120)):
    """Build and save the feature/target CSV files for every horizon.

    Loads the raw data for ``ticker``, resamples it to monthly rows and, for
    each horizon ``h``, writes ``data/features_<h>m.csv`` and
    ``data/target_<h>m.csv``.

    Args:
        ticker: Ticker symbol whose raw CSV is loaded with ``load_data``.
        horizons: Iterable of horizons in months. The default is a tuple so
            the default value cannot be mutated between calls.
    """
    os.makedirs('data', exist_ok=True)
    df_daily = load_data(ticker)
    print(f"Dados carregados: {len(df_daily)} linhas")
    df_monthly = resample_monthly(df_daily)
    print(f"Dados mensais: {len(df_monthly)} linhas")
    # The label and the raw price/volume columns are not saved as features.
    excluded = {'target', 'Close', 'Open', 'High', 'Low', 'Volume'}
    for h in horizons:
        df_h = create_features_and_target(df_monthly, h)
        feature_cols = [col for col in df_h.columns if col not in excluded]
        X = df_h[feature_cols]
        y = df_h['target']
        X.to_csv(f"data/features_{h}m.csv")
        y.to_csv(f"data/target_{h}m.csv", header=['target'])
        print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
|
|
|
|
|
|
|
|
|
|
|
|
src/model_training.py
CHANGED
|
@@ -3,43 +3,46 @@ import numpy as np
|
|
| 3 |
from sklearn.linear_model import LogisticRegression
|
| 4 |
from sklearn.ensemble import RandomForestClassifier
|
| 5 |
from sklearn.metrics import accuracy_score, classification_report
|
|
|
|
| 6 |
import os
|
| 7 |
-
import joblib
|
| 8 |
|
| 9 |
def load_features_target(horizon):
|
| 10 |
X = pd.read_csv(f"data/features_{horizon}m.csv", index_col=0)
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
return X, Y
|
| 14 |
|
| 15 |
-
def temporal_train_split(X,
|
| 16 |
n = len(X)
|
| 17 |
split_idx = int(n * (1 - test_size))
|
| 18 |
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
return X_train, X_test, Y_train, Y_test
|
| 22 |
|
| 23 |
-
def train_model(horizon, model_type=
|
| 24 |
-
X,
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
model = RandomForestClassifier(n_estimators=100, random_state=42)
|
| 30 |
-
|
| 31 |
-
raise ValueError(f"There's a error in training model: {error}")
|
| 32 |
-
|
| 33 |
-
model.fit(X_train, Y_train)
|
| 34 |
-
|
| 35 |
y_pred = model.predict(X_test)
|
| 36 |
-
|
| 37 |
-
report = classification_report(
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
| 41 |
os.makedirs('models', exist_ok=True)
|
| 42 |
-
joblib.dump(model,
|
| 43 |
-
print(f"Modelo salvo em {
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from sklearn.linear_model import LogisticRegression
|
| 4 |
from sklearn.ensemble import RandomForestClassifier
|
| 5 |
from sklearn.metrics import accuracy_score, classification_report
|
| 6 |
+
import joblib
|
| 7 |
import os
|
|
|
|
| 8 |
|
| 9 |
def load_features_target(horizon):
    """Load the saved feature matrix and target for one horizon.

    Reads ``data/features_<horizon>m.csv`` and ``data/target_<horizon>m.csv``
    using the first CSV column as the index.

    Args:
        horizon: Horizon in months used in the file names.

    Returns:
        Tuple ``(X, y)``: the features DataFrame and the target Series.
    """
    X = pd.read_csv(f"data/features_{horizon}m.csv", index_col=0)
    # Squeeze only the column axis: a bare squeeze() would turn a one-row
    # target file into a scalar instead of a Series.
    y = pd.read_csv(f"data/target_{horizon}m.csv", index_col=0).squeeze("columns")
    return X, y
|
|
|
|
| 13 |
|
| 14 |
+
def temporal_train_split(X, y, test_size=0.2):
    """Split ``X`` and ``y`` by position, without shuffling.

    The first ``int(len(X) * (1 - test_size))`` rows form the training part
    and the remaining rows form the test part.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    cut = int(len(X) * (1 - test_size))
    head = slice(None, cut)
    tail = slice(cut, None)
    return X.iloc[head], X.iloc[tail], y.iloc[head], y.iloc[tail]
|
|
|
|
| 20 |
|
| 21 |
+
def train_model(horizon, model_type='random_forest', save_model=True):
    """Train and evaluate one classifier for one horizon.

    Loads the saved features/target for ``horizon``, keeps the last 20% of
    the rows as the test set, fits the model, prints the accuracy and the
    classification report, and optionally saves the fitted model.

    Args:
        horizon: Horizon in months; selects the input CSV files.
        model_type: ``'random_forest'`` or ``'logistic'``.
        save_model: When true, dump the model to
            ``models/<model_type>_<horizon>m.pkl``.

    Returns:
        Tuple ``(model, acc, report)``.

    Raises:
        ValueError: If ``model_type`` is unknown or there are no training
            rows for this horizon.
    """
    # Reject unknown names up front: silently falling back to a random
    # forest would save that model under the mistyped name.
    if model_type not in ('logistic', 'random_forest'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'logistic' or 'random_forest'"
        )
    X, y = load_features_target(horizon)
    X_train, X_test, y_train, y_test = temporal_train_split(X, y, test_size=0.2)
    if len(X_train) == 0:
        raise ValueError(
            f"No training samples for horizon {horizon}; regenerate the feature files"
        )
    if model_type == 'logistic':
        model = LogisticRegression(max_iter=1000, random_state=42)
    else:
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # NOTE(review): two target_names presumably require both classes to show
    # up in y_test/y_pred; confirm for long horizons.
    report = classification_report(y_test, y_pred, target_names=['Queda', 'Sobe'])
    print(f"\nHorizonte: {horizon} meses - {model_type}")
    print(f"Acurácia: {acc:.4f}")
    print(report)
    if save_model:
        os.makedirs('models', exist_ok=True)
        model_path = f"models/{model_type}_{horizon}m.pkl"
        joblib.dump(model, model_path)
        print(f"Modelo salvo em {model_path}")
    return model, acc, report
|
| 40 |
+
|
| 41 |
+
def train_all_horizons():
    """Train both model types for the 10, 24, 60 and 120 month horizons.

    All random-forest models are trained first, then all logistic-regression
    models; a banner line is printed before each group.
    """
    stages = (
        ("Treinando Random Forest...", 'random_forest'),
        ("\nTreinando Regressão Logística...", 'logistic'),
    )
    for banner, kind in stages:
        print(banner)
        for months in (10, 24, 60, 120):
            train_model(months, kind)
|
src/models.py
DELETED
|
File without changes
|