kauabarros-24 committed on
Commit
43056e4
·
1 Parent(s): 651b3a9

CHORE: Generate most datas

Browse files
Untitled-1 ADDED
@@ -0,0 +1 @@
 
 
1
+
data/PETR4_SA.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/features_10m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
data/features_120m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
data/features_24m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
data/features_60m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12
data/raw_PETR4_SA.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/target_10m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,target
data/target_120m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,target
data/target_24m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,target
data/target_60m.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ ,target
data/teste.csv DELETED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -12,14 +12,9 @@ dependencies = [
12
  "statsmodels>=0.14.0",
13
  "scikit-learn>=1.3.0",
14
  "yfinance>=0.2.0",
15
- "pandas-datareader>=0.10.0",
16
  "matplotlib>=3.7.0",
17
  "seaborn>=0.12.0",
18
- "plotly>=5.14.0",
19
- "jupyter>=1.0.0",
20
- "jupyterlab>=4.0.0",
21
- "ipykernel>=6.0.0",
22
- "joblib>=1.5.3"
23
  ]
24
  requires-python = ">=3.12"
25
  readme = "README.md"
@@ -32,19 +27,8 @@ build-backend = "pdm.backend"
32
  [tool.pdm]
33
  distribution = false
34
 
35
- [dependency-groups]
36
- dev = [
37
- "black>=23.0.0",
38
- "flake8>=6.0.0",
39
- "pytest>=7.0.0"
40
- ]
41
-
42
  [tool.pdm.scripts]
43
- download-data = "python3 src/data_collection.py"
44
- generate-features = "python3 src/features_enginnering.py"
45
- features = "python3 src/features.py"
46
  train-all = "python3 scripts/train_all.py"
47
- train-rf = "python3 -c 'from src.model_training import train_model; train_model(10, \"random_forest\"); train_model(24, \"random_forest\"); train_model(60, \"random_forest\"); train_model(120, \"random_forest\")'"
48
- train-logistic = "python -c 'from src.model_training import train_model; train_model(10, \"logistic\"); train_model(24, \"logistic\"); train_model(60, \"logistic\"); train_model(120, \"logistic\")'"
49
- backtest = "python src/backtesting.py"
50
- pipeline = "pdm run download-data && pdm run generate-features && pdm run train-all"
 
12
  "statsmodels>=0.14.0",
13
  "scikit-learn>=1.3.0",
14
  "yfinance>=0.2.0",
 
15
  "matplotlib>=3.7.0",
16
  "seaborn>=0.12.0",
17
+ "joblib>=1.2.0"
 
 
 
 
18
  ]
19
  requires-python = ">=3.12"
20
  readme = "README.md"
 
27
  [tool.pdm]
28
  distribution = false
29
 
 
 
 
 
 
 
 
30
  [tool.pdm.scripts]
31
+ download-data = "python3 scripts/download_data.py"
32
+ generate-features = "python3 scripts/generate_features.py"
 
33
  train-all = "python3 scripts/train_all.py"
34
+ pipeline = "python3 scripts/download_data.py && python3 scripts/generate_features.py && python3 scripts/train_all.py"
 
 
 
scripts/download_data.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yfinance as yf
2
+ import pandas as pd
3
+ import os
4
+
5
def download_stock_data(ticker, start, end, save_path):
    """Download price history for ``ticker`` and save it as a CSV file.

    Args:
        ticker: Symbol passed to ``yf.download``, e.g. ``"PETR4.SA"``.
        start: Start date string passed to ``yf.download``.
        end: End date string passed to ``yf.download``.
        save_path: Destination CSV path. Its parent directory is created
            when it does not exist yet.

    Returns:
        The downloaded ``pandas.DataFrame`` with single-level columns.

    Raises:
        ValueError: If the download returned no rows.
    """
    # Create the directory that will actually receive the file instead of
    # assuming every save_path lives under "data".
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    print(f"Baixando dados de {ticker}...")
    df = yf.download(ticker, start=start, end=end, progress=False)

    # An empty raw file would silently poison every later pipeline step.
    if df.empty:
        raise ValueError(
            f"Nenhum dado retornado para {ticker} entre {start} e {end}"
        )

    # yf.download can return MultiIndex columns; written as-is they produce
    # a multi-row CSV header. Keep only the first level so the CSV has one
    # header row.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    df.to_csv(save_path)
    print(f"Dados salvos em {save_path}")
    print(f"Shape: {df.shape}")
    print(f"Colunas: {list(df.columns)}")
    return df


if __name__ == "__main__":
    download_stock_data(
        "PETR4.SA", "2000-01-01", "2025-01-01", "data/raw_PETR4_SA.csv"
    )
src/features.py → scripts/generate_features.py RENAMED
@@ -1,6 +1,7 @@
 
 
1
  from src.feature_engineering import prepare_data_for_all_horizons
2
 
3
  if __name__ == "__main__":
4
- ticker = "PETR4.SA"
5
- prepare_data_for_all_horizons(ticker)
6
- print("Arquivos gerados com sucesso na pasta data/")
 
1
"""Generate the feature and target CSV files for every horizon."""
import sys
from pathlib import Path

# Put the repository root (the parent of scripts/) on the import path so
# that ``src`` resolves no matter which directory the script is started
# from. NOTE: the data paths used inside ``src`` are still relative to the
# current working directory.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.feature_engineering import prepare_data_for_all_horizons  # noqa: E402

if __name__ == "__main__":
    prepare_data_for_all_horizons("PETR4.SA")
    print("Arquivos gerados com sucesso")
 
scripts/load_datas.py DELETED
File without changes
scripts/train_all.py CHANGED
@@ -1,12 +1,6 @@
1
- from src.model_training import train_model
 
 
2
 
3
- horizons = ['10m', '2y', '5y', '10y']
4
-
5
- print("=== Treinando com Random Forest ===")
6
- for h in horizons:
7
- train_model(h, model_type='random_forest')
8
-
9
- """print("\n=== Treinando com Regressão Logística ===")
10
- for h in horizons:
11
- train_model(h, model_type='logistic')
12
- """
 
1
"""Train every model type for every horizon."""
import sys
from pathlib import Path

# Put the repository root (the parent of scripts/) on the import path so
# that ``src`` resolves no matter which directory the script is started
# from. NOTE: the data/model paths used inside ``src`` are still relative
# to the current working directory.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.model_training import train_all_horizons  # noqa: E402

if __name__ == "__main__":
    train_all_horizons()
 
 
 
 
 
 
 
 
src/backtest.py DELETED
File without changes
src/data_collection.py DELETED
@@ -1,24 +0,0 @@
1
- import yfinance as yf
2
- import pandas as pd
3
- import os
4
-
5
- def download_stock_data(ticker: str, start: str, end: str, interval="1d") -> pd.DataFrame:
6
- df = yf.download(ticker, start=start, end=end, interval=interval, progress=False)
7
-
8
- if isinstance(df.columns, pd.MultiIndex):
9
- df.columns = df.columns.droplevel(1)
10
- return df
11
-
12
- def save_data(df: pd.DataFrame, ticker: str, data_dir='data'):
13
- os.makedirs(data_dir, exist_ok=True)
14
- filename = f"{ticker.replace('.', '_')}.csv"
15
- path = os.path.join(data_dir, filename)
16
- df.to_csv(path)
17
- print(f'Dados salvos em {path}')
18
- return path
19
-
20
- if __name__ == "__main__":
21
- ticker = "PETR4.SA"
22
- df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
23
- print(df.head())
24
- save_data(df, ticker)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{features_engineering.py → feature_engineering.py} RENAMED
@@ -2,25 +2,24 @@ import pandas as pd
2
  import numpy as np
3
  import os
4
 
5
- def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
6
  ticker_clean = ticker.replace('.', '_')
7
  filename = f"raw_{ticker_clean}.csv"
8
  path = os.path.join(data_dir, filename)
9
- if not os.path.exists(path):
10
- alt_path = os.path.join(data_dir, f"{ticker_clean}.csv")
11
- if os.path.exists(alt_path):
12
- path = alt_path
13
- else:
14
- raise FileNotFoundError(f"Arquivo não encontrado: {path} ou {alt_path}")
15
- df = pd.read_csv(path, index_col=0, parse_dates=True)
16
  return df
17
 
18
- def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
19
- df_daily.index = pd.to_datetime(df_daily.index)
20
- monthly = df_daily.resample('M').last()
21
  return monthly
22
 
23
- def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
24
  df = df_monthly.copy()
25
  close = df["Close"]
26
  for lag in [1, 2, 3, 4, 5]:
@@ -34,20 +33,17 @@ def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
34
  df.dropna(inplace=True)
35
  return df
36
 
37
- def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
38
  os.makedirs('data', exist_ok=True)
39
  df_daily = load_data(ticker)
 
40
  df_monthly = resample_monthly(df_daily)
 
41
  for h in horizons:
42
  df_h = create_features_and_target(df_monthly, h)
43
- feature_cols = [col for col in df_h.columns
44
- if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
45
  X = df_h[feature_cols]
46
  y = df_h['target']
47
  X.to_csv(f"data/features_{h}m.csv")
48
  y.to_csv(f"data/target_{h}m.csv", header=['target'])
49
  print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
50
-
51
- if __name__ == "__main__":
52
- ticker = "PETR4.SA"
53
- prepare_data_for_all_horizons(ticker)
 
2
  import numpy as np
3
  import os
4
 
5
def load_data(ticker, data_dir='data'):
    """Load the raw price CSV for ``ticker`` as a date-indexed numeric frame.

    The file ``raw_<ticker with '.' replaced by '_'>.csv`` is read from
    ``data_dir``. The first CSV column is always used as the index, so the
    file loads correctly whether that column is headed ``Date``, is
    unnamed, or belongs to a multi-row header (extra header rows have a
    first cell that is not a date and are dropped).

    Args:
        ticker: Symbol such as ``"PETR4.SA"``.
        data_dir: Directory that holds the raw CSV.

    Returns:
        A ``pandas.DataFrame`` indexed by a sorted ``DatetimeIndex`` named
        ``Date``, with numeric columns and no rows containing missing values.
    """
    ticker_clean = ticker.replace('.', '_')
    filename = f"raw_{ticker_clean}.csv"
    path = os.path.join(data_dir, filename)

    # Using the first column as index avoids depending on its header name;
    # previously an unexpected header left a RangeIndex that was then
    # coerced into meaningless 1970 timestamps.
    df = pd.read_csv(path, index_col=0)
    df.index = pd.to_datetime(df.index, errors='coerce')

    # Rows whose index is not a date (e.g. leftover header rows) become NaT.
    df = df[df.index.notna()]

    # Leftover header rows force object dtype on every column; restore
    # numeric dtypes so downstream arithmetic works.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.dropna()
    df.index.name = 'Date'
    return df.sort_index()
17
 
18
def resample_monthly(df_daily):
    """Downsample a date-indexed frame to month-end rows.

    Each output row holds the last observation of its calendar month.

    Args:
        df_daily: Frame indexed by a ``DatetimeIndex``.

    Returns:
        A new frame with one row per month, labelled by month end.
    """
    try:
        monthly = df_daily.resample('ME').last()
    except ValueError:
        # pandas releases older than 2.2 do not know the 'ME' alias and
        # reject it as an invalid frequency; 'M' is the equivalent there.
        monthly = df_daily.resample('M').last()
    return monthly
21
 
22
+ def create_features_and_target(df_monthly, horizon_months):
23
  df = df_monthly.copy()
24
  close = df["Close"]
25
  for lag in [1, 2, 3, 4, 5]:
 
33
  df.dropna(inplace=True)
34
  return df
35
 
36
def prepare_data_for_all_horizons(ticker, horizons=(10, 24, 60, 120), data_dir='data'):
    """Build and save the feature and target CSVs for each horizon.

    Loads the raw data for ``ticker``, resamples it to monthly rows, and for
    every horizon writes ``features_<h>m.csv`` and ``target_<h>m.csv`` into
    ``data_dir``.

    Args:
        ticker: Symbol such as ``"PETR4.SA"``.
        horizons: Iterable of horizons in months. The default is a tuple so
            no mutable object is shared between calls.
        data_dir: Directory the raw file is read from and the outputs are
            written to.
    """
    os.makedirs(data_dir, exist_ok=True)
    df_daily = load_data(ticker, data_dir)
    print(f"Dados carregados: {len(df_daily)} linhas")
    df_monthly = resample_monthly(df_daily)
    print(f"Dados mensais: {len(df_monthly)} linhas")

    # Raw price columns and the label itself are not model inputs.
    excluded = {'target', 'Close', 'Open', 'High', 'Low', 'Volume'}

    for h in horizons:
        df_h = create_features_and_target(df_monthly, h)
        feature_cols = [col for col in df_h.columns if col not in excluded]
        X = df_h[feature_cols]
        y = df_h['target']
        if X.empty:
            # Header-only files make training fail later with an unrelated
            # looking error, so flag the problem where it originates.
            print(f"Aviso: horizonte {h} meses não gerou nenhuma amostra.")
        X.to_csv(os.path.join(data_dir, f"features_{h}m.csv"))
        y.to_csv(os.path.join(data_dir, f"target_{h}m.csv"), header=['target'])
        print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
 
 
 
 
src/model_training.py CHANGED
@@ -3,43 +3,46 @@ import numpy as np
3
  from sklearn.linear_model import LogisticRegression
4
  from sklearn.ensemble import RandomForestClassifier
5
  from sklearn.metrics import accuracy_score, classification_report
 
6
  import os
7
- import joblib
8
 
9
  def load_features_target(horizon):
10
  X = pd.read_csv(f"data/features_{horizon}m.csv", index_col=0)
11
- Y = pd.read_csv(f"data/target_{horizon}m.csv", index_col=0).squeeze()
12
-
13
- return X, Y
14
 
15
- def temporal_train_split(X, Y, test_size=0.2):
16
  n = len(X)
17
  split_idx = int(n * (1 - test_size))
18
  X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
19
- Y_train, Y_test = Y.iloc[:split_idx], Y.iloc[split_idx:]
20
-
21
- return X_train, X_test, Y_train, Y_test
22
 
23
- def train_model(horizon, model_type="random_forest"):
24
- X, Y = load_features_target(horizon)
25
-
26
- X_train, X_test, Y_train, Y_test = temporal_train_split(X, Y, test_size=0.2)
27
-
28
- try:
29
  model = RandomForestClassifier(n_estimators=100, random_state=42)
30
- except Exception as error:
31
- raise ValueError(f"There's a error in training model: {error}")
32
-
33
- model.fit(X_train, Y_train)
34
-
35
  y_pred = model.predict(X_test)
36
- accuracy = accuracy_score(Y_test, y_pred)
37
- report = classification_report(Y_test, y_pred, target_names=['Queda', 'Sobe'])
38
-
39
- if model:
40
- filename = f"models/{model_type}_{horizon}m.pkl"
 
41
  os.makedirs('models', exist_ok=True)
42
- joblib.dump(model, filename)
43
- print(f"Modelo salvo em {filename}")
44
-
45
- return model, accuracy, report
 
 
 
 
 
 
 
 
 
3
  from sklearn.linear_model import LogisticRegression
4
  from sklearn.ensemble import RandomForestClassifier
5
  from sklearn.metrics import accuracy_score, classification_report
6
+ import joblib
7
  import os
 
8
 
9
def load_features_target(horizon, data_dir='data'):
    """Read the saved feature matrix and target vector for one horizon.

    Args:
        horizon: Horizon in months; selects ``features_<horizon>m.csv`` and
            ``target_<horizon>m.csv``.
        data_dir: Directory that holds both files.

    Returns:
        ``(X, y)`` where ``X`` is a ``DataFrame`` and ``y`` is always a
        ``Series``, both indexed by the first CSV column.
    """
    X = pd.read_csv(os.path.join(data_dir, f"features_{horizon}m.csv"), index_col=0)
    target = pd.read_csv(os.path.join(data_dir, f"target_{horizon}m.csv"), index_col=0)
    # Squeeze only the column axis: a bare squeeze() collapses a one-row
    # file to a scalar, which breaks the later ``y.iloc[...]`` slicing.
    y = target.squeeze("columns")
    return X, y
 
13
 
14
def temporal_train_split(X, y, test_size=0.2):
    """Split ``X`` and ``y`` by position, keeping row order.

    The first ``1 - test_size`` share of rows becomes the training set and
    the remaining rows become the test set; nothing is shuffled.

    Args:
        X: Object supporting ``len`` and ``.iloc`` slicing.
        y: Object supporting ``len`` and ``.iloc`` slicing, same length as ``X``.
        test_size: Fraction of rows, between 0 and 1, placed in the test set.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]`` or the lengths of
            ``X`` and ``y`` differ.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size deve estar entre 0 e 1, recebido {test_size}")
    n = len(X)
    if len(y) != n:
        raise ValueError(f"X e y têm tamanhos diferentes: {n} != {len(y)}")

    # Round away binary floating point noise before truncating: for example
    # 10 * (1 - 0.9) evaluates to 0.9999999999999998, which a bare int()
    # would turn into an empty training set.
    split_idx = int(round(n * (1 - test_size), 9))

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    return X_train, X_test, y_train, y_test
 
20
 
21
def train_model(horizon, model_type='random_forest', save_model=True):
    """Train, evaluate and optionally persist a classifier for one horizon.

    The saved features/target for ``horizon`` are split by position (last
    20% as test set), the chosen model is fitted on the first part and
    scored on the second.

    Args:
        horizon: Horizon in months; selects the feature/target files.
        model_type: ``'random_forest'`` or ``'logistic'``.
        save_model: When true, dump the fitted model to
            ``models/<model_type>_<horizon>m.pkl``.

    Returns:
        ``(model, acc, report)``: the fitted estimator, the test accuracy
        and the text classification report.

    Raises:
        ValueError: If ``model_type`` is unknown, or the train or test
            partition is empty.
    """
    # Reject unknown names up front: silently falling back to a random
    # forest would save that model under a misleading file name.
    if model_type not in ('random_forest', 'logistic'):
        raise ValueError(
            f"model_type desconhecido: {model_type!r}; "
            "use 'random_forest' ou 'logistic'"
        )

    X, y = load_features_target(horizon)
    X_train, X_test, y_train, y_test = temporal_train_split(X, y, test_size=0.2)
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"Dados insuficientes para o horizonte {horizon} meses: "
            f"{len(X)} amostras"
        )

    if model_type == 'logistic':
        model = LogisticRegression(max_iter=1000, random_state=42)
    else:
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Pin the label set to the classes seen in training so the report does
    # not fail when the test window happens to contain a single class.
    class_names = ['Queda', 'Sobe']
    classes = list(model.classes_)
    if len(classes) == len(class_names):
        report = classification_report(
            y_test, y_pred, labels=classes, target_names=class_names,
            zero_division=0,
        )
    else:
        # The display names only make sense for a two-class problem.
        report = classification_report(y_test, y_pred, zero_division=0)

    print(f"\nHorizonte: {horizon} meses - {model_type}")
    print(f"Acurácia: {acc:.4f}")
    print(report)

    if save_model:
        os.makedirs('models', exist_ok=True)
        model_path = f"models/{model_type}_{horizon}m.pkl"
        joblib.dump(model, model_path)
        print(f"Modelo salvo em {model_path}")
    return model, acc, report
40
+
41
def train_all_horizons(horizons=(10, 24, 60, 120)):
    """Train a random forest and a logistic regression for every horizon.

    Args:
        horizons: Iterable of horizons in months.

    Returns:
        Dict mapping ``(model_type, horizon)`` to the
        ``(model, acc, report)`` tuple returned by ``train_model``.
    """
    # Materialize once so a one-shot iterable still feeds both loops.
    horizons = tuple(horizons)
    results = {}

    print("Treinando Random Forest...")
    for h in horizons:
        results[('random_forest', h)] = train_model(h, 'random_forest')

    print("\nTreinando Regressão Logística...")
    for h in horizons:
        results[('logistic', h)] = train_model(h, 'logistic')

    return results
src/models.py DELETED
File without changes