kauabarros-24 committed on Mar 11

Commit

43056e4

1 Parent(s): 651b3a9

CHORE: Generate most of the data files

Files changed (22) hide show

Untitled-1 +1 -0
data/PETR4_SA.csv +0 -0
data/features_10m.csv +1 -0
data/features_120m.csv +1 -0
data/features_24m.csv +1 -0
data/features_60m.csv +1 -0
data/raw_PETR4_SA.csv +0 -0
data/target_10m.csv +1 -0
data/target_120m.csv +1 -0
data/target_24m.csv +1 -0
data/target_60m.csv +1 -0
data/teste.csv +0 -0
pyproject.toml +4 -20
scripts/download_data.py +16 -0
src/features.py → scripts/generate_features.py +4 -3
scripts/load_datas.py +0 -0
scripts/train_all.py +5 -11
src/backtest.py +0 -0
src/data_collection.py +0 -24
src/{features_engineering.py → feature_engineering.py} +15 -19
src/model_training.py +31 -28
src/models.py +0 -0

Untitled-1 ADDED Viewed

	@@ -0,0 +1 @@


1	+

data/PETR4_SA.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

data/features_10m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12

data/features_120m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12

data/features_24m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12

data/features_60m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,Price,ret_1m,ret_2m,ret_3m,ret_4m,ret_5m,ma6,ma12,close/ma6,close/ma12

data/raw_PETR4_SA.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/target_10m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,target

data/target_120m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,target

data/target_24m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,target

data/target_60m.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,target

data/teste.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -12,14 +12,9 @@ dependencies = [
     "statsmodels>=0.14.0",
     "scikit-learn>=1.3.0",
     "yfinance>=0.2.0",
-    "pandas-datareader>=0.10.0",
     "matplotlib>=3.7.0",
     "seaborn>=0.12.0",
-    "plotly>=5.14.0",
-    "jupyter>=1.0.0",
-    "jupyterlab>=4.0.0",
-    "ipykernel>=6.0.0",
-    "joblib>=1.5.3"
 ]
 requires-python = ">=3.12"
 readme = "README.md"
@@ -32,19 +27,8 @@ build-backend = "pdm.backend"
 [tool.pdm]
 distribution = false
-[dependency-groups]
-dev = [
-    "black>=23.0.0",
-    "flake8>=6.0.0",
-    "pytest>=7.0.0"
-]
 [tool.pdm.scripts]
-download-data = "python3 src/data_collection.py"
-generate-features = "python3 src/features_enginnering.py"
-features = "python3 src/features.py"
 train-all = "python3 scripts/train_all.py"
-train-rf = "python3 -c 'from src.model_training import train_model; train_model(10, \"random_forest\"); train_model(24, \"random_forest\"); train_model(60, \"random_forest\"); train_model(120, \"random_forest\")'"
-train-logistic = "python -c 'from src.model_training import train_model; train_model(10, \"logistic\"); train_model(24, \"logistic\"); train_model(60, \"logistic\"); train_model(120, \"logistic\")'"
-backtest = "python src/backtesting.py"
-pipeline = "pdm run download-data && pdm run generate-features && pdm run train-all"

     "statsmodels>=0.14.0",
     "scikit-learn>=1.3.0",
     "yfinance>=0.2.0",
     "matplotlib>=3.7.0",
     "seaborn>=0.12.0",
+    "joblib>=1.2.0"
 ]
 requires-python = ">=3.12"
 readme = "README.md"
 [tool.pdm]
 distribution = false
 [tool.pdm.scripts]
+download-data = "python3 scripts/download_data.py"
+generate-features = "python3 scripts/generate_features.py"
 train-all = "python3 scripts/train_all.py"
+pipeline = "python3 scripts/download_data.py && python3 scripts/generate_features.py && python3 scripts/train_all.py"

scripts/download_data.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import yfinance as yf
+import pandas as pd
+import os
+def download_stock_data(ticker, start, end, save_path):
+    os.makedirs('data', exist_ok=True)
+    print(f"Baixando dados de {ticker}...")
+    df = yf.download(ticker, start=start, end=end, progress=False)
+    df.to_csv(save_path)
+    print(f"Dados salvos em {save_path}")
+    print(f"Shape: {df.shape}")
+    print(f"Colunas: {list(df.columns)}")
+    return df
+if __name__ == "__main__":
+    download_stock_data("PETR4.SA", "2000-01-01", "2025-01-01", "data/raw_PETR4_SA.csv")

src/features.py → scripts/generate_features.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from src.feature_engineering import prepare_data_for_all_horizons
 if __name__ == "__main__":
-    ticker = "PETR4.SA"
-    prepare_data_for_all_horizons(ticker)
-    print("Arquivos gerados com sucesso na pasta data/")

+import sys
+sys.path.append('.')
 from src.feature_engineering import prepare_data_for_all_horizons
 if __name__ == "__main__":
+    prepare_data_for_all_horizons("PETR4.SA")
+    print("Arquivos gerados com sucesso")

scripts/load_datas.py DELETED Viewed

File without changes

scripts/train_all.py CHANGED Viewed

@@ -1,12 +1,6 @@
-from src.model_training import train_model
-horizons = ['10m', '2y', '5y', '10y']
-print("=== Treinando com Random Forest ===")
-for h in horizons:
-    train_model(h, model_type='random_forest')
-"""print("\n=== Treinando com Regressão Logística ===")
-for h in horizons:
-    train_model(h, model_type='logistic')
-"""

+import sys
+sys.path.append('.')
+from src.model_training import train_all_horizons
+if __name__ == "__main__":
+    train_all_horizons()

src/backtest.py DELETED Viewed

File without changes

src/data_collection.py DELETED Viewed

@@ -1,24 +0,0 @@
-import yfinance as yf
-import pandas as pd
-import os
-def download_stock_data(ticker: str, start: str, end: str, interval="1d") -> pd.DataFrame:
-    df = yf.download(ticker, start=start, end=end, interval=interval, progress=False)
-    if isinstance(df.columns, pd.MultiIndex):
-        df.columns = df.columns.droplevel(1)
-    return df
-def save_data(df: pd.DataFrame, ticker: str, data_dir='data'):
-    os.makedirs(data_dir, exist_ok=True)
-    filename = f"{ticker.replace('.', '_')}.csv"
-    path = os.path.join(data_dir, filename)
-    df.to_csv(path)
-    print(f'Dados salvos em {path}')
-    return path
-if __name__ == "__main__":
-    ticker = "PETR4.SA"
-    df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
-    print(df.head())
-    save_data(df, ticker)

src/{features_engineering.py → feature_engineering.py} RENAMED Viewed

@@ -2,25 +2,24 @@ import pandas as pd
 import numpy as np
 import os
-def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
     ticker_clean = ticker.replace('.', '_')
     filename = f"raw_{ticker_clean}.csv"
     path = os.path.join(data_dir, filename)
-    if not os.path.exists(path):
-        alt_path = os.path.join(data_dir, f"{ticker_clean}.csv")
-        if os.path.exists(alt_path):
-            path = alt_path
-        else:
-            raise FileNotFoundError(f"Arquivo não encontrado: {path} ou {alt_path}")
-    df = pd.read_csv(path, index_col=0, parse_dates=True)
     return df
-def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
-    df_daily.index = pd.to_datetime(df_daily.index)
-    monthly = df_daily.resample('M').last()
     return monthly
-def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
     df = df_monthly.copy()
     close = df["Close"]
     for lag in [1, 2, 3, 4, 5]:
@@ -34,20 +33,17 @@ def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
     df.dropna(inplace=True)
     return df
-def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
     os.makedirs('data', exist_ok=True)
     df_daily = load_data(ticker)
     df_monthly = resample_monthly(df_daily)
     for h in horizons:
         df_h = create_features_and_target(df_monthly, h)
-        feature_cols = [col for col in df_h.columns
-                        if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
         X = df_h[feature_cols]
         y = df_h['target']
         X.to_csv(f"data/features_{h}m.csv")
         y.to_csv(f"data/target_{h}m.csv", header=['target'])
         print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
-if __name__ == "__main__":
-    ticker = "PETR4.SA"
-    prepare_data_for_all_horizons(ticker)

 import numpy as np
 import os
+def load_data(ticker, data_dir='data'):
     ticker_clean = ticker.replace('.', '_')
     filename = f"raw_{ticker_clean}.csv"
     path = os.path.join(data_dir, filename)
+    df = pd.read_csv(path)
+    if 'Date' in df.columns:
+        df.set_index('Date', inplace=True)
+    elif 'Unnamed: 0' in df.columns:
+        df.set_index('Unnamed: 0', inplace=True)
+    df.index = pd.to_datetime(df.index, errors='coerce')
+    df = df.dropna()
     return df
+def resample_monthly(df_daily):
+    monthly = df_daily.resample('ME').last()
     return monthly
+def create_features_and_target(df_monthly, horizon_months):
     df = df_monthly.copy()
     close = df["Close"]
     for lag in [1, 2, 3, 4, 5]:
     df.dropna(inplace=True)
     return df
+def prepare_data_for_all_horizons(ticker, horizons=[10, 24, 60, 120]):
     os.makedirs('data', exist_ok=True)
     df_daily = load_data(ticker)
+    print(f"Dados carregados: {len(df_daily)} linhas")
     df_monthly = resample_monthly(df_daily)
+    print(f"Dados mensais: {len(df_monthly)} linhas")
     for h in horizons:
         df_h = create_features_and_target(df_monthly, h)
+        feature_cols = [col for col in df_h.columns if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
         X = df_h[feature_cols]
         y = df_h['target']
         X.to_csv(f"data/features_{h}m.csv")
         y.to_csv(f"data/target_{h}m.csv", header=['target'])
         print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")

src/model_training.py CHANGED Viewed

@@ -3,43 +3,46 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import accuracy_score, classification_report
 import os
-import joblib
 def load_features_target(horizon):
     X = pd.read_csv(f"data/features_{horizon}m.csv", index_col=0)
-    Y = pd.read_csv(f"data/target_{horizon}m.csv", index_col=0).squeeze()
-    return X, Y
-def temporal_train_split(X, Y, test_size=0.2):
     n = len(X)
     split_idx = int(n * (1 - test_size))
     X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
-    Y_train, Y_test = Y.iloc[:split_idx], Y.iloc[split_idx:]
-    return X_train, X_test, Y_train, Y_test
-def train_model(horizon, model_type="random_forest"):
-    X, Y = load_features_target(horizon)
-    X_train, X_test, Y_train, Y_test = temporal_train_split(X, Y, test_size=0.2)
-    try:
         model = RandomForestClassifier(n_estimators=100, random_state=42)
-    except Exception as error:
-        raise ValueError(f"There's a error in training model: {error}")
-    model.fit(X_train, Y_train)
     y_pred = model.predict(X_test)
-    accuracy = accuracy_score(Y_test, y_pred)
-    report = classification_report(Y_test, y_pred, target_names=['Queda', 'Sobe'])
-    if model:
-        filename = f"models/{model_type}_{horizon}m.pkl"
         os.makedirs('models', exist_ok=True)
-        joblib.dump(model, filename)
-        print(f"Modelo salvo em {filename}")
-    return model, accuracy, report

 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import accuracy_score, classification_report
+import joblib
 import os
 def load_features_target(horizon):
     X = pd.read_csv(f"data/features_{horizon}m.csv", index_col=0)
+    y = pd.read_csv(f"data/target_{horizon}m.csv", index_col=0).squeeze()
+    return X, y
+def temporal_train_split(X, y, test_size=0.2):
     n = len(X)
     split_idx = int(n * (1 - test_size))
     X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
+    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
+    return X_train, X_test, y_train, y_test
+def train_model(horizon, model_type='random_forest', save_model=True):
+    X, y = load_features_target(horizon)
+    X_train, X_test, y_train, y_test = temporal_train_split(X, y, test_size=0.2)
+    if model_type == 'logistic':
+        model = LogisticRegression(max_iter=1000, random_state=42)
+    else:
         model = RandomForestClassifier(n_estimators=100, random_state=42)
+    model.fit(X_train, y_train)
     y_pred = model.predict(X_test)
+    acc = accuracy_score(y_test, y_pred)
+    report = classification_report(y_test, y_pred, target_names=['Queda', 'Sobe'])
+    print(f"\nHorizonte: {horizon} meses - {model_type}")
+    print(f"Acurácia: {acc:.4f}")
+    print(report)
+    if save_model:
         os.makedirs('models', exist_ok=True)
+        joblib.dump(model, f"models/{model_type}_{horizon}m.pkl")
+        print(f"Modelo salvo em models/{model_type}_{horizon}m.pkl")
+    return model, acc, report
+def train_all_horizons():
+    horizons = [10, 24, 60, 120]
+    print("Treinando Random Forest...")
+    for h in horizons:
+        train_model(h, 'random_forest')
+    print("\nTreinando Regressão Logística...")
+    for h in horizons:
+        train_model(h, 'logistic')

src/models.py DELETED Viewed

File without changes