architojha committed on
Commit
323306f
·
1 Parent(s): 68e9b59
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official CPython 3.9.
FROM python:3.9

# Create an unprivileged account with UID 1000 and run everything below as it.
RUN useradd -m -u 1000 user
USER user
# Make console scripts from the user's ~/.local/bin (e.g. uvicorn) resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy the dependency list before the sources and install from it.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the build context into the image.
COPY --chown=user . /app

# Serve the ASGI object `app` from module `main` on port 7860.
# NOTE(review): the FastAPI instance visible in this change lives in api.py;
# confirm that a `main` module exposing `app` exists in the build context.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
api.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ import pandas as pd
4
+ import numpy as np
5
+ import tensorflow as tf
6
+ from yahoo_fin.stock_info import get_data
7
+ from sklearn.preprocessing import MinMaxScaler
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
+ from pytorch_forecasting import TemporalFusionTransformer
10
+ from bs4 import BeautifulSoup
11
+ import requests
12
+ import torch
13
+ from llama_index.llms.groq import Groq
14
+ from llama_index.core import StorageContext, load_index_from_storage
15
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
16
+ from dotenv import load_dotenv
17
+ from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
18
+ import os
19
+
20
# Read a local .env file into the process environment before any of the
# settings below are looked up.
load_dotenv()

# --- Retrieval components -------------------------------------------------
# The persisted index under ./rag_index is loaded with a BGE-small embedder.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
storage_context = StorageContext.from_defaults(persist_dir="rag_index")
index = load_index_from_storage(storage_context, embed_model=embed_model)

# Hosted LLM used to answer queries; the token is read from the HF_API
# environment variable (None when it is not set).
llm = HuggingFaceInferenceAPI(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    token=os.getenv('HF_API'),
)
query_engine = index.as_query_engine(llm=llm)

# --- Keras model used by /predict-prices/ ---------------------------------
MODEL_PATH = "lib/20_lstm_model.h5"
model = tf.keras.models.load_model(MODEL_PATH)

# --- Financial-news sentiment classifier ----------------------------------
model_name_news = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name_news)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name_news)

# --- Temporal Fusion Transformer checkpoint -------------------------------
best_model_path = 'lib/tft_pred.ckpt'
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# ASGI application object that the route decorators below attach to.
app = FastAPI()
44
+
45
class TickerRequest(BaseModel):
    # Request body accepted by the POST endpoints in this module.
    # All four values are passed to the data-fetch helper unchanged.

    # Symbol to look up.
    ticker: str
    # Bounds of the requested history, as strings.
    start_date: str
    end_date: str
    # Sampling interval; defaults to "1d".
    interval: str = "1d"
50
+
51
def fetch_and_process_ticker_data(ticker, start_date, end_date, interval="1d"):
    """Download price history for one ticker and add two derived columns.

    The frame returned by ``get_data`` (indexed by date) has its ``close``
    column dropped and gains:

    * ``revenue``      -- ``adjclose * volume``
    * ``daily_profit`` -- ``adjclose - open``

    Args:
        ticker: Symbol passed to ``get_data``.
        start_date: Start of the requested range, passed through unchanged.
        end_date: End of the requested range, passed through unchanged.
        interval: Sampling interval passed through unchanged.

    Returns:
        The processed ``pandas.DataFrame``.

    Raises:
        HTTPException: status 500 wrapping any error raised while fetching,
            processing or saving the data; the original error is chained as
            ``__cause__``.
    """
    try:
        # Work on the fetched frame directly: concatenating it onto an empty
        # placeholder frame added nothing and relies on pandas behaviour that
        # is deprecated for empty entries.
        df = get_data(
            ticker,
            start_date=start_date,
            end_date=end_date,
            index_as_date=True,
            interval=interval,
        )
        df = df.drop(columns="close")
        df["revenue"] = df["adjclose"] * df["volume"]
        df["daily_profit"] = df["adjclose"] - df["open"]
        # NOTE(review): this rewrites api_test.csv in the working directory on
        # every call and omits the date index (index=False); confirm it is
        # still wanted outside local debugging.
        df.to_csv("api_test.csv", index=False)  # Save locally for reference
    except Exception as error:
        raise HTTPException(
            status_code=500,
            detail=f"Error processing ticker {ticker}: {error}",
        ) from error
    return df
63
+
64
def ticker_encoded(df):
    """Add an integer ``ticker_encoded`` column derived from ``df['ticker']``.

    Only the five symbols in the fixed label map below are supported.

    Args:
        df: Frame with a ``ticker`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the new ``ticker_encoded`` column.

    Raises:
        KeyError: if ``df['ticker']`` contains a value that is not in the
            label map (the message lists the offending values).
    """
    label_map = {'ATOM': 0, 'HBIO': 1, 'IBEX': 2, 'MYFW': 3, 'NATH': 4}

    codes = df['ticker'].map(label_map)

    # Values missing from the map come back as NaN; report them explicitly
    # instead of failing with a bare KeyError on the first one.
    unknown = codes.isna()
    if unknown.any():
        rejected = sorted(set(df.loc[unknown, 'ticker'].astype(str)))
        raise KeyError(
            f"Unsupported ticker(s) {rejected}; "
            f"supported tickers are {sorted(label_map)}"
        )

    df['ticker_encoded'] = codes.astype('int64')
    return df
79
+
80
def normalize(df):
    """Min-max scale the price, volume/revenue and profit columns in place.

    Three independent ``MinMaxScaler`` instances are fitted on the data that
    is passed in: one over ``open``/``high``/``low``/``adjclose``, one over
    ``volume``/``revenue`` and one over ``daily_profit``.

    Returns:
        ``(df, price_scaler)`` -- the same frame with the scaled values and
        the scaler that was fitted on the four price columns.
    """
    price_scaler = MinMaxScaler()

    # Each group is fitted by its own scaler, in this order.
    scaler_groups = (
        (price_scaler, ["open", "high", "low", "adjclose"]),
        (MinMaxScaler(), ["volume", "revenue"]),
        (MinMaxScaler(), ["daily_profit"]),
    )
    for scaler, columns in scaler_groups:
        df[columns] = scaler.fit_transform(df[columns])

    return df, price_scaler
90
+
91
def create_sequence(dataset, window=20):
    """Slice ``dataset`` into fixed-length input windows and their targets.

    For every row position ``stop`` from ``window`` onwards, the ``window``
    rows before it form one input sequence and row ``stop`` is the target.
    A window is kept only when all of its rows and the target row share the
    same ``ticker_encoded`` value.

    Args:
        dataset: Frame with a ``date`` column (values must support
            ``strftime``), a ``ticker_encoded`` column and the feature
            columns, including ``open`` and ``adjclose``.
        window: Number of consecutive rows per input sequence. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        ``(sequences, labels, dates, stock)`` where ``sequences`` is an array
        of the windows (all columns except ``date``), ``labels`` holds the
        target rows' ``open`` and ``adjclose`` values, ``dates`` the target
        dates formatted as ``YYYY-MM-DD`` and ``stock`` the target
        ``ticker_encoded`` values as strings.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    sequences, labels, dates, stock = [], [], [], []

    # The date is metadata only; it is not part of the model input.
    features = dataset.drop(columns=["date"])

    for stop_idx in range(window, len(dataset)):
        start_idx = stop_idx - window
        window_classes = set(
            dataset.iloc[start_idx:stop_idx]["ticker_encoded"].values
        )
        target_row = dataset.iloc[stop_idx]
        target_class = target_row["ticker_encoded"]

        # Skip windows that mix tickers or whose target is another ticker.
        if len(window_classes) != 1 or target_class != next(iter(window_classes)):
            continue

        sequences.append(features.iloc[start_idx:stop_idx].values)
        labels.append(features.iloc[stop_idx][["open", "adjclose"]])
        dates.append(target_row["date"].strftime('%Y-%m-%d'))
        stock.append(str(target_class))

    return np.array(sequences), np.array(labels), dates, stock
114
+
115
def scaling_predictions(price_scaler, combined_dataset_prediction):
    """Map two-column scaled predictions back to the original price scale.

    ``price_scaler`` is expected to have been fitted on four columns; the
    parameters at positions 0 and 3 are applied to prediction columns 0 and 1
    respectively. The inverse of the min-max transform
    ``scaled = raw * scale_ + min_`` is ``raw = (scaled - min_) / scale_``.

    The scaler is left untouched. Earlier revisions overwrote its ``min_``
    and ``scale_`` attributes with the two-element subsets, which corrupted
    the fitted scaler and made a second call fail.

    Args:
        price_scaler: Object exposing fitted ``min_`` and ``scale_`` arrays
            with at least four entries.
        combined_dataset_prediction: Array-like of shape ``(n, 2)``.

    Returns:
        A new array of the same shape holding the unscaled values.
    """
    target_columns = [0, 3]
    offsets = np.asarray(price_scaler.min_)[target_columns]
    factors = np.asarray(price_scaler.scale_)[target_columns]

    # Work on a copy so neither the caller's predictions nor the scaler are
    # modified; non-float inputs are promoted before the in-place arithmetic.
    restored = np.array(combined_dataset_prediction, copy=True)
    if restored.dtype.kind != "f":
        restored = restored.astype(np.float64)
    restored -= offsets
    restored /= factors

    return restored
124
+
125
def storing_predictions(df, dates, stock, combined_dataset_prediction_inverse):
    """Attach predicted open/close prices to the rows they belong to.

    A prediction at position ``i`` belongs to the row whose ``date``
    (formatted ``YYYY-MM-DD``) equals ``dates[i]`` and whose
    ``ticker_encoded`` (as a string) equals ``stock[i]``. When several
    predictions share the same key, the first one is used.

    Args:
        df: Frame with ``date`` and ``ticker_encoded`` columns. The columns
            ``pred_open`` and ``pred_closing`` are added to it in place.
        dates: Target dates, one per prediction.
        stock: Target ticker codes as strings, one per prediction.
        combined_dataset_prediction_inverse: Indexable of
            ``(open, close)`` pairs, one per prediction.

    Returns:
        A new frame containing only the rows that received a prediction,
        with a fresh 0-based index.
    """
    # Index the predictions once instead of scanning the whole list for every
    # row; setdefault keeps the first occurrence of a repeated key.
    first_position = {}
    for position, key in enumerate(zip(dates, stock)):
        first_position.setdefault(key, position)

    df['pred_open'] = np.nan
    df['pred_closing'] = np.nan

    for idx, row in df.iterrows():
        key = (row.date.strftime('%Y-%m-%d'), str(row.ticker_encoded))
        position = first_position.get(key)
        if position is None:
            continue
        prediction = combined_dataset_prediction_inverse[position]
        df.loc[idx, 'pred_open'] = prediction[0]
        df.loc[idx, 'pred_closing'] = prediction[1]

    # Rows without a matching prediction are discarded.
    return df.dropna(subset=['pred_open', 'pred_closing']).reset_index(drop=True)
151
+
152
def scrape_news(ticker_name):
    """Scrape the first two Business Insider news pages for a ticker.

    Each ``div.latest-news__story`` element contributes one row built from
    its ``time.latest-news__date``, ``a.news-link`` and
    ``span.latest-news__source`` children. Articles missing any of those
    elements are skipped instead of aborting the whole scrape.

    Args:
        ticker_name: Symbol inserted into the news URL.

    Returns:
        A ``pandas.DataFrame`` with the columns ``datatime``, ``title``,
        ``source``, ``link``, ``top_sentiment`` (empty string) and
        ``sentiment_score`` (0). Rows are in reverse scrape order, i.e. the
        last article encountered comes first, matching the previous
        prepend-on-every-article behaviour. The index is a plain 0..n-1
        range rather than all zeros.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    columns = ['datatime', 'title', 'source', 'link', 'top_sentiment', 'sentiment_score']
    rows = []

    for page in range(1, 3):
        url = f'https://markets.businessinsider.com/news/{ticker_name}-stock?p={page}'
        # Without a timeout a stalled connection would block the request
        # handler indefinitely.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')

        for article in soup.find_all('div', class_='latest-news__story'):
            time_tag = article.find('time', class_='latest-news__date')
            link_tag = article.find('a', class_='news-link')
            source_tag = article.find('span', class_='latest-news__source')
            if time_tag is None or link_tag is None or source_tag is None:
                continue

            rows.append([
                time_tag.get('datetime'),
                link_tag.text,
                source_tag.text,
                link_tag.get('href'),
                '',  # top_sentiment placeholder
                0,   # sentiment_score placeholder
            ])

    # Build the frame once; reversing reproduces the order obtained by
    # prepending every new article to the accumulated frame.
    rows.reverse()
    return pd.DataFrame(rows, columns=columns)
184
+
185
+ def add_recent_news(main_df, news_df,lookback_days=10):
186
+
187
+ news_df.drop(columns=['top_sentiment', 'sentiment_score'], inplace=True)
188
+
189
+
190
+ main_df['date'] = pd.to_datetime(main_df['date'])
191
+ news_df['datatime'] = pd.to_datetime(news_df['datatime'])
192
+
193
+
194
+ news_list = []
195
+ last_available_news = ''
196
+
197
+ for _, row in main_df.iterrows():
198
+ current_date = row['date']
199
+ current_ticker = row['ticker']
200
+ news_articles = ''
201
+
202
+
203
+ for _, news_row in news_df.iterrows():
204
+ extracted_date = news_row['datatime']
205
+
206
+
207
+ if (current_date - extracted_date).days <= lookback_days and extracted_date < current_date:
208
+ news_articles += news_row['title'] + " "
209
+
210
+
211
+ if not news_articles.strip():
212
+ for _, news_row in news_df[::-1].iterrows():
213
+ if news_row['datatime'] < current_date:
214
+ news_articles = news_row['title']
215
+ break
216
+
217
+
218
+ last_available_news = news_articles.strip() or last_available_news
219
+ news_list.append(last_available_news)
220
+
221
+
222
+ main_df['news'] = news_list
223
+
224
+
225
+ return main_df
226
+
227
def news_sentiment(df):
    """Classify the ``news`` text of every row and replace it with a score.

    The module-level ``tokenizer`` and ``sentiment_model`` are run once over
    all non-null ``news`` values. Two columns are added to ``df`` in place:

    * ``predicted_sentiment`` -- ``"negative"``, ``"neutral"`` or
      ``"positive"``
    * ``sentiment_score``     -- ``-1``, ``0`` or ``1`` respectively

    Rows whose ``news`` value is null get NaN in both columns. Previously the
    texts were taken from all rows while the result index was taken from the
    non-null rows only, so a single null value made the two disagree.

    Args:
        df: Frame with a ``news`` column.

    Returns:
        A new frame equal to ``df`` without the ``news`` column.
    """
    news_column_name = 'news'
    # Position in this list corresponds to the class id predicted by the model.
    labels = ["negative", "neutral", "positive"]
    sentiment_map = {
        'positive': 1,
        'neutral': 0,
        'negative': -1
    }

    has_news = df[news_column_name].notna()
    texts = df.loc[has_news, news_column_name].tolist()

    predicted = []
    if texts:  # the tokenizer cannot be called with an empty batch
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():
            outputs = sentiment_model(**inputs)

        probs = torch.softmax(outputs.logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1)
        predicted = [labels[class_id] for class_id in predictions.tolist()]

    df['predicted_sentiment'] = pd.Series(
        predicted, index=df.index[has_news], dtype=object
    )
    df['sentiment_score'] = df['predicted_sentiment'].map(sentiment_map)

    return df.drop(columns=['news'])
263
+
264
def get_tft_predictions(df):
    """Add 20 per-ticker lag features and run the TFT checkpoint on ``df``.

    For every lag from 1 to 20 the columns ``open_lag_<n>`` and
    ``adjclose_lag_<n>`` are added to ``df`` in place by shifting ``open``
    and ``adjclose`` within each ``ticker`` group. Rows lacking any lag value
    are then removed from ``df`` in place.

    Returns:
        The result of ``best_tft.predict(df, mode="quantiles")``.
    """
    lags = range(1, 21)
    sources = ('open', 'adjclose')

    for lag in lags:
        for source in sources:
            df[f'{source}_lag_{lag}'] = df.groupby('ticker')[source].shift(lag)

    lag_columns = [f'{source}_lag_{lag}' for source in sources for lag in lags]
    df.dropna(subset=lag_columns, inplace=True)

    return best_tft.predict(df, mode="quantiles")
276
+
277
@app.post("/fetch-ticker-data/")
async def fetch_ticker_data(request: TickerRequest):
    """Return the processed price history for the requested ticker.

    The response is a list with one record per row of the processed frame.
    Any failure is reported as an HTTP 500 error.
    """
    try:
        result_df = fetch_and_process_ticker_data(
            ticker=request.ticker,
            start_date=request.start_date,
            end_date=request.end_date,
            interval=request.interval,
        )
        return result_df.to_dict(orient="records")
    except HTTPException:
        # Already a well-formed HTTP error from the helper; re-wrapping it
        # would replace its detail with the exception's string form.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
289
+
290
@app.post("/predict-prices/")
async def predict_prices(request: TickerRequest):
    """Predict an open and a close price for the requested ticker.

    Pipeline: fetch the price history, keep the last 60 rows, encode and
    normalise them, run the Keras model over 20-row windows, attach recent
    news and its sentiment, then run the TFT checkpoint. The response is
    ``{"open": ..., "close": ...}`` taken from the first two entries of the
    TFT output.

    Errors are reported as HTTP 500, except that a range too short to form a
    single 20-row window is reported as HTTP 400.
    """
    try:
        raw_data = fetch_and_process_ticker_data(
            ticker=request.ticker,
            start_date=request.start_date,
            end_date=request.end_date,
            interval=request.interval,
        )

        # Keep the most recent 60 rows and turn the date index into a column.
        raw_data = raw_data.tail(60)
        raw_data = raw_data.reset_index()
        raw_data.rename(columns={"index": "date"}, inplace=True)
        raw_data = ticker_encoded(raw_data)

        # normalize() scales raw_data in place, so keep an unscaled copy to
        # attach the predictions to.
        temp_df = raw_data.copy()

        normalized_data, scaler = normalize(raw_data)
        normalized_data = normalized_data.drop(columns=['ticker'])

        sequences, _, dates, stock = create_sequence(normalized_data)
        if len(sequences) == 0:
            # Fail with a clear message instead of handing an empty batch to
            # the model.
            raise HTTPException(
                status_code=400,
                detail="Not enough price history in the requested range to "
                       "build a 20-row input window.",
            )

        combined_dataset_prediction = model.predict(sequences)
        combined_dataset_prediction_inverse = scaling_predictions(
            scaler, combined_dataset_prediction
        )

        lstm_pred_df = storing_predictions(
            temp_df, dates, stock, combined_dataset_prediction_inverse
        )
        news_df = scrape_news(ticker_name=request.ticker)

        combined_with_news_df = add_recent_news(lstm_pred_df, news_df)
        sentiment_df = news_sentiment(combined_with_news_df)

        sentiment_df['time_idx'] = range(1, len(sentiment_df) + 1)

        predicted_values = get_tft_predictions(sentiment_df)

        final_pred_open_price = predicted_values[0].item()
        final_pred_closing_price = predicted_values[1].item()

        return {"open": final_pred_open_price, 'close': final_pred_closing_price}

    except HTTPException:
        # Preserve status code and detail of errors raised on purpose above
        # or by the helpers.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
335
+
336
+
337
@app.get("/query-rag/{user_query}")
def query_rag(user_query: str):
    # Pass the path parameter to the module-level query engine and return
    # whatever it produces under the 'message' key.
    answer = query_engine.query(user_query)
    return {'message': answer}
lib/20_lstm_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdb810150ae9d87484ce4ccc4fa5434647820411b2400b41c6fe368a3fa12f7a
3
+ size 422880
lib/tft_pred.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a38216d56d429e8f038efe3c1d83996d06b3e02b296352cf17e1e635c579371
3
+ size 2885961
rag_index/default__vector_store.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b07b216dd34042722c022963768ab830d48d385c645623e46afc83b37a4745c0
3
+ size 14374003
rag_index/docstore.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbf3547cf968b289a8fff77a16cb0143649df8424f45843822ad9c6853bf3d45
3
+ size 7500231
rag_index/graph_store.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e0a77744010862225c69da83c585f4f8a42fd551b044ce530dbb1eb6e16742c
3
+ size 18
rag_index/image__vector_store.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17ed74c1649a438e518a8dc56a7772913dfe1ea7a7605bce069c63872431455
3
+ size 72
rag_index/index_store.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0128597297ccb9b86477e4805882f2501c988031e30cc651d857ad3a5a3b870c
3
+ size 133807
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
fastapi
# Needed by the Dockerfile CMD, which starts the app with `uvicorn`.
uvicorn
pydantic
pandas
numpy
# NOTE(review): tensorflow 2.13 declares its own keras version range; confirm
# that it resolves together with the keras pin at the bottom of this file.
tensorflow==2.13.0
yahoo-fin
scikit-learn
transformers==4.39.2
pytorch_forecasting
# `from bs4 import BeautifulSoup` is provided by beautifulsoup4.
beautifulsoup4
# Parser backend requested via BeautifulSoup(html, 'lxml').
lxml
requests
torch
llama-index
llama-index-core
llama-index-embeddings-huggingface
# Provides `llama_index.llms.groq`, imported at the top of api.py.
llama-index-llms-groq
# `from dotenv import load_dotenv` is provided by python-dotenv.
python-dotenv
llama-index-llms-huggingface-api
keras==3.2.1