shyameati committed on
Commit
36bd0e2
·
1 Parent(s): 0bd1ef8

Initial FastAPI app for transcripts API

Browse files
Files changed (4) hide show
  1. Dockerfile +13 -0
  2. app.py +136 -0
  3. index.html +44 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the earnings-transcripts FastAPI service.
FROM python:3.10-slim

WORKDIR /code

# Copy and install dependencies first so the pip layer is cached
# across rebuilds that only change application code.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Hugging Face Spaces routes traffic to port 7860 by convention.
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
13
+
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI service exposing S&P 500 earnings-call transcripts.

Data comes from the Hugging Face dataset
``kurry/sp500_earnings_transcripts`` and is loaded lazily on first
request (see ``load_data_once``).
"""
import logging
from fastapi import FastAPI
from datasets import load_dataset
import pandas as pd
import numpy as np

# -------------------------------------------------
# Logging configuration
# -------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
logger = logging.getLogger(__name__)

app = FastAPI()

# -------------------------------------------------
# Lazy dataset cache
# -------------------------------------------------
# Populated once by load_data_once(); None until the first request
# that needs the data.
DF: "pd.DataFrame | None" = None
22
+
23
+
24
def load_data_once():
    """
    Load the dataset into the module-level ``DF`` cache on first call.

    Uses streaming mode because Hugging Face Spaces cannot download
    large datasets at startup; ``streaming=True`` avoids timeouts and
    memory issues. Subsequent calls return the cached DataFrame.

    Returns:
        pd.DataFrame: at most 5000 rows of the dataset.
    """
    global DF

    if DF is None:
        logger.info("Loading dataset (streaming mode): kurry/sp500_earnings_transcripts ...")

        ds = load_dataset(
            "kurry/sp500_earnings_transcripts",
            split="train",
            streaming=True
        )

        # Convert streaming dataset -> pandas DataFrame, capped to avoid
        # memory overload on Spaces. FIX: the original `if i > 5000`
        # test ran after appending and broke at i == 5001, collecting
        # 5002 rows; checking len(rows) caps at exactly 5000.
        max_rows = 5000  # safety limit for Spaces
        rows = []
        for item in ds:
            rows.append(item)
            if len(rows) >= max_rows:
                break

        DF = pd.DataFrame(rows)
        # NOTE(review): not thread-safe — two concurrent first requests
        # could both stream the dataset. Acceptable for a single-worker
        # Space; confirm if worker count changes.
        logger.info("Loaded %d rows into DataFrame", len(DF))

    return DF
53
+
54
+
55
+ # -------------------------------------------------
56
+ # Utility: convert NumPy → Python
57
+ # -------------------------------------------------
58
def to_python(obj):
    """Convert NumPy/pandas scalar types into plain, JSON-friendly
    Python values; anything unrecognized is returned unchanged."""
    scalar_casts = (
        ((np.integer, np.int64), int),
        ((np.floating, np.float64), float),
        ((np.bool_,), bool),
    )
    for numpy_types, cast in scalar_casts:
        if isinstance(obj, numpy_types):
            return cast(obj)
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    return obj
68
+
69
+
70
def clean_dict(d):
    """Return a copy of *d* with every value passed through to_python;
    keys are left untouched."""
    cleaned = {}
    for key, value in d.items():
        cleaned[key] = to_python(value)
    return cleaned
72
+
73
+
74
+ # -------------------------------------------------
75
+ # Routes
76
+ # -------------------------------------------------
77
+
78
@app.on_event("startup")
def startup_event():
    """Log a banner once when the application process boots.

    NOTE(review): FastAPI has deprecated ``@app.on_event`` in favor of
    lifespan handlers — works today, but worth migrating eventually.
    """
    logger.info("🚀 Earnings Transcript API starting up")
81
+
82
+
83
@app.get("/")
def root():
    """Health-check endpoint confirming the API is running."""
    payload = {"message": "Earnings Transcript API is running"}
    logger.info("Root endpoint called")
    return payload
87
+
88
+
89
@app.get("/tickers")
def list_tickers():
    """Return the sorted list of unique ticker symbols in the dataset.

    Triggers the lazy dataset load on first call.
    """
    logger.info("Listing all tickers")
    frame = load_data_once()

    # Guard against schema drift in the upstream dataset.
    if "symbol" not in frame.columns:
        return {"error": "Dataset does not contain 'symbol' column"}

    unique_symbols = frame["symbol"].dropna().unique().tolist()
    unique_symbols.sort()
    logger.info(f"Returned {len(unique_symbols)} tickers")
    return {"tickers": unique_symbols}
100
+
101
+
102
@app.get("/transcript/{symbol}")
def get_transcript(symbol: str):
    """Return the first stored transcript row for *symbol*.

    The response contains the row's scalar fields (NumPy types
    converted to plain Python) plus a ``segments`` list extracted from
    the ``structured_content`` column (empty if absent).
    """
    logger.info(f"Transcript request received for symbol: {symbol}")

    frame = load_data_once()
    ticker = symbol.upper()

    if "symbol" not in frame.columns:
        return {"error": "Dataset missing 'symbol' column"}

    matches = frame[frame["symbol"] == ticker]

    if matches.empty:
        logger.warning(f"No transcripts found for symbol: {ticker}")
        return {"error": f"No transcripts found for symbol {ticker}"}

    # Only the first matching row is returned, even if the symbol has
    # transcripts for multiple quarters.
    first = matches.iloc[0]
    payload = clean_dict(first.to_dict())

    # Extract structured content (correct column name).
    segments = first.get("structured_content", None)

    if isinstance(segments, list):
        logger.info(f"Cleaning {len(segments)} segments for {ticker}")
        payload["segments"] = [
            clean_dict(segment)
            for segment in segments
            if isinstance(segment, dict)
        ]
    else:
        logger.info(f"No structured_content found for {ticker}")
        payload["segments"] = []

    logger.info(f"Returning transcript for {ticker}")
    return payload
136
+
index.html ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html>
<head>
<title>Ticker Checker</title>
<style>
body { font-family: Arial; margin: 40px; }
input { padding: 8px; width: 200px; }
button { padding: 8px 12px; }
#result { margin-top: 20px; font-size: 18px; }
</style>
</head>
<body>

<h2>Check if a Ticker Exists</h2>

<input id="tickerInput" type="text" placeholder="Enter ticker (e.g., AAPL)">
<button onclick="checkTicker()">Check</button>

<div id="result"></div>

<script>
// Checks whether the entered ticker exists in the dataset.
// FIX: the original fetched `/check/${ticker}` and read `data.exists`,
// but the API defines no /check route — only /, /tickers and
// /transcript/{symbol}. We now query the existing /tickers endpoint
// and test membership client-side.
async function checkTicker() {
    const ticker = document.getElementById("tickerInput").value.trim().toUpperCase();
    const resultDiv = document.getElementById("result");

    if (!ticker) {
        resultDiv.innerHTML = "Please enter a ticker.";
        return;
    }

    try {
        const response = await fetch("/tickers");
        const data = await response.json();
        const exists = Array.isArray(data.tickers) && data.tickers.includes(ticker);

        if (exists) {
            resultDiv.innerHTML = `<span style="color: green;">✔ ${ticker} exists in the dataset</span>`;
        } else {
            resultDiv.innerHTML = `<span style="color: red;">✘ ${ticker} does NOT exist in the dataset</span>`;
        }
    } catch (err) {
        // Network or server failure — report rather than fail silently.
        resultDiv.innerHTML = "Error contacting the API.";
    }
}
</script>

</body>
</html>
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi==0.109.0
+ uvicorn==0.27.0
+ datasets==2.18.0
+ pandas==2.2.0
+ numpy==1.26.4
+ pyarrow==15.0.0
6
+