TharaKavin committed on
Commit
b0560dc
·
verified ·
1 Parent(s): 96bf70a

Created app.py

Browse files
Files changed (1) hide show
  1. app.py +645 -0
app.py ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import io
import os
import tempfile
import time
import traceback

import duckdb  # kept for parity (not used directly in these benches)
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
13
+
14
+ # Optional libs
15
+ try:
16
+ import polars as pl
17
+ HAS_POLARS = True
18
+ except Exception:
19
+ pl = None
20
+ HAS_POLARS = False
21
+
22
+ # FireDucks new API: import the pandas shim
23
+ try:
24
+ import fireducks.pandas as fdpd
25
+ HAS_FIREDUCKS = True
26
+ except Exception:
27
+ fdpd = None
28
+ HAS_FIREDUCKS = False
29
+
30
+ # -------------------------
31
+ # Basic utils / data gen
32
+ # -------------------------
33
def generate_data(n_rows: int, n_groups: int = 50) -> pd.DataFrame:
    """Build a deterministic synthetic benchmark frame.

    Columns: id (0..n_rows-1), category ("cat_<k>" with k < n_groups),
    value1 ~ N(0, 1), value2 ~ N(10, 5), date (a day within 2020).
    Seeded with 42, so repeated calls yield identical data.
    """
    rng = np.random.default_rng(42)
    # Draw order matters for reproducibility: ints, normal, normal, ints.
    cat_codes = rng.integers(0, n_groups, size=n_rows)
    col_value1 = rng.normal(0, 1, size=n_rows)
    col_value2 = rng.normal(10, 5, size=n_rows)
    day_offsets = rng.integers(0, 365, size=n_rows).astype("timedelta64[D]")
    return pd.DataFrame(
        {
            "id": np.arange(n_rows),
            "category": np.array([f"cat_{c}" for c in cat_codes]),
            "value1": col_value1,
            "value2": col_value2,
            "date": np.datetime64("2020-01-01") + day_offsets,
        }
    )
46
+
47
def time_function(fn, repeats=3):
    """Run *fn* `repeats` times and return (mean_s, std_s, per-run times)."""
    n_runs = int(max(1, repeats))  # always execute at least once
    samples = []
    for _ in range(n_runs):
        t0 = time.perf_counter()
        fn()
        samples.append(time.perf_counter() - t0)
    per_run = [float(s) for s in samples]
    return float(np.mean(per_run)), float(np.std(per_run)), per_run
56
+
57
+ # -------------------------
58
+ # FireDucks helpers
59
+ # -------------------------
60
def ensure_fireducks_from_pandas(df: pd.DataFrame):
    """Return a FireDucks-backed DataFrame built from *df* via the shim.

    Raises:
        RuntimeError: if the fireducks.pandas shim is unavailable or no
            known constructor accepts a pandas DataFrame.
    """
    if not HAS_FIREDUCKS:
        raise RuntimeError("FireDucks (fireducks.pandas) not installed")

    result = None
    try:
        # Preferred path: the shim's pandas-compatible constructor.
        result = fdpd.DataFrame(df)
    except Exception:
        try:
            # Some shim versions expose an explicit converter instead.
            if hasattr(fdpd, "from_pandas"):
                result = fdpd.from_pandas(df)
        except Exception:
            result = None

    if result is None:
        raise RuntimeError("Could not construct FireDucks DataFrame from pandas with current shim")
    return result
80
+
81
def materialize_fireducks(obj):
    """Best-effort conversion of a FireDucks result back to pandas.

    Plain pandas frames pass through untouched; anything else is converted
    via .to_pandas() when FireDucks is available, falling back to the raw
    object if no converter exists or the conversion fails.
    """
    if isinstance(obj, pd.DataFrame):
        return obj  # already pandas, nothing to do
    if not HAS_FIREDUCKS:
        return obj
    try:
        converter = getattr(obj, "to_pandas", None)
        if converter is not None:
            return converter()
    except Exception:
        pass  # deliberately best-effort: fall through to the raw object
    return obj
94
+
95
+ # -------------------------
96
+ # Benchmark helpers
97
+ # -------------------------
98
def build_result(op_name, pandas_stats, polars_stats, fireducks_stats):
    """Assemble one benchmark row from per-engine timing stats.

    Each *_stats argument is either None (engine unavailable or failed) or
    the (mean_s, std_s, runs) tuple produced by time_function().

    Returns:
        dict with per-engine mean/std/runs plus speedups of Polars and
        FireDucks relative to the pandas mean (None when not computable).
    """
    def _unpack(stats):
        # Normalize a missing engine to three Nones.
        return stats if stats is not None else (None, None, None)

    p_mean, p_std, p_runs = _unpack(pandas_stats)
    pl_mean, pl_std, pl_runs = _unpack(polars_stats)
    fd_mean, fd_std, fd_runs = _unpack(fireducks_stats)

    def _speedup(baseline, other):
        # Explicit None checks instead of truthiness: a (theoretical) 0.0s
        # pandas mean should give a 0.0x speedup, not silently become None.
        if baseline is None or other is None or other <= 0:
            return None
        return baseline / other

    return {
        "operation": op_name,
        "pandas_mean_s": p_mean,
        "pandas_std_s": p_std,
        "pandas_runs": p_runs,
        "polars_mean_s": pl_mean,
        "polars_std_s": pl_std,
        "polars_runs": pl_runs,
        "fireducks_mean_s": fd_mean,
        "fireducks_std_s": fd_std,
        "fireducks_runs": fd_runs,
        "speedup_polars_over_pandas": _speedup(p_mean, pl_mean),
        "speedup_fireducks_over_pandas": _speedup(p_mean, fd_mean),
    }
120
+
121
+ # -------------------------
122
+ # Bench functions (all kept)
123
+ # -------------------------
124
def bench_filter(df: pd.DataFrame, repeats=3):
    """Benchmark a compound boolean filter on each available engine.

    Selects rows where value1 > 0.5 and category equals the first row's
    category. Polars/FireDucks results are materialized back to pandas so
    every engine pays a comparable conversion cost.
    """
    def pandas_filter():
        target = df["category"].iloc[0]
        _ = df[(df["value1"] > 0.5) & (df["category"] == target)]

    pandas_stats = time_function(pandas_filter, repeats)

    polars_stats = None
    if HAS_POLARS:
        pl_frame = pl.from_pandas(df)

        def polars_filter():
            target = pl_frame["category"][0]
            _ = pl_frame.filter(
                (pl.col("value1") > 0.5) & (pl.col("category") == target)
            ).to_pandas()

        polars_stats = time_function(polars_filter, repeats)

    fireducks_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_frame = ensure_fireducks_from_pandas(df)

            def fireducks_filter():
                target = fd_frame["category"].iloc[0]
                mask = (fd_frame["value1"] > 0.5) & (fd_frame["category"] == target)
                _ = materialize_fireducks(fd_frame[mask])

            fireducks_stats = time_function(fireducks_filter, repeats)
        except Exception:
            fireducks_stats = None

    return build_result("Filter", pandas_stats, polars_stats, fireducks_stats)
150
+
151
def bench_groupby(df: pd.DataFrame, repeats=3):
    """Benchmark a groupby-mean of value1/value2 over category per engine."""
    def p_op():
        _ = df.groupby("category")[["value1", "value2"]].mean()

    p_stats = time_function(p_op, repeats)

    pl_stats = None
    if HAS_POLARS:
        pl_df = pl.from_pandas(df)

        def pl_op():
            _ = pl_df.group_by("category").agg(
                [pl.col("value1").mean(), pl.col("value2").mean()]
            ).to_pandas()

        pl_stats = time_function(pl_op, repeats)

    fd_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_df = ensure_fireducks_from_pandas(df)

            def fd_op():
                # BUG FIX: the FireDucks shim mirrors the *pandas* API, so the
                # method is `groupby`, not Polars-style `group_by`. The old
                # spelling raised AttributeError inside time_function, which
                # the surrounding try swallowed — FireDucks silently reported
                # as unavailable for this benchmark.
                res = fd_df.groupby("category")[["value1", "value2"]].mean()
                _ = materialize_fireducks(res)

            fd_stats = time_function(fd_op, repeats)
        except Exception:
            fd_stats = None

    return build_result("Groupby mean", p_stats, pl_stats, fd_stats)
176
+
177
def bench_join(df: pd.DataFrame, repeats=3):
    """Benchmark a left join of df against a small per-category dimension table."""
    unique_cats = df["category"].unique()
    rng = np.random.default_rng(123)
    dim_df = pd.DataFrame(
        {"category": unique_cats, "weight": rng.uniform(0.5, 2.0, len(unique_cats))}
    )

    pandas_stats = time_function(
        lambda: df.merge(dim_df, on="category", how="left"), repeats
    )

    polars_stats = None
    if HAS_POLARS:
        pl_fact = pl.from_pandas(df)
        pl_dim = pl.from_pandas(dim_df)

        def polars_join():
            _ = pl_fact.join(pl_dim, on="category", how="left").to_pandas()

        polars_stats = time_function(polars_join, repeats)

    fireducks_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_fact = ensure_fireducks_from_pandas(df)
            fd_dim = ensure_fireducks_from_pandas(dim_df)

            def fireducks_join():
                _ = materialize_fireducks(
                    fd_fact.merge(fd_dim, on="category", how="left")
                )

            fireducks_stats = time_function(fireducks_join, repeats)
        except Exception:
            fireducks_stats = None

    return build_result("Join on category", pandas_stats, polars_stats, fireducks_stats)
208
+
209
def bench_fillna(df: pd.DataFrame, repeats=3):
    """Benchmark replacing nulls with 0 on each available engine."""
    pandas_stats = time_function(lambda: df.fillna(0), repeats)

    polars_stats = None
    if HAS_POLARS:
        pl_frame = pl.from_pandas(df)
        polars_stats = time_function(
            lambda: pl_frame.fill_null(0).to_pandas(), repeats
        )

    fireducks_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_frame = ensure_fireducks_from_pandas(df)
            fireducks_stats = time_function(
                lambda: materialize_fireducks(fd_frame.fillna(0)), repeats
            )
        except Exception:
            fireducks_stats = None

    return build_result("Fill NA / fillna", pandas_stats, polars_stats, fireducks_stats)
233
+
234
def bench_dropna(df: pd.DataFrame, repeats=3):
    """Benchmark dropping null-containing rows on each available engine."""
    pandas_stats = time_function(lambda: df.dropna(), repeats)

    polars_stats = None
    if HAS_POLARS:
        pl_frame = pl.from_pandas(df)
        polars_stats = time_function(
            lambda: pl_frame.drop_nulls().to_pandas(), repeats
        )

    fireducks_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_frame = ensure_fireducks_from_pandas(df)
            fireducks_stats = time_function(
                lambda: materialize_fireducks(fd_frame.dropna()), repeats
            )
        except Exception:
            fireducks_stats = None

    return build_result("Drop NA / dropna", pandas_stats, polars_stats, fireducks_stats)
258
+
259
def bench_sort(df: pd.DataFrame, repeats=3):
    """Benchmark sorting by value1 on each available engine."""
    pandas_stats = time_function(lambda: df.sort_values("value1"), repeats)

    polars_stats = None
    if HAS_POLARS:
        pl_frame = pl.from_pandas(df)
        polars_stats = time_function(
            lambda: pl_frame.sort("value1").to_pandas(), repeats
        )

    fireducks_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_frame = ensure_fireducks_from_pandas(df)
            fireducks_stats = time_function(
                lambda: materialize_fireducks(fd_frame.sort_values("value1")), repeats
            )
        except Exception:
            fireducks_stats = None

    return build_result("Sort by value1", pandas_stats, polars_stats, fireducks_stats)
283
+
284
def bench_describe(df: pd.DataFrame, repeats=3):
    """Benchmark summary statistics (describe) on each available engine."""
    pandas_stats = time_function(lambda: df.describe(), repeats)

    polars_stats = None
    if HAS_POLARS:
        pl_frame = pl.from_pandas(df)
        polars_stats = time_function(
            lambda: pl_frame.describe().to_pandas(), repeats
        )

    fireducks_stats = None
    if HAS_FIREDUCKS:
        try:
            fd_frame = ensure_fireducks_from_pandas(df)
            fireducks_stats = time_function(
                lambda: materialize_fireducks(fd_frame.describe()), repeats
            )
        except Exception:
            fireducks_stats = None

    return build_result("Describe()", pandas_stats, polars_stats, fireducks_stats)
308
+
309
def bench_read_csv(df: pd.DataFrame, repeats=3):
    """Benchmark CSV parsing: write *df* once, then time reads per engine.

    Uses a unique temporary file instead of the previous fixed name
    ("temp_bench.csv") so concurrent sessions (e.g. two Gradio users)
    cannot clobber each other's data, and guarantees cleanup via
    try/finally even when a benchmark raises.
    """
    handle, path = tempfile.mkstemp(suffix=".csv")
    os.close(handle)  # pandas reopens by path; keep only the name
    try:
        df.to_csv(path, index=False)

        p_stats = time_function(lambda: pd.read_csv(path), repeats)

        pl_stats = None
        if HAS_POLARS:
            pl_stats = time_function(lambda: pl.read_csv(path).to_pandas(), repeats)

        fd_stats = None
        if HAS_FIREDUCKS:
            try:
                fd_stats = time_function(
                    lambda: materialize_fireducks(fdpd.read_csv(path)), repeats
                )
            except Exception:
                # Fallback: wrap a pandas read if the shim's reader fails.
                try:
                    fd_stats = time_function(
                        lambda: materialize_fireducks(fdpd.DataFrame(pd.read_csv(path))),
                        repeats,
                    )
                except Exception:
                    fd_stats = None
    finally:
        try:
            os.remove(path)
        except OSError:
            pass

    return build_result("Read CSV", p_stats, pl_stats, fd_stats)
345
+
346
def bench_read_parquet(df: pd.DataFrame, repeats=3):
    """Benchmark Parquet parsing: write *df* once, then time reads per engine.

    Uses a unique temporary file instead of the previous fixed name
    ("temp_bench.parquet") so concurrent sessions cannot clobber each
    other's data, and guarantees cleanup via try/finally.
    """
    handle, path = tempfile.mkstemp(suffix=".parquet")
    os.close(handle)  # readers reopen by path; keep only the name
    try:
        df.to_parquet(path, index=False)

        p_stats = time_function(lambda: pd.read_parquet(path), repeats)

        pl_stats = None
        if HAS_POLARS:
            pl_stats = time_function(lambda: pl.read_parquet(path).to_pandas(), repeats)

        fd_stats = None
        if HAS_FIREDUCKS:
            try:
                fd_stats = time_function(
                    lambda: materialize_fireducks(fdpd.read_parquet(path)), repeats
                )
            except Exception:
                # Fallback: wrap a pandas read if the shim's reader fails.
                try:
                    fd_stats = time_function(
                        lambda: materialize_fireducks(fdpd.DataFrame(pd.read_parquet(path))),
                        repeats,
                    )
                except Exception:
                    fd_stats = None
    finally:
        try:
            os.remove(path)
        except OSError:
            pass

    return build_result("Read Parquet", p_stats, pl_stats, fd_stats)
382
+
383
def bench_write_parquet(df: pd.DataFrame, repeats=3):
    """Benchmark Parquet writes per engine, each to its own temp file.

    Unique tempfile paths replace the previous fixed temp_*.parquet names so
    concurrent runs cannot collide; cleanup is guaranteed by try/finally.
    """
    def _tmp_path():
        # Reserve a unique path; close the fd since engines reopen by name.
        handle, path = tempfile.mkstemp(suffix=".parquet")
        os.close(handle)
        return path

    paths = []
    try:
        pd_path = _tmp_path()
        paths.append(pd_path)
        p_stats = time_function(lambda: df.to_parquet(pd_path), repeats)

        pl_stats = None
        if HAS_POLARS:
            pl_path = _tmp_path()
            paths.append(pl_path)
            pl_df = pl.from_pandas(df)
            pl_stats = time_function(lambda: pl_df.write_parquet(pl_path), repeats)

        fd_stats = None
        if HAS_FIREDUCKS:
            try:
                fd_path = _tmp_path()
                paths.append(fd_path)
                fd_df = ensure_fireducks_from_pandas(df)

                def fd_op():
                    if hasattr(fd_df, "to_parquet"):
                        fd_df.to_parquet(fd_path)
                    else:
                        # Shim without a writer: materialize then write via pandas.
                        materialize_fireducks(fd_df).to_parquet(fd_path)

                fd_stats = time_function(fd_op, repeats)
            except Exception:
                fd_stats = None
    finally:
        for p in paths:
            try:
                os.remove(p)
            except OSError:
                pass

    return build_result("Write Parquet", p_stats, pl_stats, fd_stats)
415
+
416
+ # -------------------------
417
+ # UI helpers: chart and images
418
+ # -------------------------
419
def generate_chart_three(result):
    """Render a bar chart of per-engine mean times; returns a PIL image."""
    engine_times = [
        (label, result[key])
        for label, key in (
            ("Pandas", "pandas_mean_s"),
            ("Polars", "polars_mean_s"),
            ("FireDucks", "fireducks_mean_s"),
        )
        if result[key] is not None
    ]
    labels = [name for name, _ in engine_times]
    values = [t for _, t in engine_times]

    fig, ax = plt.subplots(figsize=(5, 3))
    ax.bar(labels, values)
    ax.set_ylabel("Time (s)")
    ax.set_title(result["operation"])
    # Print each bar's timing just above it.
    for idx, val in enumerate(values):
        ax.text(idx, val + max(values) * 0.01, f"{val:.4f}s", ha='center')

    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)
    return Image.open(buf)
443
+
444
def generate_speedbars(result):
    """Render horizontal relative-speed bars (longer bar = faster engine).

    Bar lengths are inverse times normalized so the fastest engine spans the
    full width; annotations show the raw time plus the multiplier versus the
    pandas baseline (or the fastest engine when pandas is absent).
    """
    pairs = [
        (name, result[key])
        for name, key in (
            ("Pandas", "pandas_mean_s"),
            ("Polars", "polars_mean_s"),
            ("FireDucks", "fireducks_mean_s"),
        )
        if result[key] is not None
    ]

    if not pairs:
        # Nothing ran: hand back a small blank placeholder.
        return Image.new("RGB", (600, 80), color=(240,240,240))

    engines = [name for name, _ in pairs]
    times = [t for _, t in pairs]

    fastest = min(times)
    # Multiplier baseline: pandas when it ran, otherwise the fastest engine.
    baseline = result["pandas_mean_s"] if result["pandas_mean_s"] else fastest

    # Invert so a smaller time yields a longer bar, then scale to 500 units.
    inverted = [fastest / t for t in times]
    top = max(inverted)
    lengths = [int(500 * (v / top)) for v in inverted]

    fig, ax = plt.subplots(figsize=(6, len(engines) * 0.6 + 0.5))
    positions = np.arange(len(engines))
    ax.barh(positions, lengths, align='center')
    ax.set_yticks(positions)
    ax.set_yticklabels(engines)
    ax.invert_yaxis()  # fastest engine appears on top
    ax.set_xlabel("Relative speed (normalized to fastest)")

    # Annotate each bar with its raw time and baseline multiplier.
    for row, (bar_len, t) in enumerate(zip(lengths, times)):
        note = f"{t:.4f}s"
        multiplier = baseline / t if baseline and t else None
        if multiplier:
            note += f" ({multiplier:.2f}x vs baseline)"
        ax.text(bar_len + 6, row, note, va='center')

    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)
    return Image.open(buf)
495
+
496
def format_result_md(result):
    """Format one benchmark result dict as a Markdown report."""
    parts = [
        f"### 🔬 {result['operation']}\n\n",
        "| Engine | Mean (s) | Std (s) |\n|---|---:|---:|\n",
        f"| Pandas | `{result['pandas_mean_s']}` | `{result['pandas_std_s']}` |\n",
        f"| Polars | `{result['polars_mean_s']}` | `{result['polars_std_s']}` |\n",
        f"| FireDucks | `{result['fireducks_mean_s']}` | `{result['fireducks_std_s']}` |\n\n",
    ]
    # Speedup bullets only appear when the ratio was computable.
    if result["speedup_polars_over_pandas"]:
        parts.append(f"- Polars speedup over Pandas: **{result['speedup_polars_over_pandas']:.2f}x**\n")
    if result["speedup_fireducks_over_pandas"]:
        parts.append(f"- FireDucks speedup over Pandas: **{result['speedup_fireducks_over_pandas']:.2f}x**\n")
    parts.append("\n<details><summary>Raw runs</summary>\n\n")
    parts.append(f"- Pandas runs: `{result['pandas_runs']}`\n")
    parts.append(f"- Polars runs: `{result['polars_runs']}`\n")
    parts.append(f"- FireDucks runs: `{result['fireducks_runs']}`\n")
    parts.append("\n</details>\n")
    return "".join(parts)
512
+
513
def fastest_engine_badge(result):
    """Return an HTML badge naming the fastest engine for this result."""
    candidates = [
        (name, result[key])
        for name, key in (
            ("Pandas", "pandas_mean_s"),
            ("Polars", "polars_mean_s"),
            ("FireDucks", "fireducks_mean_s"),
        )
        if result[key] is not None
    ]

    if not candidates:
        return "<div style='padding:8px;background:#f8d7da;color:#721c24;border-radius:6px'>No engines available</div>"

    engines = [name for name, _ in candidates]
    times = [t for _, t in candidates]
    idx = int(np.argmin(times))
    fastest = engines[idx]
    time_val = times[idx]
    return f"""
<div style="display:inline-block;padding:10px 14px;border-radius:8px;background:#0f172a;color:#fff">
<strong>Fastest:</strong> {fastest} — {time_val:.4f}s
</div>
"""
535
+
536
+ # -------------------------
537
+ # Dispatcher map
538
+ # -------------------------
539
# Maps UI operation labels to their benchmark functions. Keys are surfaced
# verbatim in the Gradio dropdowns; values all share the (df, repeats)
# signature and return a build_result() dict.
OPERATION_MAP = {
    "Filter": bench_filter,
    "Groupby": bench_groupby,
    "Join": bench_join,
    "Fillna": bench_fillna,
    "Dropna": bench_dropna,
    "Sort": bench_sort,
    "Describe": bench_describe,
    "Read CSV": bench_read_csv,
    "Read Parquet": bench_read_parquet,
    "Write Parquet": bench_write_parquet,
}
551
+
552
def run_benchmark_dispatch(operation, df, repeats):
    """Run the benchmark registered under *operation* against *df*.

    Raises:
        ValueError: if *operation* is not a key of OPERATION_MAP.
    """
    bench_fn = OPERATION_MAP.get(operation)
    if bench_fn is None:
        raise ValueError("Unsupported operation")
    return bench_fn(df, repeats)
557
+
558
+ # -------------------------
559
+ # Gradio UI (Option A layout)
560
+ # -------------------------
561
# Shared Gradio theme for the whole app.
theme = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

# Top-level UI: two tabs (synthetic data vs user-uploaded data), each wiring
# a dataset source + operation + repeats to the benchmark dispatcher.
with gr.Blocks(title="Pandas vs Polars vs FireDucks Benchmark", theme=theme) as demo:
    gr.Markdown("# 🐼 vs 🔥 vs ⚡ Pandas vs Polars vs FireDucks — Benchmark playground")

    with gr.Tabs():
        with gr.Tab("Synthetic dataset"):
            # Controls
            dataset_size = gr.Radio(["100k", "500k", "2M"], value="100k", label="Dataset size")
            operation = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
            repeats = gr.Slider(1, 7, value=3, label="Repeats")
            run_btn = gr.Button("Run benchmark")

            # OUTPUT LAYOUT (Option A): chart top -> speedbars -> fastest badge -> markdown
            chart_out = gr.Image(label="Timing chart (lower is better)", height=300, width=600)
            speedbars_out = gr.Image(label="Relative speedbars (fastest normalized to 1)", height=300, width=600)
            fastest_out = gr.HTML(label="Fastest engine")
            md_out = gr.Markdown()

            def run_synth(size, op, reps):
                """Generate a seeded synthetic frame of the chosen size and
                benchmark *op*; returns (chart, speedbars, badge html, md)."""
                # check optional libs
                missing = []
                if not HAS_POLARS:
                    missing.append("polars")
                if not HAS_FIREDUCKS:
                    missing.append("fireducks (fireducks.pandas shim)")
                # NOTE(review): any missing optional engine short-circuits the
                # whole run — a pandas-only benchmark is never attempted here;
                # confirm that is intended.
                if missing:
                    # return friendly warning in place of outputs
                    warn = f"⚠ Missing libraries: {', '.join(missing)}. Add them to requirements.txt if you want those engines tested."
                    # for images, return small placeholder image with warning text
                    img = Image.new("RGB", (800, 200), color=(250,250,250))
                    return img, img, f"<div style='color:#b45309;padding:10px'>{warn}</div>", f"**Warning**: {warn}"

                n = {"100k": 100_000, "500k": 500_000, "2M": 2_000_000}[size]
                df = generate_data(n)
                result = run_benchmark_dispatch(op, df, int(reps))

                # Build visuals
                chart = generate_chart_three(result)
                speedbars = generate_speedbars(result)
                fastest_html = fastest_engine_badge(result)
                md = format_result_md(result)
                return chart, speedbars, fastest_html, md

            run_btn.click(run_synth, [dataset_size, operation, repeats], [chart_out, speedbars_out, fastest_out, md_out])

        with gr.Tab("Custom dataset"):
            file_in = gr.File(label="Upload CSV / Parquet / Feather / Arrow", file_types=['.csv', '.parquet', '.feather', '.arrow'])
            operation_c = gr.Dropdown(list(OPERATION_MAP.keys()), value="Filter", label="Operation")
            repeats_c = gr.Slider(1, 7, value=3, label="Repeats")
            run_btn_c = gr.Button("Run on uploaded dataset")

            chart_out_c = gr.Image(label="Timing chart")
            speedbars_out_c = gr.Image(label="Relative speedbars")
            fastest_out_c = gr.HTML(label="Fastest engine")
            md_out_c = gr.Markdown()

            def run_custom(file, op, reps):
                """Load the uploaded file by extension and benchmark *op* on
                it; returns (chart, speedbars, badge html, md) or placeholder
                outputs when the upload is missing/unreadable."""
                if file is None:
                    img = Image.new("RGB", (800, 200), color=(250,250,250))
                    return img, img, "<div style='color:#b45309;padding:10px'>Upload a dataset file first.</div>", "Upload a dataset file first."
                fname = file.name
                try:
                    # Dispatch the reader on the file extension.
                    if fname.endswith(".csv"):
                        df = pd.read_csv(fname)
                    elif fname.endswith(".parquet"):
                        df = pd.read_parquet(fname)
                    elif fname.endswith(".feather") or fname.endswith(".arrow"):
                        df = pd.read_feather(fname)
                    else:
                        return Image.new("RGB", (800,200),(250,250,250)), Image.new("RGB",(800,200),(250,250,250)), "<div>Unsupported file format</div>", "Unsupported file format"
                except Exception as e:
                    return Image.new("RGB", (800,200),(250,250,250)), Image.new("RGB",(800,200),(250,250,250)), f"<div>Error reading file: {e}</div>", f"Error reading file: {e}"

                result = run_benchmark_dispatch(op, df, int(reps))
                chart = generate_chart_three(result)
                speedbars = generate_speedbars(result)
                fastest_html = fastest_engine_badge(result)
                md = format_result_md(result)
                return chart, speedbars, fastest_html, md

            run_btn_c.click(run_custom, [file_in, operation_c, repeats_c], [chart_out_c, speedbars_out_c, fastest_out_c, md_out_c])
643
+
644
# Script entry point: bind to all interfaces (needed inside containers such
# as Hugging Face Spaces) and honor a $PORT override, defaulting to 7860.
if __name__ == "__main__":
    demo.launch(server_name='0.0.0.0', server_port=int(os.environ.get("PORT", 7860)))