kobi-kadosh committed on
Commit
d4d9815
·
verified ·
1 Parent(s): d7b9e50

fix: load sample config (50k) instead of full 975k for fast cold-start

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -12,8 +12,9 @@ import random
12
  import gradio as gr
13
  from datasets import load_dataset
14
 
15
- # Load from the published dataset uses the default (HEAD) revision.
16
- # On first load this downloads ~95MB of Parquet; subsequent requests use cache.
 
17
  _DS = None
18
  _REVISION = "v1.0-rc1"
19
 
@@ -21,7 +22,7 @@ _REVISION = "v1.0-rc1"
21
  def _load():
22
  global _DS
23
  if _DS is None:
24
- _DS = load_dataset("carosh/cli-1m", revision=_REVISION, split="train")
25
  return _DS
26
 
27
 
@@ -80,8 +81,9 @@ def build_ui():
80
  gr.Markdown(
81
  "# CLI-1M Dataset Explorer\n"
82
  f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
83
- f"— revision `{_REVISION}` — {len(ds):,} rows\n\n"
84
- "Filter by shell, language, or industry bucket, then sample random rows."
 
85
  )
86
 
87
  with gr.Row():
 
12
  import gradio as gr
13
  from datasets import load_dataset
14
 
15
+ # Load the `sample` config (50k stratified rows): downloads ~4.6MB vs 95MB
16
+ # for the full default/train. Fast cold-start on CPU free tier.
17
+ # Users who need the full 975k can filter via load_dataset locally.
18
  _DS = None
19
  _REVISION = "v1.0-rc1"
20
 
 
22
  def _load():
23
  global _DS
24
  if _DS is None:
25
+ _DS = load_dataset("carosh/cli-1m", name="sample", revision=_REVISION, split="train")
26
  return _DS
27
 
28
 
 
81
  gr.Markdown(
82
  "# CLI-1M Dataset Explorer\n"
83
  f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
84
+ f"— `sample` config, revision `{_REVISION}` — {len(ds):,} stratified rows\n\n"
85
+ "Filter by shell, language, or industry bucket, then sample random rows. "
86
+ "The `sample` config is a stratified 50k subset of the full 975k corpus."
87
  )
88
 
89
  with gr.Row():