kobi-kadosh committed on
Commit
d28d1c7
·
verified ·
1 Parent(s): d4d9815

fix: move demo to top-level, lazy-load dataset on button click

Browse files
Files changed (1) hide show
  1. app.py +65 -75
app.py CHANGED
@@ -2,46 +2,41 @@
2
 
3
  Random-row viewer with bucket / shell / language filters.
4
  Deploy to HuggingFace Spaces (CPU Free tier).
5
-
6
- Usage on HF Spaces:
7
- This file + requirements.txt in the space repo is all you need.
8
- Set HF_TOKEN in Space secrets if the dataset requires auth.
9
  """
10
 
11
  import random
12
  import gradio as gr
13
  from datasets import load_dataset
14
 
15
- # Load the `sample` config (50k stratified rows) — downloads ~4.6MB vs 95MB
16
- # for the full default/train. Fast cold-start on CPU free tier.
17
- # Users who need the full 975k can filter via load_dataset locally.
18
- _DS = None
19
  _REVISION = "v1.0-rc1"
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  def _load():
23
  global _DS
24
  if _DS is None:
25
- _DS = load_dataset("carosh/cli-1m", name="sample", revision=_REVISION, split="train")
 
 
26
  return _DS
27
 
28
 
29
- def _get_options(ds):
30
- shells = sorted(set(ds["shell"]))
31
- langs = sorted(set(ds["language"]))
32
- buckets_flat = set()
33
- for b in ds["bucket"]:
34
- if isinstance(b, list):
35
- buckets_flat.update(b)
36
- elif b:
37
- buckets_flat.add(b)
38
- return ["(any)"] + shells, ["(any)"] + langs, ["(any)"] + sorted(buckets_flat)
39
-
40
-
41
  def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
42
- ds = _load()
43
- filtered = ds
 
 
44
 
 
45
  if shell_filter != "(any)":
46
  filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
47
  if lang_filter != "(any)":
@@ -53,69 +48,64 @@ def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
53
 
54
  total = len(filtered)
55
  if total == 0:
56
- return "No rows match the selected filters.", ""
57
 
58
- rng = random.Random(int(seed) if seed else None)
59
- indices = rng.sample(range(total), min(int(n_rows), total))
 
60
  rows = filtered.select(indices)
61
 
62
- md_parts = [f"**{total:,} rows match** — showing {len(indices)}\n"]
63
  for i, row in enumerate(rows):
64
  msgs = row.get("messages") or []
65
  user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
66
  assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
67
  bucket = ", ".join(row.get("bucket") or [])
68
- md_parts.append(
69
- f"---\n**Row {i+1}** · shell=`{row.get('shell')}` · lang=`{row.get('language')}` · bucket=`{bucket}`\n\n"
 
70
  f"**User:** {user_msg}\n\n"
71
  f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
72
  )
73
- return "\n".join(md_parts), f"{total:,}"
74
-
75
-
76
- def build_ui():
77
- ds = _load()
78
- shell_opts, lang_opts, bucket_opts = _get_options(ds)
79
-
80
- with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
81
- gr.Markdown(
82
- "# CLI-1M Dataset Explorer\n"
83
- f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
84
- f"— `sample` config, revision `{_REVISION}` — {len(ds):,} stratified rows\n\n"
85
- "Filter by shell, language, or industry bucket, then sample random rows. "
86
- "The `sample` config is a stratified 50k subset of the full 975k corpus."
87
- )
88
-
89
- with gr.Row():
90
- shell_dd = gr.Dropdown(shell_opts, value="(any)", label="Shell")
91
- lang_dd = gr.Dropdown(lang_opts, value="(any)", label="Language")
92
- bucket_dd = gr.Dropdown(bucket_opts, value="(any)", label="Industry bucket")
93
-
94
- with gr.Row():
95
- n_rows = gr.Slider(1, 20, value=5, step=1, label="Rows to show")
96
- seed = gr.Number(value=42, label="Random seed (blank = random)")
97
-
98
- sample_btn = gr.Button("Sample rows", variant="primary")
99
- match_count = gr.Textbox(label="Rows matching filter", interactive=False)
100
- output = gr.Markdown()
101
-
102
- sample_btn.click(
103
- fn=sample_rows,
104
- inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
105
- outputs=[output, match_count],
106
- )
107
-
108
- gr.Markdown(
109
- "---\n"
110
- "**Links:** "
111
- "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
112
- "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
113
- "[Source repo](https://github.com/wildcard/caro-eval) · "
114
- "Apache-2.0"
115
- )
116
-
117
- return demo
118
 
119
 
120
  if __name__ == "__main__":
121
- build_ui().launch()
 
2
 
3
  Random-row viewer with bucket / shell / language filters.
4
  Deploy to HuggingFace Spaces (CPU Free tier).
 
 
 
 
5
  """
6
 
7
  import random
8
  import gradio as gr
9
  from datasets import load_dataset
10
 
 
 
 
 
11
  _REVISION = "v1.0-rc1"
12
+ _DS = None # lazy-loaded on first query
13
+
14
+ SHELL_OPTS = ["(any)", "bash", "zsh", "fish", "powershell", "nu", "oils-osh"]
15
+ LANG_OPTS = ["(any)", "en", "zh", "de", "es", "fr", "ja", "it", "pt", "ru", "ar", "hi", "ko", "he"]
16
+
17
+ # Known buckets — avoids full dataset scan at startup
18
+ BUCKET_OPTS = ["(any)", "devops", "cloud", "database", "security", "pkg_mgmt",
19
+ "finance_web3", "bio_science", "data_ml", "network", "media",
20
+ "editor_term", "editor_writer", "lang_tool", "mobile_embed",
21
+ "modern_unix", "systems", "web_api", "misc"]
22
 
23
 
24
  def _load():
25
  global _DS
26
  if _DS is None:
27
+ _DS = load_dataset(
28
+ "carosh/cli-1m", name="sample", revision=_REVISION, split="train"
29
+ )
30
  return _DS
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
34
+ try:
35
+ ds = _load()
36
+ except Exception as e:
37
+ return f"Error loading dataset: {e}", ""
38
 
39
+ filtered = ds
40
  if shell_filter != "(any)":
41
  filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
42
  if lang_filter != "(any)":
 
48
 
49
  total = len(filtered)
50
  if total == 0:
51
+ return "No rows match the selected filters.", "0"
52
 
53
+ rng = random.Random(int(seed) if str(seed).strip() else None)
54
+ n = min(int(n_rows), total)
55
+ indices = rng.sample(range(total), n)
56
  rows = filtered.select(indices)
57
 
58
+ parts = [f"**{total:,} rows match** — showing {n}\n"]
59
  for i, row in enumerate(rows):
60
  msgs = row.get("messages") or []
61
  user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
62
  assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
63
  bucket = ", ".join(row.get("bucket") or [])
64
+ parts.append(
65
+ f"---\n**Row {i+1}** · `shell={row.get('shell')}` · "
66
+ f"`lang={row.get('language')}` · `bucket={bucket}`\n\n"
67
  f"**User:** {user_msg}\n\n"
68
  f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
69
  )
70
+ return "\n".join(parts), f"{total:,}"
71
+
72
+
73
+ with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
74
+ gr.Markdown(
75
+ "# CLI-1M Dataset Explorer\n"
76
+ f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
77
+ f" `sample` config (50k stratified rows), revision `{_REVISION}`\n\n"
78
+ "Filter by shell, language, or industry bucket, then click **Sample rows**."
79
+ )
80
+
81
+ with gr.Row():
82
+ shell_dd = gr.Dropdown(SHELL_OPTS, value="(any)", label="Shell")
83
+ lang_dd = gr.Dropdown(LANG_OPTS, value="(any)", label="Language")
84
+ bucket_dd = gr.Dropdown(BUCKET_OPTS, value="(any)", label="Industry bucket")
85
+
86
+ with gr.Row():
87
+ n_rows = gr.Slider(1, 20, value=5, step=1, label="Rows to show")
88
+ seed = gr.Number(value=42, label="Random seed (blank = random)")
89
+
90
+ sample_btn = gr.Button("Sample rows", variant="primary")
91
+ match_count = gr.Textbox(label="Matching rows", interactive=False)
92
+ output = gr.Markdown()
93
+
94
+ sample_btn.click(
95
+ fn=sample_rows,
96
+ inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
97
+ outputs=[output, match_count],
98
+ )
99
+
100
+ gr.Markdown(
101
+ "---\n"
102
+ "**Links:** "
103
+ "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
104
+ "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
105
+ "[Source repo](https://github.com/wildcard/caro-eval) · "
106
+ "Apache-2.0"
107
+ )
 
 
 
 
 
 
 
108
 
109
 
110
  if __name__ == "__main__":
111
+ demo.launch()