Spaces:

carosh
/

cli-1m-explorer

Running

App Files Community

kobi-kadosh committed 6 days ago

Commit

d28d1c7

verified ·

1 Parent(s): d4d9815

fix: move demo to top-level, lazy-load dataset on button click

Browse files

Files changed (1) hide show

app.py +65 -75

app.py CHANGED Viewed

@@ -2,46 +2,41 @@
 Random-row viewer with bucket / shell / language filters.
 Deploy to HuggingFace Spaces (CPU Free tier).
-Usage on HF Spaces:
-    This file + requirements.txt in the space repo is all you need.
-    Set HF_TOKEN in Space secrets if the dataset requires auth.
 """
 import random
 import gradio as gr
 from datasets import load_dataset
-# Load the `sample` config (50k stratified rows) — downloads ~4.6MB vs 95MB
-# for the full default/train. Fast cold-start on CPU free tier.
-# Users who need the full 975k can filter via load_dataset locally.
-_DS = None
 _REVISION = "v1.0-rc1"
 def _load():
     global _DS
     if _DS is None:
-        _DS = load_dataset("carosh/cli-1m", name="sample", revision=_REVISION, split="train")
     return _DS
-def _get_options(ds):
-    shells = sorted(set(ds["shell"]))
-    langs = sorted(set(ds["language"]))
-    buckets_flat = set()
-    for b in ds["bucket"]:
-        if isinstance(b, list):
-            buckets_flat.update(b)
-        elif b:
-            buckets_flat.add(b)
-    return ["(any)"] + shells, ["(any)"] + langs, ["(any)"] + sorted(buckets_flat)
 def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
-    ds = _load()
-    filtered = ds
     if shell_filter != "(any)":
         filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
     if lang_filter != "(any)":
@@ -53,69 +48,64 @@ def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
     total = len(filtered)
     if total == 0:
-        return "No rows match the selected filters.", ""
-    rng = random.Random(int(seed) if seed else None)
-    indices = rng.sample(range(total), min(int(n_rows), total))
     rows = filtered.select(indices)
-    md_parts = [f"**{total:,} rows match** — showing {len(indices)}\n"]
     for i, row in enumerate(rows):
         msgs = row.get("messages") or []
         user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
         assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
         bucket = ", ".join(row.get("bucket") or [])
-        md_parts.append(
-            f"---\n**Row {i+1}** · shell=`{row.get('shell')}` · lang=`{row.get('language')}` · bucket=`{bucket}`\n\n"
             f"**User:** {user_msg}\n\n"
             f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
         )
-    return "\n".join(md_parts), f"{total:,}"
-def build_ui():
-    ds = _load()
-    shell_opts, lang_opts, bucket_opts = _get_options(ds)
-    with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
-        gr.Markdown(
-            "# CLI-1M Dataset Explorer\n"
-            f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
-            f"— `sample` config, revision `{_REVISION}` — {len(ds):,} stratified rows\n\n"
-            "Filter by shell, language, or industry bucket, then sample random rows. "
-            "The `sample` config is a stratified 50k subset of the full 975k corpus."
-        )
-        with gr.Row():
-            shell_dd = gr.Dropdown(shell_opts, value="(any)", label="Shell")
-            lang_dd = gr.Dropdown(lang_opts, value="(any)", label="Language")
-            bucket_dd = gr.Dropdown(bucket_opts, value="(any)", label="Industry bucket")
-        with gr.Row():
-            n_rows = gr.Slider(1, 20, value=5, step=1, label="Rows to show")
-            seed = gr.Number(value=42, label="Random seed (blank = random)")
-        sample_btn = gr.Button("Sample rows", variant="primary")
-        match_count = gr.Textbox(label="Rows matching filter", interactive=False)
-        output = gr.Markdown()
-        sample_btn.click(
-            fn=sample_rows,
-            inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
-            outputs=[output, match_count],
-        )
-        gr.Markdown(
-            "---\n"
-            "**Links:** "
-            "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
-            "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
-            "[Source repo](https://github.com/wildcard/caro-eval) · "
-            "Apache-2.0"
-        )
-    return demo
 if __name__ == "__main__":
-    build_ui().launch()

 Random-row viewer with bucket / shell / language filters.
 Deploy to HuggingFace Spaces (CPU Free tier).
 """
 import random
 import gradio as gr
 from datasets import load_dataset
 _REVISION = "v1.0-rc1"
+_DS = None  # lazy-loaded on first query
+SHELL_OPTS = ["(any)", "bash", "zsh", "fish", "powershell", "nu", "oils-osh"]
+LANG_OPTS = ["(any)", "en", "zh", "de", "es", "fr", "ja", "it", "pt", "ru", "ar", "hi", "ko", "he"]
+# Known buckets — avoids full dataset scan at startup
+BUCKET_OPTS = ["(any)", "devops", "cloud", "database", "security", "pkg_mgmt",
+               "finance_web3", "bio_science", "data_ml", "network", "media",
+               "editor_term", "editor_writer", "lang_tool", "mobile_embed",
+               "modern_unix", "systems", "web_api", "misc"]
 def _load():
     global _DS
     if _DS is None:
+        _DS = load_dataset(
+            "carosh/cli-1m", name="sample", revision=_REVISION, split="train"
+        )
     return _DS
 def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
+    try:
+        ds = _load()
+    except Exception as e:
+        return f"Error loading dataset: {e}", ""
+    filtered = ds
     if shell_filter != "(any)":
         filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
     if lang_filter != "(any)":
     total = len(filtered)
     if total == 0:
+        return "No rows match the selected filters.", "0"
+    rng = random.Random(int(seed) if str(seed).strip() else None)
+    n = min(int(n_rows), total)
+    indices = rng.sample(range(total), n)
     rows = filtered.select(indices)
+    parts = [f"**{total:,} rows match** — showing {n}\n"]
     for i, row in enumerate(rows):
         msgs = row.get("messages") or []
         user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
         assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
         bucket = ", ".join(row.get("bucket") or [])
+        parts.append(
+            f"---\n**Row {i+1}** · `shell={row.get('shell')}` · "
+            f"`lang={row.get('language')}` · `bucket={bucket}`\n\n"
             f"**User:** {user_msg}\n\n"
             f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
         )
+    return "\n".join(parts), f"{total:,}"
+with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        "# CLI-1M Dataset Explorer\n"
+        f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
+        f"— `sample` config (50k stratified rows), revision `{_REVISION}`\n\n"
+        "Filter by shell, language, or industry bucket, then click **Sample rows**."
+    )
+    with gr.Row():
+        shell_dd = gr.Dropdown(SHELL_OPTS, value="(any)", label="Shell")
+        lang_dd = gr.Dropdown(LANG_OPTS, value="(any)", label="Language")
+        bucket_dd = gr.Dropdown(BUCKET_OPTS, value="(any)", label="Industry bucket")
+    with gr.Row():
+        n_rows = gr.Slider(1, 20, value=5, step=1, label="Rows to show")
+        seed = gr.Number(value=42, label="Random seed (blank = random)")
+    sample_btn = gr.Button("Sample rows", variant="primary")
+    match_count = gr.Textbox(label="Matching rows", interactive=False)
+    output = gr.Markdown()
+    sample_btn.click(
+        fn=sample_rows,
+        inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
+        outputs=[output, match_count],
+    )
+    gr.Markdown(
+        "---\n"
+        "**Links:** "
+        "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
+        "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
+        "[Source repo](https://github.com/wildcard/caro-eval) · "
+        "Apache-2.0"
+    )
 if __name__ == "__main__":
+    demo.launch()