Spaces:
Running
Running
fix: move demo to top-level, lazy-load dataset on button click
Browse files
app.py
CHANGED
|
@@ -2,46 +2,41 @@
|
|
| 2 |
|
| 3 |
Random-row viewer with bucket / shell / language filters.
|
| 4 |
Deploy to HuggingFace Spaces (CPU Free tier).
|
| 5 |
-
|
| 6 |
-
Usage on HF Spaces:
|
| 7 |
-
This file + requirements.txt in the space repo is all you need.
|
| 8 |
-
Set HF_TOKEN in Space secrets if the dataset requires auth.
|
| 9 |
"""
|
| 10 |
|
| 11 |
import random
|
| 12 |
import gradio as gr
|
| 13 |
from datasets import load_dataset
|
| 14 |
|
| 15 |
-
# Load the `sample` config (50k stratified rows) — downloads ~4.6MB vs 95MB
|
| 16 |
-
# for the full default/train. Fast cold-start on CPU free tier.
|
| 17 |
-
# Users who need the full 975k can filter via load_dataset locally.
|
| 18 |
-
_DS = None
|
| 19 |
_REVISION = "v1.0-rc1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
def _load():
|
| 23 |
global _DS
|
| 24 |
if _DS is None:
|
| 25 |
-
_DS = load_dataset(
|
|
|
|
|
|
|
| 26 |
return _DS
|
| 27 |
|
| 28 |
|
| 29 |
-
def _get_options(ds):
|
| 30 |
-
shells = sorted(set(ds["shell"]))
|
| 31 |
-
langs = sorted(set(ds["language"]))
|
| 32 |
-
buckets_flat = set()
|
| 33 |
-
for b in ds["bucket"]:
|
| 34 |
-
if isinstance(b, list):
|
| 35 |
-
buckets_flat.update(b)
|
| 36 |
-
elif b:
|
| 37 |
-
buckets_flat.add(b)
|
| 38 |
-
return ["(any)"] + shells, ["(any)"] + langs, ["(any)"] + sorted(buckets_flat)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
|
|
|
| 45 |
if shell_filter != "(any)":
|
| 46 |
filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
|
| 47 |
if lang_filter != "(any)":
|
|
@@ -53,69 +48,64 @@ def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
|
|
| 53 |
|
| 54 |
total = len(filtered)
|
| 55 |
if total == 0:
|
| 56 |
-
return "No rows match the selected filters.", ""
|
| 57 |
|
| 58 |
-
rng = random.Random(int(seed) if seed else None)
|
| 59 |
-
|
|
|
|
| 60 |
rows = filtered.select(indices)
|
| 61 |
|
| 62 |
-
|
| 63 |
for i, row in enumerate(rows):
|
| 64 |
msgs = row.get("messages") or []
|
| 65 |
user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
|
| 66 |
assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
|
| 67 |
bucket = ", ".join(row.get("bucket") or [])
|
| 68 |
-
|
| 69 |
-
f"---\n**Row {i+1}** · shell=
|
|
|
|
| 70 |
f"**User:** {user_msg}\n\n"
|
| 71 |
f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
|
| 72 |
)
|
| 73 |
-
return "\n".join(
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
output
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
)
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
"[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
|
| 112 |
-
"[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
|
| 113 |
-
"[Source repo](https://github.com/wildcard/caro-eval) · "
|
| 114 |
-
"Apache-2.0"
|
| 115 |
-
)
|
| 116 |
-
|
| 117 |
-
return demo
|
| 118 |
|
| 119 |
|
| 120 |
if __name__ == "__main__":
|
| 121 |
-
|
|
|
|
| 2 |
|
| 3 |
Random-row viewer with bucket / shell / language filters.
|
| 4 |
Deploy to HuggingFace Spaces (CPU Free tier).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import random
|
| 8 |
import gradio as gr
|
| 9 |
from datasets import load_dataset
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
_REVISION = "v1.0-rc1"
|
| 12 |
+
_DS = None # lazy-loaded on first query
|
| 13 |
+
|
| 14 |
+
SHELL_OPTS = ["(any)", "bash", "zsh", "fish", "powershell", "nu", "oils-osh"]
|
| 15 |
+
LANG_OPTS = ["(any)", "en", "zh", "de", "es", "fr", "ja", "it", "pt", "ru", "ar", "hi", "ko", "he"]
|
| 16 |
+
|
| 17 |
+
# Known buckets — avoids full dataset scan at startup
|
| 18 |
+
BUCKET_OPTS = ["(any)", "devops", "cloud", "database", "security", "pkg_mgmt",
|
| 19 |
+
"finance_web3", "bio_science", "data_ml", "network", "media",
|
| 20 |
+
"editor_term", "editor_writer", "lang_tool", "mobile_embed",
|
| 21 |
+
"modern_unix", "systems", "web_api", "misc"]
|
| 22 |
|
| 23 |
|
| 24 |
def _load():
|
| 25 |
global _DS
|
| 26 |
if _DS is None:
|
| 27 |
+
_DS = load_dataset(
|
| 28 |
+
"carosh/cli-1m", name="sample", revision=_REVISION, split="train"
|
| 29 |
+
)
|
| 30 |
return _DS
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
|
| 34 |
+
try:
|
| 35 |
+
ds = _load()
|
| 36 |
+
except Exception as e:
|
| 37 |
+
return f"Error loading dataset: {e}", ""
|
| 38 |
|
| 39 |
+
filtered = ds
|
| 40 |
if shell_filter != "(any)":
|
| 41 |
filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
|
| 42 |
if lang_filter != "(any)":
|
|
|
|
| 48 |
|
| 49 |
total = len(filtered)
|
| 50 |
if total == 0:
|
| 51 |
+
return "No rows match the selected filters.", "0"
|
| 52 |
|
| 53 |
+
rng = random.Random(int(seed) if str(seed).strip() else None)
|
| 54 |
+
n = min(int(n_rows), total)
|
| 55 |
+
indices = rng.sample(range(total), n)
|
| 56 |
rows = filtered.select(indices)
|
| 57 |
|
| 58 |
+
parts = [f"**{total:,} rows match** — showing {n}\n"]
|
| 59 |
for i, row in enumerate(rows):
|
| 60 |
msgs = row.get("messages") or []
|
| 61 |
user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
|
| 62 |
assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
|
| 63 |
bucket = ", ".join(row.get("bucket") or [])
|
| 64 |
+
parts.append(
|
| 65 |
+
f"---\n**Row {i+1}** · `shell={row.get('shell')}` · "
|
| 66 |
+
f"`lang={row.get('language')}` · `bucket={bucket}`\n\n"
|
| 67 |
f"**User:** {user_msg}\n\n"
|
| 68 |
f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
|
| 69 |
)
|
| 70 |
+
return "\n".join(parts), f"{total:,}"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
|
| 74 |
+
gr.Markdown(
|
| 75 |
+
"# CLI-1M Dataset Explorer\n"
|
| 76 |
+
f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
|
| 77 |
+
f"— `sample` config (50k stratified rows), revision `{_REVISION}`\n\n"
|
| 78 |
+
"Filter by shell, language, or industry bucket, then click **Sample rows**."
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
with gr.Row():
|
| 82 |
+
shell_dd = gr.Dropdown(SHELL_OPTS, value="(any)", label="Shell")
|
| 83 |
+
lang_dd = gr.Dropdown(LANG_OPTS, value="(any)", label="Language")
|
| 84 |
+
bucket_dd = gr.Dropdown(BUCKET_OPTS, value="(any)", label="Industry bucket")
|
| 85 |
+
|
| 86 |
+
with gr.Row():
|
| 87 |
+
n_rows = gr.Slider(1, 20, value=5, step=1, label="Rows to show")
|
| 88 |
+
seed = gr.Number(value=42, label="Random seed (blank = random)")
|
| 89 |
+
|
| 90 |
+
sample_btn = gr.Button("Sample rows", variant="primary")
|
| 91 |
+
match_count = gr.Textbox(label="Matching rows", interactive=False)
|
| 92 |
+
output = gr.Markdown()
|
| 93 |
+
|
| 94 |
+
sample_btn.click(
|
| 95 |
+
fn=sample_rows,
|
| 96 |
+
inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
|
| 97 |
+
outputs=[output, match_count],
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
gr.Markdown(
|
| 101 |
+
"---\n"
|
| 102 |
+
"**Links:** "
|
| 103 |
+
"[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
|
| 104 |
+
"[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
|
| 105 |
+
"[Source repo](https://github.com/wildcard/caro-eval) · "
|
| 106 |
+
"Apache-2.0"
|
| 107 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
if __name__ == "__main__":
|
| 111 |
+
demo.launch()
|