Spaces:

carosh
/

cli-1m-explorer

Running

App Files Files Community

kobi-kadosh committed 6 days ago

Commit

e994cfe

verified ·

1 Parent(s): f5258f8

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +119 -0

app.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""CLI-1M Dataset Explorer — carosh/cli-1m
+Random-row viewer with bucket / shell / language filters.
+Deploy to HuggingFace Spaces (CPU Free tier).
+Usage on HF Spaces:
+    This file + requirements.txt in the space repo is all you need.
+    Set HF_TOKEN in Space secrets if the dataset requires auth.
+"""
+import random
+import gradio as gr
+from datasets import load_dataset
+# Load from the published dataset — uses the default (HEAD) revision.
+# On first load this downloads ~95MB of Parquet; subsequent requests use cache.
+_DS = None
+_REVISION = "v1.0-rc1"
+def _load():
+    global _DS
+    if _DS is None:
+        _DS = load_dataset("carosh/cli-1m", revision=_REVISION, split="train")
+    return _DS
+def _get_options(ds):
+    shells = sorted(set(ds["shell"]))
+    langs = sorted(set(ds["language"]))
+    buckets_flat = set()
+    for b in ds["bucket"]:
+        if isinstance(b, list):
+            buckets_flat.update(b)
+        elif b:
+            buckets_flat.add(b)
+    return ["(any)"] + shells, ["(any)"] + langs, ["(any)"] + sorted(buckets_flat)
+def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
+    ds = _load()
+    filtered = ds
+    if shell_filter != "(any)":
+        filtered = filtered.filter(lambda r: r["shell"] == shell_filter)
+    if lang_filter != "(any)":
+        filtered = filtered.filter(lambda r: r["language"] == lang_filter)
+    if bucket_filter != "(any)":
+        filtered = filtered.filter(
+            lambda r: bucket_filter in (r["bucket"] if isinstance(r["bucket"], list) else [])
+        )
+    total = len(filtered)
+    if total == 0:
+        return "No rows match the selected filters.", ""
+    rng = random.Random(int(seed) if seed else None)
+    indices = rng.sample(range(total), min(int(n_rows), total))
+    rows = filtered.select(indices)
+    md_parts = [f"**{total:,} rows match** — showing {len(indices)}\n"]
+    for i, row in enumerate(rows):
+        msgs = row.get("messages") or []
+        user_msg = next((m["content"] for m in msgs if m.get("role") == "user"), "")
+        assistant_msg = next((m["content"] for m in msgs if m.get("role") == "assistant"), "")
+        bucket = ", ".join(row.get("bucket") or [])
+        md_parts.append(
+            f"---\n**Row {i+1}** · shell=`{row.get('shell')}` · lang=`{row.get('language')}` · bucket=`{bucket}`\n\n"
+            f"**User:** {user_msg}\n\n"
+            f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
+        )
+    return "\n".join(md_parts), f"{total:,}"
+def build_ui():
+    ds = _load()
+    shell_opts, lang_opts, bucket_opts = _get_options(ds)
+    with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
+        gr.Markdown(
+            "# CLI-1M Dataset Explorer\n"
+            f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
+            f"— revision `{_REVISION}` — {len(ds):,} rows\n\n"
+            "Filter by shell, language, or industry bucket, then sample random rows."
+        )
+        with gr.Row():
+            shell_dd = gr.Dropdown(shell_opts, value="(any)", label="Shell")
+            lang_dd = gr.Dropdown(lang_opts, value="(any)", label="Language")
+            bucket_dd = gr.Dropdown(bucket_opts, value="(any)", label="Industry bucket")
+        with gr.Row():
+            n_rows = gr.Slider(1, 20, value=5, step=1, label="Rows to show")
+            seed = gr.Number(value=42, label="Random seed (blank = random)")
+        sample_btn = gr.Button("Sample rows", variant="primary")
+        match_count = gr.Textbox(label="Rows matching filter", interactive=False)
+        output = gr.Markdown()
+        sample_btn.click(
+            fn=sample_rows,
+            inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
+            outputs=[output, match_count],
+        )
+        gr.Markdown(
+            "---\n"
+            "**Links:** "
+            "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
+            "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
+            "[Source repo](https://github.com/wildcard/caro-eval) · "
+            "Apache-2.0"
+        )
+    return demo
+if __name__ == "__main__":
+    build_ui().launch()