File size: 4,030 Bytes
e994cfe
 
 
 
 
 
 
 
 
 
 
d28d1c7
 
 
 
 
 
 
 
 
 
e994cfe
 
 
 
 
d28d1c7
 
 
e994cfe
 
 
 
d28d1c7
 
 
 
e994cfe
d28d1c7
e994cfe
 
 
 
 
 
 
 
 
 
 
d28d1c7
e994cfe
d28d1c7
 
 
e994cfe
 
d28d1c7
e994cfe
 
 
 
 
d28d1c7
 
 
e994cfe
 
 
d28d1c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e994cfe
 
 
d28d1c7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""CLI-1M Dataset Explorer — carosh/cli-1m

Random-row viewer with bucket / shell / language filters.
Deploy to HuggingFace Spaces (CPU Free tier).
"""

import random
import gradio as gr
from datasets import load_dataset

# Dataset revision handed to load_dataset() and shown in the page header.
_REVISION = "v1.0-rc1"
# Module-level dataset cache; stays None until _load() populates it.
_DS = None

# Dropdown choices. "(any)" is the sentinel that disables the filter.
SHELL_OPTS = [
    "(any)",
    "bash", "zsh", "fish",
    "powershell", "nu", "oils-osh",
]
LANG_OPTS = [
    "(any)",
    "en", "zh", "de", "es", "fr", "ja", "it",
    "pt", "ru", "ar", "hi", "ko", "he",
]

# Bucket names are hard-coded so startup does not need a full dataset scan.
BUCKET_OPTS = [
    "(any)",
    "devops", "cloud", "database", "security", "pkg_mgmt",
    "finance_web3", "bio_science", "data_ml", "network", "media",
    "editor_term", "editor_writer", "lang_tool", "mobile_embed",
    "modern_unix", "systems", "web_api", "misc",
]


def _load():
    """Return the `sample` config of carosh/cli-1m, loading it on first use.

    The dataset is stored in the module-level ``_DS`` after the first
    successful load, so later calls return the cached object without
    calling ``load_dataset`` again.
    """
    global _DS
    if _DS is not None:
        return _DS

    _DS = load_dataset(
        "carosh/cli-1m",
        name="sample",
        revision=_REVISION,
        split="train",
    )
    return _DS


def _parse_seed(seed):
    """Convert the UI seed value to an int, or None when the field is blank.

    ``None`` and empty/whitespace-only strings both mean "no seed".

    Raises:
        TypeError, ValueError, OverflowError: if the value is not blank and
            ``int()`` cannot convert it (e.g. non-numeric text, NaN, inf).
    """
    if seed is None:
        return None
    if isinstance(seed, str):
        seed = seed.strip()
        if not seed:
            return None
    return int(seed)


def _bucket_labels(value):
    """Return a row's ``bucket`` field as a list of labels.

    Lists/tuples are returned as a list, a bare string becomes a one-element
    list, and anything else (including None) yields an empty list.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
    """Sample random rows that match the filters and render them as Markdown.

    Args:
        shell_filter: Value compared with each row's ``shell`` field, or
            "(any)" to skip this filter.
        lang_filter: Value compared with each row's ``language`` field, or
            "(any)" to skip this filter.
        bucket_filter: Label that must appear in each row's ``bucket`` field,
            or "(any)" to skip this filter.
        n_rows: Maximum number of rows to show (converted with ``int``).
        seed: Seed for the sampler. ``None`` or a blank string selects a
            non-deterministic sample.

    Returns:
        A ``(markdown, count)`` tuple of strings: the rendered rows (or an
        error / "no match" message) and the number of matching rows
        ("" when the dataset could not be loaded).
    """
    try:
        ds = _load()
    except Exception as e:  # UI boundary: report the failure in the page
        return f"Error loading dataset: {e}", ""

    want_shell = shell_filter != "(any)"
    want_lang = lang_filter != "(any)"
    want_bucket = bucket_filter != "(any)"

    def _matches(r):
        # All active filters are checked in one pass over the dataset.
        if want_shell and r["shell"] != shell_filter:
            return False
        if want_lang and r["language"] != lang_filter:
            return False
        if want_bucket and bucket_filter not in _bucket_labels(r["bucket"]):
            return False
        return True

    if want_shell or want_lang or want_bucket:
        filtered = ds.filter(_matches)
    else:
        filtered = ds

    total = len(filtered)
    if total == 0:
        return "No rows match the selected filters.", "0"

    # A cleared seed field arrives as None; str(None) is "None", so it must
    # be recognised explicitly instead of being passed to int().
    try:
        rng = random.Random(_parse_seed(seed))
    except (TypeError, ValueError, OverflowError):
        return f"Invalid random seed: {seed!r}", f"{total:,}"

    n = min(int(n_rows), total)
    indices = rng.sample(range(total), n)
    rows = filtered.select(indices)

    parts = [f"**{total:,} rows match** — showing {n}\n"]
    for i, row in enumerate(rows, start=1):
        msgs = row.get("messages") or []
        # First message of each role; missing or null content renders as "".
        user_msg = next(
            (m.get("content") or "" for m in msgs if m.get("role") == "user"), ""
        )
        assistant_msg = next(
            (m.get("content") or "" for m in msgs if m.get("role") == "assistant"), ""
        )
        bucket = ", ".join(_bucket_labels(row.get("bucket")))
        # Fall back to "bash" for the fence tag when shell is missing or null.
        fence_lang = row.get("shell") or "bash"
        parts.append(
            f"---\n**Row {i}** · `shell={row.get('shell')}` · "
            f"`lang={row.get('language')}` · `bucket={bucket}`\n\n"
            f"**User:** {user_msg}\n\n"
            f"```{fence_lang}\n{assistant_msg}\n```\n"
        )
    return "\n".join(parts), f"{total:,}"


# Page layout: header, filter controls, sampling controls, results, footer.
with gr.Blocks(theme=gr.themes.Soft(), title="CLI-1M Explorer") as demo:
    # Header with a link to the dataset and the pinned revision.
    gr.Markdown(
        value=(
            "# CLI-1M Dataset Explorer\n"
            f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
            f"— `sample` config (50k stratified rows), revision `{_REVISION}`\n\n"
            "Filter by shell, language, or industry bucket, then click **Sample rows**."
        )
    )

    # Filter dropdowns; each defaults to the "(any)" sentinel.
    with gr.Row():
        shell_dd = gr.Dropdown(choices=SHELL_OPTS, label="Shell", value="(any)")
        lang_dd = gr.Dropdown(choices=LANG_OPTS, label="Language", value="(any)")
        bucket_dd = gr.Dropdown(
            choices=BUCKET_OPTS, label="Industry bucket", value="(any)"
        )

    # Sample size and seed.
    with gr.Row():
        n_rows = gr.Slider(
            minimum=1, maximum=20, step=1, value=5, label="Rows to show"
        )
        seed = gr.Number(label="Random seed (blank = random)", value=42)

    sample_btn = gr.Button(value="Sample rows", variant="primary")
    match_count = gr.Textbox(interactive=False, label="Matching rows")
    output = gr.Markdown()

    # sample_rows returns (markdown, count), matching the outputs order below.
    sample_btn.click(
        sample_rows,
        inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
        outputs=[output, match_count],
    )

    # Footer links.
    gr.Markdown(
        value=(
            "---\n"
            "**Links:** "
            "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
            "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
            "[Source repo](https://github.com/wildcard/caro-eval) · "
            "Apache-2.0"
        )
    )


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()