kobi-kadosh committed on
Commit
e994cfe
·
verified ·
1 Parent(s): f5258f8

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI-1M Dataset Explorer — carosh/cli-1m
2
+
3
+ Random-row viewer with bucket / shell / language filters.
4
+ Deploy to HuggingFace Spaces (CPU Free tier).
5
+
6
+ Usage on HF Spaces:
7
+ This file + requirements.txt in the space repo is all you need.
8
+ Set HF_TOKEN in Space secrets if the dataset requires auth.
9
+ """
10
+
11
+ import random
12
+ import gradio as gr
13
+ from datasets import load_dataset
14
+
15
+ # Load from the published dataset — pinned to the revision named in _REVISION below.
16
+ # On first load this downloads ~95MB of Parquet; subsequent requests use cache.
17
# Module-level cache for the loaded split; stays None until _load() first runs.
_DS = None
# Revision of carosh/cli-1m passed to load_dataset (also shown in the UI header).
_REVISION = "v1.0-rc1"


def _load():
    """Return the ``train`` split of ``carosh/cli-1m``, loading it at most once.

    The first call invokes ``load_dataset`` pinned to ``_REVISION`` and stores
    the result in the module-level ``_DS``; later calls return that same object.
    """
    global _DS
    # Fast path: something has already been loaded into the cache.
    if _DS is not None:
        return _DS
    _DS = load_dataset("carosh/cli-1m", revision=_REVISION, split="train")
    return _DS
26
+
27
+
28
def _get_options(ds):
    """Collect the dropdown choices for the shell, language and bucket filters.

    Args:
        ds: Any mapping-like object where ``ds["shell"]``, ``ds["language"]``
            and ``ds["bucket"]`` yield one value per row. A bucket value may be
            a list of labels or a single label.

    Returns:
        A 3-tuple ``(shell_options, language_options, bucket_options)``. Each
        is a list starting with the ``"(any)"`` wildcard followed by the
        distinct values in sorted order.
    """
    # None cannot be ordered against str, so missing values are dropped before
    # sorting instead of letting sorted() raise TypeError.
    shells = sorted({value for value in ds["shell"] if value is not None})
    langs = sorted({value for value in ds["language"] if value is not None})

    buckets = set()
    for entry in ds["bucket"]:
        if isinstance(entry, list):
            # Multi-label row: take every real label, skipping None placeholders.
            buckets.update(label for label in entry if label is not None)
        elif entry:
            # Single-label row stored as a bare value.
            buckets.add(entry)

    wildcard = ["(any)"]
    return wildcard + shells, wildcard + langs, wildcard + sorted(buckets)
38
+
39
+
40
def _as_bucket_list(value):
    """Normalize a row's ``bucket`` field to a list of non-None labels."""
    if isinstance(value, list):
        return [label for label in value if label is not None]
    return [value] if value else []


def _first_message(messages, role):
    """Return the content of the first message with ``role``, or ``""`` if none."""
    for message in messages:
        if message.get("role") == role:
            return message.get("content") or ""
    return ""


def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
    """Sample random rows matching the selected filters and render them as Markdown.

    Args:
        shell_filter: Shell to keep, or ``"(any)"`` to disable this filter.
        lang_filter: Language to keep, or ``"(any)"`` to disable this filter.
        bucket_filter: Bucket label a row must carry, or ``"(any)"``.
        n_rows: Maximum number of rows to show (converted with ``int``).
        seed: Seed for the sampler. ``None`` or ``""`` means "unseeded"; every
            other value, including ``0``, gives a reproducible sample.

    Returns:
        ``(markdown, match_count)``: the rendered rows and the number of
        matching rows formatted with thousands separators. When nothing
        matches, a fixed message and an empty count string are returned.
    """
    wildcard = "(any)"
    by_shell = shell_filter != wildcard
    by_lang = lang_filter != wildcard
    by_bucket = bucket_filter != wildcard

    def _matches(shell, language, bucket):
        # Receives only the three filter columns (see input_columns below).
        if by_shell and shell != shell_filter:
            return False
        if by_lang and language != lang_filter:
            return False
        # Buckets may be a list or a bare label; both forms must be matchable.
        return not by_bucket or bucket_filter in _as_bucket_list(bucket)

    filtered = _load()
    if by_shell or by_lang or by_bucket:
        # One pass for all active filters, reading just the columns the
        # predicate needs instead of decoding whole rows up to three times.
        filtered = filtered.filter(
            _matches, input_columns=["shell", "language", "bucket"]
        )

    total = len(filtered)
    if total == 0:
        return "No rows match the selected filters.", ""

    # Only a genuinely blank seed means "random"; 0 is a legitimate seed.
    blank_seed = seed is None or seed == ""
    rng = random.Random(None if blank_seed else int(seed))
    count = max(0, min(int(n_rows), total))
    indices = rng.sample(range(total), count)
    rows = filtered.select(indices)

    md_parts = [f"**{total:,} rows match** — showing {len(indices)}\n"]
    for number, row in enumerate(rows, start=1):
        msgs = row.get("messages") or []
        user_msg = _first_message(msgs, "user")
        assistant_msg = _first_message(msgs, "assistant")
        bucket = ", ".join(str(label) for label in _as_bucket_list(row.get("bucket")))
        # Fall back to bash highlighting when the row has no shell value.
        fence_lang = row.get("shell") or "bash"
        md_parts.append(
            f"---\n**Row {number}** · shell=`{row.get('shell')}` · lang=`{row.get('language')}` · bucket=`{bucket}`\n\n"
            f"**User:** {user_msg}\n\n"
            f"```{fence_lang}\n{assistant_msg}\n```\n"
        )
    return "\n".join(md_parts), f"{total:,}"
73
+
74
+
75
def build_ui():
    """Assemble the Gradio Blocks app for browsing the dataset.

    Loads the dataset via ``_load()``, derives the dropdown choices with
    ``_get_options`` and wires the "Sample rows" button to ``sample_rows``.

    Returns:
        The constructed ``gr.Blocks`` object; the caller launches it.
    """
    dataset = _load()
    shell_choices, lang_choices, bucket_choices = _get_options(dataset)

    # Static Markdown is prepared up front so the layout code below stays short.
    header_md = (
        "# CLI-1M Dataset Explorer\n"
        "Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
        f"— revision `{_REVISION}` — {len(dataset):,} rows\n\n"
        "Filter by shell, language, or industry bucket, then sample random rows."
    )
    footer_md = (
        "---\n"
        "**Links:** "
        "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
        "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
        "[Source repo](https://github.com/wildcard/caro-eval) · "
        "Apache-2.0"
    )

    with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
        gr.Markdown(header_md)

        # Row 1: the three filters.
        with gr.Row():
            shell_dd = gr.Dropdown(choices=shell_choices, value="(any)", label="Shell")
            lang_dd = gr.Dropdown(choices=lang_choices, value="(any)", label="Language")
            bucket_dd = gr.Dropdown(
                choices=bucket_choices, value="(any)", label="Industry bucket"
            )

        # Row 2: sample size and seed.
        with gr.Row():
            n_rows = gr.Slider(
                minimum=1, maximum=20, value=5, step=1, label="Rows to show"
            )
            seed = gr.Number(value=42, label="Random seed (blank = random)")

        sample_btn = gr.Button("Sample rows", variant="primary")
        match_count = gr.Textbox(label="Rows matching filter", interactive=False)
        output = gr.Markdown()

        # sample_rows returns (markdown, count), matching the outputs order.
        sample_btn.click(
            fn=sample_rows,
            inputs=[shell_dd, lang_dd, bucket_dd, n_rows, seed],
            outputs=[output, match_count],
        )

        gr.Markdown(footer_md)

    return demo
116
+
117
+
118
if __name__ == "__main__":
    # Script entry point: build the Blocks app, then start it with default settings.
    app = build_ui()
    app.launch()