Spaces:
Running
Running
fix: load sample config (50k) instead of full 975k for fast cold-start
Browse files
app.py
CHANGED
|
@@ -12,8 +12,9 @@ import random
|
|
| 12 |
import gradio as gr
|
| 13 |
from datasets import load_dataset
|
| 14 |
|
| 15 |
-
# Load
|
| 16 |
-
#
|
|
|
|
| 17 |
_DS = None
|
| 18 |
_REVISION = "v1.0-rc1"
|
| 19 |
|
|
@@ -21,7 +22,7 @@ _REVISION = "v1.0-rc1"
|
|
| 21 |
def _load():
    """Return the dataset's `train` split, loading it on first use.

    The loaded dataset is kept in the module-level ``_DS`` so that later
    calls return the same object instead of calling ``load_dataset`` again.
    No config name is passed, so the dataset's default config is used.
    """
    global _DS
    if _DS is not None:
        return _DS
    _DS = load_dataset(
        "carosh/cli-1m",
        revision=_REVISION,
        split="train",
    )
    return _DS
|
| 26 |
|
| 27 |
|
|
@@ -80,8 +81,9 @@ def build_ui():
|
|
| 80 |
gr.Markdown(
|
| 81 |
"# CLI-1M Dataset Explorer\n"
|
| 82 |
f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
|
| 83 |
-
f"— revision `{_REVISION}` — {len(ds):,} rows\n\n"
|
| 84 |
-
"Filter by shell, language, or industry bucket, then sample random rows."
|
|
|
|
| 85 |
)
|
| 86 |
|
| 87 |
with gr.Row():
|
|
|
|
| 12 |
import gradio as gr
|
| 13 |
from datasets import load_dataset
|
| 14 |
|
| 15 |
+
# Load the `sample` config (50k stratified rows) — downloads ~4.6MB vs 95MB
|
| 16 |
+
# for the full default/train. Fast cold-start on CPU free tier.
|
| 17 |
+
# Users who need the full 975k rows can load the default config with load_dataset locally.
|
| 18 |
_DS = None
|
| 19 |
_REVISION = "v1.0-rc1"
|
| 20 |
|
|
|
|
| 22 |
def _load():
    """Return the `sample` config's `train` split, loading it on first use.

    The loaded dataset is kept in the module-level ``_DS`` so that later
    calls return the same object instead of calling ``load_dataset`` again.
    """
    global _DS
    if _DS is not None:
        return _DS
    _DS = load_dataset(
        "carosh/cli-1m",
        name="sample",
        revision=_REVISION,
        split="train",
    )
    return _DS
|
| 27 |
|
| 28 |
|
|
|
|
| 81 |
gr.Markdown(
|
| 82 |
"# CLI-1M Dataset Explorer\n"
|
| 83 |
f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
|
| 84 |
+
f"— `sample` config, revision `{_REVISION}` — {len(ds):,} stratified rows\n\n"
|
| 85 |
+
"Filter by shell, language, or industry bucket, then sample random rows. "
|
| 86 |
+
"The `sample` config is a stratified 50k subset of the full 975k corpus."
|
| 87 |
)
|
| 88 |
|
| 89 |
with gr.Row():
|