Spaces:

carosh
/

cli-1m-explorer

Running

kobi-kadosh committed 6 days ago

Commit

d4d9815

verified ·

1 Parent(s): d7b9e50

fix: load sample config (50k) instead of full 975k for fast cold-start

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,8 +12,9 @@ import random
 import gradio as gr
 from datasets import load_dataset
-# Load from the published dataset — uses the default (HEAD) revision.
-# On first load this downloads ~95MB of Parquet; subsequent requests use cache.
 _DS = None
 _REVISION = "v1.0-rc1"
@@ -21,7 +22,7 @@ _REVISION = "v1.0-rc1"
 def _load():
     global _DS
     if _DS is None:
-        _DS = load_dataset("carosh/cli-1m", revision=_REVISION, split="train")
     return _DS
@@ -80,8 +81,9 @@ def build_ui():
         gr.Markdown(
             "# CLI-1M Dataset Explorer\n"
             f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
-            f"— revision `{_REVISION}` — {len(ds):,} rows\n\n"
-            "Filter by shell, language, or industry bucket, then sample random rows."
         )
         with gr.Row():

 import gradio as gr
 from datasets import load_dataset
+# Load the `sample` config (50k stratified rows) — downloads ~4.6MB vs 95MB
+# for the full default/train. Fast cold-start on CPU free tier.
+# Users who need the full 975k can filter via load_dataset locally.
 _DS = None
 _REVISION = "v1.0-rc1"
 def _load():
     global _DS
     if _DS is None:
+        _DS = load_dataset("carosh/cli-1m", name="sample", revision=_REVISION, split="train")
     return _DS
         gr.Markdown(
             "# CLI-1M Dataset Explorer\n"
             f"Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
+            f"— `sample` config, revision `{_REVISION}` — {len(ds):,} stratified rows\n\n"
+            "Filter by shell, language, or industry bucket, then sample random rows. "
+            "The `sample` config is a stratified 50k subset of the full 975k corpus."
         )
         with gr.Row():