| """Contains commands to interact with datasets on the Hugging Face Hub. |
| |
| Usage: |
| # list datasets on the Hub |
| hf datasets ls |
| |
| # list datasets with a search query |
| hf datasets ls --search "code" |
| |
| # get info about a dataset |
| hf datasets info HuggingFaceFW/fineweb |
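
    # list parquet files for a dataset
    hf datasets parquet cfahlgren1/hub-stats

    # run a SQL query against a dataset's parquet files
    hf datasets sql "SELECT * FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet') LIMIT 5"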
| """ |

import enum
from typing import Annotated, get_args

import typer

from huggingface_hub._dataset_viewer import execute_raw_sql_query
from huggingface_hub.errors import CLIError, RepositoryNotFoundError, RevisionNotFoundError
from huggingface_hub.hf_api import DatasetSort_T, ExpandDatasetProperty_T

from ._cli_utils import (
    AuthorOpt,
    FilterOpt,
    FormatWithAutoOpt,
    LimitOpt,
    RevisionOpt,
    SearchOpt,
    TokenOpt,
    api_object_to_dict,
    get_hf_api,
    make_expand_properties_parser,
    typer_factory,
)
from ._output import OutputFormatWithAuto, out


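# DatasetSort_T and ExpandDatasetProperty_T are Literal aliases; get_args() pulls
# out their allowed string values at runtime. Building a str-valued Enum from the
# sort options gives typer a fixed set of choices for --sort, while `.value`
# stays a plain string to pass through to the API.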
_EXPAND_PROPERTIES = sorted(get_args(ExpandDatasetProperty_T))
_SORT_OPTIONS = get_args(DatasetSort_T)
DatasetSortEnum = enum.Enum("DatasetSortEnum", {s: s for s in _SORT_OPTIONS}, type=str)


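# Shared --expand option used by both `ls` and `info`; the callback validates the
# comma-separated list against the properties accepted by the Hub API.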
ExpandOpt = Annotated[
    str | None,
    typer.Option(
        help=f"Comma-separated properties to return. When used, only the listed properties (and id) are returned. Example: '--expand=downloads,likes,tags'. Valid: {', '.join(_EXPAND_PROPERTIES)}.",
        callback=make_expand_properties_parser(_EXPAND_PROPERTIES),
    ),
]


datasets_cli = typer_factory(help="Interact with datasets on the Hub.")


@datasets_cli.command(
    "list | ls",
    examples=[
        "hf datasets ls",
        "hf datasets ls --sort downloads --limit 10",
        'hf datasets ls --search "code"',
    ],
)
def datasets_ls(
    search: SearchOpt = None,
    author: AuthorOpt = None,
    filter: FilterOpt = None,
    sort: Annotated[
        DatasetSortEnum | None,
        typer.Option(help="Sort results."),
    ] = None,
    limit: LimitOpt = 10,
    expand: ExpandOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """List datasets on the Hub."""
    api = get_hf_api(token=token)
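    # typer hands us the selected enum member; unwrap it to the raw string the
    # API client expects.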
    sort_key = sort.value if sort else None
    results = [
        api_object_to_dict(dataset_info)
        for dataset_info in api.list_datasets(
            filter=filter,
            author=author,
            search=search,
            sort=sort_key,
            limit=limit,
            expand=expand,
        )
    ]
    out.table(results)


@datasets_cli.command(
    "info",
    examples=[
        "hf datasets info HuggingFaceFW/fineweb",
        "hf datasets info my-dataset --expand downloads,likes,tags",
    ],
)
def datasets_info(
    dataset_id: Annotated[str, typer.Argument(help="The dataset ID (e.g. `username/repo-name`).")],
    revision: RevisionOpt = None,
    expand: ExpandOpt = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Get info about a dataset on the Hub."""
    api = get_hf_api(token=token)
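    # Note: the Hub also raises RepositoryNotFoundError for private/gated repos
    # the token cannot access, so the "not found" message covers that case too.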
    try:
        info = api.dataset_info(repo_id=dataset_id, revision=revision, expand=expand)
    except RepositoryNotFoundError as e:
        raise CLIError(f"Dataset '{dataset_id}' not found.") from e
    except RevisionNotFoundError as e:
        raise CLIError(f"Revision '{revision}' not found on '{dataset_id}'.") from e
    out.dict(info)


@datasets_cli.command(
    "parquet",
    examples=[
        "hf datasets parquet cfahlgren1/hub-stats",
        "hf datasets parquet cfahlgren1/hub-stats --subset models",
        "hf datasets parquet cfahlgren1/hub-stats --split train",
        "hf datasets parquet cfahlgren1/hub-stats --format json",
    ],
)
def datasets_parquet(
    dataset_id: Annotated[str, typer.Argument(help="The dataset ID (e.g. `username/repo-name`).")],
    subset: Annotated[str | None, typer.Option("--subset", help="Filter parquet entries by subset/config.")] = None,
    split: Annotated[str | None, typer.Option(help="Filter parquet entries by split.")] = None,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """List parquet file URLs available for a dataset."""
    api = get_hf_api(token=token)
    entries = api.list_dataset_parquet_files(repo_id=dataset_id, config=subset)
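    # The subset/config filter is applied server-side by the API; the split
    # filter is applied client-side below.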
    filtered = [entry for entry in entries if split is None or entry.split == split]
    results = [
        {"subset": entry.config, "split": entry.split, "url": entry.url, "size": entry.size} for entry in filtered
    ]
    out.table(results, headers=["subset", "split", "url", "size"], id_key="url")


@datasets_cli.command(
    "sql",
    examples=[
        "hf datasets sql \"SELECT COUNT(*) AS rows FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet')\"",
        "hf datasets sql \"SELECT * FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet') LIMIT 5\" --format json",
    ],
)
def datasets_sql(
    sql: Annotated[str, typer.Argument(help="Raw SQL query to execute.")],
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
    token: TokenOpt = None,
) -> None:
    """Execute a raw SQL query with DuckDB against dataset parquet URLs."""
    try:
        result = execute_raw_sql_query(sql_query=sql, token=token)
    except ImportError as e:
        raise CLIError(str(e)) from e
    out.table(result)