File size: 8,053 Bytes
5e9fb2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Copyright 202-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains command to download files from the Hub with the CLI.

Usage:
    hf download --help

    # Download file
    hf download gpt2 config.json

    # Download entire repo
    hf download fffiloni/zeroscope --repo-type=space --revision=refs/pr/78

    # Download repo with filters
    hf download gpt2 --include="*.safetensors"

    # Download with token
    hf download Wauplin/private-model --token=hf_***

    # Download quietly (no progress bar, no warnings, only the returned path)
    hf download gpt2 config.json --quiet

    # Download to local dir
    hf download gpt2 --local-dir=./models/gpt2

    # Download a subfolder
    hf download HuggingFaceM4/FineVision art/ --repo-type=dataset
"""

import warnings
from typing import Annotated

import typer

from huggingface_hub._snapshot_download import snapshot_download
from huggingface_hub.errors import CLIError
from huggingface_hub.file_download import DryRunFileInfo, hf_hub_download
from huggingface_hub.utils import _format_size

from ._cli_utils import FormatWithAutoOpt, RepoIdArg, RepoTypeOpt, RevisionOpt, TokenOpt
from ._output import OutputFormatWithAuto, out


# Example `hf download` invocations, one per main usage mode: whole repo,
# explicit file names, include/exclude glob filters, a local target directory,
# and a dataset subfolder (trailing "/").
# NOTE(review): not referenced by the code visible in this file — presumably
# consumed where the command is registered (e.g. for help output); confirm.
DOWNLOAD_EXAMPLES = [
    "hf download meta-llama/Llama-3.2-1B-Instruct",
    "hf download meta-llama/Llama-3.2-1B-Instruct config.json tokenizer.json",
    'hf download meta-llama/Llama-3.2-1B-Instruct --include "*.safetensors" --exclude "*.bin"',
    "hf download meta-llama/Llama-3.2-1B-Instruct --local-dir ./models/llama",
    "hf download HuggingFaceM4/FineVision art/ --repo-type dataset",
]


def download(
    repo_id: RepoIdArg,
    filenames: Annotated[
        list[str] | None,
        typer.Argument(
            help="Files to download (e.g. `config.json`, `data/metadata.jsonl`).",
        ),
    ] = None,
    repo_type: RepoTypeOpt = RepoTypeOpt.model,
    revision: RevisionOpt = None,
    include: Annotated[
        list[str] | None,
        typer.Option(
            help="Glob patterns to include from files to download. eg: *.json",
        ),
    ] = None,
    exclude: Annotated[
        list[str] | None,
        typer.Option(
            help="Glob patterns to exclude from files to download.",
        ),
    ] = None,
    cache_dir: Annotated[
        str | None,
        typer.Option(
            help="Directory where to save files.",
        ),
    ] = None,
    local_dir: Annotated[
        str | None,
        typer.Option(
            help="If set, the downloaded file will be placed under this directory. Check out https://huggingface.co/docs/huggingface_hub/guides/download#download-files-to-a-local-folder for more details.",
        ),
    ] = None,
    force_download: Annotated[
        bool,
        typer.Option(
            help="If True, the files will be downloaded even if they are already cached.",
        ),
    ] = False,
    dry_run: Annotated[
        bool,
        typer.Option(
            help="If True, perform a dry run without actually downloading the file.",
        ),
    ] = False,
    token: TokenOpt = None,
    max_workers: Annotated[
        int,
        typer.Option(
            help="Maximum number of workers to use for downloading files. Default is 8.",
        ),
    ] = 8,
    format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
) -> None:
    """Download files from the Hub."""
    # The docstring above is deliberately kept to one line: Typer can surface a
    # command's docstring as its help text, so details live in comments instead.
    #
    # Dispatch rules implemented below:
    #   * exactly one plain filename and no subfolder -> `hf_hub_download`
    #   * anything else                               -> `snapshot_download`
    #     - no positional filenames: `--include` / `--exclude` are forwarded
    #     - otherwise: filenames plus "<subfolder>/**" patterns become the
    #       allow-list and `--include` / `--exclude` are not forwarded
    #
    # Raises `CLIError` when a subfolder argument (trailing "/") is combined with
    # `--include` or `--exclude`.
    #
    # NOTE(review): `format` is not read anywhere in this body — presumably it is
    # consumed by the CLI/output machinery outside this function; confirm.

    def run_download() -> str | DryRunFileInfo | list[DryRunFileInfo]:
        """Validate the arguments and run the matching download helper.

        Returns whatever `hf_hub_download` / `snapshot_download` returns; per the
        annotation this is a path string, or dry-run file info object(s).
        """
        filenames_list = filenames if filenames is not None else []

        # Split positional arguments: a trailing "/" marks a subfolder, which is
        # turned into a recursive glob ("art/" -> "art/**"); everything else is a
        # plain filename.
        subfolders = [name for name in filenames_list if name.endswith("/")]
        subfolder_patterns = [f"{name.rstrip('/')}/**" for name in subfolders]
        regular_filenames = [name for name in filenames_list if not name.endswith("/")]

        # Subfolder arguments are themselves include patterns, so combining them
        # with `--include` / `--exclude` is ambiguous: refuse and point the user
        # to the equivalent `--include` form.
        if subfolder_patterns:
            if include:
                raise CLIError(
                    f"Cannot combine subfolder argument ('{subfolders[0]}') with `--include`. "
                    f'Please use `--include "{subfolders[0]}*"` instead.'
                )
            if exclude:
                raise CLIError(
                    f"Cannot combine subfolder argument ('{subfolders[0]}') with `--exclude`. "
                    f'Please use `--include "{subfolders[0]}*"` with `--exclude` instead.'
                )

        # Explicit filenames take precedence over glob filters: tell the user
        # their patterns are being dropped rather than ignoring them silently.
        if regular_filenames:
            if include:
                warnings.warn("Ignoring `--include` since filenames have been explicitly set.")
            if exclude:
                warnings.warn("Ignoring `--exclude` since filenames have been explicitly set.")

        # Single file to download (not a subfolder): use `hf_hub_download`.
        if len(regular_filenames) == 1 and not subfolder_patterns:
            return hf_hub_download(
                repo_id=repo_id,
                repo_type=repo_type.value,
                revision=revision,
                filename=regular_filenames[0],
                cache_dir=cache_dir,
                force_download=force_download,
                token=token,
                local_dir=local_dir,
                library_name="huggingface-cli",
                dry_run=dry_run,
            )

        # Otherwise: use `snapshot_download` to ensure all files come from the
        # same revision.
        allow_patterns: list[str] | None
        ignore_patterns: list[str] | None
        if not regular_filenames and not subfolder_patterns:
            # No positional filenames: rely on the include/exclude patterns.
            allow_patterns = include
            ignore_patterns = exclude
        else:
            # Filenames and subfolder globs together form the allow-list.
            allow_patterns = regular_filenames + subfolder_patterns
            ignore_patterns = None

        return snapshot_download(
            repo_id=repo_id,
            repo_type=repo_type.value,
            revision=revision,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            force_download=force_download,
            cache_dir=cache_dir,
            token=token,
            local_dir=local_dir,
            library_name="huggingface-cli",
            max_workers=max_workers,
            dry_run=dry_run,
        )

    def _print_result(result: str | DryRunFileInfo | list[DryRunFileInfo]) -> None:
        """Report the outcome: the path for a real download, a summary for a dry run."""
        if isinstance(result, str):
            out.result("Downloaded", path=result)
            return

        # Dry run: normalize a single info object to a list so both download
        # helpers are reported the same way.
        if isinstance(result, DryRunFileInfo):
            result = [result]
        will_download = [info for info in result if info.will_download]
        out.text(
            f"[dry-run] Will download {len(will_download)} files"
            f" (out of {len(result)})"
            f" totalling {_format_size(sum(info.file_size for info in will_download))}."
        )
        # One row per file, sorted by name; files that will not be downloaded
        # show "-" instead of a size.
        items = [
            {
                "file": info.filename,
                "size": _format_size(info.file_size) if info.will_download else "-",
            }
            for info in sorted(result, key=lambda x: x.filename)
        ]
        out.table(items)

    _print_result(run_download())