# Copyright 2026-present, the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Shared logic for bucket operations. This module contains the core buckets logic used by both the CLI and the Python API. """ import fnmatch import json import mimetypes import os import stat import sys import time from collections.abc import Iterator from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Literal from . import constants, logging from .errors import BucketNotFoundError from .utils import XetFileData, disable_progress_bars, enable_progress_bars, parse_datetime from .utils._terminal import StatusLine if TYPE_CHECKING: from .hf_api import HfApi logger = logging.get_logger(__name__) BUCKET_PREFIX = "hf://buckets/" _SYNC_TIME_WINDOW_MS = 1000 # 1s safety-window for file modification time comparisons # ============================================================================= # Bucket data structures # ============================================================================= def _split_bucket_id_and_prefix(path: str) -> tuple[str, str]: """Split 'namespace/name(/optional/prefix)' into ('namespace/name', 'prefix'). Returns (bucket_id, prefix) where prefix may be empty string. Raises ValueError if path doesn't contain at least namespace/name. """ parts = path.split("/", 2) if len(parts) < 2 or not parts[0] or not parts[1]: raise ValueError(f"Invalid bucket path: '{path}'. Expected format: namespace/bucket_name") bucket_id = f"{parts[0]}/{parts[1]}" prefix = parts[2] if len(parts) > 2 else "" return bucket_id, prefix @dataclass class BucketInfo: """ Contains information about a bucket on the Hub. This object is returned by [`bucket_info`] and [`list_buckets`]. Attributes: id (`str`): ID of the bucket. private (`bool`): Is the bucket private. created_at (`datetime`): Date of creation of the bucket on the Hub. size (`int`): Size of the bucket in bytes. total_files (`int`): Total number of files in the bucket. """ id: str private: bool created_at: datetime size: int total_files: int def __init__(self, **kwargs): self.id = kwargs.pop("id") self.private = kwargs.pop("private") self.created_at = parse_datetime(kwargs.pop("createdAt")) self.size = kwargs.pop("size") self.total_files = kwargs.pop("totalFiles") self.__dict__.update(**kwargs) @dataclass class _BucketAddFile: source: str | Path | bytes destination: str xet_hash: str | None = field(default=None) size: int | None = field(default=None) mtime: int = field(init=False) content_type: str | None = field(init=False) def __post_init__(self) -> None: self.content_type = None if isinstance(self.source, (str, Path)): # guess content type from source path self.content_type = mimetypes.guess_type(self.source)[0] if self.content_type is None: # or default to destination path content type self.content_type = mimetypes.guess_type(self.destination)[0] self.mtime = int( os.path.getmtime(self.source) * 1000 if not isinstance(self.source, bytes) else time.time() * 1000 ) @dataclass class _BucketCopyFile: destination: str xet_hash: str source_repo_type: str # "model", "dataset", "space", "bucket" source_repo_id: str size: int | None = field(default=None) mtime: int = field(init=False) content_type: str | None = field(init=False) def __post_init__(self) -> None: self.content_type = mimetypes.guess_type(self.destination)[0] self.mtime = int(time.time() * 1000) @dataclass class _BucketDeleteFile: path: str @dataclass(frozen=True) class BucketFileMetadata: """Data structure containing information about a file in a bucket. Returned by [`get_bucket_file_metadata`]. Args: size (`int`): Size of the file in bytes. xet_file_data (`XetFileData`): Xet information for the file (hash and refresh route). """ size: int xet_file_data: XetFileData @dataclass class BucketUrl: """Describes a bucket URL on the Hub. `BucketUrl` is returned by [`create_bucket`]. At initialization, the URL is parsed to populate properties: - endpoint (`str`) - namespace (`str`) - bucket_id (`str`) - url (`str`) - handle (`str`) Args: url (`str`): String value of the bucket url. endpoint (`str`, *optional*): Endpoint of the Hub. Defaults to . """ url: str endpoint: str = "" namespace: str = field(init=False) bucket_id: str = field(init=False) handle: str = field(init=False) def __post_init__(self) -> None: self.endpoint = self.endpoint or constants.ENDPOINT # Parse URL: expected format is `{endpoint}/buckets/{namespace}/{bucket_name}` url_path = self.url.replace(self.endpoint, "").strip("/") # Remove leading "buckets/" prefix if url_path.startswith("buckets/"): url_path = url_path[len("buckets/") :] bucket_id, prefix = _split_bucket_id_and_prefix(url_path) if prefix: raise ValueError(f"Unable to parse bucket URL: {self.url}") self.namespace = bucket_id.split("/")[0] self.bucket_id = bucket_id self.handle = f"hf://buckets/{self.bucket_id}" @dataclass class BucketFile: """ Contains information about a file in a bucket on the Hub. This object is returned by [`list_bucket_tree`]. Similar to [`RepoFile`] but for files in buckets. """ type: Literal["file"] path: str size: int xet_hash: str mtime: datetime | None uploaded_at: datetime | None def __init__(self, **kwargs): self.type = kwargs.pop("type") self.path = kwargs.pop("path") self.size = kwargs.pop("size") self.xet_hash = kwargs.pop("xetHash") mtime = kwargs.pop("mtime", None) self.mtime = parse_datetime(mtime) if mtime else None uploaded_at = kwargs.pop("uploadedAt", None) self.uploaded_at = parse_datetime(uploaded_at) if uploaded_at else None @dataclass class BucketFolder: """ Contains information about a directory in a bucket on the Hub. This object is returned by [`list_bucket_tree`]. Similar to [`RepoFolder`] but for directories in buckets. """ type: Literal["directory"] path: str uploaded_at: datetime | None def __init__(self, **kwargs): self.type = kwargs.pop("type") self.path = kwargs.pop("path") uploaded_at = kwargs.pop("uploadedAt", None) or kwargs.pop("uploaded_at", None) self.uploaded_at = ( (uploaded_at if isinstance(uploaded_at, datetime) else parse_datetime(uploaded_at)) if uploaded_at else None ) # ============================================================================= # Bucket path parsing # ============================================================================= def _parse_bucket_path(path: str) -> tuple[str, str]: """Parse a bucket path like hf://buckets/namespace/bucket_name/prefix into (bucket_id, prefix). Returns: tuple: (bucket_id, prefix) where bucket_id is "namespace/bucket_name" and prefix may be empty string. """ if not path.startswith(BUCKET_PREFIX): raise ValueError(f"Invalid bucket path: {path}. Must start with {BUCKET_PREFIX}") return _split_bucket_id_and_prefix(path.removeprefix(BUCKET_PREFIX)) def _is_bucket_path(path: str) -> bool: """Check if a path is a bucket path.""" return path.startswith(BUCKET_PREFIX) # ============================================================================= # Sync data structures # ============================================================================= @dataclass class SyncOperation: """Represents a sync operation to be performed.""" action: Literal["upload", "download", "delete", "skip"] path: str size: int | None = None reason: str = "" local_mtime: str | None = None remote_mtime: str | None = None bucket_file: BucketFile | None = None # BucketFile when available (not serialized to plan file) @dataclass class SyncPlan: """Represents a complete sync plan.""" source: str dest: str timestamp: str operations: list[SyncOperation] = field(default_factory=list) def summary(self) -> dict[str, int | str]: uploads = sum(1 for op in self.operations if op.action == "upload") downloads = sum(1 for op in self.operations if op.action == "download") deletes = sum(1 for op in self.operations if op.action == "delete") skips = sum(1 for op in self.operations if op.action == "skip") total_size = sum(op.size or 0 for op in self.operations if op.action in ("upload", "download")) return { "uploads": uploads, "downloads": downloads, "deletes": deletes, "skips": skips, "total_size": total_size, } # ============================================================================= # Filter matching # ============================================================================= class FilterMatcher: """Matches file paths against include/exclude patterns.""" def __init__( self, include_patterns: list[str] | None = None, exclude_patterns: list[str] | None = None, filter_rules: list[tuple[str, str]] | None = None, ): """Initialize the filter matcher. Args: include_patterns: Patterns to include (from --include) exclude_patterns: Patterns to exclude (from --exclude) filter_rules: Rules from filter file as list of ("+"/"-", pattern) tuples """ self.include_patterns = include_patterns or [] self.exclude_patterns = exclude_patterns or [] self.filter_rules = filter_rules or [] def matches(self, path: str) -> bool: """Check if a path should be included based on the filter rules. Filtering rules: - Filters are evaluated in order, first matching rule decides - If no rules match, include by default (unless include patterns are specified) """ # First check filter rules from file (in order) for sign, pattern in self.filter_rules: if fnmatch.fnmatch(path, pattern): return sign == "+" # Then check CLI patterns for pattern in self.exclude_patterns: if fnmatch.fnmatch(path, pattern): return False for pattern in self.include_patterns: if fnmatch.fnmatch(path, pattern): return True # If include patterns were specified but none matched, exclude if self.include_patterns: return False # Default: include return True def _parse_filter_file(filter_file: str) -> list[tuple[str, str]]: """Parse a filter file and return a list of (sign, pattern) tuples. Filter file format: - Lines starting with "+" are include patterns - Lines starting with "-" are exclude patterns - Empty lines and lines starting with "#" are ignored """ rules = [] with open(filter_file) as f: for line in f: line = line.strip() if not line or line.startswith("#"): continue if line.startswith("+"): rules.append(("+", line[1:].strip())) elif line.startswith("-"): rules.append(("-", line[1:].strip())) else: # Default to include if no prefix rules.append(("+", line)) return rules # ============================================================================= # File listing # ============================================================================= def _stat_local(path: str) -> tuple[int, float] | None: """Stat a local file and return (size, mtime_ms). Returns None if the path is missing or is a directory. Uses a single ``os.stat`` call so callers don't pay for multiple syscalls per file. """ try: st = os.stat(path) except OSError: return None if stat.S_ISDIR(st.st_mode): return None return st.st_size, st.st_mtime * 1000 def _list_local_files(local_path: str) -> Iterator[tuple[str, int, float]]: """List all files in a local directory. Yields: tuple: (relative_path, size, mtime_ms) for each file """ local_path = os.path.abspath(local_path) if not os.path.isdir(local_path): raise ValueError(f"Local path must be a directory: {local_path}") for root, _, files in os.walk(local_path): for filename in files: full_path = os.path.join(root, filename) stat_info = _stat_local(full_path) if stat_info is None: continue rel_path = os.path.relpath(full_path, local_path) # Normalize to forward slashes for consistency rel_path = rel_path.replace(os.sep, "/") yield rel_path, stat_info[0], stat_info[1] def _list_remote_files(api: "HfApi", bucket_id: str, prefix: str) -> Iterator[tuple[str, int, float, Any]]: """List all files in a bucket with a given prefix. Yields: tuple: (relative_path, size, mtime_ms, bucket_file) for each file. bucket_file is the BucketFile object from list_bucket_tree. """ for item in api.list_bucket_tree(bucket_id, prefix=prefix or None, recursive=True): if isinstance(item, BucketFolder): continue path = item.path # Remove prefix from path to get relative path # Only strip prefix if it's followed by "/" (directory boundary) or is exact match if prefix: if path.startswith(prefix + "/"): rel_path = path[len(prefix) + 1 :] elif path == prefix: # Exact match: the file IS the prefix (e.g., single file download) rel_path = path.rsplit("/", 1)[-1] if "/" in path else path else: # Path doesn't match prefix pattern (e.g., "submarine.txt" for prefix "sub") # Skip this file - it was returned by the API but doesn't belong to this prefix continue else: rel_path = path mtime_ms = item.mtime.timestamp() * 1000 if item.mtime else 0 yield rel_path, item.size, mtime_ms, item # ============================================================================= # Sync plan computation # ============================================================================= def _mtime_to_iso(mtime_ms: float) -> str: """Convert mtime in milliseconds to ISO format string.""" return datetime.fromtimestamp(mtime_ms / 1000, tz=timezone.utc).isoformat() def _compare_files_for_sync( *, path: str, action: Literal["upload", "download"], source_size: int, source_mtime: float, dest_size: int, dest_mtime: float, source_newer_label: str, dest_newer_label: str, ignore_sizes: bool, ignore_times: bool, ignore_existing: bool, bucket_file: Any | None = None, ) -> SyncOperation: """Compare source and dest files and return the appropriate sync operation. This is a unified helper for both upload and download directions. Args: path: Relative file path action: "upload" or "download" source_size: Size of the source file (bytes) source_mtime: Mtime of the source file (milliseconds) dest_size: Size of the destination file (bytes) dest_mtime: Mtime of the destination file (milliseconds) source_newer_label: Label when source is newer (e.g., "local newer" or "remote newer") dest_newer_label: Label when dest is newer (e.g., "remote newer" or "local newer") ignore_sizes: Only compare mtime ignore_times: Only compare size ignore_existing: Skip files that exist on receiver bucket_file: BucketFile object (for downloads only) Returns: SyncOperation describing the action to take """ local_mtime_iso = _mtime_to_iso(source_mtime if action == "upload" else dest_mtime) remote_mtime_iso = _mtime_to_iso(dest_mtime if action == "upload" else source_mtime) base_kwargs: dict[str, Any] = { "path": path, "size": source_size, "local_mtime": local_mtime_iso, "remote_mtime": remote_mtime_iso, } if ignore_existing: return SyncOperation(action="skip", reason="exists on receiver (--ignore-existing)", **base_kwargs) size_differs = source_size != dest_size source_newer = (source_mtime - dest_mtime) > _SYNC_TIME_WINDOW_MS if ignore_sizes: if source_newer: return SyncOperation(action=action, reason=source_newer_label, bucket_file=bucket_file, **base_kwargs) else: dest_newer = (dest_mtime - source_mtime) > _SYNC_TIME_WINDOW_MS skip_reason = dest_newer_label if dest_newer else "same mtime" return SyncOperation(action="skip", reason=skip_reason, **base_kwargs) elif ignore_times: if size_differs: return SyncOperation(action=action, reason="size differs", bucket_file=bucket_file, **base_kwargs) else: return SyncOperation(action="skip", reason="same size", **base_kwargs) else: if size_differs or source_newer: reason = "size differs" if size_differs else source_newer_label return SyncOperation(action=action, reason=reason, bucket_file=bucket_file, **base_kwargs) else: return SyncOperation(action="skip", reason="identical", **base_kwargs) def _compute_sync_plan( source: str, dest: str, api: "HfApi", delete: bool = False, ignore_times: bool = False, ignore_sizes: bool = False, existing: bool = False, ignore_existing: bool = False, filter_matcher: FilterMatcher | None = None, status: Any | None = None, ) -> SyncPlan: """Compute the sync plan by comparing source and destination. Returns: SyncPlan with all operations to be performed """ filter_matcher = filter_matcher or FilterMatcher() is_upload = not _is_bucket_path(source) and _is_bucket_path(dest) is_download = _is_bucket_path(source) and not _is_bucket_path(dest) if not is_upload and not is_download: raise ValueError("One of source or dest must be a bucket path (hf://buckets/...) and the other must be local.") plan = SyncPlan( source=source, dest=dest, timestamp=datetime.now(timezone.utc).isoformat(), ) remote_total: int | None = None if is_upload: # Local -> Remote local_path = os.path.abspath(source) bucket_id, prefix = _parse_bucket_path(dest) if not os.path.isdir(local_path): raise ValueError(f"Source must be a directory: {local_path}") # Get local and remote file lists local_files = {} for rel_path, size, mtime_ms in _list_local_files(local_path): if filter_matcher.matches(rel_path): local_files[rel_path] = (size, mtime_ms) if status: status.update(f"Scanning local directory ({len(local_files)} files)") if status: status.done(f"Scanning local directory ({len(local_files)} files)") remote_files = {} if status: try: remote_total = api.bucket_info(bucket_id).total_files except Exception: pass try: for rel_path, size, mtime_ms, _ in _list_remote_files(api, bucket_id, prefix): if filter_matcher.matches(rel_path): remote_files[rel_path] = (size, mtime_ms) if status: total_str = f"/{remote_total}" if remote_total is not None else "" status.update(f"Scanning remote bucket ({len(remote_files)}{total_str} files)") except BucketNotFoundError: # Bucket doesn't exist yet - this is expected for new uploads logger.debug(f"Bucket '{bucket_id}' not found, treating as empty.") if status: status.done(f"Scanning remote bucket ({len(remote_files)} files)") # Compare files all_paths = set(local_files.keys()) | set(remote_files.keys()) if status: status.done(f"Comparing files ({len(all_paths)} paths)") for path in sorted(all_paths): local_info = local_files.get(path) remote_info = remote_files.get(path) if local_info and not remote_info: # New file if existing: # --existing: skip new files plan.operations.append( SyncOperation( action="skip", path=path, size=local_info[0], reason="new file (--existing)", local_mtime=_mtime_to_iso(local_info[1]), ) ) else: plan.operations.append( SyncOperation( action="upload", path=path, size=local_info[0], reason="new file", local_mtime=_mtime_to_iso(local_info[1]), ) ) elif local_info and remote_info: # File exists in both - use helper to determine action local_size, local_mtime = local_info remote_size, remote_mtime = remote_info plan.operations.append( _compare_files_for_sync( path=path, action="upload", source_size=local_size, source_mtime=local_mtime, dest_size=remote_size, dest_mtime=remote_mtime, source_newer_label="local newer", dest_newer_label="remote newer", ignore_sizes=ignore_sizes, ignore_times=ignore_times, ignore_existing=ignore_existing, ) ) elif not local_info and remote_info and delete: # File only in remote and --delete mode plan.operations.append( SyncOperation( action="delete", path=path, size=remote_info[0], reason="not in source (--delete)", remote_mtime=_mtime_to_iso(remote_info[1]), ) ) else: # Remote -> Local (download) bucket_id, prefix = _parse_bucket_path(source) local_path = os.path.abspath(dest) # Get remote and local file lists remote_files = {} bucket_file_map: dict[str, Any] = {} if status: try: remote_total = api.bucket_info(bucket_id).total_files except Exception: pass for rel_path, size, mtime_ms, bucket_file in _list_remote_files(api, bucket_id, prefix): if filter_matcher.matches(rel_path): remote_files[rel_path] = (size, mtime_ms) bucket_file_map[rel_path] = bucket_file if status: total_str = f"/{remote_total}" if remote_total is not None else "" status.update(f"Scanning remote bucket ({len(remote_files)}{total_str} files)") if status: status.done(f"Scanning remote bucket ({len(remote_files)} files)") local_files = {} if os.path.isdir(local_path): if delete: # Full walk needed to discover local-only files for deletion. for rel_path, size, mtime_ms in _list_local_files(local_path): if filter_matcher.matches(rel_path): local_files[rel_path] = (size, mtime_ms) if status: status.update(f"Scanning local directory ({len(local_files)} files)") else: # Without --delete, the plan only depends on paths that exist # remotely. Stat just those instead of walking the whole tree, # which can take minutes when dest sits in a large directory # like ~/.cache/huggingface/. for rel_path in remote_files: local_file = os.path.join(local_path, rel_path) stat_info = _stat_local(local_file) if stat_info is None: continue local_files[rel_path] = stat_info if status: status.update(f"Scanning local directory ({len(local_files)} files)") if status: status.done(f"Scanning local directory ({len(local_files)} files)") # Compare files all_paths = set(remote_files.keys()) | set(local_files.keys()) if status: status.done(f"Comparing files ({len(all_paths)} paths)") for path in sorted(all_paths): remote_info = remote_files.get(path) local_info = local_files.get(path) if remote_info and not local_info: # New file if existing: # --existing: skip new files plan.operations.append( SyncOperation( action="skip", path=path, size=remote_info[0], reason="new file (--existing)", remote_mtime=_mtime_to_iso(remote_info[1]), ) ) else: plan.operations.append( SyncOperation( action="download", path=path, size=remote_info[0], reason="new file", remote_mtime=_mtime_to_iso(remote_info[1]), bucket_file=bucket_file_map.get(path), ) ) elif remote_info and local_info: # File exists in both - use helper to determine action remote_size, remote_mtime = remote_info local_size, local_mtime = local_info plan.operations.append( _compare_files_for_sync( path=path, action="download", source_size=remote_size, source_mtime=remote_mtime, dest_size=local_size, dest_mtime=local_mtime, source_newer_label="remote newer", dest_newer_label="local newer", ignore_sizes=ignore_sizes, ignore_times=ignore_times, ignore_existing=ignore_existing, bucket_file=bucket_file_map.get(path), ) ) elif not remote_info and local_info and delete: # File only in local and --delete mode plan.operations.append( SyncOperation( action="delete", path=path, size=local_info[0], reason="not in source (--delete)", local_mtime=_mtime_to_iso(local_info[1]), ) ) return plan # ============================================================================= # Plan serialization # ============================================================================= def _write_plan(plan: SyncPlan, f) -> None: """Write a sync plan as JSONL to a file-like object.""" # Write header header = { "type": "header", "source": plan.source, "dest": plan.dest, "timestamp": plan.timestamp, "summary": plan.summary(), } f.write(json.dumps(header) + "\n") # Write operations for op in plan.operations: op_dict: dict[str, Any] = { "type": "operation", "action": op.action, "path": op.path, "reason": op.reason, } if op.size is not None: op_dict["size"] = op.size if op.local_mtime is not None: op_dict["local_mtime"] = op.local_mtime if op.remote_mtime is not None: op_dict["remote_mtime"] = op.remote_mtime f.write(json.dumps(op_dict) + "\n") def _save_plan(plan: SyncPlan, plan_file: str) -> None: """Save a sync plan to a JSONL file.""" with open(plan_file, "w") as f: _write_plan(plan, f) def _load_plan(plan_file: str) -> SyncPlan: """Load a sync plan from a JSONL file.""" with open(plan_file) as f: lines = f.readlines() if not lines: raise ValueError(f"Empty plan file: {plan_file}") # Parse header header = json.loads(lines[0]) if header.get("type") != "header": raise ValueError("Invalid plan file: expected header as first line") plan = SyncPlan( source=header["source"], dest=header["dest"], timestamp=header["timestamp"], ) # Parse operations for line in lines[1:]: op_dict = json.loads(line) if op_dict.get("type") != "operation": continue plan.operations.append( SyncOperation( action=op_dict["action"], path=op_dict["path"], size=op_dict.get("size"), reason=op_dict.get("reason", ""), local_mtime=op_dict.get("local_mtime"), remote_mtime=op_dict.get("remote_mtime"), ) ) return plan # ============================================================================= # Plan execution # ============================================================================= def _execute_plan(plan: SyncPlan, api: "HfApi", verbose: bool = False, status: Any | None = None) -> None: """Execute a sync plan.""" is_upload = not _is_bucket_path(plan.source) and _is_bucket_path(plan.dest) is_download = _is_bucket_path(plan.source) and not _is_bucket_path(plan.dest) if is_upload: local_path = os.path.abspath(plan.source) bucket_id, prefix = _parse_bucket_path(plan.dest) prefix = prefix.rstrip("/") # Avoid double slashes in remote paths # Collect operations add_files: list[tuple[str | Path | bytes, str]] = [] delete_paths: list[str] = [] for op in plan.operations: match op.action: case "upload": local_file = os.path.join(local_path, op.path) remote_path = f"{prefix}/{op.path}" if prefix else op.path if verbose: print(f" Uploading: {op.path} ({op.reason})") add_files.append((local_file, remote_path)) case "delete": remote_path = f"{prefix}/{op.path}" if prefix else op.path if verbose: print(f" Deleting: {op.path} ({op.reason})") delete_paths.append(remote_path) case "skip" if verbose: print(f" Skipping: {op.path} ({op.reason})") # Execute batch operations if add_files or delete_paths: if status: parts = [] if add_files: parts.append(f"uploading {len(add_files)} files") if delete_paths: parts.append(f"deleting {len(delete_paths)} files") status.done(", ".join(parts).capitalize()) api.batch_bucket_files( bucket_id, add=add_files or None, delete=delete_paths or None, ) elif is_download: bucket_id, prefix = _parse_bucket_path(plan.source) prefix = prefix.rstrip("/") # Avoid double slashes in remote paths local_path = os.path.abspath(plan.dest) # Ensure local directory exists os.makedirs(local_path, exist_ok=True) # Collect download operations download_files: list[tuple[str | BucketFile, str | Path]] = [] delete_files: list[str] = [] for op in plan.operations: if op.action == "download": local_file = os.path.join(local_path, op.path) # Ensure parent directory exists os.makedirs(os.path.dirname(local_file), exist_ok=True) if verbose: print(f" Downloading: {op.path} ({op.reason})") # Use BucketFile when available (avoids extra metadata fetch per file) if op.bucket_file is not None: download_files.append((op.bucket_file, local_file)) else: remote_path = f"{prefix}/{op.path}" if prefix else op.path download_files.append((remote_path, local_file)) elif op.action == "delete": local_file = os.path.join(local_path, op.path) if verbose: print(f" Deleting: {op.path} ({op.reason})") delete_files.append(local_file) elif op.action == "skip" and verbose: print(f" Skipping: {op.path} ({op.reason})") # Execute downloads if len(download_files) > 0: if status: status.done(f"Downloading {len(download_files)} files") api.download_bucket_files(bucket_id, download_files) # Execute deletes if status and delete_files: status.done(f"Deleting {len(delete_files)} local files") for file_path in delete_files: if os.path.exists(file_path): os.remove(file_path) # Remove empty parent directories parent = os.path.dirname(file_path) while parent != local_path: try: os.rmdir(parent) parent = os.path.dirname(parent) except OSError: break def _print_plan_summary(plan: SyncPlan) -> None: """Print a summary of the sync plan.""" summary = plan.summary() print(f"Sync plan: {plan.source} -> {plan.dest}") print(f" Uploads: {summary['uploads']}") print(f" Downloads: {summary['downloads']}") print(f" Deletes: {summary['deletes']}") print(f" Skips: {summary['skips']}") # ============================================================================= # Public sync function (Python API) # ============================================================================= def sync_bucket_internal( source: str | None = None, dest: str | None = None, *, api: "HfApi", delete: bool = False, ignore_times: bool = False, ignore_sizes: bool = False, existing: bool = False, ignore_existing: bool = False, include: list[str] | None = None, exclude: list[str] | None = None, filter_from: str | None = None, plan: str | None = None, apply: str | None = None, dry_run: bool = False, verbose: bool = False, quiet: bool = False, token: bool | str | None = None, ) -> SyncPlan: """Sync files between a local directory and a bucket. This is equivalent to the ``hf buckets sync`` CLI command. One of ``source`` or ``dest`` must be a bucket path (``hf://buckets/...``) and the other must be a local directory path. Args: source (`str`, *optional*): Source path: local directory or ``hf://buckets/namespace/bucket_name(/prefix)``. Required unless using ``apply``. dest (`str`, *optional*): Destination path: local directory or ``hf://buckets/namespace/bucket_name(/prefix)``. Required unless using ``apply``. api ([`HfApi`]): The HfApi instance to use for API calls. delete (`bool`, *optional*, defaults to `False`): Delete destination files not present in source. ignore_times (`bool`, *optional*, defaults to `False`): Skip files only based on size, ignoring modification times. ignore_sizes (`bool`, *optional*, defaults to `False`): Skip files only based on modification times, ignoring sizes. existing (`bool`, *optional*, defaults to `False`): Skip creating new files on receiver (only update existing files). ignore_existing (`bool`, *optional*, defaults to `False`): Skip updating files that exist on receiver (only create new files). include (`list[str]`, *optional*): Include files matching patterns (fnmatch-style). exclude (`list[str]`, *optional*): Exclude files matching patterns (fnmatch-style). filter_from (`str`, *optional*): Path to a filter file with include/exclude rules. plan (`str`, *optional*): Save sync plan to this JSONL file instead of executing. apply (`str`, *optional*): Apply a previously saved plan file. When set, ``source`` and ``dest`` are not needed. dry_run (`bool`, *optional*, defaults to `False`): Print sync plan to stdout as JSONL without executing. verbose (`bool`, *optional*, defaults to `False`): Show detailed per-file operations. quiet (`bool`, *optional*, defaults to `False`): Suppress all output and progress bars. token (Union[bool, str, None], optional): A valid user access token. If not provided, the locally saved token will be used. Returns: [`SyncPlan`]: The computed (or loaded) sync plan. Raises: `ValueError`: If arguments are invalid (e.g., both paths are remote, conflicting options). Example: ```python >>> from huggingface_hub import HfApi >>> api = HfApi() # Upload local directory to bucket >>> api.sync_bucket("./data", "hf://buckets/username/my-bucket") # Download bucket to local directory >>> api.sync_bucket("hf://buckets/username/my-bucket", "./data") # Sync with delete and filtering >>> api.sync_bucket( ... "./data", ... "hf://buckets/username/my-bucket", ... delete=True, ... include=["*.safetensors"], ... ) # Dry run: preview what would be synced >>> plan = api.sync_bucket("./data", "hf://buckets/username/my-bucket", dry_run=True) >>> plan.summary() {'uploads': 3, 'downloads': 0, 'deletes': 0, 'skips': 1, 'total_size': 4096} # Save plan for review, then apply >>> api.sync_bucket("./data", "hf://buckets/username/my-bucket", plan="sync-plan.jsonl") >>> api.sync_bucket(apply="sync-plan.jsonl") ``` """ # Build API with token if needed if token is not None: from .hf_api import HfApi api = HfApi(token=token) # --- Apply mode --- if apply: if source or dest: raise ValueError("Cannot specify source/dest when using apply.") if plan is not None: raise ValueError("Cannot specify both plan and apply.") if delete: raise ValueError("Cannot specify delete when using apply.") if ignore_times: raise ValueError("Cannot specify ignore_times when using apply.") if ignore_sizes: raise ValueError("Cannot specify ignore_sizes when using apply.") if include: raise ValueError("Cannot specify include when using apply.") if exclude: raise ValueError("Cannot specify exclude when using apply.") if filter_from: raise ValueError("Cannot specify filter_from when using apply.") if existing: raise ValueError("Cannot specify existing when using apply.") if ignore_existing: raise ValueError("Cannot specify ignore_existing when using apply.") if dry_run: raise ValueError("Cannot specify dry_run when using apply.") sync_plan = _load_plan(apply) status = StatusLine(enabled=not quiet) if not quiet: _print_plan_summary(sync_plan) print("Executing plan...") if quiet: disable_progress_bars() try: _execute_plan(sync_plan, api, verbose=verbose, status=status) finally: if quiet: enable_progress_bars() if not quiet: print("Sync completed.") return sync_plan # --- Normal mode --- if not source or not dest: raise ValueError("Both source and dest are required (unless using apply).") source_is_bucket = _is_bucket_path(source) dest_is_bucket = _is_bucket_path(dest) if source_is_bucket and dest_is_bucket: raise ValueError("Remote to remote sync is not supported. One path must be local.") if not source_is_bucket and not dest_is_bucket: raise ValueError("One of source or dest must be a bucket path (hf://buckets/...).") if ignore_times and ignore_sizes: raise ValueError("Cannot specify both ignore_times and ignore_sizes.") if existing and ignore_existing: raise ValueError("Cannot specify both existing and ignore_existing.") if dry_run and plan: raise ValueError("Cannot specify both dry_run and plan.") # Validate local path if source_is_bucket: if os.path.exists(dest) and not os.path.isdir(dest): raise ValueError(f"Destination must be a directory: {dest}") else: if not os.path.isdir(source): raise ValueError(f"Source must be an existing directory: {source}") # Build filter matcher filter_rules = None if filter_from: filter_rules = _parse_filter_file(filter_from) filter_matcher = FilterMatcher( include_patterns=include, exclude_patterns=exclude, filter_rules=filter_rules, ) # Compute sync plan status = StatusLine(enabled=not quiet and not dry_run) sync_plan = _compute_sync_plan( source=source, dest=dest, api=api, delete=delete, ignore_times=ignore_times, ignore_sizes=ignore_sizes, existing=existing, ignore_existing=ignore_existing, filter_matcher=filter_matcher, status=status, ) if dry_run: _write_plan(sync_plan, sys.stdout) return sync_plan if plan: _save_plan(sync_plan, plan) if not quiet: _print_plan_summary(sync_plan) print(f"Plan saved to: {plan}") return sync_plan # Execute plan if not quiet: _print_plan_summary(sync_plan) summary = sync_plan.summary() if summary["uploads"] == 0 and summary["downloads"] == 0 and summary["deletes"] == 0: if not quiet: print("Nothing to sync.") return sync_plan if not quiet: print("Syncing...") if quiet: disable_progress_bars() try: _execute_plan(sync_plan, api, verbose=verbose, status=status) finally: if quiet: enable_progress_bars() if not quiet: print("Sync completed.") return sync_plan