Spaces:

ycwhencpp
/

train-new

Paused

File size: 5,675 Bytes

5e9fb2f

# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import subprocess
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Union

from . import constants
from .utils import get_token


if TYPE_CHECKING:
    import duckdb


@dataclass(frozen=True)
class DatasetParquetEntry:
    """Represents a single parquet file available for a dataset on the Hub.

    Instances are immutable: the dataclass is declared with ``frozen=True``, so
    assigning to a field after construction raises an error.
    """

    # NOTE(review): the field meanings below are inferred from their names only;
    # confirm against the code that builds these entries.
    config: str  # presumably the dataset configuration (subset) name
    split: str  # presumably the split name the file belongs to
    url: str  # presumably the URL of the parquet file
    size: int  # presumably the file size in bytes -- TODO confirm unit


def execute_raw_sql_query(sql_query: str, *, token: str | bool | None = None) -> list[dict[str, Any]]:
    """Run a SQL query through DuckDB and return its rows as dictionaries.

    Surrounding whitespace and trailing semicolons are removed from the query,
    which is then validated with `_raise_on_forbidden_query` before a connection
    is opened.

    Args:
        sql_query: The SQL text to execute.
        token: Forwarded to `_get_duckdb_connection` to configure authentication.

    Returns:
        One dictionary per row, mapping column name to value.

    Raises:
        ValueError: If the query is rejected by validation, or if the connection
            returns no relation for it.
    """
    statement = sql_query.strip().rstrip(";").strip()
    _raise_on_forbidden_query(statement)

    # Nothing has to be released if opening the connection itself fails, so the
    # connection is acquired before entering the cleanup scope.
    connection = _get_duckdb_connection(token=token)
    try:
        relation = connection.sql(statement)
        if relation is None:
            raise ValueError("SQL query must return rows.")

        if isinstance(relation, _DuckDBCliRelation):
            # CLI backend: the relation runs the binary and parses its JSON output.
            return relation.execute()

        # Python API backend: pair each fetched row with the column names.
        column_names = [entry[0] for entry in relation.description]
        return [dict(zip(column_names, record)) for record in relation.fetchall()]
    finally:
        connection.close()


def _raise_on_forbidden_query(query: str) -> None:
    """Reject queries that are empty or that contain a DuckDB CLI meta-command.

    Args:
        query: The SQL text to validate.

    Raises:
        ValueError: If `query` is the empty string, or if any of its lines starts
            (after leading whitespace) with a dot followed by a letter.
    """
    if not query:
        raise ValueError("SQL query cannot be empty.")

    def _looks_like_meta_command(line: str) -> bool:
        # Meta-commands are dot-prefixed words (e.g. `.shell`, `.output`). A dot
        # followed by a non-letter, such as the numeric literal `.5`, is allowed
        # because it can legitimately start a line of SQL.
        text = line.lstrip()
        return text[:1] == "." and text[1:2].isalpha()

    if any(_looks_like_meta_command(line) for line in query.splitlines()):
        raise ValueError("DuckDB CLI meta-commands are not allowed in SQL queries.")


def _get_duckdb_connection(
    token: str | bool | None,
) -> Union["duckdb.DuckDBPyConnection", "_DuckDBCliConnection"]:
    """Open a DuckDB connection, preferring the Python package over the CLI binary.

    Args:
        token: Passed to `_build_duckdb_secret_statements` (directly for the
            Python API, or through `_DuckDBCliConnection` for the CLI).

    Returns:
        A `duckdb` connection on which the secret statements have already been
        executed, or a `_DuckDBCliConnection` wrapping the `duckdb` binary found
        on the PATH.

    Raises:
        ImportError: If neither the `duckdb` Python package nor a `duckdb`
            binary is available.
    """
    try:
        import duckdb
    except ImportError as error:
        # No Python package: fall back to the CLI binary when one is on the PATH.
        cli_path = shutil.which("duckdb")
        if cli_path is not None:
            return _DuckDBCliConnection(binary_path=cli_path, token=token)
        raise ImportError(
            "DuckDB is required for `hf datasets sql`. Install the Python package with `pip install duckdb` or "
            "install the DuckDB CLI binary (for example `brew install duckdb`)."
        ) from error

    # Python API: open a fresh connection and register the secrets on it.
    connection = duckdb.connect()
    try:
        for statement in _build_duckdb_secret_statements(token):
            connection.execute(statement)
    except Exception:
        # Do not leave a half-configured connection open.
        connection.close()
        raise
    return connection


@dataclass
class _DuckDBCliConnection:
    """DuckDB connection.

    Mimics the DuckDB Python API, but runs the queries via the DuckDB CLI binary.
    """

    binary_path: str
    token: str | bool | None

    def __post_init__(self) -> None:
        self._setup_statements = _build_duckdb_secret_statements(self.token)

    def sql(self, query: str) -> "_DuckDBCliRelation":
        return _DuckDBCliRelation(binary_path=self.binary_path, setup_statements=self._setup_statements, query=query)

    def close(self) -> None:
        pass


@dataclass
class _DuckDBCliRelation:
    """DuckDB relation.

    Mimics the DuckDB Python API, but runs the queries via the DuckDB CLI binary.
    The query is only executed when `execute()` is called.
    """

    binary_path: str
    setup_statements: list[str]
    query: str

    def execute(self) -> list[dict[str, Any]]:
        """Run the query with the DuckDB CLI in JSON mode and return the parsed rows.

        The setup statements and the query are sent to the binary on stdin, so
        they do not appear in the process argument list.

        Returns:
            The decoded JSON output of the CLI, or an empty list when the CLI
            succeeds without printing anything.

        Raises:
            RuntimeError: If the CLI exits with a non-zero status. The message is
                stderr, else stdout, else a generic fallback.
            json.JSONDecodeError: If the CLI prints non-empty output that is not
                valid JSON.
        """
        # Build the DuckDB CLI input.
        script_lines: list[str] = []
        if self.setup_statements:
            # Send the output of the setup statements to the null device, then
            # switch output back with a bare `.output` before the real query.
            script_lines.append(f".output {os.devnull}")
            script_lines.extend(f"{stmt};" for stmt in self.setup_statements)
            script_lines.append(".output")
        script_lines.append(f"{self.query};")

        # Run DuckDB binary
        result = subprocess.run(
            [self.binary_path, "-json"],
            input="\n".join(script_lines),
            capture_output=True,
            text=True,
            check=False,
        )
        if result.returncode != 0:
            error_message = result.stderr.strip() or result.stdout.strip() or "DuckDB CLI command failed."
            raise RuntimeError(error_message)

        # Parse JSON output and return. A successful run with no output at all
        # would make `json.loads` fail with an unhelpful "Expecting value" error,
        # so it is reported as an empty result instead.
        # NOTE(review): this presumably happens when the query yields no rows;
        # it cannot be told apart from a statement that returns no relation.
        output = result.stdout.strip()
        if not output:
            return []
        return json.loads(output)


def _build_duckdb_secret_statements(token: str | bool | None) -> list[str]:
    if token is None or token is True:
        token = get_token()

    if not token:
        return []

    escaped_token = token.replace("'", "''")
    escaped_endpoint = constants.ENDPOINT.replace("'", "''")
    return [
        f"CREATE OR REPLACE SECRET hf_hub_token (TYPE HTTP, BEARER_TOKEN '{escaped_token}', SCOPE '{escaped_endpoint}')",
        f"CREATE OR REPLACE SECRET hf_token (TYPE HUGGINGFACE, TOKEN '{escaped_token}')",
    ]