ycwhencpp's picture
Sync repo: updated train_grpo notebook for training run
5e9fb2f verified
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains commands to interact with papers on the Hugging Face Hub.
Usage:
# list daily papers (most recently submitted)
hf papers ls
# list trending papers
hf papers ls --sort=trending
# list papers from a specific date, ordered by upvotes
hf papers ls --date=2025-01-23
# list today's papers, ordered by upvotes
hf papers ls --date=today
# list papers from a specific week
hf papers ls --week=2025-W09
# list papers by a specific submitter
hf papers ls --submitter=someuser
# search papers
hf papers search "vision language"
# get info about a paper
hf papers info 2502.08025
# read a paper as markdown
hf papers read 2502.08025
"""
import datetime
import enum
from typing import Annotated, get_args
import typer
from huggingface_hub.errors import CLIError, HfHubHTTPError
from huggingface_hub.hf_api import DailyPapersSort_T
from ._cli_utils import (
FormatWithAutoOpt,
LimitOpt,
TokenOpt,
api_object_to_dict,
get_hf_api,
typer_factory,
)
from ._output import OutputFormatWithAuto, out
_SORT_OPTIONS = get_args(DailyPapersSort_T)
PaperSortEnum = enum.Enum("PaperSortEnum", {s: s for s in _SORT_OPTIONS}, type=str) # type: ignore[misc]
def _parse_date(value: str | None) -> str | None:
"""Parse date option, converting 'today' to current date."""
if value is None:
return None
if value.lower() == "today":
return datetime.date.today().isoformat()
return value
papers_cli = typer_factory(help="Interact with papers on the Hub.")
@papers_cli.command(
"list | ls",
examples=[
"hf papers ls",
"hf papers ls --sort trending",
"hf papers ls --date 2025-01-23",
"hf papers ls --week 2025-W09",
"hf papers ls --submitter akhaliq",
"hf papers ls --format json",
],
)
def papers_ls(
date: Annotated[
str | None,
typer.Option(
help="Date in ISO format (YYYY-MM-DD) or 'today'.",
callback=_parse_date,
),
] = None,
week: Annotated[
str | None,
typer.Option(help="ISO week to filter by, e.g. '2025-W09'."),
] = None,
month: Annotated[
str | None,
typer.Option(help="Month to filter by in ISO format (YYYY-MM), e.g. '2025-02'."),
] = None,
submitter: Annotated[
str | None,
typer.Option(help="Filter by username of the submitter."),
] = None,
sort: Annotated[
PaperSortEnum | None,
typer.Option(help="Sort results."),
] = None,
limit: LimitOpt = 50,
format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
token: TokenOpt = None,
) -> None:
"""List daily papers on the Hub."""
api = get_hf_api(token=token)
sort_key = sort.value if sort else None
results = []
for paper_info in api.list_daily_papers(
date=date,
week=week,
month=month,
submitter=submitter,
sort=sort_key,
limit=limit,
):
item = api_object_to_dict(paper_info)
submitted_by = item.get("submitted_by") or {}
item["submitted_by_name"] = submitted_by.get("fullname") or submitted_by.get("username") or ""
results.append(item)
out.table(
results,
headers=["id", "title", "upvotes", "comments", "published_at", "submitted_by_name"],
alignments={"upvotes": "right", "comments": "right"},
)
@papers_cli.command(
"search",
examples=[
'hf papers search "vision language"',
'hf papers search "attention mechanism" --limit 10',
'hf papers search "diffusion" --format json',
],
)
def papers_search(
query: Annotated[str, typer.Argument(help="Search query string.")],
limit: LimitOpt = 20,
format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
token: TokenOpt = None,
) -> None:
"""Search papers on the Hub."""
api = get_hf_api(token=token)
results = [api_object_to_dict(paper_info) for paper_info in api.list_papers(query=query, limit=limit)]
out.table(results, headers=["id", "title", "summary", "upvotes", "published_at"], alignments={"upvotes": "right"})
@papers_cli.command(
"info",
examples=[
"hf papers info 2601.15621",
],
)
def papers_info(
paper_id: Annotated[str, typer.Argument(help="The arXiv paper ID (e.g. '2502.08025').")],
format: FormatWithAutoOpt = OutputFormatWithAuto.auto,
token: TokenOpt = None,
) -> None:
"""Get info about a paper on the Hub."""
api = get_hf_api(token=token)
try:
info = api.paper_info(id=paper_id)
except HfHubHTTPError as e:
if e.response.status_code == 404:
raise CLIError(f"Paper '{paper_id}' not found on the Hub.") from e
raise
out.dict(info)
@papers_cli.command(
"read",
examples=[
"hf papers read 2601.15621",
],
)
def papers_read(
paper_id: Annotated[str, typer.Argument(help="The arXiv paper ID (e.g. '2502.08025').")],
token: TokenOpt = None,
) -> None:
"""Read a paper as markdown."""
api = get_hf_api(token=token)
try:
content = api.read_paper(id=paper_id)
except HfHubHTTPError as e:
if e.response.status_code == 404:
raise CLIError(f"Paper '{paper_id}' not found on the Hub.") from e
raise
out.text(content)