Spaces:

smolagents
/

ml-intern

Running on CPU Upgrade

File size: 19,604 Bytes

#!/usr/bin/env python3
"""
Integration tests for HF Papers Tool
Tests with real HF and arXiv APIs — all endpoints are public, no auth required.

Run: python tests/integration/tools/test_papers_integration.py
"""
import asyncio
import re
import sys

sys.path.insert(0, ".")

from agent.tools.papers_tool import hf_papers_handler

# ANSI escape sequences used to colorize terminal output
GREEN = "\x1b[92m"
YELLOW = "\x1b[93m"
RED = "\x1b[91m"
BLUE = "\x1b[94m"
DIM = "\x1b[2m"
RESET = "\x1b[0m"

# Run-wide assertion tallies; updated by assert_true() and reported by main()
assertions_passed = assertions_failed = 0


def print_test(msg):
    """Print a blue banner introducing the test described by *msg*.

    The banner is a blank line, a 70-character rule, a ``[TEST]`` line,
    and a closing rule.
    """
    rule = f"{BLUE}{'─' * 70}{RESET}"
    print(f"\n{rule}")
    print(f"{BLUE}[TEST]{RESET} {msg}")
    print(rule)


def print_success(msg):
    """Print *msg* in green, indented and prefixed with a check mark."""
    line = f"  ✓ {msg}"
    print(f"{GREEN}{line}{RESET}")


def print_error(msg):
    """Print *msg* in red, indented and prefixed with a cross mark."""
    line = f"  ✗ {msg}"
    print(f"{RED}{line}{RESET}")


def print_output(output: str, max_lines: int = 40):
    """Echo tool output behind a dim gutter, showing at most *max_lines* lines.

    Args:
        output: Text to display; it is split on newline characters.
        max_lines: Number of leading lines to show before truncating.

    When lines are cut off, a trailer reports how many were omitted.
    """
    gutter = f"{DIM}  │ {RESET}"
    all_lines = output.split("\n")
    for text in all_lines[:max_lines]:
        print(f"{gutter}{text}")

    omitted = len(all_lines) - max_lines
    if omitted > 0:
        print(f"{DIM}  │ ... ({omitted} more lines){RESET}")


def assert_true(condition: bool, msg: str) -> bool:
    """Record a single soft assertion and report it on stdout.

    Unlike the ``assert`` statement this never raises: it prints *msg* as a
    success or an error and bumps the matching module-level counter.

    Args:
        condition: Outcome of the check.
        msg: Human-readable description of what was checked.

    Returns:
        True if *condition* was truthy, otherwise False.
    """
    global assertions_passed, assertions_failed
    if not condition:
        print_error(msg)
        assertions_failed += 1
        return False

    print_success(msg)
    assertions_passed += 1
    return True


async def run(args: dict) -> tuple[str, bool]:
    """Call ``hf_papers_handler`` with *args* and hand back its result.

    Every test unpacks the result as ``(output, success)``.
    """
    result = await hf_papers_handler(args)
    return result


# ---------------------------------------------------------------------------
# Test Suite 1: Paper Discovery
# ---------------------------------------------------------------------------


async def test_trending():
    """Request trending papers with a small limit and check the listing's shape.

    Asserts that the call succeeds, that the heading and the
    ``Showing N paper(s)`` line are present, and that exactly ``limit``
    arXiv IDs and Hugging Face paper URLs appear in the output.

    Returns:
        True when every assertion in this test passed.
    """
    limit = 3
    print_test(f"trending (limit={limit})")
    output, success = await run({"operation": "trending", "limit": limit})
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("# Trending Papers" in output, "has '# Trending Papers' heading")
    ok &= assert_true(f"Showing {limit} paper(s)" in output, f"shows exactly {limit} papers")

    # Check that each paper has an arxiv_id line
    arxiv_ids = re.findall(r"\*\*arxiv_id:\*\* (\S+)", output)
    ok &= assert_true(len(arxiv_ids) == limit, f"found {limit} arxiv IDs: {arxiv_ids}")

    # Validate the whole token, not just its prefix: fullmatch rejects IDs with
    # trailing junk (e.g. "2305.18290abc") that an unanchored match would let
    # through. An optional version suffix such as "v2" is still accepted.
    id_pattern = re.compile(r"\d{4}\.\d{4,5}(?:v\d+)?")
    for aid in arxiv_ids:
        ok &= assert_true(
            id_pattern.fullmatch(aid) is not None,
            f"arxiv_id '{aid}' looks valid (NNNN.NNNNN format)",
        )

    # Check each paper has an HF URL
    hf_urls = re.findall(r"https://huggingface\.co/papers/\S+", output)
    ok &= assert_true(len(hf_urls) == limit, f"found {limit} HF paper URLs")

    return ok


async def test_trending_with_query():
    """Request trending papers filtered by 'language' and check the count line.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("trending with query='language' (limit=5)")
    request = {"operation": "trending", "query": "language", "limit": 5}
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("Filtered by: 'language'" in output, "shows filter applied"),
    ]

    # Today's papers decide how many survive the filter, so any count from
    # zero up to the limit is acceptable.
    showing = re.search(r"Showing (\d+) paper\(s\)", output)
    results.append(assert_true(showing is not None, "has 'Showing N paper(s)' line"))
    if showing is not None:
        count = int(showing.group(1))
        results.append(assert_true(count <= 5, f"returned {count} papers (within limit)"))
        if count > 0:
            # Informational only: not counted as an assertion, and the paper
            # text itself is not inspected for the query term.
            print_success(f"got {count} filtered results")

    return all(results)


async def test_search():
    """Search for 'direct preference optimization' and expect three relevant hits.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("search 'direct preference optimization' (limit=3)")
    request = {"operation": "search", "query": "direct preference optimization", "limit": 3}
    output, success = await run(request)
    print_output(output)

    found_ids = re.findall(r"\*\*arxiv_id:\*\* (\S+)", output)
    results = [
        assert_true(success, "success=True"),
        assert_true("Papers matching" in output, "has matching header"),
        assert_true(len(found_ids) == 3, f"found 3 results: {found_ids}"),
        # Relevance smoke check: the word only has to appear somewhere in the output.
        assert_true(
            "preference" in output.lower(),
            "results mention 'preference' (relevant to query)",
        ),
    ]

    return all(results)


async def test_paper_details():
    """Fetch details for arXiv 2305.18290 and check the expected fields appear.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("paper_details for 2305.18290 (DPO paper)")
    output, success = await run({"operation": "paper_details", "arxiv_id": "2305.18290"})
    print_output(output)

    # (substring that must occur in the output, assertion label)
    metadata_checks = [
        ("Direct Preference Optimization", "title contains 'Direct Preference Optimization'"),
        ("2305.18290", "contains arxiv_id"),
        ("https://arxiv.org/abs/2305.18290", "has arxiv URL"),
        ("https://huggingface.co/papers/2305.18290", "has HF URL"),
        ("**Authors:**", "has authors section"),
        ("**upvotes:**", "has upvotes"),
    ]
    next_step_checks = [
        ("read_paper", "mentions read_paper as next step"),
        ("find_all_resources", "mentions find_all_resources as next step"),
    ]

    results = [assert_true(success, "success=True")]
    results.extend(assert_true(needle in output, label) for needle, label in metadata_checks)

    # Either form of summary is acceptable.
    has_summary = "## Abstract" in output or "## AI Summary" in output
    results.append(assert_true(has_summary, "has Abstract or AI Summary section"))

    results.extend(assert_true(needle in output, label) for needle, label in next_step_checks)

    return all(results)


# ---------------------------------------------------------------------------
# Test Suite 2: Read Paper
# ---------------------------------------------------------------------------


async def test_read_paper_toc():
    """Read arXiv 2305.18290 without a section and check the abstract + TOC view.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("read_paper TOC for 2305.18290 (no section → should return abstract + sections)")
    output, success = await run({"operation": "read_paper", "arxiv_id": "2305.18290"})
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("## Abstract" in output, "has Abstract section"),
        assert_true("## Sections" in output, "has Sections heading (TOC)"),
    ]

    # TOC entries are matched as bullet items of the form "- **Title**:".
    titles = re.findall(r"- \*\*(.+?)\*\*:", output)
    total = len(titles)
    results.append(
        assert_true(total >= 5, f"found {total} sections (expect >=5 for a full paper)")
    )
    if titles:
        suffix = "..." if total > 5 else ""
        print_success(f"sections found: {titles[:5]}{suffix}")

    # Spot-check two section names, case-insensitively.
    joined_titles = " ".join(titles).lower()
    results.append(assert_true("introduction" in joined_titles, "'Introduction' section present"))
    results.append(assert_true("experiment" in joined_titles, "'Experiment' section present"))

    # The output should hint at how to read an individual section.
    results.append(assert_true("section=" in output, "has tip about using section parameter"))

    # The abstract body runs up to the next "##" heading or the "**Tip" line.
    found = re.search(r"## Abstract\n(.+?)(?:\n##|\n\*\*Tip)", output, re.DOTALL)
    if found is None:
        results.append(assert_true(False, "could extract abstract text"))
    else:
        body = found.group(1).strip()
        results.append(
            assert_true(len(body) > 100, f"abstract has real content ({len(body)} chars)")
        )

    return all(results)


async def test_read_paper_section_by_number():
    """Read section '4' of arXiv 2305.18290 and check a single section comes back.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("read_paper section='4' for 2305.18290")
    request = {"operation": "read_paper", "arxiv_id": "2305.18290", "section": "4"}
    output, success = await run(request)
    print_output(output, max_lines=30)

    size = len(output)
    results = [
        assert_true(success, "success=True"),
        assert_true("https://arxiv.org/abs/2305.18290" in output, "has arxiv URL"),
        # The very first line is expected to be a top-level heading.
        assert_true(output.startswith("# "), "starts with heading"),
        assert_true(size > 500, f"section has substantial content ({size} chars)"),
        # A TOC heading would mean the whole-paper view was returned instead.
        assert_true("## Sections" not in output, "is a single section (not TOC)"),
    ]

    return all(results)


async def test_read_paper_section_by_name():
    """Read the 'Experiments' section of arXiv 2305.18290 by name.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("read_paper section='Experiments' for 2305.18290")
    request = {"operation": "read_paper", "arxiv_id": "2305.18290", "section": "Experiments"}
    output, success = await run(request)
    print_output(output, max_lines=30)

    # Everything before the first newline is treated as the section heading.
    heading = output.partition("\n")[0]
    size = len(output)
    results = [
        assert_true(success, "success=True"),
        assert_true(
            "experiment" in heading.lower(),
            f"heading contains 'Experiments': '{heading}'",
        ),
        assert_true(size > 500, f"section has substantial content ({size} chars)"),
    ]

    return all(results)


async def test_read_paper_old_paper():
    """Read arXiv 1706.03762 and accept either full sections or an abstract fallback.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("read_paper for 1706.03762 (Attention Is All You Need — 2017 paper)")
    output, success = await run({"operation": "read_paper", "arxiv_id": "1706.03762"})
    print_output(output, max_lines=30)

    results = [
        assert_true(success, "success=True"),
        assert_true("attention" in output.lower(), "mentions 'attention' (relevant content)"),
    ]

    # Three acceptable shapes: a TOC, an explicit "no HTML" notice, or a bare abstract.
    has_sections = "## Sections" in output
    has_abstract_fallback = "HTML version not available" in output
    has_abstract = "## Abstract" in output
    results.append(
        assert_true(
            any((has_sections, has_abstract_fallback, has_abstract)),
            "got either full sections, or abstract fallback",
        )
    )

    # Informational note on which path was taken; not counted as an assertion.
    if has_sections:
        print_success("HTML version available — got full sections")
    elif has_abstract_fallback:
        print_success("HTML not available — graceful fallback to abstract")

    return all(results)


# ---------------------------------------------------------------------------
# Test Suite 3: Linked Resources
# ---------------------------------------------------------------------------


async def test_find_datasets():
    """List datasets linked to arXiv 2305.18290 without passing a sort option.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("find_datasets for 2305.18290 (limit=5, sort=downloads)")
    request = {"operation": "find_datasets", "arxiv_id": "2305.18290", "limit": 5}
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("Datasets linked to paper 2305.18290" in output, "has correct heading"),
        # No "sort" was sent, so this asserts the sort order reported by default.
        assert_true("sorted by downloads" in output, "sorted by downloads (default)"),
    ]

    # Dataset IDs are the link text of markdown links into /datasets/.
    dataset_ids = re.findall(r"\[([^\]]+)\]\(https://huggingface\.co/datasets/", output)
    results.append(assert_true(len(dataset_ids) > 0, f"found {len(dataset_ids)} dataset links"))
    if dataset_ids:
        print_success(f"dataset IDs: {dataset_ids}")

    download_counts = re.findall(r"Downloads: ([\d,]+)", output)
    results.append(
        assert_true(len(download_counts) > 0, f"found download counts: {download_counts}")
    )

    results.append(assert_true("hf_inspect_dataset" in output, "has inspect dataset hint"))

    return all(results)


async def test_find_datasets_sort_likes():
    """List datasets for arXiv 2305.18290 with sort='likes' and check the sort label.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("find_datasets for 2305.18290 (sort=likes, limit=3)")
    request = {
        "operation": "find_datasets",
        "arxiv_id": "2305.18290",
        "limit": 3,
        "sort": "likes",
    }
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("sorted by likes" in output, "sorted by likes"),
    ]

    return all(results)


async def test_find_models():
    """List models linked to arXiv 2305.18290 and check links and metadata appear.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("find_models for 2305.18290 (limit=5)")
    request = {"operation": "find_models", "arxiv_id": "2305.18290", "limit": 5}
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("Models linked to paper 2305.18290" in output, "has correct heading"),
    ]

    # Link text of every markdown link pointing at huggingface.co.
    model_ids = re.findall(r"\[([^\]]+)\]\(https://huggingface\.co/", output)
    results.append(assert_true(len(model_ids) > 0, f"found {len(model_ids)} model links"))
    if model_ids:
        print_success(f"model IDs: {model_ids}")

    # Either metadata label is enough.
    has_metadata = "Task:" in output or "Library:" in output
    results.append(assert_true(has_metadata, "has Task or Library metadata"))

    return all(results)


async def test_find_collections():
    """List collections containing arXiv 2305.18290 and check URLs and metadata.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("find_collections for 2305.18290")
    request = {"operation": "find_collections", "arxiv_id": "2305.18290"}
    output, success = await run(request)
    print_output(output)

    collection_urls = re.findall(r"https://huggingface\.co/collections/\S+", output)
    results = [
        assert_true(success, "success=True"),
        assert_true("Collections containing paper" in output, "has correct heading"),
        assert_true(len(collection_urls) > 0, f"found {len(collection_urls)} collection URLs"),
        # Per-collection metadata labels.
        assert_true("Upvotes:" in output, "has upvote counts"),
        assert_true("Items:" in output, "has item counts"),
    ]

    return all(results)


async def test_find_all_resources():
    """Fetch all resources for arXiv 2305.18290 and check each section is present.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("find_all_resources for 2305.18290 (parallel fan-out)")
    request = {"operation": "find_all_resources", "arxiv_id": "2305.18290"}
    output, success = await run(request)
    print_output(output)

    # (substring that must occur in the output, assertion label)
    expected = [
        ("# Resources linked to paper 2305.18290", "has unified heading"),
        ("https://huggingface.co/papers/2305.18290", "has paper URL"),
        ("## Datasets", "has Datasets section"),
        ("## Models", "has Models section"),
        ("## Collections", "has Collections section"),
        # A download count implies at least one real entry rather than empty sections.
        ("downloads)", "datasets/models have download counts"),
    ]

    results = [assert_true(success, "success=True")]
    results.extend(assert_true(needle in output, label) for needle, label in expected)

    return all(results)


# ---------------------------------------------------------------------------
# Test Suite 4: Edge Cases
# ---------------------------------------------------------------------------


async def test_search_no_results():
    """Search for a nonsense query and expect a successful, empty result.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("search with gibberish query → should return empty gracefully")
    request = {"operation": "search", "query": "xyzzyplugh_nonexistent_topic_9999"}
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True (empty results is not an error)"),
        assert_true("No papers found" in output, "says 'No papers found'"),
    ]

    return all(results)


async def test_missing_query():
    """Call search without a query and expect a 'required' error.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("search without query → should error")
    output, success = await run({"operation": "search"})
    print_output(output)

    results = [
        assert_true(not success, "success=False (missing required param)"),
        assert_true("required" in output.lower(), "error mentions 'required'"),
    ]

    return all(results)


async def test_missing_arxiv_id():
    """Call find_datasets without an arxiv_id and expect a 'required' error.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("find_datasets without arxiv_id → should error")
    output, success = await run({"operation": "find_datasets"})
    print_output(output)

    results = [
        assert_true(not success, "success=False"),
        assert_true("required" in output.lower(), "error mentions 'required'"),
    ]

    return all(results)


async def test_invalid_arxiv_id():
    """Request details for arXiv ID '0000.00000' and expect the call to fail.

    Returns:
        True when the single assertion in this test passed.
    """
    print_test("paper_details with nonexistent arxiv ID")
    request = {"operation": "paper_details", "arxiv_id": "0000.00000"}
    output, success = await run(request)
    print_output(output)

    # Only the failure flag is checked; the error text is not inspected.
    return assert_true(not success, "success=False (API returns error)")


async def test_invalid_operation():
    """Send an unknown operation name and expect an error listing valid ones.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("invalid operation name → should error")
    output, success = await run({"operation": "nonexistent_op"})
    print_output(output)

    results = [
        assert_true(not success, "success=False"),
        assert_true("Unknown operation" in output, "says 'Unknown operation'"),
        # "trending" stands in for the list of valid operations.
        assert_true("trending" in output, "lists valid operations"),
    ]

    return all(results)


async def test_read_paper_bad_section():
    """Ask for a section that does not exist and expect an error listing sections.

    Returns:
        True when every assertion in this test passed.
    """
    print_test("read_paper with nonexistent section → should error with available sections")
    request = {
        "operation": "read_paper",
        "arxiv_id": "2305.18290",
        "section": "Nonexistent Section XYZ",
    }
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(not success, "success=False"),
        assert_true("not found" in output.lower(), "says section 'not found'"),
        assert_true("Introduction" in output, "lists available sections (includes Introduction)"),
    ]

    return all(results)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


async def main():
    """Run every test suite in order, print a summary, and exit non-zero on failure.

    Test coroutines are awaited one at a time. A test that raises is reported
    as crashed (with its traceback) and counted as failed, so a single broken
    test cannot abort the rest of the run.

    Raises:
        SystemExit: With status 1 when any test or any assertion failed.
    """
    # Local import: only needed for crash reports, and keeps this function
    # independent of the file's top-level import block.
    import traceback

    print("=" * 70)
    print(f"{BLUE}HF Papers Tool — Integration Tests{RESET}")
    print(f"{BLUE}All APIs are public, no authentication required.{RESET}")
    print("=" * 70)

    all_tests = [
        # Suite 1: Paper Discovery
        ("Paper Discovery", [
            test_trending,
            test_trending_with_query,
            test_search,
            test_paper_details,
        ]),
        # Suite 2: Read Paper
        ("Read Paper", [
            test_read_paper_toc,
            test_read_paper_section_by_number,
            test_read_paper_section_by_name,
            test_read_paper_old_paper,
        ]),
        # Suite 3: Linked Resources
        ("Linked Resources", [
            test_find_datasets,
            test_find_datasets_sort_likes,
            test_find_models,
            test_find_collections,
            test_find_all_resources,
        ]),
        # Suite 4: Edge Cases
        ("Edge Cases", [
            test_search_no_results,
            test_missing_query,
            test_missing_arxiv_id,
            test_invalid_arxiv_id,
            test_invalid_operation,
            test_read_paper_bad_section,
        ]),
    ]

    # (suite name, tests passed, tests failed) for the summary table.
    suite_results = []

    for suite_name, tests in all_tests:
        print(f"\n{YELLOW}{'=' * 70}{RESET}")
        print(f"{YELLOW}Test Suite: {suite_name} ({len(tests)} tests){RESET}")
        print(f"{YELLOW}{'=' * 70}{RESET}")

        suite_pass = 0
        suite_fail = 0

        for test_fn in tests:
            # Keep the try body to the one call that can raise.
            try:
                test_ok = await test_fn()
            except Exception as e:
                # Top-level boundary: report which test crashed and with what
                # exception type (str(e) alone can be empty), then keep going.
                print_error(f"CRASHED: {test_fn.__name__}: {type(e).__name__}: {e}")
                traceback.print_exc()
                test_ok = False

            if test_ok:
                suite_pass += 1
            else:
                suite_fail += 1

        suite_results.append((suite_name, suite_pass, suite_fail))

    # Summary
    print(f"\n{'=' * 70}")
    print(f"{BLUE}Summary{RESET}")
    print(f"{'=' * 70}")
    for suite_name, sp, sf in suite_results:
        icon = f"{GREEN}✓{RESET}" if sf == 0 else f"{RED}✗{RESET}"
        print(f"  {icon} {suite_name}: {sp}/{sp + sf} tests passed")

    print(f"{'─' * 70}")
    total_tests = sum(sp + sf for _, sp, sf in suite_results)
    total_failed = sum(sf for _, _, sf in suite_results)
    # The counters are only read here, so no `global` declaration is needed.
    print(f"  Assertions: {assertions_passed} passed, {assertions_failed} failed")
    print(f"  Tests:      {total_tests - total_failed}/{total_tests} passed")
    print(f"{'=' * 70}\n")

    if total_failed > 0 or assertions_failed > 0:
        sys.exit(1)


# Script entry point: start the async test driver only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())