#!/usr/bin/env python3
"""
Integration tests for HF Papers Tool

Tests with real HF and arXiv APIs — all endpoints are public, no auth required.

Run (from the directory that contains the ``agent`` package, because the
current working directory is what gets prepended to ``sys.path`` below):
    python tests/integration/tools/test_papers_integration.py
"""
import asyncio
import re
import sys

sys.path.insert(0, ".")

from agent.tools.papers_tool import hf_papers_handler

# ANSI color codes
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
BLUE = "\033[94m"
DIM = "\033[2m"
RESET = "\033[0m"

# Running totals across the whole run; updated only by assert_true().
assertions_passed = 0
assertions_failed = 0


def print_test(msg: str) -> None:
    """Print a framed ``[TEST]`` banner announcing the test described by *msg*."""
    print(f"\n{BLUE}{'─' * 70}{RESET}")
    print(f"{BLUE}[TEST]{RESET} {msg}")
    print(f"{BLUE}{'─' * 70}{RESET}")


def print_success(msg: str) -> None:
    """Print *msg* in green with a check mark. Does not touch the counters."""
    print(f"{GREEN} ✓ {msg}{RESET}")


def print_error(msg: str) -> None:
    """Print *msg* in red with a cross. Does not touch the counters."""
    print(f"{RED} ✗ {msg}{RESET}")


def print_output(output: str, max_lines: int = 40) -> None:
    """Print the tool output, indented, truncated to *max_lines* lines.

    When the output is longer than *max_lines*, a trailing line reports how
    many lines were left out.
    """
    lines = output.split("\n")
    for line in lines[:max_lines]:
        print(f"{DIM} │ {RESET}{line}")
    if len(lines) > max_lines:
        print(f"{DIM} │ ... ({len(lines) - max_lines} more lines){RESET}")


def assert_true(condition: bool, msg: str) -> bool:
    """Assert and print result. Returns True if passed.

    This is a soft assertion: it never raises. It prints *msg* as a success or
    an error line and bumps the matching module-level counter, so a test keeps
    running (and reporting) after a failed check.
    """
    global assertions_passed, assertions_failed
    if condition:
        print_success(msg)
        assertions_passed += 1
        return True
    print_error(msg)
    assertions_failed += 1
    return False


async def run(args: dict) -> tuple[str, bool]:
    """Call ``hf_papers_handler`` with *args* and return its result unchanged.

    Callers in this file unpack the result as ``(output, success)``.
    """
    return await hf_papers_handler(args)


# ---------------------------------------------------------------------------
# Test Suite 1: Paper Discovery
# ---------------------------------------------------------------------------


async def test_trending() -> bool:
    """Request 3 trending papers and check heading, count, arXiv IDs and HF URLs."""
    print_test("trending (limit=3)")
    output, success = await run({"operation": "trending", "limit": 3})
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("# Trending Papers" in output, "has '# Trending Papers' heading")
    ok &= assert_true("Showing 3 paper(s)" in output, "shows exactly 3 papers")

    # Check that each paper has an arxiv_id line
    arxiv_ids = re.findall(r"\*\*arxiv_id:\*\* (\S+)", output)
    ok &= assert_true(len(arxiv_ids) == 3, f"found 3 arxiv IDs: {arxiv_ids}")

    # Check that IDs look valid. fullmatch (not match) is used so that trailing
    # junk after the digits is rejected; an optional "vN" version suffix is
    # still accepted.
    for aid in arxiv_ids:
        ok &= assert_true(
            re.fullmatch(r"\d{4}\.\d{4,5}(?:v\d+)?", aid) is not None,
            f"arxiv_id '{aid}' looks valid (NNNN.NNNNN format)",
        )

    # Check each paper has an HF URL
    hf_urls = re.findall(r"https://huggingface\.co/papers/\S+", output)
    ok &= assert_true(len(hf_urls) == 3, "found 3 HF paper URLs")

    return ok


async def test_trending_with_query() -> bool:
    """Request trending papers filtered by 'language' and check the result count."""
    print_test("trending with query='language' (limit=5)")
    output, success = await run({"operation": "trending", "query": "language", "limit": 5})
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("Filtered by: 'language'" in output, "shows filter applied")

    # The filter may return 0-5 results depending on today's papers
    match = re.search(r"Showing (\d+) paper\(s\)", output)
    ok &= assert_true(match is not None, "has 'Showing N paper(s)' line")
    if match:
        count = int(match.group(1))
        ok &= assert_true(count <= 5, f"returned {count} papers (within limit)")
        # Informational only: this line is logged with print_success and is
        # NOT counted as an assertion; the content of the filtered papers is
        # not inspected here.
        if count > 0:
            print_success(f"got {count} filtered results")

    return ok


async def test_search() -> bool:
    """Search for 'direct preference optimization' and check 3 relevant results."""
    print_test("search 'direct preference optimization' (limit=3)")
    output, success = await run(
        {"operation": "search", "query": "direct preference optimization", "limit": 3}
    )
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("Papers matching" in output, "has matching header")

    arxiv_ids = re.findall(r"\*\*arxiv_id:\*\* (\S+)", output)
    ok &= assert_true(len(arxiv_ids) == 3, f"found 3 results: {arxiv_ids}")

    # At least one result should mention "preference" in title or summary
    ok &= assert_true(
        "preference" in output.lower(),
        "results mention 'preference' (relevant to query)",
    )

    return ok


async def test_paper_details() -> bool:
    """Fetch details for 2305.18290 and check title, URLs, metadata and hints."""
    print_test("paper_details for 2305.18290 (DPO paper)")
    output, success = await run({"operation": "paper_details", "arxiv_id": "2305.18290"})
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true(
        "Direct Preference Optimization" in output,
        "title contains 'Direct Preference Optimization'",
    )
    ok &= assert_true("2305.18290" in output, "contains arxiv_id")
    ok &= assert_true("https://arxiv.org/abs/2305.18290" in output, "has arxiv URL")
    ok &= assert_true("https://huggingface.co/papers/2305.18290" in output, "has HF URL")
    ok &= assert_true("**Authors:**" in output, "has authors section")
    ok &= assert_true("**upvotes:**" in output, "has upvotes")

    # Check for abstract or AI summary
    ok &= assert_true(
        "## Abstract" in output or "## AI Summary" in output,
        "has Abstract or AI Summary section",
    )

    # Check for next steps hint
    ok &= assert_true("read_paper" in output, "mentions read_paper as next step")
    ok &= assert_true("find_all_resources" in output, "mentions find_all_resources as next step")

    return ok


# ---------------------------------------------------------------------------
# Test Suite 2: Read Paper
# ---------------------------------------------------------------------------


async def test_read_paper_toc():
    """read_paper without a section: expect the abstract plus a section list.

    Every check is evaluated (none short-circuits); the return value is True
    only when all of them passed.
    """
    print_test("read_paper TOC for 2305.18290 (no section → should return abstract + sections)")
    request = {"operation": "read_paper", "arxiv_id": "2305.18290"}
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("## Abstract" in text, "has Abstract section"),
        assert_true("## Sections" in text, "has Sections heading (TOC)"),
    ]

    # TOC entries are rendered as "- **Title**:" bullets.
    titles = re.findall(r"- \*\*(.+?)\*\*:", text)
    title_count = len(titles)
    checks.append(
        assert_true(
            title_count >= 5,
            f"found {title_count} sections (expect >=5 for a full paper)",
        )
    )
    if titles:
        suffix = "..." if title_count > 5 else ""
        print_success(f"sections found: {titles[:5]}{suffix}")

    # Sections expected for this particular paper.
    joined_titles = " ".join(titles).lower()
    checks.append(assert_true("introduction" in joined_titles, "'Introduction' section present"))
    checks.append(assert_true("experiment" in joined_titles, "'Experiment' section present"))

    # The output should tell the reader how to request a single section.
    checks.append(assert_true("section=" in text, "has tip about using section parameter"))

    # The abstract must carry real text, not just a heading.
    found = re.search(r"## Abstract\n(.+?)(?:\n##|\n\*\*Tip)", text, re.DOTALL)
    if found is None:
        checks.append(assert_true(False, "could extract abstract text"))
    else:
        abstract_body = found.group(1).strip()
        checks.append(
            assert_true(
                len(abstract_body) > 100,
                f"abstract has real content ({len(abstract_body)} chars)",
            )
        )

    return all(checks)


async def test_read_paper_section_by_number():
    """read_paper with section='4': expect one substantial section, not the TOC."""
    print_test("read_paper section='4' for 2305.18290")
    request = {"operation": "read_paper", "arxiv_id": "2305.18290", "section": "4"}
    text, succeeded = await run(request)
    print_output(text, max_lines=30)

    size = len(text)
    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("https://arxiv.org/abs/2305.18290" in text, "has arxiv URL"),
        # A single section is rendered with a top-level heading first.
        assert_true(text.startswith("# "), "starts with heading"),
        assert_true(size > 500, f"section has substantial content ({size} chars)"),
        # The TOC marker must be absent when a specific section is requested.
        assert_true("## Sections" not in text, "is a single section (not TOC)"),
    ]
    return all(checks)


async def test_read_paper_section_by_name():
    """read_paper with section='Experiments': heading must name that section."""
    print_test("read_paper section='Experiments' for 2305.18290")
    request = {"operation": "read_paper", "arxiv_id": "2305.18290", "section": "Experiments"}
    text, succeeded = await run(request)
    print_output(text, max_lines=30)

    # Everything before the first newline is the heading line.
    heading, _, _ = text.partition("\n")
    size = len(text)
    checks = [
        assert_true(succeeded, "success=True"),
        assert_true(
            "experiment" in heading.lower(),
            f"heading contains 'Experiments': '{heading}'",
        ),
        assert_true(size > 500, f"section has substantial content ({size} chars)"),
    ]
    return all(checks)


async def test_read_paper_old_paper():
    """read_paper on 1706.03762: accept either full sections or an abstract fallback."""
    print_test("read_paper for 1706.03762 (Attention Is All You Need — 2017 paper)")
    text, succeeded = await run({"operation": "read_paper", "arxiv_id": "1706.03762"})
    print_output(text, max_lines=30)

    toc_present = "## Sections" in text
    fallback_notice = "HTML version not available" in text
    acceptable = any((toc_present, fallback_notice, "## Abstract" in text))

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("attention" in text.lower(), "mentions 'attention' (relevant content)"),
        assert_true(acceptable, "got either full sections, or abstract fallback"),
    ]

    # Informational only: report which of the two paths was taken.
    if toc_present:
        print_success("HTML version available — got full sections")
    elif fallback_notice:
        print_success("HTML not available — graceful fallback to abstract")

    return all(checks)


# ---------------------------------------------------------------------------
# Test Suite 3: Linked Resources
# ---------------------------------------------------------------------------


async def test_find_datasets():
    """find_datasets for 2305.18290: heading, default sort, links, counts, hint."""
    print_test("find_datasets for 2305.18290 (limit=5, sort=downloads)")
    request = {"operation": "find_datasets", "arxiv_id": "2305.18290", "limit": 5}
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("Datasets linked to paper 2305.18290" in text, "has correct heading"),
        assert_true("sorted by downloads" in text, "sorted by downloads (default)"),
    ]

    # Markdown links that point at huggingface.co/datasets/.
    linked = re.findall(r"\[([^\]]+)\]\(https://huggingface\.co/datasets/", text)
    checks.append(assert_true(bool(linked), f"found {len(linked)} dataset links"))
    if linked:
        print_success(f"dataset IDs: {linked}")

    # Download counters rendered next to the entries.
    counts = re.findall(r"Downloads: ([\d,]+)", text)
    checks.append(assert_true(bool(counts), f"found download counts: {counts}"))

    # Follow-up hint naming the dataset inspection tool.
    checks.append(assert_true("hf_inspect_dataset" in text, "has inspect dataset hint"))

    return all(checks)


async def test_find_datasets_sort_likes():
    """find_datasets with sort='likes': output must report that sort order."""
    print_test("find_datasets for 2305.18290 (sort=likes, limit=3)")
    request = {
        "operation": "find_datasets",
        "arxiv_id": "2305.18290",
        "limit": 3,
        "sort": "likes",
    }
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("sorted by likes" in text, "sorted by likes"),
    ]
    return all(checks)


async def test_find_models():
    """find_models for 2305.18290: heading, at least one link, task/library info."""
    print_test("find_models for 2305.18290 (limit=5)")
    request = {"operation": "find_models", "arxiv_id": "2305.18290", "limit": 5}
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("Models linked to paper 2305.18290" in text, "has correct heading"),
    ]

    # Markdown links that point anywhere under huggingface.co.
    linked = re.findall(r"\[([^\]]+)\]\(https://huggingface\.co/", text)
    checks.append(assert_true(bool(linked), f"found {len(linked)} model links"))
    if linked:
        print_success(f"model IDs: {linked}")

    # Either metadata label is enough.
    task_shown = "Task:" in text
    library_shown = "Library:" in text
    checks.append(assert_true(task_shown or library_shown, "has Task or Library metadata"))

    return all(checks)


async def test_find_collections():
    """find_collections for 2305.18290: heading, collection URLs, upvotes, items."""
    print_test("find_collections for 2305.18290")
    request = {"operation": "find_collections", "arxiv_id": "2305.18290"}
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("Collections containing paper" in text, "has correct heading"),
    ]

    urls = re.findall(r"https://huggingface\.co/collections/\S+", text)
    checks.append(assert_true(bool(urls), f"found {len(urls)} collection URLs"))

    checks.append(assert_true("Upvotes:" in text, "has upvote counts"))
    checks.append(assert_true("Items:" in text, "has item counts"))

    return all(checks)


async def test_find_all_resources():
    """find_all_resources for 2305.18290: unified heading plus all three sections."""
    print_test("find_all_resources for 2305.18290 (parallel fan-out)")
    request = {"operation": "find_all_resources", "arxiv_id": "2305.18290"}
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True"),
        assert_true("# Resources linked to paper 2305.18290" in text, "has unified heading"),
        assert_true("https://huggingface.co/papers/2305.18290" in text, "has paper URL"),
        # All three sections should be present
        assert_true("## Datasets" in text, "has Datasets section"),
        assert_true("## Models" in text, "has Models section"),
        assert_true("## Collections" in text, "has Collections section"),
        # Sections should carry real entries, recognised by a download count
        assert_true("downloads)" in text, "datasets/models have download counts"),
    ]
    return all(checks)


# ---------------------------------------------------------------------------
# Test Suite 4: Edge Cases
# ---------------------------------------------------------------------------


async def test_search_no_results():
    """A query that matches nothing must still succeed and say so."""
    print_test("search with gibberish query → should return empty gracefully")
    request = {"operation": "search", "query": "xyzzyplugh_nonexistent_topic_9999"}
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(succeeded, "success=True (empty results is not an error)"),
        assert_true("No papers found" in text, "says 'No papers found'"),
    ]
    return all(checks)


async def test_missing_query():
    """search without a query must fail with a message mentioning 'required'."""
    print_test("search without query → should error")
    text, succeeded = await run({"operation": "search"})
    print_output(text)

    checks = [
        assert_true(not succeeded, "success=False (missing required param)"),
        assert_true("required" in text.lower(), "error mentions 'required'"),
    ]
    return all(checks)


async def test_missing_arxiv_id():
    """find_datasets without arxiv_id must fail with a message mentioning 'required'."""
    print_test("find_datasets without arxiv_id → should error")
    text, succeeded = await run({"operation": "find_datasets"})
    print_output(text)

    checks = [
        assert_true(not succeeded, "success=False"),
        assert_true("required" in text.lower(), "error mentions 'required'"),
    ]
    return all(checks)


async def test_invalid_arxiv_id():
    """paper_details for the ID 0000.00000 must report failure."""
    print_test("paper_details with nonexistent arxiv ID")
    text, succeeded = await run({"operation": "paper_details", "arxiv_id": "0000.00000"})
    print_output(text)

    return all([assert_true(not succeeded, "success=False (API returns error)")])


async def test_invalid_operation():
    """An unknown operation must fail and list the valid operations."""
    print_test("invalid operation name → should error")
    text, succeeded = await run({"operation": "nonexistent_op"})
    print_output(text)

    checks = [
        assert_true(not succeeded, "success=False"),
        assert_true("Unknown operation" in text, "says 'Unknown operation'"),
        assert_true("trending" in text, "lists valid operations"),
    ]
    return all(checks)


async def test_read_paper_bad_section():
    """read_paper with an unknown section must fail and list what is available."""
    print_test("read_paper with nonexistent section → should error with available sections")
    request = {
        "operation": "read_paper",
        "arxiv_id": "2305.18290",
        "section": "Nonexistent Section XYZ",
    }
    text, succeeded = await run(request)
    print_output(text)

    checks = [
        assert_true(not succeeded, "success=False"),
        assert_true("not found" in text.lower(), "says section 'not found'"),
        assert_true("Introduction" in text, "lists available sections (includes Introduction)"),
    ]
    return all(checks)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


async def main():
    """Run every suite in order, print a summary, and exit 1 on any failure.

    A test counts as failed when it returns a falsy value or raises; a raised
    exception is reported with its traceback and the run continues with the
    next test. The process exits with status 1 when any test failed or any
    individual assertion failed.
    """
    rule = "=" * 70

    print(rule)
    print(f"{BLUE}HF Papers Tool — Integration Tests{RESET}")
    print(f"{BLUE}All APIs are public, no authentication required.{RESET}")
    print(rule)

    suites = [
        # Suite 1: Paper Discovery
        (
            "Paper Discovery",
            [
                test_trending,
                test_trending_with_query,
                test_search,
                test_paper_details,
            ],
        ),
        # Suite 2: Read Paper
        (
            "Read Paper",
            [
                test_read_paper_toc,
                test_read_paper_section_by_number,
                test_read_paper_section_by_name,
                test_read_paper_old_paper,
            ],
        ),
        # Suite 3: Linked Resources
        (
            "Linked Resources",
            [
                test_find_datasets,
                test_find_datasets_sort_likes,
                test_find_models,
                test_find_collections,
                test_find_all_resources,
            ],
        ),
        # Suite 4: Edge Cases
        (
            "Edge Cases",
            [
                test_search_no_results,
                test_missing_query,
                test_missing_arxiv_id,
                test_invalid_arxiv_id,
                test_invalid_operation,
                test_read_paper_bad_section,
            ],
        ),
    ]

    tallies = []
    for title, cases in suites:
        print(f"\n{YELLOW}{rule}{RESET}")
        print(f"{YELLOW}Test Suite: {title} ({len(cases)} tests){RESET}")
        print(f"{YELLOW}{rule}{RESET}")

        passed = 0
        failed = 0
        for case in cases:
            try:
                outcome = await case()
            except Exception as exc:
                # A crashing test must not abort the remaining tests.
                print_error(f"CRASHED: {exc}")
                import traceback

                traceback.print_exc()
                failed += 1
            else:
                if outcome:
                    passed += 1
                else:
                    failed += 1

        tallies.append((title, passed, failed))

    # Summary
    print(f"\n{rule}")
    print(f"{BLUE}Summary{RESET}")
    print(rule)

    total_tests = 0
    total_failed = 0
    for title, passed, failed in tallies:
        ran = passed + failed
        total_tests += ran
        total_failed += failed
        if failed == 0:
            icon = f"{GREEN}✓{RESET}"
        else:
            icon = f"{RED}✗{RESET}"
        print(f" {icon} {title}: {passed}/{ran} tests passed")

    print("─" * 70)
    print(f" Assertions: {assertions_passed} passed, {assertions_failed} failed")
    print(f" Tests: {total_tests - total_failed}/{total_tests} passed")
    print(f"{rule}\n")

    if total_failed > 0 or assertions_failed > 0:
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())