File size: 19,604 Bytes
12c8f97
 
 
 
c434421
 
12c8f97
 
c434421
12c8f97
 
 
 
 
 
 
 
 
 
 
c434421
12c8f97
 
c434421
 
 
12c8f97
 
c434421
12c8f97
c434421
12c8f97
 
 
c434421
12c8f97
 
 
c434421
12c8f97
 
c434421
 
 
 
 
 
 
12c8f97
 
c434421
 
 
 
 
 
 
 
 
 
 
12c8f97
 
c434421
12c8f97
 
 
c434421
 
 
12c8f97
 
c434421
 
 
 
12c8f97
c434421
 
 
 
12c8f97
c434421
 
 
12c8f97
c434421
 
 
 
 
 
12c8f97
c434421
 
 
12c8f97
c434421
12c8f97
 
c434421
 
 
 
12c8f97
c434421
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
12c8f97
c434421
12c8f97
 
c434421
 
 
 
12c8f97
c434421
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
12c8f97
 
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
 
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
12c8f97
c434421
 
12c8f97
c434421
 
12c8f97
c434421
 
 
 
 
 
 
12c8f97
c434421
 
 
 
12c8f97
c434421
 
 
 
 
12c8f97
 
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
12c8f97
c434421
 
12c8f97
c434421
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
 
 
 
 
c434421
12c8f97
c434421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12c8f97
 
 
c434421
 
 
 
 
 
 
 
 
 
 
12c8f97
 
c434421
12c8f97
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
#!/usr/bin/env python3
"""
Integration tests for HF Papers Tool
Tests with real HF and arXiv APIs — all endpoints are public, no auth required.

Run: python tests/integration/tools/test_papers_integration.py
"""
import asyncio
import re
import sys

# Put the current working directory first on the import path so the `agent`
# package imported below can be resolved. NOTE(review): this presumably relies
# on the script being launched from the repository root, as in the run command
# shown in the module docstring - confirm.
sys.path.insert(0, ".")

from agent.tools.papers_tool import hf_papers_handler

# ANSI color codes (SGR escape sequences) used to colorize terminal output.
GREEN = "\033[92m"  # bright green: passed checks
YELLOW = "\033[93m"  # bright yellow: suite banners
RED = "\033[91m"  # bright red: failed checks
BLUE = "\033[94m"  # bright blue: test banners and headings
DIM = "\033[2m"  # faint: gutter in front of echoed tool output
RESET = "\033[0m"  # restore default terminal attributes

# Running tallies of individual assertions; incremented by assert_true() and
# reported in the summary printed by main().
assertions_passed = 0
assertions_failed = 0


def print_test(msg):
    """Print a blue banner announcing the test case described by *msg*.

    The banner is a ``[TEST]`` label followed by *msg*, framed above and below
    by a 70-character horizontal rule. A blank line precedes the banner.
    """
    # U+2500 (BOX DRAWINGS LIGHT HORIZONTAL) is spelled as an escape so the
    # glyph cannot be garbled if the file is re-encoded. It is built outside
    # the f-strings because backslashes are not allowed inside f-string
    # expressions before Python 3.12.
    rule = BLUE + "\u2500" * 70 + RESET
    print(f"\n{rule}")
    print(f"{BLUE}[TEST]{RESET} {msg}")
    print(rule)


def print_success(msg):
    """Print *msg* in green, indented and prefixed with a check mark."""
    # U+2713 (CHECK MARK) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print(f"{GREEN}  \u2713 {msg}{RESET}")


def print_error(msg):
    """Print *msg* in red, indented and prefixed with a cross mark."""
    # U+2717 (BALLOT X) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print(f"{RED}  \u2717 {msg}{RESET}")


def print_output(output: str, max_lines: int = 40):
    """Echo tool output behind a dim vertical gutter, truncated to *max_lines*.

    Args:
        output: Text to echo; it is split on newline characters.
        max_lines: Maximum number of lines to print. When the text has more
            lines, a trailing note reports how many were omitted.
    """
    # U+2502 (BOX DRAWINGS LIGHT VERTICAL) is spelled as an escape so the
    # glyph cannot be garbled if the file is re-encoded.
    gutter = f"{DIM}  \u2502 "
    lines = output.split("\n")
    for line in lines[:max_lines]:
        # Reset right after the gutter so the echoed text keeps default colors.
        print(f"{gutter}{RESET}{line}")
    hidden = len(lines) - max_lines
    if hidden > 0:
        print(f"{gutter}... ({hidden} more lines){RESET}")


def assert_true(condition: bool, msg: str) -> bool:
    """Record one check, print its outcome, and report whether it held.

    A truthy *condition* prints *msg* as a success and bumps the module-level
    pass counter; otherwise *msg* is printed as an error and the fail counter
    is bumped. Never raises, so a test can keep running after a failed check.

    Returns:
        True when the check passed, False when it failed.
    """
    global assertions_passed, assertions_failed
    if not condition:
        print_error(msg)
        assertions_failed += 1
        return False
    print_success(msg)
    assertions_passed += 1
    return True


async def run(args: dict) -> tuple[str, bool]:
    """Await the papers tool handler with *args* and pass its result through.

    Callers in this file unpack the result as ``(output, success)``.
    """
    result = await hf_papers_handler(args)
    return result


# ---------------------------------------------------------------------------
# Test Suite 1: Paper Discovery
# ---------------------------------------------------------------------------


async def test_trending():
    """Check that ``trending`` with limit=3 lists exactly three well-formed papers.

    Verifies the heading, the reported paper count, that three arXiv IDs are
    present and each one is a complete new-style identifier, and that three
    Hugging Face paper URLs appear in the output.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("trending (limit=3)")
    output, success = await run({"operation": "trending", "limit": 3})
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("# Trending Papers" in output, "has '# Trending Papers' heading")
    ok &= assert_true("Showing 3 paper(s)" in output, "shows exactly 3 papers")

    # Check that each paper has an arxiv_id line
    arxiv_ids = re.findall(r"\*\*arxiv_id:\*\* (\S+)", output)
    ok &= assert_true(len(arxiv_ids) == 3, f"found 3 arxiv IDs: {arxiv_ids}")

    # Check that IDs look valid. fullmatch() is used rather than match() so an
    # ID followed by stray characters is rejected instead of passing on its
    # prefix; an optional version suffix (e.g. "v2") is still accepted.
    id_pattern = re.compile(r"\d{4}\.\d{4,5}(?:v\d+)?")
    for aid in arxiv_ids:
        ok &= assert_true(
            id_pattern.fullmatch(aid) is not None,
            f"arxiv_id '{aid}' looks valid (NNNN.NNNNN format)",
        )

    # Check each paper has an HF URL
    hf_urls = re.findall(r"https://huggingface\.co/papers/\S+", output)
    ok &= assert_true(len(hf_urls) == 3, "found 3 HF paper URLs")

    return ok


async def test_trending_with_query():
    """Check that ``trending`` applies a keyword filter and respects the limit.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("trending with query='language' (limit=5)")
    request = {"operation": "trending", "query": "language", "limit": 5}
    output, success = await run(request)
    print_output(output)

    checks = [
        assert_true(success, "success=True"),
        assert_true("Filtered by: 'language'" in output, "shows filter applied"),
    ]

    # Anywhere from 0 to 5 papers may survive the filter on a given day, so
    # only the presence of the count line and the upper bound are asserted.
    shown = re.search(r"Showing (\d+) paper\(s\)", output)
    checks.append(assert_true(shown is not None, "has 'Showing N paper(s)' line"))
    if shown is not None:
        n_papers = int(shown[1])
        checks.append(
            assert_true(n_papers <= 5, f"returned {n_papers} papers (within limit)")
        )
        # Informational only: a non-empty result is reported, not asserted.
        if n_papers > 0:
            print_success(f"got {n_papers} filtered results")

    return all(checks)


async def test_search():
    """Check that ``search`` returns three results relevant to a DPO query.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("search 'direct preference optimization' (limit=3)")
    request = {
        "operation": "search",
        "query": "direct preference optimization",
        "limit": 3,
    }
    output, success = await run(request)
    print_output(output)

    found_ids = re.findall(r"\*\*arxiv_id:\*\* (\S+)", output)

    outcomes = [
        assert_true(success, "success=True"),
        assert_true("Papers matching" in output, "has matching header"),
        assert_true(len(found_ids) == 3, f"found 3 results: {found_ids}"),
        # The query word should surface somewhere in a title or summary.
        assert_true(
            "preference" in output.lower(),
            "results mention 'preference' (relevant to query)",
        ),
    ]
    return all(outcomes)


async def test_paper_details():
    """Check the ``paper_details`` output for the DPO paper (2305.18290).

    Looks for the title, both canonical URLs, the authors and upvotes fields,
    an abstract or AI summary, and the follow-up operation hints.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("paper_details for 2305.18290 (DPO paper)")
    output, success = await run({"operation": "paper_details", "arxiv_id": "2305.18290"})
    print_output(output)

    has_summary = any(h in output for h in ("## Abstract", "## AI Summary"))

    # (condition, label) pairs, reported in this order.
    expectations = [
        (success, "success=True"),
        (
            "Direct Preference Optimization" in output,
            "title contains 'Direct Preference Optimization'",
        ),
        ("2305.18290" in output, "contains arxiv_id"),
        ("https://arxiv.org/abs/2305.18290" in output, "has arxiv URL"),
        ("https://huggingface.co/papers/2305.18290" in output, "has HF URL"),
        ("**Authors:**" in output, "has authors section"),
        ("**upvotes:**" in output, "has upvotes"),
        (has_summary, "has Abstract or AI Summary section"),
        ("read_paper" in output, "mentions read_paper as next step"),
        ("find_all_resources" in output, "mentions find_all_resources as next step"),
    ]
    results = [assert_true(holds, label) for holds, label in expectations]
    return all(results)


# ---------------------------------------------------------------------------
# Test Suite 2: Read Paper
# ---------------------------------------------------------------------------


async def test_read_paper_toc():
    """Check that ``read_paper`` without a section returns abstract plus a TOC.

    Verifies the Abstract and Sections headings, that at least five section
    titles are listed (including an introduction and an experiments section),
    that the section-parameter tip is shown, and that the abstract body is
    longer than 100 characters.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2192 (RIGHTWARDS ARROW) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print_test(
        "read_paper TOC for 2305.18290 "
        "(no section \u2192 should return abstract + sections)"
    )
    output, success = await run({"operation": "read_paper", "arxiv_id": "2305.18290"})
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("## Abstract" in output, "has Abstract section")
    ok &= assert_true("## Sections" in output, "has Sections heading (TOC)")

    # Check that sections are listed with bold titles
    section_titles = re.findall(r"- \*\*(.+?)\*\*:", output)
    ok &= assert_true(
        len(section_titles) >= 5,
        f"found {len(section_titles)} sections (expect >=5 for a full paper)",
    )
    if section_titles:
        ellipsis = "..." if len(section_titles) > 5 else ""
        print_success(f"sections found: {section_titles[:5]}{ellipsis}")

    # Check that expected DPO paper sections are present
    section_text = " ".join(section_titles).lower()
    ok &= assert_true("introduction" in section_text, "'Introduction' section present")
    ok &= assert_true("experiment" in section_text, "'Experiment' section present")

    # Check for the tip about reading specific sections
    ok &= assert_true("section=" in output, "has tip about using section parameter")

    # Check the abstract has actual content (not empty): capture everything
    # between the Abstract heading and the next heading or the tip line.
    abstract_match = re.search(r"## Abstract\n(.+?)(?:\n##|\n\*\*Tip)", output, re.DOTALL)
    if abstract_match:
        abstract_text = abstract_match.group(1).strip()
        ok &= assert_true(
            len(abstract_text) > 100,
            f"abstract has real content ({len(abstract_text)} chars)",
        )
    else:
        ok &= assert_true(False, "could extract abstract text")

    return ok


async def test_read_paper_section_by_number():
    """Check that ``read_paper`` with section='4' returns that single section.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("read_paper section='4' for 2305.18290")
    request = {"operation": "read_paper", "arxiv_id": "2305.18290", "section": "4"}
    output, success = await run(request)
    print_output(output, max_lines=30)

    size = len(output)
    verdicts = [
        assert_true(success, "success=True"),
        assert_true("https://arxiv.org/abs/2305.18290" in output, "has arxiv URL"),
        # The section title is expected as a top-level heading on line one.
        assert_true(output.startswith("# "), "starts with heading"),
        assert_true(size > 500, f"section has substantial content ({size} chars)"),
        # A table of contents here would mean the section argument was ignored.
        assert_true("## Sections" not in output, "is a single section (not TOC)"),
    ]
    return all(verdicts)


async def test_read_paper_section_by_name():
    """Check that ``read_paper`` can select a section by its name.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("read_paper section='Experiments' for 2305.18290")
    request = {
        "operation": "read_paper",
        "arxiv_id": "2305.18290",
        "section": "Experiments",
    }
    output, success = await run(request)
    print_output(output, max_lines=30)

    # The heading is whatever precedes the first newline.
    heading = output.partition("\n")[0]
    size = len(output)

    verdicts = [
        assert_true(success, "success=True"),
        assert_true(
            "experiment" in heading.lower(),
            f"heading contains 'Experiments': '{heading}'",
        ),
        assert_true(size > 500, f"section has substantial content ({size} chars)"),
    ]
    return all(verdicts)


async def test_read_paper_old_paper():
    """Check ``read_paper`` on a 2017 paper (1706.03762).

    Accepts a section listing, an explicit "HTML version not available"
    fallback, or a plain abstract, and reports which of the first two was
    observed.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2014 (EM DASH) is spelled as an escape in the messages below so the
    # glyph cannot be garbled if the file is re-encoded.
    print_test("read_paper for 1706.03762 (Attention Is All You Need \u2014 2017 paper)")
    output, success = await run({"operation": "read_paper", "arxiv_id": "1706.03762"})
    print_output(output, max_lines=30)

    ok = True
    ok &= assert_true(success, "success=True")
    ok &= assert_true("attention" in output.lower(), "mentions 'attention' (relevant content)")

    # Either we get sections (HTML available) or abstract fallback
    has_sections = "## Sections" in output
    has_abstract_fallback = "HTML version not available" in output
    ok &= assert_true(
        has_sections or has_abstract_fallback or "## Abstract" in output,
        "got either full sections, or abstract fallback",
    )
    if has_sections:
        print_success("HTML version available \u2014 got full sections")
    elif has_abstract_fallback:
        print_success("HTML not available \u2014 graceful fallback to abstract")

    return ok


# ---------------------------------------------------------------------------
# Test Suite 3: Linked Resources
# ---------------------------------------------------------------------------


async def test_find_datasets():
    """Check ``find_datasets`` for the DPO paper with the default sort order.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("find_datasets for 2305.18290 (limit=5, sort=downloads)")
    request = {"operation": "find_datasets", "arxiv_id": "2305.18290", "limit": 5}
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("Datasets linked to paper 2305.18290" in output, "has correct heading"),
        # No sort argument was sent, so the default ordering should be reported.
        assert_true("sorted by downloads" in output, "sorted by downloads (default)"),
    ]

    # Dataset entries are markdown links whose text is the dataset ID.
    linked = re.findall(r"\[([^\]]+)\]\(https://huggingface\.co/datasets/", output)
    results.append(assert_true(len(linked) > 0, f"found {len(linked)} dataset links"))
    if linked:
        print_success(f"dataset IDs: {linked}")

    # Each entry should carry a download count.
    counts = re.findall(r"Downloads: ([\d,]+)", output)
    results.append(assert_true(len(counts) > 0, f"found download counts: {counts}"))

    # The output should point at the dataset inspection tool as a next step.
    results.append(assert_true("hf_inspect_dataset" in output, "has inspect dataset hint"))

    return all(results)


async def test_find_datasets_sort_likes():
    """Check that ``find_datasets`` reports the requested ``likes`` ordering.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("find_datasets for 2305.18290 (sort=likes, limit=3)")
    request = {
        "operation": "find_datasets",
        "arxiv_id": "2305.18290",
        "limit": 3,
        "sort": "likes",
    }
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("sorted by likes" in output, "sorted by likes"),
    ]
    return all(results)


async def test_find_models():
    """Check ``find_models`` for the DPO paper: heading, links, and metadata.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("find_models for 2305.18290 (limit=5)")
    request = {"operation": "find_models", "arxiv_id": "2305.18290", "limit": 5}
    output, success = await run(request)
    print_output(output)

    results = [
        assert_true(success, "success=True"),
        assert_true("Models linked to paper 2305.18290" in output, "has correct heading"),
    ]

    # Model entries are markdown links pointing at huggingface.co.
    linked = re.findall(r"\[([^\]]+)\]\(https://huggingface\.co/", output)
    results.append(assert_true(len(linked) > 0, f"found {len(linked)} model links"))
    if linked:
        print_success(f"model IDs: {linked}")

    # At least one of the two metadata labels should be present.
    has_metadata = any(label in output for label in ("Task:", "Library:"))
    results.append(assert_true(has_metadata, "has Task or Library metadata"))

    return all(results)


async def test_find_collections():
    """Check ``find_collections`` for the DPO paper: heading, URLs, metadata.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("find_collections for 2305.18290")
    request = {"operation": "find_collections", "arxiv_id": "2305.18290"}
    output, success = await run(request)
    print_output(output)

    urls = re.findall(r"https://huggingface\.co/collections/\S+", output)

    results = [
        assert_true(success, "success=True"),
        assert_true("Collections containing paper" in output, "has correct heading"),
        assert_true(len(urls) > 0, f"found {len(urls)} collection URLs"),
        # Per-collection metadata labels.
        assert_true("Upvotes:" in output, "has upvote counts"),
        assert_true("Items:" in output, "has item counts"),
    ]
    return all(results)


async def test_find_all_resources():
    """Check that ``find_all_resources`` returns all three resource sections.

    Returns:
        True if every assertion passed, False otherwise.
    """
    print_test("find_all_resources for 2305.18290 (parallel fan-out)")
    request = {"operation": "find_all_resources", "arxiv_id": "2305.18290"}
    output, success = await run(request)
    print_output(output)

    # (needle, label) pairs: each needle must occur in the output.
    expected_fragments = [
        ("# Resources linked to paper 2305.18290", "has unified heading"),
        ("https://huggingface.co/papers/2305.18290", "has paper URL"),
        ("## Datasets", "has Datasets section"),
        ("## Models", "has Models section"),
        ("## Collections", "has Collections section"),
        # A download count implies at least one real entry was listed.
        ("downloads)", "datasets/models have download counts"),
    ]

    results = [assert_true(success, "success=True")]
    results.extend(
        assert_true(needle in output, label) for needle, label in expected_fragments
    )
    return all(results)


# ---------------------------------------------------------------------------
# Test Suite 4: Edge Cases
# ---------------------------------------------------------------------------


async def test_search_no_results():
    """Check that a search with no matches succeeds with a 'No papers found' note.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2192 (RIGHTWARDS ARROW) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print_test("search with gibberish query \u2192 should return empty gracefully")
    output, success = await run(
        {"operation": "search", "query": "xyzzyplugh_nonexistent_topic_9999"}
    )
    print_output(output)

    ok = True
    ok &= assert_true(success, "success=True (empty results is not an error)")
    ok &= assert_true("No papers found" in output, "says 'No papers found'")

    return ok


async def test_missing_query():
    """Check that ``search`` without a query fails and says what is required.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2192 (RIGHTWARDS ARROW) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print_test("search without query \u2192 should error")
    output, success = await run({"operation": "search"})
    print_output(output)

    ok = True
    ok &= assert_true(not success, "success=False (missing required param)")
    ok &= assert_true("required" in output.lower(), "error mentions 'required'")

    return ok


async def test_missing_arxiv_id():
    """Check that ``find_datasets`` without an arxiv_id fails with a 'required' error.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2192 (RIGHTWARDS ARROW) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print_test("find_datasets without arxiv_id \u2192 should error")
    output, success = await run({"operation": "find_datasets"})
    print_output(output)

    ok = True
    ok &= assert_true(not success, "success=False")
    ok &= assert_true("required" in output.lower(), "error mentions 'required'")

    return ok


async def test_invalid_arxiv_id():
    """Check that ``paper_details`` reports failure for a nonexistent arXiv ID.

    Returns:
        True if the single assertion passed, False otherwise.
    """
    print_test("paper_details with nonexistent arxiv ID")
    request = {"operation": "paper_details", "arxiv_id": "0000.00000"}
    output, success = await run(request)
    print_output(output)

    return assert_true(not success, "success=False (API returns error)")


async def test_invalid_operation():
    """Check that an unknown operation fails and the error lists valid ones.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2192 (RIGHTWARDS ARROW) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print_test("invalid operation name \u2192 should error")
    output, success = await run({"operation": "nonexistent_op"})
    print_output(output)

    ok = True
    ok &= assert_true(not success, "success=False")
    ok &= assert_true("Unknown operation" in output, "says 'Unknown operation'")
    # "trending" stands in for the list of valid operation names.
    ok &= assert_true("trending" in output, "lists valid operations")

    return ok


async def test_read_paper_bad_section():
    """Check that ``read_paper`` with an unknown section fails and lists sections.

    Returns:
        True if every assertion passed, False otherwise.
    """
    # U+2192 (RIGHTWARDS ARROW) is spelled as an escape so the glyph cannot be
    # garbled if the file is re-encoded.
    print_test(
        "read_paper with nonexistent section \u2192 should error with available sections"
    )
    output, success = await run(
        {"operation": "read_paper", "arxiv_id": "2305.18290", "section": "Nonexistent Section XYZ"}
    )
    print_output(output)

    ok = True
    ok &= assert_true(not success, "success=False")
    ok &= assert_true("not found" in output.lower(), "says section 'not found'")
    ok &= assert_true("Introduction" in output, "lists available sections (includes Introduction)")

    return ok


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


async def main():
    """Run every test suite in order, print a summary, and exit non-zero on failure.

    Test coroutines are awaited one at a time. A test counts as failed when it
    returns a falsy value or raises; a raised exception is reported with its
    traceback and the run continues with the next test. After the per-suite
    and overall totals are printed, the process exits with status 1 if any
    test or any individual assertion failed.
    """
    import traceback  # only used to report a test that raised

    # Non-ASCII glyphs are spelled as escapes so they cannot be garbled if the
    # file is re-encoded. The rules are built outside the f-strings because
    # backslashes are not allowed inside f-string expressions before 3.12.
    heavy_rule = "=" * 70
    light_rule = "\u2500" * 70  # U+2500 BOX DRAWINGS LIGHT HORIZONTAL

    print(heavy_rule)
    print(f"{BLUE}HF Papers Tool \u2014 Integration Tests{RESET}")
    print(f"{BLUE}All APIs are public, no authentication required.{RESET}")
    print(heavy_rule)

    all_tests = [
        # Suite 1: Paper Discovery
        ("Paper Discovery", [
            test_trending,
            test_trending_with_query,
            test_search,
            test_paper_details,
        ]),
        # Suite 2: Read Paper
        ("Read Paper", [
            test_read_paper_toc,
            test_read_paper_section_by_number,
            test_read_paper_section_by_name,
            test_read_paper_old_paper,
        ]),
        # Suite 3: Linked Resources
        ("Linked Resources", [
            test_find_datasets,
            test_find_datasets_sort_likes,
            test_find_models,
            test_find_collections,
            test_find_all_resources,
        ]),
        # Suite 4: Edge Cases
        ("Edge Cases", [
            test_search_no_results,
            test_missing_query,
            test_missing_arxiv_id,
            test_invalid_arxiv_id,
            test_invalid_operation,
            test_read_paper_bad_section,
        ]),
    ]

    # (suite name, tests passed, tests failed) per suite, in run order.
    suite_results = []

    for suite_name, tests in all_tests:
        print(f"\n{YELLOW}{heavy_rule}{RESET}")
        print(f"{YELLOW}Test Suite: {suite_name} ({len(tests)} tests){RESET}")
        print(f"{YELLOW}{heavy_rule}{RESET}")

        suite_pass = 0
        suite_fail = 0

        for test_fn in tests:
            # Broad catch is deliberate: one crashing test must not abort the
            # remaining tests, and the crash is reported rather than swallowed.
            try:
                test_ok = await test_fn()
            except Exception as e:
                print_error(f"CRASHED: {e}")
                traceback.print_exc()
                suite_fail += 1
            else:
                if test_ok:
                    suite_pass += 1
                else:
                    suite_fail += 1

        suite_results.append((suite_name, suite_pass, suite_fail))

    # Summary
    print(f"\n{heavy_rule}")
    print(f"{BLUE}Summary{RESET}")
    print(heavy_rule)
    for suite_name, sp, sf in suite_results:
        # U+2713 CHECK MARK / U+2717 BALLOT X
        icon = f"{GREEN}\u2713{RESET}" if sf == 0 else f"{RED}\u2717{RESET}"
        print(f"  {icon} {suite_name}: {sp}/{sp + sf} tests passed")

    print(light_rule)
    total_tests = sum(sp + sf for _, sp, sf in suite_results)
    total_failed = sum(sf for _, _, sf in suite_results)
    # The assertion counters are only read here, so no `global` is needed.
    print(f"  Assertions: {assertions_passed} passed, {assertions_failed} failed")
    print(f"  Tests:      {total_tests - total_failed}/{total_tests} passed")
    print(f"{heavy_rule}\n")

    if total_failed > 0 or assertions_failed > 0:
        sys.exit(1)


# Run the full suite only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())