import pytest
import requests_mock
from bs4 import BeautifulSoup

from ankigen_core.crawler import WebCrawler

BASE_URL = "http://example.com"
SUB_PAGE_URL = f"{BASE_URL}/subpage"
EXTERNAL_URL = "http://anotherdomain.com"


@pytest.fixture
def crawler_fixture():
    return WebCrawler(start_url=BASE_URL, max_depth=1)


@pytest.fixture
def crawler_with_patterns_fixture():
    return WebCrawler(
        start_url=BASE_URL,
        max_depth=1,
        include_patterns=[r"http://example\.com/docs/.*"],
        exclude_patterns=[r"http://example\.com/docs/v1/.*"],
    )

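
# --- Tests for WebCrawler._is_valid_url ---
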
def test_is_valid_url_valid(crawler_fixture):
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/page1")
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/another/page")


def test_is_valid_url_different_domain(crawler_fixture):
    assert not crawler_fixture._is_valid_url("http://otherdomain.com/page")


def test_is_valid_url_different_scheme(crawler_fixture):
    assert not crawler_fixture._is_valid_url("ftp://example.com/page")
    assert not crawler_fixture._is_valid_url("mailto:user@example.com")


def test_is_valid_url_malformed(crawler_fixture):
    assert not crawler_fixture._is_valid_url("htp://example.com/page")
    assert not crawler_fixture._is_valid_url("http:///page")


def test_is_valid_url_include_patterns_match(crawler_with_patterns_fixture):
    assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/page1")
    assert crawler_with_patterns_fixture._is_valid_url(
        f"{BASE_URL}/docs/topic/subtopic"
    )


def test_is_valid_url_include_patterns_no_match(crawler_with_patterns_fixture):
    assert not crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/blog/page1")


def test_is_valid_url_exclude_patterns_match(crawler_with_patterns_fixture):
    # Matches the include pattern but also the exclude pattern, so it is rejected.
    assert not crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/v1/page1")


def test_is_valid_url_exclude_patterns_no_match(crawler_with_patterns_fixture):
    # Matches the include pattern and not the exclude pattern, so it is accepted.
    assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/v2/page1")


def test_is_valid_url_no_patterns_defined(crawler_fixture):
    # With no include/exclude patterns, any URL on the start domain is accepted.
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/any/path")

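
# --- Tests for WebCrawler._extract_links ---
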
@pytest.mark.parametrize(
    "html_content, base_url, expected_links",
    [
        # Relative and absolute links on the same domain are both resolved.
        (
            """<a href="/page1">1</a> <a href="http://example.com/page2">2</a>""",
            BASE_URL,
            [f"{BASE_URL}/page1", f"{BASE_URL}/page2"],
        ),
        # Fragment-only and javascript: links are skipped.
        (
            """<a href="#section">S</a> <a href="javascript:void(0)">JS</a> <a href="/page3">3</a>""",
            BASE_URL,
            [f"{BASE_URL}/page3"],
        ),
        # Links to other domains are skipped.
        (
            """<a href="http://anotherdomain.com">Ext</a> <a href="/page4">4</a>""",
            BASE_URL,
            [f"{BASE_URL}/page4"],
        ),
        # Anchors without an href attribute are skipped.
        ("""<a>No Href</a> <a href="/page5">5</a>""", BASE_URL, [f"{BASE_URL}/page5"]),
        # Anchors with an empty href are skipped.
        (
            """<a href="">Empty Href</a> <a href="/page6">6</a>""",
            BASE_URL,
            [f"{BASE_URL}/page6"],
        ),
        # Relative links resolve against the page's own base URL.
        (
            """<a href="sub/page7">7</a>""",
            f"{BASE_URL}/path/",
            [f"{BASE_URL}/path/sub/page7"],
        ),
    ],
)
def test_extract_links(crawler_fixture, html_content, base_url, expected_links):
    soup = BeautifulSoup(html_content, "html.parser")
    actual_links = crawler_fixture._extract_links(soup, base_url)
    assert sorted(actual_links) == sorted(expected_links)


def test_extract_links_with_filtering(crawler_with_patterns_fixture):
    html = """
    <a href="http://example.com/docs/pageA">Allowed Doc</a>
    <a href="http://example.com/docs/v1/pageB">Excluded Doc v1</a>
    <a href="http://example.com/blog/pageC">Non-Doc Page</a>
    <a href="http://example.com/docs/v2/pageD">Allowed Doc v2</a>
    """
    soup = BeautifulSoup(html, "html.parser")
    # Only links matching the include pattern and not the exclude pattern survive.
    expected = [f"{BASE_URL}/docs/pageA", f"{BASE_URL}/docs/v2/pageD"]
    actual_links = crawler_with_patterns_fixture._extract_links(soup, BASE_URL)
    assert sorted(actual_links) == sorted(expected)

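
# --- Tests for WebCrawler._extract_text ---
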
@pytest.mark.parametrize(
    "html_content, expected_text",
    [
        (
            "<html><head><title>T</title><script>alert('x');</script><style>.c{}</style></head><body><p>Hello</p><div>World</div></body></html>",
            "T Hello World",
        ),
        ("<body>Just text</body>", "Just text"),
        (
            "<body><nav>Menu</nav><main><p>Main content</p></main><footer>Foot</footer></body>",
            "Menu Main content Foot",
        ),
    ],
)
def test_extract_text(crawler_fixture, html_content, expected_text):
    soup = BeautifulSoup(html_content, "html.parser")
    assert crawler_fixture._extract_text(soup) == expected_text

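
# --- Tests for WebCrawler.crawl() (HTTP traffic stubbed with requests_mock) ---
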
def test_crawl_single_page_no_links(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text="<html><head><title>Test Title</title></head><body>No links here.</body></html>",
        )

        pages = crawler_fixture.crawl()

        assert len(pages) == 1
        page = pages[0]
        assert page.url == BASE_URL
        assert page.title == "Test Title"
        assert "No links here" in page.text_content
        assert page.meta_description is None
        assert page.meta_keywords == []

def test_crawl_with_links_and_depth(crawler_fixture):
    # crawler_fixture uses max_depth=1, so links found on the start page are
    # followed, but links found on those sub-pages are not.
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title><meta name="description" content="Main page desc"><meta name="keywords" content="main, test"></head>
            <body><a href="{SUB_PAGE_URL}">Subpage</a> <a href="{EXTERNAL_URL}">External</a></body></html>""",
        )
        m.get(
            SUB_PAGE_URL,
            text="""<html><head><title>Sub</title></head><body>Subpage content. <a href="http://example.com/another_sub">Deeper</a></body></html>""",
        )
        m.get(EXTERNAL_URL, text="External content")

        pages = crawler_fixture.crawl()

        assert len(pages) == 2

        main_page = next(p for p in pages if p.url == BASE_URL)
        sub_page = next(p for p in pages if p.url == SUB_PAGE_URL)

        assert main_page.title == "Main"
        assert main_page.meta_description == "Main page desc"
        assert sorted(main_page.meta_keywords) == sorted(["main", "test"])
        assert "Subpage" in main_page.text_content

        assert sub_page.title == "Sub"
        assert "Subpage content" in sub_page.text_content
        assert sub_page.crawl_depth == 1
        assert sub_page.parent_url == BASE_URL

        # Only the two same-domain pages are visited; the external URL is
        # filtered out by _is_valid_url.
        assert len(crawler_fixture.visited_urls) == 2

def test_crawl_respects_max_depth_zero(crawler_fixture):
    crawler_fixture.max_depth = 0
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Depth Zero</title></head>
            <body><a href="{SUB_PAGE_URL}">Link</a></body></html>""",
        )

        pages = crawler_fixture.crawl()
        assert len(pages) == 1
        assert pages[0].url == BASE_URL
        assert pages[0].title == "Depth Zero"
        assert len(crawler_fixture.visited_urls) == 1

def test_crawl_handles_http_error(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head><body><a href="{SUB_PAGE_URL}">Subpage</a></body></html>""",
        )
        m.get(SUB_PAGE_URL, status_code=404, text="Not Found")

        pages = crawler_fixture.crawl()

        # The failing sub-page is not returned, but it is still marked as visited.
        assert len(pages) == 1
        assert pages[0].url == BASE_URL
        assert SUB_PAGE_URL in crawler_fixture.visited_urls

def test_crawl_include_exclude_patterns(crawler_with_patterns_fixture):
    # Start the crawl from a page that matches the include pattern and link it
    # to an excluded URL (docs/v1), an allowed URL (docs/v2), and a non-docs URL.
    page_docs_allowed = f"{BASE_URL}/docs/allowed"
    page_docs_v1_excluded = f"{BASE_URL}/docs/v1/excluded"
    page_docs_v2_allowed = f"{BASE_URL}/docs/v2/allowed_link"
    page_blog_excluded = f"{BASE_URL}/blog/initial_link"

    crawler_with_patterns_fixture.start_url = page_docs_allowed

    with requests_mock.Mocker() as m:
        m.get(
            page_docs_allowed,
            text=f"""<html><head><title>Docs Allowed</title></head>
            <body>
            <a href="{page_docs_v1_excluded}">To Excluded v1</a>
            <a href="{page_docs_v2_allowed}">To Allowed v2</a>
            <a href="{page_blog_excluded}">To Blog</a>
            </body></html>""",
        )
        # Responses for the linked pages; only the allowed v2 page should end up
        # in the crawl results.
        m.get(page_docs_v1_excluded, text="V1 Excluded Content")
        m.get(
            page_docs_v2_allowed,
            text="<html><head><title>Docs V2 Allowed</title></head><body>V2 Content</body></html>",
        )
        m.get(page_blog_excluded, text="Blog Content")

        pages = crawler_with_patterns_fixture.crawl()

        assert len(pages) == 2

        crawled_urls = [p.url for p in pages]
        assert page_docs_allowed in crawled_urls
        assert page_docs_v2_allowed in crawled_urls

        assert page_docs_v1_excluded not in crawled_urls
        assert page_blog_excluded not in crawled_urls

        page_v2 = next(p for p in pages if p.url == page_docs_v2_allowed)
        assert page_v2.title == "Docs V2 Allowed"

def test_crawl_progress_callback(crawler_fixture):
    # Record every (processed_count, total_urls, current_url) triple reported
    # by the crawler so the call sequence can be inspected.
    progress_log = []

    def callback(processed_count, total_urls, current_url):
        progress_log.append((processed_count, total_urls, current_url))

    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head>
            <body>
            <a href="{SUB_PAGE_URL}">Subpage</a>
            <a href="{BASE_URL}/another">Another</a>
            </body></html>""",
        )
        m.get(SUB_PAGE_URL, text="<html><body>Sub</body></html>")
        m.get(f"{BASE_URL}/another", text="<html><body>Another</body></html>")

        crawler_fixture.crawl(progress_callback=callback)

    # Three pages are crawled (the start page plus its two links). The exact
    # callback cadence is an implementation detail of WebCrawler.crawl(); it is
    # expected to emit seven progress updates for this crawl.
    assert len(progress_log) == 7

    # The first update is reported before any page has been processed and
    # refers to the start URL.
    assert progress_log[0][0] == 0
    assert progress_log[0][2] == BASE_URL