| |
|
|
| |
| |
| import pytest |
|
|
| from autogpt.commands.web_requests import scrape_links |
|
|
| """ |
| Code Analysis |
| |
| Objective: |
| The objective of the 'scrape_links' function is to scrape hyperlinks from a |
| given URL and return them in a formatted way. |
| |
| Inputs: |
| - url: a string representing the URL to be scraped. |
| |
| Flow: |
| 1. Send a GET request to the given URL using the requests library and the user agent header from the config file. |
| 2. Check if the response contains an HTTP error. If it does, return an error message string (the tests below expect it to contain "Error:"). |
| 3. Parse the HTML content of the response using the BeautifulSoup library. |
| 4. Remove any script and style tags from the parsed HTML. |
| 5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function. |
| 6. Format the extracted hyperlinks using the 'format_hyperlinks' function. |
| 7. Return the formatted hyperlinks. |
| |
| Outputs: |
| - A list of formatted hyperlinks, or an error message string when the response is an HTTP error. |
| |
| Additional aspects: |
| - The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP |
| requests and parse HTML content, respectively. |
| - The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML. |
| - The 'format_hyperlinks' function is called to format the extracted hyperlinks. |
| - The function checks for HTTP errors and returns an error message string (containing "Error:") if any are found. |
| """ |
|
|
|
|
class TestScrapeLinks:
    """Exercise ``scrape_links`` with one unpatched URL and several canned responses.

    Every test except ``test_valid_url_with_hyperlinks`` replaces
    ``requests.Session.get`` with a mock that returns a canned response
    object built by ``_fake_get``.
    """

    @staticmethod
    def _fake_get(mocker, status_code, text=None):
        """Patch ``requests.Session.get`` to return a mock response.

        ``text`` is only assigned when it is supplied, so a response built
        without it keeps the mock's auto-generated ``text`` attribute.
        """
        canned = mocker.Mock()
        canned.status_code = status_code
        if text is not None:
            canned.text = text
        mocker.patch("requests.Session.get", return_value=canned)

    def test_valid_url_with_hyperlinks(self):
        """An unpatched call should produce a non-empty list of strings."""
        # Nothing is patched here; presumably this performs a real HTTP
        # request, so the outcome depends on network access -- TODO confirm.
        links = scrape_links("https://www.google.com")
        assert len(links) > 0
        assert isinstance(links, list)
        assert isinstance(links[0], str)

    def test_valid_url(self, mocker):
        """A 200 page with a single anchor yields one ``text (href)`` entry."""
        page = (
            "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        )
        self._fake_get(mocker, 200, page)

        links = scrape_links("https://www.example.com")

        assert links == ["Google (https://www.google.com)"]

    def test_invalid_url(self, mocker):
        """A 404 response should produce a result containing ``Error:``."""
        # No body text is configured: only the status code is set on the mock.
        self._fake_get(mocker, 404)

        outcome = scrape_links("https://www.invalidurl.com")

        assert "Error:" in outcome

    def test_no_hyperlinks(self, mocker):
        """A 200 page without any anchors should produce an empty list."""
        page = "<html><body><p>No hyperlinks here</p></body></html>"
        self._fake_get(mocker, 200, page)

        links = scrape_links("https://www.example.com")

        assert links == []

    def test_scrape_links_with_few_hyperlinks(self, mocker):
        """Three anchors nested in divs come back formatted, in document order."""
        page = """
            <html>
            <body>
                <div id="google-link"><a href="https://www.google.com">Google</a></div>
                <div id="github"><a href="https://github.com">GitHub</a></div>
                <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
            </body>
            </html>
        """
        self._fake_get(mocker, 200, page)

        links = scrape_links("https://www.example.com")

        wanted = [
            "Google (https://www.google.com)",
            "GitHub (https://github.com)",
            "CodiumAI (https://www.codium.ai)",
        ]
        assert isinstance(links, list)
        assert len(links) == 3
        # Compare entry by entry so a failure points at the offending position.
        for position, entry in enumerate(wanted):
            assert links[position] == entry
|
|