| |
|
|
| import requests |
|
|
| from autogpt.commands.web_requests import scrape_text |
|
|
| """ |
| Code Analysis |
| |
| Objective: |
| The objective of the "scrape_text" function is to scrape the text content from |
| a given URL and return it as a string, after removing any unwanted HTML tags and scripts. |
| |
| Inputs: |
| - url: a string representing the URL of the webpage to be scraped. |
| |
| Flow: |
| 1. Send a GET request to the given URL using the requests library and the user agent header from the config file. |
| 2. Check if the response contains an HTTP error. If it does, return an error message. |
| 3. Use BeautifulSoup to parse the HTML content of the response and remove all script and style tags. |
| 4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup. |
| 5. Split the text into lines and then into chunks, removing any extra whitespace. |
| 6. Join the chunks into a single string with newline characters between them. |
| 7. Return the cleaned text. |
| |
| Outputs: |
| - A string representing the cleaned text content of the webpage. |
| |
| Additional aspects: |
| - The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively. |
| - The function removes script and style tags from the HTML to avoid including unwanted content in the text output. |
| - The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text. |
| """ |
|
|
|
|
class TestScrapeText:
    """Tests for ``scrape_text`` with ``requests.Session.get`` replaced by a mock.

    Every test patches ``requests.Session.get`` through the injected ``mocker``
    fixture, so the call made by ``scrape_text`` either receives a canned
    response object or raises the configured exception.
    """

    @staticmethod
    def _stub_get(mocker, **response_attrs):
        """Patch ``requests.Session.get`` to return a mock response.

        Args:
            mocker: The mocking fixture handed to the test.
            **response_attrs: Attributes to set on the mock response
                (e.g. ``status_code``, ``text``). Attributes not listed are
                left as the mock's auto-created defaults.
        """
        fake_response = mocker.Mock(**response_attrs)
        mocker.patch("requests.Session.get", return_value=fake_response)

    def test_scrape_text_with_valid_url(self, mocker):
        """A 200 response with one styled paragraph yields exactly its text."""
        expected_text = "This is some sample text"
        self._stub_get(
            mocker,
            status_code=200,
            text=f"<html><body><div><p style='color: blue;'>{expected_text}</p></div></body></html>",
        )

        assert scrape_text("http://www.example.com") == expected_text

    def test_invalid_url(self, mocker):
        """If the GET raises ``RequestException``, the result contains "Error:"."""
        mocker.patch(
            "requests.Session.get",
            side_effect=requests.exceptions.RequestException,
        )

        outcome = scrape_text("http://www.invalidurl.com")

        assert "Error:" in outcome

    def test_no_text(self, mocker):
        """A 200 response whose body is empty yields an empty string."""
        self._stub_get(
            mocker,
            status_code=200,
            text="<html><body></body></html>",
        )

        assert scrape_text("http://www.example.com") == ""

    def test_http_error(self, mocker):
        """A response with status 404 yields the HTTP 404 error message."""
        # Only the status code is configured; the body is deliberately unset.
        self._stub_get(mocker, status_code=404)

        outcome = scrape_text("https://www.example.com")

        assert outcome == "Error: HTTP 404 error"

    def test_scrape_text_with_html_tags(self, mocker):
        """Inline markup (``<b>``) is dropped while its text stays in place."""
        self._stub_get(
            mocker,
            status_code=200,
            text="<html><body><p>This is <b>bold</b> text.</p></body></html>",
        )

        outcome = scrape_text("https://www.example.com")

        assert outcome == "This is bold text."
|
|