Spaces:
Runtime error
Runtime error
# Generated by CodiumAI | |
import requests | |
from autogpt.commands.web_requests import scrape_text | |
""" | |
Code Analysis

Objective:
The objective of the "scrape_text" function is to scrape the text content from
a given URL and return it as a string, after removing any unwanted HTML tags and scripts.

Inputs:
- url: a string representing the URL of the webpage to be scraped.

Flow:
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
2. Check if the response contains an HTTP error. If it does, return an error message.
3. Use BeautifulSoup to parse the HTML content of the response and remove all script and style tags.
4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup.
5. Split the text into lines and then into chunks, removing any extra whitespace.
6. Join the chunks into a single string with newline characters between them.
7. Return the cleaned text.

Outputs:
- A string representing the cleaned text content of the webpage.

Additional aspects:
- The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively.
- The function removes script and style tags from the HTML to avoid including unwanted content in the text output.
- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
""" | |
class TestScrapeText:
    """Unit tests for ``scrape_text`` with the HTTP call mocked out.

    Every test patches ``requests.Session.get`` through the ``mocker``
    fixture (pytest-mock), so the outcome depends only on the canned response
    or exception configured in the test.
    NOTE(review): this presumes ``scrape_text`` fetches pages via
    ``requests.Session.get`` -- confirm against its implementation.
    """

    def test_scrape_text_with_valid_url(self, mocker):
        """Return the page text when the URL answers with HTTP 200."""
        # Patch requests.Session.get to return a 200 response whose body wraps
        # the expected text in markup (including an inline style attribute).
        expected_text = "This is some sample text"
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = f"<html><body><div><p style='color: blue;'>{expected_text}</p></div></body></html>"
        mocker.patch("requests.Session.get", return_value=mock_response)

        # Only the text should come back; none of the surrounding markup.
        url = "http://www.example.com"
        assert scrape_text(url) == expected_text

    def test_invalid_url(self, mocker):
        """Return an error message when the request itself raises."""
        # Patch requests.Session.get to raise, simulating an invalid or
        # unreachable URL.
        mocker.patch(
            "requests.Session.get", side_effect=requests.exceptions.RequestException
        )

        # The failure must be reported in the returned string rather than
        # propagated as an exception.
        url = "http://www.invalidurl.com"
        error_message = scrape_text(url)
        assert "Error:" in error_message

    def test_no_text(self, mocker):
        """Return an empty string when the page body contains no text."""
        # Patch requests.Session.get to return a 200 response with an empty body.
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body></body></html>"
        mocker.patch("requests.Session.get", return_value=mock_response)

        url = "http://www.example.com"
        assert scrape_text(url) == ""

    def test_http_error(self, mocker):
        """Return an HTTP error message when the status code is >= 400."""
        # Patch requests.Session.get to return a response with a 404 status.
        # No body is configured on the mock; only the status code is set.
        mocker.patch("requests.Session.get", return_value=mocker.Mock(status_code=404))

        result = scrape_text("https://www.example.com")

        # The exact message, including the status code, is part of the contract.
        assert result == "Error: HTTP 404 error"

    def test_scrape_text_with_html_tags(self, mocker):
        """Strip inline tags while keeping the text they enclose."""
        # The <b> element sits mid-sentence, so the expected output checks that
        # the sentence is not broken apart where the tag was.
        html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = html
        mocker.patch("requests.Session.get", return_value=mock_response)

        result = scrape_text("https://www.example.com")

        assert result == "This is bold text."