| import gradio as gr |
| import requests |
| import os |
| import tempfile |
|
|
| from typing import Tuple |
|
|
|
|
|
|
# API key for the Jina endpoint, read once at import time; None when the
# environment variable is not set.
JINA_API_KEY = os.environ.get("JINA_API_KEY")
|
|
def web_scraper(url: str, timeout: float = 30.0) -> str:
    """
    Scrape the content of a given URL using the Jina API.

    The target URL is appended to ``https://r.jina.ai/`` and fetched with a
    single GET request carrying the ``X-*`` option headers built below.

    Args:
        url (str): The URL to scrape. Surrounding whitespace is ignored.
        timeout (float): Seconds to wait for the endpoint before giving up.
            Defaults to 30.

    Returns:
        str: The response body as text (the request sets
        ``X-Return-Format: text``). The HTTP status is not checked, so an
        error response from the endpoint is returned as-is.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when the timeout expires.
    """
    headers = {
        'X-Locale': 'en-US',
        'X-Return-Format': 'text',
        'X-With-Generated-Alt': 'true',
        'X-With-Links-Summary': 'true',
    }
    # Only authenticate when a key is configured; formatting a missing key
    # would send the literal header value "Bearer None".
    if JINA_API_KEY:
        headers['Authorization'] = f'Bearer {JINA_API_KEY}'

    scrape_pattern = f'https://r.jina.ai/{url.strip()}'

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the calling UI handler.
    response = requests.get(scrape_pattern, headers=headers, timeout=timeout)
    return response.text
|
|
def scrape_and_display(url: str) -> Tuple[str, str]:
    """
    Scrape the content of a given URL and prepare it for display and download.

    Args:
        url (str): The URL to scrape.

    Returns:
        Tuple[str, str]: The scraped content, and the filesystem path of a
        temporary ``.md`` file containing that same content.
    """
    scraped_content = web_scraper(url)

    # delete=False keeps the file on disk after the handle is closed, so the
    # returned path stays valid for the caller. The with-block closes the
    # handle (flushing the write) instead of leaking it, and an explicit
    # UTF-8 encoding avoids UnicodeEncodeError on platforms whose default
    # encoding cannot represent the scraped text.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(scraped_content)

    return scraped_content, temp_file.name
|
|
def create_gradio_interface() -> gr.Interface:
    """
    Build the Gradio interface that fronts the web scraper.

    The interface takes one textbox input (the URL), passes it to
    ``scrape_and_display``, and shows the two results in a Markdown
    component and a File component.

    Returns:
        gr.Interface: The configured, not yet launched, interface.
    """
    # Components are created in the same order as before: input, then outputs.
    url_input = gr.Textbox(label="Enter URL to scrape")
    content_output = gr.Markdown(label="Scraped Content")
    download_output = gr.File(label="Download Markdown")

    interface_options = {
        "fn": scrape_and_display,
        "inputs": url_input,
        "outputs": [content_output, download_output],
        "title": "Web Scraper",
        "description": "Enter a URL to scrape and view the content in markdown format. You can also download the markdown file.",
        "examples": [["https://www.robots.ox.ac.uk/~vgg/data/flowers/102/categories.html"]],
        "allow_flagging": "never",
    }
    return gr.Interface(**interface_options)
|
|
if __name__ == "__main__":
    # Build the interface and start serving it when run as a script.
    create_gradio_interface().launch()