# Blog2Code preprocessor: fetch or read a blog post and emit it as JSON.
import argparse
import json
from urllib.parse import urlparse

import markdown
import requests
from bs4 import BeautifulSoup
def fetch_blog_from_url(url):
    """Fetch a blog post from *url* and extract its text and code.

    Args:
        url: Fully qualified URL of the blog post.

    Returns:
        dict with keys 'title', 'url', 'content' (plain text) and
        'code_snippets' (list of code-block strings).

    Raises:
        Re-raises any network or parsing exception after logging it.
    """
    try:
        # Browser-like User-Agent avoids 403 responses from sites
        # (e.g. Medium) that reject default client identifiers.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Look up the <h1> once (was queried twice) and fall back to a
        # placeholder title when the page has none.
        h1 = soup.find('h1')
        title = h1.get_text() if h1 else "Untitled"
        # Common content containers - try multiple selectors so the
        # scraper works across different blog platforms.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))
        if content:
            text = content.get_text(separator='\n', strip=True)
            # Bug fix: find_all(['pre', 'code']) returned both a <pre>
            # and its nested <code>, duplicating every snippet. Collect
            # <pre> blocks plus only <code> tags that are NOT inside one.
            codes = [block.get_text() for block in content.find_all('pre')]
            codes += [c.get_text() for c in content.find_all('code')
                      if not c.find_parent('pre')]
        else:
            # No recognizable container: fall back to whole-page text.
            text = soup.get_text(separator='\n', strip=True)
            codes = []
        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes,
        }
    except Exception as e:
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise
def process_markdown_file(file_path):
    """Process a local markdown blog file.

    Args:
        file_path: Path to a .md file (UTF-8).

    Returns:
        dict with 'title', 'content' (raw markdown), 'html' (rendered),
        and 'code_snippets' (list of code-block strings).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()
    # Render to HTML so fenced code blocks become <pre><code> tags we
    # can extract with BeautifulSoup.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')
    # Title is the first <h1>, if any.
    title = soup.find('h1')
    title_text = title.get_text() if title else "Untitled"
    # Bug fix: find_all(['pre', 'code']) double-counted snippets because
    # fenced code renders as <pre><code>…</code></pre>. Take each <pre>
    # once, then any <code> that is not nested in a <pre> (inline code).
    codes = [block.get_text() for block in soup.find_all('pre')]
    codes += [c.get_text() for c in soup.find_all('code')
              if not c.find_parent('pre')]
    return {
        'title': title_text,
        'content': md_content,
        'html': html,
        'code_snippets': codes,
    }
def process_html_file(file_path):
    """Process a local HTML blog file.

    Args:
        file_path: Path to an .html file (UTF-8).

    Returns:
        dict with 'title', 'content' (plain text), 'html' (raw source),
        and 'code_snippets' (list of code-block strings).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Look up the <h1> once (was queried twice) with a fallback title.
    h1 = soup.find('h1')
    title = h1.get_text() if h1 else "Untitled"
    text = soup.get_text(separator='\n', strip=True)
    # Bug fix: find_all(['pre', 'code']) returned both a <pre> and its
    # nested <code>, duplicating snippets. Take <pre> blocks plus only
    # <code> tags that are not inside a <pre>.
    codes = [block.get_text() for block in soup.find_all('pre')]
    codes += [c.get_text() for c in soup.find_all('code')
              if not c.find_parent('pre')]
    return {
        'title': title,
        'content': text,
        'html': html_content,
        'code_snippets': codes,
    }
def process_text_file(file_path):
    """Process a plain-text blog file.

    Plain text carries no markup, so the title is a fixed placeholder
    and no code snippets can be detected.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()
    return {
        'title': 'Blog Post',
        'content': body,
        'code_snippets': [],
    }
def main(args):
    """Dispatch to the appropriate processor and save the result as JSON.

    Exactly one of args.url / args.input_path is expected; URL takes
    precedence when both are given. Writes args.output_json_path.
    """
    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    elif args.input_path:
        print(f"[INFO] Processing local file: {args.input_path}")
        # Compare the extension case-insensitively so files such as
        # POST.MD or Index.HTML are routed correctly; also accept the
        # common .markdown and .htm variants.
        lowered = args.input_path.lower()
        if lowered.endswith(('.md', '.markdown')):
            blog_data = process_markdown_file(args.input_path)
        elif lowered.endswith(('.html', '.htm')):
            blog_data = process_html_file(args.input_path)
        else:
            # Any other extension is treated as plain text.
            blog_data = process_text_file(args.input_path)
    else:
        print("[ERROR] Must provide either --url or --input_path")
        return
    # Save as JSON; ensure_ascii=False keeps non-ASCII text readable.
    with open(args.output_json_path, 'w', encoding='utf-8') as f:
        json.dump(blog_data, f, indent=2, ensure_ascii=False)
    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")
if __name__ == "__main__":
    # CLI entry point: accept either a remote URL or a local file path.
    cli = argparse.ArgumentParser(
        description="Process blog posts into JSON format for Blog2Code")
    cli.add_argument("--url", type=str,
                     help="Blog URL to fetch")
    cli.add_argument("--input_path", type=str,
                     help="Local blog file path (.md, .html, or .txt)")
    cli.add_argument("--output_json_path", type=str, required=True,
                     help="Output JSON file path")
    main(cli.parse_args())