"""Preprocess blog posts (URL, .md, .html, or .txt) into JSON for Blog2Code."""

import json
import argparse

import requests
from bs4 import BeautifulSoup
import markdown
from urllib.parse import urlparse  # NOTE(review): currently unused; kept in case external tooling imports it from here

# Desktop-browser UA string: some hosts (e.g. Medium) answer 403 to the
# default python-requests user agent.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
)


def _extract_code_snippets(root):
    """Return code-snippet texts under *root* without double-counting.

    Fenced code blocks render as ``<pre><code>...</code></pre>``; the old
    ``find_all(['pre', 'code'])`` captured each such block twice (once for
    the <pre>, once again for the nested <code>).  Here every <pre> is taken
    exactly once, and only <code> elements *outside* any <pre> (i.e. inline
    code spans) are added separately.

    Args:
        root: A BeautifulSoup tag or soup to search under.

    Returns:
        list[str]: The text content of each distinct code snippet.
    """
    snippets = [pre.get_text() for pre in root.find_all('pre')]
    snippets.extend(
        code.get_text()
        for code in root.find_all('code')
        if code.find_parent('pre') is None
    )
    return snippets


def _extract_title(soup, default="Untitled"):
    """Return the text of the first <h1> in *soup*, or *default* if absent."""
    h1 = soup.find('h1')
    return h1.get_text() if h1 else default


def fetch_blog_from_url(url):
    """Fetch blog content from URL.

    Args:
        url: The blog post URL to download.

    Returns:
        dict with keys 'title', 'url', 'content', 'code_snippets'.

    Raises:
        Re-raises any network/parse exception after logging it.
    """
    try:
        headers = {'User-Agent': _USER_AGENT}
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        title = _extract_title(soup)

        # Common content containers, tried from most to least specific;
        # adjust selectors for other blog platforms as needed.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))

        if content:
            text = content.get_text(separator='\n', strip=True)
            codes = _extract_code_snippets(content)
        else:
            # No recognizable container: fall back to whole-page text and
            # skip code extraction (page chrome would pollute the snippets).
            text = soup.get_text(separator='\n', strip=True)
            codes = []

        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes,
        }
    except Exception as e:
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise


def process_markdown_file(file_path):
    """Process a Markdown blog file.

    Args:
        file_path: Path to the .md/.markdown file.

    Returns:
        dict with keys 'title', 'content' (raw markdown), 'html'
        (rendered markdown), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Render to HTML so headings and fenced code blocks become parseable tags.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')

    return {
        'title': _extract_title(soup),
        'content': md_content,
        'html': html,
        'code_snippets': _extract_code_snippets(soup),
    }


def process_html_file(file_path):
    """Process an HTML blog file.

    Args:
        file_path: Path to the .html/.htm file.

    Returns:
        dict with keys 'title', 'content' (extracted text), 'html'
        (original markup), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    return {
        'title': _extract_title(soup),
        'content': soup.get_text(separator='\n', strip=True),
        'html': html_content,
        'code_snippets': _extract_code_snippets(soup),
    }


def process_text_file(file_path):
    """Process a plain-text blog file (no structure to extract from).

    Args:
        file_path: Path to the .txt (or unrecognized-extension) file.

    Returns:
        dict with keys 'title', 'content', 'code_snippets' (always empty).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    return {
        'title': 'Blog Post',
        'content': content,
        'code_snippets': [],
    }


def main(args):
    """Dispatch on input kind, build the blog dict, and write it as JSON."""
    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    elif args.input_path:
        print(f"[INFO] Processing local file: {args.input_path}")
        # Case-insensitive dispatch; .markdown/.htm now also recognized.
        path_lower = args.input_path.lower()
        if path_lower.endswith(('.md', '.markdown')):
            blog_data = process_markdown_file(args.input_path)
        elif path_lower.endswith(('.html', '.htm')):
            blog_data = process_html_file(args.input_path)
        else:
            # Anything else is treated as plain text.
            blog_data = process_text_file(args.input_path)
    else:
        print("[ERROR] Must provide either --url or --input_path")
        return

    # Save as JSON (ensure_ascii=False keeps non-ASCII prose readable).
    with open(args.output_json_path, 'w', encoding='utf-8') as f:
        json.dump(blog_data, f, indent=2, ensure_ascii=False)

    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process blog posts into JSON format for Blog2Code")
    parser.add_argument("--url", type=str, help="Blog URL to fetch")
    parser.add_argument("--input_path", type=str, help="Local blog file path (.md, .html, or .txt)")
    parser.add_argument("--output_json_path", type=str, required=True, help="Output JSON file path")
    args = parser.parse_args()
    main(args)