import json
import argparse
import requests
from bs4 import BeautifulSoup
import markdown
from urllib.parse import urlparse
def fetch_blog_from_url(url):
    """Fetch a blog post from *url* and extract its text and code.

    Returns a dict with keys 'title', 'url', 'content', 'code_snippets'.
    Raises (after logging) on any fetch/parse failure so callers can abort.
    """
    try:
        # Browser-like User-Agent avoids 403s from sites such as Medium.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the <h1> up once instead of twice.
        h1 = soup.find('h1')
        title = h1.get_text() if h1 else "Untitled"

        # Common content selectors - try multiple strategies in priority order.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))
        if content:
            text = content.get_text(separator='\n', strip=True)
            # Collect code snippets. <pre> usually wraps a nested <code>;
            # counting both would return every snippet twice, so skip
            # <code> elements that live inside a <pre>.
            code_blocks = content.find_all('pre') + [
                c for c in content.find_all('code')
                if c.find_parent('pre') is None
            ]
            codes = [block.get_text() for block in code_blocks]
        else:
            # No known container matched: fall back to whole-page text.
            text = soup.get_text(separator='\n', strip=True)
            codes = []
        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes
        }
    except Exception as e:
        # Log for the CLI user, then re-raise so the failure is not hidden.
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise
def process_markdown_file(file_path):
    """Process a markdown blog file into the common blog-data dict.

    Returns keys: 'title', 'content' (raw markdown), 'html' (rendered),
    'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Render to HTML so structural elements (h1, pre/code) can be queried.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')

    # Title is the first <h1>, if any.
    title = soup.find('h1')
    title_text = title.get_text() if title else "Untitled"

    # Collect code snippets. fenced_code renders <pre><code>...</code></pre>;
    # grabbing both tags would duplicate every fenced block, so skip <code>
    # elements nested inside a <pre>.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title_text,
        'content': md_content,
        'html': html,
        'code_snippets': codes
    }
def process_html_file(file_path):
    """Process an HTML blog file into the common blog-data dict.

    Returns keys: 'title', 'content' (plain text), 'html' (raw source),
    'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look the <h1> up once instead of twice.
    h1 = soup.find('h1')
    title = h1.get_text() if h1 else "Untitled"
    text = soup.get_text(separator='\n', strip=True)

    # Collect code snippets. <pre> usually wraps a nested <code>; counting
    # both tags would return every snippet twice, so skip <code> elements
    # that live inside a <pre>.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title,
        'content': text,
        'html': html_content,
        'code_snippets': codes
    }
def process_text_file(file_path):
    """Process a plain-text blog file into the common blog-data dict.

    Plain text carries no title markup or code fences, so 'title' is a
    fixed placeholder and 'code_snippets' is always empty.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()
    return {'title': 'Blog Post', 'content': body, 'code_snippets': []}
def main(args):
    """Dispatch on input kind, build the blog-data dict, and save it as JSON.

    Exactly one of args.url / args.input_path must be set; otherwise an
    error is printed and the function returns without writing output.
    """
    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    elif args.input_path:
        print(f"[INFO] Processing local file: {args.input_path}")
        # Choose a parser by extension; accept common variant suffixes too.
        if args.input_path.endswith(('.md', '.markdown')):
            blog_data = process_markdown_file(args.input_path)
        elif args.input_path.endswith(('.html', '.htm')):
            blog_data = process_html_file(args.input_path)
        else:
            # Anything else is treated as plain text.
            blog_data = process_text_file(args.input_path)
    else:
        print("[ERROR] Must provide either --url or --input_path")
        return
    # Save as JSON; ensure_ascii=False keeps non-ASCII text human-readable.
    with open(args.output_json_path, 'w', encoding='utf-8') as f:
        json.dump(blog_data, f, indent=2, ensure_ascii=False)
    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")
if __name__ == "__main__":
    # CLI entry point: accepts either a remote URL or a local file path,
    # plus a required output location for the resulting JSON.
    arg_parser = argparse.ArgumentParser(
        description="Process blog posts into JSON format for Blog2Code"
    )
    arg_parser.add_argument("--url", type=str, help="Blog URL to fetch")
    arg_parser.add_argument(
        "--input_path", type=str,
        help="Local blog file path (.md, .html, or .txt)"
    )
    arg_parser.add_argument(
        "--output_json_path", type=str, required=True,
        help="Output JSON file path"
    )
    main(arg_parser.parse_args())