File size: 4,827 Bytes
2fd8593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import json
import argparse
import requests
from bs4 import BeautifulSoup
import markdown
from urllib.parse import urlparse

def fetch_blog_from_url(url):
    """Download a blog post from *url* and extract title, text, and code.

    Args:
        url: The blog post URL to download.

    Returns:
        dict with keys 'title', 'url', 'content' (plain text), and
        'code_snippets' (list of code-block strings).

    Raises:
        requests.RequestException: on network failure or non-2xx status
            (logged, then re-raised).
    """
    try:
        # A browser-like User-Agent avoids 403 responses from sites like Medium.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Title comes from the first <h1>, if the page has one.
        h1 = soup.find('h1')
        title = h1.get_text(strip=True) if h1 else "Untitled"

        # Common content containers - try several selectors so different
        # blog platforms are handled.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))

        if content:
            text = content.get_text(separator='\n', strip=True)
            # Collect <pre> blocks, plus <code> tags NOT nested inside a
            # <pre>. Searching for both tags at once would return each
            # fenced block twice (once as <pre>, once as its inner <code>).
            codes = [block.get_text() for block in content.find_all('pre')]
            codes += [c.get_text() for c in content.find_all('code')
                      if c.find_parent('pre') is None]
        else:
            # No recognizable container: fall back to the whole page text.
            text = soup.get_text(separator='\n', strip=True)
            codes = []

        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes
        }
    except Exception as e:
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise

def process_markdown_file(file_path):
    """Parse a Markdown blog file into title, raw content, HTML, and code.

    Args:
        file_path: Path to the .md file.

    Returns:
        dict with keys 'title', 'content' (raw markdown), 'html'
        (rendered markup), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Render to HTML so headings and code fences can be located structurally.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')

    # Title is the first rendered <h1> (a leading '# ...' line), if present.
    title = soup.find('h1')
    title_text = title.get_text() if title else "Untitled"

    # Collect <pre> blocks (fenced code renders as <pre><code>), plus inline
    # <code> not nested in a <pre>. Matching both tags in one find_all would
    # count every fenced block twice.
    codes = [block.get_text() for block in soup.find_all('pre')]
    codes += [c.get_text() for c in soup.find_all('code')
              if c.find_parent('pre') is None]

    return {
        'title': title_text,
        'content': md_content,
        'html': html,
        'code_snippets': codes
    }

def process_html_file(file_path):
    """Parse an HTML blog file into title, plain text, raw HTML, and code.

    Args:
        file_path: Path to the .html file.

    Returns:
        dict with keys 'title', 'content' (extracted text), 'html'
        (original markup), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Title comes from the first <h1>, if the document has one.
    h1 = soup.find('h1')
    title = h1.get_text() if h1 else "Untitled"
    text = soup.get_text(separator='\n', strip=True)

    # Collect <pre> blocks, plus <code> tags not nested inside a <pre>;
    # searching for both tags at once would return each <pre><code> block
    # twice (once per tag), duplicating every snippet.
    codes = [block.get_text() for block in soup.find_all('pre')]
    codes += [c.get_text() for c in soup.find_all('code')
              if c.find_parent('pre') is None]

    return {
        'title': title,
        'content': text,
        'html': html_content,
        'code_snippets': codes
    }

def process_text_file(file_path):
    """Read a plain-text blog post.

    Args:
        file_path: Path to the .txt (or other plain-text) file.

    Returns:
        dict with a generic title, the raw file content, and an empty
        code-snippet list (plain text carries no code markup to extract).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()

    return {'title': 'Blog Post', 'content': body, 'code_snippets': []}

def main(args):
    """Ingest a blog post from a URL or local file and write it as JSON.

    Exactly one of args.url / args.input_path selects the source; the
    result dict is serialized to args.output_json_path. Prints an error
    and returns without writing when neither source is given.
    """
    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    elif args.input_path:
        print(f"[INFO] Processing local file: {args.input_path}")
        path = args.input_path
        # Pick the parser by file extension; anything unrecognized is
        # treated as plain text.
        if path.endswith('.md'):
            parser_fn = process_markdown_file
        elif path.endswith('.html'):
            parser_fn = process_html_file
        else:
            parser_fn = process_text_file
        blog_data = parser_fn(path)
    else:
        print("[ERROR] Must provide either --url or --input_path")
        return

    # Persist the extracted data; ensure_ascii=False keeps non-ASCII
    # characters readable in the output file.
    with open(args.output_json_path, 'w', encoding='utf-8') as out:
        json.dump(blog_data, out, indent=2, ensure_ascii=False)

    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")

if __name__ == "__main__":
    # CLI entry point: exactly one of --url / --input_path selects the
    # source; --output_json_path is always required.
    parser = argparse.ArgumentParser(description="Process blog posts into JSON format for Blog2Code")
    parser.add_argument("--url", type=str, help="Blog URL to fetch")
    parser.add_argument("--input_path", type=str, help="Local blog file path (.md, .html, or .txt)")
    parser.add_argument("--output_json_path", type=str, required=True, help="Output JSON file path")
    
    args = parser.parse_args()
    main(args)