import json
import argparse
import requests
from bs4 import BeautifulSoup
import markdown
from urllib.parse import urlparse
def fetch_blog_from_url(url):
    """Fetch a blog post from *url* and extract its text and code.

    Returns a dict with keys 'title', 'url', 'content', 'code_snippets'.
    Raises (after logging) on any fetch/parse failure so callers can abort.
    """
    try:
        # Browser-like User-Agent avoids 403s from sites such as Medium.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the <h1> up once instead of twice.
        h1 = soup.find('h1')
        title = h1.get_text() if h1 else "Untitled"

        # Common content selectors - try multiple strategies in priority order.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))
        if content:
            text = content.get_text(separator='\n', strip=True)
            # Collect code snippets. <pre> usually wraps a nested <code>;
            # counting both would return every snippet twice, so skip
            # <code> elements that live inside a <pre>.
            code_blocks = content.find_all('pre') + [
                c for c in content.find_all('code')
                if c.find_parent('pre') is None
            ]
            codes = [block.get_text() for block in code_blocks]
        else:
            # No known container matched: fall back to whole-page text.
            text = soup.get_text(separator='\n', strip=True)
            codes = []
        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes
        }
    except Exception as e:
        # Log for the CLI user, then re-raise so the failure is not hidden.
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise
def process_markdown_file(file_path):
    """Process a markdown blog file into the common blog-data dict.

    Returns keys: 'title', 'content' (raw markdown), 'html' (rendered),
    'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Render to HTML so structural elements (h1, pre/code) can be queried.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')

    # Title is the first <h1>, if any.
    title = soup.find('h1')
    title_text = title.get_text() if title else "Untitled"

    # Collect code snippets. fenced_code renders <pre><code>...</code></pre>;
    # grabbing both tags would duplicate every fenced block, so skip <code>
    # elements nested inside a <pre>.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title_text,
        'content': md_content,
        'html': html,
        'code_snippets': codes
    }
def process_html_file(file_path):
    """Process an HTML blog file into the common blog-data dict.

    Returns keys: 'title', 'content' (plain text), 'html' (raw source),
    'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look the <h1> up once instead of twice.
    h1 = soup.find('h1')
    title = h1.get_text() if h1 else "Untitled"
    text = soup.get_text(separator='\n', strip=True)

    # Collect code snippets. <pre> usually wraps a nested <code>; counting
    # both tags would return every snippet twice, so skip <code> elements
    # that live inside a <pre>.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title,
        'content': text,
        'html': html_content,
        'code_snippets': codes
    }
def process_text_file(file_path):
    """Process a plain-text blog file into the common blog-data dict.

    Plain text carries no title markup or code fences, so 'title' is a
    fixed placeholder and 'code_snippets' is always empty.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()
    return {'title': 'Blog Post', 'content': body, 'code_snippets': []}
def main(args):
    """Dispatch on input kind, build the blog-data dict, and save it as JSON.

    Exactly one of args.url / args.input_path must be set; otherwise an
    error is printed and the function returns without writing output.
    """
    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    elif args.input_path:
        print(f"[INFO] Processing local file: {args.input_path}")
        # Choose a parser by extension; accept common variant suffixes too.
        if args.input_path.endswith(('.md', '.markdown')):
            blog_data = process_markdown_file(args.input_path)
        elif args.input_path.endswith(('.html', '.htm')):
            blog_data = process_html_file(args.input_path)
        else:
            # Anything else is treated as plain text.
            blog_data = process_text_file(args.input_path)
    else:
        print("[ERROR] Must provide either --url or --input_path")
        return
    # Save as JSON; ensure_ascii=False keeps non-ASCII text human-readable.
    with open(args.output_json_path, 'w', encoding='utf-8') as f:
        json.dump(blog_data, f, indent=2, ensure_ascii=False)
    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")
if __name__ == "__main__":
    # CLI entry point: accepts either a remote URL or a local file path,
    # plus a required output location for the resulting JSON.
    arg_parser = argparse.ArgumentParser(
        description="Process blog posts into JSON format for Blog2Code"
    )
    arg_parser.add_argument("--url", type=str, help="Blog URL to fetch")
    arg_parser.add_argument(
        "--input_path", type=str,
        help="Local blog file path (.md, .html, or .txt)"
    )
    arg_parser.add_argument(
        "--output_json_path", type=str, required=True,
        help="Output JSON file path"
    )
    main(arg_parser.parse_args())