# blog2code-api / codes / 0_blog_process.py
# Author: srishtichugh
# initial commit (2fd8593)
import json
import argparse
import requests
from bs4 import BeautifulSoup
import markdown
from urllib.parse import urlparse
def fetch_blog_from_url(url):
    """Fetch a blog post from *url* and extract its title, text, and code snippets.

    Args:
        url: HTTP(S) address of the blog post.

    Returns:
        dict with keys 'title', 'url', 'content', and 'code_snippets'.

    Raises:
        requests.RequestException: on network failures or non-2xx responses
        (re-raised after logging).
    """
    try:
        # Send a browser-like User-Agent to avoid 403 errors from sites like Medium.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Title comes from the first <h1>; fall back to a placeholder.
        heading = soup.find('h1')
        title = heading.get_text() if heading else "Untitled"

        # Common content selectors - try multiple strategies, most specific first.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))

        if content:
            text = content.get_text(separator='\n', strip=True)
            # Collect every <pre>, plus any <code> NOT nested inside a <pre>.
            # Searching for both tags at once (find_all(['pre', 'code'])) would
            # return each fenced snippet twice, since fenced code renders as
            # <pre><code>...</code></pre>.
            code_blocks = content.find_all('pre') + [
                c for c in content.find_all('code') if c.find_parent('pre') is None
            ]
            codes = [block.get_text() for block in code_blocks]
        else:
            text = soup.get_text(separator='\n', strip=True)
            codes = []

        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes
        }
    except Exception as e:
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise
def process_markdown_file(file_path):
    """Process a markdown blog file into the standard blog-data dictionary.

    Args:
        file_path: path to a UTF-8 markdown (.md) file.

    Returns:
        dict with keys 'title', 'content' (raw markdown), 'html'
        (rendered markdown), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Convert markdown to HTML so we can extract structure with BeautifulSoup.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')

    # Title is the first h1, if any.
    title = soup.find('h1')
    title_text = title.get_text() if title else "Untitled"

    # Collect every <pre>, plus any <code> NOT nested inside a <pre>.
    # The fenced_code extension renders fences as <pre><code>...</code></pre>,
    # so matching both tags at once would duplicate every snippet.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title_text,
        'content': md_content,
        'html': html,
        'code_snippets': codes
    }
def process_html_file(file_path):
    """Process an HTML blog file into the standard blog-data dictionary.

    Args:
        file_path: path to a UTF-8 HTML file.

    Returns:
        dict with keys 'title', 'content' (extracted text), 'html'
        (original source), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Title is the first h1, if any.
    heading = soup.find('h1')
    title = heading.get_text() if heading else "Untitled"
    text = soup.get_text(separator='\n', strip=True)

    # Collect every <pre>, plus any <code> NOT nested inside a <pre>.
    # Matching both tags at once (find_all(['pre', 'code'])) would return
    # <pre><code>...</code></pre> snippets twice.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title,
        'content': text,
        'html': html_content,
        'code_snippets': codes
    }
def process_text_file(file_path):
    """Load a plain-text blog post into the standard blog-data dictionary.

    Plain text carries no markup, so the title is a fixed placeholder and
    no code snippets are extracted.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()
    return {'title': 'Blog Post', 'content': body, 'code_snippets': []}
def main(args):
    """Ingest a blog post (from a URL or a local file) and save it as JSON.

    Dispatches on the CLI arguments: --url takes priority over --input_path.
    Local files are routed by extension (.md, .html, anything else = plain text).
    """
    # Guard clause: nothing to do without a source.
    if not args.url and not args.input_path:
        print("[ERROR] Must provide either --url or --input_path")
        return

    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    else:
        print(f"[INFO] Processing local file: {args.input_path}")
        # Route by file extension; unknown extensions are treated as plain text.
        handlers = {'.md': process_markdown_file, '.html': process_html_file}
        handler = process_text_file
        for ext, fn in handlers.items():
            if args.input_path.endswith(ext):
                handler = fn
                break
        blog_data = handler(args.input_path)

    # Persist the extracted data as pretty-printed JSON.
    with open(args.output_json_path, 'w', encoding='utf-8') as f:
        json.dump(blog_data, f, indent=2, ensure_ascii=False)

    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")
if __name__ == "__main__":
    # Command-line entry point: build the argument parser and hand off to main().
    cli = argparse.ArgumentParser(description="Process blog posts into JSON format for Blog2Code")
    cli.add_argument("--url", type=str, help="Blog URL to fetch")
    cli.add_argument("--input_path", type=str, help="Local blog file path (.md, .html, or .txt)")
    cli.add_argument("--output_json_path", type=str, required=True, help="Output JSON file path")
    main(cli.parse_args())