# blog2code-api / codes / 0_blog_process.py
# Author: srishtichugh
# initial commit (2fd8593)
import json
import argparse
import requests
from bs4 import BeautifulSoup
import markdown
from urllib.parse import urlparse
def fetch_blog_from_url(url):
    """Fetch a blog post from *url* and extract its title, text, and code snippets.

    Args:
        url: HTTP(S) address of the blog post.

    Returns:
        dict with keys 'title', 'url', 'content', and 'code_snippets'.

    Raises:
        requests.RequestException: on network failures or non-2xx responses
        (re-raised after logging).
    """
    try:
        # Send a browser-like User-Agent to avoid 403 errors from sites like Medium.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Title comes from the first <h1>; fall back to a placeholder.
        heading = soup.find('h1')
        title = heading.get_text() if heading else "Untitled"

        # Common content selectors - try multiple strategies, most specific first.
        content = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_='content') or
                   soup.find('div', class_='post-content') or
                   soup.find('div', class_='entry-content'))

        if content:
            text = content.get_text(separator='\n', strip=True)
            # Collect every <pre>, plus any <code> NOT nested inside a <pre>.
            # Searching for both tags at once (find_all(['pre', 'code'])) would
            # return each fenced snippet twice, since fenced code renders as
            # <pre><code>...</code></pre>.
            code_blocks = content.find_all('pre') + [
                c for c in content.find_all('code') if c.find_parent('pre') is None
            ]
            codes = [block.get_text() for block in code_blocks]
        else:
            text = soup.get_text(separator='\n', strip=True)
            codes = []

        return {
            'title': title,
            'url': url,
            'content': text,
            'code_snippets': codes
        }
    except Exception as e:
        print(f"[ERROR] Failed to fetch URL: {e}")
        raise
def process_markdown_file(file_path):
    """Process a markdown blog file into the standard blog-data dictionary.

    Args:
        file_path: path to a UTF-8 markdown (.md) file.

    Returns:
        dict with keys 'title', 'content' (raw markdown), 'html'
        (rendered markdown), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Convert markdown to HTML so we can extract structure with BeautifulSoup.
    html = markdown.markdown(md_content, extensions=['fenced_code', 'codehilite'])
    soup = BeautifulSoup(html, 'html.parser')

    # Title is the first h1, if any.
    title = soup.find('h1')
    title_text = title.get_text() if title else "Untitled"

    # Collect every <pre>, plus any <code> NOT nested inside a <pre>.
    # The fenced_code extension renders fences as <pre><code>...</code></pre>,
    # so matching both tags at once would duplicate every snippet.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title_text,
        'content': md_content,
        'html': html,
        'code_snippets': codes
    }
def process_html_file(file_path):
    """Process an HTML blog file into the standard blog-data dictionary.

    Args:
        file_path: path to a UTF-8 HTML file.

    Returns:
        dict with keys 'title', 'content' (extracted text), 'html'
        (original source), and 'code_snippets'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Title is the first h1, if any.
    heading = soup.find('h1')
    title = heading.get_text() if heading else "Untitled"
    text = soup.get_text(separator='\n', strip=True)

    # Collect every <pre>, plus any <code> NOT nested inside a <pre>.
    # Matching both tags at once (find_all(['pre', 'code'])) would return
    # <pre><code>...</code></pre> snippets twice.
    code_blocks = soup.find_all('pre') + [
        c for c in soup.find_all('code') if c.find_parent('pre') is None
    ]
    codes = [block.get_text() for block in code_blocks]

    return {
        'title': title,
        'content': text,
        'html': html_content,
        'code_snippets': codes
    }
def process_text_file(file_path):
    """Load a plain-text blog post into the standard blog-data dictionary.

    Plain text carries no markup, so the title is a fixed placeholder and
    no code snippets are extracted.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()
    return {'title': 'Blog Post', 'content': body, 'code_snippets': []}
def main(args):
    """Ingest a blog post (from a URL or a local file) and save it as JSON.

    Dispatches on the CLI arguments: --url takes priority over --input_path.
    Local files are routed by extension (.md, .html, anything else = plain text).
    """
    # Guard clause: nothing to do without a source.
    if not args.url and not args.input_path:
        print("[ERROR] Must provide either --url or --input_path")
        return

    if args.url:
        print(f"[INFO] Fetching blog from URL: {args.url}")
        blog_data = fetch_blog_from_url(args.url)
    else:
        print(f"[INFO] Processing local file: {args.input_path}")
        # Route by file extension; unknown extensions are treated as plain text.
        handlers = {'.md': process_markdown_file, '.html': process_html_file}
        handler = process_text_file
        for ext, fn in handlers.items():
            if args.input_path.endswith(ext):
                handler = fn
                break
        blog_data = handler(args.input_path)

    # Persist the extracted data as pretty-printed JSON.
    with open(args.output_json_path, 'w', encoding='utf-8') as f:
        json.dump(blog_data, f, indent=2, ensure_ascii=False)

    print(f"[SAVED] {args.output_json_path}")
    print(f"[INFO] Title: {blog_data['title']}")
    print(f"[INFO] Content length: {len(blog_data['content'])} characters")
    print(f"[INFO] Code snippets found: {len(blog_data.get('code_snippets', []))}")
if __name__ == "__main__":
    # Command-line entry point: build the argument parser and hand off to main().
    cli = argparse.ArgumentParser(description="Process blog posts into JSON format for Blog2Code")
    cli.add_argument("--url", type=str, help="Blog URL to fetch")
    cli.add_argument("--input_path", type=str, help="Local blog file path (.md, .html, or .txt)")
    cli.add_argument("--output_json_path", type=str, required=True, help="Output JSON file path")
    main(cli.parse_args())