Spaces:

shiva0013
/

YT-AI-Automation

Running

YT-AI-Automation / backend /src /utils /html_beautifier.py

github-actions

Sync Docker Space

5f3e9f5 3 days ago

6.9 kB

	"""HTML beautifier for auto-formatting messy HTML."""
	from html.parser import HTMLParser
	import re


	class HTMLBeautifier:
	"""Format and beautify HTML code."""

	def __init__(self, indent_size=2):
	self.indent_size = indent_size

	def beautify(self, html_content):
	"""
	Beautify HTML content with proper indentation.

	Args:
	html_content: Raw HTML string

	Returns:
	Formatted HTML string
	"""
	try:
	# Remove extra whitespace
	html_content = re.sub(r'\s+', ' ', html_content)
	html_content = html_content.strip()

	# Parse and format
	formatted = self._format_html(html_content)

	return formatted

	except Exception as e:
	print(f"⚠️ Beautification failed: {e}")
	return html_content

	def _format_html(self, html):
	"""Format HTML with proper indentation."""
	# Self-closing tags
	self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img',
	'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']

	# Inline tags that shouldn't cause line breaks
	inline_tags = ['a', 'abbr', 'b', 'bdi', 'bdo', 'cite', 'code', 'data',
	'dfn', 'em', 'i', 'kbd', 'mark', 'q', 's', 'samp', 'small',
	'span', 'strong', 'sub', 'sup', 'time', 'u', 'var']

	result = []
	indent_level = 0
	indent = ' ' * self.indent_size

	# Split by tags
	parts = re.split(r'(<[^>]+>)', html)

	i = 0
	while i < len(parts):
	part = parts[i].strip()

	if not part:
	i += 1
	continue

	# Check if it's a tag
	if part.startswith('<'):
	tag_match = re.match(r'<(/?)(\w+)', part)

	if tag_match:
	is_closing = tag_match.group(1) == '/'
	tag_name = tag_match.group(2).lower()

	# Handle closing tags
	if is_closing:
	indent_level = max(0, indent_level - 1)
	result.append(indent * indent_level + part)

	# Handle self-closing tags
	elif tag_name in self_closing or part.endswith('/>'):
	result.append(indent * indent_level + part)

	# Handle inline tags
	elif tag_name in inline_tags:
	# Keep inline with previous content
	if result and not result[-1].endswith('>'):
	result[-1] += part
	else:
	result.append(indent * indent_level + part)

	# Handle opening tags
	else:
	result.append(indent * indent_level + part)
	indent_level += 1

	# Handle comments and special tags
	elif part.startswith('<!--') or part.startswith('<!'):
	result.append(indent * indent_level + part)

	else:
	result.append(indent * indent_level + part)

	# Handle text content
	else:
	# Check if next part is inline tag
	if i + 1 < len(parts):
	next_part = parts[i + 1].strip()
	if next_part.startswith('<'):
	next_tag_match = re.match(r'<(/?)(\w+)', next_part)
	if next_tag_match:
	next_tag = next_tag_match.group(2).lower()
	if next_tag in inline_tags:
	# Keep inline
	if result:
	result[-1] += part
	else:
	result.append(indent * indent_level + part)
	i += 1
	continue

	result.append(indent * indent_level + part)

	i += 1

	return '\n'.join(result)

	def minify(self, html_content):
	"""
	Minify HTML by removing unnecessary whitespace.

	Args:
	html_content: HTML string

	Returns:
	Minified HTML string
	"""
	try:
	# Remove comments
	html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

	# Remove whitespace between tags
	html_content = re.sub(r'>\s+<', '><', html_content)

	# Remove leading/trailing whitespace
	html_content = re.sub(r'\s+', ' ', html_content)
	html_content = html_content.strip()

	return html_content

	except Exception as e:
	print(f"⚠️ Minification failed: {e}")
	return html_content

	def validate(self, html_content):
	"""
	Basic HTML validation.

	Args:
	html_content: HTML string

	Returns:
	dict with validation results
	"""
	issues = []

	# Check for unclosed tags
	opening_tags = re.findall(r'<(\w+)[^>]*>', html_content)
	closing_tags = re.findall(r'</(\w+)>', html_content)

	self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img',
	'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']

	# Filter out self-closing tags
	opening_tags = [tag for tag in opening_tags if tag.lower() not in self_closing]

	# Check balance
	for tag in set(opening_tags):
	open_count = opening_tags.count(tag)
	close_count = closing_tags.count(tag)

	if open_count != close_count:
	issues.append(f"Unbalanced <{tag}> tags: {open_count} opening, {close_count} closing")

	# Check for common issues
	if '<html' not in html_content.lower():
	issues.append("Missing <html> tag")

	if '<head' not in html_content.lower():
	issues.append("Missing <head> tag")

	if '<body' not in html_content.lower():
	issues.append("Missing <body> tag")

	return {
	'valid': len(issues) == 0,
	'issues': issues,
	'tag_count': len(opening_tags) + len(closing_tags)
	}