YT-AI-Automation / backend /src /utils /html_beautifier.py
github-actions
Sync Docker Space
5f3e9f5
"""HTML beautifier for auto-formatting messy HTML."""
from html.parser import HTMLParser
import re
class HTMLBeautifier:
"""Format and beautify HTML code."""
def __init__(self, indent_size=2):
self.indent_size = indent_size
def beautify(self, html_content):
"""
Beautify HTML content with proper indentation.
Args:
html_content: Raw HTML string
Returns:
Formatted HTML string
"""
try:
# Remove extra whitespace
html_content = re.sub(r'\s+', ' ', html_content)
html_content = html_content.strip()
# Parse and format
formatted = self._format_html(html_content)
return formatted
except Exception as e:
print(f"⚠️ Beautification failed: {e}")
return html_content
def _format_html(self, html):
"""Format HTML with proper indentation."""
# Self-closing tags
self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img',
'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
# Inline tags that shouldn't cause line breaks
inline_tags = ['a', 'abbr', 'b', 'bdi', 'bdo', 'cite', 'code', 'data',
'dfn', 'em', 'i', 'kbd', 'mark', 'q', 's', 'samp', 'small',
'span', 'strong', 'sub', 'sup', 'time', 'u', 'var']
result = []
indent_level = 0
indent = ' ' * self.indent_size
# Split by tags
parts = re.split(r'(<[^>]+>)', html)
i = 0
while i < len(parts):
part = parts[i].strip()
if not part:
i += 1
continue
# Check if it's a tag
if part.startswith('<'):
tag_match = re.match(r'<(/?)(\w+)', part)
if tag_match:
is_closing = tag_match.group(1) == '/'
tag_name = tag_match.group(2).lower()
# Handle closing tags
if is_closing:
indent_level = max(0, indent_level - 1)
result.append(indent * indent_level + part)
# Handle self-closing tags
elif tag_name in self_closing or part.endswith('/>'):
result.append(indent * indent_level + part)
# Handle inline tags
elif tag_name in inline_tags:
# Keep inline with previous content
if result and not result[-1].endswith('>'):
result[-1] += part
else:
result.append(indent * indent_level + part)
# Handle opening tags
else:
result.append(indent * indent_level + part)
indent_level += 1
# Handle comments and special tags
elif part.startswith('<!--') or part.startswith('<!'):
result.append(indent * indent_level + part)
else:
result.append(indent * indent_level + part)
# Handle text content
else:
# Check if next part is inline tag
if i + 1 < len(parts):
next_part = parts[i + 1].strip()
if next_part.startswith('<'):
next_tag_match = re.match(r'<(/?)(\w+)', next_part)
if next_tag_match:
next_tag = next_tag_match.group(2).lower()
if next_tag in inline_tags:
# Keep inline
if result:
result[-1] += part
else:
result.append(indent * indent_level + part)
i += 1
continue
result.append(indent * indent_level + part)
i += 1
return '\n'.join(result)
def minify(self, html_content):
"""
Minify HTML by removing unnecessary whitespace.
Args:
html_content: HTML string
Returns:
Minified HTML string
"""
try:
# Remove comments
html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)
# Remove whitespace between tags
html_content = re.sub(r'>\s+<', '><', html_content)
# Remove leading/trailing whitespace
html_content = re.sub(r'\s+', ' ', html_content)
html_content = html_content.strip()
return html_content
except Exception as e:
print(f"⚠️ Minification failed: {e}")
return html_content
def validate(self, html_content):
"""
Basic HTML validation.
Args:
html_content: HTML string
Returns:
dict with validation results
"""
issues = []
# Check for unclosed tags
opening_tags = re.findall(r'<(\w+)[^>]*>', html_content)
closing_tags = re.findall(r'</(\w+)>', html_content)
self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img',
'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
# Filter out self-closing tags
opening_tags = [tag for tag in opening_tags if tag.lower() not in self_closing]
# Check balance
for tag in set(opening_tags):
open_count = opening_tags.count(tag)
close_count = closing_tags.count(tag)
if open_count != close_count:
issues.append(f"Unbalanced <{tag}> tags: {open_count} opening, {close_count} closing")
# Check for common issues
if '<html' not in html_content.lower():
issues.append("Missing <html> tag")
if '<head' not in html_content.lower():
issues.append("Missing <head> tag")
if '<body' not in html_content.lower():
issues.append("Missing <body> tag")
return {
'valid': len(issues) == 0,
'issues': issues,
'tag_count': len(opening_tags) + len(closing_tags)
}