Spaces:
Running
Running
| """HTML beautifier for auto-formatting messy HTML.""" | |
| from html.parser import HTMLParser | |
| import re | |
| class HTMLBeautifier: | |
| """Format and beautify HTML code.""" | |
| def __init__(self, indent_size=2): | |
| self.indent_size = indent_size | |
| def beautify(self, html_content): | |
| """ | |
| Beautify HTML content with proper indentation. | |
| Args: | |
| html_content: Raw HTML string | |
| Returns: | |
| Formatted HTML string | |
| """ | |
| try: | |
| # Remove extra whitespace | |
| html_content = re.sub(r'\s+', ' ', html_content) | |
| html_content = html_content.strip() | |
| # Parse and format | |
| formatted = self._format_html(html_content) | |
| return formatted | |
| except Exception as e: | |
| print(f"⚠️ Beautification failed: {e}") | |
| return html_content | |
| def _format_html(self, html): | |
| """Format HTML with proper indentation.""" | |
| # Self-closing tags | |
| self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', | |
| 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'] | |
| # Inline tags that shouldn't cause line breaks | |
| inline_tags = ['a', 'abbr', 'b', 'bdi', 'bdo', 'cite', 'code', 'data', | |
| 'dfn', 'em', 'i', 'kbd', 'mark', 'q', 's', 'samp', 'small', | |
| 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var'] | |
| result = [] | |
| indent_level = 0 | |
| indent = ' ' * self.indent_size | |
| # Split by tags | |
| parts = re.split(r'(<[^>]+>)', html) | |
| i = 0 | |
| while i < len(parts): | |
| part = parts[i].strip() | |
| if not part: | |
| i += 1 | |
| continue | |
| # Check if it's a tag | |
| if part.startswith('<'): | |
| tag_match = re.match(r'<(/?)(\w+)', part) | |
| if tag_match: | |
| is_closing = tag_match.group(1) == '/' | |
| tag_name = tag_match.group(2).lower() | |
| # Handle closing tags | |
| if is_closing: | |
| indent_level = max(0, indent_level - 1) | |
| result.append(indent * indent_level + part) | |
| # Handle self-closing tags | |
| elif tag_name in self_closing or part.endswith('/>'): | |
| result.append(indent * indent_level + part) | |
| # Handle inline tags | |
| elif tag_name in inline_tags: | |
| # Keep inline with previous content | |
| if result and not result[-1].endswith('>'): | |
| result[-1] += part | |
| else: | |
| result.append(indent * indent_level + part) | |
| # Handle opening tags | |
| else: | |
| result.append(indent * indent_level + part) | |
| indent_level += 1 | |
| # Handle comments and special tags | |
| elif part.startswith('<!--') or part.startswith('<!'): | |
| result.append(indent * indent_level + part) | |
| else: | |
| result.append(indent * indent_level + part) | |
| # Handle text content | |
| else: | |
| # Check if next part is inline tag | |
| if i + 1 < len(parts): | |
| next_part = parts[i + 1].strip() | |
| if next_part.startswith('<'): | |
| next_tag_match = re.match(r'<(/?)(\w+)', next_part) | |
| if next_tag_match: | |
| next_tag = next_tag_match.group(2).lower() | |
| if next_tag in inline_tags: | |
| # Keep inline | |
| if result: | |
| result[-1] += part | |
| else: | |
| result.append(indent * indent_level + part) | |
| i += 1 | |
| continue | |
| result.append(indent * indent_level + part) | |
| i += 1 | |
| return '\n'.join(result) | |
| def minify(self, html_content): | |
| """ | |
| Minify HTML by removing unnecessary whitespace. | |
| Args: | |
| html_content: HTML string | |
| Returns: | |
| Minified HTML string | |
| """ | |
| try: | |
| # Remove comments | |
| html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL) | |
| # Remove whitespace between tags | |
| html_content = re.sub(r'>\s+<', '><', html_content) | |
| # Remove leading/trailing whitespace | |
| html_content = re.sub(r'\s+', ' ', html_content) | |
| html_content = html_content.strip() | |
| return html_content | |
| except Exception as e: | |
| print(f"⚠️ Minification failed: {e}") | |
| return html_content | |
| def validate(self, html_content): | |
| """ | |
| Basic HTML validation. | |
| Args: | |
| html_content: HTML string | |
| Returns: | |
| dict with validation results | |
| """ | |
| issues = [] | |
| # Check for unclosed tags | |
| opening_tags = re.findall(r'<(\w+)[^>]*>', html_content) | |
| closing_tags = re.findall(r'</(\w+)>', html_content) | |
| self_closing = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', | |
| 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'] | |
| # Filter out self-closing tags | |
| opening_tags = [tag for tag in opening_tags if tag.lower() not in self_closing] | |
| # Check balance | |
| for tag in set(opening_tags): | |
| open_count = opening_tags.count(tag) | |
| close_count = closing_tags.count(tag) | |
| if open_count != close_count: | |
| issues.append(f"Unbalanced <{tag}> tags: {open_count} opening, {close_count} closing") | |
| # Check for common issues | |
| if '<html' not in html_content.lower(): | |
| issues.append("Missing <html> tag") | |
| if '<head' not in html_content.lower(): | |
| issues.append("Missing <head> tag") | |
| if '<body' not in html_content.lower(): | |
| issues.append("Missing <body> tag") | |
| return { | |
| 'valid': len(issues) == 0, | |
| 'issues': issues, | |
| 'tag_count': len(opening_tags) + len(closing_tags) | |
| } | |