File size: 6,902 Bytes
5f3e9f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""HTML beautifier for auto-formatting messy HTML."""
from html.parser import HTMLParser
import re


class HTMLBeautifier:
    """Format, minify and sanity-check HTML using regex-based tokenising.

    This is a lightweight text transformer, not a real HTML parser: markup
    is split on ``<...>`` tokens and tags are classified by name only.
    Whitespace is collapsed everywhere, so the content of whitespace-sensitive
    elements (``<pre>``, ``<textarea>``, ``<script>``, ``<style>``) is not
    preserved verbatim.
    """

    # Void elements never take a closing tag, so they must not open a new
    # indentation level or take part in balance checks.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    # Inline (phrasing) elements stay on the same line as surrounding text.
    _INLINE_TAGS = frozenset((
        'a', 'abbr', 'b', 'bdi', 'bdo', 'cite', 'code', 'data',
        'dfn', 'em', 'i', 'kbd', 'mark', 'q', 's', 'samp', 'small',
        'span', 'strong', 'sub', 'sup', 'time', 'u', 'var',
    ))

    # Comments are matched first so a '>' inside a comment does not split it.
    _COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
    _TOKEN_RE = re.compile(r'(<!--.*?-->|<[^>]+>)', re.DOTALL)
    # Optional '/', then the tag name (hyphens/colons allowed for custom
    # elements and namespaced tags).
    _TAG_HEAD_RE = re.compile(r'<(/?)(\w[\w:-]*)')
    # Same as above but also captures the attribute tail up to '>'.
    _TAG_RE = re.compile(r'<(/?)(\w[\w:-]*)([^>]*)>')

    def __init__(self, indent_size=2):
        """Create a beautifier.

        Args:
            indent_size: Number of spaces used per nesting level.
        """
        self.indent_size = indent_size

    def beautify(self, html_content):
        """
        Beautify HTML content with proper indentation.

        All runs of whitespace are collapsed to a single space before
        formatting. This is a best-effort operation: if anything goes
        wrong a warning is printed and the (possibly whitespace-collapsed)
        input is returned instead of raising.

        Args:
            html_content: Raw HTML string

        Returns:
            Formatted HTML string
        """
        try:
            # Normalise whitespace so the original layout does not leak
            # into the re-indented output.
            html_content = re.sub(r'\s+', ' ', html_content)
            html_content = html_content.strip()

            return self._format_html(html_content)

        except Exception as e:
            print(f"⚠️  Beautification failed: {e}")
            return html_content

    @staticmethod
    def _flush_inline(run, lines, prefix):
        """Emit the accumulated inline run as one indented line, then reset it."""
        text = ''.join(run).strip()
        run.clear()
        if text:
            lines.append(prefix + text)

    def _format_html(self, html):
        """Format HTML with proper indentation.

        Block-level tags, void tags, comments and declarations each get
        their own line. Text and inline tags (opening *and* closing) are
        accumulated into a single line so that the spacing between words
        and inline elements is preserved and inline closing tags do not
        disturb the indentation depth.

        Args:
            html: HTML string, expected to be whitespace-normalised.

        Returns:
            The formatted HTML, lines joined with ``\\n``.
        """
        pad = ' ' * self.indent_size
        lines = []
        inline_run = []  # pieces of the line currently being assembled
        depth = 0

        # re.split with one capture group alternates text (even indexes)
        # and markup tokens (odd indexes).
        for index, token in enumerate(self._TOKEN_RE.split(html)):
            if index % 2 == 0:
                if token:
                    inline_run.append(token)
                continue

            head = self._TAG_HEAD_RE.match(token)

            # Comments, doctype and other declarations: own line, no nesting.
            if head is None:
                self._flush_inline(inline_run, lines, pad * depth)
                lines.append(pad * depth + token)
                continue

            is_closing = head.group(1) == '/'
            tag_name = head.group(2).lower()

            # Inline tags never open a level, so neither their opening nor
            # their closing form may change the depth.
            if tag_name in self._INLINE_TAGS:
                inline_run.append(token)
                continue

            self._flush_inline(inline_run, lines, pad * depth)
            is_void = tag_name in self._VOID_TAGS

            if is_closing:
                # A stray closing void tag (e.g. </br>) closes nothing.
                if not is_void:
                    depth = max(0, depth - 1)
                lines.append(pad * depth + token)
            else:
                lines.append(pad * depth + token)
                if not is_void and not token.endswith('/>'):
                    depth += 1

        # Trailing text / inline content after the last block tag.
        self._flush_inline(inline_run, lines, pad * depth)

        return '\n'.join(lines)

    def minify(self, html_content):
        """
        Minify HTML by removing unnecessary whitespace.

        Comments are stripped, whitespace between adjacent tags is removed
        entirely (including between inline elements, which can change the
        rendered spacing), and every remaining whitespace run is collapsed
        to one space. Best-effort: on failure a warning is printed and the
        content as processed so far is returned.

        Args:
            html_content: HTML string

        Returns:
            Minified HTML string
        """
        try:
            # Remove comments
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

            # Remove whitespace between tags
            html_content = re.sub(r'>\s+<', '><', html_content)

            # Collapse remaining whitespace runs and trim the ends
            html_content = re.sub(r'\s+', ' ', html_content)
            html_content = html_content.strip()

            return html_content

        except Exception as e:
            print(f"⚠️  Minification failed: {e}")
            return html_content

    def validate(self, html_content):
        """
        Basic HTML validation.

        Checks that every non-void tag name has the same number of opening
        and closing tags (counts only - nesting order is not verified) and
        that ``<html>``, ``<head>`` and ``<body>`` are present. Tag names
        are compared case-insensitively, markup inside comments is
        ignored, and ``<x/>`` is treated as self-closing.

        Args:
            html_content: HTML string

        Returns:
            dict with validation results:
                'valid': True when no issues were found
                'issues': list of human-readable problem descriptions
                'tag_count': number of non-void opening tags plus closing tags
        """
        # Local import keeps this block self-contained.
        from collections import Counter

        # Commented-out markup must not influence the balance check.
        markup = self._COMMENT_RE.sub('', html_content)

        opening_tags = []
        closing_tags = []
        seen_names = set()  # every element name that appears as a start tag

        for match in self._TAG_RE.finditer(markup):
            slash, raw_name, tail = match.groups()
            name = raw_name.lower()

            if slash:
                closing_tags.append(name)
                continue

            seen_names.add(name)
            # Void and explicitly self-closed tags need no closing partner.
            if name in self._VOID_TAGS or tail.rstrip().endswith('/'):
                continue
            opening_tags.append(name)

        open_counts = Counter(opening_tags)
        close_counts = Counter(closing_tags)

        issues = []

        # Consider names seen on either side so stray closing tags are
        # reported too; sorted for a deterministic issue order.
        candidates = (set(open_counts) | set(close_counts)) - self._VOID_TAGS
        for name in sorted(candidates):
            open_count = open_counts[name]
            close_count = close_counts[name]
            if open_count != close_count:
                issues.append(f"Unbalanced <{name}> tags: {open_count} opening, {close_count} closing")

        # Structural elements every full document is expected to contain.
        for required in ('html', 'head', 'body'):
            if required not in seen_names:
                issues.append(f"Missing <{required}> tag")

        return {
            'valid': len(issues) == 0,
            'issues': issues,
            'tag_count': len(opening_tags) + len(closing_tags)
        }