| import re |
|
|
class AnalysisCleaner:
    """Clean up analysis text: drop repeated paragraphs and flatten lists.

    The set of paragraphs already seen is kept on the instance, so
    de-duplication spans every call made on the same object. Use a fresh
    instance (or clear ``seen_paragraphs``) to process unrelated texts
    independently.
    """

    # A list marker only counts when it is followed by whitespace or ends the
    # line; this keeps "**bold**", "---", "-5" and "3.14" intact.
    _BULLET_PREFIX = re.compile(r'^\s*[*\-•](?:\s+|$)')
    _NUMBER_PREFIX = re.compile(r'^\s*\d+\.(?:\s+|$)')

    def __init__(self):
        # Normalized (lower-cased, whitespace-collapsed) paragraphs seen so far.
        self.seen_paragraphs = set()

    def remove_duplicates(self, text: str) -> str:
        """Remove duplicate paragraphs while preserving order.

        Paragraphs are separated by blank lines (``'\\n\\n'``). Two paragraphs
        are duplicates when they are equal after lower-casing and collapsing
        runs of whitespace. The first occurrence is kept verbatim; empty
        paragraphs are dropped.

        Note: paragraphs seen in earlier calls on this instance also count as
        duplicates, because ``self.seen_paragraphs`` is never reset here.

        Args:
            text: Input text with paragraphs separated by blank lines.

        Returns:
            The surviving paragraphs joined with ``'\\n\\n'``.
        """
        unique_paragraphs = []

        for paragraph in text.split('\n\n'):
            normalized = ' '.join(paragraph.lower().split())
            if normalized and normalized not in self.seen_paragraphs:
                self.seen_paragraphs.add(normalized)
                unique_paragraphs.append(paragraph)

        return '\n\n'.join(unique_paragraphs)

    def reorganize_content(self, text: str) -> str:
        """Convert bullet points into flowing paragraphs.

        Leading bullet markers (``*``, ``-``, ``•``) and ``N.`` numbering are
        stripped from each line, and consecutive non-blank lines are joined
        with single spaces into one paragraph. A blank line ends the current
        paragraph. A line starting with ``###`` also ends the current
        paragraph and is emitted on its own.

        Args:
            text: Input text, one list item or sentence per line.

        Returns:
            Paragraphs and headings joined with ``'\\n\\n'``.
        """
        current_paragraph = []
        flowing_text = []

        for raw_line in text.split('\n'):
            line = self._BULLET_PREFIX.sub('', raw_line)
            # Strip so indented headings are recognised and joined paragraphs
            # do not pick up stray double spaces.
            line = self._NUMBER_PREFIX.sub('', line).strip()

            is_heading = line.startswith('###')
            if (not line or is_heading) and current_paragraph:
                # Blank lines and headings both terminate the open paragraph.
                flowing_text.append(' '.join(current_paragraph))
                current_paragraph = []

            if is_heading:
                flowing_text.append(line)
            elif line:
                current_paragraph.append(line)

        if current_paragraph:
            flowing_text.append(' '.join(current_paragraph))

        return '\n\n'.join(flowing_text)

    def clean_analysis(self, text: str) -> str:
        """Apply all cleanup steps.

        Runs ``remove_duplicates``, then ``reorganize_content``, then collapses
        any run of three or more newlines down to a single blank line.

        Args:
            text: Raw analysis text.

        Returns:
            The cleaned text.
        """
        cleaned = self.remove_duplicates(text)
        cleaned = self.reorganize_content(cleaned)
        cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
        return cleaned