Spaces: Running
"""Demo script: run the project's text cleaner on simulated dirty PDF text.

Feeds a hard-coded sample of common PDF-extraction artifacts (arXiv
watermark line, hyphenated line wraps, a stray page number, a venue
header, and a trailing references section) through ``clean_text``, then
logs the text before and after cleaning along with both lengths.
"""
from src.utils.logger import setup_logger, get_logger
from src.processing.text_cleaner import clean_text

setup_logger()
logger = get_logger(__name__)

# Simulated dirty PDF text: arXiv stamp, hyphenated line breaks,
# page number ("2"), workshop header, and references — the kinds of
# noise clean_text is expected to strip or repair.
dirty_text = """
arXiv:2301.07041v2 [cs.LG] 17 Jan 2023
We propose a novel at-
tention mechanism that re-
duces computational com-
plexity significantly.
This method achieves state-of-the-art results.
2
ICML 2023 Workshop
The key insight is that sparse attention patterns
can approximate full attention with minimal quality loss.
References
Vaswani, A., et al. (2017). Attention is all you need.
Brown, T., et al. (2020). Language models are few-shot learners.
"""

cleaned = clean_text(dirty_text)

# NOTE(review): "βββ" in the separator messages looks like mojibake of a
# box-drawing rule (e.g. "───") — confirm the intended glyphs. Left
# byte-identical here since it is runtime log output.
logger.info("βββ DIRTY TEXT βββ")
print(dirty_text[:300])  # preview only the first 300 chars of the input
logger.info("βββ CLEANED TEXT βββ")
print(cleaned)

# Lazy %-style logging args instead of f-strings: formatting is deferred
# until the record is actually emitted (skipped if the level filters it).
logger.info("Original length: %d", len(dirty_text))
logger.info("Cleaned length: %d", len(cleaned))