import re
from typing import List

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class TextPreprocessor:
    """Simple text-preprocessing pipeline: clean, tokenize, drop stop words, lemmatize."""

    def __init__(self):
        # Fetch the NLTK resources this pipeline depends on; quiet=True
        # suppresses the download log once the data is already cached.
        nltk.download('punkt', quiet=True)
        # NLTK 3.9+ looks for 'punkt_tab' instead of 'punkt' in word_tokenize.
        nltk.download('punkt_tab', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text: str) -> str:
        """Clean and normalize text."""
        text = text.lower()
        # Strip everything except letters and whitespace (digits, punctuation).
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Collapse runs of whitespace into single spaces and trim the ends.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into words."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Remove stop words from a token list."""
        return [token for token in tokens if token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize tokens (WordNet defaults to the noun part of speech)."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def process(self, text: str) -> List[str]:
        """Complete preprocessing pipeline: clean -> tokenize -> remove stop words -> lemmatize."""
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize(cleaned_text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatize(tokens)
        return tokens
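

# A minimal usage sketch, assuming the module is run as a script; the sample
# sentence and the expected output are illustrative, not from a real dataset.
if __name__ == "__main__":
    preprocessor = TextPreprocessor()
    tokens = preprocessor.process("The striped bats were hanging on their feet!")
    # Likely output: ['striped', 'bat', 'hanging', 'foot']
    print(tokens)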