| from pypdf import PdfReader | |
| from typing import Dict, List | |
| import re | |
def load_documents(data_path: str) -> str:
    '''
    Read the LinkedIn PDF and the summary text file from the data folder.

    Parameters:
    - data_path (str): The path to the data folder; it must contain
      "linkedin.pdf" and "summary.txt"

    Returns:
    - output (str): The text extracted from every page of the PDF, followed
      by a newline and the full contents of the summary file
    '''
    # Local import keeps this function self-contained with respect to the
    # file's import header.
    import os

    # Build the paths with os.path.join rather than a hard-coded backslash:
    # "\l" and "\s" are invalid escape sequences, and a literal backslash
    # separator only resolves on Windows.
    pdf_path = os.path.join(data_path, "linkedin.pdf")
    summary_path = os.path.join(data_path, "summary.txt")

    reader = PdfReader(pdf_path)
    # Join once instead of repeated "+="; "or ''" is a defensive guard that
    # treats a missing extraction result as empty text.
    text_document = "".join(page.extract_text() or "" for page in reader.pages)

    # Explicit encoding so the result does not depend on the platform's
    # locale default. NOTE(review): assumes summary.txt is UTF-8 - confirm.
    with open(summary_path, "r", encoding="utf-8") as f:
        summary = f.read()

    return f"{text_document}\n{summary}"
def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]:
    '''
    Split the text into overlapping chunks of whitespace-separated words.

    Parameters:
    - text (str): The text to split
    - overlap (int): Number of words shared between two consecutive chunks;
      must satisfy 0 <= overlap < chunk_size
    - chunk_size (int): Maximum number of words per chunk; must be positive

    Returns:
    - chunks (List[str]): The chunks in order, each made of its words joined
      by single spaces; an empty list when the text contains no words

    Raises:
    - ValueError: If chunk_size is not positive or overlap is outside
      the range [0, chunk_size)
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # str.split() with no arguments splits on any Unicode whitespace, which
    # already covers "\xa0" and "\n", and never yields empty strings.
    words = text.split()

    # Advance by chunk_size - overlap so that consecutive chunks share exactly
    # `overlap` words (stepping by `overlap` itself would share
    # chunk_size - overlap words instead).
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
| # if __name__ == "__main__": | |
| # # reader = PdfReader("Week_1\Data_w1\linkedin.pdf") | |
| # # linkedin = "" | |
| # # for page in reader.pages: | |
| # # linkedin += page.extract_text() | |
| # # text_chunks = sliding_window_chunk(linkedin) | |
| # # print(len(text_chunks)) | |