| """ |
| Text extraction module for Norwegian RAG chatbot. |
| Extracts text from various document formats. |
| """ |
|
|
| import os |
| import PyPDF2 |
| from typing import List, Optional |
| from bs4 import BeautifulSoup |
|
|
class TextExtractor:
    """
    Extracts text from various document formats.

    Currently supports:
        - PDF (.pdf)
        - Text files (.txt)
        - HTML (.html, .htm)

    All extraction methods are best-effort: on failure they print the error
    and return an empty string rather than raising, so a batch ingestion run
    is not aborted by a single bad document.
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file based on its extension.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file extension is not supported.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        elif file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        elif file_extension in ('.html', '.htm'):
            return TextExtractor.extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, with pages separated by blank lines.
            Empty string if extraction fails.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Iterate pages directly and join once, instead of indexing
                # by page number and concatenating with += (quadratic).
                return "".join(
                    page.extract_text() + "\n\n" for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        Tries UTF-8 first, then falls back to Latin-1 (which accepts any
        byte sequence) for legacy files.

        Args:
            file_path: Path to the text file

        Returns:
            File contents, or empty string if extraction fails.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error extracting text from file {file_path}: {str(e)}")
                return ""
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def _soup_to_text(soup) -> str:
        """
        Strip scripts/styles from a parsed BeautifulSoup document and
        return its visible text with normalized whitespace.

        Shared by extract_from_html and extract_from_url, which previously
        duplicated this logic verbatim.
        """
        # Remove non-visible content before extracting text.
        for element in soup(["script", "style"]):
            element.extract()

        text = soup.get_text()

        # Strip leading/trailing whitespace on each line, then break
        # multi-space runs (headlines/menus rendered on one line) into
        # separate chunks. NOTE: split on a DOUBLE space per the canonical
        # recipe -- the previous single-space split put every word on its
        # own line.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

        # Drop blank chunks and rejoin with newlines.
        return '\n'.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract visible text from an HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            Cleaned text content, or empty string if extraction fails.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            return TextExtractor._soup_to_text(soup)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str) -> str:
        """
        Extract visible text from a web URL.

        Args:
            url: Web URL to extract text from

        Returns:
            Cleaned text content, or empty string if the request or
            extraction fails.
        """
        try:
            import requests
            # A timeout is mandatory: requests.get() with no timeout can
            # block forever on an unresponsive host.
            response = requests.get(url, timeout=30)
            # Fail on HTTP error pages (404/500) instead of silently
            # extracting their text; the except below turns this into "".
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return TextExtractor._soup_to_text(soup)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""
|
|