| """ |
| Text extraction module for Norwegian RAG chatbot. |
| Extracts text from various document formats. |
| """ |
|
|
| import os |
| import PyPDF2 |
| from typing import List, Optional |
| from bs4 import BeautifulSoup |
|
|
class TextExtractor:
    """
    Extracts text from various document formats.

    Currently supports:
        - PDF (.pdf)
        - Text files (.txt)
        - HTML (.html, .htm)

    All extraction methods are best-effort: on failure they print the error
    and return an empty string rather than raising, so a batch ingestion run
    is not aborted by a single bad document.
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file based on its extension.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file extension is not supported.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        elif file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        elif file_extension in ('.html', '.htm'):
            return TextExtractor.extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, with pages separated by blank lines.
            Empty string if extraction fails.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Iterate pages directly and join once, instead of indexing
                # by page number and concatenating with += (quadratic).
                return "".join(
                    page.extract_text() + "\n\n" for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        Tries UTF-8 first, then falls back to Latin-1 (which accepts any
        byte sequence) for legacy files.

        Args:
            file_path: Path to the text file

        Returns:
            File contents, or empty string if extraction fails.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error extracting text from file {file_path}: {str(e)}")
                return ""
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def _soup_to_text(soup) -> str:
        """
        Strip scripts/styles from a parsed BeautifulSoup document and
        return its visible text with normalized whitespace.

        Shared by extract_from_html and extract_from_url, which previously
        duplicated this logic verbatim.
        """
        # Remove non-visible content before extracting text.
        for element in soup(["script", "style"]):
            element.extract()

        text = soup.get_text()

        # Strip leading/trailing whitespace on each line, then break
        # multi-space runs (headlines/menus rendered on one line) into
        # separate chunks. NOTE: split on a DOUBLE space per the canonical
        # recipe -- the previous single-space split put every word on its
        # own line.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

        # Drop blank chunks and rejoin with newlines.
        return '\n'.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract visible text from an HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            Cleaned text content, or empty string if extraction fails.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            return TextExtractor._soup_to_text(soup)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str) -> str:
        """
        Extract visible text from a web URL.

        Args:
            url: Web URL to extract text from

        Returns:
            Cleaned text content, or empty string if the request or
            extraction fails.
        """
        try:
            import requests
            # A timeout is mandatory: requests.get() with no timeout can
            # block forever on an unresponsive host.
            response = requests.get(url, timeout=30)
            # Fail on HTTP error pages (404/500) instead of silently
            # extracting their text; the except below turns this into "".
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return TextExtractor._soup_to_text(soup)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""
|
|