Spaces:

Lohith01
/

text

Sleeping

App Files Files Community

text / app.py

Lohith01

Update app.py

e04d89a verified over 1 year ago

raw

history blame contribute delete

2.72 kB

	import os
	import zipfile

	import openpyxl
	import PyPDF2
	import streamlit as st
	from bs4 import BeautifulSoup

	def extract_pdf_text(pdf_file):
	    """Return the text of every page of a PDF, one page per newline-separated chunk.

	    Args:
	        pdf_file: Path or file-like object handed to ``PyPDF2.PdfReader``.

	    Returns:
	        The page texts joined with newlines, with leading and trailing
	        whitespace removed. Pages that yield no text are left out.
	    """
	    pdf = PyPDF2.PdfReader(pdf_file)
	    page_texts = (page.extract_text() for page in pdf.pages)
	    # Skip pages with nothing extractable so they add no blank separators.
	    return '\n'.join(chunk for chunk in page_texts if chunk).strip()

	def extract_excel_text(excel_file):
	    """Return the active worksheet's cell values as text, one row per line.

	    Args:
	        excel_file: Path or file-like object handed to
	            ``openpyxl.load_workbook``.

	    Returns:
	        One line per worksheet row, each holding the row's non-``None``
	        cell values converted with ``str`` and separated by single spaces;
	        the overall result has leading and trailing whitespace removed.
	    """
	    active_sheet = openpyxl.load_workbook(excel_file).active
	    row_lines = [
	        ' '.join(str(value) for value in cells if value is not None)
	        for cells in active_sheet.iter_rows(values_only=True)
	    ]
	    return '\n'.join(row_lines).strip()

	def extract_html_text(html_file):
	    """Return the visible text of an HTML document.

	    Args:
	        html_file: Markup string or file-like object handed to
	            ``BeautifulSoup`` (parsed with the built-in ``html.parser``).

	    Returns:
	        The document's text fragments separated by newlines, with leading
	        and trailing whitespace removed.
	    """
	    parsed = BeautifulSoup(html_file, 'html.parser')
	    return parsed.get_text(separator='\n').strip()

	def extract_txt_text(txt_file):
	    """Return the stripped text content of a plain-text file.

	    Args:
	        txt_file: File-like object whose ``read()`` returns the whole
	            content, either as ``bytes`` (binary upload) or as ``str``.

	    Returns:
	        The text with leading and trailing whitespace removed. Bytes are
	        decoded as UTF-8; a leading byte-order mark is dropped and bytes
	        that are not valid UTF-8 become U+FFFD instead of raising.
	    """
	    raw = txt_file.read()
	    if isinstance(raw, str):
	        # Already text (file opened in text mode): nothing to decode.
	        text = raw
	    else:
	        # 'utf-8-sig' removes a leading BOM, which str.strip() would keep;
	        # errors='replace' stops a non-UTF-8 upload from crashing the app.
	        text = raw.decode('utf-8-sig', errors='replace')
	    return text.strip()

	def process_file(file):
	    """Extract text from an uploaded file, picking the extractor by extension.

	    Args:
	        file: File-like object with a ``name`` attribute. The extension of
	            ``name`` (compared case-insensitively) selects the extractor.

	    Returns:
	        The extracted text, or a message starting with
	        "Unsupported file format" when the file cannot be handled.
	    """
	    extension = os.path.splitext(file.name)[1].lower()

	    if extension == '.pdf':
	        return extract_pdf_text(file)
	    if extension in ('.xlsx', '.xls'):
	        try:
	            return extract_excel_text(file)
	        except zipfile.BadZipFile:
	            # openpyxl only reads zip-based workbooks. A legacy binary .xls
	            # (or a corrupt .xlsx) is not a zip archive, so report it the
	            # same way as other unsupported input instead of crashing.
	            return (
	                "Unsupported file format: the workbook is not a valid .xlsx "
	                "file (legacy .xls files must be converted to .xlsx)."
	            )
	    if extension in ('.html', '.htm'):
	        return extract_html_text(file)
	    if extension == '.txt':
	        return extract_txt_text(file)
	    return "Unsupported file format."

	# Streamlit application: upload a file, show its extracted text, and let
	# the user search that text line by line.
	st.title("File Content Extractor")

	uploaded_file = st.file_uploader(
	    "Choose a file",
	    type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'],
	)

	if uploaded_file is not None:
	    # Extract and display the text of the uploaded file.
	    content = process_file(uploaded_file)
	    st.subheader("Extracted Content:")
	    st.text(content)

	    # Search box; an empty query skips the search section entirely.
	    search_query = st.text_input("Enter text to search for:")

	    if search_query:
	        # Case-insensitive substring match against each line of the content.
	        needle = search_query.lower()
	        matching_lines = [
	            candidate
	            for candidate in content.split('\n')
	            if needle in candidate.lower()
	        ]

	        # The heading is shown whether or not anything matched.
	        st.subheader("Search Results:")
	        if matching_lines:
	            for hit in matching_lines:
	                st.text(hit)
	        else:
	            st.text("No matching content found.")