# text / app.py
# Lohith01's picture
# Update app.py
# e04d89a verified
import PyPDF2
import openpyxl
from bs4 import BeautifulSoup
import os
import streamlit as st
def extract_pdf_text(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Object handed straight to ``PyPDF2.PdfReader`` (the app
            passes the uploaded file object).

    Returns:
        The text of each page that yielded any text, joined with newlines,
        with leading and trailing whitespace removed.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    # Extract each page exactly once; pages with no text are filtered out
    # below so they do not contribute blank separators.
    extracted = (page.extract_text() for page in pdf.pages)
    return '\n'.join(chunk for chunk in extracted if chunk).strip()
def extract_excel_text(excel_file):
    """Return the cell values of a workbook's active sheet as text.

    Args:
        excel_file: Object handed straight to ``openpyxl.load_workbook``.

    Returns:
        One line per row of the active sheet, each line being the row's
        non-``None`` cell values converted with ``str`` and separated by a
        single space. Leading and trailing whitespace of the whole result
        is removed.
    """
    active_sheet = openpyxl.load_workbook(excel_file).active
    # Rows whose cells are all None still produce an (empty) line, so
    # blank rows in the middle of the sheet are preserved.
    row_lines = (
        ' '.join(str(value) for value in cells if value is not None)
        for cells in active_sheet.iter_rows(values_only=True)
    )
    return '\n'.join(row_lines).strip()
def extract_html_text(html_file):
    """Return the visible text of an HTML document.

    Args:
        html_file: Markup (or object) handed straight to ``BeautifulSoup``
            with the ``'html.parser'`` backend.

    Returns:
        The document text with a newline inserted between text fragments,
        with leading and trailing whitespace removed.
    """
    parsed = BeautifulSoup(html_file, 'html.parser')
    # The newline separator keeps text from adjacent elements on separate
    # lines instead of running them together.
    return parsed.get_text(separator='\n').strip()
def extract_txt_text(txt_file):
    """Return the decoded contents of a plain-text upload.

    Args:
        txt_file: Binary file-like object whose ``read()`` returns bytes.

    Returns:
        The file contents decoded as UTF-8, with leading and trailing
        whitespace removed. A leading UTF-8 byte-order mark is dropped and
        byte sequences that are not valid UTF-8 are replaced with U+FFFD
        instead of raising ``UnicodeDecodeError``.
    """
    raw = txt_file.read()
    # 'utf-8-sig' behaves like 'utf-8' but also strips a leading BOM, which
    # str.strip() would otherwise leave in place (U+FEFF is not whitespace).
    # errors='replace' keeps a non-UTF-8 upload from crashing the app.
    return raw.decode('utf-8-sig', errors='replace').strip()
def process_file(file):
    """Extract text from an uploaded file based on its extension.

    Args:
        file: Object with a ``name`` attribute (used to pick the
            extractor) that is then passed to that extractor.

    Returns:
        The extracted text, or the string ``"Unsupported file format."``
        when the extension matches none of the known types.
    """
    _, suffix = os.path.splitext(file.name)
    suffix = suffix.lower()  # extension matching is case-insensitive

    if suffix == '.pdf':
        return extract_pdf_text(file)
    # NOTE(review): '.xls' is routed to the openpyxl-based extractor;
    # confirm that legacy .xls workbooks actually load there.
    if suffix in ('.xlsx', '.xls'):
        return extract_excel_text(file)
    if suffix in ('.html', '.htm'):
        return extract_html_text(file)
    if suffix == '.txt':
        return extract_txt_text(file)
    return "Unsupported file format."
# Streamlit application: upload a file, show its extracted text, and let
# the user search that text line by line.
st.title("File Content Extractor")
uploaded_file = st.file_uploader(
    "Choose a file",
    type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'],
)

if uploaded_file is not None:
    # Extract and display the text of the uploaded file.
    content = process_file(uploaded_file)
    st.subheader("Extracted Content:")
    st.text(content)

    # Search functionality: case-insensitive substring match per line.
    search_query = st.text_input("Enter text to search for:")
    if search_query:
        needle = search_query.lower()
        search_results = [
            line for line in content.split('\n') if needle in line.lower()
        ]

        # The heading is shown whether or not anything matched.
        st.subheader("Search Results:")
        if search_results:
            for hit in search_results:
                st.text(hit)
        else:
            st.text("No matching content found.")