import PyPDF2 import openpyxl from bs4 import BeautifulSoup import os import streamlit as st def extract_pdf_text(pdf_file): # Read all pages of the PDF file reader = PyPDF2.PdfReader(pdf_file) text = '' for page in reader.pages: page_text = page.extract_text() if page_text: # Ensure there's text to append text += page_text + '\n' # Add a newline to separate pages return text.strip() # Remove trailing whitespace def extract_excel_text(excel_file): workbook = openpyxl.load_workbook(excel_file) sheet = workbook.active text = '' for row in sheet.iter_rows(values_only=True): # Concatenate all cells in the row, ensuring no cells are skipped row_text = ' '.join([str(cell) for cell in row if cell is not None]) text += row_text + '\n' # Newline for each row return text.strip() # Remove trailing whitespace def extract_html_text(html_file): soup = BeautifulSoup(html_file, 'html.parser') text = soup.get_text(separator='\n') # Use separator to maintain line breaks return text.strip() # Remove trailing whitespace def extract_txt_text(txt_file): text = txt_file.read().decode('utf-8') # Read entire text file and decode return text.strip() # Remove trailing whitespace def process_file(file): extension = os.path.splitext(file.name)[1].lower() if extension == '.pdf': return extract_pdf_text(file) elif extension in ['.xlsx', '.xls']: return extract_excel_text(file) elif extension in ['.html', '.htm']: return extract_html_text(file) elif extension == '.txt': return extract_txt_text(file) else: return "Unsupported file format." 
# Streamlit application: upload a file, show its extracted text, and let the
# user filter that text line by line.
st.title("File Content Extractor")

uploaded_file = st.file_uploader(
    "Choose a file",
    type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'],
)

if uploaded_file is not None:
    # Extract and display the text of the uploaded file.
    content = process_file(uploaded_file)
    st.subheader("Extracted Content:")
    st.text(content)

    # Case-insensitive search over the lines of the extracted text.
    search_query = st.text_input("Enter text to search for:")
    if search_query:
        needle = search_query.lower()
        search_results = [
            line for line in content.split('\n') if needle in line.lower()
        ]

        # The heading is shown whether or not anything matched.
        st.subheader("Search Results:")
        if search_results:
            for result in search_results:
                st.text(result)
        else:
            st.text("No matching content found.")