import os

import openpyxl
import PyPDF2
import streamlit as st
from bs4 import BeautifulSoup
|
|
def extract_pdf_text(pdf_file):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the app passes the
            uploaded file object straight through.

    Returns:
        The per-page texts joined by newlines, with leading and trailing
        whitespace stripped. Pages whose extraction result is falsy
        (``None`` or an empty string) are skipped.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Joining once avoids rebuilding the string for every page; the final
    # strip() makes the result identical to appending "text + '\n'" per page.
    return '\n'.join(chunk for chunk in page_texts if chunk).strip()
|
|
def extract_excel_text(excel_file):
    """Return the cell values of a workbook's active sheet as plain text.

    Args:
        excel_file: Whatever ``openpyxl.load_workbook`` accepts; the app
            passes the uploaded file object straight through.

    Returns:
        One line per sheet row, each line being the row's non-``None`` cell
        values converted with ``str`` and separated by single spaces. The
        whole result has leading and trailing whitespace stripped.
    """
    workbook = openpyxl.load_workbook(excel_file)
    sheet = workbook.active  # only the active sheet is read
    row_lines = (
        ' '.join(str(value) for value in row if value is not None)
        for row in sheet.iter_rows(values_only=True)
    )
    # Rows with no values still contribute an empty line, so interior blank
    # rows are preserved; strip() only trims the outer edges.
    return '\n'.join(row_lines).strip()
|
|
def extract_html_text(html_file):
    """Return the text content of an HTML document.

    Args:
        html_file: Markup (or an open file object) handed directly to
            ``BeautifulSoup`` with the built-in ``'html.parser'`` backend.

    Returns:
        The document text as produced by ``get_text`` with a newline between
        text fragments, stripped of leading and trailing whitespace.
    """
    soup = BeautifulSoup(html_file, 'html.parser')
    return soup.get_text(separator='\n').strip()
|
|
def extract_txt_text(txt_file):
    """Return the stripped text content of a plain-text file object.

    Args:
        txt_file: File-like object whose ``read()`` returns the content as
            bytes. A ``str`` result is also accepted and used as-is.

    Returns:
        The decoded text with leading and trailing whitespace removed.
    """
    data = txt_file.read()
    if isinstance(data, (bytes, bytearray)):
        # 'utf-8-sig' drops a leading byte-order mark, which str.strip() would
        # otherwise leave in place. errors='replace' turns undecodable bytes
        # into U+FFFD so a non-UTF-8 upload is still shown instead of raising
        # UnicodeDecodeError.
        data = data.decode('utf-8-sig', errors='replace')
    return data.strip()
|
|
def process_file(file):
    """Extract text from an uploaded file, choosing the parser by extension.

    Args:
        file: Object with a ``name`` attribute (used only for its extension,
            compared case-insensitively) that is passed on to the matching
            ``extract_*_text`` function.

    Returns:
        The extracted text, or the string ``"Unsupported file format."`` when
        the extension is not one of .pdf, .xlsx, .xls, .html, .htm or .txt.
    """
    extension = os.path.splitext(file.name)[1].lower()

    if extension == '.pdf':
        return extract_pdf_text(file)
    # NOTE(review): '.xls' is routed to the openpyxl-based reader, but openpyxl
    # documents support for the OOXML formats (xlsx/xlsm) only; a genuine
    # legacy .xls upload presumably fails inside load_workbook. Confirm and
    # decide whether to reject it here.
    if extension in ('.xlsx', '.xls'):
        return extract_excel_text(file)
    if extension in ('.html', '.htm'):
        return extract_html_text(file)
    if extension == '.txt':
        return extract_txt_text(file)
    return "Unsupported file format."
|
|
| |
# --- Streamlit page: upload a file, show its text, then search within it ---
st.title("File Content Extractor")

uploaded_file = st.file_uploader(
    "Choose a file",
    type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'],
)

if uploaded_file is not None:
    # Show the full extracted text first.
    content = process_file(uploaded_file)
    st.subheader("Extracted Content:")
    st.text(content)

    # The search box lives inside this branch because it needs `content`.
    search_query = st.text_input("Enter text to search for:")

    if search_query:
        # Case-insensitive substring match, line by line; the query is
        # lowered once rather than on every iteration.
        needle = search_query.lower()
        search_results = [
            line for line in content.split('\n') if needle in line.lower()
        ]

        st.subheader("Search Results:")
        if search_results:
            for result in search_results:
                st.text(result)
        else:
            st.text("No matching content found.")
|
|