import PyPDF2
import openpyxl
from bs4 import BeautifulSoup
import os
import streamlit as st

def extract_pdf_text(pdf_file):
    """Return the text of every page in a PDF, one page per chunk.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        The text of all pages that yielded any text, each followed by a
        newline, with leading and trailing whitespace stripped from the
        combined result.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    chunks = []
    for pg in pdf.pages:
        extracted = pg.extract_text()
        # Pages for which extraction returns nothing are left out entirely.
        if extracted:
            chunks.append(extracted + '\n')
    return ''.join(chunks).strip()

def extract_excel_text(excel_file):
    """Return the active worksheet's contents as plain text.

    Each row becomes one line; the non-empty cells of a row are converted
    with ``str`` and separated by single spaces. Cells whose value is
    ``None`` are skipped, so a fully empty row yields an empty line.

    Args:
        excel_file: Whatever ``openpyxl.load_workbook`` accepts as its source.

    Returns:
        The joined lines with leading and trailing whitespace stripped.
    """
    wb = openpyxl.load_workbook(excel_file)
    active_sheet = wb.active
    lines = []
    for cells in active_sheet.iter_rows(values_only=True):
        present = [str(value) for value in cells if value is not None]
        lines.append(' '.join(present))
    # The final strip makes a trailing newline after the last row irrelevant.
    return '\n'.join(lines).strip()

def extract_html_text(html_file):
    """Return the text content of an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and the
    text nodes are joined with newlines so that line structure is kept.

    Args:
        html_file: Markup source handed straight to ``BeautifulSoup``.

    Returns:
        The extracted text with leading and trailing whitespace stripped.
    """
    parsed = BeautifulSoup(html_file, 'html.parser')
    return parsed.get_text(separator='\n').strip()

def extract_txt_text(txt_file):
    """Return the contents of a plain-text file as a stripped string.

    Args:
        txt_file: File-like object whose ``read()`` returns the whole file,
            normally as ``bytes``. A ``read()`` that already returns ``str``
            is accepted as well.

    Returns:
        The decoded text with leading and trailing whitespace stripped.
    """
    raw = txt_file.read()
    if isinstance(raw, str):
        # Already decoded (text-mode source); only drop a stray BOM if present.
        text = raw.lstrip('\ufeff')
    else:
        # 'utf-8-sig' removes a leading byte-order mark, which str.strip()
        # would leave in place. errors='replace' keeps one bad byte in an
        # uploaded file from raising UnicodeDecodeError and aborting the app.
        text = raw.decode('utf-8-sig', errors='replace')
    return text.strip()

def process_file(file):
    """Extract text from an uploaded file based on its filename extension.

    Args:
        file: File-like object with a ``name`` attribute; the extension of
            that name (case-insensitive) selects the extractor.

    Returns:
        The extracted text, or a human-readable message when the format
        cannot be handled.
    """
    extension = os.path.splitext(file.name)[1].lower()

    if extension == '.xls':
        # The Excel extractor relies on openpyxl, which reads only the
        # OOXML formats (.xlsx/.xlsm). Passing a legacy binary .xls file
        # would raise, so report it the same way as other unsupported input.
        return "Legacy .xls files are not supported. Please convert the file to .xlsx."
    if extension == '.pdf':
        return extract_pdf_text(file)
    if extension == '.xlsx':
        return extract_excel_text(file)
    if extension in ('.html', '.htm'):
        return extract_html_text(file)
    if extension == '.txt':
        return extract_txt_text(file)
    return "Unsupported file format."

# Streamlit application: upload a file, show its extracted text, and let the
# user run a case-insensitive, line-by-line search over that text.
st.title("File Content Extractor")

uploaded_file = st.file_uploader("Choose a file", type=['pdf', 'xlsx', 'xls', 'html', 'htm', 'txt'])

if uploaded_file is not None:
    # Turn the upload into plain text and show it in full.
    content = process_file(uploaded_file)
    st.subheader("Extracted Content:")
    st.text(content)

    # Search box; an empty query skips the search section altogether.
    search_query = st.text_input("Enter text to search for:")

    if search_query:
        # Lower-case the query once, then keep every line that contains it.
        needle = search_query.lower()
        search_results = [
            candidate
            for candidate in content.split('\n')
            if needle in candidate.lower()
        ]

        st.subheader("Search Results:")
        if not search_results:
            st.text("No matching content found.")
        else:
            for hit in search_results:
                st.text(hit)