import io

import pdfplumber
import requests
import streamlit as st
|
def fextractURL(pdf_path):
    """Download the PDF at *pdf_path* and return its text and table content.

    The document is fetched into memory and, page by page, the plain text is
    appended followed by every extracted table rendered as tab-separated rows
    (one row per line).

    Args:
        pdf_path: URL of the PDF document. Works whether or not the URL ends
            in ``.pdf`` — both cases previously ran the same extraction, but
            the non-``.pdf`` branch leaked a ``downloaded_document.pdf`` file
            on disk that was never cleaned up.

    Returns:
        A single string with one line per page-text block / table row.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the download stalls past the timeout.
    """
    # Bounded timeout so a hung server cannot block the app forever.
    response = requests.get(pdf_path, timeout=30)
    # Fail loudly on HTTP errors instead of feeding an HTML error page
    # to pdfplumber.
    response.raise_for_status()

    extracted_data = ""
    # Parse straight from memory — no temp file to leak.
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; guard it
            # so the concatenation cannot raise TypeError.
            extracted_data += (page.extract_text() or "") + "\n"
            for table in page.extract_tables():
                for row in table:
                    # Empty cells come back as None; render them as blanks
                    # rather than the literal string "None".
                    cells = ("" if cell is None else str(cell) for cell in row)
                    extracted_data += "\t".join(cells) + "\n"
    return extracted_data
|
|
|
|
# --- Streamlit UI -----------------------------------------------------------

# Spacer pushes the controls down from the very top of the page.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF url")  # fixed typo: "Extarct"

pdfURL = st.text_input(label="origin URL")
button = st.button(label="Extract")
extractedText = st.empty()

# Wire the button to the extractor — previously the click was never handled,
# so the app displayed nothing. The placeholder is filled with the result.
if button and pdfURL:
    extractedText.text(fextractURL(pdfURL))