import io

import pdfplumber
import requests
import streamlit as st
|
def fextractURL(pdf_path):
    """Download the PDF at *pdf_path* and return its text and table content.

    The document is fetched into memory and, page by page, the plain text is
    appended followed by every extracted table rendered as tab-separated rows
    (one row per line).

    Args:
        pdf_path: URL of the PDF document. Works whether or not the URL ends
            in ``.pdf`` — both cases previously ran the same extraction, but
            the non-``.pdf`` branch leaked a ``downloaded_document.pdf`` file
            on disk that was never cleaned up.

    Returns:
        A single string with one line per page-text block / table row.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the download stalls past the timeout.
    """
    # Bounded timeout so a hung server cannot block the app forever.
    response = requests.get(pdf_path, timeout=30)
    # Fail loudly on HTTP errors instead of feeding an HTML error page
    # to pdfplumber.
    response.raise_for_status()

    extracted_data = ""
    # Parse straight from memory — no temp file to leak.
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; guard it
            # so the concatenation cannot raise TypeError.
            extracted_data += (page.extract_text() or "") + "\n"
            for table in page.extract_tables():
                for row in table:
                    # Empty cells come back as None; render them as blanks
                    # rather than the literal string "None".
                    cells = ("" if cell is None else str(cell) for cell in row)
                    extracted_data += "\t".join(cells) + "\n"
    return extracted_data
|
|
|
|
# --- Streamlit UI -----------------------------------------------------------

# Spacer pushes the controls down from the very top of the page.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF url")  # fixed typo: "Extarct"

pdfURL = st.text_input(label="origin URL")
button = st.button(label="Extract")
extractedText = st.empty()

# Wire the button to the extractor — previously the click was never handled,
# so the app displayed nothing. The placeholder is filled with the result.
if button and pdfURL:
    extractedText.text(fextractURL(pdfURL))