Spaces:

mgbam
/

builder

Running

App Files Files Community

builder / web_scraper.py

mgbam

Rename services.py to web_scraper.py

c04089b verified 9 months ago

raw

history blame contribute delete

11.7 kB

	def extract_text_from_image(image_path):
	"""Extract text from image using OCR"""
	try:
	try:
	pytesseract.get_tesseract_version()
	except Exception:
	return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."
	image = cv2.imread(image_path)
	if image is None:
	return "Error: Could not read image file"
	image_rgb=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
	gray=cv2.cvtColor(image_rgb,cv2.COLOR_RGB2GRAY)
	_,binary=cv2.threshold(gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
	text=pytesseract.image_to_string(binary,config='--psm 6')
	return text.strip() if text.strip() else "No text found in image"
	except Exception as e:
	return f"Error extracting text from image: {e}"

	def extract_text_from_file(file_path):
	if not file_path:
	return ""
	mime,_=mimetypes.guess_type(file_path)
	ext=os.path.splitext(file_path)[1].lower()
	try:
	if ext==".pdf":
	with open(file_path,"rb") as f:
	reader=PyPDF2.PdfReader(f)
	return "\n".join(page.extract_text() or "" for page in reader.pages)
	elif ext in [".txt", ".md"]:
	with open(file_path,"r",encoding="utf-8") as f:
	return f.read()
	elif ext==".csv":
	with open(file_path,"r",encoding="utf-8") as f:
	return f.read()
	elif ext==".docx":
	doc=docx.Document(file_path)
	return "\n".join([para.text for para in doc.paragraphs])
	elif ext.lower() in [".jpg",".jpeg",".png",".bmp",".tiff",".tif",".gif",".webp"]:
	return extract_text_from_image(file_path)
	else:
	return ""
	except Exception as e:
	return f"Error extracting text: {e}"

	def extract_website_content(url: str) -> str:
	"""Extract HTML code and content from a website URL"""
	try:
	parsed_url=urlparse(url)
	if not parsed_url.scheme:
	url="https://"+url
	parsed_url=urlparse(url)
	if not parsed_url.netloc:
	return "Error: Invalid URL provided"
	headers={
	'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8',
	'Accept-Language':'en-US,en;q=0.9',
	'Accept-Encoding':'gzip, deflate, br',
	'DNT':'1','Connection':'keep-alive','Upgrade-Insecure-Requests':'1',
	'Sec-Fetch-Dest':'document','Sec-Fetch-Mode':'navigate','Sec-Fetch-Site':'none','Sec-Fetch-User':'?1','Cache-Control':'max-age=0'
	}
	session=requests.Session()
	session.headers.update(headers)
	max_retries=3
	for attempt in range(max_retries):
	try:
	response=session.get(url,timeout=15,allow_redirects=True)
	response.raise_for_status()
	break
	except requests.exceptions.HTTPError as e:
	if e.response.status_code==403 and attempt<max_retries-1:
	session.headers['User-Agent']='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
	continue
	else:
	raise
	try:
	response.encoding=response.apparent_encoding
	raw_html=response.text
	except:
	raw_html=response.content.decode('utf-8',errors='ignore')
	if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
	try:
	raw_html=response.content.decode('latin-1',errors='ignore')
	except:
	try:
	raw_html=response.content.decode('utf-8',errors='ignore')
	except:
	raw_html=response.content.decode('cp1252',errors='ignore')
	soup=BeautifulSoup(raw_html,'html.parser')
	title=soup.find('title')
	title_text=title.get_text().strip() if title else "No title found"
	meta_desc=soup.find('meta',attrs={'name':'description'})
	description=meta_desc.get('content','') if meta_desc else ""
	content_sections=[]
	main_selectors=['main','article','.content','.main-content','.post-content','#content','#main','.entry-content','.post-body']
	for selector in main_selectors:
	elements=soup.select(selector)
	for element in elements:
	text=element.get_text().strip()
	if len(text)>100:
	content_sections.append(text)
	nav_links=[]
	nav_elements=soup.find_all(['nav','header'])
	for nav in nav_elements:
	links=nav.find_all('a')
	for link in links:
	link_text=link.get_text().strip()
	link_href=link.get('href','')
	if link_text and link_href:
	nav_links.append(f"{link_text}: {link_href}")
	img_elements=soup.find_all('img')
	for img in img_elements:
	src=img.get('src','')
	if src:
	if src.startswith('//'):
	absolute_src='https:'+src
	img['src']=absolute_src
	elif src.startswith('/'):
	absolute_src=urljoin(url,src)
	img['src']=absolute_src
	elif not src.startswith(('http://','https://')):
	absolute_src=urljoin(url,src)
	img['src']=absolute_src
	data_src=img.get('data-src','')
	if data_src and not src:
	if data_src.startswith('//'):
	absolute_data_src='https:'+data_src
	img['src']=absolute_data_src
	elif data_src.startswith('/'):
	absolute_data_src=urljoin(url,data_src)
	img['src']=absolute_data_src
	elif not data_src.startswith(('http://','https://')):
	absolute_data_src=urljoin(url,data_src)
	img['src']=absolute_data_src
	else:
	img['src']=data_src
	elements_with_style=soup.find_all(attrs={'style':True})
	for element in elements_with_style:
	style_attr=element.get('style','')
	import re
	bg_pattern=r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
	matches=re.findall(bg_pattern,style_attr, re.IGNORECASE)
	for match in matches:
	if match.startswith('//'):
	absolute_bg='https:'+match
	style_attr=style_attr.replace(match,absolute_bg)
	elif match.startswith('/'):
	absolute_bg=urljoin(url,match)
	style_attr=style_attr.replace(match,absolute_bg)
	elif not match.startswith(('http://','https://')):
	absolute_bg=urljoin(url,match)
	style_attr=style_attr.replace(match,absolute_bg)
	element['style']=style_attr
	style_elements=soup.find_all('style')
	for style in style_elements:
	if style.string:
	style_content=style.string
	bg_pattern=r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
	matches=re.findall(bg_pattern,style_content, re.IGNORECASE)
	for match in matches:
	if match.startswith('//'):
	absolute_bg='https:'+match
	style_content=style_content.replace(match,absolute_bg)
	elif match.startswith('/'):
	absolute_bg=urljoin(url,match)
	style_content=style_content.replace(match,absolute_bg)
	elif not match.startswith(('http://','https://')):
	absolute_bg=urljoin(url,match)
	style_content=style_content.replace(match,absolute_bg)
	style.string=style_content
	images=[]
	img_elements=soup.find_all('img')
	for img in img_elements:
	src=img.get('src','')
	alt=img.get('alt','')
	if src:
	images.append({'src':src,'alt':alt})
	def test_image_url(img_url):
	try:
	test_response=requests.head(img_url,timeout=5,allow_redirects=True)
	return test_response.status_code==200
	except:
	return False
	working_images=[]
	for img in images[:10]:
	if test_image_url(img['src']):
	working_images.append(img)
	modified_html=str(soup)
	import re
	cleaned_html=re.sub(r'<!--.*?-->','',modified_html,flags=re.DOTALL)
	cleaned_html=re.sub(r'\s+',' ',cleaned_html)
	cleaned_html=re.sub(r'>\s+<','><',cleaned_html)
	if len(cleaned_html)>15000:
	cleaned_html=cleaned_html[:15000]+"\n<!-- ... HTML truncated for length ... -->"
	if not title_text or title_text=="No title found":
	title_text=url.split('/')[-1] or url.split('/')[-2] or "Website"
	if len(cleaned_html.strip())<100:
	website_content=f"""
	WEBSITE REDESIGN - EXTRACTION FAILED
	====================================
	URL: {url}
	Title: {title_text}
	ERROR: Could not extract meaningful HTML content from this website. This could be due to:
	1. The website uses heavy JavaScript to load content dynamically
	2. The website has anti-bot protection
	3. The website requires authentication
	4. The website is using advanced compression or encoding
	FALLBACK APPROACH:
	Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
	1. Create a typical layout for this type of website
	2. Use placeholder content that would be appropriate
	3. Include modern design elements and responsive features
	4. Use a clean, professional design with good typography
	5. Make it mobile-friendly and accessible
	This will help me create a better design for you."""
	return website_content.strip()
	website_content=f"""
	WEBSITE REDESIGN - ORIGINAL HTML CODE
	===[TRUNCATED FOR BREVITY]==="""
	return website_content.strip()
	except requests.exceptions.HTTPError as e:
	if e.response.status_code==403:
	return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
	elif e.response.status_code==404:
	return f"Error: Website not found (404). Please check the URL and try again."
	elif e.response.status_code>=500:
	return f"Error: Website server error ({e.response.status_code}). Please try again later."
	else:
	return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
	except requests.exceptions.Timeout:
	return "Error: Request timed out. The website may be slow or unavailable."
	except requests.exceptions.ConnectionError:
	return "Error: Could not connect to the website. Please check your internet connection and the URL."
	except requests.exceptions.RequestException as e:
	return f"Error accessing website: {str(e)}"
	except Exception as e:
	return f"Error extracting website content: {str(e)}"