import os
import re

import cv2
import docx
import PyPDF2
import pytesseract
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


def extract_text_from_image(image_path):
    """Extract text from an image using OCR."""
    try:
        # Confirm the Tesseract binary is available before doing any work.
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            return ("Error: Tesseract OCR is not installed. Please install Tesseract "
                    "to extract text from images. See install_tesseract.md for instructions.")
        image = cv2.imread(image_path)
        if image is None:
            return "Error: Could not read image file"
        # Convert to grayscale and binarize with Otsu's threshold; binarization
        # usually improves OCR accuracy on noisy scans.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # --psm 6 tells Tesseract to assume a single uniform block of text.
        text = pytesseract.image_to_string(binary, config='--psm 6')
        return text.strip() if text.strip() else "No text found in image"
    except Exception as e:
        return f"Error extracting text from image: {e}"


def extract_text_from_file(file_path):
    """Extract text from a file, dispatching on its extension."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in (".txt", ".md", ".csv"):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"):
            return extract_text_from_image(file_path)
        else:
            return ""
    except Exception as e:
        return f"Error extracting text: {e}"


| def extract_website_content(url: str) -> str: |
| """Extract HTML code and content from a website URL""" |
| try: |
| parsed_url=urlparse(url) |
| if not parsed_url.scheme: |
| url="https://"+url |
| parsed_url=urlparse(url) |
| if not parsed_url.netloc: |
| return "Error: Invalid URL provided" |
| headers={ |
| 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', |
| 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', |
| 'Accept-Language':'en-US,en;q=0.9', |
| 'Accept-Encoding':'gzip, deflate, br', |
| 'DNT':'1','Connection':'keep-alive','Upgrade-Insecure-Requests':'1', |
| 'Sec-Fetch-Dest':'document','Sec-Fetch-Mode':'navigate','Sec-Fetch-Site':'none','Sec-Fetch-User':'?1','Cache-Control':'max-age=0' |
| } |
| session=requests.Session() |
| session.headers.update(headers) |
| max_retries=3 |
| for attempt in range(max_retries): |
| try: |
| response=session.get(url,timeout=15,allow_redirects=True) |
| response.raise_for_status() |
| break |
| except requests.exceptions.HTTPError as e: |
| if e.response.status_code==403 and attempt<max_retries-1: |
| session.headers['User-Agent']='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' |
| continue |
| else: |
| raise |
| try: |
| response.encoding=response.apparent_encoding |
| raw_html=response.text |
| except: |
| raw_html=response.content.decode('utf-8',errors='ignore') |
| if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'): |
| try: |
| raw_html=response.content.decode('latin-1',errors='ignore') |
| except: |
| try: |
| raw_html=response.content.decode('utf-8',errors='ignore') |
| except: |
| raw_html=response.content.decode('cp1252',errors='ignore') |
        soup = BeautifulSoup(raw_html, 'html.parser')
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""
        # Collect substantial text blocks from common content containers.
        content_sections = []
        main_selectors = ['main', 'article', '.content', '.main-content', '.post-content',
                          '#content', '#main', '.entry-content', '.post-body']
        for selector in main_selectors:
            for element in soup.select(selector):
                text = element.get_text().strip()
                if len(text) > 100:
                    content_sections.append(text)
        # Collect navigation links from <nav> and <header> elements.
        nav_links = []
        for nav in soup.find_all(['nav', 'header']):
            for link in nav.find_all('a'):
                link_text = link.get_text().strip()
                link_href = link.get('href', '')
                if link_text and link_href:
                    nav_links.append(f"{link_text}: {link_href}")
        # Rewrite image URLs to absolute form. urljoin resolves scheme-relative
        # (//cdn...), root-relative (/img...), and relative paths against the
        # page URL, and leaves already-absolute URLs untouched.
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)
            else:
                # Fall back to the lazy-loading attribute when src is missing.
                data_src = img.get('data-src', '')
                if data_src:
                    img['src'] = urljoin(url, data_src)
        # Rewrite background-image URLs in inline style attributes and in
        # <style> blocks to absolute form, using one shared pattern.
        bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'

        def absolutize_css_urls(css_text):
            for match in re.findall(bg_pattern, css_text, re.IGNORECASE):
                css_text = css_text.replace(match, urljoin(url, match))
            return css_text

        for element in soup.find_all(attrs={'style': True}):
            element['style'] = absolutize_css_urls(element.get('style', ''))
        for style in soup.find_all('style'):
            if style.string:
                style.string = absolutize_css_urls(style.string)
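
        # For reference, how urljoin resolves each form (hypothetical base
        # URL, not from the original code):
        #   urljoin('https://example.com/a/', '//cdn.io/x.png') -> 'https://cdn.io/x.png'
        #   urljoin('https://example.com/a/', '/img.png')       -> 'https://example.com/img.png'
        #   urljoin('https://example.com/a/', 'img.png')        -> 'https://example.com/a/img.png'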
        # Record image metadata and probe the first few URLs for reachability.
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src:
                images.append({'src': src, 'alt': alt})

        def test_image_url(img_url):
            try:
                # HEAD keeps the probe cheap; servers that reject HEAD are
                # treated as unreachable.
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except requests.RequestException:
                return False

        working_images = [img for img in images[:10] if test_image_url(img['src'])]

        # Strip HTML comments and collapse whitespace to shrink the payload.
        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
        # Fall back to a URL-derived title when the page supplied none.
        if not title_text or title_text == "No title found":
            title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"
| if len(cleaned_html.strip())<100: |
| website_content=f""" |
| WEBSITE REDESIGN - EXTRACTION FAILED |
| ==================================== |
| URL: {url} |
| Title: {title_text} |
| ERROR: Could not extract meaningful HTML content from this website. This could be due to: |
| 1. The website uses heavy JavaScript to load content dynamically |
| 2. The website has anti-bot protection |
| 3. The website requires authentication |
| 4. The website is using advanced compression or encoding |
| FALLBACK APPROACH: |
| Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can: |
| 1. Create a typical layout for this type of website |
| 2. Use placeholder content that would be appropriate |
| 3. Include modern design elements and responsive features |
| 4. Use a clean, professional design with good typography |
| 5. Make it mobile-friendly and accessible |
| This will help me create a better design for you.""" |
| return website_content.strip() |
| website_content=f""" |
| WEBSITE REDESIGN - ORIGINAL HTML CODE |
| ===[TRUNCATED FOR BREVITY]===""" |
| return website_content.strip() |
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return ("Error: Website blocked access (403 Forbidden). This website may have "
                    "anti-bot protection. Try a different website or provide a description "
                    "of what you want to build instead.")
        elif e.response.status_code == 404:
            return "Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"