def get_text_from_html(html_content):
    """Extract the visible text of an HTML document, collapsing every run
    of whitespace (including newlines) into a single space."""
    parsed = BeautifulSoup(html_content, 'html.parser')
    return re.sub(r"\s+", " ", parsed.get_text())
| |
def get_text(file_path):
    """Return the concatenated text/plain payload of the email at *file_path*
    with newlines removed; when the message has no plain-text part, fall back
    to text extracted from its HTML content.

    Bug fix: the fallback branch sat after an unconditional ``return`` and was
    unreachable; the emptiness check now happens before returning.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            # Decode the raw payload; this corpus is Latin-1 encoded.
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    if text_content == "":
        # No plain-text part: derive text from the HTML body instead.
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")
| from bs4 import BeautifulSoup |
| import email |
def get_email_html(file_path):
    """Return the concatenated text/html payload of the email at *file_path*
    with newlines removed, or "" when the message has no HTML part.

    Bug fix: ``str.replace`` returns a new string; the original discarded the
    result, so newlines were never actually removed.
    """
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            # Decode the raw payload; this corpus is Latin-1 encoded.
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    html_content = html_content.replace("\n", "")
    return html_content
|
|
| |
def get_html(file_path):
    """Scan a raw email file for an inline <html> ... </html> section and
    return its whitespace-separated tokens concatenated (no separators), or
    "" when no exact ``<html>`` token is found.

    Cleanup: removed the unused ``tag_list`` local and a dead no-op
    ``html_content.replace`` call (the tokens come from ``str.split`` and can
    never contain newlines), plus stray semicolons. Behavior is unchanged.
    """
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        html_flag = False
        html_content = ""
        for line in file:
            for word in line.split():
                if word == "<html>":
                    html_flag = True
                if html_flag:
                    html_content += word
                if word == "</html>":
                    html_flag = False
    if html_content == "":
        return ""
    return html_content
|
|
def get_html_general(file_path):
    """Return the message's HTML: prefer the MIME text/html part, falling
    back to scanning the raw file for an inline <html> section.

    Fix: the original called ``get_email_html`` twice, parsing the whole
    email file a second time just to return the same value.
    """
    email_html = get_email_html(file_path)
    if email_html != "":
        return email_html
    return get_html(file_path)
def get_onclicks(file_path):
    """Count elements carrying an ``onClick`` attribute in the message's
    HTML; None when the message has no HTML content at all."""
    html = get_html_general(file_path)
    if html == "":
        return None
    soup = BeautifulSoup(html, 'html.parser')
    return len(soup.find_all(attrs={'onClick': True}))
def check_popWindow(file_path):
    """Return True when any <script> in the message's HTML mentions
    ``window.open`` (popup behavior), False otherwise, or None when the
    message has no HTML content.

    Robustness: the matcher now skips None strings itself instead of raising
    TypeError inside BeautifulSoup; the except is kept as a safety net.
    """
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    try:
        scripts = soup.find_all(
            'script',
            text=lambda text: text is not None and 'window.open' in text,
        )
        return bool(scripts)
    except TypeError:
        return False
|
|
def check_spf(file_path):
    """Map the message's Received-SPF header result onto a small code:

    1 = pass, 2 = neutral, 3 = softfail, 0 = any other result or a
    missing/empty header.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_spf_header = message.get('Received-SPF')
    # Missing or empty header: treat as "no SPF result" (also avoids the
    # IndexError split()[0] would raise on an empty value).
    if not received_spf_header:
        return 0
    spf_result = received_spf_header.split()[0].lower()
    if spf_result == 'pass':
        return 1
    if spf_result == 'neutral':
        return 2
    if spf_result == 'softfail':
        return 3
    return 0
def check_dkim(file_path):
    """Return 1 when the Authentication-Results header contains the exact
    token ``dkim=pass``, else 0 (including when the header is absent)."""
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    return 1 if 'dkim=pass' in auth.split() else 0
def check_dmarc(file_path):
    """Return 1 when the Authentication-Results header contains the exact
    token ``dmarc=pass``, else 0 (including when the header is absent)."""
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    return 1 if 'dmarc=pass' in auth.split() else 0
def check_deliver_receiver(filepath):
    """Return 1 when the Delivered-To header equals the To header (both
    missing counts as equal), else 0."""
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    same = message.get('Delivered-To') == message.get('To')
    return 1 if same else 0
def check_encript(filepath):
    """Return 1 when any Received header mentions the substring 'version'
    (a TLS/SSL version stamp, i.e. an encrypted hop), else 0.

    Fix: ``get_all`` returns None when no Received header exists; the
    original leaned on the TypeError raised by iterating None. Check
    explicitly instead of using exceptions for control flow.
    """
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_headers = message.get_all('Received')
    if not received_headers:
        return 0
    for received_header in received_headers:
        if 'version' in received_header:
            return 1
    return 0
def get_tags_from_html(html_content):
    """Return the name of every tag in the document, in document order
    (duplicates included)."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return [tag.name for tag in soup.find_all()]
| import ipaddress |
| from urllib.parse import urlparse |
| import urllib.request |
| from bs4 import BeautifulSoup |
| import re |
| import email |
|
|
| |
def get_urls_from_html(html_content):
    """Collect the href target of every anchor whose href is an absolute
    http/https URL, in document order."""
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = []
    for anchor in soup.find_all('a'):
        target = anchor.get('href')
        if target and re.match('^https?://', target):
            urls.append(target)
    return urls
def get_text(file_path):
    """Return the concatenated text/plain payload of the email at *file_path*
    with newlines removed; when the message has no plain-text part, fall back
    to text extracted from its HTML content.

    Bug fix: the fallback branch sat after an unconditional ``return`` and was
    unreachable; the emptiness check now happens before returning.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            # Decode the raw payload; this corpus is Latin-1 encoded.
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    if text_content == "":
        # No plain-text part: derive text from the HTML body instead.
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")
def get_num_words(file_path):
    """Count whitespace-separated words in the message body: plain text
    preferred, HTML-derived text as fallback, 0 when neither exists.

    Fix: the original called ``get_text`` (a full email parse) twice — once
    for the emptiness test and once for the split. Extract each body once.
    """
    text = get_text(file_path)
    if text != "":
        return len(text.split())
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0
|
|
| |
def get_num_chars(file_path):
    """Count non-space characters in the message body: plain text preferred,
    HTML-derived text as fallback, 0 when neither exists.

    Fix: the original called ``get_text`` (a full email parse) twice — once
    for the emptiness test and once for the count. Extract each body once.
    """
    text = get_text(file_path)
    if text != "":
        return len(text.replace(" ", ""))
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0
|
|
| |
def get_body_richness(filepath):
    """Words-per-character ratio of the message body; 0 for an empty body."""
    num_chars = get_num_chars(filepath)
    if num_chars == 0:
        return 0
    return get_num_words(filepath) / num_chars
|
|
| |
def get_num_FunctionWords(file_path):
    """Count how many known phishing "function words" occur (as exact
    whitespace-separated tokens) in the message body; None when the message
    has neither text nor HTML content. Each word counts at most once.

    Fix: the original extracted the body twice per branch (emptiness test
    plus split); extract once. A set makes each membership test O(1).
    """
    function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
    text = get_text(file_path)
    if text != "":
        words = text.split()
    else:
        html = get_html_general(file_path)
        if html == "":
            return None
        words = get_text_from_html(html).split()
    word_set = set(words)
    return sum(1 for w in function_words if w in word_set)
|
|
|
|
def get_email_html(file_path):
    """Return the concatenated text/html payload of the email at *file_path*
    with newlines removed, or "" when the message has no HTML part.

    Bug fix: ``str.replace`` returns a new string; the original discarded the
    result, so newlines were never actually removed.
    """
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            # Decode the raw payload; this corpus is Latin-1 encoded.
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    html_content = html_content.replace("\n", "")
    return html_content
|
|
| |
def get_num_sbj(file_path):
    """Number of whitespace-separated words in the Subject header."""
    return len(get_subject(file_path).split())
def get_subject(file_path):
    """Return the first Subject header of the message with whitespace runs
    collapsed to single spaces; "" when there is no Subject header."""
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    subject = ""
    for name, value in message.items():
        if name == "Subject":
            subject = value
            break
    return re.sub(r"\s+", " ", str(subject))
|
|
|
|
def get_sender(file_path):
    """Return the first From header of the message, or None when it is
    missing (or present but empty)."""
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    sender = ""
    for name, value in message.items():
        if name == "From":
            sender = value
            break
    return None if sender == "" else sender
|
|
| |
def get_num_sbjChar(file_path):
    """Number of characters in the whitespace-normalized Subject header."""
    return len(get_subject(file_path))
|
|
| |
def get_sbj_richness(file_path):
    """Words-per-character ratio of the Subject; 0 for an empty Subject."""
    chars = get_num_sbjChar(file_path)
    if chars == 0:
        return 0
    return get_num_sbj(file_path) / chars
|
|
| |
def get_num_urls_ip(file_path):
    """Count anchor URLs whose host is a literal IP address (a classic
    phishing signal); 0 when the message has no HTML content.

    Cleanup: removed the redundant ``from urllib.parse import urlparse``
    executed on every loop iteration — urlparse is already imported at
    module level.
    """
    content = get_html_general(file_path)
    if content == "":
        return 0
    num_ip = 0
    for url in get_urls_from_html(content):
        hostname = urlparse(url).hostname
        try:
            # ip_address raises ValueError for ordinary domain names.
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            pass
    return num_ip
|
|
| |
def get_num_urls(file_path):
    """Number of absolute anchor URLs in the message's HTML; None when the
    message contains no such URLs."""
    urls = get_urls_from_html(get_html_general(file_path))
    return len(urls) if urls else None
|
|
| |
def get_num_image_urls(file_path):
    """Count anchors with an href that wrap an <img> tag (image links)."""
    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
    # recursive/limit/string defaults were spelled out in the original;
    # plain find_all('a', href=True) is equivalent.
    count = 0
    for link in soup.find_all('a', href=True):
        if link.find('img'):
            count += 1
    return count
| |
| |
| |
| |
| |
|
|
| |
def get_num_domain_urls(file_path):
    """Number of distinct host strings among the message's anchor URLs.

    NOTE(review): the pattern requires a '/' after the host, so a bare URL
    like 'http://example.com' (no trailing slash) is not counted — behavior
    preserved from the original.
    """
    urls = get_urls_from_html(get_html_general(file_path))
    matches = (re.search(r'https?://([^/]+)/', url) for url in urls)
    domains = {m.group(1) for m in matches if m}
    return len(domains)
|
|
|
|
| |
def get_num_url_ports(file_path):
    """Count anchor URLs that carry an explicit port number."""
    urls = get_urls_from_html(get_html_general(file_path))
    return sum(1 for url in urls if urlparse(url).port)
|
|
|
|
| |
def get_chars_sender(file_path):
    """Length of the From header's string form (a missing header yields
    None, which stringifies to 'None' → length 4, as in the original)."""
    return len(str(get_sender(file_path)))