| from modules import * |
| from pathlib import Path |
| import pandas as pd |
| from flask import Flask, render_template, request |
| import nltk |
| import pickle |
| from nltk.corpus import stopwords |
| from nltk.stem import WordNetLemmatizer |
| from joblib import load |
| import sklearn |
| import ssl |
|
|
# Work around machines with broken certificate stores (needed for NLTK
# downloads): fall back to unverified HTTPS contexts when CPython exposes
# the private hook.
_unverified_ctx = getattr(ssl, "_create_unverified_context", None)
if _unverified_ctx is not None:
    ssl._create_default_https_context = _unverified_ctx
| |
| |
| |
| |
|
|
def check_file_type(file):
    """Validate an uploaded file's extension; save accepted files.

    Only ``.eml`` and ``.txt`` uploads are accepted (case-insensitive).
    Returns a short status string describing the outcome.
    """
    suffix = Path(file.filename).suffix.lower()
    # Guard clause: reject anything that is not an email/text file.
    if suffix not in ('.eml', '.txt'):
        return "Please select .eml or .txt file."
    save_file(file)
    return 'Extracted Features'
|
|
def save_file(file):
    """Persist an uploaded file into the local ``email files/`` directory.

    Fixes over the original:
    - uses only the basename of ``file.filename`` so a crafted upload name
      such as ``../../x`` cannot escape the target directory (path traversal);
    - writes with an explicit utf-8 encoding (the original decoded utf-8 but
      wrote with the platform default, corrupting text on some systems);
    - creates the target directory if it does not exist yet.
    """
    target_dir = Path('email files')
    target_dir.mkdir(parents=True, exist_ok=True)
    # Path(...).name strips any directory components from the client-supplied name.
    file_path = target_dir / Path(file.filename).name
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(file.read().decode('utf-8'))
|
|
def text_feature(filepath):
    """Return a one-row DataFrame holding the whitespace-normalised body text.

    Fix: the original only assigned ``textlist`` inside the non-empty branch,
    so an email with empty body text raised NameError (or fell through and
    returned None). ``' '.join(text.split())`` handles the empty string
    naturally, so the normalisation is now unconditional; ``None`` from
    get_text is coerced to "".
    """
    text = get_text(filepath)
    # Collapse all whitespace runs (newlines, tabs, multiple spaces) to single spaces.
    normalized = ' '.join((text or "").split())
    return pd.DataFrame([[normalized]], columns=['text'])
|
|
def html_tags_feature(filepath):
    """Return a one-row DataFrame with the email's HTML tag names joined by spaces.

    Fix: the original stored a Python list ``[]`` in the 'tags' column when no
    tags were found, while every other row holds a string — an inconsistent
    dtype that breaks the downstream text vectorizer. ``' '.join`` already
    yields "" for an empty sequence, so no special case is needed; ``None``
    from the helper is also coerced to "".
    """
    tags = get_tags_from_html(get_html_general(filepath))
    taglist = ' '.join(tags or [])
    return pd.DataFrame([[taglist]], columns=['tags'])
|
|
def extra_feature(filepath):
    """Collect authentication / behaviour flags for one email as a 1-row DataFrame.

    Each helper may return None (missing) or a boolean; those are normalised
    to 0/1 while any other value (e.g. the SPF result code) passes through.
    """
    raw_values = [
        check_spf(filepath),
        check_dkim(filepath),
        check_dmarc(filepath),
        check_deliver_receiver(filepath),
        check_encript(filepath),
        get_onclicks(filepath),
        check_popWindow(filepath),
    ]

    def _normalise(value):
        # None and False both map to 0, True maps to 1; everything else is kept.
        if value is None or value is False:
            return 0
        if value is True:
            return 1
        return value

    row = [_normalise(value) for value in raw_values]
    columns = ['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC',
               'Deliver-to Matches Receiver', 'Message_encrtpted',
               'Onclick_events', 'Popwindow']
    return pd.DataFrame([row], columns=columns)
|
|
def num_feature(filepath):
    """Collect the numeric richness/URL statistics for one email as a 1-row DataFrame.

    Helpers that return None (feature not measurable) are recorded as 0.
    """
    values = [
        get_body_richness(filepath),
        get_num_FunctionWords(filepath),
        get_sbj_richness(filepath),
        get_num_urls(filepath),
        get_num_urls_ip(filepath),
        get_num_image_urls(filepath),
        get_num_domain_urls(filepath),
        get_num_url_ports(filepath),
        get_chars_sender(filepath),
    ]
    cleaned = [0 if value is None else value for value in values]
    columns = ['body richness', 'Include function words', 'Subject richness',
               'Numers of URLs', 'IPURLs', 'ImageURLs', 'DomainURLs',
               'URLs contain port information', 'Characters in senders']
    return pd.DataFrame([cleaned], columns=columns)
def get_features(filepath):
    """Build the full single-row feature frame for one email file.

    Concatenates the text, HTML-tag, numeric, and extra feature groups
    column-wise, in that order.
    """
    frames = [
        text_feature(filepath),
        html_tags_feature(filepath),
        num_feature(filepath),
        extra_feature(filepath),
    ]
    return pd.concat(frames, axis=1)
|
|
|
|
def predict_content(content):
    """Classify email body text with the saved SVM content model.

    Returns "Legitimate" for a 'ham' prediction, otherwise "Phishing".
    """
    clf = load("save_models/SVM_finalcontent.pkl")
    label = clf.predict(preprocess_content(content))[0]
    if label == 'ham':
        return "Legitimate"
    return "Phishing"
|
|
def predict_html(html_tag):
    """Classify an email's HTML tag sequence with the saved stacking model.

    Returns "Legitimate" for a 'ham' prediction, otherwise "Phishing".
    """
    clf = load("save_models/Stack_tag.pkl")
    label = clf.predict(preprocess_html(html_tag))[0]
    if label == 'ham':
        return "Legitimate"
    return "Phishing"
|
|
def predict_num(num_df):
    """Classify the numeric feature row with the saved random-forest model.

    Returns "Legitimate" for a 'ham' prediction, otherwise "Phishing".
    """
    clf = load("save_models/RF_Num.pkl")
    label = clf.predict(preprocess_num(num_df))[0]
    if label == 'ham':
        return "Legitimate"
    return "Phishing"
|
|
def predict_extra(extra_df):
    """Classify the extra (auth/behaviour) feature row with the saved RF model.

    Returns "Legitimate" for a 'ham' prediction, otherwise "Phishing".
    """
    clf = load("save_models/RF_extra.pkl")
    label = clf.predict(preprocess_extra(extra_df))[0]
    if label == 'ham':
        return "Legitimate"
    return "Phishing"
|
|
def preprocess_content(content):
    """Vectorise raw body text with the fitted TF-IDF vectoriser from disk."""
    # NOTE(review): unpickling model artefacts is only safe for trusted files.
    with open('vectorizer/content_tfidf.pickle', 'rb') as handle:
        vectoriser = pickle.load(handle)
    return vectoriser.transform(content)
|
|
def preprocess_html(html_tag):
    """Vectorise the HTML-tag text with the fitted CountVectorizer from disk."""
    with open('vectorizer/html_cv.pickle', 'rb') as handle:
        vectoriser = pickle.load(handle)
    return vectoriser.transform(html_tag)
|
|
def preprocess_num(num_df):
    """Scale the numeric feature frame with the fitted scaler from disk."""
    with open('vectorizer/num_scaler.pkl', 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler.transform(num_df.values)
|
|
def preprocess_extra(extra_df):
    """Scale the extra feature frame with the fitted scaler from disk."""
    with open('vectorizer/extra_scaler.pkl', 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler.transform(extra_df.values)
|
|
|
|
lemmatizer = WordNetLemmatizer()


def customtokenize(str):
    """Tokenise text, drop English stopwords, and lemmatise the remainder.

    Performance fix: the original evaluated ``stopwords.words('english')``
    inside the filter for EVERY token (re-reading the corpus list each time,
    O(tokens x stopwords)). The stopword set is now built once per call and
    membership tests are O(1).

    NOTE(review): the parameter shadows the builtin ``str``; renaming it
    would break any keyword callers, so it is kept as-is.
    """
    tokens = nltk.word_tokenize(str)
    stop_words = set(stopwords.words('english'))
    nostop = [token for token in tokens if token not in stop_words]
    return [lemmatizer.lemmatize(word) for word in nostop]