| |
| """G project.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/13NvZhwwfiJloW8ZsdQ6HLf-jfSRc-tfv |
| """ |
|
|
| !wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-train.txt" |
| !wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-dev.txt" |
| !wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-test-tweets.txt" |
| !wget "https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-taskA-gold-labels.txt" |
|
|
| import pandas as pd |
| import csv |
| # The shared-task files are headerless TSVs; read this way, the first tweet of each file is |
| # promoted to the header row, and the cells below rename/drop those accidental column names. |
| train_data = pd.read_csv("OSACT2022-sharedTask-train.txt", sep="\t", quoting=csv.QUOTE_NONE) |
| dev_data = pd.read_csv("OSACT2022-sharedTask-dev.txt", sep="\t", quoting=csv.QUOTE_NONE) |
| test_data = pd.read_csv("OSACT2022-sharedTask-test-tweets.txt", sep="\t", quoting=csv.QUOTE_NONE) |
| train_data |
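
| # Sketch of an alternative load that keeps the first tweet as data: the files are headerless, so |
| # passing explicit column names stops row 0 from being promoted to the header. The six-column |
| # order below is an assumption (id, tweet, offensive, hate speech, vulgar, violence), and the |
| # `*_named` variable is illustrative only; the cells below keep working on `train_data`. |
| osact_cols = ["id", "Text", "label", "hs_label", "vlg_label", "vio_label"] |
| train_data_named = pd.read_csv("OSACT2022-sharedTask-train.txt", sep="\t", |
|                                quoting=csv.QUOTE_NONE, header=None, names=osact_cols) |
| train_data_named.head() |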
|
|
| train_data = train_data.drop(columns=['1', 'NOT_HS', 'NOT_VLG' , 'NOT_VIO']) |
| train_data |
|
|
| train_data = train_data.rename(columns={"@USER ردينا ع التطنز 😏👊🏻": "Text"}) |
| train_data = train_data.rename(columns={"OFF": "label"}) |
| train_data |
|
|
| dev_data |
|
|
| dev_data = dev_data.drop(columns=['8888', 'NOT_HS', 'NOT_VLG' , 'NOT_VIO']) |
|
|
| dev_data = dev_data.rename(columns={"@USER افطرت عليك بعقاء واثنين من فروخها الجن 🔪😂": "Text"}) |
| dev_data = dev_data.rename(columns={"NOT_OFF": "label"}) |
| dev_data |
|
|
| test_data |
|
|
| test_data = test_data.drop(columns=['10158']) |
|
|
| test_data = test_data.rename(columns={"@USER هتهزر معايا ولا ايه 😡😡😡😡": "Text"}) |
| test_data |
|
|
| test_labels = pd.read_csv("OSACT2022-sharedTask-test-taskA-gold-labels.txt", sep="\t", quoting=csv.QUOTE_NONE) |
| test_labels = test_labels.rename(columns={"NOT_OFF": "label"}) |
| test_data = test_data.join(test_labels) |
| test_data |
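
| # Quick sanity check (a sketch): the tweets file and the gold-label file must line up row-for-row |
| # for the index join above to be meaningful. |
| assert len(test_data) == len(test_labels), "test tweets and gold labels are misaligned" |
| test_data["label"].value_counts() |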
|
|
| """# **DOWNLOADING A LIST OF ARABIC STOPWORDS**""" |
|
|
| |
| |
|
|
| !wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt |
| # Build the Arabic stop-word list (one word per line in the downloaded file). |
| arabic_stop_words = [] |
| with open('./stop_list_1177.txt', encoding='utf-8') as f: |
|     for word in f.readlines(): |
|         arabic_stop_words.append(word.split("\n")[0]) |
|
|
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import WordPunctTokenizer |
| from nltk.stem.isri import ISRIStemmer |
| import string |
| import re |
| from bs4 import BeautifulSoup |
| nltk.download('stopwords') |
|
|
|
|
| tok = WordPunctTokenizer() |
|
|
| def normalize_arabic(text): |
| text = re.sub("[إأآا]", "ا", text) |
| text = re.sub("ى", "ي", text) |
| text = re.sub("ؤ", "ء", text) |
| text = re.sub("ئ", "ء", text) |
| text = re.sub("ة", "ه", text) |
| text = re.sub("گ", "ك", text) |
| return text |
|
|
|
|
| def remove_diacritics(text): |
| arabic_diacritics = re.compile(""" |
| ّ | # Tashdid |
| َ | # Fatha |
| ً | # Tanwin Fath |
| ُ | # Damma |
| ٌ | # Tanwin Damm |
| ِ | # Kasra |
| ٍ | # Tanwin Kasr |
| ْ | # Sukun |
| ـ # Tatwil/Kashida |
| """, re.VERBOSE) |
| return re.sub(arabic_diacritics, '', text) |
|
|
|
|
| def remove_punctuations(text): |
| arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' |
| english_punctuations = string.punctuation |
| punctuations_list = arabic_punctuations + english_punctuations |
| translator = str.maketrans('', '', punctuations_list) |
| return text.translate(translator) |
|
|
|
|
| def remove_repeating_char(text): |
| |
| return re.sub(r'(.)\1+', r'\1\1', text) |
|
|
| def remove_stop_words(text): |
| word_list = nltk.tokenize.wordpunct_tokenize(text.lower()) |
| word_list = [ w for w in word_list if not w in arabic_stop_words] |
| return (" ".join(word_list)).strip() |
|
|
|
|
|
|
| def remove_non_arabic_letters(text): |
| text = re.sub(r'([@A-Za-z0-9_]+)|#|http\S+', ' ', text) |
| text = re.sub(r'ـــــــــــــ', '', text) |
| return text |
|
|
|
|
|
|
|
|
| def clean_str(text): |
| text = remove_non_arabic_letters(text) |
| text = remove_punctuations(text) |
| text = remove_diacritics(text) |
| text = remove_repeating_char(text) |
| |
|
|
| |
| soup = BeautifulSoup(text, 'lxml') |
| souped = soup.get_text() |
| pat1 = r'@[A-Za-z0-9]+' |
| pat2 = r'https?://[A-Za-z0-9./]+' |
| combined_pat = r'|'.join((pat1, pat2)) |
| stripped = re.sub(combined_pat, '', souped) |
| try: |
| clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?") |
| except: |
| clean = stripped |
|
|
| words = tok.tokenize(clean) |
| return (" ".join(words)).strip() |
|
|
| """## **applying preprocessing on our dataset**""" |
|
|
| print("Cleaning and parsing the training dataset...\n") |
|
|
| train_data["Text"] = train_data["Text"].apply(lambda x: clean_str(x)) |
|
|
| train_data.head() |
|
|
| print("Cleaning and parsing the development dataset...\n") |
|
|
| dev_data["Text"] = dev_data["Text"].apply(lambda x: clean_str(x)) |
|
|
| dev_data.head() |
|
|
| print("Cleaning and parsing the test dataset...\n") |
|
|
| test_data["Text"] = test_data["Text"].apply(lambda x: clean_str(x)) |
|
|
| test_data.head() |
|
|
| label2id = {"NOT_OFF": 0,"OFF": 1} |
| id2label = {0: "NOT_OFF", 1: "OFF"} |
|
|
| train_data['label'] = train_data['label'].apply(lambda x: label2id[x]) |
| train_data=train_data[["Text", "label"]] |
| train_data.head() |
|
|
| dev_data['label'] = dev_data['label'].apply(lambda x: label2id[x]) |
| dev_data=dev_data[["Text", "label"]] |
| dev_data.head() |
|
|
| test_data['label'] = test_data['label'].apply(lambda x: label2id[x]) |
| test_data=test_data[["Text", "label"]] |
| test_data |
|
|
| import pandas as pd |
| from imblearn.over_sampling import RandomOverSampler |
| from collections import Counter |
|
|
| X = train_data[['Text']] |
| y = train_data['label'] |
|
|
| print('Original class distribution:', Counter(y)) |
|
|
| ros = RandomOverSampler(random_state=42) |
|
|
| X_resampled, y_resampled = ros.fit_resample(X, y) |
|
|
| train_data_resampled = pd.DataFrame(X_resampled, columns=['Text']) |
| train_data_resampled['label'] = y_resampled |
|
|
| print('Resampled class distribution:', Counter(y_resampled)) |
|
|
| y_resampled.value_counts() |
|
|
| train_data_resampled.head() |
|
|
| from sklearn.model_selection import train_test_split |
|
|
| X_train = train_data_resampled['Text'].values |
| y_train = train_data_resampled['label'].values |
|
|
| X_val = dev_data['Text'].values |
| y_val = dev_data['label'].values |
|
|
|
|
|
|
| print("Training data shape:", X_train.shape, y_train.shape) |
| print("Validation data shape:", X_val.shape, y_val.shape) |
|
|
| train_text_lengths = [len(text.split()) for text in X_train] |
| max_length = max(train_text_lengths) |
|
|
| print("Maximum length of text:", max_length) |
|
|
| """### APPLYING QARIB MODEL""" |
|
|
| ! pip install transformers[torch] |
|
|
| import numpy as np |
|
|
| |
| from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score |
|
|
| from transformers import AutoConfig, BertForSequenceClassification, AutoTokenizer |
| from transformers.data.processors import SingleSentenceClassificationProcessor, InputFeatures |
| from transformers import Trainer , TrainingArguments |
|
|
| train_df = pd.DataFrame({ |
| 'label':y_train, |
| 'text': X_train |
| }) |
|
|
| dev_df = pd.DataFrame({ |
| 'label':y_val, |
| 'text': X_val |
| }) |
|
|
| test_df = pd.DataFrame({ |
| 'label':test_data['label'], |
| 'text': test_data['Text'] |
| }) |
|
|
| # Arabic clitic prefixes and suffixes; they are combined with "+" below to build the list of |
| # tokens the tokenizer must never split. |
| PREFIX_LIST = ["ال", "و", "ف", "ب", "ك", "ل", "لل", "س"] |
| SUFFIX_LIST = [ |
|     "ه", "ها", "ك", "ي", "هما", "كما", "نا", "كم", "هم", "هن", "كن", |
|     "ا", "ان", "ين", "ون", "وا", "ات", "ت", "ن", "ة", |
| ] |
|
|
|
|
| |
| _PREFIX_SYMBOLS = [x + "+" for x in PREFIX_LIST] |
| _SUFFIX_SYMBOLS = ["+" + x for x in SUFFIX_LIST] |
| NEVER_SPLIT_TOKENS = list(set(_PREFIX_SYMBOLS + _SUFFIX_SYMBOLS)) |
|
|
| model_name = "qarib/bert-base-qarib" |
| num_labels = 2 |
| config = AutoConfig.from_pretrained(model_name,num_labels=num_labels, output_attentions=True) |
| tokenizer = AutoTokenizer.from_pretrained(model_name, |
| do_lower_case=False, |
| do_basic_tokenize=True, |
| never_split=NEVER_SPLIT_TOKENS) |
| tokenizer.model_max_length = 64  # cap tweet sequences at 64 tokens |
| model = BertForSequenceClassification.from_pretrained(model_name, config=config) |
|
|
| train_dataset = SingleSentenceClassificationProcessor(mode='classification') |
| dev_dataset = SingleSentenceClassificationProcessor(mode='classification') |
|
|
| train_dataset.add_examples(texts_or_text_and_labels=train_df['text'],labels=train_df['label'],overwrite_examples = True) |
| dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],labels=dev_df['label'],overwrite_examples = True) |
| print(train_dataset.examples[0]) |
|
|
| train_features = train_dataset.get_features(tokenizer = tokenizer, max_length =64) |
| dev_features = dev_dataset.get_features(tokenizer = tokenizer, max_length =64) |
| |
|
|
| print(len(train_features)) |
| print(len(dev_features)) |
|
|
| def compute_metrics(p): |
|     # p.predictions is a tuple (logits, attentions) because the model was loaded with |
|     # output_attentions=True, so the class scores live in p.predictions[0]. |
| print(np.shape(p.predictions[0])) |
| print(np.shape(p.predictions[1])) |
| print(len(p.label_ids)) |
| preds = np.argmax(p.predictions[0], axis=1) |
| assert len(preds) == len(p.label_ids) |
| print(classification_report(p.label_ids,preds)) |
| print(confusion_matrix(p.label_ids,preds)) |
|
|
| macro_f1 = f1_score(p.label_ids,preds,average='macro') |
| macro_precision = precision_score(p.label_ids,preds,average='macro') |
| macro_recall = recall_score(p.label_ids,preds,average='macro') |
| acc = accuracy_score(p.label_ids,preds) |
| return { |
| 'macro_f1' : macro_f1, |
| 'macro_precision': macro_precision, |
| 'macro_recall': macro_recall, |
| 'accuracy': acc |
| } |
|
|
| ! mkdir train |
| training_args = TrainingArguments("./train") |
| training_args.do_train = True |
| # legacy flag for periodic evaluation during training (newer transformers use evaluation_strategy="steps") |
| training_args.evaluate_during_training = True |
| training_args.adam_epsilon = 1e-8 |
| training_args.learning_rate = 2e-5 |
| training_args.warmup_steps = 0 |
| training_args.per_device_train_batch_size = 64 |
| training_args.per_device_eval_batch_size = 64 |
| training_args.num_train_epochs = 2 |
| training_args.logging_steps = 300 |
| training_args.save_steps = 2000 |
| training_args.seed = 42 |
| print(training_args.logging_steps) |
|
|
| |
| trainer = Trainer(model=model, |
| args = training_args, |
| train_dataset = train_features, |
| eval_dataset = dev_features, |
| compute_metrics = compute_metrics) |
| |
| trainer.train() |
|
|
| trainer.evaluate() |
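
| # Persist the fine-tuned weights (a sketch; the path is arbitrary) so they can be reloaded later, |
| # e.g. by the Gradio demo at the end of the notebook, instead of re-downloading the base checkpoint. |
| trainer.save_model("./train/qarib-offensive-finetuned") |
| tokenizer.save_pretrained("./train/qarib-offensive-finetuned") |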
|
|
| !pip install fasttext |
| import fasttext |
| import fasttext.util |
| from huggingface_hub import hf_hub_download |
|
|
| model_path = hf_hub_download(repo_id="facebook/fasttext-ar-vectors", filename="model.bin") |
| |
| model_fasttext = fasttext.load_model(model_path) |
| |
| |
| print(len(model_fasttext.words)) |
| model_fasttext['خبز'].shape  # embedding dimensionality, probed with the Arabic word for "bread" |
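
| # Small sanity check on the Arabic fastText vectors (the probe word, "dog", is arbitrary): the |
| # nearest neighbours should be related word forms. |
| model_fasttext.get_nearest_neighbors("كلب", k=5) |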
|
|
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import WordPunctTokenizer |
| from nltk.stem.isri import ISRIStemmer |
| import string |
| import re |
| from bs4 import BeautifulSoup |
| nltk.download('stopwords') |
|
|
|
|
| tok = WordPunctTokenizer() |
|
|
| def normalize_arabic(text): |
| text = re.sub("[إأآا]", "ا", text) |
| text = re.sub("ى", "ي", text) |
| text = re.sub("ؤ", "ء", text) |
| text = re.sub("ئ", "ء", text) |
| text = re.sub("ة", "ه", text) |
| text = re.sub("گ", "ك", text) |
| return text |
|
|
|
|
| def remove_diacritics(text): |
| arabic_diacritics = re.compile(""" |
| ّ | # Tashdid |
| َ | # Fatha |
| ً | # Tanwin Fath |
| ُ | # Damma |
| ٌ | # Tanwin Damm |
| ِ | # Kasra |
| ٍ | # Tanwin Kasr |
| ْ | # Sukun |
| ـ # Tatwil/Kashida |
| """, re.VERBOSE) |
| return re.sub(arabic_diacritics, '', text) |
|
|
|
|
| def remove_punctuations(text): |
| arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' |
| english_punctuations = string.punctuation |
| punctuations_list = arabic_punctuations + english_punctuations |
| translator = str.maketrans('', '', punctuations_list) |
| return text.translate(translator) |
|
|
|
|
| def remove_repeating_char(text): |
| |
| return re.sub(r'(.)\1+', r'\1\1', text) |
|
|
| def remove_stop_words(text): |
| |
| englishStopWords = stopwords.words('english') |
|
|
| all_stopwords = set(englishStopWords + arabic_stop_words) |
|
|
| word_list = nltk.tokenize.wordpunct_tokenize(text.lower()) |
| word_list = [ w for w in word_list if not w in all_stopwords ] |
| return (" ".join(word_list)).strip() |
|
|
| def get_root(text): |
| word_list = nltk.tokenize.wordpunct_tokenize(text.lower()) |
| result = [] |
| arstemmer = ISRIStemmer() |
| for word in word_list: result.append(arstemmer.stem(word)) |
| return (' '.join(result)).strip() |
|
|
| def clean_tweet(text): |
| text = re.sub(r'([@A-Za-z0-9_]+)|#|http\S+', ' ', text) |
| text = re.sub(r'ـــــــــــــ', '', text) |
| return text |
|
|
|
|
|
|
|
|
| def clean_str(text): |
| text = clean_tweet(text) |
| |
| text = remove_punctuations(text) |
| text = remove_diacritics(text) |
| text = remove_repeating_char(text) |
| |
|
|
|
|
| text = text.replace('وو', 'و') |
| text = text.replace('يي', 'ي') |
| text = text.replace('اا', 'ا') |
|
|
| |
|
|
| soup = BeautifulSoup(text, 'lxml') |
| souped = soup.get_text() |
| pat1 = r'@[A-Za-z0-9]+' |
| pat2 = r'https?://[A-Za-z0-9./]+' |
| combined_pat = r'|'.join((pat1, pat2)) |
| stripped = re.sub(combined_pat, '', souped) |
| try: |
| clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?") |
| except: |
| clean = stripped |
|
|
| words = tok.tokenize(clean) |
| return (" ".join(words)).strip() |
|
|
| !gdown "165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd" |
| !gdown "1WdgbvqDYIa-g5ijjsz5zb-3lVvUXUtmS&confirm=t" |
| !gdown "1foNTGFjhWAxS-_SfF7rga80UmFT7BDJ0&confirm=t" |
|
|
| !pip install pyarabic |
| !pip install farasapy |
| !pip install transformers[torch] |
| !pip install Keras-Preprocessing |
|
|
| ! git clone https://github.com/facebookresearch/fastText.git |
| ! cd fastText && sudo pip install . |
|
|
| from transformers import pipeline |
| unmasker_MARBERT = pipeline('fill-mask', model='UBC-NLP/MARBERT', top_k=50) |
|
|
| def light_preprocess(text): |
| text = clean_tweet(text) |
| |
| text = remove_punctuations(text) |
| text = remove_diacritics(text) |
| text = remove_repeating_char(text) |
| text = text.replace('وو', 'و') |
| text = text.replace('يي', 'ي') |
| text = text.replace('اا', 'ا') |
| return text |
|
|
| nltk.download('stopwords') |
| englishStopWords = stopwords.words('english') |
| arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' |
| english_punctuations = string.punctuation |
| punctuations_list = arabic_punctuations + english_punctuations |
|
|
| all_stopwords = set(englishStopWords + arabic_stop_words) |
|
|
| !pip install torch |
|
|
| import torch |
| |
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
| def classsify_tweets(tweet): |
| df = pd.DataFrame({"tweet": tweet}) |
| df['clean_tweet'] = df['tweet'].apply(lambda x: clean_str(x)) |
|
|
| dev_df = pd.DataFrame({ |
| 'id':range(len(df)), |
| 'text': df["clean_tweet"] |
| }) |
|
|
| test_example = SingleSentenceClassificationProcessor(mode='classification') |
| test_example.add_examples(texts_or_text_and_labels=dev_df['text'], overwrite_examples = True) |
|
|
| test_features = test_example.get_features(tokenizer = tokenizer, max_length =64) |
|
|
| input_ids = [i.input_ids for i in test_features] |
| attention_masks = [i.attention_mask for i in test_features] |
|
|
| inputs = torch.tensor(input_ids) |
| masks = torch.tensor(attention_masks) |
|
|
| |
| model.eval() |
|
|
| |
| model.to(device) |
|
|
| torch.cuda.empty_cache() |
| |
| inputs = inputs.to(device) |
| masks = masks.to(device) |
|
|
| |
| output = model(inputs, attention_mask=masks)["logits"] |
| |
| output = output.cpu().detach().numpy() |
|
|
| return output |
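
| # Example call on one of the tweets quoted earlier in this notebook: the function returns raw |
| # logits, so argmax plus id2label recovers the predicted class (a quick sketch). |
| example_logits = classsify_tweets(["@USER هتهزر معايا ولا ايه 😡😡😡😡"]) |
| print(id2label[int(np.argmax(example_logits, axis=1)[0])]) |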
|
|
| size = len(test_data) |
| print("size of test set:", size) |
| correct_class_tweets = [] |
| correct_class = [] |
| for i in range(0, size): |
|     txt = test_data['Text'].astype('U')[i] |
|     cls = test_data['label'][i] |
|     pred = np.argmax(classsify_tweets([txt]), axis=1)[0] |
|     # collect the offensive tweets (label 1) that the model classifies correctly; these are the |
|     # candidates for the adversarial perturbation below |
|     if pred == cls and cls == 1: |
|         correct_class_tweets.append(txt) |
|         correct_class.append(cls) |
|
|
| from scipy.spatial import distance |
| from farasa.stemmer import FarasaStemmer |
| frasa_stemmer = FarasaStemmer(interactive=True) |
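
| # The stemmer is used below to reject candidate replacements that merely share a stem with the |
| # masked word; a quick illustrative call (the probe word, "the schools", is arbitrary): |
| print(frasa_stemmer.stem("المدارس")) |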
|
|
| !pip install emoji |
|
|
| import emoji |
|
|
| def select_best_replacement(pos, x_cur, verbose=False): |
| """ Select the most effective replacement to word at pos (pos) in (x_cur)""" |
|
|
| if bool(emoji.emoji_count(x_cur.split()[pos])): |
| return None |
|
|
| embedding_masked_word = model_fasttext[x_cur.split()[pos]] |
|
|
| x_masked = (" ".join(x_cur.split()[:pos]) + " [MASK] " + " ".join(x_cur.split()[pos + 1:])).strip() |
| unmasked_seq = unmasker_MARBERT(x_masked)[:20] |
|
|
| max_sim = -1 |
| best_perturb_dict = {} |
| for seq in unmasked_seq: |
| if frasa_stemmer.stem(seq['token_str']) in frasa_stemmer.stem(x_cur.split()[pos]): |
| continue |
| if seq['token_str'] in punctuations_list or pos >= len(seq["sequence"].split()): |
| continue |
| embedding_masked_word_new = model_fasttext[seq['token_str']] |
| if np.sum(embedding_masked_word) == 0 or np.sum(embedding_masked_word_new) == 0: |
| continue |
| if verbose: print("New word: ", seq['token_str']) |
| sim = 1 - distance.cosine(embedding_masked_word, embedding_masked_word_new) |
| if sim > max_sim: |
| max_sim = sim |
| best_perturb_dict["sim"] = sim |
| best_perturb_dict["Masked word"] = x_cur.split()[pos] |
| best_perturb_dict["New word"] = seq['token_str'] |
| best_perturb_dict["New seq"] = x_cur.replace(x_cur.split()[pos], seq['token_str']) |
|
|
| return best_perturb_dict.get("New seq", None) |
|
|
| |
| perturb_counter = 0 |
| for tweet_ix, tweet in enumerate(correct_class_tweets): |
| print("Tweet index: ", tweet_ix) |
|
|
| x_adv = light_preprocess(tweet) |
| x_len = len(x_adv.split()) |
| orig_class = np.argmax(classsify_tweets([x_adv]), axis=1)[0] |
| orig_label = id2label[orig_class] |
| print(f"Original tweet: {x_adv} : Original label: {orig_label}.") |
| splits = len(x_adv.split()) |
| perturbed_flag = False |
| for split_ix in range(splits): |
| perturbed = select_best_replacement(split_ix, x_adv) |
| if perturbed: |
| new_class = np.argmax(classsify_tweets([perturbed]), axis=1)[0] |
| if orig_class != new_class: |
| print(f"Perturbed tweet: {perturbed} : New label: {id2label[new_class]}.") |
| print(10 * "==") |
| if not perturbed_flag: |
| perturb_counter += 1 |
| perturbed_flag = True |
| if not perturbed_flag: |
| print(10 * "==") |
| print(f"Successful perturbation {perturb_counter} out of {len(correct_class_tweets)}.") |
|
|
| off_tweets_count = sum(test_data['label'] == 1 ) |
| print(f"Number of offensive tweets in the dataset: {off_tweets_count}") |
|
|
| size = len(test_data) |
| print("size of test set:", size) |
| correct_class_tweets = [] |
| correct_class = [] |
| for i in range(0, size): |
|     txt = test_data['Text'].astype('U')[i] |
|     cls = test_data['label'][i] |
|     pred = np.argmax(classsify_tweets([txt]), axis=1)[0] |
|     label = id2label[pred] |
|     print(f"Tweet: {txt} | Actual: {id2label[cls]} | Predicted: {label}") |
|     # the gold label is stored as an id (0/1), so compare ids rather than mixing strings and ints |
|     if pred == cls and label == "OFF": |
|         correct_class_tweets.append(txt) |
|         correct_class.append(cls) |
|         print(f"Correctly classified as OFF: {txt}") |
|
|
| !pip install gradio |
|
|
| import gradio as gr |
| import torch |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
| |
| model_name = "qarib/bert-base-qarib" |
| tokenizer = AutoTokenizer.from_pretrained(model_name) |
| model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) |
|
|
| |
| def light_preprocess(text): |
| text = text.replace("@USER", "").replace("RT", "").strip() |
| return text |
|
|
| |
| def predict_offensive(text): |
| preprocessed_text = light_preprocess(text) |
| inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True) |
| with torch.no_grad(): |
| outputs = model(**inputs) |
| logits = outputs.logits |
| predicted_class = torch.argmax(logits, dim=1).item() |
| return "Offensive" if predicted_class == 1 else "Not Offensive" |
|
|
| |
| iface = gr.Interface( |
| fn=predict_offensive, |
| inputs=gr.Textbox(lines=2, placeholder="Enter text here..."), |
| outputs="text", |
| title="Offensive Language Detection", |
| description="Enter a text to check if it's offensive or not.", |
| ) |
|
|
| |
| iface.launch() |
|
|
| import gradio as gr |
| import torch |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
| |
| model_name_1 = "qarib/bert-base-qarib" |
| model_name_2 = "bert-base-multilingual-cased" |
| tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1) |
| model_1 = AutoModelForSequenceClassification.from_pretrained(model_name_1, num_labels=2) |
|
|
| tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2) |
| model_2 = AutoModelForSequenceClassification.from_pretrained(model_name_2, num_labels=2) |
|
|
| |
| def light_preprocess(text): |
| text = text.replace("@USER", "").replace("RT", "").strip() |
| return text |
|
|
| |
| def predict_offensive(text, model_choice): |
| if model_choice == "Model 1": |
| tokenizer = tokenizer_1 |
| model = model_1 |
| else: |
| tokenizer = tokenizer_2 |
| model = model_2 |
|
|
| preprocessed_text = light_preprocess(text) |
| inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True) |
| with torch.no_grad(): |
| outputs = model(**inputs) |
| logits = outputs.logits |
| predicted_class = torch.argmax(logits, dim=1).item() |
| return "Offensive" if predicted_class == 1 else "Not Offensive" |
|
|
| |
| iface = gr.Interface( |
| fn=predict_offensive, |
| inputs=[ |
| gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"), |
| gr.Dropdown(choices=["Model 1", "Model 2"], label="Select Model") |
| ], |
| outputs=gr.Textbox(label="Prediction"), |
| title="Offensive Language Detection", |
| description="Enter a text to check if it's offensive or not using the selected model.", |
| theme="default", |
| css=".gradio-container { background-color: #f0f0f0; } .output-textbox { font-size: 20px; color: #007BFF; }" |
| ) |
|
|
| |
| iface.launch() |
|
|