| import torch |
| from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM, AlbertTokenizer, AutoModelForMaskedLM |
| import re |
| import numpy as np |
| from sklearn.linear_model import LogisticRegression, LogisticRegressionCV |
| from sklearn.model_selection import train_test_split |
| import pandas as pd |
| import numpy as np |
| import json |
|
|
| language = "" |
| model_name = "ai4bharat/indic-bert" |
| sen_filepath = "./gold/malayalam/sentences.txt" |
| shifted_sen_filepath = "./gold/malayalam/shifted_sentences.txt" |
| outpath= f"./gold/{language}/indic_concat_{str(language)[:3]}.txt" |
|
|
|
|
# Load tokenizer and model from the same checkpoint.
# CONSISTENCY FIX: the model was loaded from a duplicated string literal
# ("ai4bharat/indic-bert") instead of reusing `model_name`; changing
# `model_name` would previously have silently desynchronized the two.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
print("MODEL LOADED !")
|
|
# Read one sentence per line and wrap each in BERT-style [CLS]/[SEP] markers.
# FIXES: (1) `re.sub("\n", "", sentence)` discarded its return value (a no-op;
# strip() already removes the newline) — removed. (2) The while/readline loop
# broke at the first empty line, truncating files with interior blank lines;
# iterating the file and skipping blanks reads everything (readline() also
# returns "" at EOF, so EOF handling is unchanged).
sentences = []
with open(sen_filepath, "r") as f:
    for line in f:
        sentence = line.strip()
        if not sentence:
            continue
        sentences.append("[CLS] " + sentence + " [SEP]")
|
|
# Same loading for the shifted-word-order control sentences; identical fixes
# to the `sentences` loader (dead re.sub removed, interior blank lines no
# longer truncate the file).
# NOTE(review): shifted_sentences is never used later in this file — confirm
# whether the probe below was also meant to run on it.
shifted_sentences = []
with open(shifted_sen_filepath, "r") as f:
    for line in f:
        sentence = line.strip()
        if not sentence:
            continue
        shifted_sentences.append("[CLS] " + sentence + " [SEP]")


print("FILES LOADED!")
|
|
# For every sentence, store the position-0 hidden-state vector (the first
# token of the tokenized input) from each of the first 12 hidden states,
# keyed by layer index as a string (JSON-friendly).
# NOTE(review): output_hidden_states yields embeddings + one state per layer
# (13 entries for a 12-layer encoder); indices 0..11 therefore include the
# embedding layer and skip the final layer — confirm this is intended.
count = 0
d = {str(i): [] for i in range(12)}

for sentence in sentences:
    if count % 100 == 1:
        print(count)  # coarse progress indicator
    count += 1
    # truncation=True makes max_length effective; without it the tokenizer
    # ignores max_length (only warning), and over-long inputs crash the model.
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():  # pure inference — no autograd graph needed
        features = model(**inputs, output_hidden_states=True)
    for i in range(12):
        d[str(i)].append(features["hidden_states"][i][0][0].numpy().tolist())


print("FEATURES LOADED!")
|
|
# Persist the per-layer feature lists as JSON.
# BUG FIX: the original read `open(, 'w')` — a syntax error with the filename
# missing; `outpath` is the destination defined at the top of the file.
with open(outpath, 'w') as convert_file:
    json.dump(dict(d), convert_file)


print("FEATURES WRITTEN")
# NOTE(review): this exit() stops the script here, so the probing section
# below (bins / per-layer logistic regression) never runs in a single pass.
# Presumably a leftover from running the script in two phases — confirm.
exit(0)
|
|
|
|
def create_bins(lower_bound, width, quantity):
    """Build consecutive (low, low + width) interval tuples starting at
    lower_bound.

    Note that range(lower_bound, lower_bound + quantity*width + 1, width)
    produces quantity + 1 start values, so quantity + 1 bins are returned.
    """
    starts = range(lower_bound, lower_bound + quantity * width + 1, width)
    return [(start, start + width) for start in starts]
|
|
def find_bin(value, bins):
    """Return the index of the first bin whose inclusive [low, high] range
    contains value, or None if no bin matches.

    Both edges are inclusive, so a value equal to a shared boundary maps to
    the lower-indexed bin.
    """
    for index, (low, high) in enumerate(bins):
        if low <= value <= high:
            return index
    return None
|
|
# NOTE(review): Counter is imported here but never used anywhere in this file.
from collections import Counter
# Evenly spaced length bins; NOTE(review): this `bins` value is immediately
# replaced by the hand-tuned list a few lines below, so the call has no
# lasting effect.
bins = create_bins(lower_bound = 15,
                   width = 7,
                   quantity=5)
|
|
| |
| |
# Hand-tuned sentence-length bins (inclusive ranges, per find_bin).
# NOTE(review): bins_tree is never used later in this file.
bins_tree = [(0,2),(3,5),(6,8),(9,11),(12,20)]
# Overwrites the create_bins(...) result above. The last bin caps lengths at
# 200 — find_bin returns None for anything longer, which would then be
# assigned into the integer label array below and raise.
bins = [(0,5),(6,8),(9,12),(13,16),(17,20),(21,25),(26,28),(29,200)]
|
|
|
|
| df = pd.read_csv("./gold/marathi/senlen.csv") |
|
|
| y = df["len"].to_numpy() |
| for i in range(len(y)): |
| y[i] = find_bin(y[i],bins) |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
# Layer-wise probing: for each of the 12 stored hidden states, fit a
# multinomial logistic-regression classifier on an 80/20 split and print the
# layer index with its held-out accuracy.
for layer in range(12):
    X_train, X_test, y_train, y_test = train_test_split(
        d[str(layer)], y, test_size=0.2, random_state=42
    )

    probe = LogisticRegression(
        random_state=0, multi_class="multinomial", max_iter=250
    )
    probe.fit(X_train, y_train)

    print(layer, probe.score(X_test, y_test))


print("JOB DONE!")
|
|