# IndicBertology/src/ada.py
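"""Layer-wise probing of ai4bharat/indic-bert representations.

This script (as written below) reads gold sentences for one language, extracts
the first-token hidden state from each of the first 12 hidden-state tensors for
every sentence, dumps the per-layer features to JSON, and, in a second stage,
trains one logistic-regression probe per layer (here on a sentence-length
binning task).
"""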
import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM, AlbertTokenizer, AutoModelForMaskedLM
import re
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import pandas as pd
import json
# NOTE: `language` is left as a placeholder here; set it before running so that
# the output path below resolves to a real directory.
language = ""
model_name = "ai4bharat/indic-bert"
sen_filepath = "./gold/malayalam/sentences.txt"
shifted_sen_filepath = "./gold/malayalam/shifted_sentences.txt"
outpath = f"./gold/{language}/indic_concat_{str(language)[:3]}.txt"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
print("MODEL LOADED !")
sentences = []
with open(sen_filepath, "r") as f:
    while True:
        sentence = f.readline()
        sentence = sentence.strip()
        if len(sentence) == 0:
            break
        sentence = re.sub("\n", "", sentence)
        temp = "[CLS] " + sentence + " [SEP]"
        sentences.append(str(temp))
shifted_sentences = []
with open(shifted_sen_filepath, "r") as f:
    while True:
        sentence = f.readline()
        sentence = sentence.strip()
        if len(sentence) == 0:
            break
        sentence = re.sub("\n", "", sentence)
        temp = "[CLS] " + sentence + " [SEP]"
        shifted_sentences.append(str(temp))
print("FILES LOADED!")
count = 0
d = {}
for i in np.arange(12):
    d[str(i)] = []
for sentence in sentences:
    if count % 100 == 1:
        print(count)
    count += 1
    # Truncate to the model's 512-token limit so overly long sentences don't error out.
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True)
    features = model(**inputs, output_hidden_states=True)
    # Keep the first-token representation from each of the first 12 hidden-state tensors.
    for i in range(0, 12):
        d[str(i)].append(features['hidden_states'][i][0][0].detach().numpy().tolist())
print("FEATURES LOADED!")
# Write the per-layer features to the output path configured above.
with open(outpath, 'w') as convert_file:
    json.dump(dict(d), convert_file)
print("FEATURES WRITTEN")
exit(0)
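# Everything below is the probing stage; with exit(0) above it is skipped in this
# run. A minimal sketch for reusing the dumped features in a later run (assuming
# the same outpath and the "0"-"11" layer keys written above):
#     with open(outpath, "r") as f:
#         d = json.load(f)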
def create_bins(lower_bound, width, quantity):
    """Return quantity + 1 consecutive (low, low + width) interval tuples starting at lower_bound."""
    bins = []
    for low in range(lower_bound, lower_bound + quantity * width + 1, width):
        bins.append((low, low + width))
    return bins
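# For example, create_bins(lower_bound=15, width=7, quantity=5) yields
# [(15, 22), (22, 29), (29, 36), (36, 43), (43, 50), (50, 57)].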
def find_bin(value, bins):
    """Return the index of the first bin whose range contains value, or None if no bin matches."""
    for i in range(0, len(bins)):
        if bins[i][0] <= value <= bins[i][1]:
            return i
    return None
from collections import Counter
bins = create_bins(lower_bound=15, width=7, quantity=5)
# print(bins)
# bins_len = [(0,15),(16, 22), (23, 29), (30, 36), (37, 43), (44, 50), (51, 57),(58,67),(68,100)]
bins_tree = [(0,2),(3,5),(6,8),(9,11),(12,20)]
# Hand-tuned sentence-length bins; these override the generated bins above.
bins = [(0,5),(6,8),(9,12),(13,16),(17,20),(21,25),(26,28),(29,200)]
df = pd.read_csv("./gold/marathi/senlen.csv")
y = df["len"].to_numpy()
# Map each raw sentence length onto its bin index to form the class labels.
for i in range(len(y)):
    y[i] = find_bin(y[i], bins)
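# Optional: inspect the label balance across bins before training the probes.
# print(Counter(y))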
# For Obj and Sub Number, data needs to be pruned before training the classifier
# new_d = {}
# for i in np.arange(12):
#     new_d[i] = []
# for i in range(0,12):
#     for j in range(len(y)):
#         if y[j] == "sg" or y[j] == "pl":
#             new_d[i].append(d[i][j])
# y_new = []
# for i in range(len(y)):
#     if(y[i] == "pl"):
#         y_new.append(1)
#     elif(y[i] == "sg"):
#         y_new.append(0)
# print(len(y_new))
# print(len(new_d[0]))
# y_new = np.array(y_new)

# For WordContent, we need to change input data
# new_d = {}
# for i in np.arange(12):
#     new_d[i] = []
# for i in range(0,12):
#     for j in range(len(y)):
#         new_d[i].append(d[i][df["index"][j]])
# print(len(y))
# print(len(d[0]))
# Train one logistic-regression probe per layer and report held-out accuracy.
for i in range(0, 12):
    X_train, X_test, y_train, y_test = train_test_split(d[str(i)], y, test_size=0.2, random_state=42)
    clf = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=250).fit(X_train, y_train)
    # clf = LogisticRegressionCV(cv=5, random_state=0, max_iter=1000).fit(np.array(d[i]), y)
    print(i, clf.score(X_test, y_test))
print("JOB DONE!")