File size: 3,731 Bytes

b123f1a

import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForSeq2SeqLM, AlbertTokenizer, AutoModelForMaskedLM
import re
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import json

# Experiment configuration. Every path is derived from `language` so the
# inputs and the output always refer to the same corpus directory.
language = "malayalam"
model_name = "ai4bharat/indic-bert"
sen_filepath = f"./gold/{language}/sentences.txt"
shifted_sen_filepath = f"./gold/{language}/shifted_sentences.txt"
# The output file name carries the first three letters of the language.
outpath = f"./gold/{language}/indic_concat_{language[:3]}.txt"


# Load the tokenizer and the masked-LM weights from the same checkpoint
# identifier so the two can never end up pointing at different models.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
print("MODEL LOADED !")

def _read_wrapped_sentences(path):
    """Read one sentence per line from *path* and wrap each in BERT markers.

    Reading stops at the first empty or whitespace-only line (or at end of
    file), so a blank line in the middle of the file truncates the result.

    Returns a list of strings of the form ``"[CLS] <sentence> [SEP]"``.
    """
    wrapped = []
    # Decode explicitly as UTF-8 rather than with the platform default;
    # assumes the corpus files are UTF-8 encoded -- TODO confirm.
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            text = line.strip()
            if not text:
                break
            wrapped.append("[CLS] " + text + " [SEP]")
    return wrapped


sentences = _read_wrapped_sentences(sen_filepath)
shifted_sentences = _read_wrapped_sentences(shifted_sen_filepath)

print("FILES LOADED!")

# d maps the string index "0".."11" to a list holding, for every sentence,
# the vector at token position 0 of hidden_states[index].
# NOTE(review): per the Transformers docs hidden_states[0] is the embedding
# output, so indices 0..11 would leave out the final layer -- confirm intended.
d = {str(i): [] for i in range(12)}

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    for count, sentence in enumerate(sentences):
        # Progress indicator, printed at 1, 101, 201, ...
        if count % 100 == 1:
            print(count)
        # Request truncation explicitly so max_length is enforced without
        # relying on the tokenizer's implicit fallback.
        inputs = tokenizer(
            sentence, return_tensors="pt", truncation=True, max_length=512
        )
        features = model(**inputs, output_hidden_states=True)
        hidden_states = features["hidden_states"]
        for i in range(12):
            # [0][0] selects the first batch item, then its first token.
            d[str(i)].append(hidden_states[i][0][0].tolist())

print("FEATURES LOADED!")

# Persist the per-layer feature lists as a single JSON object at `outpath`.
with open(outpath, "w") as convert_file:
    json.dump(d, convert_file)

print("FEATURES WRITTEN")
# The script stops here; nothing below this line executes while this exit
# remains in place.
raise SystemExit(0)


def create_bins(lower_bound, width, quantity):
    """Build consecutive ``(low, low + width)`` intervals.

    The lower edges run from ``lower_bound`` up to and including
    ``lower_bound + quantity * width`` in steps of ``width``. For a positive
    ``width`` that yields ``quantity + 1`` tuples, with each interval's upper
    edge equal to the next interval's lower edge.
    """
    upper_limit = lower_bound + quantity * width + 1
    return [
        (start, start + width)
        for start in range(lower_bound, upper_limit, width)
    ]

def find_bin(value, bins):
    """Return the index of the first bin that contains *value*.

    Each bin is indexed as ``(low, high)`` and both ends are inclusive.
    Returns ``None`` when no bin matches.
    """
    matches = (
        index
        for index, edges in enumerate(bins)
        if edges[0] <= value <= edges[1]
    )
    return next(matches, None)

from collections import Counter
# Hand-picked bins as inclusive (low, high) ranges. An equal-width set that
# used to be generated here via create_bins() was overwritten immediately by
# the explicit table below, so that dead assignment has been dropped.

# Alternative sentence-length bins kept for reference:
# bins_len = [(0,15),(16, 22), (23, 29), (30, 36), (37, 43), (44, 50), (51, 57),(58,67),(68,100)]
bins_tree = [(0, 2), (3, 5), (6, 8), (9, 11), (12, 20)]
bins = [(0, 5), (6, 8), (9, 12), (13, 16), (17, 20), (21, 25), (26, 28), (29, 200)]


# Load the "len" column and turn every value into its bin index, which is
# the class label used by the classifiers below.
df = pd.read_csv("./gold/marathi/senlen.csv")

lengths = df["len"].tolist()
labels = [find_bin(length, bins) for length in lengths]

# find_bin returns None for a value outside every bin; report that clearly
# instead of letting a None label surface later as an opaque type error.
unbinned = [length for length, label in zip(lengths, labels) if label is None]
if unbinned:
    raise ValueError(
        f"{len(unbinned)} value(s) in 'len' fall outside every bin, "
        f"e.g. {unbinned[0]!r}"
    )

# Build a fresh array rather than overwriting the one from to_numpy(), which
# can be a read-only view of the DataFrame's data.
y = np.array(labels)

# For Obj and Sub Number, data needs to be pruned before training the classifier

# new_d = {}
# for i in np.arange(12):
#   new_d[i] = []
# for i in range(0,12):
#   for j in range(len(y)):
#     if y[j] == "sg" or y[j] == "pl":
#       new_d[i].append(d[i][j])
  

# y_new = []
# for i in range(len(y)):
#   if(y[i] == "pl"):
#     y_new.append(1)
#   elif(y[i] == "sg"):
#     y_new.append(0)

# print(len(y_new))
# print(len(new_d[0]))

# y_new = np.array(y_new)

# For WordContent, we need to change input data
# new_d = {}
# for i in np.arange(12):
#   new_d[i] = []
# for i in range(0,12):
#   for j in range(len(y)):
#     new_d[i].append(d[i][df["index"][j]])  

# print(len(y))
# print(len(d[0]))

# Fit one logistic-regression probe per stored feature index and print its
# accuracy on a held-out 20% split.
for layer in range(12):
    layer_features = d[str(layer)]
    X_train, X_test, y_train, y_test = train_test_split(
        layer_features, y, test_size=0.2, random_state=42
    )

    # NOTE(review): recent scikit-learn releases deprecate `multi_class` --
    # confirm against the installed version.
    probe = LogisticRegression(
        random_state=0, multi_class="multinomial", max_iter=250
    )
    probe.fit(X_train, y_train)
    # Cross-validated alternative that was tried previously:
    # LogisticRegressionCV(cv=5, random_state=0, max_iter=1000)

    print(layer, probe.score(X_test, y_test))

print("JOB DONE!")