File size: 2,844 Bytes

b123f1a

import codecs,json
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import pandas as pd


def create_bins(lower_bound, width, quantity):
    bins = []
    for low in range(lower_bound, 
                     lower_bound + quantity*width + 1, width):
        bins.append((low, low+width)) 
    return bins

def find_bin(value, bins):
    for i in range(0, len(bins)):
        if bins[i][0] <= value <= bins[i][1]:
            return i
    print(value)

from collections import Counter
bins = create_bins(lower_bound = 15,
                   width = 7,
                   quantity=5)
# bins_len = [(0,15),(16, 22), (23, 29), (30, 36), (37, 43), (44, 50), (51, 57),(58,67),(68,100)]
bins_tree = [(0,2),(3,5),(6,8),(9,11),(12,20)]
bins = [(0,5),(6,8),(9,12),(13,16),(17,20),(21,25),(26,28),(29,1000)]

def getxy(task,language):
    obj_text = codecs.open(f"./gold/{language}/multi_concat_tel_shifted.txt", 'r', encoding='utf-8').read()
    d = json.loads(obj_text) 
    df = pd.read_csv(f"./gold/{language}/{task}.csv")
    y = df[f"{task}"].to_numpy()

    if task == "senlen":
        for i in range(len(y)):
            y[i] = find_bin(y[i],bins)
        return d,y
    

    # For Obj and Sub Number, data needs to be pruned before training the classifier
    elif task == "subnum" or task =="objnum":
        new_d = {}
        for i in np.arange(12):
          new_d[i] = []
        for i in range(0,12):
          for j in range(len(y)):
            if y[j] == "sg" or y[j] == "pl":
              new_d[i].append(d[str(i)][j])

        y_new = []
        for i in range(len(y)):
          if(y[i] == "pl"):
            y_new.append(1)
          elif(y[i] == "sg"):
            y_new.append(0)

        y_new = np.array(y_new)
        return new_d,y_new

    # For WordContent, we need to change input data
    elif task == "wordcontent":
        new_d = {}
        for i in np.arange(12):
          new_d[i] = []
        for i in range(0,12):
          for j in range(len(y)):
            new_d[i].append(d[str(i)][df["index"][j]])
        return new_d,y
    
    else:
        # print(len(d[0]))
        # print(len(y))
        # exit(0)
        return d,y
 

tasks = ["senlen","objnum","subnum","treedepth","wordcontent","bshift"]
# temp = ["str","no","no","str","no","str"]
task = tasks[5]
language = "telugu"
d,y = getxy(task,language)
# print(len(d))
# print(len(y))
# exit(0)


for i in range(0,12):
  X_train, X_test, y_train, y_test = train_test_split(d[str(i)], y, test_size=0.2, random_state=42)

  clf = LogisticRegression(random_state=0, multi_class = "multinomial",max_iter = 250).fit(X_train, y_train)
  # clf = LogisticRegressionCV(cv=5,random_state=0,max_iter=1000).fit(np.array(d[i]), y)

  print(i, clf.score(X_test, y_test))