| import codecs,json |
| import numpy as np |
| from sklearn.linear_model import LogisticRegression, LogisticRegressionCV |
| from sklearn.model_selection import train_test_split |
| import pandas as pd |
|
|
|
|
def create_bins(lower_bound, width, quantity):
    """Return equal-width ``(low, low + width)`` intervals as a list of tuples.

    Interval starts run from ``lower_bound`` in steps of ``width`` up to and
    including ``lower_bound + quantity * width``.  For a positive ``width``
    and a non-negative ``quantity`` that yields ``quantity + 1`` intervals,
    and neighbouring intervals share an endpoint.

    Args:
        lower_bound: Integer start of the first interval.
        width: Integer step between starts; also the size of every interval.
        quantity: Integer number of steps taken past ``lower_bound``.

    Returns:
        List of ``(low, high)`` tuples in ascending order of ``low``.
    """
    # The "+ 1" makes the final start value itself part of the range.
    stop = lower_bound + quantity * width + 1
    return [(start, start + width) for start in range(lower_bound, stop, width)]
|
|
def find_bin(value, bins):
    """Return the position of the first interval in ``bins`` containing ``value``.

    Both ends of an interval are inclusive, so a value lying on a boundary
    shared by two intervals is assigned to the earlier one.

    Args:
        value: The number to classify.
        bins: Sequence of intervals; element 0 of each is the lower bound and
            element 1 the upper bound.

    Returns:
        The zero-based index of the matching interval, or ``None`` when no
        interval matches (the unmatched value is printed in that case).
    """
    for position, interval in enumerate(bins):
        if interval[0] <= value <= interval[1]:
            return position
    # No interval matched: echo the offending value so it can be spotted.
    print(value)
    return None
|
|
| from collections import Counter |
# Hand-written (low, high) intervals.  ``find_bin`` treats both ends as
# inclusive, so consecutive integer ranges do not overlap.
#
# The previous ``bins = create_bins(lower_bound=15, width=7, quantity=5)``
# assignment was dropped: its result was overwritten by the literal list
# below before anything read it.

# NOTE(review): not referenced in the visible code; presumably meant for the
# "treedepth" task -- confirm before relying on it.
bins_tree = [(0, 2), (3, 5), (6, 8), (9, 11), (12, 20)]

# Intervals used by ``getxy`` to turn "senlen" labels into class indices.
bins = [
    (0, 5),
    (6, 8),
    (9, 12),
    (13, 16),
    (17, 20),
    (21, 25),
    (26, 28),
    (29, 1000),
]
|
|
def getxy(task, language):
    """Load the features and labels for one task/language pair.

    Reads ``./gold/{language}/multi_concat_tel_shifted.txt`` (a JSON object
    indexed by the strings ``"0"`` .. ``"11"``, each holding one entry per
    example -- presumably one key per model layer, confirm) and
    ``./gold/{language}/{task}.csv`` (labels in the column named ``task``).

    Task-specific handling:

    * ``"senlen"``: every label is replaced, in place, by the index that
      ``find_bin`` returns for it against the module-level ``bins``.
    * ``"subnum"`` / ``"objnum"``: only examples labelled ``"sg"`` or
      ``"pl"`` are kept; labels become ``0`` for ``"sg"`` and ``1`` for
      ``"pl"``.
    * ``"wordcontent"``: the feature rows are picked using the CSV's
      ``"index"`` column, one row per CSV line.
    * anything else: features and labels are returned untouched.

    Args:
        task: Task name; selects the CSV file, the label column and the
            handling above.
        language: Sub-directory of ``./gold`` to read from.

    Returns:
        ``(features, labels)`` where ``features`` is a dict keyed by the
        strings ``"0"`` .. ``"11"`` in every branch and ``labels`` is a numpy
        array.  (Earlier the filtered branches used integer keys, which made
        ``features[str(i)]`` lookups fail for those tasks.)
    """
    features_path = f"./gold/{language}/multi_concat_tel_shifted.txt"
    # The context manager closes the handle; it used to be left open.
    with open(features_path, "r", encoding="utf-8") as handle:
        d = json.load(handle)
    df = pd.read_csv(f"./gold/{language}/{task}.csv")
    y = df[f"{task}"].to_numpy()

    if task == "senlen":
        for i, length in enumerate(y):
            y[i] = find_bin(length, bins)
        return d, y

    # Same string keys as the JSON object so every branch returns one shape.
    group_keys = [str(i) for i in range(12)]

    if task == "subnum" or task == "objnum":
        # Decide once which examples survive instead of once per key.
        keep = [j for j, label in enumerate(y) if label == "sg" or label == "pl"]
        new_d = {key: [d[key][j] for j in keep] for key in group_keys}
        y_new = np.array([1 if y[j] == "pl" else 0 for j in keep])
        return new_d, y_new

    if task == "wordcontent":
        rows = df["index"].to_numpy()
        new_d = {key: [d[key][j] for j in rows] for key in group_keys}
        return new_d, y

    return d, y
| |
|
|
# Task names; the entry picked below selects which CSV/label column is loaded.
tasks = ["senlen", "objnum", "subnum", "treedepth", "wordcontent", "bshift"]

task = tasks[5]
language = "telugu"
d, y = getxy(task, language)

# Fit and score one classifier per feature group; groups are looked up in
# ``d`` under the string keys "0" .. "11".
for i in range(12):
    # 80/20 split with a fixed seed, so every group sees the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        d[str(i)],
        y,
        test_size=0.2,
        random_state=42,
    )

    clf = LogisticRegression(
        random_state=0,
        multi_class="multinomial",
        max_iter=250,
    )
    clf = clf.fit(X_train, y_train)

    # Report the group index next to the held-out score.
    print(i, clf.score(X_test, y_test))
|
|
|
|