JagritiRawat
/

IndicBertology

Model card Files Files and versions

IndicBertology / src /classifier.py

JagritiRawat's picture

Add files using upload-large-folder tool

b123f1a verified 3 days ago

history blame contribute delete

2.84 kB

	import codecs,json
	import numpy as np
	from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
	from sklearn.model_selection import train_test_split
	import pandas as pd


	def create_bins(lower_bound, width, quantity):
	    """Build consecutive ``(low, low + width)`` intervals starting at *lower_bound*.

	    Left edges step by *width* from ``lower_bound`` up to and including
	    ``lower_bound + quantity * width``. For a positive *width* and a
	    non-negative *quantity* the result therefore holds ``quantity + 1``
	    tuples, and each tuple's upper edge equals the next tuple's lower edge.

	    Returns:
	        A list of ``(low, high)`` tuples in ascending order.
	    """
	    last_start = lower_bound + quantity * width
	    starts = range(lower_bound, last_start + 1, width)
	    return [(start, start + width) for start in starts]

	def find_bin(value, bins):
	    """Return the index of the first interval in *bins* that contains *value*.

	    Each entry of *bins* is read as ``(low, high)`` and both ends are
	    inclusive. When no interval matches, *value* is printed and ``None``
	    is returned.
	    """
	    for position, interval in enumerate(bins):
	        if interval[0] <= value <= interval[1]:
	            return position
	    # No interval matched: echo the value so the uncovered case is visible.
	    print(value)
	    return None

	from collections import Counter
	# Bin tables: every tuple is a (low, high) range; find_bin() treats both ends
	# as inclusive and maps a value to the index of the first range containing it.
	#
	# `bins` is the table getxy() passes to find_bin() for the "senlen" task.
	# It was previously first assigned from create_bins(lower_bound=15, width=7,
	# quantity=5) and then immediately overwritten by this literal; that dead
	# assignment has been dropped.
	# An earlier layout, kept for reference:
	# bins_len = [(0,15),(16, 22), (23, 29), (30, 36), (37, 43), (44, 50), (51, 57),(58,67),(68,100)]
	bins = [(0, 5), (6, 8), (9, 12), (13, 16), (17, 20), (21, 25), (26, 28), (29, 1000)]

	# Not referenced by the code visible in this file; presumably intended for
	# the "treedepth" task -- TODO confirm.
	bins_tree = [(0, 2), (3, 5), (6, 8), (9, 11), (12, 20)]

	def _select_rows(layer_features, row_ids, num_layers):
	    """Return ``{str(layer): [rows picked by row_ids]}`` for layers ``0..num_layers-1``."""
	    return {
	        str(layer): [layer_features[str(layer)][row] for row in row_ids]
	        for layer in range(num_layers)
	    }


	def getxy(task, language, num_layers=12):
	    """Load the per-layer features and the labels for one task.

	    Reads ``./gold/{language}/multi_concat_tel_shifted.txt`` as JSON (a dict
	    whose string keys ``"0"``, ``"1"``, ... each map to a list of rows) and
	    ``./gold/{language}/{task}.csv``, whose ``task`` column holds the labels.
	    NOTE(review): the feature file name is the same for every language --
	    confirm that is intended.

	    Task-specific handling:

	    * ``"senlen"``: every label is replaced by its index in the module-level
	      ``bins`` table via ``find_bin``.
	    * ``"subnum"`` / ``"objnum"``: only rows labelled ``"sg"`` or ``"pl"`` are
	      kept; labels become ``0`` for ``"sg"`` and ``1`` for ``"pl"``.
	    * ``"wordcontent"``: feature rows are picked through the CSV's ``index``
	      column.
	    * anything else: features and labels are returned as loaded.

	    Args:
	        task: Task name; also the CSV file stem and label column name.
	        language: Sub-directory of ``./gold`` to read from.
	        num_layers: How many layer keys (``"0"`` .. ``str(num_layers - 1)``)
	            to include when the features have to be rebuilt.

	    Returns:
	        ``(features, labels)``. ``features`` is always keyed by layer index
	        as a string, so callers can use ``features[str(i)]`` for every task;
	        ``labels`` is a NumPy array.
	    """
	    feature_path = f"./gold/{language}/multi_concat_tel_shifted.txt"
	    # Context manager so the file handle is closed even if parsing fails.
	    with open(feature_path, encoding="utf-8") as handle:
	        d = json.load(handle)
	    df = pd.read_csv(f"./gold/{language}/{task}.csv")
	    y = df[task].to_numpy()

	    if task == "senlen":
	        # to_numpy() may hand back a read-only view of the frame's data;
	        # copy before overwriting the labels in place.
	        y = y.copy()
	        for position, length in enumerate(y):
	            y[position] = find_bin(length, bins)
	        return d, y

	    # For Obj and Sub Number, data needs to be pruned before training the classifier
	    if task in ("subnum", "objnum"):
	        # Decide once which rows survive, then reuse that for every layer.
	        keep = [
	            row for row, label in enumerate(y)
	            if label == "sg" or label == "pl"
	        ]
	        labels = np.array([1 if y[row] == "pl" else 0 for row in keep])
	        return _select_rows(d, keep, num_layers), labels

	    # For WordContent, we need to change input data
	    if task == "wordcontent":
	        return _select_rows(d, df["index"].tolist(), num_layers), y

	    return d, y


	# Task names accepted by getxy(); index 5 selects "bshift".
	tasks = ["senlen", "objnum", "subnum", "treedepth", "wordcontent", "bshift"]
	task = tasks[5]
	language = "telugu"
	d, y = getxy(task, language)

	# For each layer key "0".."11": hold out 20% of the rows, fit a logistic
	# regression on the rest, and print the layer index with its test-split score.
	for i in range(12):
	    X_train, X_test, y_train, y_test = train_test_split(
	        d[str(i)],
	        y,
	        test_size=0.2,
	        random_state=42,
	    )

	    # Alternative tried earlier: LogisticRegressionCV(cv=5, random_state=0, max_iter=1000).
	    clf = LogisticRegression(
	        random_state=0,
	        multi_class="multinomial",
	        max_iter=250,
	    ).fit(X_train, y_train)

	    print(i, clf.score(X_test, y_test))