import math
|
|
# Characters that users often type with a visually similar substitute
# (mostly Latin lookalikes for Cyrillic letters); when letter_replace is
# enabled, they are mapped back to a canonical form so tokens match
# regardless of which script the user typed in.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
# Characters treated as part of a word; anything else is split off as its own token.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "
|
|
def countwords(x):
    """Count occurrences of each item in x."""
    temp = {}
    for word in x:
        if word not in temp:
            temp[word] = 1
        else:
            temp[word] += 1
    return temp
|
|
def add_dict(a, b):
    """Merge two count dicts, summing the values of shared keys."""
    temp = {}
    for key in a:
        if key in b:
            temp[key] = a[key] + b[key]
        else:
            temp[key] = a[key]
    for key in b:
        if key not in a:
            temp[key] = b[key]
    return temp
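
# Note: countwords(x) behaves like collections.Counter(x), and add_dict(a, b)
# like Counter(a) + Counter(b); all counts here are positive, so Counter's
# dropping of non-positive sums never comes into play.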
|
|
class Chatbot:
    """Bag-of-words chatbot: returns the trained response whose token
    probabilities best match the input message."""

    def __init__(self, name=None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
        self.name = name
        self.letter_replace = letter_replace  # normalize lookalike characters in input
        self.frequency_weight = frequency_weight  # 0..1: how much response frequency affects scores
        self.div_by_len = div_by_len  # normalize scores by query length
        self.model = {}
        self.n = n - 1  # features include n-grams up to length n
        if data is not None:
            self.train(data)
    def tokenize(self, text: str):
        # Normalize characters, splitting everything non-alphabetic into its own token.
        preprocess = ""
        for x in text.lower():
            if x in similar_letters and self.letter_replace:
                # Check replacements first, so that mapped characters outside
                # `letters` (e.g. "3" -> "з") are normalized instead of split off.
                preprocess += similar_letters[x]
            elif x in letters:
                preprocess += x
            else:
                preprocess += " " + x + " "
        tokens = preprocess.split()
        output = tokens.copy()
        # Append n-grams of length 2..n as additional features.
        for size in range(2, self.n + 2):
            for start in range(len(tokens) - size + 1):
                output.append(' '.join(tokens[start:start + size]))
        return output
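    # For example, with n=2 and letter_replace=True:
    #   self.tokenize("как дела?") -> ["как", "дела", "?", "как дела", "дела ?"]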
    def train(self, data: dict):
        """Fit the model on {message: response} pairs."""
        lendata = len(data)
        lendata_div = 1 / lendata
        for x in data:
            if data[x] not in self.model:
                self.model[data[x]] = {"word count": countwords(self.tokenize(x)), "probabilities": {}, "weight count": 1, "weight": 0}
            else:
                self.model[data[x]]["word count"] = add_dict(countwords(self.tokenize(x)), self.model[data[x]]["word count"])
                self.model[data[x]]["weight count"] += 1
        for x in self.model:
            # Convert raw counts to probabilities and compute each response's
            # weight: the share of training pairs that map to it.
            probabilities = {}
            div = 1 / math.fsum(self.model[x]["word count"].values())
            for word in self.model[x]["word count"]:
                probabilities[word] = self.model[x]["word count"][word] * div
            self.model[x]["probabilities"] = probabilities
            self.model[x]["weight"] = self.model[x]["weight count"] * lendata_div
    def get_responses(self, text: str):
        """Score every known response against the input; return (response, score) pairs, best first."""
        tokens = self.tokenize(text)
        lentokens = len(tokens)
        lentokens_div = 1 / lentokens if lentokens else 0  # guard against empty input
        scores = []
        for choice in self.model:
            # Sum the probabilities of every query token under this response.
            score = 0
            for token in tokens:
                if token in self.model[choice]["probabilities"]:
                    score += self.model[choice]["probabilities"][token]
            if self.div_by_len:
                score *= lentokens_div
            # Blend in how common the response was in training, controlled by frequency_weight.
            score *= self.frequency_weight * self.model[choice]["weight"] + (1 - self.frequency_weight)
            scores.append((choice, score))
        return sorted(scores, key=lambda x: x[1], reverse=True)
    def __call__(self, text: str):
        return self.get_responses(text)[0][0]  # best-scoring response
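
# A minimal usage sketch (hypothetical training pairs, for illustration only):
#   bot = Chatbot(n=2, data={"привет": "Привет", "пока": "Пока"})
#   bot("привет")  # -> "Привет"
# Each candidate response r is scored against query tokens t_1..t_k as
#   score(r) = (sum over i of P(t_i | r)) * (w * weight(r) + (1 - w)),
# where w is frequency_weight and weight(r) is the fraction of training
# pairs that map to r.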
|
|
if __name__ == "__main__":
    import json
|
|
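    # dataset.json is assumed to map user messages to responses, e.g.:
    #   {"привет": "Привет", "как дела": "Хорошо", "пока": "Пока"}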
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)
|
|
    cb = Chatbot(data=data)
    while True:
        message = input("User: ")
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":  # the dataset's goodbye reply ends the session
            break
|
|