import math
|
|
# Characters that users often type with a visually similar substitute
# (mostly Latin lookalikes for Cyrillic letters); when letter_replace is
# enabled, they are mapped back to a canonical form so tokens match
# regardless of which script the user typed in.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
# Characters treated as part of a word; anything else is split off as its own token.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "
|
|
def countwords(x):
    """Count occurrences of each item in x."""
    temp = {}
    for word in x:
        if word not in temp:
            temp[word] = 1
        else:
            temp[word] += 1
    return temp
|
|
def add_dict(a, b):
    """Merge two count dicts, summing the values of shared keys."""
    temp = {}
    for key in a:
        if key in b:
            temp[key] = a[key] + b[key]
        else:
            temp[key] = a[key]
    for key in b:
        if key not in a:
            temp[key] = b[key]
    return temp
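
# Note: countwords(x) behaves like collections.Counter(x), and add_dict(a, b)
# like Counter(a) + Counter(b); all counts here are positive, so Counter's
# dropping of non-positive sums never comes into play.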
|
|
class Chatbot:
    """Bag-of-words chatbot: returns the trained response whose token
    probabilities best match the input message."""

    def __init__(self, name=None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
        self.name = name
        self.letter_replace = letter_replace  # normalize lookalike characters in input
        self.frequency_weight = frequency_weight  # 0..1: how much response frequency affects scores
        self.div_by_len = div_by_len  # normalize scores by query length
        self.model = {}
        self.n = n - 1  # features include n-grams up to length n
        if data is not None:
            self.train(data)
    def tokenize(self, text: str):
        # Normalize characters, splitting everything non-alphabetic into its own token.
        preprocess = ""
        for x in text.lower():
            if x in similar_letters and self.letter_replace:
                # Check replacements first, so that mapped characters outside
                # `letters` (e.g. "3" -> "з") are normalized instead of split off.
                preprocess += similar_letters[x]
            elif x in letters:
                preprocess += x
            else:
                preprocess += " " + x + " "
        tokens = preprocess.split()
        output = tokens.copy()
        # Append n-grams of length 2..n as additional features.
        for size in range(2, self.n + 2):
            for start in range(len(tokens) - size + 1):
                output.append(' '.join(tokens[start:start + size]))
        return output
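    # For example, with n=2 and letter_replace=True:
    #   self.tokenize("как дела?") -> ["как", "дела", "?", "как дела", "дела ?"]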
    def train(self, data: dict):
        """Fit the model on {message: response} pairs."""
        lendata = len(data)
        lendata_div = 1 / lendata
        for x in data:
            if data[x] not in self.model:
                self.model[data[x]] = {"word count": countwords(self.tokenize(x)), "probabilities": {}, "weight count": 1, "weight": 0}
            else:
                self.model[data[x]]["word count"] = add_dict(countwords(self.tokenize(x)), self.model[data[x]]["word count"])
                self.model[data[x]]["weight count"] += 1
        for x in self.model:
            # Convert raw counts to probabilities and compute each response's
            # weight: the share of training pairs that map to it.
            probabilities = {}
            div = 1 / math.fsum(self.model[x]["word count"].values())
            for word in self.model[x]["word count"]:
                probabilities[word] = self.model[x]["word count"][word] * div
            self.model[x]["probabilities"] = probabilities
            self.model[x]["weight"] = self.model[x]["weight count"] * lendata_div
    def get_responses(self, text: str):
        """Score every known response against the input; return (response, score) pairs, best first."""
        tokens = self.tokenize(text)
        lentokens = len(tokens)
        lentokens_div = 1 / lentokens if lentokens else 0  # guard against empty input
        scores = []
        for choice in self.model:
            # Sum the probabilities of every query token under this response.
            score = 0
            for token in tokens:
                if token in self.model[choice]["probabilities"]:
                    score += self.model[choice]["probabilities"][token]
            if self.div_by_len:
                score *= lentokens_div
            # Blend in how common the response was in training, controlled by frequency_weight.
            score *= self.frequency_weight * self.model[choice]["weight"] + (1 - self.frequency_weight)
            scores.append((choice, score))
        return sorted(scores, key=lambda x: x[1], reverse=True)
    def __call__(self, text: str):
        return self.get_responses(text)[0][0]  # best-scoring response
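
# A minimal usage sketch (hypothetical training pairs, for illustration only):
#   bot = Chatbot(n=2, data={"привет": "Привет", "пока": "Пока"})
#   bot("привет")  # -> "Привет"
# Each candidate response r is scored against query tokens t_1..t_k as
#   score(r) = (sum over i of P(t_i | r)) * (w * weight(r) + (1 - w)),
# where w is frequency_weight and weight(r) is the fraction of training
# pairs that map to r.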
|
|
if __name__ == "__main__":
    import json
|
|
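    # dataset.json is assumed to map user messages to responses, e.g.:
    #   {"привет": "Привет", "как дела": "Хорошо", "пока": "Пока"}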
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)
|
|
    cb = Chatbot(data=data)
    while True:
        message = input("User: ")
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":  # the dataset's goodbye reply ends the session
            break
|
|