import re

import numpy as np
import requests
import torch
from bs4 import BeautifulSoup
from torch import nn
|
|
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    torch.cuda.set_device(device)
print(device)
|
|
def extract_text_from_link(url):
    """Fetch a web page and return its visible text, e.g. to build a corpus."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
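
# The corpus below is hard-coded, but extract_text_from_link could supply it
# instead. A hedged example (the URL is an assumption, based on the excerpt's
# apparent source, the Wikipedia article on deep learning):
#
#   doc = extract_text_from_link('https://en.wikipedia.org/wiki/Deep_learning')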
|
|
|
|
|
|
| doc = """The word "deep" in "deep learning" refers to the number of layers through which the data is transformed. More precisely, |
| deep learning systems have a substantial credit assignment path (CAP) depth. The CAP is the chain of transformations from input to |
| output. CAPs describe potentially causal connections between input and output. For a feedforward neural network, the depth of the |
| CAPs is that of the network and is the number of hidden layers plus one (as the output layer is also parameterized). For recurrent |
| neural networks, in which a signal may propagate through a layer more than once, the CAP depth is potentially unlimited.[13] No |
| universally agreed-upon threshold of depth divides shallow learning from deep learning, but most researchers agree that deep |
| learning involves CAP depth higher than 2. CAP of depth 2 has been shown to be a universal approximator in the sense that it |
| can emulate any function.[14] Beyond that, more layers do not add to the function approximator ability of the network. Deep |
| models (CAP > 2) are able to extract better features than shallow models and hence, extra layers help in learning the features |
| effectively.""" |
|
|
|
|
class Text2Words:
    def __init__(self, document):
        # Unique words of the document, later padded with spaces to equal length.
        self.text_all = re.findall(r'\b[A-Za-z]+\b', document)
        self.text = list(set(self.text_all))
        self.chars_all = ''.join(self.text)
        self.chars = self.unique_chars(self.chars_all)
        # Character-to-index maps used for one-hot encoding.
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {char: ind for ind, char in self.int2char.items()}
        self.maxlen = len(max(self.text, key=len))
        self.update_text()
        # Input/target pairs shifted by one character, then index-encoded.
        self.input_seq_char, self.target_seq_char = self.get_seq_char(self.text)
        self.input_seq_index, self.target_seq_index = self.get_seq(
            self.char2int, self.input_seq_char, self.target_seq_char, len(self.text))
        self.dict_size = len(self.char2int)
        self.seq_len = self.maxlen - 1
        self.batch_size = len(self.text)
        self.input_seq = self.one_hot_encode(
            self.input_seq_index, self.dict_size, self.seq_len, self.batch_size)
|
|
    def one_hot_encode(self, sequence, dict_size, seq_len, batch_size):
        # Array of zeros with the desired output shape.
        features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
        # Set the entry for the relevant character index to 1 at each position.
        for i in range(batch_size):
            for u in range(seq_len):
                features[i, u, sequence[i][u]] = 1
        return features
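
    # A hedged worked example: with dict_size=3, the sequence [[0, 2]] encodes
    # to [[[1, 0, 0], [0, 0, 1]]], i.e. shape (batch_size=1, seq_len=2, dict_size=3).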
|
|
    def get_seq(self, char2int, input_seq_char, target_seq_char, n):
        # Map every character sequence to its list of integer indices.
        x = []
        y = []
        for i in range(n):
            x.append([char2int[character] for character in input_seq_char[i]])
            y.append([char2int[character] for character in target_seq_char[i]])
        return x, y
|
|
    def get_seq_char(self, text):
        input_seq = []
        target_seq = []
        for i in range(len(text)):
            # Remove the last character from the input sequence
            input_seq.append(text[i][:-1])
            # Remove the first character from the target sequence
            target_seq.append(text[i][1:])
        return input_seq, target_seq
|
|
    def unique_chars(self, chars_all):
        # Collect the distinct characters, keeping the padding space available.
        chars = []
        for letter in chars_all:
            if letter not in chars:
                chars.append(letter)
        if ' ' not in chars:
            chars.append(' ')
        return sorted(chars)
|
|
    def update_text(self):
        # Pad every word with trailing spaces up to the longest word's length.
        for i in range(len(self.text)):
            self.text[i] = self.text[i].ljust(self.maxlen)
|
|
    def description(self):
        # Group the unique words by their first character and print them sorted.
        text = {}
        for word in self.text:
            char = word[0]
            if char not in text:
                text[char] = []
            text[char].append(word.strip())
        for k, v in sorted(text.items()):
            print(f'{k} : {sorted(v)}')
|
|
    def length_analysis(self):
        # Group the document's unique words by length and print a count per length.
        text = {}
        words = set(self.text_all)
        for word in words:
            n = len(word)
            if n not in text:
                text[n] = []
            text[n].append(word.strip())
        for k, v in sorted(text.items()):
            print(f'{k} : count = {len(v)} list = {sorted(v)}')
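
# A quick sanity check (hedged sketch, not part of the original flow): every
# unique word becomes one padded training sequence, so input_seq has shape
# (number of unique words, maxlen - 1, number of unique characters).
#
#   t2w = Text2Words(doc)
#   print(t2w.input_seq.shape)   # (batch_size, seq_len, dict_size)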
|
|
|
|
def create_object(doc):
    return Text2Words(doc)
|
|
|
|
def get_inputs(obj):
    input_seq = torch.tensor(obj.input_seq, device=device)
    target_seq_index = torch.tensor(obj.target_seq_index, device=device)
    return input_seq, target_seq_index
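
# Shapes, following Text2Words above: input_seq is a float32 tensor of shape
# (batch_size, seq_len, dict_size); target_seq_index holds the integer class
# indices that nn.CrossEntropyLoss expects as targets.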
|
|
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Defining the layers: a vanilla RNN followed by a fully connected layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        # Initialize the hidden state for the first input
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        # Reshape the outputs so they can be fed to the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden
|
|
    def init_hidden(self, batch_size):
        # The first hidden state is all zeros: (n_layers, batch_size, hidden_dim).
        hidden = torch.zeros((self.n_layers, batch_size, self.hidden_dim), device=device)
        return hidden
|
|
def create_model(obj):
    model = Model(input_size=obj.dict_size, output_size=obj.dict_size,
                  hidden_dim=2 * obj.dict_size, n_layers=1)
    model.to(device)
    lr = 0.01
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, criterion, optimizer
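
# The script loads a pre-trained checkpoint below; this loop is a hedged
# sketch of how such a checkpoint could be produced (n_epochs and the
# logging interval are arbitrary choices, not from the original).
def train(model, criterion, optimizer, input_seq, target_seq_index, n_epochs=200):
    model.train()
    for epoch in range(1, n_epochs + 1):
        optimizer.zero_grad()
        output, hidden = model(input_seq)
        # CrossEntropyLoss expects (N, C) logits against (N,) class indices.
        loss = criterion(output, target_seq_index.view(-1).long())
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print(f'Epoch {epoch}/{n_epochs}, loss: {loss.item():.4f}')
    return model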
|
|
def predict(model, character):
    # Relies on the module-level Text2Words instance `obj` created below.
    # Encode the characters as integer indices, then one-hot vectors.
    character = np.array([[obj.char2int[c] for c in character]])
    character = obj.one_hot_encode(character, obj.dict_size, character.shape[1], 1)
    character = torch.tensor(character, device=device)
    with torch.no_grad():
        out, hidden = model(character)
    # Softmax over the vocabulary for the last time step, then take the argmax.
    prob = nn.functional.softmax(out[-1], dim=0).data
    char_ind = torch.max(prob, dim=0)[1].item()
    return obj.int2char[char_ind], hidden
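
# For example, predict(model, 'de') returns the most likely character to
# follow 'de' (hedged: the result depends on the checkpoint's weights),
# together with the final hidden state.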
|
|
def sample(model, out_len, start='h'):
    model.eval()
    chars = [ch for ch in start]
    char = chars[-1]
    chars = chars[:-1]
    # Keep predicting until the model emits the padding space that marks the
    # end of a word, bounded by out_len to avoid an endless loop.
    while char != ' ' and len(chars) < out_len:
        chars.append(char)
        char, h = predict(model, chars)
    return ''.join(chars)
|
|
|
|
def load_checkpoint(filepath):
    checkpoint = torch.load(filepath, map_location=device)
    # The checkpoint stores the full model object plus its trained weights.
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    # Freeze the parameters: the loaded model is used for inference only.
    for parameter in model.parameters():
        parameter.requires_grad = False
    model.eval()
    return model
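
# Hedged sketch of the writer side implied by load_checkpoint: the checkpoint
# is assumed to be a dict holding the full model object under 'model' and its
# weights under 'state_dict'.
def save_checkpoint(model, filepath):
    checkpoint = {'model': model, 'state_dict': model.state_dict()}
    torch.save(checkpoint, filepath)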
|
|
obj = create_object(doc)
model = load_checkpoint('checkpoint.pth')

# Complete a word that starts with 'ap' (this corpus contains, e.g., 'approximator').
print(sample(model, obj.maxlen, 'ap'))
|
|