| |
| from tensorflow.keras.models import Model |
| from tensorflow.keras import models |
| from tensorflow.keras.utils import plot_model |
| from tensorflow.keras.layers import Input,LSTM,Dense |
| from sklearn.feature_extraction.text import CountVectorizer |
| import numpy as np |
| import pickle |
|
|
| |
# Accumulators for the parallel English→French corpus and the
# character vocabularies observed in it.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# Each line of eng-french.txt is "english<TAB>french"; only the first
# 10 000 pairs are used.
with open('eng-french.txt', 'r', encoding='utf-8') as f:
    rows = f.read().split('\n')

for row in rows[:10000]:
    # FIX: f.read().split('\n') yields a trailing empty string, and the
    # Tatoeba export sometimes carries a third attribution column — the
    # original bare unpack of row.split('\t') raised ValueError on both.
    parts = row.split('\t')
    if len(parts) < 2:
        continue
    input_text, target_text = parts[0], parts[1]

    # '\t' marks start-of-sequence and '\n' end-of-sequence for the decoder.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text.lower())
    target_texts.append(target_text.lower())

    input_characters.update(list(input_text.lower()))
    target_characters.update(list(target_text.lower()))

# Sorted vocabularies give every character a stable one-hot index.
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))

num_en_chars = len(input_characters)
num_dec_chars = len(target_characters)

# Longest sentences determine the padded sequence dimensions.
max_input_length = max(len(i) for i in input_texts)
max_target_length = max(len(i) for i in target_texts)
|
|
def bagofcharacters(input_texts, target_texts):
    """One-hot encode paired sentence lists for the encoder/decoder.

    Parameters:
        input_texts: list of source sentences (already lower-cased).
        target_texts: list of target sentences wrapped in '\t' ... '\n'.

    Returns:
        (en_in_data, dec_in_data, dec_tr_data) — float32 arrays of shape
        (samples, max_len, vocab_size). dec_tr_data is dec_in_data shifted
        one timestep left (teacher-forcing target).

    Relies on the module-level globals input_characters, target_characters,
    max_input_length and max_target_length computed above.
    """
    en_in_data = []
    dec_in_data = []
    dec_tr_data = []

    # Padding rows: one-hot of input_characters[0] for the encoder and of
    # target_characters[2] for the decoder (presumably ' ', the first
    # printable character after '\t' and '\n' in the sorted vocabulary —
    # TODO confirm against the actual corpus).
    pad_en = [1] + [0] * (len(input_characters) - 1)
    pad_dec = [0] * len(target_characters)
    pad_dec[2] = 1

    # analyzer='char' makes CountVectorizer tokenize per character;
    # binary=True yields 0/1 indicators, i.e. a one-hot row per character.
    # FIX: the vocabularies never change, so fit once outside the loop
    # instead of re-fitting twice per sample; also dropped the tokenizer=
    # argument, which analyzer='char' silently ignores (sklearn warns).
    cv_inp = CountVectorizer(binary=True, analyzer='char').fit(input_characters)
    cv_tar = CountVectorizer(binary=True, analyzer='char').fit(target_characters)

    for i, (input_t, target_t) in enumerate(zip(input_texts, target_texts)):
        en_in_data.append(cv_inp.transform(list(input_t)).toarray().tolist())
        dec_in_data.append(cv_tar.transform(list(target_t)).toarray().tolist())
        # Decoder target is the decoder input shifted left by one step.
        dec_tr_data.append(cv_tar.transform(list(target_t)[1:]).toarray().tolist())

        # Pad every sample out to the corpus-wide maximum lengths so the
        # batch stacks into a rectangular array.
        if len(input_t) < max_input_length:
            for _ in range(max_input_length - len(input_t)):
                en_in_data[i].append(pad_en)
        if len(target_t) < max_target_length:
            for _ in range(max_target_length - len(target_t)):
                dec_in_data[i].append(pad_dec)
        if (len(target_t) - 1) < max_target_length:
            for _ in range(max_target_length - len(target_t) + 1):
                dec_tr_data[i].append(pad_dec)

    # float32 is the dtype Keras layers expect.
    en_in_data = np.array(en_in_data, dtype="float32")
    dec_in_data = np.array(dec_in_data, dtype="float32")
    dec_tr_data = np.array(dec_tr_data, dtype="float32")

    return en_in_data, dec_in_data, dec_tr_data
|
|
| |
# --- Encoder ---
# Variable-length one-hot character sequences.
en_inputs = Input(shape=(None, num_en_chars))
# return_state=True exposes the final hidden/cell states that seed the decoder.
encoder = LSTM(256, return_state=True)
en_outputs, state_h, state_c = encoder(en_inputs)
en_states = [state_h, state_c]

# --- Decoder ---
dec_inputs = Input(shape=(None, num_dec_chars))
# return_sequences=True gives one prediction per timestep (teacher forcing);
# the decoder LSTM is initialized with the encoder's final states.
dec_lstm = LSTM(256, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)

# Per-timestep probability distribution over the target character vocabulary.
dec_dense = Dense(num_dec_chars, activation="softmax")
dec_outputs = dec_dense(dec_outputs)

model = Model([en_inputs, dec_inputs], dec_outputs)

# Persist the vocabulary/shape metadata the inference script will need.
# FIX: use a context manager — the original open(...) handle was never closed.
with open("training_data.pkl", "wb") as fh:
    pickle.dump({'input_characters': input_characters,
                 'target_characters': target_characters,
                 'max_input_length': max_input_length,
                 'max_target_length': max_target_length,
                 'num_en_chars': num_en_chars,
                 'num_dec_chars': num_dec_chars}, fh)
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# FIX: removed the stray "cd" token after model.save(...) — it was a shell
# command accidentally pasted into the file and made the whole script a
# SyntaxError.
# NOTE(review): the model is saved without compile()/fit() in this chunk —
# the training code may live elsewhere; confirm before relying on "s2s".
model.save("s2s")

model.summary()
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
|
|
|
|