import pickle
import io
import os

import numpy as np
from gensim.models import KeyedVectors


def calc_mean_vec_for_lower_mapping(embedd_dict):
    """For every lower-cased word, store the element-wise mean of the vectors
    of all case variants present in embedd_dict (the original-case entries
    are kept as well)."""
    lower_variants = {}
    for word in embedd_dict:
        lower_variants.setdefault(word.lower(), []).append(word)

    for word_lower, variants in lower_variants.items():
        # axis=0 keeps the mean element-wise; without it np.mean would
        # collapse the stacked vectors into a single scalar.
        embedd_dict[word_lower] = np.mean([embedd_dict[w] for w in variants], axis=0)
    return embedd_dict
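
# A minimal usage sketch of calc_mean_vec_for_lower_mapping (toy vectors,
# not taken from any real embedding file):
#
#   d = {'Apple': np.array([1.0, 0.0]), 'apple': np.array([0.0, 1.0])}
#   d = calc_mean_vec_for_lower_mapping(d)
#   d['apple']  # -> array([0.5, 0.5]); 'Apple' is still present, unchanged

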
def load_embedding_dict(embedding, embedding_path, lower_case=False):
    """
    Load word embeddings from file.
    :param embedding: embedding type, one of [glove, fasttext, hellwig, one_hot, word2vec]
    :param embedding_path: path to the embedding file
    :param lower_case: if True, also store a mean vector under each lower-cased word
    :return: embedding dict, embedding dimension
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if lower_case:
        pkl_path = embedding_path + '_lower' + '.pkl'
    else:
        pkl_path = embedding_path + '.pkl'
    # Reuse a cached pickle from a previous run if one exists.
    if os.path.isfile(pkl_path):
        with open(pkl_path, 'rb') as f:
            embedd_dict, embedd_dim = pickle.load(f)
        print("num dimensions of word embeddings:", embedd_dim)
        return embedd_dict, embedd_dim

    # All four text formats share one "word<sep>v1 v2 ..." layout; they
    # differ only in the separator and whether the first line is a header.
    text_formats = {
        'glove': (' ', False),
        'fasttext': (' ', True),
        'hellwig': (' ', True),
        'one_hot': ('@', True),
    }
    if embedding in text_formats:
        sep, skip_header = text_formats[embedding]
        embedd_dict = {}
        embedd_dim = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if skip_header and i == 0:
                    continue
                word, vec = line.rstrip().split(sep, 1)
                # np.fromstring(vec, sep=' ') is deprecated, so parse explicitly.
                embedd_dict[word] = np.array(vec.split(), dtype=np.float64)
                embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        # Flag any vector whose length disagrees with the inferred dimension.
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print(len(v), embedd_dim)

    elif embedding == 'word2vec':
        embedd_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = embedd_dict.vector_size
        if lower_case:
            # calc_mean_vec_for_lower_mapping expects a plain dict, so
            # materialize the KeyedVectors first (key_to_index assumes
            # gensim >= 4; older versions expose .vocab instead).
            embedd_dict = {w: embedd_dict[w] for w in embedd_dict.key_to_index}
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)

    else:
        raise ValueError("embedding should be one of [glove, fasttext, hellwig, one_hot, word2vec]")

| print("num dimensions of word embeddings:", embedd_dim) |
| |
| with open(pkl_path, 'wb') as f: |
| pickle.dump([embedd_dict, embedd_dim], f, pickle.HIGHEST_PROTOCOL) |
| return embedd_dict, embedd_dim |
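

# Self-contained demo (a sketch, not part of the original pipeline): build a
# tiny GloVe-style file in a temp directory and load it with lower-case
# averaging enabled. The file contents below are made up for illustration.
if __name__ == '__main__':
    import tempfile

    demo_path = os.path.join(tempfile.mkdtemp(), 'toy_glove.txt')
    with io.open(demo_path, 'w', encoding='utf-8') as f:
        f.write(u'Apple 1.0 0.0\napple 0.0 1.0\nBanana 0.5 0.5\n')

    vectors, dim = load_embedding_dict('glove', demo_path, lower_case=True)
    print(dim)              # -> 2
    print(sorted(vectors))  # -> ['Apple', 'Banana', 'apple', 'banana']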