| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| import re |
| from gensim.models import Word2Vec |
|
|
|
|
| |
def my_simple_tokenizer(text):
    """Tokenize *text*, splitting trailing punctuation into separate tokens.

    Splits on whitespace, and detaches any run of `.,;:¡!¿?` that ends a
    token — both mid-sentence ("Hola, mundo") and at the very end of the
    string ("mundo."), which the previous pattern missed because it only
    matched punctuation followed by whitespace.

    Note: leading punctuation (e.g. the inverted marks in "¡Hola") is
    intentionally left attached, matching the original behavior.

    Args:
        text: Input string; may be empty.

    Returns:
        list[str]: Lower-cased tokens; empty list for empty/whitespace input.
    """
    # `(?:\s+|$)` lets the split anchor at end-of-string so final
    # punctuation is captured; `re.split` keeps the captured group as a
    # token and yields None/"" for non-matches, which the filter drops.
    parts = re.split(r"([.,;:¡!¿?]+)?(?:\s+|$)", text)
    return [p.lower() for p in parts if p]
|
|
|
|
| |
| |
# Load the pretrained Maya word2vec model ("maya2vec") from disk.
# Filename encodes training hyperparameters (presumably vector size 512,
# etc.) — TODO confirm against the training script.
maya2vec_path = "./model_512_60_5_-0.25_0.7308_3.35E-05"
model = Word2Vec.load(maya2vec_path)
print("Model loaded successfully.",type(model))
|
|
|
|
| |
| |
# Encode a single word; KeyedVectors raises KeyError for OOV lookups,
# so EAFP replaces the explicit membership test.
word = "meyaj"
try:
    vector = model.wv[word]
except KeyError:
    print(f"The word '{word}' is out-of-vocabulary (OOV).")
else:
    print(f"Semantic encoded word '{word}' in", type(vector), vector.shape)
|
|
|
|
| |
| |
# Encode a whole sentence as the mean of its token vectors.
text = "Bix a bel Táan in bin ich kool Tene' ooxolen"
tokens = my_simple_tokenizer(text)
try:
    # Keep the try body minimal: only the lookup can raise KeyError.
    vector = model.wv.get_mean_vector(tokens)
except KeyError:
    print("Some words in the input text are OOV, affecting the embedding computation.")
else:
    print("Semantic encoded text in", type(vector), vector.shape)
|
|
|
|
| |
| |
# Cosine similarity between two in-vocabulary words.
word1, word2 = "peek'", "waalak'"
if all(w in model.wv for w in (word1, word2)):
    similarity = model.wv.similarity(word1, word2)
    print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")
else:
    print(f"One or both words ('{word1}', '{word2}') are OOV.")
|
|
|
|
| |
# Demonstrate the OOV path with a word absent from the vocabulary.
unknown_word = "furnance"
try:
    vector = model.wv[unknown_word]
except KeyError:
    print(f"The word '{unknown_word}' is OOV.")
|
|
|
|