import json
import pickle

import torch
import torch.nn as nn
from tqdm import tqdm


def load_tokenizer():
    """Load the CBOW tokenizer mappings."""
    print("Loading tokenizer...")
    with open('tkn_words_to_ids.pkl', 'rb') as f:
        words_to_ids = pickle.load(f)
    with open('tkn_ids_to_words.pkl', 'rb') as f:
        ids_to_words = pickle.load(f)
    return words_to_ids, ids_to_words


def load_tokenized_triples():
    """Load the tokenized triples."""
    print("Loading tokenized triples...")
    with open('tokenized_triples.json', 'r') as f:
        data = json.load(f)
    return data


def create_embedding_layer(vocab_size, embedding_dim=128):
    """Create a randomly initialized embedding layer."""
    embedding = nn.Embedding(vocab_size, embedding_dim)
    # Xavier-uniform init keeps the random vectors on a sensible scale.
    nn.init.xavier_uniform_(embedding.weight)
    return embedding
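
# Note: the layer above starts from random weights, not trained CBOW vectors.
# If pretrained weights exist, they could be loaded instead; a minimal sketch,
# assuming a hypothetical checkpoint 'cbow_embeddings.pt' that holds a
# (vocab_size, embedding_dim) float tensor:
#
#     weights = torch.load('cbow_embeddings.pt')
#     embedding_layer = nn.Embedding.from_pretrained(weights, freeze=True)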


def average_pool(tokens, embedding_layer):
    """Create an average-pooled vector for a list of token ids."""
    if not tokens:
        # An empty token list would make torch.mean return NaN; fall back to zeros.
        return torch.zeros(embedding_layer.embedding_dim).numpy()
    tokens_tensor = torch.tensor(tokens, dtype=torch.long)
    # No gradients are needed here; the embeddings are only read, not trained.
    with torch.no_grad():
        embeddings = embedding_layer(tokens_tensor)  # (num_tokens, embedding_dim)
    return torch.mean(embeddings, dim=0).numpy()
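
# Quick sanity check (illustrative only; the token ids here are arbitrary):
#
#     emb = create_embedding_layer(vocab_size=100, embedding_dim=8)
#     vec = average_pool([1, 2, 3], emb)
#     assert vec.shape == (8,)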


def process_triples(data, embedding_layer):
    """Process triples and create average-pooled vectors for each field."""
    processed_data = {
        'train': [],
        'validation': [],
        'test': []
    }

    for split in ['train', 'validation', 'test']:
        print(f"\nProcessing {split} split...")
        for triple in tqdm(data[split]):
            # Pool the query and both documents into fixed-size vectors.
            query_vector = average_pool(triple['query_tokens'], embedding_layer)
            pos_doc_vector = average_pool(triple['positive_document_tokens'], embedding_layer)
            neg_doc_vector = average_pool(triple['negative_document_tokens'], embedding_layer)

            processed_data[split].append({
                'query_vector': query_vector.tolist(),
                'positive_document_vector': pos_doc_vector.tolist(),
                'negative_document_vector': neg_doc_vector.tolist(),
                'query': triple['query'],
                'positive_document': triple['positive_document'],
                'negative_document': triple['negative_document']
            })

    return processed_data
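
# Design note: pooling one triple at a time keeps the code simple. If this loop
# becomes a bottleneck, the token lists could be padded and embedded in batches,
# but the per-triple version is easier to follow.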


def main():
    # Load tokenizer mappings and the tokenized triples.
    words_to_ids, ids_to_words = load_tokenizer()
    data = load_tokenized_triples()

    # Build the embedding layer from the tokenizer's vocabulary size.
    vocab_size = len(words_to_ids)
    embedding_layer = create_embedding_layer(vocab_size)

    # Convert every triple into average-pooled vectors.
    processed_data = process_triples(data, embedding_layer)

    print("\nSaving processed data...")
    with open('triple_embeddings.json', 'w') as f:
        json.dump(processed_data, f)
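
    # JSON keeps the output human-readable but is bulky for float vectors. If
    # file size matters, numpy's savez_compressed would be a more compact
    # alternative (a sketch, with a hypothetical filename):
    #
    #     import numpy as np
    #     np.savez_compressed('triple_embeddings.npz',
    #                         train_queries=np.array(
    #                             [t['query_vector'] for t in processed_data['train']]))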

    # Report per-split counts and a sample vector length as a quick sanity check.
    for split in ['train', 'validation', 'test']:
        print(f"\n{split.upper()} split:")
        print(f"Number of processed triples: {len(processed_data[split])}")
        if processed_data[split]:
            sample = processed_data[split][0]
            print("\nSample vector shapes:")
            print("Query vector shape:", len(sample['query_vector']))
            print("Positive doc vector shape:", len(sample['positive_document_vector']))
            print("Negative doc vector shape:", len(sample['negative_document_vector']))


if __name__ == "__main__":
    main()