import json
import pickle

from tqdm import tqdm
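
# Converts raw (query, positive_doc, negative_doc) triples into token-id
# triples using the previously trained CBOW vocabulary.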


def load_tokenizer():
    """Load the CBOW tokenizer word-to-id and id-to-word mappings."""
    with open('tkn_words_to_ids.pkl', 'rb') as f:
        words_to_ids = pickle.load(f)
    with open('tkn_ids_to_words.pkl', 'rb') as f:
        ids_to_words = pickle.load(f)
    return words_to_ids, ids_to_words
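
# Illustrative sanity check (assumes the pickles store plain dicts mapping
# str -> int and int -> str, so the two lookups round-trip):
#     words_to_ids, ids_to_words = load_tokenizer()
#     assert ids_to_words[words_to_ids["the"]] == "the"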


def tokenize_text(text, words_to_ids):
    """Tokenize text with the CBOW vocabulary: lowercase, split on
    whitespace, and map each word to its id."""
    words = text.lower().split()
    # Unknown words fall back to id 0; this assumes id 0 is reserved for
    # out-of-vocabulary tokens in the CBOW vocabulary.
    token_ids = [words_to_ids.get(word, 0) for word in words]
    return token_ids
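
# Example (hypothetical vocabulary): with words_to_ids = {"hello": 5, "world": 7},
# tokenize_text("Hello world!", words_to_ids) returns [5, 0] -- "world!" keeps its
# punctuation after the whitespace split, misses the vocabulary, and falls back
# to id 0.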


def process_triples(input_file, output_file):
    """Tokenize the queries and documents of every triple and save the result."""
    print("Loading tokenizer...")
    words_to_ids, ids_to_words = load_tokenizer()

    print("Loading triples...")
    with open(input_file, 'r') as f:
        data = json.load(f)

    tokenized_data = {
        'train': [],
        'validation': [],
        'test': []
    }

    for split in ['train', 'validation', 'test']:
        print(f"\nTokenizing {split} split...")
        for triple in tqdm(data[split]):
            query = triple['query']
            pos_doc = triple['positive_doc']
            neg_doc = triple['negative_doc']

            query_tokens = tokenize_text(query, words_to_ids)
            pos_doc_tokens = tokenize_text(pos_doc, words_to_ids)
            neg_doc_tokens = tokenize_text(neg_doc, words_to_ids)

            # Keep the raw text alongside the token ids for easier debugging.
            tokenized_data[split].append({
                'query_tokens': query_tokens,
                'positive_document_tokens': pos_doc_tokens,
                'negative_document_tokens': neg_doc_tokens,
                'query': query,
                'positive_document': pos_doc,
                'negative_document': neg_doc
            })

    print("Saving tokenized triples...")
    with open(output_file, 'w') as f:
        json.dump(tokenized_data, f, indent=2)

    # Report basic statistics for each split.
    for split in ['train', 'validation', 'test']:
        print(f"\n{split.upper()} split:")
        print(f"Number of tokenized triples: {len(tokenized_data[split])}")
        if tokenized_data[split]:
            sample = tokenized_data[split][0]
            print("\nSample tokenized triple:")
            print("Query tokens length:", len(sample['query_tokens']))
            print("Positive doc tokens length:", len(sample['positive_document_tokens']))
            print("Negative doc tokens length:", len(sample['negative_document_tokens']))


if __name__ == "__main__":
    input_file = "triples_small.json"
    output_file = "tokenized_triples.json"
    process_triples(input_file, output_file)
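
# Downstream usage sketch (assumes this script has already written
# tokenized_triples.json to the working directory):
#     with open("tokenized_triples.json") as f:
#         tokenized = json.load(f)
#     first_query_ids = tokenized["train"][0]["query_tokens"]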