"""Extract per-layer [CLS] embeddings from multilingual BERT for a sentence file.

Reads one sentence per line from SEN_FILEPATH, runs each sentence through
bert-base-multilingual-cased, collects the [CLS] (first token) hidden state
from each of the 12 transformer layers, and dumps the result to OUTPATH as
JSON keyed by layer index ("0" .. "11").
"""

import json

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Configuration
LANGUAGE = "hindi"
MODEL_NAME = "bert-base-multilingual-cased"  # Open model, no authentication needed
SEN_FILEPATH = "./gold/hindi/sentences.txt"
OUTPATH = "./gold/hindi/indic_concat_hin.txt"
NUM_LAYERS = 12  # transformer layers in bert-base-*


def read_sentences(path):
    """Return the non-empty, stripped lines of *path* as a list of sentences."""
    sentences = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            sentence = line.strip()
            if sentence:
                sentences.append(sentence)
    return sentences


def extract_layer_features(sentences, tokenizer, model):
    """Map each layer index ("0".."11") to a list of per-sentence [CLS] vectors.

    Each vector is the first-token hidden state of that transformer layer,
    converted to a plain Python list of floats (JSON-serializable).
    """
    d = {str(i): [] for i in range(NUM_LAYERS)}
    for count, sentence in enumerate(sentences):
        if count % 100 == 0:
            print(f"Processing sentence {count}/{len(sentences)}")
        inputs = tokenizer(
            sentence,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        )
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        # BUG FIX: hidden_states has NUM_LAYERS + 1 entries — index 0 is the
        # embedding output, indices 1..12 are the transformer layers.  The
        # original indexed 0..11, which stored the embedding output as
        # "layer 0" and dropped the final transformer layer entirely.
        # Index i + 1 to get the actual 12 transformer layers.
        for i in range(NUM_LAYERS):
            cls_embedding = (
                outputs.hidden_states[i + 1][0][0].detach().numpy().tolist()
            )
            d[str(i)].append(cls_embedding)
    return d


def main():
    """Load the model, extract features for every sentence, write JSON."""
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    model.eval()  # make dropout-free inference explicit (deterministic features)
    print("MODEL LOADED!")

    sentences = read_sentences(SEN_FILEPATH)
    print(f"FILES LOADED! Total sentences: {len(sentences)}")

    print("Extracting features from BERT...")
    d = extract_layer_features(sentences, tokenizer, model)
    print("FEATURES EXTRACTED!")

    with open(OUTPATH, 'w', encoding='utf-8') as convert_file:
        json.dump(d, convert_file)
    print(f"FEATURES WRITTEN to {OUTPATH}")
    print("JOB DONE!")


if __name__ == "__main__":
    main()