IndicBertology / src /ada_fixed.py
JagritiRawat's picture
Add files using upload-large-folder tool
b123f1a verified
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import json
# --- Run settings ---
# Publicly available checkpoint; it can be fetched without authentication.
model_name = "bert-base-multilingual-cased"
language = "hindi"
# Input text file (read line by line) and the file the JSON dump is written to.
sen_filepath = "./gold/hindi/sentences.txt"
outpath = "./gold/hindi/indic_concat_hin.txt"

# --- Load the tokenizer and the encoder for the chosen checkpoint ---
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
print("MODEL LOADED!")
# Load the input file, keeping one entry per line after trimming surrounding
# whitespace; lines that are empty after trimming are dropped.
with open(sen_filepath, "r", encoding="utf-8") as f:
    sentences = [
        stripped
        for stripped in (raw_line.strip() for raw_line in f)
        if stripped
    ]
print(f"FILES LOADED! Total sentences: {len(sentences)}")
# Extract the [CLS] representation produced by every encoder layer.
#
# `outputs.hidden_states` contains num_hidden_layers + 1 tensors: index 0 is
# the output of the embedding layer and indices 1..N are the outputs of
# encoder layers 1..N.  The [CLS] vector at index 0 is the context-free
# embedding of the [CLS] token and is the same for every sentence, so it is
# skipped.  Key str(i) of `d` therefore holds the output of encoder layer
# i + 1, which makes the last key the final layer of the model.
num_layers = model.config.num_hidden_layers  # 12 for BERT-base checkpoints
count = 0
d = {str(i): [] for i in range(num_layers)}
print("Extracting features from BERT...")
for sentence in sentences:
    # Progress report every 100 sentences (printed before the counter moves).
    if count % 100 == 0:
        print(f"Processing sentence {count}/{len(sentences)}")
    count += 1
    # One sentence per forward pass, truncated to at most 512 tokens.
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding=True)
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    for i in range(num_layers):
        # [0, 0] selects the first token ([CLS]) of the only sequence in the
        # batch; i + 1 skips the embedding-layer output at index 0.
        cls_embedding = outputs.hidden_states[i + 1][0, 0].tolist()
        d[str(i)].append(cls_embedding)
print("FEATURES EXTRACTED!")
# Serialise the per-layer feature dictionary as JSON into the output file.
with open(outpath, 'w', encoding='utf-8') as out_handle:
    json.dump(d, out_handle)

print(f"FEATURES WRITTEN to {outpath}")
print("JOB DONE!")