| import torch |
| from transformers import AutoTokenizer, AutoModel |
| import numpy as np |
| import json |
|
|
| |
# --- Configuration -------------------------------------------------------
# Target language label (informational; not otherwise read in this script).
language = "hindi"
# HuggingFace checkpoint: multilingual BERT, 12 transformer layers, cased.
model_name = "bert-base-multilingual-cased"
# Input: one sentence per line, UTF-8.
sen_filepath = "./gold/hindi/sentences.txt"
# Output: JSON mapping layer index (as string) -> list of [CLS] embeddings.
outpath = "./gold/hindi/indic_concat_hin.txt"
|
|
# Load the tokenizer and encoder once, up front. AutoModel returns the bare
# encoder (no task head); from_pretrained leaves it in eval mode by default.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
print("MODEL LOADED!")
|
|
| |
# Read the corpus: one sentence per line, skipping blank/whitespace-only lines.
with open(sen_filepath, "r", encoding="utf-8") as f:
    stripped = (line.strip() for line in f)
    sentences = [s for s in stripped if s]


print(f"FILES LOADED! Total sentences: {len(sentences)}")
|
|
| |
# Accumulate one list of [CLS] vectors per hidden-state index: d["0"]..d["11"].
# NOTE(review): for bert-base, outputs.hidden_states has 13 entries -- index 0
# is the embedding-layer output and 1..12 are the transformer layers. range(12)
# therefore captures the embeddings plus the first 11 layers and skips the
# final layer's output. Confirm this is the intended selection.
d = {str(layer): [] for layer in range(12)}


print("Extracting features from BERT...")
for count, sentence in enumerate(sentences):
    if count % 100 == 0:
        print(f"Processing sentence {count}/{len(sentences)}")

    # One sentence per forward pass; truncate to BERT's 512-token limit
    # (padding is a no-op for a single-sequence batch).
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding=True)

    # No gradients needed for feature extraction.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # [layer][batch 0][token 0] -> the [CLS] position's vector, as a plain list.
    for layer in range(12):
        cls_vec = outputs.hidden_states[layer][0][0].detach().numpy().tolist()
        d[str(layer)].append(cls_vec)


print("FEATURES EXTRACTED!")
|
|
| |
# Persist the layer -> embeddings mapping as a single JSON document.
with open(outpath, 'w', encoding='utf-8') as out_f:
    out_f.write(json.dumps(d))


print(f"FEATURES WRITTEN to {outpath}")
print("JOB DONE!")
|
|