| import os |
| import pandas as pd |
| import torch |
| from transformers import AutoTokenizer, ClapTextModelWithProjection |
|
|
# Hugging Face checkpoint used for both the text model and its tokenizer.
CLAP_CHECKPOINT = "laion/clap-htsat-unfused"


def build_prompt(label):
    """Return the text prompt fed to CLAP for one event label.

    Underscores in ``label`` are replaced by spaces and the result is
    prefixed with ``'The sound of '``.
    """
    readable = str(label).replace('_', ' ')
    return f'The sound of {readable}'


def unique_event_labels(df):
    """Return the distinct, non-missing values of ``df['label']`` as strings.

    Order of first appearance is preserved. Missing cells are dropped
    because they are read back as float NaN, which has no ``replace``
    method and cannot be turned into a prompt or a file name.
    """
    return df['label'].dropna().astype(str).unique().tolist()


def extract_text_embeddings(input_csv_path, output_path,
                            checkpoint=CLAP_CHECKPOINT):
    """Save one CLAP text embedding per unique label found in a CSV file.

    Args:
        input_csv_path: Path to a CSV file that has a ``label`` column.
        output_path: Directory that receives one ``<label>.pt`` file per
            unique label; it is created if it does not exist.
        checkpoint: Name or path passed to ``from_pretrained`` for both the
            text model and the tokenizer.

    Each saved object is the ``text_embeds`` tensor produced for a batch
    holding a single prompt.
    """
    # Read the labels first so that a wrong CSV path fails before the
    # (comparatively slow) model load.
    df = pd.read_csv(input_csv_path)
    events = unique_event_labels(df)

    os.makedirs(output_path, exist_ok=True)

    model = ClapTextModelWithProjection.from_pretrained(checkpoint)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for event in events:
            text = build_prompt(event)
            print(text)
            inputs = tokenizer([text], padding=True, return_tensors="pt")
            outputs = model(**inputs)

            # The file is named after the raw label (underscores kept).
            output_file = os.path.join(output_path, f"{event}.pt")
            torch.save(outputs.text_embeds, output_file)


if __name__ == '__main__':
    extract_text_embeddings(
        input_csv_path='/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv',
        output_path='clap_embedding/',
    )
    print("Embedding extraction and saving complete!")
|