---
base_model:
- alan-yahya/MatBERT
pipeline_tag: feature-extraction
tags:
- chemistry
- biology
---

### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization

You can find the paper [here](https://arxiv.org/abs/2506.11115).

If you only want to extract material concepts (material terms or material formulas) using **MatDetector**, please follow the steps below.

```python
import os
from collections import Counter

import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, BertForTokenClassification

# Fill in with the local path to your downloaded MatDetector checkpoint.
model_path = "path/to/MatDetector_checkpoint"
# You can download MatBERT at https://github.com/lbnlp/MatBERT
tokenizer_path = "/matbert-base-cased"
input_file = "TARGET.txt"
output_directory = "./"

tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path, use_fast=False, do_lower_case=False
)
model = BertForTokenClassification.from_pretrained(model_path)

device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
# fp16 inference is only reliable on GPU; keep full precision on CPU.
if device.type == "cuda":
    model = model.half()
model.to(device)
model.eval()

# Index -> BIO tag. "matname" = material term, "mf" = material formula.
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}


def process_single_word(word, tokenizer, model, device):
    """Tokenize one word and return (tokenizer output, per-token label probabilities).

    The probabilities tensor has shape (1, seq_len, num_labels).
    """
    tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probabilities = F.softmax(outputs.logits, dim=2)  # (1, seq_len, num_labels)
    return tokenized, probabilities


def determine_label(tokenized, probabilities, label_map):
    """Assign one label to the word by majority vote over its sub-token predictions.

    Special tokens ([CLS]/[SEP]/[PAD]) are ignored; returns "O" if no real
    sub-tokens remain. Ties are broken by first occurrence, matching the
    behavior of max() over an insertion-ordered dict of counts.
    """
    tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
    probs = probabilities[0]  # (seq_len, num_labels)
    token_labels = [
        label_map[prob.argmax().item()]
        for token, prob in zip(tokens, probs)
        if token not in ("[CLS]", "[SEP]", "[PAD]")
    ]
    if not token_labels:
        return "O"
    return Counter(token_labels).most_common(1)[0][0]


# Route each input word into one of three files based on its predicted label.
with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \
     open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \
     open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file:
    with open(input_file, "r") as file:
        lines = [line.strip() for line in file if line.strip()]
    with tqdm(total=len(lines), desc="Processing words", unit="words") as progress_bar:
        for original_word in lines:
            tokenized, probabilities = process_single_word(
                original_word, tokenizer, model, device
            )
            final_label = determine_label(tokenized, probabilities, label_map)
            if final_label == "O":
                o_file.write(f"{original_word}\n")
            elif final_label in ("B-mf", "I-mf"):
                mf_file.write(f"{original_word}\n")
            elif final_label in ("B-matname", "I-matname"):
                matname_file.write(f"{original_word}\n")
            progress_bar.update(1)

print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
```