---
base_model:
- alan-yahya/MatBERT
pipeline_tag: feature-extraction
tags:
- chemistry
- biology
---

### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization

You can find the paper [here](https://arxiv.org/abs/2506.11115).

If you only want to extract material concepts (material terms or material formulas) using **MatDetector**, please follow the steps below.

```python
import os
from collections import Counter

import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, BertForTokenClassification

# Fill in with the local path to your downloaded MatDetector checkpoint.
model_path = "path/to/MatDetector_checkpoint"
# You can download MatBERT at https://github.com/lbnlp/MatBERT
tokenizer_path = "/matbert-base-cased"
input_file = "TARGET.txt"
output_directory = "./"

tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path, use_fast=False, do_lower_case=False
)
model = BertForTokenClassification.from_pretrained(model_path)

device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
# fp16 inference is only reliable on GPU; keep full precision on CPU.
if device.type == "cuda":
    model = model.half()
model.to(device)
model.eval()

# Index -> BIO tag. "matname" = material term, "mf" = material formula.
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}


def process_single_word(word, tokenizer, model, device):
    """Tokenize one word and return (tokenizer output, per-token label probabilities).

    The probabilities tensor has shape (1, seq_len, num_labels).
    """
    tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probabilities = F.softmax(outputs.logits, dim=2)  # (1, seq_len, num_labels)
    return tokenized, probabilities


def determine_label(tokenized, probabilities, label_map):
    """Assign one label to the word by majority vote over its sub-token predictions.

    Special tokens ([CLS]/[SEP]/[PAD]) are ignored; returns "O" if no real
    sub-tokens remain. Ties are broken by first occurrence, matching the
    behavior of max() over an insertion-ordered dict of counts.
    """
    tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
    probs = probabilities[0]  # (seq_len, num_labels)
    token_labels = [
        label_map[prob.argmax().item()]
        for token, prob in zip(tokens, probs)
        if token not in ("[CLS]", "[SEP]", "[PAD]")
    ]
    if not token_labels:
        return "O"
    return Counter(token_labels).most_common(1)[0][0]


# Route each input word into one of three files based on its predicted label.
with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \
     open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \
     open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file:
    with open(input_file, "r") as file:
        lines = [line.strip() for line in file if line.strip()]
    with tqdm(total=len(lines), desc="Processing words", unit="words") as progress_bar:
        for original_word in lines:
            tokenized, probabilities = process_single_word(
                original_word, tokenizer, model, device
            )
            final_label = determine_label(tokenized, probabilities, label_map)
            if final_label == "O":
                o_file.write(f"{original_word}\n")
            elif final_label in ("B-mf", "I-mf"):
                mf_file.write(f"{original_word}\n")
            elif final_label in ("B-matname", "I-matname"):
                matname_file.write(f"{original_word}\n")
            progress_bar.update(1)

print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
```