## Requirements

You need `collate_fn.py` from the model repository: it converts texts and their entity spans into the inputs the model expects.
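If you are not working from a local clone of the repository, one way to fetch the file is via `huggingface_hub` (a minimal sketch; it assumes `collate_fn.py` sits at the top level of the model repository):

```python
from huggingface_hub import hf_hub_download

# Download collate_fn.py into the working directory so that
# `from collate_fn import ...` resolves.
hf_hub_download(
    repo_id="whoisjones/otter-bi-mmbert",
    filename="collate_fn.py",
    local_dir=".",
)
```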
## Inference

Use the Otter Bi-Encoder to predict entities (we recommend a threshold of 0.1):
```python
from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig
from torch.utils.data import DataLoader
from datasets import DatasetDict, Dataset

from collate_fn import AllLabelsCollator  # import this file from the model repository


def main():
    dataset = DatasetDict({
        "test": Dataset.from_list([
            {
                "text": "John Doe works at OpenAI in San Francisco.",
                "char_spans": [
                    {"start": 0, "end": 8, "label": "person"},
                    {"start": 18, "end": 24, "label": "organization"},
                    {"start": 28, "end": 41, "label": "location"},
                ],
            },
            {
                "text": "Alice and Bob visited the Eiffel Tower.",
                "char_spans": [
                    {"start": 0, "end": 5, "label": "person"},
                    {"start": 10, "end": 13, "label": "person"},
                    {"start": 26, "end": 38, "label": "location"},
                ],
            },
            {
                "text": "Amazon was founded by Jeff Bezos.",
                "char_spans": [
                    {"start": 0, "end": 6, "label": "organization"},
                    {"start": 22, "end": 32, "label": "person"},
                ],
            },
        ])
    })

    config = AutoConfig.from_pretrained("whoisjones/otter-bi-mmbert", trust_remote_code=True)
    model = AutoModelForTokenClassification.from_pretrained("whoisjones/otter-bi-mmbert", trust_remote_code=True)
    model.eval()

    # The bi-encoder uses separate tokenizers for the token and type encoders.
    token_encoder_tokenizer = AutoTokenizer.from_pretrained(config.token_encoder)
    type_encoder_tokenizer = AutoTokenizer.from_pretrained(config.type_encoder)

    # Build the label inventory from the data; sort for a deterministic mapping.
    labels = sorted({span["label"] for sample in dataset["test"] for span in sample["char_spans"]})
    label2id = {label: idx for idx, label in enumerate(labels)}

    collator = AllLabelsCollator(token_encoder_tokenizer, type_encoder_tokenizer, label2id=label2id)
    dataloader = DataLoader(dataset["test"], batch_size=1, collate_fn=collator)

    for batch in dataloader:
        gold_labels = batch["labels"]["ner"]
        predictions = model.predict(batch, threshold=0.1)
        print(f"Gold labels: {gold_labels}")
        print(f"Predictions: {predictions}")


if __name__ == "__main__":
    main()
```
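The threshold trades precision against recall: lower values keep lower-confidence spans, so more entities are predicted. A quick way to pick a value for your data is to sweep a few candidates and compare the outputs against the gold spans (a sketch reusing `model` and `dataloader` from the script above; 0.1 is the recommended default):

```python
# Sweep decision thresholds and inspect how the predictions change.
for threshold in (0.05, 0.1, 0.3, 0.5):
    print(f"--- threshold={threshold} ---")
    for batch in dataloader:
        print(model.predict(batch, threshold=threshold))
```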
## Minimal Training Example
```python
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer

# These modules ship with the model repository.
from collate_fn import InBatchNegativesCollator
from configuration_biencoder import SpanModelConfig
from modeling_biencoder import OtterBiEncoderModel


def train():
    dataset = load_dataset("whoisjones/finerweb", "eng", split="train")

    config = SpanModelConfig(
        token_encoder="google-bert/bert-base-uncased",
        type_encoder="google-bert/bert-base-uncased",
    )
    model = OtterBiEncoderModel(config=config).to("cuda")

    token_encoder_tokenizer = AutoTokenizer.from_pretrained(config.token_encoder)
    type_encoder_tokenizer = AutoTokenizer.from_pretrained(config.type_encoder)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    train_collator = InBatchNegativesCollator(token_encoder_tokenizer, type_encoder_tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, collate_fn=train_collator)

    max_steps = 500
    step = 0
    while step < max_steps:
        total_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            if step >= max_steps:  # stop mid-epoch once the step budget is reached
                break
            optimizer.zero_grad()

            token_encoder_inputs = {k: v.to("cuda") for k, v in batch["token_encoder_inputs"].items()}
            type_encoder_inputs = {k: v.to("cuda") for k, v in batch["type_encoder_inputs"].items()}
            labels = {k: v.to("cuda") for k, v in batch["labels"].items()}

            outputs = model(
                token_encoder_inputs=token_encoder_inputs,
                type_encoder_inputs=type_encoder_inputs,
                labels=labels,
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            avg_loss = total_loss / num_batches
            if step % 10 == 0:
                print(f"Step {step}: Loss = {loss.item():.4f}, Avg Loss = {avg_loss:.4f}")
            step += 1

    print(f"Training complete! Average Loss: {avg_loss:.4f}")


if __name__ == "__main__":
    train()
```
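To persist a trained checkpoint, the standard `save_pretrained`/`from_pretrained` round trip should apply, assuming `OtterBiEncoderModel` subclasses `PreTrainedModel` (its config is built from `SpanModelConfig`). A minimal sketch, appended to the end of `train()`; the directory name is illustrative:

```python
# Save weights and config (the directory name is illustrative).
model.save_pretrained("otter-bi-checkpoint")
token_encoder_tokenizer.save_pretrained("otter-bi-checkpoint/token_encoder")
type_encoder_tokenizer.save_pretrained("otter-bi-checkpoint/type_encoder")

# Reload later for further training or inference.
model = OtterBiEncoderModel.from_pretrained("otter-bi-checkpoint").to("cuda")
```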
Please refer to our project on GitHub for full training scripts.