import argparse
import json
import os

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)


def chunk_and_classify(text, classifier, tokenizer, max_len=512, stride=50):
    """
    Splits a given text into overlapping chunks, classifies each chunk using a
    provided classifier, and computes the average classification scores for
    each label across all chunks.

    Args:
        text (str): The input text to be chunked and classified.
        classifier (Callable): A function or model that takes a text input and
            returns a list of dictionaries containing classification labels and scores.
        tokenizer (Callable): A tokenizer function or model that tokenizes the input
            text and provides token IDs.
        max_len (int, optional): The maximum length of each chunk in tokens. Defaults to 512.
        stride (int, optional): The number of tokens to overlap between consecutive chunks.
            Defaults to 50.

    Returns:
        dict: A dictionary where keys are classification labels and values are the
            average scores for each label across all chunks.
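
    Example (illustrative; assumes `classifier` and `tokenizer` are built as in
    main() below, and the label names depend on the trained model's config):

        scores = chunk_and_classify(long_text, classifier, tokenizer)
        # e.g. {"LABEL_0": 0.12, "LABEL_1": 0.88}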
| """ |

    # Tokenize without special tokens; the pipeline re-adds them per chunk.
    tokens = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]
    # Leave room for the special tokens that get re-added when each decoded
    # chunk is re-encoded, so no chunk exceeds the model's max length.
    chunk_len = max_len - tokenizer.num_special_tokens_to_add()
    chunks = []
    for i in range(0, tokens.size(0), chunk_len - stride):
        chunk_ids = tokens[i : i + chunk_len]
        chunks.append(tokenizer.decode(chunk_ids, skip_special_tokens=True))
        if i + chunk_len >= tokens.size(0):
            break

    # Classify each chunk; truncation=True is a safety net in case the
    # decode/re-encode round trip drifts past the model's max length.
    chunk_scores = []
    for chunk in chunks:
        scores = classifier(chunk, truncation=True)[0]
        chunk_scores.append({d["label"]: d["score"] for d in scores})

    # Average each label's score across all chunks.
    avg_scores = {
        label: sum(s[label] for s in chunk_scores) / len(chunk_scores)
        for label in chunk_scores[0]
    }
    return avg_scores


def main():
    """Parse CLI arguments, load the trained model, and classify the input text."""
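    # Example invocations (script name and paths here are illustrative):
    #   python classify.py --text "Some document text"
    #   python classify.py --input_file paper.txt --model_dir ./checkpoint-3486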
    default_dir = "~/Code/Huggingface-metadata-project/BERTley/checkpoint-3486"
    parser = argparse.ArgumentParser(
        description="Run inference on a trained BERT metadata classifier"
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default=default_dir,
        help="Directory where your trained model and config live",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--text", type=str, help="Raw text string to classify")
    group.add_argument(
        "--input_file",
        type=str,
        help="Path to a .txt file containing the document to classify",
    )
    args = parser.parse_args()

    # Expand "~" explicitly; from_pretrained does not expand it for local paths.
    model_dir = os.path.expanduser(args.model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        top_k=None,  # return scores for all labels (replaces the deprecated return_all_scores=True)
    )
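    # With top_k=None, a single string input yields a list containing one list
    # of per-label dicts, e.g. [[{"label": "LABEL_0", "score": 0.9}, ...]];
    # label names here are illustrative (real ones come from the model's id2label).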

    if args.input_file:
        with open(args.input_file, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        text = args.text

    # Route short inputs straight to the pipeline; chunk anything longer than
    # BERT's 512-token limit (the count below includes [CLS]/[SEP]).
    tokens = tokenizer(text, return_tensors="pt")["input_ids"]
    if tokens.size(1) <= 512:
        result = classifier(text)[0]
        scores = {d["label"]: d["score"] for d in result}
    else:
        scores = chunk_and_classify(text, classifier, tokenizer)

    print(json.dumps(scores, indent=2))


if __name__ == "__main__":
    main()