| import gradio as gr |
| from transformers import pipeline |
| from transformers import AutoTokenizer |
|
|
| |
# Cache of fill-mask pipelines keyed by Hugging Face model id, so repeated
# requests for the same model reuse the already-loaded weights.
pipeline_cache = {}
|
|
| |
def get_model_choices():
    """Return the Hugging Face model ids offered in the UI dropdown.

    The list groups the UMCU cardiology models first, followed by
    general-purpose Dutch and DRAGON domain models.
    """
    cardio_models = [
        "UMCU/CardioMedRoBERTa.nl",
        "UMCU/CardioBERTa_base.nl",
        "UMCU/CardioBERTa.nl_clinical",
        "UMCU/CardioDeBERTa.nl",
        "UMCU/CardioDeBERTa.nl_clinical",
    ]
    general_models = [
        "CLTL/MedRoBERTa.nl",
        "DTAI-KULeuven/robbert-2023-dutch-base",
        "DTAI-KULeuven/robbert-2023-dutch-large",
    ]
    dragon_models = [
        "joeranbosma/dragon-bert-base-mixed-domain",
        "joeranbosma/dragon-bert-base-domain-specific",
        "joeranbosma/dragon-roberta-base-mixed-domain",
        "joeranbosma/dragon-roberta-large-mixed-domain",
        "joeranbosma/dragon-roberta-base-domain-specific",
        "joeranbosma/dragon-roberta-large-domain-specific",
        "joeranbosma/dragon-longformer-base-mixed-domain",
        "joeranbosma/dragon-longformer-large-mixed-domain",
        "joeranbosma/dragon-longformer-base-domain-specific",
        "joeranbosma/dragon-longformer-large-domain-specific",
    ]
    return cardio_models + general_models + dragon_models
|
|
| |
def fill_masked(text: str, model_name: str, top_k: int) -> dict:
    """Run fill-mask inference on *text* with the selected model.

    Args:
        text: Input containing one or more ``[MASK]`` placeholders; these are
            translated to the model's own mask token before inference.
        model_name: Hugging Face model id. Pipelines are cached per model so
            repeated calls do not reload weights.
        top_k: Number of candidate predictions to return per mask.

    Returns:
        Mapping ``"mask_<i>"`` -> list of ``{"sequence", "score", "token"}``
        dicts, one entry per mask token in the input.

    Raises:
        ValueError: If the pipeline output is not a list (or list of lists).
    """
    # Reuse a cached pipeline so repeated requests skip model loading.
    if model_name not in pipeline_cache:
        pipeline_cache[model_name] = pipeline(
            "fill-mask",
            model=model_name
        )
    fill_mask = pipeline_cache[model_name]

    # The cached pipeline already holds the tokenizer; reuse it instead of
    # re-downloading it with AutoTokenizer.from_pretrained on every call.
    mask_token = fill_mask.tokenizer.mask_token
    text = text.replace("[MASK]", mask_token)

    # Gradio sliders can deliver floats; the pipeline expects an int.
    results = fill_mask(text, top_k=int(top_k))

    def _format(candidates):
        # Normalize one mask's candidates into plain JSON-serializable dicts.
        return [
            {
                "sequence": cand["sequence"],
                "score": round(cand["score"], 4),
                "token": cand["token_str"],
            }
            for cand in candidates
        ]

    # Validate the container type BEFORE indexing into it: the original code
    # indexed results[0] first, which crashed on non-list or empty output
    # instead of raising the intended ValueError.
    if not isinstance(results, list):
        raise ValueError("The results need to be presented in a list or a list of lists")
    if results and isinstance(results[0], list):
        # Multiple masks: the pipeline returns one candidate list per mask.
        return {
            f"mask_{mask_num}": _format(mask_results)
            for mask_num, mask_results in enumerate(results)
        }
    # Single mask: the pipeline returns a flat list of candidate dicts.
    return {"mask_0": _format(results)}
|
|
| |
# Gradio UI: masked text in, per-mask top-k predictions out as JSON.
iface = gr.Interface(
    fn=fill_masked,
    inputs=[
        gr.Textbox(
            lines=2,
            placeholder="Type text with [MASK] tokens here...",
            label="Masked Text"
        ),
        gr.Dropdown(
            choices=get_model_choices(),
            # The default must be one of the dropdown choices; the previous
            # default "bert-base-uncased" was not in the list, leaving the
            # dropdown initialized to an invalid value.
            value="UMCU/CardioMedRoBERTa.nl",
            label="Model"
        ),
        gr.Slider(
            minimum=1,
            maximum=20,
            step=1,
            value=5,
            label="Top K Predictions"
        )
    ],
    outputs=gr.JSON(label="Predictions"),
    title="Masked Language Model tester",
    description="Enter a sentence with [MASK] tokens, select a model, and choose how many top predictions to return."
)
|
|
if __name__ == "__main__":
    # Launch the Gradio web UI only when run as a script, not on import.
    iface.launch()
|
|