Spaces:

TornikeO
/

SimMS

Sleeping

App Files Files Community

SimMS / app.py

TornikeO

Make filtering default, add warnings

15498d2 over 1 year ago

raw

history blame contribute delete

8.32 kB

	import gradio as gr
	from pathlib import Path
	from matchms import Spectrum
	from typing import List, Optional, Literal
	import tempfile
	import numpy as np
	from simms.similarity import CudaCosineGreedy, CudaModifiedCosine
	from matchms.importing import load_from_mgf
	from matchms import calculate_scores
	import matplotlib.pyplot as plt
	import pickle

	# os.system("nvidia-smi")
	# print("TORCH_CUDA", torch.cuda.is_available())

	def preprocess_spectra(spectra: List[Spectrum]) -> Spectrum:
	from matchms.filtering import select_by_intensity, \
	normalize_intensities, \
	select_by_relative_intensity, \
	reduce_to_number_of_peaks, \
	select_by_mz, \
	require_minimum_number_of_peaks

	def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
	"""
	One of the many ways to preprocess the spectrum - we use this by default.
	"""
	spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
	spectrum = normalize_intensities(spectrum)
	spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
	spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
	return spectrum

	spectra = list(process_spectrum(s) for s in spectra) # Some might be None
	return spectra

	def run(r_filepath:Path, q_filepath:Path,
	similarity_method: Literal['CosineGreedy','ModifiedCosine'] = 'CosineGreedy',
	tolerance: float = 0.1,
	mz_power: float = 0.0,
	intensity_power: float = 1.0,
	shift: float = 0,
	batch_size: int = 2048,
	n_max_peaks: int = 1024,
	match_limit: int = 2048,
	array_type: Literal['sparse','numpy'] = "numpy",
	sparse_threshold: float = .75,
	do_preprocess: bool = False,
	):
	print('\n>>>>', r_filepath, q_filepath, array_type, '\n')
	# debug = os.getenv('CUDAMS_DEBUG') == '1'
	# if debug:
	# r_filepath = Path('tests/data/pesticides.mgf')
	# q_filepath = Path('tests/data/pesticides.mgf')

	assert r_filepath is not None, "Reference file is missing."
	assert q_filepath is not None, "Query file is missing."

	refs, ques = list(load_from_mgf(str(r_filepath))), list(load_from_mgf(str(q_filepath)))

	if do_preprocess:
	refs = preprocess_spectra(refs)
	ques = preprocess_spectra(ques)
	if not refs: gr.Error("References are empty after filtering")
	if not ques: gr.Error("Queries are empty after filtering")
	else:
	gr.Warning("Filtering is skipped. Malformed spectra can cause errors.")

	# If we have small spectra, don't make a huge batch
	if batch_size > max(len(refs), len(ques)):
	batch_size = max(len(refs), len(ques))


	kwargs = dict(tolerance=tolerance, mz_power=mz_power, intensity_power=intensity_power, shift=shift, batch_size=batch_size,
	n_max_peaks=n_max_peaks, match_limit=match_limit, sparse_threshold=sparse_threshold)

	if similarity_method == 'ModifiedCosine' and shift != 0:
	gr.Error("`ModifiedCosine` can not use shift - we will proceed as if shift is 0")

	if similarity_method == 'ModifiedCosine':
	kwargs.pop('shift')


	similarity_class = CudaCosineGreedy if similarity_method == 'CosineGreedy' else CudaModifiedCosine

	scores_obj = calculate_scores(
	refs, ques,
	similarity_function=similarity_class(**kwargs),
	array_type=array_type
	)

	score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)

	scores = scores_obj.to_array()

	outputs = len(scores.dtype.names)

	fig, axs = plt.subplots(1, outputs,
	figsize=(5*outputs, 5))
	for title, ax in zip(scores.dtype.names, axs):
	ax.imshow(scores[title])
	ax.set_title(title)

	plt.suptitle("Output values")
	plt.savefig(score_vis.name)

	score = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.npz', delete=False)
	np.savez(score.name, scores=scores)

	pickle_ = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.pickle', delete=False)

	Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
	return score.name, score_vis.name, pickle_.name

	with gr.Blocks() as demo:
	gr.Markdown("""
	# SimMS: A GPU-Accelerated Cosine Similarity implementation for Tandem Mass Spectrometry

	Calculate cosine greedy similarity matrix using CUDA. See the [main repo](https://github.com/pangeai/simms) for this project.
	This approach is x100-x500 faster than [MatchMS](https://github.com/matchms/matchms). Upload your MGF files below, or run the sample `pesticides.mgf` files against each other.

	In case of errors, check the "logs" above - malformed spectra will cause errors
	""")
	with gr.Row():
	refs = gr.File(label="Upload REFERENCES.mgf",
	interactive=True,
	value='pesticides.mgf')
	ques = gr.File(label="Upload QUERIES.mgf",
	interactive=True, value='pesticides.mgf')
	with gr.Row():
	similarity_method = gr.Radio(['CosineGreedy', 'ModifiedCosine'], value='ModifiedCosine', type='value',
	info="Choose one of the supported similarity methods. Need more? Let us know in github issues."
	)
	tolerance = gr.Number(value=0.1, label="tolerance")
	mz_power = gr.Number(value=0.0, label="m/z power")
	intensity_power = gr.Number(value=1.0, label="intensity power")
	shift = gr.Number(value=0, label="mass shift")
	with gr.Row():
	batch_size = gr.Number(value=2048, label="Batch Size",
	info='Compare this many spectra to same amount of other spectra at each iteration.')
	n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks",
	info="Consider this many m/z peaks at most, per spectrum.")
	match_limit = gr.Number(value=2048, label="Match Limit",
	info="Consider this many pairs of m/z before stopping. "
	"In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
	do_preprocess = gr.Checkbox(value=True, label="filter spectra",
	info="If you want to filter spectra before processing, we can do that. Look at the code to see details.")
	with gr.Row():
	array_type = gr.Radio(['numpy', 'sparse'],
	value='numpy', type='value',
	label='If `sparse`, everything with score less than `sparse_threshold` will be discarded.'
	'If `numpy`, we disable sparse behaviour.')
	sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
	info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
	"While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
	"results as a SparseStack format."
	)
	with gr.Row():
	score_vis = gr.Image()

	with gr.Row():
	out_npz = gr.File(label="Download similarity matrix (.npz format)",
	interactive=False)
	out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
	interactive=False)
	gr.Markdown("""
	NOTE You can use this snippet to use the downloaded array:
	```py
	import numpy as np
	arr = np.load('scores-nr0hqp85.npz')['scores']
	print(arr)
	```""")
	btn = gr.Button("Run")
	btn.click(fn=run,
	inputs=[refs, ques, similarity_method, tolerance, mz_power, intensity_power, shift,
	batch_size, n_max_peaks, match_limit,
	array_type, sparse_threshold, do_preprocess],
	outputs=[out_npz, score_vis, out_pickle])

	if __name__ == "__main__":
	demo.launch(debug=True)