| import gradio as gr |
| from pathlib import Path |
| from matchms import Spectrum |
| from typing import List, Optional, Literal |
| import tempfile |
| import numpy as np |
| from simms.similarity import CudaCosineGreedy, CudaModifiedCosine |
| from matchms.importing import load_from_mgf |
| from matchms import calculate_scores |
| import matplotlib.pyplot as plt |
| import pickle |
|
|
| |
| |
|
|
def preprocess_spectra(spectra: List[Spectrum]) -> List[Spectrum]:
    """Run the default peak filters over every spectrum and drop rejected ones.

    Each spectrum goes through, in order: ``select_by_mz`` (10.0 to 1000.0),
    ``normalize_intensities``, ``select_by_relative_intensity`` (from 0.001)
    and ``reduce_to_number_of_peaks`` (at most 1024 peaks).

    Args:
        spectra: Spectra to filter.

    Returns:
        The filtered spectra in their original order. Spectra for which the
        filter chain produced ``None`` are left out, so the result can be
        shorter than the input, or empty.
    """
    # Imported at call time; only the filters that are actually applied.
    from matchms.filtering import (
        normalize_intensities,
        reduce_to_number_of_peaks,
        select_by_mz,
        select_by_relative_intensity,
    )

    def process_spectrum(spectrum: Optional[Spectrum]) -> Optional[Spectrum]:
        """
        One of the many ways to preprocess the spectrum - we use this by default.
        """
        spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
        spectrum = normalize_intensities(spectrum)
        spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
        spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
        return spectrum

    processed = (process_spectrum(spectrum) for spectrum in spectra)
    # matchms filters signal a rejected spectrum by returning None; keeping
    # those entries would hide an "everything was filtered out" situation
    # from the caller and hand None values to the similarity computation.
    return [spectrum for spectrum in processed if spectrum is not None]
|
|
def run(r_filepath: Path, q_filepath: Path,
        similarity_method: Literal['CosineGreedy', 'ModifiedCosine'] = 'CosineGreedy',
        tolerance: float = 0.1,
        mz_power: float = 0.0,
        intensity_power: float = 1.0,
        shift: float = 0,
        batch_size: int = 2048,
        n_max_peaks: int = 1024,
        match_limit: int = 2048,
        array_type: Literal['sparse', 'numpy'] = "numpy",
        sparse_threshold: float = .75,
        do_preprocess: bool = False,
        ):
    """Score reference spectra against query spectra and export the results.

    Loads both MGF files, optionally filters the spectra, computes the
    similarity scores with the selected CUDA kernel and writes three temporary
    files: the score array, a picture of the score matrices and the pickled
    ``Scores`` object.

    Args:
        r_filepath: Path of the reference ``.mgf`` file.
        q_filepath: Path of the query ``.mgf`` file.
        similarity_method: ``'CosineGreedy'`` selects ``CudaCosineGreedy``;
            anything else selects ``CudaModifiedCosine``.
        tolerance: Forwarded to the similarity class.
        mz_power: Forwarded to the similarity class.
        intensity_power: Forwarded to the similarity class.
        shift: Forwarded to ``CudaCosineGreedy``; dropped (with a warning when
            non-zero) for ``ModifiedCosine``.
        batch_size: Forwarded to the similarity class after being capped at
            the size of the larger of the two spectra lists.
        n_max_peaks: Forwarded to the similarity class.
        match_limit: Forwarded to the similarity class.
        array_type: Forwarded to ``matchms.calculate_scores``.
        sparse_threshold: Forwarded to the similarity class.
        do_preprocess: When true, both spectra lists go through
            ``preprocess_spectra`` first; otherwise a warning is shown.

    Returns:
        A tuple ``(npz_path, image_path, pickle_path)`` of temporary file
        names. The files are created with ``delete=False`` and are not removed
        by this function.

    Raises:
        gr.Error: If an input file is missing, or if filtering leaves no
            reference or no query spectra.
    """
    print('\n>>>>', r_filepath, q_filepath, array_type, '\n')

    # Explicit checks instead of `assert`: asserts vanish under `python -O`,
    # and gr.Error is the exception type the source already uses for
    # user-facing failures.
    if r_filepath is None:
        raise gr.Error("Reference file is missing.")
    if q_filepath is None:
        raise gr.Error("Query file is missing.")

    # The UI feeds these from gr.Number fields, which may deliver floats.
    batch_size = int(batch_size)
    n_max_peaks = int(n_max_peaks)
    match_limit = int(match_limit)

    refs = list(load_from_mgf(str(r_filepath)))
    ques = list(load_from_mgf(str(q_filepath)))

    if do_preprocess:
        refs = preprocess_spectra(refs)
        ques = preprocess_spectra(ques)
        # gr.Error is an exception: it has to be raised to stop the run.
        if not refs:
            raise gr.Error("References are empty after filtering")
        if not ques:
            raise gr.Error("Queries are empty after filtering")
    else:
        gr.Warning("Filtering is skipped. Malformed spectra can cause errors.")

    # Never ask for a batch larger than the bigger of the two inputs.
    batch_size = min(batch_size, max(len(refs), len(ques)))

    kwargs = dict(tolerance=tolerance, mz_power=mz_power, intensity_power=intensity_power,
                  shift=shift, batch_size=batch_size, n_max_peaks=n_max_peaks,
                  match_limit=match_limit, sparse_threshold=sparse_threshold)

    if similarity_method == 'ModifiedCosine':
        if shift != 0:
            # Processing continues with the shift ignored, so this is a
            # warning rather than an error.
            gr.Warning("`ModifiedCosine` can not use shift - we will proceed as if shift is 0")
        kwargs.pop('shift')

    similarity_class = CudaCosineGreedy if similarity_method == 'CosineGreedy' else CudaModifiedCosine

    scores_obj = calculate_scores(
        refs, ques,
        similarity_function=similarity_class(**kwargs),
        array_type=array_type,
    )
    scores = scores_obj.to_array()

    # One image panel per field of the structured score array. A plain
    # (unstructured) array has no field names and gets a single panel.
    names = scores.dtype.names
    if names:
        panels = [(name, scores[name]) for name in names]
    else:
        panels = [("score", scores)]

    # squeeze=False keeps `axs` two-dimensional even for a single panel, so
    # iterating over its only row always works.
    fig, axs = plt.subplots(1, len(panels), figsize=(5 * len(panels), 5), squeeze=False)
    try:
        for (title, values), ax in zip(panels, axs[0]):
            ax.imshow(values)
            ax.set_title(title)
        fig.suptitle("Output values")

        # Only the name of the temporary file is needed; close the handle
        # right away so no descriptor stays open.
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as vis_file:
            vis_path = vis_file.name
        fig.savefig(vis_path)
    finally:
        # pyplot keeps figures alive until closed; without this every request
        # would leave one more figure in memory.
        plt.close(fig)

    with tempfile.NamedTemporaryFile(prefix='scores-', suffix='.npz', delete=False) as npz_file:
        npz_path = npz_file.name
    np.savez(npz_path, scores=scores)

    with tempfile.NamedTemporaryFile(prefix='scores-', suffix='.pickle', delete=False) as pickle_file:
        pickle_path = pickle_file.name
    Path(pickle_path).write_bytes(pickle.dumps(scores_obj))

    return npz_path, vis_path, pickle_path
|
|
# Gradio UI: two MGF uploads, the similarity settings, the result widgets and
# a button that feeds everything into `run`.
with gr.Blocks() as demo:
    gr.Markdown("""
    # SimMS: A GPU-Accelerated Cosine Similarity implementation for Tandem Mass Spectrometry

    Calculate cosine greedy similarity matrix using CUDA. See the [main repo](https://github.com/pangeai/simms) for this project.
    This approach is x100-x500 faster than [MatchMS](https://github.com/matchms/matchms). Upload your MGF files below, or run the sample `pesticides.mgf` files against each other.

    **In case of errors, check the "logs" above - malformed spectra will cause errors**
    """)
    # Input files; both default to the bundled sample.
    with gr.Row():
        refs = gr.File(label="Upload REFERENCES.mgf",
                       interactive=True,
                       value='pesticides.mgf')
        ques = gr.File(label="Upload QUERIES.mgf",
                       interactive=True, value='pesticides.mgf')
    # Similarity method and its numeric parameters.
    with gr.Row():
        similarity_method = gr.Radio(['CosineGreedy', 'ModifiedCosine'], value='ModifiedCosine', type='value',
                                     info="Choose one of the supported similarity methods. Need more? Let us know in github issues."
                                     )
        tolerance = gr.Number(value=0.1, label="tolerance")
        mz_power = gr.Number(value=0.0, label="m/z power")
        intensity_power = gr.Number(value=1.0, label="intensity power")
        shift = gr.Number(value=0, label="mass shift")
    # Integer-valued settings: precision=0 makes the fields hand integers
    # (not floats) to `run`, whose parameters are annotated as int.
    with gr.Row():
        batch_size = gr.Number(value=2048, label="Batch Size", precision=0,
                               info='Compare this many spectra to same amount of other spectra at each iteration.')
        n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks", precision=0,
                                info="Consider this many m/z peaks at most, per spectrum.")
        match_limit = gr.Number(value=2048, label="Match Limit", precision=0,
                                info="Consider this many pairs of m/z before stopping. "
                                     "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
        do_preprocess = gr.Checkbox(value=True, label="filter spectra",
                                    info="If you want to filter spectra before processing, we can do that. Look at the code to see details.")
    # Output format of the score matrix.
    with gr.Row():
        array_type = gr.Radio(['numpy', 'sparse'],
                              value='numpy', type='value',
                              label='If `sparse`, everything with score less than `sparse_threshold` will be discarded. '
                                    'If `numpy`, we disable sparse behaviour.')
        sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
                                     info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM. "
                                          "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
                                          "results as a SparseStack format."
                                     )
    # Result widgets.
    with gr.Row():
        score_vis = gr.Image()

    with gr.Row():
        out_npz = gr.File(label="Download similarity matrix (.npz format)",
                          interactive=False)
        out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
                             interactive=False)
    gr.Markdown("""
    **NOTE** You can use this snippet to use the downloaded array:
    ```py
    import numpy as np
    arr = np.load('scores-nr0hqp85.npz')['scores']
    print(arr)
    ```""")
    btn = gr.Button("Run")
    # `inputs` follows the positional parameter order of `run`; `outputs`
    # follows the order of the tuple it returns (npz, image, pickle).
    btn.click(fn=run,
              inputs=[refs, ques, similarity_method, tolerance, mz_power, intensity_power, shift,
                      batch_size, n_max_peaks, match_limit,
                      array_type, sparse_threshold, do_preprocess],
              outputs=[out_npz, score_vis, out_pickle])
|
|
# Start the Gradio app only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo.launch(debug=True)