Commit 9735e01 by Sid01123 · 1 parent: 54aa369

Commit message: all the code
README.md ADDED
@@ -0,0 +1,53 @@
---
license: mit
datasets:
- tahoebio/Tahoe-100M
tags:
- tahoe-deepdive
- hackathon
- tahoe-100M
---

<div align="center">
  <img src="img/SigSpace.png" alt="SigSpace Logo" width="400"/>
</div>

# SigSpace: An AI Agent for the Tahoe-100M dataset
This is a submission for the **Tahoe-DeepDive Hackathon 2025**.

# Team Name
SigSpace

## Members
- Ishita Mangla
- Kuan Pang
- Giovanni Palla
- Yanay Rosen
- Sid Sanghi
- Yasha Ektefaie
- Rohit Khurana

# Project
## SigSpace: An AI Agent for the Tahoe-100M dataset

## Overview
We developed an AI agent that accesses the Tahoe-100M dataset alongside publicly available and novel datasets. The agent refines and expands the mechanisms of action (MOA) and drug signatures of the perturbations in the Tahoe-100M dataset.

## Motivation
Drug discovery in the age of Large Language Models (LLMs) can be enhanced through agentic workflows that parse diverse sources of unstructured information to synthesize and connect hypotheses across fields and modalities. However, these models are primarily trained on text and lack the capacity to interrogate rich biological databases with complex, biologically motivated queries. In this work, we provide a proof of concept demonstrating how the Tahoe-100M dataset can be integrated with relevant public datasets to expand the hypothesis space for mechanisms of action and drug responses of the perturbations tested in Tahoe-100M.

## Methods
We curated new datasets that enrich the description of the drugs and cell lines present in the Tahoe-100M dataset.

Specifically:
- TAHOE-100M: VISION gene-set scores and metadata.
- PRISM: drug sensitivity data reporting the concentration of a compound needed to inhibit 50% of cancer cell viability. Measurements are based on pooled screening of barcoded cell lines and provide a high-throughput assessment of drug response across a large panel of cancer models.
- NCI-60: LC50 data reporting the concentration of a drug that kills 50% of the cells present at the time of drug addition, measured across a panel of 60 human cancer cell lines using standardized multi-dose assays.
- JUMP: morphological profiles of cells in response to chemical and genetic perturbations. High-content imaging and automated feature extraction quantify cellular changes, enabling large-scale profiling of perturbation effects across diverse biological contexts.
- UCE-CXG-EMBEDDING: natural-perturbation search using an AI virtual cell.

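The PRISM lookup above can be sketched in a few lines. The miniature IC50 matrix, DepMap IDs, and values below are invented for illustration; the repository's real matrix is read from `Tahoe_PRISM_cell_by_drug_ic50_matrix_named.csv` and has the same layout (cell-line rows, lower-cased drug-name columns).

```python
import pandas as pd

# Hypothetical miniature of the PRISM IC50 matrix used in this repo:
# rows are DepMap cell-line IDs, columns are lower-cased drug names,
# values are log10-transformed micromolar concentrations.
ic50 = pd.DataFrame(
    {"tamoxifen": [-0.52, 0.31], "daptomycin": [1.10, None]},
    index=["ACH-000001", "ACH-000002"],
)

def lookup_ic50(matrix: pd.DataFrame, depmap_id: str, drug: str):
    """Return the IC50 entry, or None if the pair is absent or unmeasured."""
    drug = drug.strip().lower()  # matrix columns are lower-cased
    if depmap_id not in matrix.index or drug not in matrix.columns:
        return None
    val = matrix.loc[depmap_id, drug]
    return None if pd.isna(val) else float(val)

print(lookup_ic50(ic50, "ACH-000001", "Tamoxifen"))   # -0.52
print(lookup_ic50(ic50, "ACH-000002", "daptomycin"))  # None (missing measurement)
```

Lower values indicate greater sensitivity of the cell line to the drug; missing pairs simply were not measured in the screen.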
## Results

We developed a Gradio application that accesses these databases and performs complex queries, enhancing and grounding the agent's reasoning in real biological measurements.

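The application drives an LLM loop that emits turns in the structured Reasoning / Response / Tool-call format defined in `agent/prompt.py`. A minimal sketch of how one such turn can be parsed; the sample model output and the drug/cell-line names in it are invented for illustration.

```python
import re

# A made-up model turn in the Reasoning / Response / Tool-call format.
turn = """Reasoning:
The user asked about venetoclax in HepG2, so I should check PRISM first.

Response:
Let me look up the measured sensitivity.

Tool-call:
self.get_ic50_prism("venetoclax", "HepG2"); self.obtain_moa("venetoclax")
FINISHED"""

def parse_tool_calls(text: str) -> list[str]:
    """Extract the Tool-call section and split multiple calls on ';'."""
    match = re.search(r"Tool-call:\s*(.*)", text, re.DOTALL)
    if match is None:
        return []
    body = match.group(1).replace("FINISHED", "").strip()
    if not body or body == "None":
        return []
    return [call.strip() for call in body.split(";") if call.strip()]

print(parse_tool_calls(turn))
```

Each extracted call is then dispatched to the corresponding tool method on the agent, and the tool's output is appended back into the conversation for the next round.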
## Discussion
agent/__pycache__/agent.cpython-313.pyc ADDED
Binary file (23.8 kB).

agent/__pycache__/prompt.cpython-313.pyc ADDED
Binary file (3.7 kB).

agent/__pycache__/utils.cpython-313.pyc ADDED
Binary file (4.22 kB).
agent/agent.py ADDED
@@ -0,0 +1,676 @@
from agent.utils import *
from agent.prompt import *
import anndata
import gradio as gr
from gradio import ChatMessage
import re
import pandas as pd
import pathlib
import numpy as np

class SigSpace(Basic_Agent):
    def __init__(self, config_path: str):
        super().__init__(config_path)
        self.system_prompt = Agent_Prompt
        self.conversation = [{"role": "system", "content": self.system_prompt}]

        # Initialize JUMP data
        path = pathlib.Path("/home/ubuntu/giovanni/code/Tahoe_Hackathon/datasets")
        self.jump_tahoe_drug_metadata = pd.read_csv(path / "drug_metadata_inchikey.csv")
        self.jump_similarity_score = pd.read_csv(path / "compound_genetic_perturbation_cosine_similarity_inchikey.csv")

        # Load PRISM IC50 matrix; lower-case drug columns for case-insensitive lookup
        self.ic50 = pd.read_csv(path / "Tahoe_PRISM_cell_by_drug_ic50_matrix_named.csv", index_col=0)
        self.ic50.columns = self.ic50.columns.str.lower()

        # Load NCI-60 LC50 data and drop rows with a missing CELL value
        self.lc50 = pd.read_csv(path / "filtered_results.csv")
        self.lc50 = self.lc50[self.lc50["CELL"].notna()]

        # Load full Tahoe metadata
        self.tahoe_cell_meta = pd.read_csv(path / "cell_line_metadata.csv")
        self.tahoe_drug_meta = pd.read_csv(path / "drug_metadata.csv")
        self.tahoe_vision_scores = anndata.read_h5ad(path / "tahoe_vision_scores.h5ad")

        # Load the PRISM-matched subset of Tahoe metadata
        self.prism_tahoe_cell_meta = pd.read_csv(path / "Tahoe_PRISM_matched_cell_metadata_final.csv")
        self.prism_tahoe_drug_meta = pd.read_csv(path / "Tahoe_PRISM_matched_drug_metadata_final.csv")

        # Map cell line common names (whitespace-stripped) to DepMap IDs
        self.cell_name_to_depmap = {
            row["cell_name"].strip(): row["Cell_ID_DepMap"]
            for _, row in self.prism_tahoe_cell_meta.iterrows()
        }

        self.cell_name_to_depmap_lc50 = {
            row["clean"].strip(): row["cell_line_name"]
            for _, row in self.lc50.iterrows()
        }

        self.tahoe_similarity_score = pd.read_csv(path / "in_tahoe_search_result_df.csv")
        self.tahoe_cxg_similarity_score = pd.read_csv(path / "cxg_search_result_df.csv")

    def initialize_conversation(self, message, conversation=None, history=None):
        if conversation is None:
            conversation = []

        conversation.append({"role": "system", "content": Agent_Prompt})

        if history is not None:
            if len(history) == 0:
                conversation = []
                print("Cleared conversation history")
            else:
                # Replay prior user/assistant turns in order
                for i in range(len(history)):
                    if history[i]["role"] == "user":
                        if i - 1 >= 0 and history[i - 1]["role"] == "assistant":
                            conversation.append(
                                {"role": "assistant", "content": history[i - 1]["content"]})
                        conversation.append(
                            {"role": "user", "content": history[i]["content"]})
                    if i == len(history) - 1 and history[i]["role"] == "assistant":
                        conversation.append(
                            {"role": "assistant", "content": history[i]["content"]})

        conversation.append({"role": "user", "content": message})

        return conversation

    def get_similar_disease(self, disease_name, k_value):
        # Placeholder stub: only the demo query is handled
        if disease_name != "Alzheimer's":
            return "FAIL"
        return "Parkinson's Disease"

    def get_validated_target_jump(self, drug_name):
        print(drug_name)
        try:
            inchikey = self.jump_tahoe_drug_metadata[self.jump_tahoe_drug_metadata.drug.isin([drug_name])]["InChIKey"].values[0]
            similarity_scores = self.jump_similarity_score[self.jump_similarity_score.InChIKey.isin([inchikey])]

            # Count ORF entries with cosine similarity > 0.2 and < -0.2
            orf_positive = similarity_scores[(similarity_scores.Genetic_Perturbation == 'ORF') & (similarity_scores.cosine_sim > 0.2)].shape[0]
            orf_negative = similarity_scores[(similarity_scores.Genetic_Perturbation == 'ORF') & (similarity_scores.cosine_sim < -0.2)].shape[0]

            # Count CRISPR entries with cosine similarity > 0.2 and < -0.2
            crispr_positive = similarity_scores[(similarity_scores.Genetic_Perturbation == 'CRISPR') & (similarity_scores.cosine_sim > 0.2)].shape[0]
            crispr_negative = similarity_scores[(similarity_scores.Genetic_Perturbation == 'CRISPR') & (similarity_scores.cosine_sim < -0.2)].shape[0]

            orf_targets = f"ORF: {orf_positive} positive correlations (>0.2), {orf_negative} negative correlations (<-0.2)"
            crispr_targets = f"CRISPR: {crispr_positive} positive correlations (>0.2), {crispr_negative} negative correlations (<-0.2)"

            orf_crispr_counts = orf_targets + " " + crispr_targets

            known_targets_from_jump = self.jump_tahoe_drug_metadata[self.jump_tahoe_drug_metadata.drug.isin([drug_name])]["target_list"].values[0]
            known_targets_output = f"The known targets from the JUMP dataset are: {', '.join(known_targets_from_jump.split('|'))}"
        except Exception as e:
            print(e)
            return f"For the drug {drug_name}, we were not able to find the target in the JUMP dataset."

        return f"""
        Perturbation description:

        ORF: The ORF perturbation consists of an overexpression of the target gene.
        CRISPR: The CRISPR perturbation consists of a knockout of the target gene.

        Considering the drug "{drug_name}", we expect positive correlations with shared CRISPR targets,
        and negative correlations with shared ORF targets.

        But the measured correlations are:

        {orf_crispr_counts}

        Furthermore, the JUMP dataset has the following known targets for the drug "{drug_name}":

        {known_targets_output}
        """

    def get_similar_drug_effect_in_tahoe(self, cell_line_name: str, drug_name: str):
        """
        Get drugs with similar effects in Tahoe, given a drug name and cell line name.

        Args:
            cell_line_name (str): The name of the cell line.
            drug_name (str): The name of the drug.
        """
        cell_line_names = self.tahoe_similarity_score["source_cell_line"].unique().tolist()
        drug_names = self.tahoe_similarity_score["source_drug_name"].unique().tolist()
        if cell_line_name not in cell_line_names:
            return "FAIL: Cell line name not found in the dataset. A valid example: CVCL_0218"
        if drug_name not in drug_names:
            return "FAIL: Drug name not found in the dataset. A valid example: Daptomycin"
        hits = self.tahoe_similarity_score[
            (self.tahoe_similarity_score["source_cell_line"] == cell_line_name) &
            (self.tahoe_similarity_score["source_drug_name"] == drug_name)
        ]
        # Sort by distance (ascending) and keep the 10 nearest hits
        hits = hits.sort_values(by="distance", ascending=True).reset_index(drop=True)
        hits = hits.head(10)
        # Keep target_drug_name and target_cell_line
        hits = hits[["target_drug_name", "target_cell_line"]]
        outputs = f"""
        The following drugs have similar effects to the drug you provided:
        hits:
        {hits}
        """
        return outputs

    def get_similar_drug_effects_in_cxg(self, cell_line_name: str, drug_name: str):
        """
        Get diseases with similar effects in CELLxGENE (CxG), given a drug name and cell line name.

        Args:
            cell_line_name (str): The name of the cell line.
            drug_name (str): The name of the drug.
        """
        cell_line_names = self.tahoe_cxg_similarity_score["cell_line"].unique().tolist()
        drug_names = self.tahoe_cxg_similarity_score["perturbation_drug_name"].unique().tolist()
        if cell_line_name not in cell_line_names:
            return "FAIL: Cell line name not found in the dataset. A valid example: CVCL_0218"
        if drug_name not in drug_names:
            return "FAIL: Drug name not found in the dataset. A valid example: Daptomycin"
        hits = self.tahoe_cxg_similarity_score[
            (self.tahoe_cxg_similarity_score["cell_line"] == cell_line_name) &
            (self.tahoe_cxg_similarity_score["perturbation_drug_name"] == drug_name)
        ]
        hits = hits.sort_values(by="distance", ascending=True).reset_index(drop=True)
        hits = hits.head(10)
        # Keep cell_type, tissue_type, and disease
        hits = hits[["cell_type", "tissue_type", "disease"]]
        outputs = f"""
        The following diseases have similar effects to the drug you provided:
        hits:
        {hits}
        """
        return outputs

    def get_ic50_prism(self, drug_name: str, cell_line_name: str):
        drug_name_lower = drug_name.strip().lower()
        cell_line_key = cell_line_name.strip()

        if cell_line_key not in self.cell_name_to_depmap:
            print(f"Cell line name '{cell_line_key}' not found for PRISM data")
            return f"FAIL: Cell line name '{cell_line_key}' not found for PRISM data"

        depmap_id = self.cell_name_to_depmap[cell_line_key]

        if drug_name_lower not in self.ic50.columns:
            print(f"Drug name '{drug_name}' not found in IC50 matrix columns.")
            return f"FAIL: Drug name '{drug_name}' not found in IC50 matrix columns."

        try:
            ic50_val = self.ic50.loc[depmap_id, drug_name_lower]
            if pd.isna(ic50_val):
                print(f"FAIL: IC50 value is missing for '{drug_name}' in cell line '{cell_line_name}' (DepMap ID: {depmap_id}).")
                return f"FAIL: IC50 value is missing for '{drug_name}' in cell line '{cell_line_name}' (DepMap ID: {depmap_id})."

            return (
                f"The IC50 value of {ic50_val:.4f} corresponds to the log10-transformed micromolar concentration "
                f"at which {drug_name} inhibits 50% of viability in the {cell_line_name} cell line "
                f"(DepMap ID: {depmap_id}).\n\n"
                "This value comes from the PRISM Repurposing Secondary Screen, which exposes pooled barcoded cell lines "
                "to drug treatment for 5 days and infers viability from barcode abundance using sequencing.\n\n"
                "The secondary screen includes higher-confidence compound–cell line pairs with improved replicability "
                "compared to the primary screen.\n\n"
                "Lower IC50 values indicate greater sensitivity of the cell line to the drug."
            )
        except KeyError as e:
            print(f"Combination not found: {e}")
            return None

    def clean_cell_line_name(self, name):
        """
        Standardize cell line names for comparison by:
        1. Converting to string (handles any non-string values)
        2. Converting to uppercase
        3. Removing all non-alphanumeric characters

        Args:
            name: Cell line name (string or other type)

        Returns:
            Cleaned string with only uppercase letters and numbers
        """
        return re.sub(r"[^A-Z0-9]", "", str(name).upper())

    def get_lc50_nci60(self, drug_name: str, cell_line_name: str):
        cell_line_name = cell_line_name.upper()
        cell_line_key = self.clean_cell_line_name(cell_line_name)

        if cell_line_key not in self.cell_name_to_depmap_lc50:
            print(f"Cell line name '{cell_line_key}' not found for NCI60 data")
            return None
        depmap_id = self.cell_name_to_depmap_lc50[cell_line_key]
        print("Depmap_id", depmap_id)

        # Drug names are stored in uppercase, so normalize the search term
        drug_name_upper = drug_name.strip().upper()

        # Filter rows whose drug column contains the search term
        matching_row = self.lc50[self.lc50['drug'].str.contains(drug_name_upper, na=False)]
        if matching_row.empty:
            print(f"Drug name '{drug_name}' not found in NCI60 dataset.")
            return None

        if len(matching_row) > 1:
            print(f"Multiple matches found for drug '{drug_name}' in NCI60 dataset; using the first.")

        print("Matching row", matching_row)
        # Get the LC50 and LCONC values from the first matching row
        lc50_val = matching_row.iloc[0]['NLOGLC50']
        lconc_val = matching_row.iloc[0]['LCONC']

        if pd.isna(lc50_val):
            return f"LC50 value is missing for '{drug_name}' in cell line '{cell_line_name}' (depmap_id: {depmap_id})."

        lc50_output = f"""
        The LC50 value of {lc50_val} represents -log10(LC50), the negative base-10 logarithm of the molar concentration that inhibits 50% of cell growth.

        Higher values therefore indicate greater drug potency.

        The LCONC value of {lconc_val} denotes the maximum log10 molar concentration tested in the dilution series; for example, LCONC = -4 corresponds to 10^-4 M.

        Both metrics come from the NCI-60 drug screen, which applies a standardized 48-hour exposure assay across all compound–cell-line pairs.
        """

        return lc50_output

    def load_gene_sets_file(self, file_path):
        """
        Load gene sets from a tab-delimited file where the first column is the gene set name
        and the remaining columns are gene symbols.

        Parameters:
        -----------
        file_path : str
            Path to the gene sets file

        Returns:
        --------
        dict
            Dictionary mapping gene set names to lists of genes
        """
        gene_sets = {}
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if parts:
                    set_name = parts[0]
                    genes = [gene for gene in parts[1:] if gene]  # Filter out empty strings
                    gene_sets[set_name] = genes
        return gene_sets

    def get_genes_for_set(self, set_name):
        """
        Get the list of genes for a specific gene set.

        Parameters:
        -----------
        set_name : str
            Name of the gene set to query

        Returns:
        --------
        list
            List of genes in the gene set, or empty list if set not found
        """
        if not hasattr(self, 'gene_sets'):
            # Load the gene sets file lazily on first use
            self.gene_sets = self.load_gene_sets_file('/home/ubuntu/ishita/msigdb_all_sigs_human_symbols.txt')

        return self.gene_sets.get(set_name, [])

    def rank_vision_scores(self, drug_name: str, cell_line_name: str, k_value: int):
        # Z-score the VISION matrix once (guard against re-normalizing on repeated calls)
        if not getattr(self, "_vision_scores_normalized", False):
            self.tahoe_vision_scores.X = (
                self.tahoe_vision_scores.X - np.mean(self.tahoe_vision_scores.X, axis=0)
            ) / np.std(self.tahoe_vision_scores.X, axis=0)
            self._vision_scores_normalized = True

        # Subset to the drug / cell line at the highest tested concentration
        filt = (
            (self.tahoe_vision_scores.obs["Cell_Name_Vevo"] == cell_line_name)
            & (self.tahoe_vision_scores.obs["drug"] == drug_name)
        )
        filtered_scores = self.tahoe_vision_scores[filt]
        if filtered_scores.n_obs == 0:
            return "VISION scores not found for this drug–cell-line combination."

        filtered_scores = filtered_scores[
            filtered_scores.obs["concentration"] == filtered_scores.obs["concentration"].max()
        ]

        # Pick the top-|score| gene sets
        top_idx = np.argsort(-np.abs(filtered_scores.X[0]))[:k_value]
        gene_sets = filtered_scores.var.index[top_idx].tolist()
        scores = filtered_scores.X[0, top_idx].tolist()

        # Build the narrative
        header = (
            "VISION scores are single-cell gene-set enrichment values computed by the "
            "VISION algorithm (DeTomaso & Yosef 2021). Positive scores indicate relative "
            "up-regulation of the gene set in the queried condition; negative scores indicate "
            "down-regulation.\n"
        )
        lines = []
        for gs, val in zip(gene_sets, scores):
            gs_name = gs.replace("gs_", "")
            genes = self.get_genes_for_set(gs_name)
            direction = "up-regulated" if val > 0 else "down-regulated" if val < 0 else "not changed"
            lines.append(f"{gs} has gene set {genes} : {direction} (VISION score = {val:.3f})")

        return header + "\n".join(lines)

    def obtain_moa(self, drug_name: str):
        row = self.tahoe_drug_meta[self.tahoe_drug_meta["drug"] == drug_name]

        if row.empty:
            return "MOA annotation not found for this drug."

        moa_broad = row["moa-broad"].values[0]
        moa_fine = row["moa-fine"].values[0]

        return (
            f"Broad MOA: {moa_broad}; "
            f"Fine MOA: {moa_fine}. "
            "Fine-grained mechanism of action (MOA) annotation for the drug, "
            "specifying the biological process or molecular target affected. "
            "Derived from MedChemExpress and curated with GPT-based annotations."
        )

    def obtain_gene_targets(self, drug_name: str):
        import ast

        row = self.tahoe_drug_meta[self.tahoe_drug_meta["drug"] == drug_name]
        if row.empty:
            return "Gene targets not found for this drug."

        targets = row["targets"].values[0]

        # Convert a stringified list/dict to a Python object, if necessary.
        if isinstance(targets, str):
            try:
                targets = ast.literal_eval(targets)
            except (ValueError, SyntaxError):  # fall back to treating it as a single ID
                targets = [targets]

        return (
            f"Gene target token IDs: {targets}. "
            "Gene identifiers (integer token IDs) corresponding to each gene with non-zero expression in the cell."
        )

    def obtain_cell_line_data(self, cell_line_name: str):
        row = self.tahoe_cell_meta[self.tahoe_cell_meta["cell_name"] == cell_line_name]

        if row.empty:
            return "Cell-line metadata not found for this cell line."

        organ = row["Organ"].values[0]
        driver_gene_symbol = row["Driver_Gene_Symbol"].values[0]
        driver_varzyg = row["Driver_VarZyg"].values[0]
        driver_vartype = row["Driver_VarType"].values[0]
        driver_proteffect = row["Driver_ProtEffect_or_CdnaEffect"].values[0]
        driver_mech_inferdm = row["Driver_Mech_InferDM"].values[0]
        driver_genetype_dm = row["Driver_GeneType_DM"].values[0]

        return (
            f"Organ: {organ}; "
            f"Driver_Gene_Symbol: {driver_gene_symbol}; "
            f"Driver_VarZyg: {driver_varzyg}; "
            f"Driver_VarType: {driver_vartype}; "
            f"Driver_ProtEffect_or_CdnaEffect: {driver_proteffect}; "
            f"Driver_Mech_InferDM: {driver_mech_inferdm}; "
            f"Driver_GeneType_DM: {driver_genetype_dm}. "
            "Organ = tissue or organ of origin for the cell line (e.g., Lung), used to interpret lineage-specific responses. "
            "Driver_Gene_Symbol = HGNC-approved symbol of a driver gene with functional alterations in this cell line. "
            "Driver_VarZyg = zygosity of the driver variant (Hom = homozygous, Het = heterozygous). "
            "Driver_VarType = type of genetic alteration (e.g., Missense, Frameshift, Stopgain). "
            "Driver_ProtEffect_or_CdnaEffect = precise protein or cDNA-level annotation of the mutation (e.g., p.G12S). "
            "Driver_Mech_InferDM = inferred functional mechanism (LoF = loss-of-function, GoF = gain-of-function). "
            "Driver_GeneType_DM = classification of the driver gene as an Oncogene or Suppressor."
        )

    def run_gradio_chat(self, message: str,
                        history: list,
                        temperature: float,
                        max_new_tokens: int,
                        max_token: int,
                        call_agent: bool,
                        conversation: gr.State,
                        max_round: int = 20,
                        seed: int = None,
                        call_agent_level: int = 0,
                        sub_agent_task: str = None):

        print("\033[1;32;40mstart\033[0m")
        print("len(message)", len(message))

        if len(message) <= 10:
            yield "Hi, I am Agent, an assistant for answering biomedical questions. Please provide a valid message with a string longer than 10 characters."
            return

        conversation = self.initialize_conversation(
            message,
            conversation=conversation,
            history=history)

        history = []

        next_round = True
        current_round = 0

        self.conversation.append({"role": "user", "content": message})
        while next_round and current_round < max_round:
            current_round += 1

            response = self.llm_infer(self.conversation)
            self.conversation.append({"role": "system", "content": response})
            tool_called = False
            print(response)

            if 'Tool-call:' in response:
                match = re.search(r"Tool-call:\s*(.*)", response, re.DOTALL)
                response_text = match.group(1).strip()
                if "None" not in response_text and response_text.replace('-', '').rstrip().replace('FINISHED', '').rstrip():
                    history.append(ChatMessage(
                        role="assistant", content=f"{response.replace('FINISHED', '').split('</think>')[-1]}"))
                    yield history

                    tool_called = True
                    print(response_text)
                    if "FAIL" in response_text:
                        self.conversation.append({"role": "system", "content": response_text})
                        history.append(
                            ChatMessage(role="assistant", content="Response from tool FAILED")
                        )
                        next_round = False
                        yield history
                    else:
                        tool_call_text = response_text
                        # Split multiple tool calls on semicolons or newlines
                        if ';' in tool_call_text:
                            tool_calls = [i.replace('\n', '').rstrip('-').replace('FINISHED', '').replace('Response:', '') for i in tool_call_text.split(';') if i]
                        elif '\n' in tool_call_text:
                            tool_calls = [i.replace('\n', '').rstrip('-').replace('FINISHED', '').replace('Response:', '') for i in tool_call_text.split('\n') if i]
                        else:
                            tool_calls = [tool_call_text]

                        tool_calls = [i.rstrip('-') for i in tool_calls if i]

                        for call in tool_calls:
                            print(f"\033[1;34;40mCalling this command now {call}\033[0m")
                            # NOTE: eval on model output is only acceptable in a trusted demo
                            tool_response = str(eval(call))
                            self.conversation.append({"role": "system", "content": tool_response})
                            history.append(
                                ChatMessage(role="assistant", content=f"Response from tool: {tool_response}")
                            )
                            print(f"\033[1;34;40mGot this response {tool_response}\033[0m")
                            yield history
                else:
                    history.append(
                        ChatMessage(role="assistant", content=f"{response}")
                    )
                    yield history

            elif 'Response:' in response or tool_called is False:
                match = re.search(r"Response:\s*(.*)", response, re.DOTALL)
                response_text = match.group(1).strip().replace('Tool-call: None', '')
                print(f"\033[1;33;40mresponse text: {response_text}\033[0m")
                history.append(
                    ChatMessage(
                        role="assistant", content=f"{response_text.replace('FINISHED', '')}")
                )
                yield history

            if 'FINISHED' in response and tool_called is False:
                next_round = False
agent/prompt.py ADDED
@@ -0,0 +1,64 @@
+ Agent_Prompt = """
+
+ You are an assistant who is helping the user identify novel gene sets for a particular disease. All your responses must be in the following format.
+ If you do not use a tool, do not include a tool-call; if you do not need to respond to the user and instead want to solely call a tool, do not include a response.
+ Return FINISHED at the end of a response once you have answered the user query. DO NOT hallucinate, guess, or ASSUME tool responses. If you need to call multiple tools, separate the tool-calls with a semicolon (;).
+
+ Reasoning:
+
+ [Your reasoning goes here]
+
+ Response:
+
+ [Your response goes here, if necessary]
+
+ Tool-call:
+
+ [Tool call goes here, if necessary]
+
+ ------------------------------------------------
+
+ The tools you have at your disposal are:
+
+ (1) A tool which returns the k diseases most similar to your query disease.
+
+ The tool call for this tool is: "self.get_similar_disease(disease_name, k_value)" where disease_name must be a string and k_value must be an integer. The output of this tool is a list of disease names.
+
+ (2) A tool which retrieves the gene targets validated in the JUMP-CP dataset.
+
+ The tool call for this tool is: "self.get_validated_target_jump(drug_name)" where drug_name must be a string. The output of this tool is a list of gene targets.
+
+ (3) A tool which retrieves an IC50 value for a drug and cell line from the PRISM Repurposing 20Q2 dataset.
+
+ The tool call for this tool is: "self.get_ic50_prism(drug_name, cell_line)" where drug_name and cell_line must be strings. These are not keyword arguments. The output of this tool is a scalar floating-point IC50 value.
+
+ (4) A tool which retrieves gene-set expression scores from the Tahoe-100M dataset.
+
+ The tool call for this tool is "self.rank_vision_scores(drug_name, cell_line, k_value)" where drug_name and cell_line must be strings and k_value must be an integer. These are not keyword arguments. The output of this tool is a list of tuples, where each tuple contains a gene-set name and its corresponding expression score.
+
+ (5) A tool which obtains the mechanism of action for a drug from the Tahoe-100M dataset.
+
+ The tool call for this tool is "self.obtain_moa(drug_name)" where drug_name must be a string. This is not a keyword argument. The output of this tool is a dictionary that contains a broad mechanism of action and a more specific mechanism of action.
+
+ (6) A tool which retrieves the gene targets for a drug from the Tahoe-100M dataset.
+
+ The tool call for this tool is: "self.obtain_gene_targets(drug_name)" where drug_name must be a string. This is not a keyword argument. The output of this tool is a list of gene symbols representing the known molecular targets of the compound.
+
+ (7) A tool which retrieves the cell line metadata from the Tahoe-100M dataset.
+
+ The tool call for this tool is: "self.obtain_cell_line_data(cell_line_name)" where cell_line_name must be a string. This is not a keyword argument. The output of this tool is a dictionary containing information about key driver mutations for each cell line.
+
+ (8) A tool which retrieves the LC50 value for a drug and cell line from the NCI-60 dataset.
+
+ The tool call for this tool is: "self.get_lc50_nci60(drug_name, cell_line_name)" where drug_name and cell_line_name must be strings. These are not keyword arguments. The output of this tool is a tuple of (LC50, LCONC). Both values are on a log10 scale; LCONC is the log of the highest concentration tested.
+
+ (9) A tool which searches for similar perturbation effects within the Tahoe dataset.
+
+ The tool call for this tool is: "self.get_similar_drug_effect_in_tahoe(cell_line_name, drug_name)" where cell_line_name and drug_name must be strings. These are not keyword arguments. The output of this tool is a string representation of a dataframe that tells you which other **drugs** in Tahoe have a similar perturbation effect on the cell line.
+
+ (10) A tool that performs a natural-perturbation search in the CELLxGENE database for similar drug effects.
+
+ The tool call for this tool is: "self.get_similar_drug_effects_in_cxg(cell_line_name, drug_name)" where cell_line_name and drug_name must be strings. These are not keyword arguments. The output of this tool is a string representation of a dataframe that tells you which **diseases** and cell types in CELLxGENE have a similar perturbation effect on the cell line.
+
+ """
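For reference, a minimal sketch of how a reply in this format can be parsed downstream (the reply text is made up for illustration; the parsing simply mirrors the `Tool-call:` section and semicolon convention mandated by the prompt):

```python
# Hypothetical agent reply following the format mandated by Agent_Prompt.
reply = """Reasoning:

The user wants targets and MoA for Abemaciclib, so two tools are needed.

Tool-call:

self.obtain_gene_targets("Abemaciclib"); self.obtain_moa("Abemaciclib")"""

def extract_tool_calls(text):
    """Return the list of tool calls, or [] if no Tool-call section exists."""
    if "Tool-call:" not in text:
        return []
    section = text.split("Tool-call:")[-1]
    # Multiple calls are separated by semicolons, per the prompt contract.
    return [call.strip() for call in section.split(";") if call.strip()]

calls = extract_tool_calls(reply)
print(calls)
# → ['self.obtain_gene_targets("Abemaciclib")', 'self.obtain_moa("Abemaciclib")']
```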
agent/utils.py ADDED
@@ -0,0 +1,75 @@
+ from openai import AzureOpenAI, OpenAI
+ import yaml
+
+
+ class Basic_Agent():
+
+     def __init__(self, config):
+         self.config = self.load_config(config)
+         self.openai_api_key = self.config['openai_api_key']
+         if 'open_api_base' in self.config:
+             self.open_api_base = self.config['open_api_base']
+         self.azure_openai_api_key = self.config['azure_openai_api_key']
+         self.azure_openai_endpoint = self.config['azure_openai_endpoint']
+         self.openai_backend = self.config['openai_backend']
+         # self.pqapi_token = self.config['pqapi_token']
+         # os.environ['PQA_API_TOKEN'] = self.pqapi_token
+
+     def load_config(self, config_file):
+         with open(config_file, 'r') as file:
+             return yaml.safe_load(file)
+
+     def llm_infer(self, conversation, temp=1e-9, max_tokens=1000, image=None, role=None):
+         # Retry until the model returns a usable answer instead of a refusal.
+         while True:
+
+             if self.openai_backend == 'azure':
+                 client = AzureOpenAI(
+                     azure_endpoint=self.azure_openai_endpoint,
+                     api_key=self.azure_openai_api_key,
+                     api_version="2024-05-01-preview")
+
+                 response = client.chat.completions.create(
+                     model='gpt-4o',
+                     messages=conversation,
+                     temperature=temp,
+                     max_tokens=max_tokens,
+                 )
+             elif self.openai_backend == 'openai':
+                 client = OpenAI(
+                     api_key=self.openai_api_key
+                 )
+
+                 response = client.chat.completions.create(
+                     model='gpt-4o',
+                     messages=conversation,
+                     temperature=temp,
+                     max_tokens=max_tokens,
+                 )
+             elif self.openai_backend == 'lambda':
+
+                 client = OpenAI(api_key=self.openai_api_key,
+                                 base_url=self.open_api_base)
+
+                 model = "deepseek-r1-671b"
+                 response = client.chat.completions.create(
+                     model=model,
+                     messages=conversation)
+             else:
+                 raise ValueError(f"Invalid openai_backend: {self.openai_backend}")
+
+             if "I'm sorry, I can't assist with that" in response.choices[0].message.content or "I'm unable to view the image" in response.choices[0].message.content or "I'm unable to provide a definitive answer" in response.choices[0].message.content:
+                 print("Failed to generate response, trying again")
+                 continue
+             else:
+                 response = response.choices[0].message.content
+                 return response
+
+     def run_function(self, output):
+         try:
+             tool_call = output.split('Tool-call:')[-1].rstrip().replace('\n', '')
+             res = eval(tool_call)
+             return res
+         except Exception as e:
+             print(f"Error in parsing tool call in {output} got this error {e}")
+             import pdb; pdb.set_trace()
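`run_function` passes model-generated text straight to `eval`, which executes arbitrary code. A whitelist-and-dispatch pattern is a safer alternative; the sketch below is illustrative only (the `ToolBox` class, its `obtain_moa` payload, and the regex-based argument parsing are assumptions, not part of this repo):

```python
import re

class ToolBox:
    """Illustrative stand-in for the agent, with one whitelisted tool."""

    ALLOWED = {"obtain_moa"}  # only these method names may be dispatched

    def obtain_moa(self, drug_name):
        # Hypothetical payload; the real tool reads Tahoe-100M metadata.
        return {"broad": "kinase inhibitor",
                "specific": f"CDK4/6 inhibitor ({drug_name})"}

    def dispatch(self, tool_call):
        # Accept only calls shaped like self.<name>(...) against the
        # whitelist, instead of eval'ing arbitrary model output.
        match = re.fullmatch(r'self\.(\w+)\((.*)\)', tool_call.strip())
        if not match:
            raise ValueError(f"Malformed tool call: {tool_call!r}")
        name, raw_args = match.groups()
        if name not in self.ALLOWED:
            raise ValueError(f"Unknown tool: {name}")
        # Naive argument parsing: split on commas, strip surrounding quotes.
        args = [a.strip().strip('\'"') for a in raw_args.split(",")] if raw_args else []
        return getattr(self, name)(*args)

box = ToolBox()
result = box.dispatch('self.obtain_moa("Abemaciclib")')
print(result["specific"])  # → CDK4/6 inhibitor (Abemaciclib)
```

The naive comma-split would break on arguments containing commas; `ast.literal_eval` on the argument tuple would be a more robust choice in practice.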
data/README.md ADDED
@@ -0,0 +1,7 @@
+ # Data Download
+
+ Download the JUMP signatures:
+
+ ```
+ wget https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/COMPOUND/profiles_var_mad_int_featselect_harmony/profiles_var_mad_int.parquet
+ ```
data/jump-dataset.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data/jump-similarity.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
model.py ADDED
@@ -0,0 +1,88 @@
+ import anndata
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from torch.utils.data import DataLoader, TensorDataset
+ import numpy as np
+
+ # Generate random sample data - 2000 samples, 1280 dimensions each
+ num_samples = 2000
+ dimension = 1280
+
+ # Generate random input vectors X
+ X = np.random.randn(num_samples, dimension)
+
+ # Generate target vectors Y (could be random or a function of X)
+ # Option 1: Completely random Y
+ # Y = np.random.randn(num_samples, dimension)
+
+ # Option 2: Y as a noisy function of X (more realistic for a regression task)
+ W = np.random.randn(dimension, dimension) * 0.1  # Random weight matrix
+ b = np.random.randn(dimension) * 0.1  # Random bias
+ noise = np.random.randn(num_samples, dimension) * 0.05  # Random noise
+ Y = X @ W + b + noise  # Y = XW + b + noise
+
+ # Convert data to PyTorch tensors
+ X_tensor = torch.tensor(X, dtype=torch.float32)
+ Y_tensor = torch.tensor(Y, dtype=torch.float32)
+ dataset = TensorDataset(X_tensor, Y_tensor)
+ dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
+
+ # Option 1: Simple linear regression model
+ class LinearModel(nn.Module):
+     def __init__(self):
+         super(LinearModel, self).__init__()
+         self.linear = nn.Linear(1280, 1280)
+
+     def forward(self, x):
+         return self.linear(x)
+
+ # Option 2: Neural network with hidden layers
+ class NeuralNetwork(nn.Module):
+     def __init__(self, hidden_dim=512):
+         super(NeuralNetwork, self).__init__()
+         self.network = nn.Sequential(
+             nn.Linear(1280, hidden_dim),
+             nn.ReLU(),
+             nn.Linear(hidden_dim, hidden_dim),
+             nn.ReLU(),
+             nn.Linear(hidden_dim, 1280)
+         )
+
+     def forward(self, x):
+         return self.network(x)
+
+ # Choose which model to use
+ # model = LinearModel()
+ model = NeuralNetwork()
+
+ # Loss function and optimizer
+ criterion = nn.MSELoss()
+ optimizer = optim.Adam(model.parameters(), lr=0.001)
+
+ # Training loop
+ num_epochs = 50
+ for epoch in range(num_epochs):
+     total_loss = 0
+     for inputs, targets in dataloader:
+         # Forward pass
+         outputs = model(inputs)
+         loss = criterion(outputs, targets)
+
+         # Backward pass and optimize
+         optimizer.zero_grad()
+         loss.backward()
+         optimizer.step()
+
+         total_loss += loss.item()
+
+     # Print progress
+     if (epoch + 1) % 5 == 0:
+         print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')
+
+ # After training, to use the model for prediction:
+ def predict(input_vector):
+     model.eval()
+     with torch.no_grad():
+         input_tensor = torch.tensor(input_vector, dtype=torch.float32)
+         return model(input_tensor).numpy()
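The synthetic target above (`Y = XW + b + noise`) is exactly linear, so it can be sanity-checked without any training loop: ordinary least squares recovers `W` and `b` directly. A minimal numpy-only sketch (dimensions shrunk from 1280 to 16 for speed; this is an illustration, not part of the repo):

```python
import numpy as np

# Smaller dimensions than the script above, so the closed-form fit is quick.
rng = np.random.default_rng(0)
n, d = 500, 16

X = rng.standard_normal((n, d))
W = rng.standard_normal((d, d)) * 0.1
b = rng.standard_normal(d) * 0.1
Y = X @ W + b + rng.standard_normal((n, d)) * 0.05

# Closed-form least squares on [X, 1] recovers W and b up to noise.
X_aug = np.hstack([X, np.ones((n, 1))])
coef, *_ = np.linalg.lstsq(X_aug, Y, rcond=None)
W_hat, b_hat = coef[:d], coef[d]

print("max |W_hat - W| =", np.abs(W_hat - W).max())
```

With this noise level the recovery error is tiny; if the neural network in `model.py` cannot beat the linear fit on such data, something is wrong with the training setup.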
run_app.py ADDED
@@ -0,0 +1,296 @@
+ import random
+ import datetime
+ import sys
+ from agent.agent import SigSpace
+ import spaces
+ import gradio as gr
+ import os
+ from PIL import Image
+
+ os.environ["VLLM_USE_V1"] = "0"  # Disable v1 API for now since it does not support logits processors.
+
+ # Determine the directory where the current file is located
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ os.environ["MKL_THREADING_LAYER"] = "GNU"
+
+ # Set an environment variable
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+
+ # Create the image path - use an absolute path for reliability
+ img_path = os.path.join(current_dir, 'img', 'SigSpace.png')
+
+ def display_image(image_path):
+     # Load and return the image
+     img = Image.open(image_path)
+     return img
+
+ DESCRIPTION = f'''
+ <div style="text-align: center;">
+     <h1 style="font-size: 32px; margin-bottom: 10px;">SigSpace: An AI Agent for Tahoe-100M</h1>
+ </div>
+ '''
+ INTRO = """
+ This is the intro that goes here
+ """
+
+ LICENSE = """
+ License goes here
+ """
+
+ PLACEHOLDER = """
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+     <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Agent</h1>
+     <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Tips before using Agent:</p>
+     <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.55;">Please click clear🗑️
+ (top-right) to remove previous context before submitting a new question.</p>
+     <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.55;">Click retry🔄 (below message) to get multiple versions of the answer.</p>
+ </div>
+ """
+
+ css = """
+ h1 {
+     text-align: center;
+     display: block;
+ }
+
+ #duplicate-button {
+     margin: auto;
+     color: white;
+     background: #1565c0;
+     border-radius: 100vh;
+ }
+ .small-button button {
+     font-size: 12px !important;
+     padding: 4px 8px !important;
+     height: 6px !important;
+     width: 4px !important;
+ }
+ .gradio-accordion {
+     margin-top: 0px !important;
+     margin-bottom: 0px !important;
+ }
+ """
+
+ chat_css = """
+ .gr-button { font-size: 20px !important; }  /* Enlarges button icons */
+ .gr-button svg { width: 32px !important; height: 32px !important; }  /* Enlarges SVG icons */
+ """
+
+ model_name = ''
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ question_examples = [
+     # ['What is the IC50 value for the drug Abemaciclib in the cell line A549?'],
+     ["What's the MoA of the drug Ponatinib on the HCT15 colon cancer cell line? Please synthesize results from the Tahoe-100M dataset, the jump dataset, and the IC50 dataset."],
+     ["Natural perturbation: find the disease perturbation that has a similar effect to Glycyrrhizic acid on CVCL_0334. Use the result and what you know to explain the mechanism of action."],
+     ["Mechanism of action: give me the mechanism of action for drug name Abemaciclib provided by Tahoe."],
+     ["Vision scores: what are the top 5 vision scores for cell line A549 and drug name Abemaciclib?"]
+ ]
+
+ new_tool_files = {
+     'new_tool': os.path.join(current_dir, 'data', 'new_tool.json'),
+ }
+
+ config_path = "/home/ubuntu/.lambda_api_config.yaml"
+ agent = SigSpace(config_path)
+ # agent.init_model()
+
+
+ def update_model_parameters(enable_finish, enable_rag, enable_summary,
+                             init_rag_num, step_rag_num, skip_last_k,
+                             summary_mode, summary_skip_last_k, summary_context_length, force_finish, seed):
+     # Update model instance parameters dynamically
+     updated_params = agent.update_parameters(
+         enable_finish=enable_finish,
+         enable_rag=enable_rag,
+         enable_summary=enable_summary,
+         init_rag_num=init_rag_num,
+         step_rag_num=step_rag_num,
+         skip_last_k=skip_last_k,
+         summary_mode=summary_mode,
+         summary_skip_last_k=summary_skip_last_k,
+         summary_context_length=summary_context_length,
+         force_finish=force_finish,
+         seed=seed,
+     )
+
+     return updated_params
+
+
+ def update_seed():
+     # Draw a fresh random seed and push it to the agent
+     seed = random.randint(0, 10000)
+     updated_params = agent.update_parameters(
+         seed=seed,
+     )
+     return updated_params
+
+
+ def handle_retry(history, retry_data: gr.RetryData, temperature, max_new_tokens, max_tokens, multi_agent, conversation, max_round):
+     print("Updated seed:", update_seed())
+     new_history = history[:retry_data.index]
+     previous_prompt = history[retry_data.index]['content']
+
+     print("previous_prompt", previous_prompt)
+
+     yield from agent.run_gradio_chat(new_history + [{"role": "user", "content": previous_prompt}], temperature, max_new_tokens, max_tokens, multi_agent, conversation, max_round)
+
+
+ PASSWORD = "mypassword"
+
+ # Function to check if the password is correct
+ def check_password(input_password):
+     if input_password == PASSWORD:
+         return gr.update(visible=True), ""
+     else:
+         return gr.update(visible=False), "Incorrect password, try again!"
+
+
+ conversation_state = gr.State([])
+
+ # Gradio block
+ chatbot = gr.Chatbot(height=400, placeholder=PLACEHOLDER,
+                      label='SigSpace', type="messages", show_copy_button=True)
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     # gr.Markdown(INTRO)
+     gr.Image(value=display_image(img_path), label="", show_label=False, height=600, width=600)
+     default_temperature = 0.3
+     default_max_new_tokens = 1024
+     default_max_tokens = 81920
+     default_max_round = 30
+     temperature_state = gr.State(value=default_temperature)
+     max_new_tokens_state = gr.State(value=default_max_new_tokens)
+     max_tokens_state = gr.State(value=default_max_tokens)
+     max_round_state = gr.State(value=default_max_round)
+     chatbot.retry(handle_retry, chatbot, chatbot, temperature_state, max_new_tokens_state,
+                   max_tokens_state, gr.Checkbox(value=False, render=False), conversation_state, max_round_state)
+
+     gr.ChatInterface(
+         fn=agent.run_gradio_chat,
+         chatbot=chatbot,
+         fill_height=False, fill_width=False, stop_btn=True,
+         additional_inputs_accordion=gr.Accordion(
+             label="⚙️ Inference Parameters", open=False, render=False),
+         additional_inputs=[
+             temperature_state, max_new_tokens_state, max_tokens_state,
+             gr.Checkbox(
+                 label="Activate X", value=False, render=False),
+             conversation_state,
+             max_round_state,
+             gr.Number(label="Seed", value=100, render=False)
+         ],
+         examples=question_examples,
+         cache_examples=False,
+         css=chat_css,
+     )
+
+     with gr.Accordion("Settings", open=False):
+
+         # Define the sliders
+         temperature_slider = gr.Slider(
+             minimum=0,
+             maximum=1,
+             step=0.1,
+             value=default_temperature,
+             label="Temperature"
+         )
+         max_new_tokens_slider = gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=default_max_new_tokens,
+             label="Max new tokens"
+         )
+         max_tokens_slider = gr.Slider(
+             minimum=128,
+             maximum=32000,
+             step=1,
+             value=default_max_tokens,
+             label="Max tokens"
+         )
+         max_round_slider = gr.Slider(
+             minimum=0,
+             maximum=50,
+             step=1,
+             value=default_max_round,
+             label="Max round")
+
+         # Automatically update states when slider values change
+         temperature_slider.change(
+             lambda x: x, inputs=temperature_slider, outputs=temperature_state)
+         max_new_tokens_slider.change(
+             lambda x: x, inputs=max_new_tokens_slider, outputs=max_new_tokens_state)
+         max_tokens_slider.change(
+             lambda x: x, inputs=max_tokens_slider, outputs=max_tokens_state)
+         max_round_slider.change(
+             lambda x: x, inputs=max_round_slider, outputs=max_round_state)
+
+     # password_input = gr.Textbox(
+     #     label="Enter Password for More Settings", type="password")
+     # incorrect_message = gr.Textbox(visible=False, interactive=False)
+     # with gr.Accordion("⚙️ Settings", open=False, visible=False) as protected_accordion:
+     #     with gr.Row():
+     #         with gr.Column(scale=1):
+     #             with gr.Accordion("⚙️ Model Loading", open=False):
+     #                 model_name_input = gr.Textbox(
+     #                     label="Enter model path", value=model_name)
+     #                 load_model_btn = gr.Button(value="Load Model")
+     #                 load_model_btn.click(
+     #                     agent.load_models, inputs=model_name_input, outputs=gr.Textbox(label="Status"))
+     #         with gr.Column(scale=1):
+     #             with gr.Accordion("⚙️ Functional Parameters", open=False):
+     #                 # Create Gradio components for parameter inputs
+     #                 enable_finish = gr.Checkbox(
+     #                     label="Enable Finish", value=True)
+     #                 enable_rag = gr.Checkbox(
+     #                     label="Enable RAG", value=True)
+     #                 enable_summary = gr.Checkbox(
+     #                     label="Enable Summary", value=False)
+     #                 init_rag_num = gr.Number(
+     #                     label="Initial RAG Num", value=0)
+     #                 step_rag_num = gr.Number(
+     #                     label="Step RAG Num", value=10)
+     #                 skip_last_k = gr.Number(label="Skip Last K", value=0)
+     #                 summary_mode = gr.Textbox(
+     #                     label="Summary Mode", value='step')
+     #                 summary_skip_last_k = gr.Number(
+     #                     label="Summary Skip Last K", value=0)
+     #                 summary_context_length = gr.Number(
+     #                     label="Summary Context Length", value=None)
+     #                 force_finish = gr.Checkbox(
+     #                     label="Force FinalAnswer", value=True)
+     #                 seed = gr.Number(label="Seed", value=100)
+     #                 # Button to submit and update parameters
+     #                 submit_btn = gr.Button("Update Parameters")
+
+     #                 # Display the updated parameters
+     #                 updated_parameters_output = gr.JSON()
+
+     #                 # When button is clicked, update parameters
+     #                 submit_btn.click(fn=update_model_parameters,
+     #                                  inputs=[enable_finish, enable_rag, enable_summary, init_rag_num, step_rag_num, skip_last_k,
+     #                                          summary_mode, summary_skip_last_k, summary_context_length, force_finish, seed],
+     #                                  outputs=updated_parameters_output)
+     # Button to submit the password
+     # submit_button = gr.Button("Submit")
+
+     # # When the button is clicked, check if the password is correct
+     # submit_button.click(
+     #     check_password,
+     #     inputs=password_input,
+     #     outputs=[protected_accordion, incorrect_message]
+     # )
+     gr.Markdown(LICENSE)
+
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
tahoe_model/apply_linear_model.py ADDED
@@ -0,0 +1,93 @@
+ import anndata
+ import joblib
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import mean_squared_error
+ from scipy.stats import pearsonr
+ import os
+ import pandas as pd
+
+ merged_anndata = anndata.read_h5ad("data/tahoe_vision_universal_embeddings.h5ad")
+
+ X = merged_anndata.obsm["X_delta"]  # 60125 x 1280
+ Y = merged_anndata.X  # 60125 x 7467
+ labels = merged_anndata.var.index.tolist()  # 7467
+
+ X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
+
+ if os.path.exists("models/linear_regression_model.pkl"):
+     model = joblib.load("models/linear_regression_model.pkl")
+
+     y_pred_test = model.predict(X_test)
+     test_pearson = [pearsonr(y_test[:, i], y_pred_test[:, i])[0] for i in range(y_test.shape[1])]
+
+     top_gene_set_indices = np.argsort(test_pearson)[-20:][::-1]
+
+     top_gene_sets = [(test_pearson[i], labels[i]) for i in top_gene_set_indices]
+
+     print("Top 20 gene sets with the highest correlation:")
+     for correlation, gene_set in top_gene_sets:
+         print(f"gene set {gene_set}: pearson correlation = {correlation:.4f}")
+
+     plt.hist(test_pearson, bins=50, color='blue', alpha=0.7)
+     plt.title("Distribution of Pearson Correlation Coefficients (Test Set)")
+     plt.xlabel("Pearson Correlation Coefficient")
+     plt.ylabel("Frequency")
+     plt.grid(axis='y', alpha=0.75)
+
+     if not os.path.exists("figures"):
+         os.makedirs("figures")
+
+     plt.savefig("figures/pearson_correlation_distribution.png")
+
+     top_20_indices_per_row = np.argsort(np.abs(y_test), axis=1)[:, -20:]
+
+     correlations = []
+     for i in range(y_test.shape[0]):
+         actual_top_20 = y_test[i, top_20_indices_per_row[i]]
+         predicted_top_20 = y_pred_test[i, top_20_indices_per_row[i]]
+         correlation = pearsonr(actual_top_20, predicted_top_20)[0]
+         correlations.append(correlation)
+
+     average_correlation = np.mean(correlations)
+     print(f"Average correlation for top 20 magnitude gene sets per row: {average_correlation:.4f}")
+
+ else:
+     model = LinearRegression()
+
+     model.fit(X_train, y_train)
+
+     y_pred_train = model.predict(X_train)
+     y_pred_test = model.predict(X_test)
+
+     train_mse = mean_squared_error(y_train, y_pred_train)
+     test_mse = mean_squared_error(y_test, y_pred_test)
+
+     print(f"training MSE: {train_mse}")
+     print(f"testing MSE: {test_mse}")
+
+     joblib.dump(model, "models/linear_regression_model.pkl")
+
+ model = joblib.load("models/linear_regression_model.pkl")
+
+ disease_deltas = anndata.read_h5ad("data/disease_deltas.h5ad")
+ predicted_vision_signatures = model.predict(disease_deltas.X)
+
+ dataframe = pd.DataFrame(predicted_vision_signatures, columns=labels)
+
+ labels_combined = disease_deltas.obs.apply(
+     lambda row: f"{row['cell_type']}_{row['tissue']}_{row['disease']}", axis=1
+ ).tolist()
+
+ top_20_gene_sets = []
+
+ for index, row in dataframe.iterrows():
+     top_20_indices = np.argsort(np.abs(row))[-20:][::-1]
+     top_20 = [(labels[i], "down" if row.iloc[i] < 0 else "up") for i in top_20_indices]
+     top_20_gene_sets.append(top_20)
+
+ with open("top_20_gene_sets.txt", "w") as f:
+     for i, gene_set in enumerate(top_20_gene_sets):
+         f.write(f"{labels_combined[i]}\t" + "\t".join([f"{gene}:{direction}" for gene, direction in gene_set]) + "\n")
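The per-gene-set Pearson evaluation used above can be illustrated on synthetic data; `np.corrcoef` stands in for `scipy.stats.pearsonr`, and the shapes and noise level are made up for the sketch:

```python
import numpy as np

rng = np.random.default_rng(1)
n_samples, n_gene_sets = 200, 30

y_true = rng.standard_normal((n_samples, n_gene_sets))
# Predictions = truth plus noise, so correlations should be high but < 1.
y_pred = y_true + rng.standard_normal((n_samples, n_gene_sets)) * 0.3

# Column-wise Pearson correlation: one value per gene set.
per_set_r = np.array([
    np.corrcoef(y_true[:, i], y_pred[:, i])[0, 1] for i in range(n_gene_sets)
])

# Rank gene sets by correlation, best first (mirrors the argsort above).
top_idx = np.argsort(per_set_r)[-5:][::-1]
print("best r:", per_set_r[top_idx[0]])
```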
tahoe_model/compute_tahoe_deltas.py ADDED
@@ -0,0 +1,26 @@
+ import anndata
+ import numpy as np
+
+ uce = anndata.read_h5ad("data/tahoe_universal_embeddings.h5ad")
+
+ control_condition = "[('DMSO_TF', 0.0, 'uM')]"
+
+ X_delta = np.zeros_like(uce.obsm["X_uce"])
+
+ for cell_line in uce.obs["cell_line"].unique():
+     for plate in uce.obs["plate"].unique():
+         cell_plate_mask = (uce.obs["cell_line"] == cell_line) & (uce.obs["plate"] == plate)
+         control_mask = cell_plate_mask & (uce.obs["drugname_drugconc"] == control_condition)
+
+         cell_plate_indices = np.where(cell_plate_mask)[0]
+         control_indices = np.where(control_mask)[0]
+
+         # Subtract the DMSO control embedding from every embedding in this
+         # (cell line, plate) group; the broadcast assumes one control row per group.
+         X_delta[cell_plate_indices] = uce.obsm["X_uce"][cell_plate_indices] - uce.obsm["X_uce"][control_indices]
+
+
+ uce.obsm["X_delta"] = X_delta
+
+ print("X_uce shape", uce.obsm["X_uce"].shape)
+ print("X_delta shape", uce.obsm["X_delta"].shape)
+
+ uce.write("data/tahoe_universal_embeddings_deltas.h5ad")
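The delta computation above subtracts each (cell line, plate) group's control embedding from every embedding in that group. A toy numpy sketch of the same pattern (group labels, dimensions, and embeddings are synthetic, and it likewise assumes exactly one control row per group):

```python
import numpy as np

rng = np.random.default_rng(2)
n_obs, dim = 6, 4
emb = rng.standard_normal((n_obs, dim))

groups = np.array(["A", "A", "A", "B", "B", "B"])       # e.g. cell_line + plate
is_control = np.array([True, False, False, True, False, False])  # e.g. DMSO wells

deltas = np.zeros_like(emb)
for g in np.unique(groups):
    group_mask = groups == g
    # Assumes exactly one control row per group, as in the script above.
    control_row = emb[group_mask & is_control][0]
    deltas[group_mask] = emb[group_mask] - control_row

# Control rows end up as zero vectors; treated rows are offsets from control.
print(deltas[0], deltas[3])
```

If a group can contain several control wells, replacing `[0]` with `.mean(axis=0)` over the control rows would be the natural generalization.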
tahoe_model/merge_tahoe_vision.py ADDED
@@ -0,0 +1,66 @@
+ import anndata
+ import pandas as pd
+
+ uce = anndata.read_h5ad("data/tahoe_universal_embeddings_deltas.h5ad")
+ vision = anndata.read_h5ad("data/tahoe_vision_scores.h5ad")
+
+ uce.obs = uce.obs.reset_index().rename(columns={"index": "condition"})
+ vision.obs["condition"] = vision.obs.apply(
+     lambda row: f"{row['Cell_ID_Cellosaur']}_{[(row['drug'], row['concentration'], row['concentration_unit'])]}_plate{row['plate']}", axis=1
+ )
+
+ unique_cell_lines_uce = uce.obs["cell_line"].unique().tolist()
+ unique_drugs_uce = uce.obs["drugname_drugconc"].apply(
+     lambda x: eval(x)[0][0]
+ ).unique().tolist()
+
+ # print("number of unique cell lines:", len(unique_cell_lines_uce))
+ # print(unique_cell_lines_uce)
+
+ # print("\nnumber of unique drugs:", len(unique_drugs_uce))
+ # print(unique_drugs_uce)
+
+ conditions_uce = set(uce.obs["condition"].unique())
+ conditions_vision = set(vision.obs["condition"].unique())
+
+ only_in_uce = conditions_uce - conditions_vision
+ only_in_vision = conditions_vision - conditions_uce
+
+ with open("conditions_only_in_uce.txt", "w") as f:
+     for condition in only_in_uce:
+         f.write(f"{condition}\n")
+
+ with open("conditions_only_in_vision.txt", "w") as f:
+     for condition in only_in_vision:
+         f.write(f"{condition}\n")
+
+ vision = vision[vision.obs["condition"].drop_duplicates(keep="first").index, :]
+ vision.obs = vision.obs.reset_index(drop=True)
+
+ merged_obs = pd.merge(
+     uce.obs,
+     vision.obs,
+     on="condition",
+     how="inner"
+ )
+
+ indices_in_vision = vision.obs.index[
+     vision.obs["condition"].isin(merged_obs["condition"])
+ ].tolist()
+
+ indices_in_uce = uce.obs.index[
+     uce.obs["condition"].isin(merged_obs["condition"])
+ ].tolist()
+
+ indices_in_vision = [int(x) for x in indices_in_vision]
+ indices_in_uce = [int(x) for x in indices_in_uce]
+
+ anndata_merged = anndata.AnnData(
+     X=vision.X[indices_in_vision, :],
+     obs=merged_obs,
+     var=vision.var,
+     obsm={"X_uce": uce.obsm["X_uce"][indices_in_uce, :],
+           "X_delta": uce.obsm["X_delta"][indices_in_uce, :]}
+ )
+
+ anndata_merged.write("data/tahoe_vision_universal_embeddings.h5ad")
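The alignment step above pairs rows from the two AnnData objects through an inner merge on the shared `condition` key. A small pandas sketch of that pattern (toy frames, not the real metadata):

```python
import pandas as pd

uce_obs = pd.DataFrame({
    "condition": ["c1", "c2", "c3"],
    "cell_line": ["A549", "A549", "HCT15"],
})
vision_obs = pd.DataFrame({
    "condition": ["c2", "c3", "c4"],
    "score": [0.1, 0.5, 0.9],
})

# Inner merge keeps only conditions present in both tables.
merged = pd.merge(uce_obs, vision_obs, on="condition", how="inner")
print(merged["condition"].tolist())
# → ['c2', 'c3']
```

Rows unique to either side (`c1`, `c4`) are dropped, which is exactly why the script writes the `conditions_only_in_*.txt` files before merging: they record what the inner join discards.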