Spaces:
Running
Running
Commit ·
c0948c4
1
Parent(s): 7cf9dfd
update model
Browse files
app.py
CHANGED
|
@@ -73,6 +73,15 @@ ASSETS_DATA = ASSETS / "training_data_cleaned"; ASSETS_DATA.mkdir(parents=True
|
|
| 73 |
|
| 74 |
MODEL_REPO = "ChatterjeeLab/PeptiVerse" # model repo
|
| 75 |
DATASET_REPO = "ChatterjeeLab/PeptiVerse" # dataset repo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def canon_model(parsed) -> Optional[str]:
|
| 78 |
"""Return the bare lowercase model name from a parsed (model, emb_tag) tuple or raw string."""
|
|
@@ -88,9 +97,6 @@ def get_required_patterns(manifest_path: Path) -> List[str]:
|
|
| 88 |
|
| 89 |
manifest = read_best_manifest_csv(manifest_path)
|
| 90 |
patterns = set()
|
| 91 |
-
|
| 92 |
-
patterns.add("tokenizer/new_vocab.txt")
|
| 93 |
-
patterns.add("tokenizer/new_splits.txt")
|
| 94 |
patterns.add("training_data_cleaned/**/*.csv")
|
| 95 |
|
| 96 |
for prop_key, row in manifest.items():
|
|
@@ -159,18 +165,9 @@ def fetch_models_and_data():
|
|
| 159 |
"training_data_cleaned/**/*.csv",
|
| 160 |
],
|
| 161 |
)
|
| 162 |
-
|
| 163 |
-
fetch_models_and_data()
|
| 164 |
"""
|
| 165 |
-
|
| 166 |
-
TRAINING_ROOT = ASSETS_MODELS / "training_classifiers"
|
| 167 |
-
TOKENIZER_DIR = ASSETS_MODELS / "tokenizer"
|
| 168 |
-
|
| 169 |
-
# Banned models that should fall back to XGB
|
| 170 |
-
BANNED_MODELS = {"svm", "enet", "svm_gpu", "enet_gpu"}
|
| 171 |
|
| 172 |
-
# "lower is better" exceptions for classification labeling
|
| 173 |
-
LOWER_BETTER = {"hemolysis", "toxicity"}
|
| 174 |
|
| 175 |
# Property display names and descriptions
|
| 176 |
PROPERTY_INFO = {
|
|
@@ -313,8 +310,8 @@ class AppContext:
|
|
| 313 |
classifier_weight_root=ASSETS_MODELS,
|
| 314 |
esm_name="facebook/esm2_t33_650M_UR50D",
|
| 315 |
clm_name="aaronfeller/PeptideCLM-23M-all",
|
| 316 |
-
smiles_vocab=str(
|
| 317 |
-
smiles_splits=str(
|
| 318 |
device=str(self.device),
|
| 319 |
)
|
| 320 |
|
|
|
|
| 73 |
|
| 74 |
MODEL_REPO = "ChatterjeeLab/PeptiVerse" # model repo
|
| 75 |
DATASET_REPO = "ChatterjeeLab/PeptiVerse" # dataset repo
|
| 76 |
+
BEST_TXT = Path("basic_models.txt")
|
| 77 |
+
TRAINING_ROOT = ASSETS_MODELS / "training_classifiers"
|
| 78 |
+
#TOKENIZER_DIR = ASSETS_MODELS / "tokenizer"
|
| 79 |
+
|
| 80 |
+
# Banned models that should fall back to XGB
|
| 81 |
+
BANNED_MODELS = {"svm", "enet", "svm_gpu", "enet_gpu"}
|
| 82 |
+
|
| 83 |
+
# "lower is better" exceptions for classification labeling
|
| 84 |
+
LOWER_BETTER = {"hemolysis", "toxicity"}
|
| 85 |
|
| 86 |
def canon_model(parsed) -> Optional[str]:
|
| 87 |
"""Return the bare lowercase model name from a parsed (model, emb_tag) tuple or raw string."""
|
|
|
|
| 97 |
|
| 98 |
manifest = read_best_manifest_csv(manifest_path)
|
| 99 |
patterns = set()
|
|
|
|
|
|
|
|
|
|
| 100 |
patterns.add("training_data_cleaned/**/*.csv")
|
| 101 |
|
| 102 |
for prop_key, row in manifest.items():
|
|
|
|
| 165 |
"training_data_cleaned/**/*.csv",
|
| 166 |
],
|
| 167 |
)
|
|
|
|
|
|
|
| 168 |
"""
|
| 169 |
+
fetch_models_and_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# Property display names and descriptions
|
| 173 |
PROPERTY_INFO = {
|
|
|
|
| 310 |
classifier_weight_root=ASSETS_MODELS,
|
| 311 |
esm_name="facebook/esm2_t33_650M_UR50D",
|
| 312 |
clm_name="aaronfeller/PeptideCLM-23M-all",
|
| 313 |
+
smiles_vocab=str(Path(__file__).parent / "tokenizer" / "new_vocab.txt"),
|
| 314 |
+
smiles_splits=str(Path(__file__).parent / "tokenizer" / "new_splits.txt"),
|
| 315 |
device=str(self.device),
|
| 316 |
)
|
| 317 |
|