bobbypaton committed on
Commit
899f17e
·
1 Parent(s): 99ae44b

Log predictions to patonlab/analytics dataset

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -0
  2. worker.py +44 -5
requirements.txt CHANGED
@@ -21,4 +21,7 @@ scipy==1.3.3
21
  scikit-learn==0.21.3
22
  tqdm==4.64.1
23
 
 
 
 
24
  # nfp is vendored locally — no PyPI entry needed
 
21
  scikit-learn==0.21.3
22
  tqdm==4.64.1
23
 
24
+ # HF dataset logging
25
+ huggingface_hub==0.16.4
26
+
27
  # nfp is vendored locally — no PyPI entry needed
worker.py CHANGED
@@ -12,6 +12,7 @@ warnings.filterwarnings("ignore", category=FutureWarning)
12
  import json
13
  import math
14
  import pickle
 
15
  from io import StringIO
16
 
17
  import redis
@@ -68,6 +69,46 @@ redis_client = redis.StrictRedis(
68
  host="localhost", port=6379, db=0, decode_responses=True
69
  )
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def _mol_to_sdf(mol, conf_id=0):
73
  sio = StringIO()
@@ -140,7 +181,6 @@ def _fmt_relative_E(spread_df, energy_order):
140
  def run_job(task_id, smiles, type_):
141
  result_key = f"task_result_{task_id}"
142
  try:
143
 - # Prepare mol — preprocess_C/H expects a mol without Hs (it adds them internally)
144
  mol = Chem.MolFromSmiles(smiles)
145
  AllChem.EmbedMolecule(mol, useRandomCoords=True)
146
  mol_with_h = Chem.AddHs(mol, addCoords=True)
@@ -149,12 +189,8 @@ def run_job(task_id, smiles, type_):
149
  mol_with_confs, ids, nr = genConf(mol_with_h, rms=-1, nc=200, efilter=10.0, rmspost=0.5)
150
  print(f"genConf: {len(ids)} conformers", flush=True)
151
 
152
- # Build SDFs directly from genConf mol (energy order, lowest first)
153
  conf_sdfs, energy_order = _build_sdfs_from_genconf(mol_with_confs, ids)
154
 
155
- # Run GNN preprocessing and inference using the same mol_with_confs
156
- # Pass mol_with_confs as a list β€” preprocess will use its existing conformers
157
- # by calling genConf internally, but we suppress that output
158
  mols = [Chem.MolFromSmiles(smiles)]
159
  for m in mols:
160
  AllChem.EmbedMolecule(m, useRandomCoords=True)
@@ -215,6 +251,9 @@ def run_job(task_id, smiles, type_):
215
  redis_client.set(result_key, json.dumps(result), ex=3600)
216
  print(f"Task {task_id} complete — {len(conf_sdfs)} conformers", flush=True)
217
 
 
 
 
218
  except Exception as e:
219
  import traceback; traceback.print_exc()
220
  redis_client.set(result_key, json.dumps({"errMessage": str(e)}), ex=3600)
 
12
  import json
13
  import math
14
  import pickle
15
+ import datetime
16
  from io import StringIO
17
 
18
  import redis
 
69
  host="localhost", port=6379, db=0, decode_responses=True
70
  )
71
 
72
+ # ── Analytics logging to HF Dataset ──────────────────────────────────────────
73
+ _HF_TOKEN = os.environ.get("HF_TOKEN", "")
74
+ _ANALYTICS_REPO = "patonlab/analytics"
75
+ _ANALYTICS_FILE = "data.csv"
76
+
77
+ def _log_prediction():
78
+ """Append one row to the existing patonlab/analytics data.csv.
79
+ Format matches the alfabet log: space,timestamp
80
+ """
81
+ if not _HF_TOKEN:
82
+ return
83
+ try:
84
+ from huggingface_hub import HfApi
85
+ import tempfile
86
+
87
+ api = HfApi(token=_HF_TOKEN)
88
+ timestamp = datetime.datetime.utcnow().isoformat()
89
+
90
+ # Download the current CSV, append a row, re-upload
91
+ with tempfile.TemporaryDirectory() as tmpdir:
92
+ local_path = os.path.join(tmpdir, "data.csv")
93
+ api.hf_hub_download(
94
+ repo_id=_ANALYTICS_REPO,
95
+ filename=_ANALYTICS_FILE,
96
+ repo_type="dataset",
97
+ local_dir=tmpdir,
98
+ )
99
+ with open(local_path, "a") as f:
100
+ f.write(f"patonlab/cascade,{timestamp}\n")
101
+
102
+ api.upload_file(
103
+ path_or_fileobj=local_path,
104
+ path_in_repo=_ANALYTICS_FILE,
105
+ repo_id=_ANALYTICS_REPO,
106
+ repo_type="dataset",
107
+ commit_message=f"log: cascade prediction {timestamp[:10]}",
108
+ )
109
+ except Exception as e:
110
+ print(f"Analytics logging failed (non-fatal): {e}", flush=True)
111
+
112
 
113
  def _mol_to_sdf(mol, conf_id=0):
114
  sio = StringIO()
 
181
  def run_job(task_id, smiles, type_):
182
  result_key = f"task_result_{task_id}"
183
  try:
 
184
  mol = Chem.MolFromSmiles(smiles)
185
  AllChem.EmbedMolecule(mol, useRandomCoords=True)
186
  mol_with_h = Chem.AddHs(mol, addCoords=True)
 
189
  mol_with_confs, ids, nr = genConf(mol_with_h, rms=-1, nc=200, efilter=10.0, rmspost=0.5)
190
  print(f"genConf: {len(ids)} conformers", flush=True)
191
 
 
192
  conf_sdfs, energy_order = _build_sdfs_from_genconf(mol_with_confs, ids)
193
 
 
 
 
194
  mols = [Chem.MolFromSmiles(smiles)]
195
  for m in mols:
196
  AllChem.EmbedMolecule(m, useRandomCoords=True)
 
251
  redis_client.set(result_key, json.dumps(result), ex=3600)
252
  print(f"Task {task_id} complete — {len(conf_sdfs)} conformers", flush=True)
253
 
254
 + # Log to analytics dataset (best-effort; failures are caught and non-fatal)
255
+ _log_prediction()
256
+
257
  except Exception as e:
258
  import traceback; traceback.print_exc()
259
  redis_client.set(result_key, json.dumps({"errMessage": str(e)}), ex=3600)