mekosotto Claude Sonnet 4.6 committed on
Commit
32d3a5f
·
1 Parent(s): 48cf9c9

refactor(bbb): pin LF lineterminator; move DEFAULTs to top; reject dir output

Browse files
src/pipelines/bbb_pipeline.py CHANGED
@@ -34,6 +34,11 @@ logger = get_logger(__name__)
34
  RDLogger.DisableLog("rdApp.*")
35
 
36
 
 
 
 
 
 
37
  def is_valid_smiles(smiles: str | float | None) -> bool:
38
  """Return True iff `smiles` is a non-empty string parseable by RDKit.
39
 
@@ -180,10 +185,6 @@ def extract_features_from_dataframe(
180
  return out
181
 
182
 
183
- DEFAULT_INPUT = Path("data/raw/bbbp.csv")
184
- DEFAULT_OUTPUT = Path("data/processed/bbbp_features.csv")
185
-
186
-
187
  def run_pipeline(
188
  input_path: Path = DEFAULT_INPUT,
189
  output_path: Path = DEFAULT_OUTPUT,
@@ -207,6 +208,7 @@ def run_pipeline(
207
 
208
  Raises:
209
  FileNotFoundError: if `input_path` does not exist.
 
210
  KeyError: if `smiles_col` is missing from the CSV.
211
  """
212
  input_path = Path(input_path)
@@ -217,14 +219,18 @@ def run_pipeline(
217
 
218
  logger.info("Reading raw BBBP from %s", input_path)
219
  df = pd.read_csv(input_path)
220
- logger.info("Loaded %d rows, columns=%s", len(df), list(df.columns))
221
 
222
  features = extract_features_from_dataframe(
223
  df, smiles_col=smiles_col, n_bits=n_bits, radius=radius,
224
  )
225
 
226
  output_path.parent.mkdir(parents=True, exist_ok=True)
227
- features.to_csv(output_path, index=False)
 
 
 
 
228
  logger.info(
229
  "Wrote processed features to %s (rows=%d, cols=%d)",
230
  output_path, len(features), features.shape[1],
@@ -232,6 +238,7 @@ def run_pipeline(
232
 
233
 
234
  if __name__ == "__main__":
235
- # Production-ready CLI entrypoint:
 
236
  # python -m src.pipelines.bbb_pipeline
237
  run_pipeline()
 
34
  RDLogger.DisableLog("rdApp.*")
35
 
36
 
37
+ # Default I/O paths for the BBB pipeline. Override via run_pipeline() args.
38
+ DEFAULT_INPUT = Path("data/raw/bbbp.csv")
39
+ DEFAULT_OUTPUT = Path("data/processed/bbbp_features.csv")
40
+
41
+
42
  def is_valid_smiles(smiles: str | float | None) -> bool:
43
  """Return True iff `smiles` is a non-empty string parseable by RDKit.
44
 
 
185
  return out
186
 
187
 
 
 
 
 
188
  def run_pipeline(
189
  input_path: Path = DEFAULT_INPUT,
190
  output_path: Path = DEFAULT_OUTPUT,
 
208
 
209
  Raises:
210
  FileNotFoundError: if `input_path` does not exist.
211
+ IsADirectoryError: if `output_path` resolves to an existing directory.
212
  KeyError: if `smiles_col` is missing from the CSV.
213
  """
214
  input_path = Path(input_path)
 
219
 
220
  logger.info("Reading raw BBBP from %s", input_path)
221
  df = pd.read_csv(input_path)
222
+ logger.info("Loaded %d rows, %d columns", len(df), len(df.columns))
223
 
224
  features = extract_features_from_dataframe(
225
  df, smiles_col=smiles_col, n_bits=n_bits, radius=radius,
226
  )
227
 
228
  output_path.parent.mkdir(parents=True, exist_ok=True)
229
+ if output_path.is_dir():
230
+ raise IsADirectoryError(
231
+ f"output_path must be a file, got a directory: {output_path}"
232
+ )
233
+ features.to_csv(output_path, index=False, lineterminator="\n")
234
  logger.info(
235
  "Wrote processed features to %s (rows=%d, cols=%d)",
236
  output_path, len(features), features.shape[1],
 
238
 
239
 
240
  if __name__ == "__main__":
241
+ # Day-1 CLI entrypoint — runs with default paths against `data/raw/bbbp.csv`.
242
+ # Argument parsing (argparse / click) will land in a later task.
243
  # python -m src.pipelines.bbb_pipeline
244
  run_pipeline()
tests/pipelines/test_bbb_pipeline.py CHANGED
@@ -186,3 +186,16 @@ class TestRunPipeline:
186
  input_path=tmp_path / "nope.csv",
187
  output_path=tmp_path / "out.csv",
188
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  input_path=tmp_path / "nope.csv",
187
  output_path=tmp_path / "out.csv",
188
  )
189
+
190
+ def test_run_pipeline_rejects_directory_as_output(self, tmp_path: Path) -> None:
191
+ raw_dir = tmp_path / "data" / "raw"
192
+ raw_dir.mkdir(parents=True)
193
+ input_path = raw_dir / "bbbp.csv"
194
+ shutil.copy(FIXTURE, input_path)
195
+
196
+ # output_path points at an existing directory, not a file
197
+ bad_output = tmp_path / "out_dir"
198
+ bad_output.mkdir()
199
+
200
+ with pytest.raises(IsADirectoryError, match="must be a file"):
201
+ run_pipeline(input_path=input_path, output_path=bad_output, n_bits=32)