Pulastya B committed on
Commit
1932673
Β·
1 Parent(s): 8a420cb

Fixed an argument normalization bug

Browse files
Files changed (2) hide show
  1. src/orchestrator.py +68 -10
  2. src/tools/auto_pipeline.py +1 -1
src/orchestrator.py CHANGED
@@ -704,11 +704,11 @@ structure, variable relationships, and expected insights - not hardcoded domain
704
  1. profile_dataset(file_path) - ONCE ONLY
705
  2. detect_data_quality_issues(file_path) - ONCE ONLY
706
  3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
707
- 4. clean_missing_values(file_path, strategy="auto", output="./outputs/data/cleaned.csv")
708
- 5. handle_outliers(cleaned, method="clip", columns=["all"], output="./outputs/data/no_outliers.csv")
709
- 6. force_numeric_conversion(latest, columns=["all"], output="./outputs/data/numeric.csv", errors="coerce")
710
- 7. **IF DATETIME COLUMNS EXIST**: create_time_features(latest, date_col="<column_name>", output="./outputs/data/time_features.csv") - Extract year/month/day/hour/weekday/timestamp from each datetime column
711
- 8. encode_categorical(latest, method="auto", output="./outputs/data/encoded.csv")
712
  9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
713
  10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
714
  - AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
@@ -2283,6 +2283,69 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
2283
  val = arguments.pop(invalid_param)
2284
  print(f" βœ“ Stripped invalid parameter '{invalid_param}': {val}")
2285
  print(f" ℹ️ create_statistical_features creates row-wise stats (mean, std, min, max)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2286
 
2287
  # πŸ”§ FIX: analyze_autogluon_model path resolution
2288
  # The Reasoner hallucinates model paths β€” resolve to actual saved path
@@ -2338,11 +2401,6 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
2338
  except (ValueError, TypeError):
2339
  pass # Can't inspect, skip validation
2340
 
2341
- # General parameter corrections for common LLM hallucinations
2342
- if "output" in arguments and "output_path" not in arguments:
2343
- # Many tools use 'output_path' but LLM uses 'output'
2344
- arguments["output_path"] = arguments.pop("output")
2345
-
2346
  # Fix "None" string being passed as actual None
2347
  for key, value in list(arguments.items()):
2348
  if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
 
704
  1. profile_dataset(file_path) - ONCE ONLY
705
  2. detect_data_quality_issues(file_path) - ONCE ONLY
706
  3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
707
+ 4. clean_missing_values(file_path=file_path, strategy="auto", output_path="./outputs/data/cleaned.csv")
708
+ 5. handle_outliers(file_path=cleaned, method="clip", columns=["all"], output_path="./outputs/data/no_outliers.csv")
709
+ 6. force_numeric_conversion(file_path=latest, columns=["all"], output_path="./outputs/data/numeric.csv", errors="coerce")
710
+ 7. **IF DATETIME COLUMNS EXIST**: create_time_features(file_path=latest, date_col="<column_name>", output_path="./outputs/data/time_features.csv") - Extract year/month/day/hour/weekday/timestamp from each datetime column
711
+ 8. encode_categorical(file_path=latest, method="auto", output_path="./outputs/data/encoded.csv")
712
  9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
713
  10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
714
  - AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
 
2283
  val = arguments.pop(invalid_param)
2284
  print(f" βœ“ Stripped invalid parameter '{invalid_param}': {val}")
2285
  print(f" ℹ️ create_statistical_features creates row-wise stats (mean, std, min, max)")
2286
+
2287
+ # General parameter corrections for common LLM hallucinations
2288
+ # IMPORTANT: Do this BEFORE generic invalid-arg stripping.
2289
+ if "output" in arguments and "output_path" not in arguments:
2290
+ arguments["output_path"] = arguments.pop("output")
2291
+ print(f" βœ“ Parameter remapped: output β†’ output_path")
2292
+
2293
+ # Common file path aliases used by LLM plans/prompts
2294
+ for alias in ["data_path", "input_file", "input", "path", "latest"]:
2295
+ if alias in arguments and "file_path" not in arguments:
2296
+ arguments["file_path"] = arguments.pop(alias)
2297
+ print(f" βœ“ Parameter remapped: {alias} β†’ file_path")
2298
+ break
2299
+
2300
+ # create_time_features is frequently called with alias column names
2301
+ if tool_name == "create_time_features":
2302
+ for alias in ["date_column", "datetime_column", "datetime_col", "time_col", "column", "col"]:
2303
+ if alias in arguments and "date_col" not in arguments:
2304
+ arguments["date_col"] = arguments.pop(alias)
2305
+ print(f" βœ“ Parameter remapped: {alias} β†’ date_col")
2306
+ break
2307
+
2308
+ # Auto-fill output path if omitted
2309
+ if "output_path" not in arguments:
2310
+ arguments["output_path"] = str(self.output_base / "data" / "time_features.csv")
2311
+ print(f" βœ“ Parameter defaulted: output_path β†’ {arguments['output_path']}")
2312
+
2313
+ # Auto-detect datetime column if date_col is missing
2314
+ if "date_col" not in arguments and arguments.get("file_path"):
2315
+ try:
2316
+ import polars as pl
2317
+ fp = arguments["file_path"]
2318
+ df = pl.read_csv(fp) if str(fp).endswith(".csv") else pl.read_parquet(fp)
2319
+
2320
+ preferred_names = [
2321
+ "pickup_time", "pickup_datetime", "dropoff_time", "dropoff_datetime",
2322
+ "timestamp", "datetime", "date", "time"
2323
+ ]
2324
+ matched = next((c for c in preferred_names if c in df.columns), None)
2325
+
2326
+ if not matched:
2327
+ # Prefer true datetime/date dtypes first
2328
+ dt_cols = [
2329
+ c for c in df.columns
2330
+ if df[c].dtype in [pl.Date, pl.Datetime]
2331
+ ]
2332
+ if dt_cols:
2333
+ matched = dt_cols[0]
2334
+
2335
+ if not matched:
2336
+ # Fallback heuristic by name
2337
+ name_hint_cols = [
2338
+ c for c in df.columns
2339
+ if any(k in c.lower() for k in ["date", "time", "timestamp"])
2340
+ ]
2341
+ if name_hint_cols:
2342
+ matched = name_hint_cols[0]
2343
+
2344
+ if matched:
2345
+ arguments["date_col"] = matched
2346
+ print(f" βœ“ Auto-detected date_col: {matched}")
2347
+ except Exception as infer_err:
2348
+ print(f" ⚠️ Could not auto-detect date_col: {infer_err}")
2349
 
2350
  # πŸ”§ FIX: analyze_autogluon_model path resolution
2351
  # The Reasoner hallucinates model paths β€” resolve to actual saved path
 
2401
  except (ValueError, TypeError):
2402
  pass # Can't inspect, skip validation
2403
 
 
 
 
 
 
2404
  # Fix "None" string being passed as actual None
2405
  for key, value in list(arguments.items()):
2406
  if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
src/tools/auto_pipeline.py CHANGED
@@ -156,7 +156,7 @@ def auto_ml_pipeline(file_path: str,
156
  try:
157
  time_result = create_time_features(
158
  current_file,
159
- date_column=dt_col,
160
  output_path=current_file # Overwrite
161
  )
162
  results["transformations_applied"].append({
 
156
  try:
157
  time_result = create_time_features(
158
  current_file,
159
+ date_col=dt_col,
160
  output_path=current_file # Overwrite
161
  )
162
  results["transformations_applied"].append({