Pulastya B committed on
Commit
1932673
Β·
1 Parent(s): 8a420cb

Fixed an argument normalization bug

Browse files
Files changed (2) hide show
  1. src/orchestrator.py +68 -10
  2. src/tools/auto_pipeline.py +1 -1
src/orchestrator.py CHANGED
@@ -704,11 +704,11 @@ structure, variable relationships, and expected insights - not hardcoded domain
704
  1. profile_dataset(file_path) - ONCE ONLY
705
  2. detect_data_quality_issues(file_path) - ONCE ONLY
706
  3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
707
- 4. clean_missing_values(file_path, strategy="auto", output="./outputs/data/cleaned.csv")
708
- 5. handle_outliers(cleaned, method="clip", columns=["all"], output="./outputs/data/no_outliers.csv")
709
- 6. force_numeric_conversion(latest, columns=["all"], output="./outputs/data/numeric.csv", errors="coerce")
710
- 7. **IF DATETIME COLUMNS EXIST**: create_time_features(latest, date_col="<column_name>", output="./outputs/data/time_features.csv") - Extract year/month/day/hour/weekday/timestamp from each datetime column
711
- 8. encode_categorical(latest, method="auto", output="./outputs/data/encoded.csv")
712
  9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
713
  10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
714
  - AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
@@ -2283,6 +2283,69 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
2283
  val = arguments.pop(invalid_param)
2284
  print(f" βœ“ Stripped invalid parameter '{invalid_param}': {val}")
2285
  print(f" ℹ️ create_statistical_features creates row-wise stats (mean, std, min, max)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2286
 
2287
  # πŸ”§ FIX: analyze_autogluon_model path resolution
2288
  # The Reasoner hallucinates model paths β€” resolve to actual saved path
@@ -2338,11 +2401,6 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
2338
  except (ValueError, TypeError):
2339
  pass # Can't inspect, skip validation
2340
 
2341
- # General parameter corrections for common LLM hallucinations
2342
- if "output" in arguments and "output_path" not in arguments:
2343
- # Many tools use 'output_path' but LLM uses 'output'
2344
- arguments["output_path"] = arguments.pop("output")
2345
-
2346
  # Fix "None" string being passed as actual None
2347
  for key, value in list(arguments.items()):
2348
  if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
 
704
  1. profile_dataset(file_path) - ONCE ONLY
705
  2. detect_data_quality_issues(file_path) - ONCE ONLY
706
  3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
707
+ 4. clean_missing_values(file_path=file_path, strategy="auto", output_path="./outputs/data/cleaned.csv")
708
+ 5. handle_outliers(file_path=cleaned, method="clip", columns=["all"], output_path="./outputs/data/no_outliers.csv")
709
+ 6. force_numeric_conversion(file_path=latest, columns=["all"], output_path="./outputs/data/numeric.csv", errors="coerce")
710
+ 7. **IF DATETIME COLUMNS EXIST**: create_time_features(file_path=latest, date_col="<column_name>", output_path="./outputs/data/time_features.csv") - Extract year/month/day/hour/weekday/timestamp from each datetime column
711
+ 8. encode_categorical(file_path=latest, method="auto", output_path="./outputs/data/encoded.csv")
712
  9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
713
  10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
714
  - AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
 
2283
  val = arguments.pop(invalid_param)
2284
  print(f" βœ“ Stripped invalid parameter '{invalid_param}': {val}")
2285
  print(f" ℹ️ create_statistical_features creates row-wise stats (mean, std, min, max)")
2286
+
2287
+ # General parameter corrections for common LLM hallucinations
2288
+ # IMPORTANT: Do this BEFORE generic invalid-arg stripping.
2289
+ if "output" in arguments and "output_path" not in arguments:
2290
+ arguments["output_path"] = arguments.pop("output")
2291
+ print(f" βœ“ Parameter remapped: output β†’ output_path")
2292
+
2293
+ # Common file path aliases used by LLM plans/prompts
2294
+ for alias in ["data_path", "input_file", "input", "path", "latest"]:
2295
+ if alias in arguments and "file_path" not in arguments:
2296
+ arguments["file_path"] = arguments.pop(alias)
2297
+ print(f" βœ“ Parameter remapped: {alias} β†’ file_path")
2298
+ break
2299
+
2300
+ # create_time_features is frequently called with alias column names
2301
+ if tool_name == "create_time_features":
2302
+ for alias in ["date_column", "datetime_column", "datetime_col", "time_col", "column", "col"]:
2303
+ if alias in arguments and "date_col" not in arguments:
2304
+ arguments["date_col"] = arguments.pop(alias)
2305
+ print(f" βœ“ Parameter remapped: {alias} β†’ date_col")
2306
+ break
2307
+
2308
+ # Auto-fill output path if omitted
2309
+ if "output_path" not in arguments:
2310
+ arguments["output_path"] = str(self.output_base / "data" / "time_features.csv")
2311
+ print(f" βœ“ Parameter defaulted: output_path β†’ {arguments['output_path']}")
2312
+
2313
+ # Auto-detect datetime column if date_col is missing
2314
+ if "date_col" not in arguments and arguments.get("file_path"):
2315
+ try:
2316
+ import polars as pl
2317
+ fp = arguments["file_path"]
2318
+ df = pl.read_csv(fp) if str(fp).endswith(".csv") else pl.read_parquet(fp)
2319
+
2320
+ preferred_names = [
2321
+ "pickup_time", "pickup_datetime", "dropoff_time", "dropoff_datetime",
2322
+ "timestamp", "datetime", "date", "time"
2323
+ ]
2324
+ matched = next((c for c in preferred_names if c in df.columns), None)
2325
+
2326
+ if not matched:
2327
+ # Prefer true datetime/date dtypes first
2328
+ dt_cols = [
2329
+ c for c in df.columns
2330
+ if df[c].dtype in [pl.Date, pl.Datetime]
2331
+ ]
2332
+ if dt_cols:
2333
+ matched = dt_cols[0]
2334
+
2335
+ if not matched:
2336
+ # Fallback heuristic by name
2337
+ name_hint_cols = [
2338
+ c for c in df.columns
2339
+ if any(k in c.lower() for k in ["date", "time", "timestamp"])
2340
+ ]
2341
+ if name_hint_cols:
2342
+ matched = name_hint_cols[0]
2343
+
2344
+ if matched:
2345
+ arguments["date_col"] = matched
2346
+ print(f" βœ“ Auto-detected date_col: {matched}")
2347
+ except Exception as infer_err:
2348
+ print(f" ⚠️ Could not auto-detect date_col: {infer_err}")
2349
 
2350
  # πŸ”§ FIX: analyze_autogluon_model path resolution
2351
  # The Reasoner hallucinates model paths β€” resolve to actual saved path
 
2401
  except (ValueError, TypeError):
2402
  pass # Can't inspect, skip validation
2403
 
 
 
 
 
 
2404
  # Fix "None" string being passed as actual None
2405
  for key, value in list(arguments.items()):
2406
  if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
src/tools/auto_pipeline.py CHANGED
@@ -156,7 +156,7 @@ def auto_ml_pipeline(file_path: str,
156
  try:
157
  time_result = create_time_features(
158
  current_file,
159
- date_column=dt_col,
160
  output_path=current_file # Overwrite
161
  )
162
  results["transformations_applied"].append({
 
156
  try:
157
  time_result = create_time_features(
158
  current_file,
159
+ date_col=dt_col,
160
  output_path=current_file # Overwrite
161
  )
162
  results["transformations_applied"].append({