Spaces:
Running
Running
Pulastya B committed on
Commit ·
1932673
1
Parent(s): 8a420cb
Fixed an argument normalization bug
Browse files- src/orchestrator.py +68 -10
- src/tools/auto_pipeline.py +1 -1
src/orchestrator.py
CHANGED
|
@@ -704,11 +704,11 @@ structure, variable relationships, and expected insights - not hardcoded domain
|
|
| 704 |
1. profile_dataset(file_path) - ONCE ONLY
|
| 705 |
2. detect_data_quality_issues(file_path) - ONCE ONLY
|
| 706 |
3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
|
| 707 |
-
4. clean_missing_values(file_path, strategy="auto",
|
| 708 |
-
5. handle_outliers(cleaned, method="clip", columns=["all"],
|
| 709 |
-
6. force_numeric_conversion(latest, columns=["all"],
|
| 710 |
-
7. **IF DATETIME COLUMNS EXIST**: create_time_features(latest, date_col="<column_name>",
|
| 711 |
-
8. encode_categorical(latest, method="auto",
|
| 712 |
9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
|
| 713 |
10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
|
| 714 |
- AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
|
|
@@ -2283,6 +2283,69 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
|
|
| 2283 |
val = arguments.pop(invalid_param)
|
| 2284 |
print(f" β Stripped invalid parameter '{invalid_param}': {val}")
|
| 2285 |
print(f" βΉοΈ create_statistical_features creates row-wise stats (mean, std, min, max)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2286 |
|
| 2287 |
# π§ FIX: analyze_autogluon_model path resolution
|
| 2288 |
# The Reasoner hallucinates model paths β resolve to actual saved path
|
|
@@ -2338,11 +2401,6 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
|
|
| 2338 |
except (ValueError, TypeError):
|
| 2339 |
pass # Can't inspect, skip validation
|
| 2340 |
|
| 2341 |
-
# General parameter corrections for common LLM hallucinations
|
| 2342 |
-
if "output" in arguments and "output_path" not in arguments:
|
| 2343 |
-
# Many tools use 'output_path' but LLM uses 'output'
|
| 2344 |
-
arguments["output_path"] = arguments.pop("output")
|
| 2345 |
-
|
| 2346 |
# Fix "None" string being passed as actual None
|
| 2347 |
for key, value in list(arguments.items()):
|
| 2348 |
if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
|
|
|
|
| 704 |
1. profile_dataset(file_path) - ONCE ONLY
|
| 705 |
2. detect_data_quality_issues(file_path) - ONCE ONLY
|
| 706 |
3. generate_data_quality_plots(file_path, output_dir="./outputs/plots/quality") - Generate quality visualizations
|
| 707 |
+
4. clean_missing_values(file_path=file_path, strategy="auto", output_path="./outputs/data/cleaned.csv")
|
| 708 |
+
5. handle_outliers(file_path=cleaned, method="clip", columns=["all"], output_path="./outputs/data/no_outliers.csv")
|
| 709 |
+
6. force_numeric_conversion(file_path=latest, columns=["all"], output_path="./outputs/data/numeric.csv", errors="coerce")
|
| 710 |
+
7. **IF DATETIME COLUMNS EXIST**: create_time_features(file_path=latest, date_col="<column_name>", output_path="./outputs/data/time_features.csv") - Extract year/month/day/hour/weekday/timestamp from each datetime column
|
| 711 |
+
8. encode_categorical(file_path=latest, method="auto", output_path="./outputs/data/encoded.csv")
|
| 712 |
9. generate_eda_plots(encoded, target_col, output_dir="./outputs/plots/eda") - Generate EDA visualizations
|
| 713 |
10. **ONLY IF USER EXPLICITLY REQUESTED ML**: train_with_autogluon(file_path=encoded, target_col=target_col, task_type="auto", time_limit=120, presets="medium_quality")
|
| 714 |
- AutoGluon is the DEFAULT training tool. It trains 10+ models with auto ensembling.
|
|
|
|
| 2283 |
val = arguments.pop(invalid_param)
|
| 2284 |
print(f" β Stripped invalid parameter '{invalid_param}': {val}")
|
| 2285 |
print(f" βΉοΈ create_statistical_features creates row-wise stats (mean, std, min, max)")
|
| 2286 |
+
|
| 2287 |
+
# General parameter corrections for common LLM hallucinations
|
| 2288 |
+
# IMPORTANT: Do this BEFORE generic invalid-arg stripping.
|
| 2289 |
+
if "output" in arguments and "output_path" not in arguments:
|
| 2290 |
+
arguments["output_path"] = arguments.pop("output")
|
| 2291 |
+
print(f" β Parameter remapped: output β output_path")
|
| 2292 |
+
|
| 2293 |
+
# Common file path aliases used by LLM plans/prompts
|
| 2294 |
+
for alias in ["data_path", "input_file", "input", "path", "latest"]:
|
| 2295 |
+
if alias in arguments and "file_path" not in arguments:
|
| 2296 |
+
arguments["file_path"] = arguments.pop(alias)
|
| 2297 |
+
print(f" β Parameter remapped: {alias} β file_path")
|
| 2298 |
+
break
|
| 2299 |
+
|
| 2300 |
+
# create_time_features is frequently called with alias column names
|
| 2301 |
+
if tool_name == "create_time_features":
|
| 2302 |
+
for alias in ["date_column", "datetime_column", "datetime_col", "time_col", "column", "col"]:
|
| 2303 |
+
if alias in arguments and "date_col" not in arguments:
|
| 2304 |
+
arguments["date_col"] = arguments.pop(alias)
|
| 2305 |
+
print(f" β Parameter remapped: {alias} β date_col")
|
| 2306 |
+
break
|
| 2307 |
+
|
| 2308 |
+
# Auto-fill output path if omitted
|
| 2309 |
+
if "output_path" not in arguments:
|
| 2310 |
+
arguments["output_path"] = str(self.output_base / "data" / "time_features.csv")
|
| 2311 |
+
print(f" β Parameter defaulted: output_path β {arguments['output_path']}")
|
| 2312 |
+
|
| 2313 |
+
# Auto-detect datetime column if date_col is missing
|
| 2314 |
+
if "date_col" not in arguments and arguments.get("file_path"):
|
| 2315 |
+
try:
|
| 2316 |
+
import polars as pl
|
| 2317 |
+
fp = arguments["file_path"]
|
| 2318 |
+
df = pl.read_csv(fp) if str(fp).endswith(".csv") else pl.read_parquet(fp)
|
| 2319 |
+
|
| 2320 |
+
preferred_names = [
|
| 2321 |
+
"pickup_time", "pickup_datetime", "dropoff_time", "dropoff_datetime",
|
| 2322 |
+
"timestamp", "datetime", "date", "time"
|
| 2323 |
+
]
|
| 2324 |
+
matched = next((c for c in preferred_names if c in df.columns), None)
|
| 2325 |
+
|
| 2326 |
+
if not matched:
|
| 2327 |
+
# Prefer true datetime/date dtypes first
|
| 2328 |
+
dt_cols = [
|
| 2329 |
+
c for c in df.columns
|
| 2330 |
+
if df[c].dtype in [pl.Date, pl.Datetime]
|
| 2331 |
+
]
|
| 2332 |
+
if dt_cols:
|
| 2333 |
+
matched = dt_cols[0]
|
| 2334 |
+
|
| 2335 |
+
if not matched:
|
| 2336 |
+
# Fallback heuristic by name
|
| 2337 |
+
name_hint_cols = [
|
| 2338 |
+
c for c in df.columns
|
| 2339 |
+
if any(k in c.lower() for k in ["date", "time", "timestamp"])
|
| 2340 |
+
]
|
| 2341 |
+
if name_hint_cols:
|
| 2342 |
+
matched = name_hint_cols[0]
|
| 2343 |
+
|
| 2344 |
+
if matched:
|
| 2345 |
+
arguments["date_col"] = matched
|
| 2346 |
+
print(f" β Auto-detected date_col: {matched}")
|
| 2347 |
+
except Exception as infer_err:
|
| 2348 |
+
print(f" β οΈ Could not auto-detect date_col: {infer_err}")
|
| 2349 |
|
| 2350 |
# π§ FIX: analyze_autogluon_model path resolution
|
| 2351 |
# The Reasoner hallucinates model paths β resolve to actual saved path
|
|
|
|
| 2401 |
except (ValueError, TypeError):
|
| 2402 |
pass # Can't inspect, skip validation
|
| 2403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2404 |
# Fix "None" string being passed as actual None
|
| 2405 |
for key, value in list(arguments.items()):
|
| 2406 |
if isinstance(value, str) and value.lower() in ["none", "null", "undefined"]:
|
src/tools/auto_pipeline.py
CHANGED
|
@@ -156,7 +156,7 @@ def auto_ml_pipeline(file_path: str,
|
|
| 156 |
try:
|
| 157 |
time_result = create_time_features(
|
| 158 |
current_file,
|
| 159 |
-
|
| 160 |
output_path=current_file # Overwrite
|
| 161 |
)
|
| 162 |
results["transformations_applied"].append({
|
|
|
|
| 156 |
try:
|
| 157 |
time_result = create_time_features(
|
| 158 |
current_file,
|
| 159 |
+
date_col=dt_col,
|
| 160 |
output_path=current_file # Overwrite
|
| 161 |
)
|
| 162 |
results["transformations_applied"].append({
|