muthuk1 committed on
Commit
7b1fee3
·
verified ·
1 Parent(s): 53c7569

Upload training/train_completion.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training/train_completion.py +259 -0
training/train_completion.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ALWAS Completion Time Predictor
3
+ Predicts remaining hours to completion given current block state and stage history.
4
+ Uses a gradient boosting approach on engineered sequential features.
5
+ """
6
+ import numpy as np
7
+ import pandas as pd
8
+ import json
9
+ import joblib
10
+ from sklearn.model_selection import train_test_split, cross_val_score
11
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
12
+ from sklearn.ensemble import GradientBoostingRegressor
13
+ import xgboost as xgb
14
+
15
# Read the block dataset and keep only the rows whose is_completed flag is 1.
df = pd.read_csv('/app/alwas_blocks_dataset.csv')
is_done = df['is_completed'] == 1
completed = df[is_done].copy()  # copy so later column additions do not touch df

banner_rule = "=" * 60
print(banner_rule)
print("MODEL 4: Completion Time Predictor")
print(banner_rule)
22
+
23
# Parse transitions and compute sequential features
def extract_sequence_features(row):
    """Extract features from a block's stage transition history.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``'transitions'`` entry is a JSON-encoded list of per-stage
            dicts. The keys read from each dict are ``stage``,
            ``hours_in_stage``, ``days_in_stage``, ``drc_violations`` and
            ``lvs_mismatches``; a missing key counts as 0 (``''`` for the
            stage name).

    Returns:
        A dict of per-stage hours/days, cumulative DRC/LVS counts, the
        average hours per stage and two pace indicators, or ``None`` when
        the transition history is missing, is not valid JSON, or does not
        decode to a list.
    """
    try:
        transitions = json.loads(row['transitions'])
    except (KeyError, TypeError, ValueError):
        # KeyError: no 'transitions' entry; TypeError: non-string value such
        # as NaN; ValueError: malformed JSON (JSONDecodeError subclasses it).
        return None
    if not isinstance(transitions, list):
        return None

    # Accumulate per stage instead of overwriting, so a stage that is visited
    # more than once contributes all of its time. This keeps total_hours
    # consistent with the stage count used for the average below.
    stage_hours = {}
    stage_days = {}
    drc_violations_cumulative = 0
    lvs_mismatches_cumulative = 0

    for t in transitions:
        stage = t.get('stage', '')
        stage_hours[stage] = stage_hours.get(stage, 0) + t.get('hours_in_stage', 0)
        stage_days[stage] = stage_days.get(stage, 0) + t.get('days_in_stage', 0)
        drc_violations_cumulative += t.get('drc_violations', 0)
        lvs_mismatches_cumulative += t.get('lvs_mismatches', 0)

    features = {}

    # Time spent in each stage (0 when the stage never appears)
    features['hours_in_progress'] = stage_hours.get('In Progress', 0)
    features['hours_drc'] = stage_hours.get('DRC', 0)
    features['hours_lvs'] = stage_hours.get('LVS', 0)
    features['hours_erc'] = stage_hours.get('ERC', 0)
    features['hours_review'] = stage_hours.get('Review', 0)

    features['days_in_progress'] = stage_days.get('In Progress', 0)
    features['days_drc'] = stage_days.get('DRC', 0)
    features['days_lvs'] = stage_days.get('LVS', 0)
    features['days_erc'] = stage_days.get('ERC', 0)
    features['days_review'] = stage_days.get('Review', 0)

    # Cumulative quality metrics over the whole history
    features['drc_violations_cumulative'] = drc_violations_cumulative
    features['lvs_mismatches_cumulative'] = lvs_mismatches_cumulative

    # Velocity: average over the transitions that actually consumed time
    total_hours = sum(stage_hours.values())
    total_stages_completed = sum(
        1 for t in transitions if t.get('hours_in_stage', 0) > 0
    )
    features['avg_hours_per_stage'] = total_hours / max(total_stages_completed, 1)

    # Acceleration: compare the last non-zero stage duration with the first.
    # Neutral defaults are used when the history is too short to compare.
    pace_trend = 0
    pace_ratio = 1.0
    if len(transitions) >= 3:
        stage_durations = [
            t.get('days_in_stage', 0)
            for t in transitions
            if t.get('days_in_stage', 0) > 0
        ]
        if len(stage_durations) >= 2:
            pace_trend = stage_durations[-1] - stage_durations[0]
            # The 0.1 floor guards against dividing by a near-zero duration.
            pace_ratio = stage_durations[-1] / max(stage_durations[0], 0.1)
    features['pace_trend'] = pace_trend
    features['pace_ratio'] = pace_ratio

    return features
84
+
85
print("Extracting sequence features from transitions...")
# One feature dict per completed block; blocks for which the extractor
# returns nothing (None or an empty dict) are left out.
seq_features_list = []
for block_idx, block_row in completed.iterrows():
    block_features = extract_sequence_features(block_row)
    if not block_features:
        continue
    # Carry the block's index label along so the features can be joined back.
    seq_features_list.append({**block_features, 'idx': block_idx})

seq_df = pd.DataFrame(seq_features_list)
seq_df = seq_df.set_index('idx')
completed = completed.join(seq_df)
95
+
96
# Now simulate "partial observation" — at each stage, predict remaining hours
# This creates multiple training examples per block
print("Creating partial-observation training samples...")

training_samples = []
for block_idx, row in completed.iterrows():
    try:
        transitions = json.loads(row['transitions'])
    except (KeyError, TypeError, ValueError):
        # Missing, non-string or malformed history: the block yields no samples.
        continue

    total_actual_hours = row['actual_hours']
    cumulative_hours = 0
    cumulative_days = 0
    cumulative_drc = 0
    cumulative_lvs = 0

    for i, t in enumerate(transitions):
        if i == 0:  # Skip "Not Started" — no useful features
            continue

        stage_hours = t.get('hours_in_stage', 0)
        stage_days = t.get('days_in_stage', 0)
        cumulative_hours += stage_hours
        cumulative_days += stage_days
        cumulative_drc += t.get('drc_violations', 0)
        cumulative_lvs += t.get('lvs_mismatches', 0)

        # Clamp at zero in case the logged stage hours exceed the recorded total.
        remaining_hours = max(0, total_actual_hours - cumulative_hours)

        sample = {
            # Grouping key for the train/test split only — NOT a model feature
            'block_idx': block_idx,
            # Static block features
            'tech_node_encoded': row.get('tech_node_encoded', 0),
            'block_type_encoded': row.get('block_type_encoded', 0),
            'priority_numeric': row['priority_numeric'],
            'transistor_count_log': row['transistor_count_log'],
            'has_dependencies': row['has_dependencies'],
            'num_dependencies': row['num_dependencies'],
            'constraint_complexity': row['constraint_complexity'],
            'estimated_hours': row['estimated_hours'],
            'engineer_skill_factor': row['engineer_skill_factor'],
            'drc_iterations': row['drc_iterations'],
            # Dynamic features (observed so far)
            'current_stage_idx': i,
            'cumulative_hours': cumulative_hours,
            'cumulative_days': cumulative_days,
            'cumulative_drc_violations': cumulative_drc,
            'cumulative_lvs_mismatches': cumulative_lvs,
            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
            'stages_completed': i,
            'stages_remaining': len(transitions) - i - 1,
            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
            # Target
            'remaining_hours': remaining_hours,
        }
        training_samples.append(sample)

train_df = pd.DataFrame(training_samples)
print(f"Created {len(train_df)} partial-observation training samples from {len(completed)} blocks")

# Features for completion time model
COMPLETION_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
    'hours_vs_estimate_ratio', 'stages_completed',
    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
]

X = train_df[COMPLETION_FEATURES]
y = train_df['remaining_hours']

# Every block contributes several rows that share its static features and
# total duration. A row-level random split would put rows from one block on
# both sides and leak the answer into the test set, so split by block instead.
from sklearn.model_selection import GroupShuffleSplit

group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_rows, test_rows = next(
    group_splitter.split(X, y, groups=train_df['block_idx'])
)
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]
172
+
173
# Train XGBoost for completion time
completion_model = xgb.XGBRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    reg_alpha=0.1,
    reg_lambda=1.0,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=50,
)

# Early stopping needs its own validation data: monitoring the test set would
# pick the number of trees on the very rows the metrics below are reported on,
# which makes those metrics optimistic.
# NOTE(review): rows from the same block can land on both sides of this
# validation split; a group-aware split would give a stricter stopping signal.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)
completion_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Held-out evaluation on rows the model never saw during fitting or stopping
y_pred = completion_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Denominator is floored at 1 hour so near-zero targets do not blow up the MAPE
mape = np.mean(np.abs((y_test - y_pred) / np.maximum(y_test, 1))) * 100

print(f"\nCompletion Time Prediction Results:")
print(f"  MAE:  {mae:.2f} hours")
print(f"  RMSE: {rmse:.2f} hours")
print(f"  R²:   {r2:.4f}")
print(f"  MAPE: {mape:.1f}%")

# Feature importance
importance = pd.DataFrame({
    'feature': COMPLETION_FEATURES,
    'importance': completion_model.feature_importances_
}).sort_values('importance', ascending=False)
print(f"\nTop features for completion time prediction:")
print(importance.to_string(index=False))
213
+
214
# Cross-validation: 5-fold R² on all samples with a fresh, non-early-stopped model
cv_params = {
    'n_estimators': 800,
    'learning_rate': 0.03,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'random_state': 42,
}
cv_model = xgb.XGBRegressor(**cv_params)
cv_scores = cross_val_score(cv_model, X, y, cv=5, scoring='r2')
print(f"\n5-Fold CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Evaluate by stage: MAE on the held-out rows, broken down by current_stage_idx
print(f"\n--- Per-Stage MAE ---")
stage_names = ['Not Started', 'In Progress', 'DRC', 'LVS', 'ERC', 'Review', 'Completed']
# Index 0 ("Not Started") is skipped; the loop covers stage indices 1 through 6.
for stage_idx, stage_name in enumerate(stage_names[1:], start=1):
    mask = X_test['current_stage_idx'] == stage_idx
    if not mask.any():
        continue
    stage_mae = mean_absolute_error(y_test[mask], y_pred[mask])
    print(f"  Stage {stage_idx} ({stage_name}): MAE = {stage_mae:.2f}h ({mask.sum()} samples)")
230
+
231
def _merge_into_json(path, key, value):
    """Load the JSON object stored at *path*, set ``key`` to ``value``, write it back."""
    with open(path, 'r') as f:
        payload = json.load(f)
    payload[key] = value
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)


# Save the fitted model
joblib.dump(completion_model, '/app/models/completion_predictor.joblib')

# Update feature config: record which columns this model expects.
# The file must already exist; it is read before being rewritten.
_merge_into_json(
    '/app/models/feature_config.json',
    'completion_features',
    COMPLETION_FEATURES,
)

# Update metrics: add this model's scores to the existing metrics file
completion_metrics = {
    'mae': round(mae, 2),
    'rmse': round(rmse, 2),
    'r2': round(r2, 4),
    'mape_percent': round(mape, 1),
    'cv_r2_mean': round(cv_scores.mean(), 4),
    'cv_r2_std': round(cv_scores.std(), 4),
    'training_samples': len(train_df),
}
_merge_into_json(
    '/app/models/metrics.json',
    'completion_prediction',
    completion_metrics,
)

print(f"\nModel saved to /app/models/completion_predictor.joblib")
closing_rule = "=" * 60
print(closing_rule)
print("COMPLETION TIME MODEL TRAINED SUCCESSFULLY")
print(closing_rule)