File size: 9,527 Bytes
7b1fee3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
ALWAS Completion Time Predictor
Predicts remaining hours to completion given current block state and stage history.
Uses a gradient boosting approach on engineered sequential features.
"""
import json

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    GroupKFold,
    GroupShuffleSplit,
    cross_val_score,
    train_test_split,
)

# Read the block dataset and keep an independent copy of the rows flagged as
# completed (is_completed == 1); only those rows are used below.
df = pd.read_csv('/app/alwas_blocks_dataset.csv')
completed = df.loc[df['is_completed'] == 1].copy()

banner_rule = "=" * 60
print(banner_rule)
print("MODEL 4: Completion Time Predictor")
print(banner_rule)

# Parse transitions and compute sequential features
def extract_sequence_features(row):
    """Summarise a block's stage-transition history as a flat feature dict.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            ``'transitions'`` entry is a JSON string encoding a list of
            per-stage dicts. The keys read from each dict are ``'stage'``,
            ``'hours_in_stage'``, ``'days_in_stage'``, ``'drc_violations'``
            and ``'lvs_mismatches'``; any missing key counts as 0 (or ``''``
            for the stage name).

    Returns:
        A dict of features (per-stage hours/days, cumulative DRC/LVS counts,
        average hours per stage, and pace trend/ratio), or ``None`` when the
        transitions entry is missing, is not valid JSON, or does not decode
        to a list.
    """
    # Only swallow the failures that mean "this row has no usable history":
    # missing key, non-string value such as NaN, or malformed JSON
    # (json.JSONDecodeError is a ValueError).  Anything else, including
    # KeyboardInterrupt, propagates.
    try:
        transitions = json.loads(row['transitions'])
    except (KeyError, TypeError, ValueError):
        return None
    if not isinstance(transitions, list):
        return None

    # Per-stage timings.  If a stage name occurs more than once, the last
    # occurrence wins because the dict entry is overwritten.
    stage_hours = {}
    stage_days = {}
    drc_violations_cumulative = 0
    lvs_mismatches_cumulative = 0
    for t in transitions:
        stage = t.get('stage', '')
        stage_hours[stage] = t.get('hours_in_stage', 0)
        stage_days[stage] = t.get('days_in_stage', 0)
        drc_violations_cumulative += t.get('drc_violations', 0)
        lvs_mismatches_cumulative += t.get('lvs_mismatches', 0)

    # (feature-name suffix, stage name as it appears in the transitions)
    tracked_stages = (
        ('in_progress', 'In Progress'),
        ('drc', 'DRC'),
        ('lvs', 'LVS'),
        ('erc', 'ERC'),
        ('review', 'Review'),
    )

    features = {}
    # Hours first, then days, so the resulting column order is stable.
    for suffix, stage in tracked_stages:
        features[f'hours_{suffix}'] = stage_hours.get(stage, 0)
    for suffix, stage in tracked_stages:
        features[f'days_{suffix}'] = stage_days.get(stage, 0)

    features['drc_violations_cumulative'] = drc_violations_cumulative
    features['lvs_mismatches_cumulative'] = lvs_mismatches_cumulative

    # Velocity: mean hours over the transitions that logged any time at all.
    total_hours = sum(stage_hours.values())
    total_stages_completed = sum(
        1 for t in transitions if t.get('hours_in_stage', 0) > 0
    )
    features['avg_hours_per_stage'] = total_hours / max(total_stages_completed, 1)

    # Pace: compare the last non-zero stage duration with the first one.
    # Neutral defaults apply when there is too little history to compare.
    pace_trend, pace_ratio = 0, 1.0
    if len(transitions) >= 3:
        stage_durations = [
            days
            for days in (t.get('days_in_stage', 0) for t in transitions)
            if days > 0
        ]
        if len(stage_durations) >= 2:
            pace_trend = stage_durations[-1] - stage_durations[0]
            # Floor the denominator so a tiny first duration cannot blow up.
            pace_ratio = stage_durations[-1] / max(stage_durations[0], 0.1)
    features['pace_trend'] = pace_trend
    features['pace_ratio'] = pace_ratio

    return features

print("Extracting sequence features from transitions...")
# Pair each completed row's index label with its extracted features; rows for
# which extraction produced nothing are dropped.
extracted = (
    (idx, extract_sequence_features(row)) for idx, row in completed.iterrows()
)
seq_features_list = [dict(feat, idx=idx) for idx, feat in extracted if feat]

# Re-key the features by the original row label and attach them as new
# columns on the completed-blocks frame.
seq_df = pd.DataFrame(seq_features_list)
seq_df = seq_df.set_index('idx')
completed = completed.join(seq_df)

# Simulate "partial observation": for every stage a completed block went
# through (except the first transition), emit one sample holding what had been
# observed up to and including that stage, with the hours still remaining as
# the target.  Each block therefore yields several training samples.
print("Creating partial-observation training samples...")

training_samples = []
for _, row in completed.iterrows():
    # Skip rows whose history is unusable (missing, NaN, malformed JSON, or
    # not a list).  Any other error is unexpected and must surface, so the
    # handler is deliberately narrow.
    try:
        transitions = json.loads(row['transitions'])
    except (KeyError, TypeError, ValueError):
        continue
    if not isinstance(transitions, list):
        continue

    total_actual_hours = row['actual_hours']

    # Block-level attributes are identical for every sample of this block, so
    # build them once instead of once per stage.
    static_features = {
        'tech_node_encoded': row.get('tech_node_encoded', 0),
        'block_type_encoded': row.get('block_type_encoded', 0),
        'priority_numeric': row['priority_numeric'],
        'transistor_count_log': row['transistor_count_log'],
        'has_dependencies': row['has_dependencies'],
        'num_dependencies': row['num_dependencies'],
        'constraint_complexity': row['constraint_complexity'],
        'estimated_hours': row['estimated_hours'],
        'engineer_skill_factor': row['engineer_skill_factor'],
        'drc_iterations': row['drc_iterations'],
    }

    cumulative_hours = 0
    cumulative_days = 0
    cumulative_drc = 0
    cumulative_lvs = 0

    for i, t in enumerate(transitions):
        if i == 0:  # Skip "Not Started" — no useful features
            continue

        cumulative_hours += t.get('hours_in_stage', 0)
        cumulative_days += t.get('days_in_stage', 0)
        cumulative_drc += t.get('drc_violations', 0)
        cumulative_lvs += t.get('lvs_mismatches', 0)

        training_samples.append({
            # Static block features
            **static_features,
            # Dynamic features (observed so far)
            'current_stage_idx': i,
            'cumulative_hours': cumulative_hours,
            'cumulative_days': cumulative_days,
            'cumulative_drc_violations': cumulative_drc,
            'cumulative_lvs_mismatches': cumulative_lvs,
            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
            'stages_completed': i,
            'stages_remaining': len(transitions) - i - 1,
            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
            # Target: clamped at 0 in case the logged stage hours exceed the
            # block's recorded total.
            'remaining_hours': max(0, total_actual_hours - cumulative_hours),
        })

train_df = pd.DataFrame(training_samples)
print(f"Created {len(train_df)} partial-observation training samples from {len(completed)} blocks")

# Model inputs, in the column order the model is trained on.  The first group
# is constant per block; the second group changes from stage to stage.
_STATIC_BLOCK_FEATURES = [
    'tech_node_encoded',
    'block_type_encoded',
    'priority_numeric',
    'transistor_count_log',
    'has_dependencies',
    'num_dependencies',
    'constraint_complexity',
    'estimated_hours',
    'engineer_skill_factor',
    'drc_iterations',
]
_OBSERVED_SO_FAR_FEATURES = [
    'current_stage_idx',
    'cumulative_hours',
    'cumulative_days',
    'cumulative_drc_violations',
    'cumulative_lvs_mismatches',
    'hours_vs_estimate_ratio',
    'stages_completed',
    'avg_hours_per_stage_so_far',
    'avg_days_per_stage_so_far',
]
COMPLETION_FEATURES = _STATIC_BLOCK_FEATURES + _OBSERVED_SO_FAR_FEATURES

# Design matrix and regression target (hours left after the current stage).
X, y = train_df[COMPLETION_FEATURES], train_df['remaining_hours']

# Split by block, not by sample.  Every block contributes several consecutive
# samples that share its static features and total hours, so a random
# per-sample split would put the same block on both sides and inflate the test
# metrics.  The sample builder emits each block's samples in order starting at
# current_stage_idx == 1, so a running count of those rows is a block id.
block_ids = train_df['current_stage_idx'].eq(1).cumsum().to_numpy()

outer_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(outer_split.split(X, y, groups=block_ids))
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Early stopping needs its own validation blocks.  Stopping on the test set
# would tune the number of trees on the data the metrics are reported on.
inner_split = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
fit_pos, val_pos = next(
    inner_split.split(X_train, y_train, groups=block_ids[train_pos])
)
X_fit, X_val = X_train.iloc[fit_pos], X_train.iloc[val_pos]
y_fit, y_val = y_train.iloc[fit_pos], y_train.iloc[val_pos]

# Train XGBoost for completion time
completion_model = xgb.XGBRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    reg_alpha=0.1,
    reg_lambda=1.0,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=50,
)
completion_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Score the fitted model on the held-out samples.
y_pred = completion_model.predict(X_test)

# Relative error per sample; the denominator is floored at 1 hour so targets
# at or near zero do not divide by zero.
abs_pct_error = np.abs((y_test - y_pred) / np.maximum(y_test, 1))

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(abs_pct_error) * 100

print("\nCompletion Time Prediction Results:")
for report_line in (
    f"  MAE:  {mae:.2f} hours",
    f"  RMSE: {rmse:.2f} hours",
    f"  R²:   {r2:.4f}",
    f"  MAPE: {mape:.1f}%",
):
    print(report_line)

# Rank the input features by the importance scores the fitted model exposes,
# highest first, and print the full table.
importance = pd.DataFrame(
    {
        'feature': COMPLETION_FEATURES,
        'importance': completion_model.feature_importances_,
    }
)
importance = importance.sort_values(by='importance', ascending=False)
print("\nTop features for completion time prediction:")
print(importance.to_string(index=False))

# Cross-validation
# Use the same hyperparameters as the saved completion model so the CV score
# describes that model.  Early stopping is left out because cross_val_score
# supplies no eval_set.
cv_model = xgb.XGBRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    reg_alpha=0.1,
    reg_lambda=1.0,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
)
# Keep all samples of one block in the same fold.  Each block's samples are
# emitted consecutively starting at current_stage_idx == 1, so a running count
# of those rows identifies the block.
cv_block_ids = train_df['current_stage_idx'].eq(1).cumsum()
cv_scores = cross_val_score(
    cv_model, X, y,
    groups=cv_block_ids,
    cv=GroupKFold(n_splits=5),
    scoring='r2',
)
print(f"\n5-Fold CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Break the held-out error down by the stage index at which the prediction
# was made (indices 1-6; index 0 never appears in the samples).
print("\n--- Per-Stage MAE ---")
stage_names = ['Not Started', 'In Progress', 'DRC', 'LVS', 'ERC', 'Review', 'Completed']
for stage_idx, stage_label in enumerate(stage_names[1:], start=1):
    mask = X_test['current_stage_idx'] == stage_idx
    n_stage_samples = mask.sum()
    if n_stage_samples <= 0:
        # No held-out samples at this stage; nothing to report.
        continue
    stage_mae = mean_absolute_error(y_test[mask], y_pred[mask])
    print(f"  Stage {stage_idx} ({stage_label}): MAE = {stage_mae:.2f}h ({n_stage_samples} samples)")

def _update_json_file(path, key, value):
    """Set ``key`` to ``value`` in the JSON object stored at ``path``.

    A missing file is treated as an empty object, so a fresh models directory
    does not abort the script after training has already finished.  The merged
    object is written back with 2-space indentation and returned.
    """
    try:
        with open(path, 'r') as f:
            payload = json.load(f)
    except FileNotFoundError:
        payload = {}
    payload[key] = value
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)
    return payload


# Save
joblib.dump(completion_model, '/app/models/completion_predictor.joblib')

# Update feature config: add this model's feature list next to whatever the
# file already holds.
config = _update_json_file(
    '/app/models/feature_config.json',
    'completion_features',
    COMPLETION_FEATURES,
)

# Update metrics: record this model's scores under its own key.
metrics = _update_json_file(
    '/app/models/metrics.json',
    'completion_prediction',
    {
        'mae': round(mae, 2),
        'rmse': round(rmse, 2),
        'r2': round(r2, 4),
        'mape_percent': round(mape, 1),
        'cv_r2_mean': round(cv_scores.mean(), 4),
        'cv_r2_std': round(cv_scores.std(), 4),
        'training_samples': len(train_df),
    },
)

print(f"\nModel saved to /app/models/completion_predictor.joblib")
print("=" * 60)
print("COMPLETION TIME MODEL TRAINED SUCCESSFULLY")
print("=" * 60)