muthuk1 committed on
Commit
232e073
·
verified ·
1 Parent(s): 2c41426

Upload training/generate_dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training/generate_dataset.py +361 -0
training/generate_dataset.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
ALWAS Synthetic Dataset Generator
Generates realistic analog IC layout block data for ML model training.
Covers: block metadata, stage transitions, hours, bottleneck labels.
"""
import json
import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Seed both RNGs at import time so the random draws below (and in every
# generator function) are repeatable from run to run.
np.random.seed(42)
random.seed(42)

# === Domain Constants ===
# Process nodes, the effort multiplier applied to each, and the probability
# with which each node is sampled (weights are positional and sum to 1.0).
TECH_NODES = ['5nm', '7nm', '12nm', '14nm', '22nm', '28nm', '45nm', '65nm']
TECH_NODE_COMPLEXITY = {
    '5nm': 1.6, '7nm': 1.4, '12nm': 1.2, '14nm': 1.1,
    '22nm': 0.9, '28nm': 0.8, '45nm': 0.6, '65nm': 0.5,
}
TECH_NODE_WEIGHTS = [0.05, 0.15, 0.2, 0.15, 0.15, 0.15, 0.1, 0.05]

# Analog block families, their effort multipliers, and sampling probabilities
# (weights are positional with BLOCK_TYPES and sum to 1.0).
BLOCK_TYPES = ['ADC', 'DAC', 'PLL', 'LDO', 'BGR', 'OTA', 'Comparator', 'SerDes',
               'VCO', 'Mixer', 'LNA', 'PA', 'TIA', 'SampleHold', 'LVDS_Driver',
               'BandgapRef', 'CurrentMirror', 'DiffAmp', 'Oscillator', 'PowerDetector']
BLOCK_TYPE_COMPLEXITY = {
    'ADC': 1.5, 'DAC': 1.3, 'PLL': 1.7, 'LDO': 0.8, 'BGR': 0.7, 'OTA': 0.6,
    'Comparator': 0.5, 'SerDes': 1.8, 'VCO': 1.2, 'Mixer': 1.1, 'LNA': 1.0,
    'PA': 1.3, 'TIA': 0.9, 'SampleHold': 0.7, 'LVDS_Driver': 1.0,
    'BandgapRef': 0.6, 'CurrentMirror': 0.4, 'DiffAmp': 0.5, 'Oscillator': 1.1,
    'PowerDetector': 0.8,
}
BLOCK_TYPE_WEIGHTS = [0.1, 0.08, 0.08, 0.1, 0.06, 0.08, 0.07, 0.04, 0.06, 0.05,
                      0.05, 0.04, 0.04, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.01]

# Priority labels, sampling probabilities, and the multiplier each priority
# applies to the hour estimate.
PRIORITIES = ['P1-Critical', 'P2-High', 'P3-Medium', 'P4-Low']
PRIORITY_WEIGHTS = [0.1, 0.25, 0.45, 0.2]
PRIORITY_FACTOR = {'P1-Critical': 0.85, 'P2-High': 0.95, 'P3-Medium': 1.0, 'P4-Low': 1.1}

# Workflow stages in order; STAGE_IDX maps a stage name to its position.
STAGES = ['Not Started', 'In Progress', 'DRC', 'LVS', 'ERC', 'Review', 'Completed']
STAGE_IDX = {s: i for i, s in enumerate(STAGES)}

# Fifty engineer ids (eng_001 .. eng_050), each with a skill factor drawn from
# N(1.0, 0.2) and clipped to [0.5, 1.5]. Hours are divided by this factor, so
# a higher value means fewer hours.
ENGINEERS = [f'eng_{i:03d}' for i in range(1, 51)]
ENGINEER_SKILL = {e: np.clip(np.random.normal(1.0, 0.2), 0.5, 1.5) for e in ENGINEERS}

# === Helper Functions ===
# Nominal transistor count per block type, before node scaling and jitter.
_BASE_TRANSISTOR_COUNT = {
    'ADC': 50000, 'DAC': 35000, 'PLL': 80000, 'LDO': 8000, 'BGR': 5000,
    'OTA': 3000, 'Comparator': 2000, 'SerDes': 120000, 'VCO': 15000,
    'Mixer': 10000, 'LNA': 6000, 'PA': 20000, 'TIA': 4000, 'SampleHold': 3500,
    'LVDS_Driver': 8000, 'BandgapRef': 3000, 'CurrentMirror': 1500,
    'DiffAmp': 2500, 'Oscillator': 12000, 'PowerDetector': 5000,
}
# Multiplier applied to the nominal count for each technology node.
_NODE_TRANSISTOR_SCALE = {
    '5nm': 2.0, '7nm': 1.7, '12nm': 1.3, '14nm': 1.2,
    '22nm': 1.0, '28nm': 0.9, '45nm': 0.7, '65nm': 0.5,
}


def estimate_transistor_count(block_type, tech_node):
    """Return a randomized transistor count for a block.

    The nominal count for ``block_type`` (10000 if unknown) is scaled by the
    multiplier for ``tech_node`` (1.0 if unknown) and then by one draw from
    ``np.random.lognormal(0, 0.3)``. The result is truncated to ``int``.

    Consumes exactly one draw from the global NumPy RNG.
    """
    nominal = (_BASE_TRANSISTOR_COUNT.get(block_type, 10000)
               * _NODE_TRANSISTOR_SCALE.get(tech_node, 1.0))
    return int(nominal * np.random.lognormal(0, 0.3))

def compute_true_hours(block_type, tech_node, transistor_count, priority, engineer,
                       has_dependencies, constraint_complexity):
    """Physics-inspired hour estimation with noise.

    Starts from a 20-hour base and multiplies it by:

    * the block-type and tech-node multipliers (1.0 when the key is unknown),
    * a size term ``log1p(transistor_count) / log1p(10000)``,
    * the priority factor (1.0 when unknown),
    * the inverse of the engineer's skill factor (1.0 when unknown),
    * 1.15 when the block has dependencies,
    * ``1 + 0.2 * constraint_complexity``,

    then applies one ``np.random.lognormal(0, 0.15)`` noise draw.

    Returns:
        float: hours rounded to one decimal, never less than 4.0.
    """
    base = 20
    type_mult = BLOCK_TYPE_COMPLEXITY.get(block_type, 1.0)
    node_mult = TECH_NODE_COMPLEXITY.get(tech_node, 1.0)
    size_mult = np.log1p(transistor_count) / np.log1p(10000)
    priority_mult = PRIORITY_FACTOR.get(priority, 1.0)
    skill_mult = 1.0 / ENGINEER_SKILL.get(engineer, 1.0)
    dep_mult = 1.15 if has_dependencies else 1.0
    constraint_mult = 1 + 0.2 * constraint_complexity

    # Multiplication order is kept fixed so results are bit-for-bit stable.
    hours = (base * type_mult * node_mult * size_mult * priority_mult
             * skill_mult * dep_mult * constraint_mult)
    noise = np.random.lognormal(0, 0.15)
    # Always hand back a plain float: the clamp used to return the int 4 while
    # the unclamped path returned a NumPy scalar.
    return max(4.0, float(round(hours * noise, 1)))

def compute_complexity_label(hours, transistor_count, tech_node):
    """Derive complexity label from multiple signals.

    The combined score is a weighted sum of three terms:

    * ``hours / 100`` with weight 0.5,
    * the tech-node multiplier (1.0 when the node is unknown) with weight 0.3,
    * ``log1p(transistor_count) / log1p(100000)`` with weight 0.2.

    Returns:
        str: ``'Low'`` when the score is below 0.35, ``'Medium'`` when it is
        below 0.65, otherwise ``'High'``.
    """
    node_score = TECH_NODE_COMPLEXITY.get(tech_node, 1.0)
    size_score = np.log1p(transistor_count) / np.log1p(100000)
    combined = 0.5 * (hours / 100) + 0.3 * node_score + 0.2 * size_score

    if combined < 0.35:
        return 'Low'
    if combined < 0.65:
        return 'Medium'
    return 'High'

def generate_stage_transitions(block, start_date):
    """Generate realistic stage transition events with timestamps.

    Walks ``STAGES`` in order, starting at ``start_date``, and emits one
    record per stage until the stage at ``block['final_stage_idx']`` has been
    recorded (all stages when that key is absent).

    Args:
        block: mapping that provides ``'actual_hours'`` and ``'tech_node'``,
            and optionally ``'final_stage_idx'``.
        start_date: ``datetime`` at which the 'Not Started' record is stamped.

    Returns:
        list[dict]: records with the keys ``stage``, ``timestamp`` (ISO
        string of the moment the stage was entered), ``hours_in_stage``,
        ``days_in_stage``, ``drc_violations`` and ``lvs_mismatches``.
    """
    # Share of the block's total hours attributed to each stage.
    stage_proportions = {
        'Not Started': 0.0, 'In Progress': 0.35, 'DRC': 0.2,
        'LVS': 0.15, 'ERC': 0.15, 'Review': 0.1, 'Completed': 0.05,
    }
    total_hours = block['actual_hours']
    # These nodes draw DRC violation counts from a heavier distribution.
    advanced_node = block['tech_node'] in ('5nm', '7nm', '12nm')
    last_stage_idx = block.get('final_stage_idx', len(STAGES) - 1)

    transitions = []
    current_date = start_date

    for i, stage in enumerate(STAGES):
        if stage == 'Not Started':
            # Zero-effort entry record. 'days_in_stage' is included so every
            # record shares one schema.
            transitions.append({
                'stage': stage,
                'timestamp': current_date.isoformat(),
                'hours_in_stage': 0,
                'days_in_stage': 0,
                'drc_violations': 0,
                'lvs_mismatches': 0,
            })
            # Random wait before work actually begins.
            current_date += timedelta(hours=np.random.exponential(4))
            continue

        proportion = stage_proportions.get(stage, 0.1)
        stage_hours = max(1, total_hours * proportion * np.random.uniform(0.7, 1.3))

        drc_violations = 0
        lvs_mismatches = 0
        if stage == 'DRC':
            if advanced_node:
                drc_violations = int(np.random.exponential(8) + np.random.poisson(3))
            else:
                drc_violations = int(np.random.exponential(3) + np.random.poisson(1))
        if stage == 'LVS':
            lvs_mismatches = int(np.random.exponential(2))

        # Days to complete this stage (8 hours/day), at least half a day.
        days = max(0.5, stage_hours / 8)
        # 15% of stages are stretched 1.5x-3x to model weekends/blocked time.
        if np.random.random() < 0.15:
            days *= np.random.uniform(1.5, 3.0)

        transitions.append({
            'stage': stage,
            'timestamp': current_date.isoformat(),
            'hours_in_stage': round(stage_hours, 1),
            'days_in_stage': round(days, 1),
            'drc_violations': drc_violations,
            'lvs_mismatches': lvs_mismatches,
        })
        current_date += timedelta(days=days)

        # Stop once the block's current (final) stage has been recorded.
        if i >= last_stage_idx:
            break

    return transitions

def generate_block(block_id, is_completed=True, reference_date=None):
    """Generate a single block with all features.

    Args:
        block_id: integer used to build ``block_id`` / ``block_name``.
        is_completed: when true the block reaches 'Completed'; otherwise it
            stops at a randomly chosen intermediate stage.
        reference_date: ``datetime`` treated as "today" when deciding whether
            an in-progress block is overdue. Defaults to ``datetime.now()``,
            which makes ``is_overdue`` depend on when the script is run; pass
            a fixed value for fully reproducible output.

    Returns:
        dict: one flat record of block features and labels. ``transitions``
        holds the stage history as a JSON string.
    """
    if reference_date is None:
        reference_date = datetime.now()

    # np.random.choice returns NumPy string scalars; store plain str.
    tech_node = str(np.random.choice(TECH_NODES, p=TECH_NODE_WEIGHTS))
    block_type = str(np.random.choice(BLOCK_TYPES, p=BLOCK_TYPE_WEIGHTS))
    priority = str(np.random.choice(PRIORITIES, p=PRIORITY_WEIGHTS))
    engineer = str(np.random.choice(ENGINEERS))
    transistor_count = estimate_transistor_count(block_type, tech_node)
    has_dependencies = np.random.random() < 0.35
    # A block flagged as having dependencies must report at least one; the
    # truncated exponential draw alone can be 0.
    num_dependencies = max(1, int(np.random.exponential(1.5))) if has_dependencies else 0
    constraint_complexity = np.random.uniform(0, 3)  # analog constraint score

    actual_hours = compute_true_hours(
        block_type, tech_node, transistor_count, priority, engineer,
        has_dependencies, constraint_complexity
    )

    # Estimated hours (simulating AI/human estimate — noisy version of actual)
    estimation_noise = np.random.normal(0, 0.25)
    estimated_hours = max(4, round(actual_hours * np.exp(estimation_noise), 1))

    complexity = compute_complexity_label(actual_hours, transistor_count, tech_node)

    # Determine final stage
    if is_completed:
        final_stage = 'Completed'
        final_stage_idx = STAGE_IDX[final_stage]
    else:
        # In-progress blocks stop at one of stages 1..5, earlier ones likelier.
        final_stage_idx = int(np.random.choice(range(1, 6), p=[0.3, 0.25, 0.2, 0.15, 0.1]))
        final_stage = STAGES[final_stage_idx]

    # Start date: uniformly random within the 730 days following 2024-01-01.
    start_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 730))

    # DRC iterations (drawn for every block); 5nm/7nm get a heavier redraw.
    drc_iterations = max(1, int(np.random.exponential(2) + 1))
    if tech_node in ('5nm', '7nm'):
        drc_iterations = max(1, int(np.random.exponential(3) + 2))

    # Hours logged so far: everything for completed blocks, 10-90% otherwise.
    if is_completed:
        hours_logged = actual_hours
    else:
        hours_logged = round(actual_hours * np.random.uniform(0.1, 0.9), 1)

    # Bottleneck risk label
    hours_ratio = hours_logged / max(estimated_hours, 1)
    days_in_current = np.random.exponential(3) if not is_completed else 0

    if hours_ratio > 1.3 or days_in_current > 5:
        bottleneck_risk = 'High'
    elif hours_ratio > 1.0 or days_in_current > 3:
        bottleneck_risk = 'Medium'
    else:
        bottleneck_risk = 'Low'

    block = {
        'block_id': f'BLK-{block_id:05d}',
        'block_name': f'{block_type}_{tech_node}_{block_id}',
        'block_type': block_type,
        'tech_node': tech_node,
        'priority': priority,
        'priority_numeric': PRIORITIES.index(priority) + 1,
        'engineer_id': engineer,
        'engineer_skill_factor': round(ENGINEER_SKILL[engineer], 3),
        'transistor_count': transistor_count,
        'transistor_count_log': round(np.log1p(transistor_count), 4),
        'has_dependencies': int(has_dependencies),
        'num_dependencies': num_dependencies,
        'constraint_complexity': round(constraint_complexity, 2),
        'estimated_hours': estimated_hours,
        'actual_hours': actual_hours,
        'hours_logged': hours_logged,
        'hours_over_estimate_ratio': round(hours_ratio, 3),
        'drc_iterations': drc_iterations,
        'drc_violations_total': 0,  # filled from transitions
        'lvs_mismatches_total': 0,
        'current_stage': final_stage,
        'current_stage_idx': STAGE_IDX[final_stage],
        'days_in_current_stage': round(days_in_current, 1),
        'is_completed': int(is_completed),
        'complexity': complexity,
        'bottleneck_risk': bottleneck_risk,
        'start_date': start_date.strftime('%Y-%m-%d'),
        'final_stage_idx': final_stage_idx,
    }

    # Generate transitions
    transitions = generate_stage_transitions(block, start_date)
    block['transitions'] = json.dumps(transitions)
    block['num_stage_transitions'] = len(transitions)

    # Aggregate DRC/LVS from transitions
    block['drc_violations_total'] = sum(t.get('drc_violations', 0) for t in transitions)
    block['lvs_mismatches_total'] = sum(t.get('lvs_mismatches', 0) for t in transitions)

    # Compute total days from transitions (rounded like the fallback, so the
    # column does not carry float-summation noise).
    if len(transitions) > 1:
        block['total_days'] = round(sum(t.get('days_in_stage', 0) for t in transitions), 1)
    else:
        block['total_days'] = round(actual_hours / 8, 1)

    # Due date and overdue status
    due_days = max(int(block['total_days'] * np.random.uniform(0.8, 1.5)), 3)
    block['due_date'] = (start_date + timedelta(days=due_days)).strftime('%Y-%m-%d')
    if is_completed:
        block['is_overdue'] = int(block['total_days'] > due_days)
    else:
        elapsed = (reference_date - start_date).days
        block['is_overdue'] = int(elapsed > due_days)

    return block

def generate_dataset(n_completed=3000, n_in_progress=1000):
    """Generate full dataset.

    Completed blocks are generated first with ids ``1..n_completed``, then
    in-progress blocks continue the id sequence. A progress line is printed
    before generation starts.

    Returns:
        pandas.DataFrame: one row per generated block (empty when both
        counts are 0).
    """
    print(f"Generating {n_completed} completed + {n_in_progress} in-progress blocks...")

    blocks = [generate_block(i + 1, is_completed=True) for i in range(n_completed)]
    blocks.extend(
        generate_block(n_completed + i + 1, is_completed=False)
        for i in range(n_in_progress)
    )
    return pd.DataFrame(blocks)

# === Generate SFT Dataset for LLM Fine-tuning ===
_SFT_SYSTEM_PROMPT = (
    "You are ALWAS AI, an analog IC layout complexity estimation assistant. "
    "Given block metadata, estimate complexity (Low/Medium/High), required hours, "
    "and provide reasoning. Respond in JSON format."
)


def _complexity_reasons(row):
    """Return the list of explanation sentences for one block row."""
    reasons = []
    if row['complexity'] == 'High':
        if row['tech_node'] in ('5nm', '7nm', '12nm'):
            reasons.append(f"Advanced {row['tech_node']} node requires extensive DRC/LVS iterations")
        if row['transistor_count'] > 50000:
            reasons.append(f"Large transistor count (~{row['transistor_count']:,}) increases layout complexity")
        if row['block_type'] in ('PLL', 'SerDes', 'ADC'):
            reasons.append(f"{row['block_type']} blocks require precision matching and careful routing")
        if row['has_dependencies']:
            reasons.append(f"Inter-block dependencies ({row['num_dependencies']}) add integration overhead")
    elif row['complexity'] == 'Medium':
        reasons.append(f"{row['block_type']} at {row['tech_node']} has moderate layout challenges")
        if row['constraint_complexity'] > 1.5:
            reasons.append("Analog constraints require careful floor planning")
    else:
        reasons.append(f"{row['block_type']} at {row['tech_node']} is a well-characterized block")
        if row['transistor_count'] < 10000:
            reasons.append("Small transistor count allows straightforward layout")

    # A 'High' block may match none of the conditions above.
    if not reasons:
        reasons.append(f"Standard {row['block_type']} layout at {row['tech_node']}")
    return reasons


def generate_sft_dataset(df, n_samples=2000):
    """Generate conversational dataset for complexity estimation SFT.

    Samples up to ``n_samples`` rows of ``df`` (``random_state=42``) and turns
    each one into a system / user / assistant chat example. The assistant
    turn is a JSON object whose ``estimated_hours`` is the row's
    ``actual_hours`` and whose ``confidence`` is one
    ``np.random.uniform(0.7, 0.95)`` draw rounded to two decimals.

    Args:
        df: DataFrame with the columns ``block_type``, ``tech_node``,
            ``priority``, ``transistor_count``, ``has_dependencies``,
            ``num_dependencies``, ``constraint_complexity``,
            ``drc_iterations``, ``complexity`` and ``actual_hours``.
        n_samples: maximum number of examples to produce.

    Returns:
        list[dict]: items of the form ``{"messages": [system, user, assistant]}``.
    """
    sft_data = []
    sampled = df.sample(n=min(n_samples, len(df)), random_state=42)

    for _, row in sampled.iterrows():
        if row['has_dependencies']:
            deps_text = f"Yes ({row['num_dependencies']} blocks)"
        else:
            deps_text = "No"

        user_msg = (
            "Estimate the complexity and required hours for this analog IC layout block:\n"
            f"- Block Type: {row['block_type']}\n"
            f"- Technology Node: {row['tech_node']}\n"
            f"- Priority: {row['priority']}\n"
            f"- Estimated Transistor Count: ~{row['transistor_count']:,}\n"
            f"- Has Dependencies: {deps_text}\n"
            f"- Constraint Complexity Score: {row['constraint_complexity']:.1f}/3.0\n"
            f"- DRC Iterations Expected: {row['drc_iterations']}"
        )

        complexity = row['complexity']
        # Anything that is not Low/Medium is treated as high risk.
        risk_level = {'Low': 'low', 'Medium': 'medium'}.get(complexity, 'high')
        # Anything that is not High/Medium is assigned to a junior engineer.
        skill_level = {'High': 'senior', 'Medium': 'mid'}.get(complexity, 'junior')

        # json.dumps guarantees a parseable payload even if a field contains
        # quotes or backslashes; default separators match the previous
        # hand-written layout.
        assistant_msg = json.dumps({
            "complexity": str(complexity),
            "estimated_hours": float(row["actual_hours"]),
            "confidence": float(round(np.random.uniform(0.7, 0.95), 2)),
            "risk_level": risk_level,
            "reasoning": "; ".join(_complexity_reasons(row)),
            "recommended_drc_iterations": int(row["drc_iterations"]),
            "suggested_engineer_skill_level": skill_level,
        }, ensure_ascii=False)

        sft_data.append({
            "messages": [
                {"role": "system", "content": _SFT_SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": assistant_msg},
            ]
        })

    return sft_data


def main(output_dir='/app'):
    """Generate the tabular and SFT datasets, save them, and print statistics.

    Writes ``alwas_blocks_dataset.csv``, ``alwas_blocks_dataset.parquet`` and
    ``alwas_sft_dataset.json`` into ``output_dir`` (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, 'alwas_blocks_dataset.csv')
    parquet_path = os.path.join(output_dir, 'alwas_blocks_dataset.parquet')
    sft_path = os.path.join(output_dir, 'alwas_sft_dataset.json')

    # Generate main tabular dataset
    df = generate_dataset(n_completed=3000, n_in_progress=1000)

    # Save tabular data
    df.to_csv(csv_path, index=False)
    df.to_parquet(parquet_path, index=False)

    # Generate SFT dataset
    sft_data = generate_sft_dataset(df, n_samples=2000)
    with open(sft_path, 'w', encoding='utf-8') as f:
        json.dump(sft_data, f, indent=2)

    # Print dataset stats
    print("\n=== Dataset Statistics ===")
    print(f"Total blocks: {len(df)}")
    print(f"Completed: {df['is_completed'].sum()}")
    print(f"In-progress: {(df['is_completed'] == 0).sum()}")
    print("\nComplexity distribution:")
    print(df['complexity'].value_counts())
    print("\nBottleneck risk distribution:")
    print(df['bottleneck_risk'].value_counts())
    print("\nBlock type distribution:")
    print(df['block_type'].value_counts().head(10))
    print("\nTech node distribution:")
    print(df['tech_node'].value_counts())
    print("\nHours statistics:")
    print(df['actual_hours'].describe())
    print(f"\nSFT samples: {len(sft_data)}")
    print("\nFiles saved:")
    print(f" {csv_path}")
    print(f" {parquet_path}")
    print(f" {sft_path}")


if __name__ == '__main__':
    main()