muthuk1
/

alwas-ml-models

Joblib

Model card Files Files and versions

xet

Community

muthuk1 committed 14 days ago

Commit

232e073

verified ·

1 Parent(s): 2c41426

Upload training/generate_dataset.py with huggingface_hub

Browse files

Files changed (1) hide show

training/generate_dataset.py +361 -0

training/generate_dataset.py ADDED Viewed

	@@ -0,0 +1,361 @@

+"""
+ALWAS Synthetic Dataset Generator
+Generates realistic analog IC layout block data for ML model training.
+Covers: block metadata, stage transitions, hours, bottleneck labels.
+"""
+import numpy as np
+import pandas as pd
+import json
+from datetime import datetime, timedelta
+import random
+np.random.seed(42)
+random.seed(42)
+# === Domain Constants ===
+TECH_NODES = ['5nm', '7nm', '12nm', '14nm', '22nm', '28nm', '45nm', '65nm']
+TECH_NODE_COMPLEXITY = {'5nm': 1.6, '7nm': 1.4, '12nm': 1.2, '14nm': 1.1, '22nm': 0.9, '28nm': 0.8, '45nm': 0.6, '65nm': 0.5}
+TECH_NODE_WEIGHTS = [0.05, 0.15, 0.2, 0.15, 0.15, 0.15, 0.1, 0.05]
+BLOCK_TYPES = ['ADC', 'DAC', 'PLL', 'LDO', 'BGR', 'OTA', 'Comparator', 'SerDes',
+               'VCO', 'Mixer', 'LNA', 'PA', 'TIA', 'SampleHold', 'LVDS_Driver',
+               'BandgapRef', 'CurrentMirror', 'DiffAmp', 'Oscillator', 'PowerDetector']
+BLOCK_TYPE_COMPLEXITY = {
+    'ADC': 1.5, 'DAC': 1.3, 'PLL': 1.7, 'LDO': 0.8, 'BGR': 0.7, 'OTA': 0.6,
+    'Comparator': 0.5, 'SerDes': 1.8, 'VCO': 1.2, 'Mixer': 1.1, 'LNA': 1.0,
+    'PA': 1.3, 'TIA': 0.9, 'SampleHold': 0.7, 'LVDS_Driver': 1.0,
+    'BandgapRef': 0.6, 'CurrentMirror': 0.4, 'DiffAmp': 0.5, 'Oscillator': 1.1,
+    'PowerDetector': 0.8
+}
+BLOCK_TYPE_WEIGHTS = [0.1, 0.08, 0.08, 0.1, 0.06, 0.08, 0.07, 0.04, 0.06, 0.05,
+                      0.05, 0.04, 0.04, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.01]
+PRIORITIES = ['P1-Critical', 'P2-High', 'P3-Medium', 'P4-Low']
+PRIORITY_WEIGHTS = [0.1, 0.25, 0.45, 0.2]
+PRIORITY_FACTOR = {'P1-Critical': 0.85, 'P2-High': 0.95, 'P3-Medium': 1.0, 'P4-Low': 1.1}
+STAGES = ['Not Started', 'In Progress', 'DRC', 'LVS', 'ERC', 'Review', 'Completed']
+STAGE_IDX = {s: i for i, s in enumerate(STAGES)}
+ENGINEERS = [f'eng_{i:03d}' for i in range(1, 51)]
+ENGINEER_SKILL = {e: np.clip(np.random.normal(1.0, 0.2), 0.5, 1.5) for e in ENGINEERS}
+# === Helper Functions ===
+def estimate_transistor_count(block_type, tech_node):
+    base = {
+        'ADC': 50000, 'DAC': 35000, 'PLL': 80000, 'LDO': 8000, 'BGR': 5000,
+        'OTA': 3000, 'Comparator': 2000, 'SerDes': 120000, 'VCO': 15000,
+        'Mixer': 10000, 'LNA': 6000, 'PA': 20000, 'TIA': 4000, 'SampleHold': 3500,
+        'LVDS_Driver': 8000, 'BandgapRef': 3000, 'CurrentMirror': 1500,
+        'DiffAmp': 2500, 'Oscillator': 12000, 'PowerDetector': 5000
+    }
+    node_scale = {'5nm': 2.0, '7nm': 1.7, '12nm': 1.3, '14nm': 1.2, '22nm': 1.0, '28nm': 0.9, '45nm': 0.7, '65nm': 0.5}
+    count = base.get(block_type, 10000) * node_scale.get(tech_node, 1.0)
+    return int(count * np.random.lognormal(0, 0.3))
+def compute_true_hours(block_type, tech_node, transistor_count, priority, engineer,
+                        has_dependencies, constraint_complexity):
+    """Physics-inspired hour estimation with noise."""
+    base = 20
+    type_mult = BLOCK_TYPE_COMPLEXITY.get(block_type, 1.0)
+    node_mult = TECH_NODE_COMPLEXITY.get(tech_node, 1.0)
+    size_mult = np.log1p(transistor_count) / np.log1p(10000)
+    priority_mult = PRIORITY_FACTOR.get(priority, 1.0)
+    skill_mult = 1.0 / ENGINEER_SKILL.get(engineer, 1.0)
+    dep_mult = 1.15 if has_dependencies else 1.0
+    constraint_mult = 1 + 0.2 * constraint_complexity
+    hours = base * type_mult * node_mult * size_mult * priority_mult * skill_mult * dep_mult * constraint_mult
+    noise = np.random.lognormal(0, 0.15)
+    return max(4, round(hours * noise, 1))
+def compute_complexity_label(hours, transistor_count, tech_node):
+    """Derive complexity label from multiple signals."""
+    node_score = TECH_NODE_COMPLEXITY.get(tech_node, 1.0)
+    size_score = np.log1p(transistor_count) / np.log1p(100000)
+    combined = 0.5 * (hours / 100) + 0.3 * node_score + 0.2 * size_score
+    if combined < 0.35:
+        return 'Low'
+    elif combined < 0.65:
+        return 'Medium'
+    else:
+        return 'High'
+def generate_stage_transitions(block, start_date):
+    """Generate realistic stage transition events with timestamps."""
+    transitions = []
+    current_date = start_date
+    total_hours = block['actual_hours']
+    stage_proportions = {
+        'Not Started': 0.0, 'In Progress': 0.35, 'DRC': 0.2,
+        'LVS': 0.15, 'ERC': 0.15, 'Review': 0.1, 'Completed': 0.05
+    }
+    for i, stage in enumerate(STAGES):
+        if stage == 'Not Started':
+            transitions.append({
+                'stage': stage, 'timestamp': current_date.isoformat(),
+                'hours_in_stage': 0, 'drc_violations': 0, 'lvs_mismatches': 0
+            })
+            current_date += timedelta(hours=np.random.exponential(4))
+            continue
+        proportion = stage_proportions.get(stage, 0.1)
+        stage_hours = total_hours * proportion * np.random.uniform(0.7, 1.3)
+        stage_hours = max(1, stage_hours)
+        drc_violations = 0
+        lvs_mismatches = 0
+        if stage == 'DRC':
+            if block['tech_node'] in ['5nm', '7nm', '12nm']:
+                drc_violations = int(np.random.exponential(8) + np.random.poisson(3))
+            else:
+                drc_violations = int(np.random.exponential(3) + np.random.poisson(1))
+        if stage == 'LVS':
+            lvs_mismatches = int(np.random.exponential(2))
+        # Days to complete this stage (8 hours/day)
+        days = max(0.5, stage_hours / 8)
+        # Add some variance for weekends, blocked time
+        if np.random.random() < 0.15:
+            days *= np.random.uniform(1.5, 3.0)  # delays
+        transitions.append({
+            'stage': stage,
+            'timestamp': current_date.isoformat(),
+            'hours_in_stage': round(stage_hours, 1),
+            'days_in_stage': round(days, 1),
+            'drc_violations': drc_violations,
+            'lvs_mismatches': lvs_mismatches
+        })
+        current_date += timedelta(days=days)
+        if i >= block.get('final_stage_idx', len(STAGES) - 1):
+            break
+    return transitions
+def generate_block(block_id, is_completed=True):
+    """Generate a single block with all features."""
+    tech_node = np.random.choice(TECH_NODES, p=TECH_NODE_WEIGHTS)
+    block_type = np.random.choice(BLOCK_TYPES, p=BLOCK_TYPE_WEIGHTS)
+    priority = np.random.choice(PRIORITIES, p=PRIORITY_WEIGHTS)
+    engineer = np.random.choice(ENGINEERS)
+    transistor_count = estimate_transistor_count(block_type, tech_node)
+    has_dependencies = np.random.random() < 0.35
+    num_dependencies = int(np.random.exponential(1.5)) if has_dependencies else 0
+    constraint_complexity = np.random.uniform(0, 3)  # analog constraint score
+    actual_hours = compute_true_hours(
+        block_type, tech_node, transistor_count, priority, engineer,
+        has_dependencies, constraint_complexity
+    )
+    # Estimated hours (simulating AI/human estimate — noisy version of actual)
+    estimation_noise = np.random.normal(0, 0.25)
+    estimated_hours = max(4, round(actual_hours * np.exp(estimation_noise), 1))
+    complexity = compute_complexity_label(actual_hours, transistor_count, tech_node)
+    # Determine final stage
+    if is_completed:
+        final_stage = 'Completed'
+        final_stage_idx = 6
+    else:
+        # In-progress blocks stop at various stages
+        final_stage_idx = np.random.choice(range(1, 6), p=[0.3, 0.25, 0.2, 0.15, 0.1])
+        final_stage = STAGES[final_stage_idx]
+    # Start date: random in last 2 years
+    start_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 730))
+    # DRC iterations (for completed blocks)
+    drc_iterations = max(1, int(np.random.exponential(2) + 1))
+    if tech_node in ['5nm', '7nm']:
+        drc_iterations = max(1, int(np.random.exponential(3) + 2))
+    # Hours logged so far
+    hours_logged = actual_hours if is_completed else round(actual_hours * np.random.uniform(0.1, 0.9), 1)
+    # Bottleneck risk label
+    hours_ratio = hours_logged / max(estimated_hours, 1)
+    days_in_current = np.random.exponential(3) if not is_completed else 0
+    if hours_ratio > 1.3 or days_in_current > 5:
+        bottleneck_risk = 'High'
+    elif hours_ratio > 1.0 or days_in_current > 3:
+        bottleneck_risk = 'Medium'
+    else:
+        bottleneck_risk = 'Low'
+    block = {
+        'block_id': f'BLK-{block_id:05d}',
+        'block_name': f'{block_type}_{tech_node}_{block_id}',
+        'block_type': block_type,
+        'tech_node': tech_node,
+        'priority': priority,
+        'priority_numeric': PRIORITIES.index(priority) + 1,
+        'engineer_id': engineer,
+        'engineer_skill_factor': round(ENGINEER_SKILL[engineer], 3),
+        'transistor_count': transistor_count,
+        'transistor_count_log': round(np.log1p(transistor_count), 4),
+        'has_dependencies': int(has_dependencies),
+        'num_dependencies': num_dependencies,
+        'constraint_complexity': round(constraint_complexity, 2),
+        'estimated_hours': estimated_hours,
+        'actual_hours': actual_hours,
+        'hours_logged': hours_logged,
+        'hours_over_estimate_ratio': round(hours_logged / max(estimated_hours, 1), 3),
+        'drc_iterations': drc_iterations,
+        'drc_violations_total': 0,  # filled from transitions
+        'lvs_mismatches_total': 0,
+        'current_stage': final_stage,
+        'current_stage_idx': STAGE_IDX[final_stage],
+        'days_in_current_stage': round(days_in_current, 1),
+        'is_completed': int(is_completed),
+        'complexity': complexity,
+        'bottleneck_risk': bottleneck_risk,
+        'start_date': start_date.strftime('%Y-%m-%d'),
+        'final_stage_idx': final_stage_idx,
+    }
+    # Generate transitions
+    transitions = generate_stage_transitions(block, start_date)
+    block['transitions'] = json.dumps(transitions)
+    block['num_stage_transitions'] = len(transitions)
+    # Aggregate DRC/LVS from transitions
+    block['drc_violations_total'] = sum(t.get('drc_violations', 0) for t in transitions)
+    block['lvs_mismatches_total'] = sum(t.get('lvs_mismatches', 0) for t in transitions)
+    # Compute total days from transitions
+    if len(transitions) > 1:
+        block['total_days'] = sum(t.get('days_in_stage', 0) for t in transitions)
+    else:
+        block['total_days'] = round(actual_hours / 8, 1)
+    # Due date and overdue status
+    due_days = max(int(block['total_days'] * np.random.uniform(0.8, 1.5)), 3)
+    block['due_date'] = (start_date + timedelta(days=due_days)).strftime('%Y-%m-%d')
+    if is_completed:
+        block['is_overdue'] = int(block['total_days'] > due_days)
+    else:
+        elapsed = (datetime.now() - start_date).days
+        block['is_overdue'] = int(elapsed > due_days)
+    return block
+def generate_dataset(n_completed=3000, n_in_progress=1000):
+    """Generate full dataset."""
+    print(f"Generating {n_completed} completed + {n_in_progress} in-progress blocks...")
+    blocks = []
+    for i in range(n_completed):
+        blocks.append(generate_block(i + 1, is_completed=True))
+    for i in range(n_in_progress):
+        blocks.append(generate_block(n_completed + i + 1, is_completed=False))
+    df = pd.DataFrame(blocks)
+    return df
+# === Generate SFT Dataset for LLM Fine-tuning ===
+def generate_sft_dataset(df, n_samples=2000):
+    """Generate conversational dataset for complexity estimation SFT."""
+    sft_data = []
+    sampled = df.sample(n=min(n_samples, len(df)), random_state=42)
+    for _, row in sampled.iterrows():
+        user_msg = (
+            f"Estimate the complexity and required hours for this analog IC layout block:\n"
+            f"- Block Type: {row['block_type']}\n"
+            f"- Technology Node: {row['tech_node']}\n"
+            f"- Priority: {row['priority']}\n"
+            f"- Estimated Transistor Count: ~{row['transistor_count']:,}\n"
+            f"- Has Dependencies: {'Yes' if row['has_dependencies'] else 'No'}"
+            + (f" ({row['num_dependencies']} blocks)\n" if row['has_dependencies'] else "\n") +
+            f"- Constraint Complexity Score: {row['constraint_complexity']:.1f}/3.0\n"
+            f"- DRC Iterations Expected: {row['drc_iterations']}"
+        )
+        # Build a realistic explanation
+        reasons = []
+        if row['complexity'] == 'High':
+            if row['tech_node'] in ['5nm', '7nm', '12nm']:
+                reasons.append(f"Advanced {row['tech_node']} node requires extensive DRC/LVS iterations")
+            if row['transistor_count'] > 50000:
+                reasons.append(f"Large transistor count (~{row['transistor_count']:,}) increases layout complexity")
+            if row['block_type'] in ['PLL', 'SerDes', 'ADC']:
+                reasons.append(f"{row['block_type']} blocks require precision matching and careful routing")
+            if row['has_dependencies']:
+                reasons.append(f"Inter-block dependencies ({row['num_dependencies']}) add integration overhead")
+        elif row['complexity'] == 'Medium':
+            reasons.append(f"{row['block_type']} at {row['tech_node']} has moderate layout challenges")
+            if row['constraint_complexity'] > 1.5:
+                reasons.append("Analog constraints require careful floor planning")
+        else:
+            reasons.append(f"{row['block_type']} at {row['tech_node']} is a well-characterized block")
+            if row['transistor_count'] < 10000:
+                reasons.append("Small transistor count allows straightforward layout")
+        if not reasons:
+            reasons.append(f"Standard {row['block_type']} layout at {row['tech_node']}")
+        risk_level = 'low' if row['complexity'] == 'Low' else ('medium' if row['complexity'] == 'Medium' else 'high')
+        assistant_msg = (
+            f'{{"complexity": "{row["complexity"]}", '
+            f'"estimated_hours": {row["actual_hours"]}, '
+            f'"confidence": {round(np.random.uniform(0.7, 0.95), 2)}, '
+            f'"risk_level": "{risk_level}", '
+            f'"reasoning": "{"; ".join(reasons)}", '
+            f'"recommended_drc_iterations": {row["drc_iterations"]}, '
+            f'"suggested_engineer_skill_level": "{"senior" if row["complexity"] == "High" else "mid" if row["complexity"] == "Medium" else "junior"}"}}'
+        )
+        sft_data.append({
+            "messages": [
+                {"role": "system", "content": "You are ALWAS AI, an analog IC layout complexity estimation assistant. Given block metadata, estimate complexity (Low/Medium/High), required hours, and provide reasoning. Respond in JSON format."},
+                {"role": "user", "content": user_msg},
+                {"role": "assistant", "content": assistant_msg}
+            ]
+        })
+    return sft_data
+if __name__ == '__main__':
+    # Generate main tabular dataset
+    df = generate_dataset(n_completed=3000, n_in_progress=1000)
+    # Save tabular data
+    df.to_csv('/app/alwas_blocks_dataset.csv', index=False)
+    df.to_parquet('/app/alwas_blocks_dataset.parquet', index=False)
+    # Generate SFT dataset
+    sft_data = generate_sft_dataset(df, n_samples=2000)
+    with open('/app/alwas_sft_dataset.json', 'w') as f:
+        json.dump(sft_data, f, indent=2)
+    # Print dataset stats
+    print(f"\n=== Dataset Statistics ===")
+    print(f"Total blocks: {len(df)}")
+    print(f"Completed: {df['is_completed'].sum()}")
+    print(f"In-progress: {(~df['is_completed'].astype(bool)).sum()}")
+    print(f"\nComplexity distribution:")
+    print(df['complexity'].value_counts())
+    print(f"\nBottleneck risk distribution:")
+    print(df['bottleneck_risk'].value_counts())
+    print(f"\nBlock type distribution:")
+    print(df['block_type'].value_counts().head(10))
+    print(f"\nTech node distribution:")
+    print(df['tech_node'].value_counts())
+    print(f"\nHours statistics:")
+    print(df['actual_hours'].describe())
+    print(f"\nSFT samples: {len(sft_data)}")
+    print(f"\nFiles saved:")
+    print(f"  /app/alwas_blocks_dataset.csv")
+    print(f"  /app/alwas_blocks_dataset.parquet")
+    print(f"  /app/alwas_sft_dataset.json")