"""
Cost Normalization (Section 5.2)

Renders the ZLP and SMT problems tractable by finding smaller integer cycle counts
whose ratios closely approximate the original cycle count ratios.

The optimization problem:
    Given original cycle counts C = [c1, c2, ..., cn],
    find new counts C' = [c'1, c'2, ..., c'n] such that:
    
    ∀i,j: -F ≤ C[i]·C'[j] - C[j]·C'[i] ≤ F    (bounded ratio change)
    1 ≤ Σ C'[i] ≤ U                                (avoid zero solution)
    minimize F                                       (minimize distortion)

Uses PuLP (CBC solver) since SCIP is not freely available in Python.
The paper uses SCIP but notes CBC also works (just slower on some instances).
"""

import pulp
import numpy as np
from typing import List, Dict, Tuple, Optional


def normalize_costs(
    original_costs: Dict[str, int],
    U: int = 300,
    solver_time_limit: int = 60,
) -> Tuple[Dict[str, int], float]:
    """Normalize cycle counts to keep ratios but reduce magnitudes.
    
    Args:
        original_costs: Dict mapping label -> original cycle count
        U: Upper bound on sum of new costs (controls resolution vs. speed tradeoff)
        solver_time_limit: Time limit in seconds for the solver
    
    Returns:
        Tuple of (normalized_costs dict, distortion F value)
    
    Example:
        >>> costs = {"GEMM": 1000, "EXP": 1000, "TMA_LOAD": 500}
        >>> norm, F = normalize_costs(costs, U=10)
        >>> # norm might be {"GEMM": 2, "EXP": 2, "TMA_LOAD": 1}
    """
    labels = list(original_costs.keys())
    C = [original_costs[l] for l in labels]
    n = len(C)

    if n == 0:
        return {}, 0.0

    # Trivial case: all costs equal
    if len(set(C)) == 1:
        # All equal -> just set them all to 1
        return {l: 1 for l in labels}, 0.0

    # If all costs already small enough, return as-is
    if sum(C) <= U:
        return dict(original_costs), 0.0

    # Set up the ILP problem
    prob = pulp.LpProblem("CostNormalization", pulp.LpMinimize)

    # Decision variables
    # C'[i] for each instruction
    c_prime = [pulp.LpVariable(f"c_{i}", lowBound=1, cat='Integer') for i in range(n)]

    # F: the distortion bound
    F = pulp.LpVariable("F", lowBound=0, cat='Integer')

    # Objective: minimize F
    prob += F

    # Constraint: 1 ≤ Σ C'[i] ≤ U
    prob += pulp.lpSum(c_prime) >= 1
    prob += pulp.lpSum(c_prime) <= U

    # Ratio constraints: ∀i,j: -F ≤ C[i]·C'[j] - C[j]·C'[i] ≤ F
    for i in range(n):
        for j in range(i + 1, n):
            # C[i]*C'[j] - C[j]*C'[i] ≤ F
            prob += C[i] * c_prime[j] - C[j] * c_prime[i] <= F
            # -(C[i]*C'[j] - C[j]*C'[i]) ≤ F  =>  C[j]*C'[i] - C[i]*C'[j] ≤ F
            prob += C[j] * c_prime[i] - C[i] * c_prime[j] <= F

    # Solve
    solver = pulp.PULP_CBC_CMD(msg=0, timeLimit=solver_time_limit)
    status = prob.solve(solver)

    if status != pulp.constants.LpStatusOptimal:
        # Fallback: proportional scaling
        max_c = max(C)
        scale = U / (n * max_c) if max_c > 0 else 1.0
        normalized = {l: max(1, int(round(c * scale))) for l, c in zip(labels, C)}
        return normalized, float('inf')

    normalized = {}
    for i, label in enumerate(labels):
        val = int(round(pulp.value(c_prime[i])))
        normalized[label] = max(1, val)

    distortion = pulp.value(F)
    return normalized, distortion


def apply_normalization(
    original_costs: Dict[str, int],
    normalized_costs: Dict[str, int],
    edge_delays: List[Tuple[str, str, int]],
) -> List[Tuple[str, str, int]]:
    """Scale edge delays according to normalized instruction costs.
    
    When instruction costs are normalized, edge delays (which depend on
    the execution time of the source instruction) must be rescaled proportionally.
    
    Args:
        original_costs: Original cycle counts per instruction
        normalized_costs: Normalized cycle counts per instruction
        edge_delays: List of (src, dst, original_delay) tuples
    
    Returns:
        List of (src, dst, normalized_delay) tuples
    """
    result = []
    for src, dst, d in edge_delays:
        if src in original_costs and original_costs[src] > 0:
            # Scale delay proportionally to source instruction's cost ratio
            ratio = normalized_costs[src] / original_costs[src]
            new_d = max(1, int(round(d * ratio)))
        else:
            new_d = d
        result.append((src, dst, new_d))
    return result