// ============================================================================ // Async NoC Mesh // ============================================================================ // // Copyright 2026 Henry Arthur Shulayev Barnes / Catalyst Neuromorphic Ltd // Company No. 17054540 — UK Patent Application No. 2602902.6 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================ `timescale 1ns/1ps module async_noc_mesh #( parameter NUM_CORES = 4, parameter CORE_ID_BITS = 2, parameter NUM_NEURONS = 1024, parameter NEURON_BITS = 10, parameter DATA_WIDTH = 16, parameter POOL_DEPTH = 32768, parameter POOL_ADDR_BITS = 15, parameter COUNT_BITS = 12, parameter REV_FANIN = 32, parameter REV_SLOT_BITS = 5, parameter THRESHOLD = 16'sd1000, parameter LEAK_RATE = 16'sd3, parameter REFRAC_CYCLES = 3, parameter GRADE_SHIFT = 7, parameter ROUTE_FANOUT = 8, parameter ROUTE_SLOT_BITS = 3, parameter ROUTE_ADDR_W = CORE_ID_BITS + NEURON_BITS + ROUTE_SLOT_BITS, parameter ROUTE_DATA_W = 1 + CORE_ID_BITS + NEURON_BITS + DATA_WIDTH, parameter CLUSTER_SIZE = 4, parameter GLOBAL_ROUTE_SLOTS = 4, parameter GLOBAL_ROUTE_SLOT_BITS = 2, parameter GLOBAL_ROUTE_ADDR_W = CORE_ID_BITS + NEURON_BITS + GLOBAL_ROUTE_SLOT_BITS, parameter CHIP_LINK_EN = 0, parameter DUAL_NOC = 0, parameter MESH_X = 2, parameter MESH_Y = 2 )( input wire clk, input wire rst_n, input wire start, input wire prog_pool_we, input wire [CORE_ID_BITS-1:0] prog_pool_core, input wire [POOL_ADDR_BITS-1:0] prog_pool_addr, input wire [NEURON_BITS-1:0] prog_pool_src, input wire [NEURON_BITS-1:0] prog_pool_target, input wire signed [DATA_WIDTH-1:0] prog_pool_weight, input wire [1:0] prog_pool_comp, input wire prog_index_we, input wire [CORE_ID_BITS-1:0] prog_index_core, input wire [NEURON_BITS-1:0] prog_index_neuron, input wire [POOL_ADDR_BITS-1:0] prog_index_base, input wire [COUNT_BITS-1:0] prog_index_count, input wire [1:0] prog_index_format, input wire prog_route_we, input wire [CORE_ID_BITS-1:0] prog_route_src_core, input wire [NEURON_BITS-1:0] prog_route_src_neuron, input wire [ROUTE_SLOT_BITS-1:0] prog_route_slot, input wire [CORE_ID_BITS-1:0] prog_route_dest_core, input wire [NEURON_BITS-1:0] prog_route_dest_neuron, input wire signed [DATA_WIDTH-1:0] prog_route_weight, input wire prog_global_route_we, input wire [CORE_ID_BITS-1:0] prog_global_route_src_core, input wire [NEURON_BITS-1:0] prog_global_route_src_neuron, input wire [GLOBAL_ROUTE_SLOT_BITS-1:0] prog_global_route_slot, input wire [CORE_ID_BITS-1:0] prog_global_route_dest_core, input wire [NEURON_BITS-1:0] prog_global_route_dest_neuron, input wire signed [DATA_WIDTH-1:0] prog_global_route_weight, input wire learn_enable, input wire graded_enable, input wire dendritic_enable, input wire async_enable, input wire threefactor_enable, input wire noise_enable, input wire skip_idle_enable, input wire scale_u_enable, input wire signed [DATA_WIDTH-1:0] reward_value, input wire prog_delay_we, input wire [CORE_ID_BITS-1:0] prog_delay_core, input wire [POOL_ADDR_BITS-1:0] prog_delay_addr, input wire [5:0] prog_delay_value, input wire prog_ucode_we, input wire [CORE_ID_BITS-1:0] prog_ucode_core, input wire [7:0] prog_ucode_addr, input wire [31:0] prog_ucode_data, input wire prog_param_we, input wire [CORE_ID_BITS-1:0] prog_param_core, input wire [NEURON_BITS-1:0] prog_param_neuron, input wire [4:0] prog_param_id, input wire signed [DATA_WIDTH-1:0] prog_param_value, input wire ext_valid, input wire [CORE_ID_BITS-1:0] ext_core, input wire [NEURON_BITS-1:0] ext_neuron_id, input wire signed [DATA_WIDTH-1:0] ext_current, input wire probe_read, input wire [CORE_ID_BITS-1:0] probe_core, input wire [NEURON_BITS-1:0] probe_neuron, input wire [4:0] probe_state_id, input wire [POOL_ADDR_BITS-1:0] probe_pool_addr, output wire signed [DATA_WIDTH-1:0] probe_data, output wire probe_valid, output reg timestep_done, output wire [NUM_CORES-1:0] spike_valid_bus, output wire [NUM_CORES*NEURON_BITS-1:0] spike_id_bus, output wire [5:0] mesh_state_out, output reg [31:0] total_spikes, output reg [31:0] timestep_count, output wire [NUM_CORES-1:0] core_idle_bus, output wire link_tx_push, output wire [CORE_ID_BITS-1:0] link_tx_core, output wire [NEURON_BITS-1:0] link_tx_neuron, output wire [7:0] link_tx_payload, input wire link_tx_full, input wire [CORE_ID_BITS-1:0] link_rx_core, input wire [NEURON_BITS-1:0] link_rx_neuron, input wire signed [DATA_WIDTH-1:0] link_rx_current, output wire link_rx_pop, input wire link_rx_empty ); assign link_tx_push = 0; assign link_tx_core = 0; assign link_tx_neuron = 0; assign link_tx_payload = 0; assign link_rx_pop = 0; localparam COORD_BITS = 4; localparam PACKET_W = 2*COORD_BITS + NEURON_BITS + DATA_WIDTH; function [COORD_BITS-1:0] core_to_x; input [CORE_ID_BITS-1:0] cid; core_to_x = cid % MESH_X; endfunction function [COORD_BITS-1:0] core_to_y; input [CORE_ID_BITS-1:0] cid; core_to_y = cid / MESH_X; endfunction localparam SM_IDLE = 4'd0; localparam SM_PKT_DRAIN = 4'd1; localparam SM_START = 4'd2; localparam SM_RUN_WAIT = 4'd3; localparam SM_ROUTE_POP = 4'd4; localparam SM_ROUTE_ADDR = 4'd5; localparam SM_ROUTE_WAIT = 4'd6; localparam SM_ROUTE_READ = 4'd7; localparam SM_GRT_ADDR = 4'd8; localparam SM_GRT_WAIT = 4'd9; localparam SM_GRT_READ = 4'd10; localparam SM_DONE = 4'd11; reg [3:0] mesh_state; assign mesh_state_out = {2'b0, mesh_state}; reg rt_we; reg [ROUTE_ADDR_W-1:0] rt_addr; wire [ROUTE_DATA_W-1:0] rt_rdata; wire rt_we_mux = (mesh_state == SM_IDLE) ? prog_route_we : rt_we; wire [ROUTE_ADDR_W-1:0] rt_addr_mux = (mesh_state == SM_IDLE) ? {prog_route_src_core, prog_route_src_neuron, prog_route_slot} : rt_addr; wire [ROUTE_DATA_W-1:0] rt_wdata_mux = (mesh_state == SM_IDLE) ? {1'b1, prog_route_dest_core, prog_route_dest_neuron, prog_route_weight} : {ROUTE_DATA_W{1'b0}}; sram #(.DATA_WIDTH(ROUTE_DATA_W), .ADDR_WIDTH(ROUTE_ADDR_W)) route_table ( .clk(clk), .we_a(rt_we_mux), .addr_a(rt_addr_mux), .wdata_a(rt_wdata_mux), .rdata_a(rt_rdata), .addr_b({ROUTE_ADDR_W{1'b0}}), .rdata_b() ); wire rt_valid = rt_rdata[ROUTE_DATA_W-1]; wire [CORE_ID_BITS-1:0] rt_dest_core = rt_rdata[NEURON_BITS+DATA_WIDTH +: CORE_ID_BITS]; wire [NEURON_BITS-1:0] rt_dest_nrn = rt_rdata[DATA_WIDTH +: NEURON_BITS]; wire signed [DATA_WIDTH-1:0] rt_weight = rt_rdata[DATA_WIDTH-1:0]; reg grt_we; reg [GLOBAL_ROUTE_ADDR_W-1:0] grt_addr; wire [ROUTE_DATA_W-1:0] grt_rdata; wire grt_we_mux = (mesh_state == SM_IDLE) ? prog_global_route_we : grt_we; wire [GLOBAL_ROUTE_ADDR_W-1:0] grt_addr_mux = (mesh_state == SM_IDLE) ? {prog_global_route_src_core, prog_global_route_src_neuron, prog_global_route_slot} : grt_addr; wire [ROUTE_DATA_W-1:0] grt_wdata_mux = (mesh_state == SM_IDLE) ? {1'b1, prog_global_route_dest_core, prog_global_route_dest_neuron, prog_global_route_weight} : {ROUTE_DATA_W{1'b0}}; sram #(.DATA_WIDTH(ROUTE_DATA_W), .ADDR_WIDTH(GLOBAL_ROUTE_ADDR_W)) global_route_table ( .clk(clk), .we_a(grt_we_mux), .addr_a(grt_addr_mux), .wdata_a(grt_wdata_mux), .rdata_a(grt_rdata), .addr_b({GLOBAL_ROUTE_ADDR_W{1'b0}}), .rdata_b() ); wire grt_valid = grt_rdata[ROUTE_DATA_W-1]; wire [CORE_ID_BITS-1:0] grt_dest_core = grt_rdata[NEURON_BITS+DATA_WIDTH +: CORE_ID_BITS]; wire [NEURON_BITS-1:0] grt_dest_nrn = grt_rdata[DATA_WIDTH +: NEURON_BITS]; wire signed [DATA_WIDTH-1:0] grt_weight = grt_rdata[DATA_WIDTH-1:0]; wire [NUM_CORES-1:0] core_done; wire [NUM_CORES-1:0] core_spike_valid; wire [NUM_CORES*NEURON_BITS-1:0] core_spike_id; wire [NUM_CORES*8-1:0] core_spike_payload; reg [NUM_CORES-1:0] core_start_r; reg [NUM_CORES-1:0] core_done_latch; always @(posedge clk or negedge rst_n) begin if (!rst_n) core_done_latch <= 0; else if (mesh_state == SM_START) core_done_latch <= 0; else core_done_latch <= core_done_latch | core_done; end assign spike_valid_bus = core_spike_valid; assign spike_id_bus = core_spike_id; wire sync_all_done; sync_tree #(.NUM_LEAVES(NUM_CORES)) u_sync ( .clk(clk), .rst_n(rst_n), .leaf_done(core_done_latch), .all_done(sync_all_done), .root_start(1'b0), .leaf_start() ); localparam CAP_WIDTH = NEURON_BITS + 8; reg [NUM_CORES-1:0] cap_pop; reg [NUM_CORES-1:0] cap_clear; wire [NUM_CORES-1:0] cap_empty; wire [NUM_CORES*CAP_WIDTH-1:0] cap_data; wire [NUM_CORES-1:0] core_probe_valid; wire [NUM_CORES*DATA_WIDTH-1:0] core_probe_data; assign probe_data = core_probe_data[probe_core*DATA_WIDTH +: DATA_WIDTH]; assign probe_valid = core_probe_valid[probe_core]; function [31:0] popcount; input [NUM_CORES-1:0] bits; integer k; begin popcount = 0; for (k = 0; k < NUM_CORES; k = k + 1) popcount = popcount + bits[k]; end endfunction wire [NUM_CORES-1:0] rtr_idle; wire [NUM_CORES-1:0] rtr_local_out_valid; wire [NUM_CORES*PACKET_W-1:0] rtr_local_out_data; wire [NUM_CORES-1:0] rtr_local_in_ready; reg [NUM_CORES-1:0] rtr_local_in_valid; reg [NUM_CORES*PACKET_W-1:0] rtr_local_in_data; wire [NUM_CORES-1:0] rtr_local_out_ready = (mesh_state == SM_PKT_DRAIN) ? {NUM_CORES{1'b1}} : {NUM_CORES{1'b0}}; wire [NUM_CORES-1:0] rtr_n_out_v, rtr_s_out_v, rtr_e_out_v, rtr_w_out_v; wire [NUM_CORES*PACKET_W-1:0] rtr_n_out_d, rtr_s_out_d, rtr_e_out_d, rtr_w_out_d; wire [NUM_CORES-1:0] rtr_n_in_r, rtr_s_in_r, rtr_e_in_r, rtr_w_in_r; wire [NUM_CORES-1:0] rtr_b_idle; wire [NUM_CORES-1:0] rtr_b_local_out_valid; wire [NUM_CORES*PACKET_W-1:0] rtr_b_local_out_data; wire [NUM_CORES-1:0] rtr_b_local_in_ready; reg [NUM_CORES-1:0] rtr_b_local_in_valid; reg [NUM_CORES*PACKET_W-1:0] rtr_b_local_in_data; wire [NUM_CORES-1:0] rtr_b_local_out_ready = (mesh_state == SM_PKT_DRAIN) ? ~rtr_local_out_valid : {NUM_CORES{1'b0}}; wire [NUM_CORES-1:0] rtr_b_n_out_v, rtr_b_s_out_v, rtr_b_e_out_v, rtr_b_w_out_v; wire [NUM_CORES*PACKET_W-1:0] rtr_b_n_out_d, rtr_b_s_out_d, rtr_b_e_out_d, rtr_b_w_out_d; wire [NUM_CORES-1:0] rtr_b_n_in_r, rtr_b_s_in_r, rtr_b_e_in_r, rtr_b_w_in_r; genvar gi; generate for (gi = 0; gi < NUM_CORES; gi = gi + 1) begin : gen_core wire this_ext_valid = (mesh_state == SM_IDLE && ext_valid && ext_core == gi[CORE_ID_BITS-1:0]) || (mesh_state == SM_PKT_DRAIN && (rtr_local_out_valid[gi] || rtr_b_local_out_valid[gi])); wire [PACKET_W-1:0] drain_pkt = rtr_local_out_valid[gi] ? rtr_local_out_data[gi*PACKET_W +: PACKET_W] : rtr_b_local_out_data[gi*PACKET_W +: PACKET_W]; wire [NEURON_BITS-1:0] this_ext_nid = (mesh_state == SM_PKT_DRAIN) ? drain_pkt[DATA_WIDTH +: NEURON_BITS] : ext_neuron_id; wire signed [DATA_WIDTH-1:0] this_ext_cur = (mesh_state == SM_PKT_DRAIN) ? drain_pkt[DATA_WIDTH-1:0] : ext_current; wire this_pool_we = prog_pool_we && (prog_pool_core == gi[CORE_ID_BITS-1:0]) && (mesh_state == SM_IDLE); wire this_index_we = prog_index_we && (prog_index_core == gi[CORE_ID_BITS-1:0]) && (mesh_state == SM_IDLE); wire this_param_we = prog_param_we && (prog_param_core == gi[CORE_ID_BITS-1:0]) && (mesh_state == SM_IDLE); wire this_delay_we = prog_delay_we && (prog_delay_core == gi[CORE_ID_BITS-1:0]) && (mesh_state == SM_IDLE); wire this_ucode_we = prog_ucode_we && (prog_ucode_core == gi[CORE_ID_BITS-1:0]) && (mesh_state == SM_IDLE); scalable_core_v2 #( .NUM_NEURONS(NUM_NEURONS), .NEURON_BITS(NEURON_BITS), .DATA_WIDTH(DATA_WIDTH), .POOL_DEPTH(POOL_DEPTH), .POOL_ADDR_BITS(POOL_ADDR_BITS), .COUNT_BITS(COUNT_BITS), .REV_FANIN(REV_FANIN), .REV_SLOT_BITS(REV_SLOT_BITS), .THRESHOLD(THRESHOLD), .LEAK_RATE(LEAK_RATE), .REFRAC_CYCLES(REFRAC_CYCLES), .GRADE_SHIFT(GRADE_SHIFT) ) core ( .clk(clk), .rst_n(rst_n), .start(core_start_r[gi]), .learn_enable(learn_enable), .graded_enable(graded_enable), .dendritic_enable(dendritic_enable), .threefactor_enable(threefactor_enable), .noise_enable(noise_enable), .skip_idle_enable(skip_idle_enable), .scale_u_enable(scale_u_enable), .reward_value(reward_value), .ext_valid(this_ext_valid), .ext_neuron_id(this_ext_nid), .ext_current(this_ext_cur), .pool_we(this_pool_we), .pool_addr_in(prog_pool_addr), .pool_src_in(prog_pool_src), .pool_target_in(prog_pool_target), .pool_weight_in(prog_pool_weight), .pool_comp_in(prog_pool_comp), .index_we(this_index_we), .index_neuron_in(prog_index_neuron), .index_base_in(prog_index_base), .index_count_in(prog_index_count), .index_format_in(prog_index_format), .delay_we(this_delay_we), .delay_addr_in(prog_delay_addr), .delay_value_in(prog_delay_value), .ucode_prog_we(this_ucode_we), .ucode_prog_addr(prog_ucode_addr), .ucode_prog_data(prog_ucode_data), .prog_param_we(this_param_we), .prog_param_neuron(prog_param_neuron), .prog_param_id(prog_param_id), .prog_param_value(prog_param_value), .probe_read(probe_read && (probe_core == gi[CORE_ID_BITS-1:0])), .probe_neuron(probe_neuron), .probe_state_id(probe_state_id), .probe_pool_addr(probe_pool_addr), .probe_data(core_probe_data[gi*DATA_WIDTH +: DATA_WIDTH]), .probe_valid(core_probe_valid[gi]), .timestep_done(core_done[gi]), .spike_out_valid(core_spike_valid[gi]), .spike_out_id(core_spike_id[gi*NEURON_BITS +: NEURON_BITS]), .spike_out_payload(core_spike_payload[gi*8 +: 8]), .state_out(), .total_spikes(), .timestep_count(), .core_idle(core_idle_bus[gi]) ); spike_fifo #(.ID_WIDTH(CAP_WIDTH), .DEPTH(64), .PTR_BITS(6)) capture_fifo ( .clk(clk), .rst_n(rst_n), .clear(cap_clear[gi]), .push(core_spike_valid[gi] && (mesh_state == SM_RUN_WAIT)), .push_data({core_spike_id[gi*NEURON_BITS +: NEURON_BITS], core_spike_payload[gi*8 +: 8]}), .pop(cap_pop[gi]), .pop_data(cap_data[gi*CAP_WIDTH +: CAP_WIDTH]), .empty(cap_empty[gi]), .full(), .count() ); localparam RX = gi % MESH_X; localparam RY = gi / MESH_X; localparam HAS_N = (RY < MESH_Y - 1) ? 1 : 0; localparam HAS_S = (RY > 0) ? 1 : 0; localparam HAS_E = (RX < MESH_X - 1) ? 1 : 0; localparam HAS_W = (RX > 0) ? 1 : 0; localparam N_ID = HAS_N ? ((RY+1)*MESH_X + RX) : 0; localparam S_ID = HAS_S ? ((RY-1)*MESH_X + RX) : 0; localparam E_ID = HAS_E ? (RY*MESH_X + (RX+1)) : 0; localparam W_ID = HAS_W ? (RY*MESH_X + (RX-1)) : 0; wire n_in_v = HAS_N ? rtr_s_out_v[N_ID] : 1'b0; wire [PACKET_W-1:0] n_in_d = HAS_N ? rtr_s_out_d[N_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire n_out_r = HAS_N ? rtr_s_in_r[N_ID] : 1'b1; wire s_in_v = HAS_S ? rtr_n_out_v[S_ID] : 1'b0; wire [PACKET_W-1:0] s_in_d = HAS_S ? rtr_n_out_d[S_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire s_out_r = HAS_S ? rtr_n_in_r[S_ID] : 1'b1; wire e_in_v = HAS_E ? rtr_w_out_v[E_ID] : 1'b0; wire [PACKET_W-1:0] e_in_d = HAS_E ? rtr_w_out_d[E_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire e_out_r = HAS_E ? rtr_w_in_r[E_ID] : 1'b1; wire w_in_v = HAS_W ? rtr_e_out_v[W_ID] : 1'b0; wire [PACKET_W-1:0] w_in_d = HAS_W ? rtr_e_out_d[W_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire w_out_r = HAS_W ? rtr_e_in_r[W_ID] : 1'b1; async_router #( .PACKET_W(PACKET_W), .COORD_BITS(COORD_BITS), .FIFO_DEPTH(16), .FIFO_PTR_BITS(4) ) router ( .clk(clk), .rst_n(rst_n), .my_x(core_to_x(gi[CORE_ID_BITS-1:0])), .my_y(core_to_y(gi[CORE_ID_BITS-1:0])), .local_in_valid (rtr_local_in_valid[gi]), .local_in_ready (rtr_local_in_ready[gi]), .local_in_data (rtr_local_in_data[gi*PACKET_W +: PACKET_W]), .local_out_valid(rtr_local_out_valid[gi]), .local_out_ready(rtr_local_out_ready[gi]), .local_out_data (rtr_local_out_data[gi*PACKET_W +: PACKET_W]), .north_in_valid (n_in_v), .north_in_ready (rtr_n_in_r[gi]), .north_in_data (n_in_d), .north_out_valid(rtr_n_out_v[gi]), .north_out_ready(n_out_r), .north_out_data (rtr_n_out_d[gi*PACKET_W +: PACKET_W]), .south_in_valid (s_in_v), .south_in_ready (rtr_s_in_r[gi]), .south_in_data (s_in_d), .south_out_valid(rtr_s_out_v[gi]), .south_out_ready(s_out_r), .south_out_data (rtr_s_out_d[gi*PACKET_W +: PACKET_W]), .east_in_valid (e_in_v), .east_in_ready (rtr_e_in_r[gi]), .east_in_data (e_in_d), .east_out_valid (rtr_e_out_v[gi]), .east_out_ready (e_out_r), .east_out_data (rtr_e_out_d[gi*PACKET_W +: PACKET_W]), .west_in_valid (w_in_v), .west_in_ready (rtr_w_in_r[gi]), .west_in_data (w_in_d), .west_out_valid (rtr_w_out_v[gi]), .west_out_ready (w_out_r), .west_out_data (rtr_w_out_d[gi*PACKET_W +: PACKET_W]), .idle (rtr_idle[gi]) ); end endgenerate generate if (DUAL_NOC) begin : gen_net_b genvar bi; for (bi = 0; bi < NUM_CORES; bi = bi + 1) begin : gen_rtr_b localparam BRX = bi % MESH_X; localparam BRY = bi / MESH_X; localparam B_HAS_N = (BRY < MESH_Y - 1) ? 1 : 0; localparam B_HAS_S = (BRY > 0) ? 1 : 0; localparam B_HAS_E = (BRX < MESH_X - 1) ? 1 : 0; localparam B_HAS_W = (BRX > 0) ? 1 : 0; localparam BN_ID = B_HAS_N ? ((BRY+1)*MESH_X + BRX) : 0; localparam BS_ID = B_HAS_S ? ((BRY-1)*MESH_X + BRX) : 0; localparam BE_ID = B_HAS_E ? (BRY*MESH_X + (BRX+1)) : 0; localparam BW_ID = B_HAS_W ? (BRY*MESH_X + (BRX-1)) : 0; wire bn_in_v = B_HAS_N ? rtr_b_s_out_v[BN_ID] : 1'b0; wire [PACKET_W-1:0] bn_in_d = B_HAS_N ? rtr_b_s_out_d[BN_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire bn_out_r = B_HAS_N ? rtr_b_s_in_r[BN_ID] : 1'b1; wire bs_in_v = B_HAS_S ? rtr_b_n_out_v[BS_ID] : 1'b0; wire [PACKET_W-1:0] bs_in_d = B_HAS_S ? rtr_b_n_out_d[BS_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire bs_out_r = B_HAS_S ? rtr_b_n_in_r[BS_ID] : 1'b1; wire be_in_v = B_HAS_E ? rtr_b_w_out_v[BE_ID] : 1'b0; wire [PACKET_W-1:0] be_in_d = B_HAS_E ? rtr_b_w_out_d[BE_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire be_out_r = B_HAS_E ? rtr_b_w_in_r[BE_ID] : 1'b1; wire bw_in_v = B_HAS_W ? rtr_b_e_out_v[BW_ID] : 1'b0; wire [PACKET_W-1:0] bw_in_d = B_HAS_W ? rtr_b_e_out_d[BW_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}}; wire bw_out_r = B_HAS_W ? rtr_b_e_in_r[BW_ID] : 1'b1; async_router #( .PACKET_W(PACKET_W), .COORD_BITS(COORD_BITS), .FIFO_DEPTH(16), .FIFO_PTR_BITS(4) ) router_b ( .clk(clk), .rst_n(rst_n), .my_x(core_to_x(bi[CORE_ID_BITS-1:0])), .my_y(core_to_y(bi[CORE_ID_BITS-1:0])), .local_in_valid (rtr_b_local_in_valid[bi]), .local_in_ready (rtr_b_local_in_ready[bi]), .local_in_data (rtr_b_local_in_data[bi*PACKET_W +: PACKET_W]), .local_out_valid(rtr_b_local_out_valid[bi]), .local_out_ready(rtr_b_local_out_ready[bi]), .local_out_data (rtr_b_local_out_data[bi*PACKET_W +: PACKET_W]), .north_in_valid (bn_in_v), .north_in_ready (rtr_b_n_in_r[bi]), .north_in_data (bn_in_d), .north_out_valid(rtr_b_n_out_v[bi]), .north_out_ready(bn_out_r), .north_out_data (rtr_b_n_out_d[bi*PACKET_W +: PACKET_W]), .south_in_valid (bs_in_v), .south_in_ready (rtr_b_s_in_r[bi]), .south_in_data (bs_in_d), .south_out_valid(rtr_b_s_out_v[bi]), .south_out_ready(bs_out_r), .south_out_data (rtr_b_s_out_d[bi*PACKET_W +: PACKET_W]), .east_in_valid (be_in_v), .east_in_ready (rtr_b_e_in_r[bi]), .east_in_data (be_in_d), .east_out_valid (rtr_b_e_out_v[bi]), .east_out_ready (be_out_r), .east_out_data (rtr_b_e_out_d[bi*PACKET_W +: PACKET_W]), .west_in_valid (bw_in_v), .west_in_ready (rtr_b_w_in_r[bi]), .west_in_data (bw_in_d), .west_out_valid (rtr_b_w_out_v[bi]), .west_out_ready (bw_out_r), .west_out_data (rtr_b_w_out_d[bi*PACKET_W +: PACKET_W]), .idle (rtr_b_idle[bi]) ); end end else begin : gen_no_net_b assign rtr_b_idle = {NUM_CORES{1'b1}}; assign rtr_b_local_out_valid = {NUM_CORES{1'b0}}; assign rtr_b_local_out_data = {NUM_CORES*PACKET_W{1'b0}}; assign rtr_b_local_in_ready = {NUM_CORES{1'b1}}; end endgenerate reg [CORE_ID_BITS-1:0] route_core_idx; reg [NEURON_BITS-1:0] route_neuron; reg [7:0] route_payload; reg [ROUTE_SLOT_BITS-1:0] route_slot; reg [GLOBAL_ROUTE_SLOT_BITS-1:0] global_slot; reg [3:0] drain_wait; wire signed [31:0] route_weight_ext = rt_weight; wire signed [31:0] route_payload_ext = {24'd0, route_payload}; wire signed [31:0] route_graded_product = route_weight_ext * route_payload_ext; wire signed [DATA_WIDTH-1:0] route_graded_current = route_graded_product >>> GRADE_SHIFT; wire signed [31:0] grt_weight_ext = grt_weight; wire signed [31:0] grt_graded_product = grt_weight_ext * route_payload_ext; wire signed [DATA_WIDTH-1:0] grt_graded_current = grt_graded_product >>> GRADE_SHIFT; wire signed [DATA_WIDTH-1:0] rt_eff_weight = graded_enable ? route_graded_current : rt_weight; wire signed [DATA_WIDTH-1:0] grt_eff_weight = graded_enable ? grt_graded_current : grt_weight; always @(posedge clk or negedge rst_n) begin if (!rst_n) begin mesh_state <= SM_IDLE; timestep_done <= 0; total_spikes <= 0; timestep_count <= 0; core_start_r <= 0; route_core_idx <= 0; route_neuron <= 0; route_payload <= 0; route_slot <= 0; global_slot <= 0; drain_wait <= 0; rt_we <= 0; rt_addr <= 0; grt_we <= 0; grt_addr <= 0; cap_pop <= 0; cap_clear <= 0; rtr_local_in_valid <= 0; rtr_local_in_data <= 0; rtr_b_local_in_valid <= 0; rtr_b_local_in_data <= 0; end else begin timestep_done <= 0; core_start_r <= 0; rt_we <= 0; grt_we <= 0; cap_pop <= 0; cap_clear <= 0; rtr_local_in_valid <= 0; rtr_b_local_in_valid <= 0; total_spikes <= total_spikes + popcount(core_spike_valid); case (mesh_state) SM_IDLE: begin if (start) begin drain_wait <= 0; mesh_state <= SM_PKT_DRAIN; end end SM_PKT_DRAIN: begin if ((&rtr_idle) && (&rtr_b_idle) && !(|rtr_local_out_valid) && !(|rtr_b_local_out_valid)) begin drain_wait <= drain_wait + 1; if (drain_wait >= 4'd3) mesh_state <= SM_START; end else begin drain_wait <= 0; end end SM_START: begin core_start_r <= {NUM_CORES{1'b1}}; mesh_state <= SM_RUN_WAIT; end SM_RUN_WAIT: begin if (sync_all_done) begin route_core_idx <= 0; mesh_state <= SM_ROUTE_POP; end end SM_ROUTE_POP: begin if (cap_empty[route_core_idx]) begin if (route_core_idx == NUM_CORES - 1) mesh_state <= SM_DONE; else route_core_idx <= route_core_idx + 1; end else begin cap_pop[route_core_idx] <= 1; route_neuron <= cap_data[route_core_idx * CAP_WIDTH + 8 +: NEURON_BITS]; route_payload <= cap_data[route_core_idx * CAP_WIDTH +: 8]; route_slot <= 0; mesh_state <= SM_ROUTE_ADDR; end end SM_ROUTE_ADDR: begin rt_addr <= {route_core_idx, route_neuron, route_slot}; mesh_state <= SM_ROUTE_WAIT; end SM_ROUTE_WAIT: begin mesh_state <= SM_ROUTE_READ; end SM_ROUTE_READ: begin if (rt_valid) begin if (route_core_idx[0] == 1'b0 || !DUAL_NOC) begin if (rtr_local_in_ready[route_core_idx]) begin rtr_local_in_valid[route_core_idx] <= 1; rtr_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <= {core_to_x(rt_dest_core), core_to_y(rt_dest_core), rt_dest_nrn, rt_eff_weight}; end end else begin if (rtr_b_local_in_ready[route_core_idx]) begin rtr_b_local_in_valid[route_core_idx] <= 1; rtr_b_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <= {core_to_x(rt_dest_core), core_to_y(rt_dest_core), rt_dest_nrn, rt_eff_weight}; end end end if (route_slot < ROUTE_FANOUT - 1) begin route_slot <= route_slot + 1; mesh_state <= SM_ROUTE_ADDR; end else begin global_slot <= 0; mesh_state <= SM_GRT_ADDR; end end SM_GRT_ADDR: begin grt_addr <= {route_core_idx, route_neuron, global_slot}; mesh_state <= SM_GRT_WAIT; end SM_GRT_WAIT: begin mesh_state <= SM_GRT_READ; end SM_GRT_READ: begin if (grt_valid) begin if (route_core_idx[0] == 1'b0 || !DUAL_NOC) begin if (rtr_local_in_ready[route_core_idx]) begin rtr_local_in_valid[route_core_idx] <= 1; rtr_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <= {core_to_x(grt_dest_core), core_to_y(grt_dest_core), grt_dest_nrn, grt_eff_weight}; end end else begin if (rtr_b_local_in_ready[route_core_idx]) begin rtr_b_local_in_valid[route_core_idx] <= 1; rtr_b_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <= {core_to_x(grt_dest_core), core_to_y(grt_dest_core), grt_dest_nrn, grt_eff_weight}; end end end if (global_slot < GLOBAL_ROUTE_SLOTS - 1) begin global_slot <= global_slot + 1; mesh_state <= SM_GRT_ADDR; end else begin mesh_state <= SM_ROUTE_POP; end end SM_DONE: begin cap_clear <= {NUM_CORES{1'b1}}; timestep_done <= 1; timestep_count <= timestep_count + 1; mesh_state <= SM_IDLE; end default: mesh_state <= SM_IDLE; endcase end end endmodule