// Source: catalyst-n1 / rtl / async_noc_mesh.v
// Uploaded by: mrwabbit
// Commit e4cdd5f (verified): Initial upload: Catalyst N1 open source neuromorphic processor RTL
// ============================================================================
// Async NoC Mesh
// ============================================================================
//
// Copyright 2026 Henry Arthur Shulayev Barnes / Catalyst Neuromorphic Ltd
// Company No. 17054540 — UK Patent Application No. 2602902.6
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
`timescale 1ns/1ps
// ----------------------------------------------------------------------------
// async_noc_mesh
//
// Mesh top level. For each of the NUM_CORES core ids this module generates one
// scalable_core_v2, one spike capture FIFO and one async_router; routers are
// wired to their N/S/E/W neighbours on a MESH_X x MESH_Y grid using row-major
// core numbering (x = id % MESH_X, y = id / MESH_X). When DUAL_NOC is non-zero
// a second, identically wired router network is generated as well.
//
// A timestep is sequenced by an internal state machine (visible on
// mesh_state_out): IDLE -> PKT_DRAIN -> START -> RUN_WAIT -> route every
// captured spike through the local and global route tables -> DONE -> IDLE.
// All prog_* write ports and ext_* stimulus are only honoured while the state
// machine is in its idle state.
//
// NOTE(review): nothing in this module checks that NUM_CORES equals
// MESH_X * MESH_Y; neighbour indices are derived from MESH_X/MESH_Y only.
// ----------------------------------------------------------------------------
module async_noc_mesh #(
// Number of core / capture FIFO / router instances generated.
parameter NUM_CORES = 4,
// Width of every core-select field (prog_*_core, ext_core, probe_core, ...).
parameter CORE_ID_BITS = 2,
// The following parameters are forwarded unchanged to each scalable_core_v2.
// NEURON_BITS, DATA_WIDTH, POOL_ADDR_BITS and COUNT_BITS also size ports here.
parameter NUM_NEURONS = 1024,
parameter NEURON_BITS = 10,
parameter DATA_WIDTH = 16,
parameter POOL_DEPTH = 32768,
parameter POOL_ADDR_BITS = 15,
parameter COUNT_BITS = 12,
parameter REV_FANIN = 32,
parameter REV_SLOT_BITS = 5,
parameter THRESHOLD = 16'sd1000,
parameter LEAK_RATE = 16'sd3,
parameter REFRAC_CYCLES = 3,
// GRADE_SHIFT is also used in this module: graded route currents are
// (weight * payload) arithmetically shifted right by GRADE_SHIFT.
parameter GRADE_SHIFT = 7,
// Number of route-table slots scanned per captured spike, and the slot width.
parameter ROUTE_FANOUT = 8,
parameter ROUTE_SLOT_BITS = 3,
// Route table address = {source core, source neuron, slot}.
parameter ROUTE_ADDR_W = CORE_ID_BITS + NEURON_BITS + ROUTE_SLOT_BITS,
// Route table entry = {valid, dest core, dest neuron, weight}.
parameter ROUTE_DATA_W = 1 + CORE_ID_BITS + NEURON_BITS + DATA_WIDTH,
// Not referenced anywhere in this module body.
parameter CLUSTER_SIZE = 4,
// Number of global-route-table slots scanned per captured spike, and width.
parameter GLOBAL_ROUTE_SLOTS = 4,
parameter GLOBAL_ROUTE_SLOT_BITS = 2,
// Global route table address = {source core, source neuron, global slot}.
parameter GLOBAL_ROUTE_ADDR_W = CORE_ID_BITS + NEURON_BITS + GLOBAL_ROUTE_SLOT_BITS,
// Not referenced anywhere in this module body (link outputs are tied to 0).
parameter CHIP_LINK_EN = 0,
// Non-zero: generate the second router network ("network B").
parameter DUAL_NOC = 0,
// Mesh dimensions used for coordinate mapping and neighbour wiring.
parameter MESH_X = 2,
parameter MESH_Y = 2
)(
// Clock, active-low asynchronous reset, and timestep start request
// (start is only sampled while the state machine is idle).
input wire clk,
input wire rst_n,
input wire start,
// Connection-pool programming; forwarded to the core selected by
// prog_pool_core, write enable gated to the idle state.
input wire prog_pool_we,
input wire [CORE_ID_BITS-1:0] prog_pool_core,
input wire [POOL_ADDR_BITS-1:0] prog_pool_addr,
input wire [NEURON_BITS-1:0] prog_pool_src,
input wire [NEURON_BITS-1:0] prog_pool_target,
input wire signed [DATA_WIDTH-1:0] prog_pool_weight,
input wire [1:0] prog_pool_comp,
// Index programming; forwarded to the core selected by prog_index_core,
// write enable gated to the idle state.
input wire prog_index_we,
input wire [CORE_ID_BITS-1:0] prog_index_core,
input wire [NEURON_BITS-1:0] prog_index_neuron,
input wire [POOL_ADDR_BITS-1:0] prog_index_base,
input wire [COUNT_BITS-1:0] prog_index_count,
input wire [1:0] prog_index_format,
// Route table programming: while idle, writes
// {1, dest_core, dest_neuron, weight} at {src_core, src_neuron, slot}.
input wire prog_route_we,
input wire [CORE_ID_BITS-1:0] prog_route_src_core,
input wire [NEURON_BITS-1:0] prog_route_src_neuron,
input wire [ROUTE_SLOT_BITS-1:0] prog_route_slot,
input wire [CORE_ID_BITS-1:0] prog_route_dest_core,
input wire [NEURON_BITS-1:0] prog_route_dest_neuron,
input wire signed [DATA_WIDTH-1:0] prog_route_weight,
// Global route table programming; same entry format as the route table.
input wire prog_global_route_we,
input wire [CORE_ID_BITS-1:0] prog_global_route_src_core,
input wire [NEURON_BITS-1:0] prog_global_route_src_neuron,
input wire [GLOBAL_ROUTE_SLOT_BITS-1:0] prog_global_route_slot,
input wire [CORE_ID_BITS-1:0] prog_global_route_dest_core,
input wire [NEURON_BITS-1:0] prog_global_route_dest_neuron,
input wire signed [DATA_WIDTH-1:0] prog_global_route_weight,
// Feature enables, forwarded to every core. graded_enable additionally
// selects the graded (weight * payload) current for routed packets here.
// NOTE(review): async_enable is not referenced anywhere in this module body.
input wire learn_enable,
input wire graded_enable,
input wire dendritic_enable,
input wire async_enable,
input wire threefactor_enable,
input wire noise_enable,
input wire skip_idle_enable,
input wire scale_u_enable,
input wire signed [DATA_WIDTH-1:0] reward_value,
// Delay programming; forwarded to the selected core while idle.
input wire prog_delay_we,
input wire [CORE_ID_BITS-1:0] prog_delay_core,
input wire [POOL_ADDR_BITS-1:0] prog_delay_addr,
input wire [5:0] prog_delay_value,
// Microcode programming; forwarded to the selected core while idle.
input wire prog_ucode_we,
input wire [CORE_ID_BITS-1:0] prog_ucode_core,
input wire [7:0] prog_ucode_addr,
input wire [31:0] prog_ucode_data,
// Per-neuron parameter programming; forwarded to the selected core while idle.
input wire prog_param_we,
input wire [CORE_ID_BITS-1:0] prog_param_core,
input wire [NEURON_BITS-1:0] prog_param_neuron,
input wire [4:0] prog_param_id,
input wire signed [DATA_WIDTH-1:0] prog_param_value,
// External stimulus: while idle, ext_valid injects ext_current into
// ext_neuron_id of the core selected by ext_core.
input wire ext_valid,
input wire [CORE_ID_BITS-1:0] ext_core,
input wire [NEURON_BITS-1:0] ext_neuron_id,
input wire signed [DATA_WIDTH-1:0] ext_current,
// State probe: probe_read is steered to the core selected by probe_core, and
// probe_data / probe_valid are muxed back from that same core.
input wire probe_read,
input wire [CORE_ID_BITS-1:0] probe_core,
input wire [NEURON_BITS-1:0] probe_neuron,
input wire [4:0] probe_state_id,
input wire [POOL_ADDR_BITS-1:0] probe_pool_addr,
output wire signed [DATA_WIDTH-1:0] probe_data,
output wire probe_valid,
// Status. timestep_done is a one-cycle pulse from the DONE state;
// spike_*_bus mirror the per-core spike outputs; mesh_state_out is the
// zero-extended 4-bit state; total_spikes accumulates the number of cores
// asserting spike valid each cycle; timestep_count increments in DONE.
output reg timestep_done,
output wire [NUM_CORES-1:0] spike_valid_bus,
output wire [NUM_CORES*NEURON_BITS-1:0] spike_id_bus,
output wire [5:0] mesh_state_out,
output reg [31:0] total_spikes,
output reg [31:0] timestep_count,
output wire [NUM_CORES-1:0] core_idle_bus,
// Chip link interface. In this module every link output is tied to zero and
// none of the link inputs is read.
output wire link_tx_push,
output wire [CORE_ID_BITS-1:0] link_tx_core,
output wire [NEURON_BITS-1:0] link_tx_neuron,
output wire [7:0] link_tx_payload,
input wire link_tx_full,
input wire [CORE_ID_BITS-1:0] link_rx_core,
input wire [NEURON_BITS-1:0] link_rx_neuron,
input wire signed [DATA_WIDTH-1:0] link_rx_current,
output wire link_rx_pop,
input wire link_rx_empty
);
// The chip-to-chip link is not driven by this module: every link output is
// held at zero, so nothing is ever pushed to TX or popped from RX.
assign link_tx_push    = 1'b0;
assign link_tx_core    = {CORE_ID_BITS{1'b0}};
assign link_tx_neuron  = {NEURON_BITS{1'b0}};
assign link_tx_payload = 8'h00;
assign link_rx_pop     = 1'b0;

// Router packet sizing: two mesh coordinates of COORD_BITS each, followed by
// a destination neuron id and a signed current value.
localparam COORD_BITS = 4;
localparam PACKET_W   = (COORD_BITS * 2) + NEURON_BITS + DATA_WIDTH;
// Mesh column (x coordinate) of a core. Cores are numbered row-major, so the
// column is the core id modulo the row length MESH_X.
function [COORD_BITS-1:0] core_to_x;
    input [CORE_ID_BITS-1:0] cid;
    begin
        core_to_x = cid % MESH_X;
    end
endfunction
// Mesh row (y coordinate) of a core. Cores are numbered row-major, so the
// row is the core id divided (integer division) by the row length MESH_X.
function [COORD_BITS-1:0] core_to_y;
    input [CORE_ID_BITS-1:0] cid;
    begin
        core_to_y = cid / MESH_X;
    end
endfunction
// Mesh sequencer state encoding (4 bits).
localparam SM_IDLE       = 4'd0,   // programming / external stimulus accepted
           SM_PKT_DRAIN  = 4'd1,   // router local outputs delivered to cores
           SM_START      = 4'd2,   // one-cycle start pulse to every core
           SM_RUN_WAIT   = 4'd3,   // cores run; spikes captured into FIFOs
           SM_ROUTE_POP  = 4'd4,   // fetch next captured spike (or next core)
           SM_ROUTE_ADDR = 4'd5,   // present route table address
           SM_ROUTE_WAIT = 4'd6,   // wait one cycle for route table data
           SM_ROUTE_READ = 4'd7,   // consume route entry, inject packet
           SM_GRT_ADDR   = 4'd8,   // present global route table address
           SM_GRT_WAIT   = 4'd9,   // wait one cycle for global route data
           SM_GRT_READ   = 4'd10,  // consume global route entry, inject packet
           SM_DONE       = 4'd11;  // end of timestep bookkeeping

// Current sequencer state; exported zero-extended to six bits.
reg [3:0] mesh_state;
assign mesh_state_out = {2'b00, mesh_state};
// ---------------------------------------------------------------------------
// Local route table.
// Port A is shared: the prog_route_* ports own it while the sequencer is in
// SM_IDLE, otherwise the sequencer's own registered address/write-enable do.
// Entry layout, MSB first: {valid, dest_core, dest_neuron, weight}.
// ---------------------------------------------------------------------------
reg                     rt_we;
reg  [ROUTE_ADDR_W-1:0] rt_addr;
wire [ROUTE_DATA_W-1:0] rt_rdata;

wire rt_prog_phase = (mesh_state == SM_IDLE);

wire [ROUTE_ADDR_W-1:0] rt_prog_addr =
    {prog_route_src_core, prog_route_src_neuron, prog_route_slot};
wire [ROUTE_DATA_W-1:0] rt_prog_entry =
    {1'b1, prog_route_dest_core, prog_route_dest_neuron, prog_route_weight};

wire                    rt_we_mux    = rt_prog_phase ? prog_route_we : rt_we;
wire [ROUTE_ADDR_W-1:0] rt_addr_mux  = rt_prog_phase ? rt_prog_addr  : rt_addr;
// Outside the programming phase the write data is held at all zeros.
wire [ROUTE_DATA_W-1:0] rt_wdata_mux = rt_prog_phase ? rt_prog_entry
                                                     : {ROUTE_DATA_W{1'b0}};

sram #(
    .DATA_WIDTH(ROUTE_DATA_W),
    .ADDR_WIDTH(ROUTE_ADDR_W)
) route_table (
    .clk    (clk),
    .we_a   (rt_we_mux),
    .addr_a (rt_addr_mux),
    .wdata_a(rt_wdata_mux),
    .rdata_a(rt_rdata),
    // Port B is unused: address tied to zero, read data left open.
    .addr_b ({ROUTE_ADDR_W{1'b0}}),
    .rdata_b()
);

// Field extraction from the entry most recently read on port A.
wire signed [DATA_WIDTH-1:0]   rt_weight    = rt_rdata[DATA_WIDTH-1:0];
wire        [NEURON_BITS-1:0]  rt_dest_nrn  = rt_rdata[DATA_WIDTH +: NEURON_BITS];
wire        [CORE_ID_BITS-1:0] rt_dest_core = rt_rdata[NEURON_BITS+DATA_WIDTH +: CORE_ID_BITS];
wire                           rt_valid     = rt_rdata[ROUTE_DATA_W-1];
// ---------------------------------------------------------------------------
// Global route table.
// Same sharing scheme and entry layout as the local route table: the
// prog_global_route_* ports own port A while the sequencer is in SM_IDLE,
// otherwise the sequencer's registered address/write-enable are used.
// Entry layout, MSB first: {valid, dest_core, dest_neuron, weight}.
// ---------------------------------------------------------------------------
reg                            grt_we;
reg  [GLOBAL_ROUTE_ADDR_W-1:0] grt_addr;
wire [ROUTE_DATA_W-1:0]        grt_rdata;

wire grt_prog_phase = (mesh_state == SM_IDLE);

wire [GLOBAL_ROUTE_ADDR_W-1:0] grt_prog_addr =
    {prog_global_route_src_core, prog_global_route_src_neuron, prog_global_route_slot};
wire [ROUTE_DATA_W-1:0] grt_prog_entry =
    {1'b1, prog_global_route_dest_core, prog_global_route_dest_neuron, prog_global_route_weight};

wire                           grt_we_mux    = grt_prog_phase ? prog_global_route_we : grt_we;
wire [GLOBAL_ROUTE_ADDR_W-1:0] grt_addr_mux  = grt_prog_phase ? grt_prog_addr        : grt_addr;
// Outside the programming phase the write data is held at all zeros.
wire [ROUTE_DATA_W-1:0]        grt_wdata_mux = grt_prog_phase ? grt_prog_entry
                                                              : {ROUTE_DATA_W{1'b0}};

sram #(
    .DATA_WIDTH(ROUTE_DATA_W),
    .ADDR_WIDTH(GLOBAL_ROUTE_ADDR_W)
) global_route_table (
    .clk    (clk),
    .we_a   (grt_we_mux),
    .addr_a (grt_addr_mux),
    .wdata_a(grt_wdata_mux),
    .rdata_a(grt_rdata),
    // Port B is unused: address tied to zero, read data left open.
    .addr_b ({GLOBAL_ROUTE_ADDR_W{1'b0}}),
    .rdata_b()
);

// Field extraction from the entry most recently read on port A.
wire signed [DATA_WIDTH-1:0]   grt_weight    = grt_rdata[DATA_WIDTH-1:0];
wire        [NEURON_BITS-1:0]  grt_dest_nrn  = grt_rdata[DATA_WIDTH +: NEURON_BITS];
wire        [CORE_ID_BITS-1:0] grt_dest_core = grt_rdata[NEURON_BITS+DATA_WIDTH +: CORE_ID_BITS];
wire                           grt_valid     = grt_rdata[ROUTE_DATA_W-1];
// Per-core status collected from the core instances (one bit / slice per core).
wire [NUM_CORES-1:0]             core_done;
wire [NUM_CORES-1:0]             core_spike_valid;
wire [NUM_CORES*NEURON_BITS-1:0] core_spike_id;
wire [NUM_CORES*8-1:0]           core_spike_payload;

// One-cycle start strobe per core, driven by the mesh sequencer.
reg  [NUM_CORES-1:0]             core_start_r;

// Sticky record of which cores have reported done. It is cleared by reset and
// whenever the sequencer is in SM_START; in every other state it accumulates
// the core_done bits.
reg  [NUM_CORES-1:0]             core_done_latch;

always @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        core_done_latch <= 0;
    end else begin
        case (mesh_state)
            SM_START: core_done_latch <= 0;
            default:  core_done_latch <= core_done_latch | core_done;
        endcase
    end
end

// Raw per-core spike outputs are exported unchanged.
assign spike_id_bus    = core_spike_id;
assign spike_valid_bus = core_spike_valid;
// Barrier over the sticky per-core done bits. The sequencer leaves
// SM_RUN_WAIT when sync_all_done is asserted.
// NOTE(review): presumably all_done means "every leaf_done bit is set";
// confirm against sync_tree, including any latency it adds.
wire sync_all_done;

sync_tree #(
    .NUM_LEAVES(NUM_CORES)
) u_sync (
    .clk       (clk),
    .rst_n     (rst_n),
    .leaf_done (core_done_latch),
    .all_done  (sync_all_done),
    // The start-broadcast side of the tree is not used here.
    .root_start(1'b0),
    .leaf_start()
);

// Capture FIFO word = {neuron id, 8-bit spike payload}.
localparam CAP_WIDTH = NEURON_BITS + 8;

// Per-core capture FIFO controls (driven by the sequencer) and status.
reg  [NUM_CORES-1:0]           cap_pop;
reg  [NUM_CORES-1:0]           cap_clear;
wire [NUM_CORES-1:0]           cap_empty;
wire [NUM_CORES*CAP_WIDTH-1:0] cap_data;

// Per-core probe results; the slice belonging to probe_core is exported.
wire [NUM_CORES-1:0]            core_probe_valid;
wire [NUM_CORES*DATA_WIDTH-1:0] core_probe_data;

assign probe_valid = core_probe_valid[probe_core];
assign probe_data  = core_probe_data[probe_core*DATA_WIDTH +: DATA_WIDTH];
// Number of set bits in a NUM_CORES-wide vector, returned as a 32-bit count.
function [31:0] popcount;
    input [NUM_CORES-1:0] bits;
    integer idx;
    begin
        popcount = 32'd0;
        // Addition is order-independent, so walk from the top bit down.
        for (idx = NUM_CORES - 1; idx >= 0; idx = idx - 1) begin
            popcount = popcount + bits[idx];
        end
    end
endfunction
// ---------------------------------------------------------------------------
// Router network A: per-router status, local port, and N/S/E/W link nets.
// Every vector carries one bit (or one PACKET_W slice) per core.
// ---------------------------------------------------------------------------
wire [NUM_CORES-1:0]          rtr_idle;
wire [NUM_CORES-1:0]          rtr_local_out_valid;
wire [NUM_CORES*PACKET_W-1:0] rtr_local_out_data;
wire [NUM_CORES-1:0]          rtr_local_in_ready;
reg  [NUM_CORES-1:0]          rtr_local_in_valid;
reg  [NUM_CORES*PACKET_W-1:0] rtr_local_in_data;

// Local outputs are accepted only during the packet-drain phase.
wire [NUM_CORES-1:0] rtr_local_out_ready = {NUM_CORES{mesh_state == SM_PKT_DRAIN}};

wire [NUM_CORES-1:0] rtr_n_out_v;
wire [NUM_CORES-1:0] rtr_s_out_v;
wire [NUM_CORES-1:0] rtr_e_out_v;
wire [NUM_CORES-1:0] rtr_w_out_v;

wire [NUM_CORES*PACKET_W-1:0] rtr_n_out_d;
wire [NUM_CORES*PACKET_W-1:0] rtr_s_out_d;
wire [NUM_CORES*PACKET_W-1:0] rtr_e_out_d;
wire [NUM_CORES*PACKET_W-1:0] rtr_w_out_d;

wire [NUM_CORES-1:0] rtr_n_in_r;
wire [NUM_CORES-1:0] rtr_s_in_r;
wire [NUM_CORES-1:0] rtr_e_in_r;
wire [NUM_CORES-1:0] rtr_w_in_r;

// ---------------------------------------------------------------------------
// Router network B: same set of nets for the optional second network.
// ---------------------------------------------------------------------------
wire [NUM_CORES-1:0]          rtr_b_idle;
wire [NUM_CORES-1:0]          rtr_b_local_out_valid;
wire [NUM_CORES*PACKET_W-1:0] rtr_b_local_out_data;
wire [NUM_CORES-1:0]          rtr_b_local_in_ready;
reg  [NUM_CORES-1:0]          rtr_b_local_in_valid;
reg  [NUM_CORES*PACKET_W-1:0] rtr_b_local_in_data;

// Network B yields to network A: during the packet-drain phase a B local
// output is accepted only when the matching A local output is not valid.
wire [NUM_CORES-1:0] rtr_b_local_out_ready =
    {NUM_CORES{mesh_state == SM_PKT_DRAIN}} & ~rtr_local_out_valid;

wire [NUM_CORES-1:0] rtr_b_n_out_v;
wire [NUM_CORES-1:0] rtr_b_s_out_v;
wire [NUM_CORES-1:0] rtr_b_e_out_v;
wire [NUM_CORES-1:0] rtr_b_w_out_v;

wire [NUM_CORES*PACKET_W-1:0] rtr_b_n_out_d;
wire [NUM_CORES*PACKET_W-1:0] rtr_b_s_out_d;
wire [NUM_CORES*PACKET_W-1:0] rtr_b_e_out_d;
wire [NUM_CORES*PACKET_W-1:0] rtr_b_w_out_d;

wire [NUM_CORES-1:0] rtr_b_n_in_r;
wire [NUM_CORES-1:0] rtr_b_s_in_r;
wire [NUM_CORES-1:0] rtr_b_e_in_r;
wire [NUM_CORES-1:0] rtr_b_w_in_r;
// ---------------------------------------------------------------------------
// Per-core generate loop: for each core id gi this creates the core itself,
// its spike capture FIFO, and its network-A router with neighbour wiring.
// ---------------------------------------------------------------------------
genvar gi;
generate
for (gi = 0; gi < NUM_CORES; gi = gi + 1) begin : gen_core
// The core's external-input port has two sources:
//  - in SM_IDLE: the top-level ext_* stimulus when ext_core selects this core;
//  - in SM_PKT_DRAIN: a packet delivered on this core's router local output
//    (network A or network B).
wire this_ext_valid =
(mesh_state == SM_IDLE && ext_valid && ext_core == gi[CORE_ID_BITS-1:0]) ||
(mesh_state == SM_PKT_DRAIN && (rtr_local_out_valid[gi] || rtr_b_local_out_valid[gi]));
// Network A has priority; network B's packet is used only when A is not valid.
wire [PACKET_W-1:0] drain_pkt = rtr_local_out_valid[gi] ?
rtr_local_out_data[gi*PACKET_W +: PACKET_W] :
rtr_b_local_out_data[gi*PACKET_W +: PACKET_W];
// Packet low bits are {neuron id, current}; the coordinate bits above them
// are not used on delivery.
wire [NEURON_BITS-1:0] this_ext_nid =
(mesh_state == SM_PKT_DRAIN) ? drain_pkt[DATA_WIDTH +: NEURON_BITS] : ext_neuron_id;
wire signed [DATA_WIDTH-1:0] this_ext_cur =
(mesh_state == SM_PKT_DRAIN) ? drain_pkt[DATA_WIDTH-1:0] : ext_current;
// Programming write enables: asserted only for the addressed core and only
// while the sequencer is in SM_IDLE. Address/data buses are shared by all cores.
wire this_pool_we = prog_pool_we && (prog_pool_core == gi[CORE_ID_BITS-1:0]) &&
(mesh_state == SM_IDLE);
wire this_index_we = prog_index_we && (prog_index_core == gi[CORE_ID_BITS-1:0]) &&
(mesh_state == SM_IDLE);
wire this_param_we = prog_param_we && (prog_param_core == gi[CORE_ID_BITS-1:0]) &&
(mesh_state == SM_IDLE);
wire this_delay_we = prog_delay_we && (prog_delay_core == gi[CORE_ID_BITS-1:0]) &&
(mesh_state == SM_IDLE);
wire this_ucode_we = prog_ucode_we && (prog_ucode_core == gi[CORE_ID_BITS-1:0]) &&
(mesh_state == SM_IDLE);
// Neuron core. The core's own state_out / total_spikes / timestep_count
// outputs are left unconnected; the mesh keeps its own counters.
scalable_core_v2 #(
.NUM_NEURONS(NUM_NEURONS), .NEURON_BITS(NEURON_BITS),
.DATA_WIDTH(DATA_WIDTH), .POOL_DEPTH(POOL_DEPTH),
.POOL_ADDR_BITS(POOL_ADDR_BITS), .COUNT_BITS(COUNT_BITS),
.REV_FANIN(REV_FANIN), .REV_SLOT_BITS(REV_SLOT_BITS),
.THRESHOLD(THRESHOLD), .LEAK_RATE(LEAK_RATE),
.REFRAC_CYCLES(REFRAC_CYCLES), .GRADE_SHIFT(GRADE_SHIFT)
) core (
.clk(clk), .rst_n(rst_n),
.start(core_start_r[gi]),
.learn_enable(learn_enable), .graded_enable(graded_enable),
.dendritic_enable(dendritic_enable),
.threefactor_enable(threefactor_enable),
.noise_enable(noise_enable), .skip_idle_enable(skip_idle_enable),
.scale_u_enable(scale_u_enable),
.reward_value(reward_value),
.ext_valid(this_ext_valid),
.ext_neuron_id(this_ext_nid),
.ext_current(this_ext_cur),
.pool_we(this_pool_we), .pool_addr_in(prog_pool_addr),
.pool_src_in(prog_pool_src), .pool_target_in(prog_pool_target),
.pool_weight_in(prog_pool_weight), .pool_comp_in(prog_pool_comp),
.index_we(this_index_we), .index_neuron_in(prog_index_neuron),
.index_base_in(prog_index_base), .index_count_in(prog_index_count),
.index_format_in(prog_index_format),
.delay_we(this_delay_we), .delay_addr_in(prog_delay_addr),
.delay_value_in(prog_delay_value),
.ucode_prog_we(this_ucode_we), .ucode_prog_addr(prog_ucode_addr),
.ucode_prog_data(prog_ucode_data),
.prog_param_we(this_param_we), .prog_param_neuron(prog_param_neuron),
.prog_param_id(prog_param_id), .prog_param_value(prog_param_value),
// probe_read reaches only the core selected by probe_core (not state-gated).
.probe_read(probe_read && (probe_core == gi[CORE_ID_BITS-1:0])),
.probe_neuron(probe_neuron), .probe_state_id(probe_state_id),
.probe_pool_addr(probe_pool_addr),
.probe_data(core_probe_data[gi*DATA_WIDTH +: DATA_WIDTH]),
.probe_valid(core_probe_valid[gi]),
.timestep_done(core_done[gi]),
.spike_out_valid(core_spike_valid[gi]),
.spike_out_id(core_spike_id[gi*NEURON_BITS +: NEURON_BITS]),
.spike_out_payload(core_spike_payload[gi*8 +: 8]),
.state_out(), .total_spikes(), .timestep_count(),
.core_idle(core_idle_bus[gi])
);
// Spike capture FIFO: stores {neuron id, payload} for each spike the core
// emits while the sequencer is in SM_RUN_WAIT. The full and count outputs are
// unconnected.
// NOTE(review): with full unconnected there is no visible handling of more
// than DEPTH spikes per core per timestep - confirm spike_fifo overflow
// behaviour.
spike_fifo #(.ID_WIDTH(CAP_WIDTH), .DEPTH(64), .PTR_BITS(6)) capture_fifo (
.clk(clk), .rst_n(rst_n), .clear(cap_clear[gi]),
.push(core_spike_valid[gi] && (mesh_state == SM_RUN_WAIT)),
.push_data({core_spike_id[gi*NEURON_BITS +: NEURON_BITS],
core_spike_payload[gi*8 +: 8]}),
.pop(cap_pop[gi]),
.pop_data(cap_data[gi*CAP_WIDTH +: CAP_WIDTH]),
.empty(cap_empty[gi]), .full(), .count()
);
// Mesh position of this router (row-major numbering) and which neighbours
// exist. "North" is the router at RY+1, "south" at RY-1, "east" at RX+1,
// "west" at RX-1. A missing neighbour's id is set to 0 and never selected.
localparam RX = gi % MESH_X;
localparam RY = gi / MESH_X;
localparam HAS_N = (RY < MESH_Y - 1) ? 1 : 0;
localparam HAS_S = (RY > 0) ? 1 : 0;
localparam HAS_E = (RX < MESH_X - 1) ? 1 : 0;
localparam HAS_W = (RX > 0) ? 1 : 0;
localparam N_ID = HAS_N ? ((RY+1)*MESH_X + RX) : 0;
localparam S_ID = HAS_S ? ((RY-1)*MESH_X + RX) : 0;
localparam E_ID = HAS_E ? (RY*MESH_X + (RX+1)) : 0;
localparam W_ID = HAS_W ? (RY*MESH_X + (RX-1)) : 0;
// Link wiring: each input comes from the facing output of the neighbour
// (our north input <- neighbour's south output, and so on), and each output's
// ready comes from the neighbour's facing input-ready. At a mesh edge the
// input is tied invalid/zero and the output ready is tied high.
wire n_in_v = HAS_N ? rtr_s_out_v[N_ID] : 1'b0;
wire [PACKET_W-1:0] n_in_d = HAS_N ? rtr_s_out_d[N_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire n_out_r = HAS_N ? rtr_s_in_r[N_ID] : 1'b1;
wire s_in_v = HAS_S ? rtr_n_out_v[S_ID] : 1'b0;
wire [PACKET_W-1:0] s_in_d = HAS_S ? rtr_n_out_d[S_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire s_out_r = HAS_S ? rtr_n_in_r[S_ID] : 1'b1;
wire e_in_v = HAS_E ? rtr_w_out_v[E_ID] : 1'b0;
wire [PACKET_W-1:0] e_in_d = HAS_E ? rtr_w_out_d[E_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire e_out_r = HAS_E ? rtr_w_in_r[E_ID] : 1'b1;
wire w_in_v = HAS_W ? rtr_e_out_v[W_ID] : 1'b0;
wire [PACKET_W-1:0] w_in_d = HAS_W ? rtr_e_out_d[W_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire w_out_r = HAS_W ? rtr_e_in_r[W_ID] : 1'b1;
// Network-A router for this core. The local input is driven by the mesh
// sequencer (route injection); the local output feeds the core during
// SM_PKT_DRAIN.
async_router #(
.PACKET_W(PACKET_W), .COORD_BITS(COORD_BITS),
.FIFO_DEPTH(16), .FIFO_PTR_BITS(4)
) router (
.clk(clk), .rst_n(rst_n),
.my_x(core_to_x(gi[CORE_ID_BITS-1:0])),
.my_y(core_to_y(gi[CORE_ID_BITS-1:0])),
.local_in_valid (rtr_local_in_valid[gi]),
.local_in_ready (rtr_local_in_ready[gi]),
.local_in_data (rtr_local_in_data[gi*PACKET_W +: PACKET_W]),
.local_out_valid(rtr_local_out_valid[gi]),
.local_out_ready(rtr_local_out_ready[gi]),
.local_out_data (rtr_local_out_data[gi*PACKET_W +: PACKET_W]),
.north_in_valid (n_in_v),
.north_in_ready (rtr_n_in_r[gi]),
.north_in_data (n_in_d),
.north_out_valid(rtr_n_out_v[gi]),
.north_out_ready(n_out_r),
.north_out_data (rtr_n_out_d[gi*PACKET_W +: PACKET_W]),
.south_in_valid (s_in_v),
.south_in_ready (rtr_s_in_r[gi]),
.south_in_data (s_in_d),
.south_out_valid(rtr_s_out_v[gi]),
.south_out_ready(s_out_r),
.south_out_data (rtr_s_out_d[gi*PACKET_W +: PACKET_W]),
.east_in_valid (e_in_v),
.east_in_ready (rtr_e_in_r[gi]),
.east_in_data (e_in_d),
.east_out_valid (rtr_e_out_v[gi]),
.east_out_ready (e_out_r),
.east_out_data (rtr_e_out_d[gi*PACKET_W +: PACKET_W]),
.west_in_valid (w_in_v),
.west_in_ready (rtr_w_in_r[gi]),
.west_in_data (w_in_d),
.west_out_valid (rtr_w_out_v[gi]),
.west_out_ready (w_out_r),
.west_out_data (rtr_w_out_d[gi*PACKET_W +: PACKET_W]),
.idle (rtr_idle[gi])
);
end
endgenerate
// ---------------------------------------------------------------------------
// Optional second router network ("network B").
// DUAL_NOC != 0: one extra async_router per core, wired to its neighbours
//                exactly like network A but on the rtr_b_* nets.
// DUAL_NOC == 0: no routers are built; the rtr_b_* status nets read by the
//                rest of the module are tied to "idle, nothing valid, always
//                ready".
// ---------------------------------------------------------------------------
generate if (DUAL_NOC) begin : gen_net_b
genvar bi;
for (bi = 0; bi < NUM_CORES; bi = bi + 1) begin : gen_rtr_b
// Mesh position and neighbour ids (row-major numbering); a missing
// neighbour's id is set to 0 and never selected.
localparam BRX = bi % MESH_X;
localparam BRY = bi / MESH_X;
localparam B_HAS_N = (BRY < MESH_Y - 1) ? 1 : 0;
localparam B_HAS_S = (BRY > 0) ? 1 : 0;
localparam B_HAS_E = (BRX < MESH_X - 1) ? 1 : 0;
localparam B_HAS_W = (BRX > 0) ? 1 : 0;
localparam BN_ID = B_HAS_N ? ((BRY+1)*MESH_X + BRX) : 0;
localparam BS_ID = B_HAS_S ? ((BRY-1)*MESH_X + BRX) : 0;
localparam BE_ID = B_HAS_E ? (BRY*MESH_X + (BRX+1)) : 0;
localparam BW_ID = B_HAS_W ? (BRY*MESH_X + (BRX-1)) : 0;
// Each input comes from the neighbour's facing output; each output's ready
// comes from the neighbour's facing input-ready. Edge links are tied off
// (input invalid/zero, output ready high).
wire bn_in_v = B_HAS_N ? rtr_b_s_out_v[BN_ID] : 1'b0;
wire [PACKET_W-1:0] bn_in_d = B_HAS_N ?
rtr_b_s_out_d[BN_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire bn_out_r = B_HAS_N ? rtr_b_s_in_r[BN_ID] : 1'b1;
wire bs_in_v = B_HAS_S ? rtr_b_n_out_v[BS_ID] : 1'b0;
wire [PACKET_W-1:0] bs_in_d = B_HAS_S ?
rtr_b_n_out_d[BS_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire bs_out_r = B_HAS_S ? rtr_b_n_in_r[BS_ID] : 1'b1;
wire be_in_v = B_HAS_E ? rtr_b_w_out_v[BE_ID] : 1'b0;
wire [PACKET_W-1:0] be_in_d = B_HAS_E ?
rtr_b_w_out_d[BE_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire be_out_r = B_HAS_E ? rtr_b_w_in_r[BE_ID] : 1'b1;
wire bw_in_v = B_HAS_W ? rtr_b_e_out_v[BW_ID] : 1'b0;
wire [PACKET_W-1:0] bw_in_d = B_HAS_W ?
rtr_b_e_out_d[BW_ID*PACKET_W +: PACKET_W] : {PACKET_W{1'b0}};
wire bw_out_r = B_HAS_W ? rtr_b_e_in_r[BW_ID] : 1'b1;
// Network-B router; same parameters as the network-A router.
async_router #(
.PACKET_W(PACKET_W), .COORD_BITS(COORD_BITS),
.FIFO_DEPTH(16), .FIFO_PTR_BITS(4)
) router_b (
.clk(clk), .rst_n(rst_n),
.my_x(core_to_x(bi[CORE_ID_BITS-1:0])),
.my_y(core_to_y(bi[CORE_ID_BITS-1:0])),
.local_in_valid (rtr_b_local_in_valid[bi]),
.local_in_ready (rtr_b_local_in_ready[bi]),
.local_in_data (rtr_b_local_in_data[bi*PACKET_W +: PACKET_W]),
.local_out_valid(rtr_b_local_out_valid[bi]),
.local_out_ready(rtr_b_local_out_ready[bi]),
.local_out_data (rtr_b_local_out_data[bi*PACKET_W +: PACKET_W]),
.north_in_valid (bn_in_v),
.north_in_ready (rtr_b_n_in_r[bi]),
.north_in_data (bn_in_d),
.north_out_valid(rtr_b_n_out_v[bi]),
.north_out_ready(bn_out_r),
.north_out_data (rtr_b_n_out_d[bi*PACKET_W +: PACKET_W]),
.south_in_valid (bs_in_v),
.south_in_ready (rtr_b_s_in_r[bi]),
.south_in_data (bs_in_d),
.south_out_valid(rtr_b_s_out_v[bi]),
.south_out_ready(bs_out_r),
.south_out_data (rtr_b_s_out_d[bi*PACKET_W +: PACKET_W]),
.east_in_valid (be_in_v),
.east_in_ready (rtr_b_e_in_r[bi]),
.east_in_data (be_in_d),
.east_out_valid (rtr_b_e_out_v[bi]),
.east_out_ready (be_out_r),
.east_out_data (rtr_b_e_out_d[bi*PACKET_W +: PACKET_W]),
.west_in_valid (bw_in_v),
.west_in_ready (rtr_b_w_in_r[bi]),
.west_in_data (bw_in_d),
.west_out_valid (rtr_b_w_out_v[bi]),
.west_out_ready (bw_out_r),
.west_out_data (rtr_b_w_out_d[bi*PACKET_W +: PACKET_W]),
.idle (rtr_b_idle[bi])
);
end
end else begin : gen_no_net_b
// No network B: report it permanently idle with no packets to deliver, so the
// drain-phase exit condition and the per-core delivery mux ignore it.
assign rtr_b_idle = {NUM_CORES{1'b1}};
assign rtr_b_local_out_valid = {NUM_CORES{1'b0}};
assign rtr_b_local_out_data = {NUM_CORES*PACKET_W{1'b0}};
assign rtr_b_local_in_ready = {NUM_CORES{1'b1}};
end endgenerate
// Routing-phase working registers: the core whose capture FIFO is being
// emptied, the spike currently being routed (neuron id + 8-bit payload), and
// the slot counters for the local and global route table scans.
reg [CORE_ID_BITS-1:0] route_core_idx;
reg [NEURON_BITS-1:0] route_neuron;
reg [7:0] route_payload;
reg [ROUTE_SLOT_BITS-1:0] route_slot;
reg [GLOBAL_ROUTE_SLOT_BITS-1:0] global_slot;
// Counts consecutive quiet cycles in SM_PKT_DRAIN.
reg [3:0] drain_wait;
// Graded current for local routes: the signed route weight is sign-extended
// to 32 bits, the payload is zero-extended, and the 32-bit product is
// arithmetically shifted right by GRADE_SHIFT before being truncated to
// DATA_WIDTH bits (no saturation is applied here).
wire signed [31:0] route_weight_ext = rt_weight;
wire signed [31:0] route_payload_ext = {24'd0, route_payload};
wire signed [31:0] route_graded_product = route_weight_ext * route_payload_ext;
wire signed [DATA_WIDTH-1:0] route_graded_current = route_graded_product >>> GRADE_SHIFT;
// Same computation for global routes, reusing the zero-extended payload.
wire signed [31:0] grt_weight_ext = grt_weight;
wire signed [31:0] grt_graded_product = grt_weight_ext * route_payload_ext;
wire signed [DATA_WIDTH-1:0] grt_graded_current = grt_graded_product >>> GRADE_SHIFT;
// Current actually placed in the packet: graded value when graded_enable is
// set, otherwise the raw table weight.
wire signed [DATA_WIDTH-1:0] rt_eff_weight = graded_enable ? route_graded_current : rt_weight;
wire signed [DATA_WIDTH-1:0] grt_eff_weight = graded_enable ? grt_graded_current : grt_weight;
// ---------------------------------------------------------------------------
// Mesh sequencer.
// One timestep: IDLE -> PKT_DRAIN -> START -> RUN_WAIT -> for every core, for
// every captured spike: scan ROUTE_FANOUT local route slots, then
// GLOBAL_ROUTE_SLOTS global route slots -> DONE -> IDLE.
// Strobe-type outputs are given a default of 0 at the top of every clocked
// pass and overridden by the active state, so each is a one-cycle pulse.
// ---------------------------------------------------------------------------
always @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
mesh_state <= SM_IDLE;
timestep_done <= 0;
total_spikes <= 0;
timestep_count <= 0;
core_start_r <= 0;
route_core_idx <= 0;
route_neuron <= 0;
route_payload <= 0;
route_slot <= 0;
global_slot <= 0;
drain_wait <= 0;
rt_we <= 0;
rt_addr <= 0;
grt_we <= 0;
grt_addr <= 0;
cap_pop <= 0;
cap_clear <= 0;
rtr_local_in_valid <= 0;
rtr_local_in_data <= 0;
rtr_b_local_in_valid <= 0;
rtr_b_local_in_data <= 0;
end else begin
// Defaults: every strobe is deasserted unless the current state sets it below.
timestep_done <= 0;
core_start_r <= 0;
rt_we <= 0;
grt_we <= 0;
cap_pop <= 0;
cap_clear <= 0;
rtr_local_in_valid <= 0;
rtr_b_local_in_valid <= 0;
// Spike counter runs in every state: adds the number of cores asserting
// spike valid this cycle.
total_spikes <= total_spikes + popcount(core_spike_valid);
case (mesh_state)
// Wait for a start request. Programming and external stimulus are accepted
// only in this state (gated elsewhere on mesh_state == SM_IDLE).
SM_IDLE: begin
if (start) begin
drain_wait <= 0;
mesh_state <= SM_PKT_DRAIN;
end
end
// Deliver in-flight packets to the cores. A cycle is "quiet" when all routers
// of both networks report idle and no local output is valid; drain_wait
// counts consecutive quiet cycles and is reset by any non-quiet cycle. The
// state exits on a quiet cycle once drain_wait has reached 3, i.e. after
// four consecutive quiet cycles.
SM_PKT_DRAIN: begin
if ((&rtr_idle) && (&rtr_b_idle) && !(|rtr_local_out_valid) && !(|rtr_b_local_out_valid)) begin
drain_wait <= drain_wait + 1;
if (drain_wait >= 4'd3)
mesh_state <= SM_START;
end else begin
drain_wait <= 0;
end
end
// Pulse start to every core for one cycle (core_done_latch is cleared in
// this state by its own always block).
SM_START: begin
core_start_r <= {NUM_CORES{1'b1}};
mesh_state <= SM_RUN_WAIT;
end
// Cores run; their spikes are pushed into the capture FIFOs. Leave once the
// sync tree reports that all cores are done.
SM_RUN_WAIT: begin
if (sync_all_done) begin
route_core_idx <= 0;
mesh_state <= SM_ROUTE_POP;
end
end
// Take the next captured spike of the current core. If its FIFO is empty,
// advance to the next core, or finish after the last core.
SM_ROUTE_POP: begin
if (cap_empty[route_core_idx]) begin
if (route_core_idx == NUM_CORES - 1)
mesh_state <= SM_DONE;
else
route_core_idx <= route_core_idx + 1;
end else begin
// Latch the FIFO head word ({neuron id, payload}) and request a pop; cap_pop
// is registered, so the pop strobe is seen by the FIFO on the next cycle.
cap_pop[route_core_idx] <= 1;
route_neuron <= cap_data[route_core_idx * CAP_WIDTH + 8 +: NEURON_BITS];
route_payload <= cap_data[route_core_idx * CAP_WIDTH +: 8];
route_slot <= 0;
mesh_state <= SM_ROUTE_ADDR;
end
end
// Present {source core, source neuron, slot} to the local route table.
SM_ROUTE_ADDR: begin
rt_addr <= {route_core_idx, route_neuron, route_slot};
mesh_state <= SM_ROUTE_WAIT;
end
// One-cycle gap between issuing the address and using the read data
// (presumably covers the sram read latency - confirm against sram).
SM_ROUTE_WAIT: begin
mesh_state <= SM_ROUTE_READ;
end
// Use the route entry: if its valid bit is set, inject a packet
// {dest_x, dest_y, dest_neuron, current} into the source core's router.
// With DUAL_NOC set, even-numbered source cores inject into network A and
// odd-numbered ones into network B; otherwise network A is always used.
// NOTE(review): the packet is injected only if the router's local input is
// ready; when it is not ready the entry is skipped and the slot counter
// still advances, so that event is not delivered. Confirm this is intended.
SM_ROUTE_READ: begin
if (rt_valid) begin
if (route_core_idx[0] == 1'b0 || !DUAL_NOC) begin
if (rtr_local_in_ready[route_core_idx]) begin
rtr_local_in_valid[route_core_idx] <= 1;
rtr_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <=
{core_to_x(rt_dest_core), core_to_y(rt_dest_core),
rt_dest_nrn, rt_eff_weight};
end
end else begin
if (rtr_b_local_in_ready[route_core_idx]) begin
rtr_b_local_in_valid[route_core_idx] <= 1;
rtr_b_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <=
{core_to_x(rt_dest_core), core_to_y(rt_dest_core),
rt_dest_nrn, rt_eff_weight};
end
end
end
// All ROUTE_FANOUT slots are scanned regardless of valid bits; afterwards
// continue with the global route table for the same spike.
if (route_slot < ROUTE_FANOUT - 1) begin
route_slot <= route_slot + 1;
mesh_state <= SM_ROUTE_ADDR;
end else begin
global_slot <= 0;
mesh_state <= SM_GRT_ADDR;
end
end
// Present {source core, source neuron, global slot} to the global route table.
SM_GRT_ADDR: begin
grt_addr <= {route_core_idx, route_neuron, global_slot};
mesh_state <= SM_GRT_WAIT;
end
// One-cycle gap before the global route read data is used.
SM_GRT_WAIT: begin
mesh_state <= SM_GRT_READ;
end
// Same injection rule as SM_ROUTE_READ, using the global route entry.
// NOTE(review): as above, a valid entry is skipped when the router's local
// input is not ready.
SM_GRT_READ: begin
if (grt_valid) begin
if (route_core_idx[0] == 1'b0 || !DUAL_NOC) begin
if (rtr_local_in_ready[route_core_idx]) begin
rtr_local_in_valid[route_core_idx] <= 1;
rtr_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <=
{core_to_x(grt_dest_core), core_to_y(grt_dest_core),
grt_dest_nrn, grt_eff_weight};
end
end else begin
if (rtr_b_local_in_ready[route_core_idx]) begin
rtr_b_local_in_valid[route_core_idx] <= 1;
rtr_b_local_in_data[route_core_idx*PACKET_W +: PACKET_W] <=
{core_to_x(grt_dest_core), core_to_y(grt_dest_core),
grt_dest_nrn, grt_eff_weight};
end
end
end
// After the last global slot, go back for the next captured spike.
if (global_slot < GLOBAL_ROUTE_SLOTS - 1) begin
global_slot <= global_slot + 1;
mesh_state <= SM_GRT_ADDR;
end else begin
mesh_state <= SM_ROUTE_POP;
end
end
// End of timestep: clear every capture FIFO, pulse timestep_done, bump the
// timestep counter and return to idle. Packets injected during routing are
// delivered in the SM_PKT_DRAIN phase of the next timestep.
SM_DONE: begin
cap_clear <= {NUM_CORES{1'b1}};
timestep_done <= 1;
timestep_count <= timestep_count + 1;
mesh_state <= SM_IDLE;
end
// Unused encodings (12..15) recover to idle.
default: mesh_state <= SM_IDLE;
endcase
end
end
endmodule