Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 156 additions & 50 deletions parmys/parmys-plugin/core/multiplier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b)
*-----------------------------------------------------------------------*/
void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist)
{
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addbig;
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addsmall2, *addbig;
int size;

/* Check for a legitimate split */
Expand Down Expand Up @@ -976,50 +976,153 @@ void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *
init_split_multiplier(node, a1b0, a0, a1, 0, b0, a1b1, a0b0);
mult_list = insert_in_vptr_list(mult_list, a1b0);

/* New node for the initial add */
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
// this addition will have a carry out in the worst case, add to input pins and connect them to gnd
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);

/* New node for the BIG add */
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add1");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++)
connect_nodes(a1b0, i, addsmall, i);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++)
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);

// connect inputs to port a of addbig
size = addsmall->num_output_pins;
for (int i = 0; i < size; i++)
connect_nodes(addsmall, i, addbig, i);

// connect inputs to port b of addbig
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
connect_nodes(a0b0, i, addbig, i - b0 + size);
size = size + a0b0->output_port_sizes[0] - b0;
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
connect_nodes(a1b1, i, addbig, i + size);

// remap the multiplier outputs coming directly from a0b0
for (int i = 0; i < b0; i++) {
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
}
// using the balanced addition method only works if a0 and b0 are the same size
// (i.e. if the input ports on the hardware multiplier are equal)
if (b0 == a0) {
/* New node for the initial add */
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
// this addition will have a carry out in the worst case, add to input pins and connect them to gnd
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++)
connect_nodes(a1b0, i, addsmall, i);

add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++)
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);

/* New node for the BIG add */
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add1");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);

// connect inputs to port a of addbig
size = addsmall->num_output_pins;
for (int i = 0; i < size; i++)
connect_nodes(addsmall, i, addbig, i);

// connect inputs to port b of addbig
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
connect_nodes(a0b0, i, addbig, i - b0 + size);
size = size + a0b0->output_port_sizes[0] - b0;
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
connect_nodes(a1b1, i, addbig, i + size);

// remap the multiplier outputs coming directly from a0b0
for (int i = 0; i < b0; i++) {
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
}

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
}
} else {
/* Expounding upon the description for the method in this function.
if we have two numbers A and B and we have a hardware multiplier of size a0xb0,
we can split them into two parts:
A = A1 << a0 + A0
B = B1 << b0 + B0
where A1 and B1 are the high bits of A and B, and A0 and B0 are the low bits.
Note that len(A0) = a0 and len(B0) = b0 by definition.
The multiplication of A and B can be expressed as:
A * B = (A1 << a0 + A0) * (B1 << b0 + B0)
= {A1 * B1 << (a0 + b0)} + {(A1 * B0) << a0 + (A0 * B1) << b0} + {A0 * B0}
we split the additions up like so:
addsmall = (A1 * B0) << a0 + (A0 * B1) << b0 // can have carry
addsmall2 = (A1 * B1 << (a0 + b0)) + (A0 * B0) // Will not have carry
addbig = addsmall + addsmall2
This is a slightly modified version of the Karatsuba algorithm.
*/
/////////////// Addsmall /////////////////////
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + a0 + 1, a0b1->num_output_pins + b0 + 1);

// The first a0 pins of addsmall input connecting to a1b0 are connected to zero
for (int i = 0; i < a0; i++) {
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i);
}

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++) {
connect_nodes(a1b0, i, addsmall, i + a0);
}

// add zero pin for carry
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins + a0);

// The first b0 pins of addsmall input connecting to a0b1 are connected to zero
for (int i = 0; i < b0; i++) {
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i + addsmall->input_port_sizes[0]);
}

// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++) {
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0] + b0);
}

// add zero pin for carry
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0] + b0);

/////////////// Addsmall2 /////////////////////
addsmall2 = allocate_nnode(node->loc);
addsmall2->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall2->name, node->name);
strcat(addsmall2->name, "-add1");
init_multiplier_adder(addsmall2, a1b1, a1b1->num_output_pins + a0 + b0, a0b0->num_output_pins);

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
// connect the first a0 + b0 pins of addsmall2 to zero
for (int i = 0; i < a0 + b0; i++) {
add_input_pin_to_node(addsmall2, get_zero_pin(netlist), i);
}

// connect inputs to port a of addsmall2
for (int i = 0; i < a1b1->num_output_pins; i++) {
connect_nodes(a1b1, i, addsmall2, i + a0 + b0);
}

// connect inputs to port b of addsmall2
for (int i = 0; i < a0b0->output_port_sizes[0]; i++) {
connect_nodes(a0b0, i, addsmall2, i + addsmall2->input_port_sizes[0]);
}

/////////////// Addbig /////////////////////
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add2");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, addsmall2->num_output_pins);
// Here the final addition can have a carry out in the worst case; however,
// our final product will always only be the length of the longest input port, so regardless of the carry, the
// final add's carry will always drop out.

// connect inputs to port a of addbig
for (int i = 0; i < addsmall->num_output_pins; i++) {
connect_nodes(addsmall, i, addbig, i);
}
// add_input_pin_to_node(addbig, get_zero_pin(netlist), addsmall->num_output_pins);

// connect inputs to port b of addbig
for (int i = 0; i < addsmall2->num_output_pins; i++) {
connect_nodes(addsmall2, i, addbig, i + addbig->input_port_sizes[0]);
}
// add_input_pin_to_node(addbig, get_zero_pin(netlist), addbig->input_port_sizes[0] + addsmall->num_output_pins);

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i], addbig, i);
}
}

// CLEAN UP
Expand Down Expand Up @@ -1060,7 +1163,6 @@ void split_multiplier_a(nnode_t *node, int a0, int a1, int b)
strcat(a0b->name, "-0");
init_split_multiplier(node, a0b, 0, a0, 0, b, nullptr, nullptr);
mult_list = insert_in_vptr_list(mult_list, a0b);

/* New node for a1b multiply */
a1b = allocate_nnode(node->loc);
a1b->name = (char *)vtr::malloc(strlen(node->name) + 3);
Expand Down Expand Up @@ -1184,7 +1286,6 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)

oassert(node->type == MULTIPLY);
oassert(hard_multipliers != NULL);

sizea = node->input_port_sizes[0];
sizeb = node->input_port_sizes[1];
sizeout = node->output_port_sizes[0];
Expand All @@ -1199,6 +1300,13 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
}
diffa = ina - sizea;
diffb = inb - sizeb;
// the input multiplier size falls in the middle range of an unequal hard block size (e.g. mul_size > 18 && mul_size < 25)
if (diffb < 0) {
std::swap(ina, inb);
diffa = ina - sizea;
diffb = inb - sizeb;
}

diffout = hard_multipliers->outputs->size - sizeout;

if (configuration.split_hard_multiplier == 1) {
Expand Down Expand Up @@ -1281,11 +1389,10 @@ void iterate_multipliers(netlist_t *netlist)
int mula, mulb;
int a0, a1, b0, b1;
nnode_t *node;

/* Can only perform the optimisation if hard multipliers exist! */
if (hard_multipliers == NULL)
return;

// std::cin.get();
sizea = hard_multipliers->inputs->size;
sizeb = hard_multipliers->inputs->next->size;
if (sizea < sizeb) {
Expand Down Expand Up @@ -1313,7 +1420,6 @@ void iterate_multipliers(netlist_t *netlist)
sizea = sizeb;
sizeb = swap;
}

/* Do I need to split the multiplier on both inputs? */
if ((mula > sizea) && (mulb > sizeb)) {
a0 = sizea;
Expand Down Expand Up @@ -1890,4 +1996,4 @@ void free_multipliers()

hard_multipliers->instances = NULL;
}
}
}
1 change: 1 addition & 0 deletions parmys/parmys-plugin/netlist/netlist_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net)
*-----------------------------------------------------------------------*/
void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx)
{
oassert(pin != NULL);
if (pin->type == INPUT) {
/* clean out the entry in the old net */
pin->node->input_pins[pin->pin_node_idx] = NULL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ circuits_dir=benchmarks/verilog
arch_list_add=7series_BRAM_DSP_carry.xml

# Add circuits to list to sweep
circuit_list_add=mcml.v
circuit_list_add=LU32PEEng.v
circuit_list_add=LU8PEEng.v
circuit_list_add=bgm.v
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ arch_list_add=7series_BRAM_DSP_carry.xml

# Add circuits to list to sweep
circuit_list_add=stereovision3.v
circuit_list_add=diffeq2.v


# Parse info and how to parse
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time initial_placed_wirelength_est placed_wirelength_est total_swap accepted_swap rejected_swap aborted_swap place_mem place_time place_quench_time initial_placed_CPD_est placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time ap_mem ap_time ap_full_legalizer_mem ap_full_legalizer_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time 
crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time
7series_BRAM_DSP_carry.xml stereovision3.v common 2.87 vpr 72.69 MiB -1 -1 0.32 26404 4 0.08 -1 -1 35804 -1 -1 -1 11 0 -1 success v8.0.0-12999-gf153e4447-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 11.4.0 on Linux-5.15.0-119-generic x86_64 2025-06-12T17:00:21 goeders10 /home/chem3000/GitClones/vtr_pulls/vtr_ccl/vtr-verilog-to-routing/vtr_flow/tasks 74436 11 2 303 283 2 114 35 7 7 49 CLB auto 33.4 MiB 1.15 577.007 408 947 108 584 255 72.7 MiB 0.01 0.00 3.1717 3.1717 -180.982 -3.1717 2.89952 0.12 0.000180919 0.000138049 0.00514751 0.00458797 -1 -1 -1 -1 40 911 27 1.34735e+06 1.18567e+06 152291. 3107.98 0.59 0.032835 0.0284564 6668 73471 -1 385 12 297 988 127228 60728 2.91111 2.8252 -221.503 -2.91111 -2.452 -0.04 215465. 4397.25 0.02 0.02 0.04 -1 -1 0.02 0.0099349 0.00925009
7series_BRAM_DSP_carry.xml stereovision3.v common 2.53 vpr 72.59 MiB -1 -1 0.33 26408 4 0.08 -1 -1 36120 -1 -1 -1 11 0 -1 success v8.0.0-13067-gda604502c-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 11.4.0 on Linux-5.15.0-119-generic x86_64 2025-06-16T13:26:41 goeders10 /home/chem3000/GitClones/vtr_pulls/vtr_ccl/vtr-verilog-to-routing/vtr_flow/tasks 74336 11 2 303 283 2 114 35 7 7 49 CLB auto 33.2 MiB 1.18 577.007 408 947 108 584 255 72.6 MiB 0.01 0.00 3.1717 3.1717 -180.982 -3.1717 2.89952 0.12 0.000171885 0.000143559 0.00458104 0.00401289 -1 -1 -1 -1 40 889 22 1.34735e+06 1.18567e+06 152291. 3107.98 0.22 0.0179847 0.0160687 6668 73471 -1 385 12 297 988 127228 60728 2.91111 2.8252 -221.503 -2.91111 -2.452 -0.04 215465. 4397.25 0.02 0.02 0.04 -1 -1 0.02 0.00962131 0.00895172
7series_BRAM_DSP_carry.xml diffeq2.v common 48.98 vpr 129.37 MiB -1 -1 0.19 27844 5 0.09 -1 -1 38944 -1 -1 -1 66 0 -1 success v8.0.0-13067-gda604502c-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 11.4.0 on Linux-5.15.0-119-generic x86_64 2025-06-16T13:26:41 goeders10 /home/chem3000/GitClones/vtr_pulls/vtr_ccl/vtr-verilog-to-routing/vtr_flow/tasks 132476 66 96 1819 1080 1 1150 336 26 26 676 DSP auto 40.6 MiB 2.77 20198.7 9188 64521 14520 45607 4394 119.7 MiB 0.65 0.01 22.6842 19.668 -1065.49 -19.668 19.668 3.45 0.000811128 0.000723384 0.0536543 0.0478746 -1 -1 -1 -1 74 12093 17 3.53732e+07 1.31407e+07 5.36197e+06 7931.91 33.65 0.395666 0.356239 133518 2720184 -1 10986 14 5828 10018 3165525 816384 19.4143 19.4143 -1210.49 -19.4143 -1.7 -0.034 6.54552e+06 9682.72 1.89 0.38 1.55 -1 -1 1.89 0.0458825 0.0428936