Professional Documents
Culture Documents
/// Contents
//
// Binary Multiplication Algorithm
// Simple Multiplication Hardware
// Streamlined Multiplication Hardware
// Booth Recoding for Higher-Radix and Signed Multiplication
// Binary Division Algorithm
// Division Hardware (Simple and Streamlined)
// "Real" Arithmetic Circuits
/// References
//
// :PH: Patterson & Hennessy, "Computer Organization & Design", 4th Edition
// :C: Michael D. Ciletti, "Adv. Digital Design with the Verilog HDL" 2nd Ed
// :HP: Hennessy & Patterson, "Computer Architecture: A Quantitative Approach"
////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm
// :PH: 3.3
////////////////////////////////////////////////////////////////////////////////
/// Simple Multiplication Hardware
// :PH: 3.3
// Cost of n-bit multiplier:
// Proportional to n.
// Includes 5n bits of register storage.
// Speed of n-bit multiplier:
// Product in n clocks.
// Clock period includes 2n-bit add.
// Uses more hardware than streamlined multipliers in next section.
// :Example:
//
// Simple Unsigned Combinational Multiplier
// Based on :PH: Figure 3.4
//
// This is a straightforward coding of the longhand multiplication
// technique.
integer i;
product = 0;
end
endmodule
// :Example:
//
// Simple Unsigned Multiplier
//
// Sequential multiplier. Handshaking signals are added to command
// multiplier to start and to indicate when finished. Registers are
// provided for shifted versions of the multiplier and multiplicand.
initial bit = 0;
bit = 16;
product = 0;
// [page footer] http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 2/15
// [page footer] 11/9/2017 l07.v
end
endmodule
////////////////////////////////////////////////////////////////////////////////
/// Streamlined Multiplication Hardware
// :PH: 3.3
// :Example:
//
// Streamlined Unsigned Multiplier
// Based on :PH: Figure 3.6
//
// Improvements:
//
// The product register also holds the multiplier.
// Only the product register is shifted.
// A register is not needed to hold the multiplicand.
// Wow, you save 2n bits! (To multiply n-bit numbers.)
// streamlined_mult: 16-bit multiplier listing in which `product` is loaded
// with the multiplier in its low 16 bits and is later shifted right by one
// bit while the counter `bit` is decremented.
//
// Ports (as declared below):
//   multiplier, multiplicand - 16-bit inputs
//   start, clk               - 1-bit inputs
//   product                  - output (no range given on the output line)
//   ready                    - output; its driver is not in this listing
//
// NOTE(review): this listing looks truncated. The declarations of `bit`
// and of a register for `product`, the procedural block(s) enclosing the
// assignments, the add of the multiplicand, and the `begin` matching the
// final `end` are not present. Restore them from the original l07.v
// before compiling.
module streamlined_mult(product,ready,multiplier,multiplicand,start,clk);
input [15:0] multiplier, multiplicand;
input start, clk;
// Declared without a range here, yet assigned a 32-bit concatenation below;
// presumably a separate `reg [31:0] product;` is missing -- TODO confirm.
output product;
// NOTE(review): the next two lines look like a page footer from a
// print/PDF export, not Verilog source.
http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 3/15
11/9/2017 l07.v
output ready;
// Counter starts at 0.
initial bit = 0;
// Load the counter with 16 (presumably one step per multiplier bit).
bit = 16;
// Upper 16 bits cleared; multiplier placed in the lower 16 bits.
product = { 16'd0, multiplier };
reg lsb;
// Capture the low bit of product before it is shifted out.
lsb = product[0];
// Shift the whole product register right by one position.
product = product >> 1;
// One fewer step remaining.
bit = bit - 1;
end
endmodule
// :Example:
//
// Streamlined Signed Multiplier
//
// Multiplies absolute value of multiplier and multiplicand.
// Negates product if multiplier and multiplicand differ in sign.
// Otherwise similar to streamlined unsigned multiplier.
// streamlined_signed_mult: signed variant of the streamlined multiplier
// listing. The visible statements load `product` with the absolute value of
// the multiplier and compute the absolute value of the multiplicand; the
// shift/decrement statements match the unsigned version.
//
// NOTE(review): this listing looks truncated. No port or register
// declarations, no enclosing procedural blocks, no add step and no final
// negation of the product are present here, and the `end` below has no
// visible `begin`. Restore from the original l07.v before compiling.
module streamlined_signed_mult(product,ready,multiplier,multiplicand,
start,clk);
// Counter starts at 0.
initial bit = 0;
// Load the counter with 16 (presumably one step per multiplier bit).
bit = 16;
// If bit 15 of the multiplier is set, negate it; place the result in the
// low half of product with the upper 16 bits cleared.
product = { 16'd0, multiplier[15] ? -multiplier : multiplier };
// Same sign test and negation applied to the multiplicand.
abs_multiplicand = multiplicand[15] ? -multiplicand : multiplicand;
reg lsb;
// Capture the low bit of product before it is shifted out.
lsb = product[0];
// Shift the whole product register right by one position.
product = product >> 1;
// NOTE(review): the next two lines look like a page footer from a
// print/PDF export, not Verilog source.
http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 4/15
11/9/2017 l07.v
// One fewer step remaining.
bit = bit - 1;
end
endmodule
////////////////////////////////////////////////////////////////////////////////
/// Booth Recoding for Higher-Radix and Signed Multiplication
// See http://www.geoffknagge.com/fyp/booth.shtml
// :C: 10.3.11 Radix-2 Booth recoding.
// :C: 10.3.12 Radix-4 Booth recoding, called Bit-Pair encoding in text.
//
// Uses n/2 additions to multiply two n-bit numbers.
//
// Two Radix-4 Multiplication Examples
//
// Multiply 5 times 12. Radix-4 multiplication (the faster way).
//
// Precompute: 2a: 01010, 3a: 01111
//
// 0101 Multiplicand
// 1100 Multiplier
// """"
// 00000 00 x 0101
// 01111 11 x 0101
// """""""
// 0111100 Product
//
// Multiply 5 times 9. Radix-4 multiplication (the faster way).
//
// 0101 Multiplicand
// 1001 Multiplier
// """"
// 00101 01 x 0101
// 01010 10 x 0101
// """""""
// 0101101 Product
// :Example:
//
// A Radix-4 multiplier. Takes unsigned numbers.
// imult_ord_radix_4: radix-4 multiplier listing. The two low bits of
// `product` select which multiple of the multiplicand is added to the upper
// half of `product`.
//
// Ports (as declared below):
//   multiplicand, multiplier - 16-bit inputs
//   start, clk               - 1-bit inputs
//   product                  - output (no range given on the output line)
//   ready                    - output; its driver is not in this listing
//
// NOTE(review): this listing looks truncated. Declarations of `bit`, `mb`,
// `pp` and `multiplicand_X_1/2/3`, the enclosing procedural blocks, the
// write-back of `pp` into `product`, and the `begin` matching the final
// `end` are not present. Restore from the original l07.v before compiling.
module imult_ord_radix_4(product,ready,multiplicand,multiplier,start,clk);
input [15:0] multiplicand, multiplier;
input start, clk;
// Declared without a range here, yet assigned a 32-bit concatenation below;
// presumably a separate register declaration is missing -- TODO confirm.
output product;
output ready;
// Counter starts at 0.
initial bit = 0;
// Load the counter with 8 (presumably 16 multiplier bits / 2 bits per step).
bit = 8;
// Upper 16 bits cleared; multiplier placed in the lower 16 bits.
product = { 16'd0, multiplier };
// NOTE(review): the next two lines look like a page footer from a
// print/PDF export, not Verilog source.
http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 6/15
11/9/2017 l07.v
// The two low bits of product are the multiplier bits examined this step.
mb = product[1:0];
// Zero-extend the upper half of product by two bits and add the multiple
// selected by mb. By name, multiplicand_X_n is n times the multiplicand;
// their definitions are not in this listing -- TODO confirm.
case ( mb )
2'd0: pp = {2'b0, product[31:16] };
2'd1: pp = {2'b0, product[31:16] } + multiplicand_X_1;
2'd2: pp = {2'b0, product[31:16] } + multiplicand_X_2;
2'd3: pp = {2'b0, product[31:16] } + multiplicand_X_3;
endcase
end
endmodule
// [page footer] http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 7/15
// [page footer] 11/9/2017 l07.v
// Note:
// The carry out is the same as the MSB of the multiplier bits examined.
// That's intentional, so that no special storage or computation is needed
// for the carry.
//
// The carry out of the last step is not used.
//
// Construction of higher-radix tables is straightforward.
// :Example:
//
// A Booth radix-4 multiplier. This takes signed numbers.
// imult_Booth_radix_4: Booth radix-4 multiplier listing. A 3-bit selector
// formed from `mb` and `lostbit` chooses which multiple of the multiplicand
// is added to the previous upper-product value.
//
// Ports (as declared below):
//   mcand, mplier - 16-bit inputs
//   start, clk    - 1-bit inputs
//   prod          - 32-bit output
//   ready         - output; its driver is not in this listing
//
// NOTE(review): this listing looks truncated. Declarations of `bit`, `mb`,
// `lostbit`, `high_prod_prev`, `high_prod_next` and the `mcand_X_*` values,
// the enclosing procedural blocks, the update of `prod`/`lostbit` after the
// case, and the `begin` matching the final `end` are not present. Restore
// from the original l07.v before compiling.
module imult_Booth_radix_4(prod,ready,mcand,mplier,start,clk);
input [15:0] mcand, mplier;
input start, clk;
output [31:0] prod;
output ready;
// Counter starts at 0.
initial bit = 0;
// Load the counter with 8 (presumably 16 multiplier bits / 2 bits per step).
bit = 8;
// Upper 16 bits cleared; multiplier placed in the lower 16 bits.
prod = { 16'd0, mplier };
// Start with lostbit cleared.
lostbit = 0;
// Selector is {mb, lostbit}. By name, mcand_X_p1/p2 are +1x/+2x and
// mcand_X_m1/m2 are -1x/-2x the multiplicand; their definitions are not in
// this listing -- TODO confirm. Selectors 000 and 111 add nothing.
case ( {mb,lostbit} )
3'b000: high_prod_next = high_prod_prev;
3'b001: high_prod_next = high_prod_prev + mcand_X_p1;
3'b010: high_prod_next = high_prod_prev + mcand_X_p1;
3'b011: high_prod_next = high_prod_prev + mcand_X_p2;
3'b100: high_prod_next = high_prod_prev + mcand_X_m2;
3'b101: high_prod_next = high_prod_prev + mcand_X_m1;
3'b110: high_prod_next = high_prod_prev + mcand_X_m1;
3'b111: high_prod_next = high_prod_prev;
endcase
// One fewer step remaining.
bit = bit - 1;
end
endmodule
// :Example:
//
// Radix-2 Booth multiplier. Unlike signed multiplier, does not
// require time or hardware for taking absolute value of operands.
// imult_Booth: radix-2 Booth multiplier listing. Each step looks at the
// low bit of `product` together with `lostbit`, adds or subtracts `mcand`
// in the upper half of `product`, then shifts `product` right by one with
// the top bit replicated.
//
// Ports (as declared below):
//   mcand, mplier - 16-bit inputs
//   start, clk    - 1-bit inputs
//   product       - output (no range given on the output line)
//   ready         - output; its driver is not in this listing
//
// NOTE(review): this listing looks truncated. Declarations of `bit` and
// `lostbit`, a register declaration for `product`, the enclosing procedural
// blocks, and the `begin` matching the final `end` are not present. Restore
// from the original l07.v before compiling.
module imult_Booth(product,ready,mcand,mplier,start,clk);
input [15:0] mcand, mplier;
input start, clk;
// Declared without a range here, yet indexed as [31:0] below; presumably a
// separate `reg [31:0] product;` is missing -- TODO confirm.
output product;
output ready;
// Counter starts at 0.
initial bit = 0;
// Load the counter with 16 (presumably one step per multiplier bit).
bit = 16;
// Upper 16 bits cleared; multiplier placed in the lower 16 bits.
product = { 16'd0, mplier };
// Start with lostbit cleared.
lostbit = 0;
// Selector is {current low bit, lostbit}. 01 adds mcand to the upper half,
// 10 subtracts it. 00 and 11 match no item, so the upper half is unchanged.
case ( {product[0],lostbit} )
2'b01: product[31:16] = product[31:16] + mcand;
2'b10: product[31:16] = product[31:16] - mcand;
endcase
// Remember the bit about to be shifted out for the next step's selector.
lostbit = product[0];
// Shift right by one, copying bit 31 into the vacated top position.
product = { product[31], product[31:1] };
// One fewer step remaining.
bit = bit - 1;
end
endmodule
// [page footer] http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 9/15
// [page footer] 11/9/2017 l07.v
////////////////////////////////////////////////////////////////////////////////
/// Binary Division Algorithm
// :PH: 3.4
////////////////////////////////////////////////////////////////////////////////
/// Division Hardware (Simple and Streamlined)
// :PH: 3.4
// :Example:
//
// Simple divider.
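// Based on :PH: Figure 4.36
// (Figure number appears to come from an earlier edition; this file cites
// division as :PH: 3.4 in the 4th edition -- verify the figure number.)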
// simple_divider: divider listing that works on 32-bit copies of the
// operands. The visible statements only show initialization: the quotient
// is cleared, the dividend is zero-extended, and the divider is placed 15
// bit positions to the left.
//
// Ports (as declared below):
//   dividend, divider   - 16-bit inputs
//   start, clk          - 1-bit inputs
//   quotient, remainder - 16-bit outputs
//   ready               - output; its driver is not in this listing
//
// NOTE(review): this listing looks truncated. Declarations of `bit`,
// `dividend_copy` and `divider_copy`, the enclosing procedural blocks, the
// subtract/shift step, the driver of `remainder`, and the `begin` matching
// the final `end` are not present. Restore from the original l07.v before
// compiling.
module simple_divider(quotient,remainder,ready,dividend,divider,start,clk);
input [15:0] dividend,divider;
input start, clk;
output [15:0] quotient,remainder;
// NOTE(review): the next two lines look like a page footer from a
// print/PDF export, not Verilog source.
http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 10/15
11/9/2017 l07.v
output ready;
// Worked example (tail of a longhand-division diagram; the first rows seem
// to be missing from this listing):
// """"""""|
// 1011 | <---- dividend_copy
// -0011 | <---- divider_copy
// """"""""| 0 Difference is negative: copy dividend and put 0 in quotient.
// 1011 | <---- dividend_copy
// -0011 | <---- divider_copy
// """"""""| 00 Difference is negative: copy dividend and put 0 in quotient.
// 1011 | <---- dividend_copy
// -0011 | <---- divider_copy
// """"""""| 001 Difference is positive: use difference and put 1 in quotient.
// quotient (numbers above)
// Counter starts at 0.
initial bit = 0;
// Load the counter with 16 (presumably one step per quotient bit).
bit = 16;
quotient = 0;
// 32-bit value: dividend in the low 16 bits, upper 16 bits cleared.
dividend_copy = {16'd0,dividend};
// 32-bit value: divider shifted left by 15 positions, top bit cleared.
divider_copy = {1'b0,divider,15'd0};
end
endmodule
// :Example:
//
// Streamlined divider.
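// Based on :PH: Figure 4.41
// (Figure number appears to come from an earlier edition; this file cites
// division as :PH: 3.4 in the 4th edition -- verify the figure number.)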
//
// Uses less register storage than simple divider.
// streamlined_divider: divider listing that keeps its working state in a
// single register `qr`, loaded with the dividend in its low half. Each step
// shifts `qr` left by one and inserts a 0 or 1 in the low bit depending on
// bit 16 of `diff`.
//
// Ports (as declared below):
//   dividend, divider   - 16-bit inputs
//   start, clk          - 1-bit inputs
//   quotient, remainder - outputs (no range given on the output line)
//   ready               - output; its driver is not in this listing
//
// NOTE(review): this listing looks truncated. Declarations of `bit`, `qr`
// and `diff`, the computation of `diff`, the drivers of `quotient` and
// `remainder`, the enclosing procedural blocks, and the `begin` matching
// the final `end` are not present. Restore from the original l07.v before
// compiling.
module streamlined_divider(quotient,remainder,ready,dividend,divider,start,clk);
input [15:0] dividend,divider;
input start, clk;
// Declared without ranges here; the testbench in this file connects them to
// 16-bit wires, so wider declarations are presumably missing -- TODO confirm.
output quotient,remainder;
output ready;
// Worked example (fragment of a diagram; parts seem to be missing):
//
// 0000 1011
// """"""""|
// 1011 | 0001 0110 <- qr reg
// -0011 | -0011 <- divider (never changes)
// """"""""|
// NOTE(review): the next two lines look like a page footer from a
// print/PDF export, not Verilog source.
http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 11/15
11/9/2017 l07.v
// Counter starts at 0.
initial bit = 0;
// Load the counter with 16 (presumably one step per quotient bit).
bit = 16;
// 32-bit value: dividend in the low 16 bits, upper 16 bits cleared.
qr = {16'd0,dividend};
// diff[16] set (presumably a negative difference -- TODO confirm against
// the missing diff computation): shift qr left and insert a 0.
if ( diff[16] )
qr = {qr[30:0],1'd0};
else
// Otherwise take the low 16 bits of diff as the new upper half, keep the
// low 15 bits of qr, and insert a 1 in the low bit.
qr = {diff[15:0],qr[14:0],1'd1};
// One fewer step remaining.
bit = bit - 1;
end
endmodule
////////////////////////////////////////////////////////////////////////////////
/// "Real" Arithmetic Circuits
// :HP: Appendix A
/// Adders
//
// The multiple-level CLA covered in class is reasonably close to the adders
// used in processors.
//
// Other fast adder techniques also used. (See :PH: problems.)
/// Multipliers
//
// The multiplication hardware presented above is much slower than
// the hardware used in real processors.
//
// "Real" n-bit Multiplier Features
//
// Multiplication done in one or two cycles (assume one cycle).
// Uses higher-radix (say 4) Booth recoding or something similar.
// Enough adders are provided so that product computed in one cycle.
// These adders can add n numbers quickly without the cost of n CLA's.
// Cost may be something like 3/2 n ripple adders plus 1 CLA.
// [page footer] http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 12/15
// [page footer] 11/9/2017 l07.v
/// Dividers
//
// The division hardware presented above is slower than hardware used
// in real processors.
//
// Division trickier than multiplication because result of step i
// needed for i+1. This precludes the tree structures used in
// fast multipliers.
//
// FP Dividers in general-purpose processors typically take 10-20 cycles.
//
// Several division techniques used. Examples:
// SRT: Sweeney, Robertson, Tocher
// Newton Iteration
// Goldschmidt's Algorithm
//
////////////////////////////////////////////////////////////////////////////////
/// Testbenches
///
// cadence translate_off
// l07_test_mult: testbench for the multiplier modules. It instantiates one
// multiplier (currently imult_Booth), drives it with random 16-bit operands,
// and compares the module's product with `a * b` computed in the testbench.
// On a mismatch it prints both values and stops the simulation.
//
// NOTE(review): this listing looks truncated. `i` and `num_tests` are
// declared but no loop is visible, and `or4_start` is set to 1 and then 0
// with nothing in between; presumably the loop and the waits on `or4_ready`
// are missing. Restore from the original l07.v before running.
module l07_test_mult();
// or4_prod: product from the module under test. prod_2 is not used in the
// visible statements.
wire [31:0] or4_prod, prod_2;
// Expected product computed by the testbench.
reg [31:0] shadow_prod;
// Operands; declared signed, so `a * b` below multiplies signed values.
reg signed [15:0] a, b;
integer i;
parameter num_tests = 1000;
// Free-running clock: toggles every 1 time unit.
reg clk;
initial clk = 0;
always #1 clk = ~clk;
reg or4_start;
wire or4_ready;
// Alternative units under test; only one instantiation should be active.
// imult_ord_radix_4 c1(or4_prod,or4_ready,a,b,or4_start,clk);
// imult_Booth_radix_4 c1(or4_prod,or4_ready,a,b,or4_start,clk);
// Positional connection: a is the multiplicand (mcand), b the multiplier
// (mplier), per imult_Booth's port order.
imult_Booth c1(or4_prod,or4_ready,a,b,or4_start,clk);
initial begin
// Half-unit delay; presumably keeps stimulus changes off the clock edges.
# 0.5;
a = $random;
b = $random;
or4_start = 1;
or4_start = 0;
shadow_prod = a * b;
#1;
if ( or4_prod != shadow_prod ) begin
$display("Wrong prod: %h * %h = %h != %h (correct)",
a, b, or4_prod, shadow_prod);
// NOTE(review): the next two lines look like a page footer from a
// print/PDF export, not Verilog source.
http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 13/15
11/9/2017 l07.v
$stop;
end
end
endmodule
// l07_test_div: testbench for the divider modules. It instantiates one
// divider (currently streamlined_divider), drives it with random 16-bit
// operands, and compares quotient and remainder with values computed in the
// testbench. On a mismatch it prints both results and stops the simulation.
//
// NOTE(review): this listing looks truncated. `i` and `num_tests` are
// declared but no loop is visible, `start` is set to 1 and then 0 with
// nothing in between, and `infinity` is used but not declared in the
// visible text. Restore from the original l07.v before running.
module l07_test_div();
// Results from the module under test.
wire [15:0] quot, rem;
// Expected results computed by the testbench.
reg [15:0] shadow_quot, shadow_rem;
// Operands: a is the dividend, b the divider.
reg [15:0] a, b;
integer i;
parameter num_tests = 1000;
// Free-running clock: toggles every 1 time unit.
reg clk;
initial clk = 0;
always #1 clk = ~clk;
reg start;
wire ready;
// Alternative unit under test; only one instantiation should be active.
// simple_divider div(quot,rem,ready,a,b,start,clk);
streamlined_divider div(quot,rem,ready,a,b,start,clk);
initial begin
// Half-unit delay; presumably keeps stimulus changes off the clock edges.
# 0.5;
a = $random;
// When the low bit of i is set, b is a full random value; otherwise it is
// masked to 0..3, which makes small divisors and zero more likely.
b = i & 1 ? $random : $random & 3;
start = 1;
start = 0;
// For b == 0 the expected quotient is `infinity` and the expected
// remainder is the dividend itself.
shadow_quot = b ? a / b : infinity;
shadow_rem = b ? a % b : a;
#1;
if ( quot != shadow_quot || rem != shadow_rem ) begin
$display("Wrong quot: %h / %h = %h r %h != %h r %h (correct)",
a, b, quot, rem, shadow_quot, shadow_rem);
$stop;
end
end
endmodule
// cadence translate_on
// [page footer] http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 14/15
// [page footer] 11/9/2017 l07.v
// [page footer] http://www.ece.lsu.edu/ee3755/2012f/l07.v.html 15/15