# 将for-loops转换为FPGA的最佳方式 [英] Best way to convert for-loops into an FPGA

### 问题描述

C代码的摘要看起来像:

```dot_product(&corr_sum, &sample_data_buffer[sample_index+d_circ_buf_size-sync_pattern_size], &sync_pattern, sync_pattern_size);
abs_corr_sum += abs(corr_sum);
```

```always @(sample_index)
begin
    // For each incoming sample: restart both accumulators, then walk the
    // sync pattern and add |I*I + Q*Q| of every tap into abs_corr_sum.
    // NOTE(review): the sensitivity list names only sample_index; the buffers
    // and pattern arrays read below are not listed -- confirm this is intended.
    abs_corr_sum = 64'd0;
    corr_sum = 64'd0;
    for (index2 = 0; index2 < sync_pattern_size; index2 = index2 + 1'b1)
    begin
        corr_sum = sample_data_buffer_I[index2+sample_index+circ_buf_size-sync_pattern_size] * sync_pattern_I[index2]
                 + sample_data_buffer_Q[index2+sample_index+circ_buf_size-sync_pattern_size] * sync_pattern_Q[index2];

        // this is my quick and dirty abs(corr_sum) summer
        // NOTE(review): (corr_sum < 0) can only be true if corr_sum is declared
        // signed; the declaration is not visible here -- TODO confirm.
        abs_corr_sum = (corr_sum < 0) ? abs_corr_sum + ~$signed(corr_sum)+1 : abs_corr_sum + corr_sum;
    end // for (index2 = 0; index2 < sync_pattern_size; index2 = index2 + 1'b1)
end //always @(sample_index)
```

## 推荐答案

.

```int sample_I[N], sync_I[N]; // assume 32-bit ints, 2-complement numbers.
int sample_Q[N], sync_Q[N];
int i, corsum, abscorsum = 0;

for (i=0;i<N;i++)
{
corsum = sample_I[i] * sync_I[i] + sample_Q[i] * sync_Q[i];
abscorsum += abs(corsum);
}
```

1)

```i = 0
abscorsum = 0
goto 2)
```

2)

```if i!=N
corsum = sample_I[i] * sync_I[i]
goto 3)
else
goto 5)
```

3)

```corsum = corsum + sample_Q[i] * sync_Q[i]
i = i + 1
goto 4)
```

4)

```if (corsum >= 0)
abscorsum = abscorsum + corsum
else
abscorsum = abscorsum + (-corsum)
goto 2)
```

5)

```STOP
```

信号 rst 用于通知模块开始运行。finish 由模块置位，用于指示运算结束以及输出(abscorrsum)有效。

sample_I，sync_i，sample_Q和sync_Q是使用内存块建模的，i是要读取的元素的地址.大多数合成器将推断这些向量的块RAM，因为它们仅在一个状态下读取，并且始终具有相同的地址信号.

```module corrdotprod #(parameter N = 4) (
    input  wire        clk,         // all state changes happen on the rising edge
    input  wire        rst,         // synchronous, active high: (re)starts a computation
    output reg  [31:0] i,           // index of the element currently requested
    input  wire [31:0] sample_i,    // sample_I[i]
    input  wire [31:0] sync_i,      // sync_I[i]
    input  wire [31:0] sample_q,    // sample_Q[i]
    input  wire [31:0] sync_q,      // sync_Q[i]
    output reg  [31:0] abscorrsum,  // running sum of |sample_I*sync_I + sample_Q*sync_Q|
    output reg         finish       // set once all N elements have been accumulated
);

    // FSM implementation of
    //     for (i = 0; i < N; i++)
    //         abscorrsum += abs(sample_I[i]*sync_I[i] + sample_Q[i]*sync_Q[i]);
    // One multiplication is performed per state so that both products can
    // share a single multiplier. All values are 32-bit two's complement; the
    // products are truncated to 32 bits by the assignment width.

    localparam
        STATE1 = 3'd1,   // initialise i and the accumulator
        STATE2 = 3'd2,   // loop test + first product
        STATE3 = 3'd3,   // second product, advance i
        STATE4 = 3'd4,   // accumulate the absolute value
        STATE5 = 3'd5;   // done

    reg [31:0] corrsum;
    reg [2:0]  state;

    always @(posedge clk) begin
        if (rst == 1'b1) begin
            state  <= STATE1;
            // Drop the done flag immediately so a result from a previous run
            // is not reported as valid while a new run is being started.
            finish <= 1'b0;
        end
        else begin
            case (state)
                STATE1:
                    begin
                        i          <= 0;
                        abscorrsum <= 0;
                        finish     <= 1'b0;
                        state      <= STATE2;
                    end
                STATE2:
                    begin
                        if (i != N) begin
                            corrsum <= sample_i * sync_i; // synthesizer deals with multiplication
                            state   <= STATE3;
                        end
                        else begin
                            state <= STATE5;
                        end
                    end
                STATE3:
                    begin
                        corrsum <= corrsum + sample_q * sync_q; // this product can share the multiplier as above
                        i       <= i + 1;
                        state   <= STATE4;
                    end
                STATE4:
                    begin
                        // The sign of a two's-complement number is its MSB:
                        // add corrsum as-is when non-negative, negated otherwise.
                        if (corrsum[31] == 1'b0)
                            abscorrsum <= abscorrsum + corrsum;
                        else
                            abscorrsum <= abscorrsum + (~corrsum + 1);
                        state <= STATE2;
                    end
                STATE5:
                    finish <= 1'b1;
            endcase
        end
    end
endmodule
```

```module tb;
    // Test bench for corrdotprod: feeds four I/Q element pairs through the
    // module, prints its result, then recomputes the same sum behaviourally
    // and prints that for comparison.
    reg clk;
    reg rst;
    reg [31:0] sample_i[0:3];
    reg [31:0] sync_i[0:3];
    reg [31:0] sample_q[0:3];
    reg [31:0] sync_q[0:3];
    wire [31:0] i;
    wire [31:0] abscorrsum;
    wire finish;   // declared explicitly instead of relying on an implicit net

    // The module drives i; the memories are read combinationally at that index.
    corrdotprod #(.N(4)) uut  (clk, rst, i, sample_i[i], sync_i[i], sample_q[i], sync_q[i], abscorrsum, finish);

    integer tb_i, tb_corrsum, tb_abscorrsum;
    initial begin
        $dumpfile ("dump.vcd");
        $dumpvars (0, tb.uut);

        sample_i[0] = 1;
        sample_i[1] = 2;
        sample_i[2] = 3;
        sample_i[3] = 4;

        sync_i[0] = 2;
        sync_i[1] = -2;
        sync_i[2] = 2;
        sync_i[3] = -2;

        sample_q[0] = -1;
        sample_q[1] = -2;
        sample_q[2] = -3;
        sample_q[3] = -4;

        sync_q[0] = 3;
        sync_q[1] = -3;
        sync_q[2] = 3;
        sync_q[3] = -3;

        clk = 0;

        // Hold reset across a few clock edges, then let the FSM run.
        rst = 1;
        #30;
        rst = 0;
        wait (finish == 1);
        $display ("ABSCORRSUM    = %d\n", abscorrsum);

        // Testing result from module
        tb_abscorrsum = 0;
        for (tb_i = 0; tb_i < 4; tb_i = tb_i + 1) begin
            tb_corrsum = sample_i[tb_i] * sync_i[tb_i] + sample_q[tb_i] * sync_q[tb_i];
            if (tb_corrsum<0)
                tb_corrsum = -tb_corrsum;
            tb_abscorrsum = tb_abscorrsum + tb_corrsum;
        end
        $display ("TB_ABSCORRSUM = %d\n", tb_abscorrsum);

        $finish;
    end

    // Free-running clock, period 10 time units.
    always begin
        clk = #5 !clk;
    end
endmodule
```

### 问题描述

I am having trouble wrapping my head around how to best replicate some C code in an FPGA using a for-loop (not my first time being stuck on this).

The snippet of C code look like this:

```dot_product(&corr_sum, &sample_data_buffer[sample_index+d_circ_buf_size-sync_pattern_size], &sync_pattern, sync_pattern_size);
abs_corr_sum += abs(corr_sum);
```

Pretty straightforward, it is taking the dot product of two complex vectors and doing a cumulative sum of it.

And here was my attempt to replicate it:

```always @(sample_index)
begin
    // For each incoming sample: restart both accumulators, then walk the
    // sync pattern and add |I*I + Q*Q| of every tap into abs_corr_sum.
    // NOTE(review): the sensitivity list names only sample_index; the buffers
    // and pattern arrays read below are not listed -- confirm this is intended.
    abs_corr_sum = 64'd0;
    corr_sum = 64'd0;
    for (index2 = 0; index2 < sync_pattern_size; index2 = index2 + 1'b1)
    begin
        corr_sum = sample_data_buffer_I[index2+sample_index+circ_buf_size-sync_pattern_size] * sync_pattern_I[index2]
                 + sample_data_buffer_Q[index2+sample_index+circ_buf_size-sync_pattern_size] * sync_pattern_Q[index2];

        // this is my quick and dirty abs(corr_sum) summer
        // NOTE(review): (corr_sum < 0) can only be true if corr_sum is declared
        // signed; the declaration is not visible here -- TODO confirm.
        abs_corr_sum = (corr_sum < 0) ? abs_corr_sum + ~$signed(corr_sum)+1 : abs_corr_sum + corr_sum;
    end // for (index2 = 0; index2 < sync_pattern_size; index2 = index2 + 1'b1)
end //always @(sample_index)
```

Does this seem right? I am not getting the results I am expecting; and though the issue could be elsewhere, I think that this section is the most likely culprit.

## 推荐答案

To convert a piece of code coming from an algorithm with loops, conditionals, et al., into a synthesizable form of Verilog, you need to translate it into an FSM.

For example, a for loop that does something similar to what you are asking for would be:

```int sample_I[N], sync_I[N]; // assume 32-bit ints, 2-complement numbers.
int sample_Q[N], sync_Q[N];
int i, corsum, abscorsum = 0;

for (i=0;i<N;i++)
{
corsum = sample_I[i] * sync_I[i] + sample_Q[i] * sync_Q[i];
abscorsum += abs(corsum);
}
```

First, group sentences into time slots, so you can see which actions can be done in the same clock cycle (same state), and assign a state to each slot:

1)

```i = 0
abscorsum = 0
goto 2)
```

2)

```if i!=N
corsum = sample_I[i] * sync_I[i]
goto 3)
else
goto 5)
```

3)

```corsum = corsum + sample_Q[i] * sync_Q[i]
i = i + 1
goto 4)
```

4)

```if (corsum >= 0)
abscorsum = abscorsum + corsum
else
abscorsum = abscorsum + (-corsum)
goto 2)
```

5)

```STOP
```

States 2 and 3 may be merged into a single state, but that would force the synthesizer to infer two multipliers, and besides, the propagation delay of the resulting combinatorial path could be very high, limiting the clock frequency allowable for this design. So, I have split the dot product calculation into two parts, each one of them using a single multiplication operation. The synthesizer, if instructed so, can use one multiplier and share it for the two operations, as both happen in different clock cycles.

which translates to this module: http://www.edaplayground.com/x/MEG

Signal rst is used to tell the module to start operation. finish is raised by the module to signal the end of operation and the validity of the output (abscorrsum).

sample_I, sync_i, sample_Q and sync_Q are modeled using memory blocks, with i being the address of the element to read. Most synthesizers will infer block RAMs for these vectors, as each of them is read only in one state, and always with the same address signal.

```module corrdotprod #(parameter N = 4) (
    input  wire        clk,         // all state changes happen on the rising edge
    input  wire        rst,         // synchronous, active high: (re)starts a computation
    output reg  [31:0] i,           // index of the element currently requested
    input  wire [31:0] sample_i,    // sample_I[i]
    input  wire [31:0] sync_i,      // sync_I[i]
    input  wire [31:0] sample_q,    // sample_Q[i]
    input  wire [31:0] sync_q,      // sync_Q[i]
    output reg  [31:0] abscorrsum,  // running sum of |sample_I*sync_I + sample_Q*sync_Q|
    output reg         finish       // set once all N elements have been accumulated
);

    // FSM implementation of
    //     for (i = 0; i < N; i++)
    //         abscorrsum += abs(sample_I[i]*sync_I[i] + sample_Q[i]*sync_Q[i]);
    // One multiplication is performed per state so that both products can
    // share a single multiplier. All values are 32-bit two's complement; the
    // products are truncated to 32 bits by the assignment width.

    localparam
        STATE1 = 3'd1,   // initialise i and the accumulator
        STATE2 = 3'd2,   // loop test + first product
        STATE3 = 3'd3,   // second product, advance i
        STATE4 = 3'd4,   // accumulate the absolute value
        STATE5 = 3'd5;   // done

    reg [31:0] corrsum;
    reg [2:0]  state;

    always @(posedge clk) begin
        if (rst == 1'b1) begin
            state  <= STATE1;
            // Drop the done flag immediately so a result from a previous run
            // is not reported as valid while a new run is being started.
            finish <= 1'b0;
        end
        else begin
            case (state)
                STATE1:
                    begin
                        i          <= 0;
                        abscorrsum <= 0;
                        finish     <= 1'b0;
                        state      <= STATE2;
                    end
                STATE2:
                    begin
                        if (i != N) begin
                            corrsum <= sample_i * sync_i; // synthesizer deals with multiplication
                            state   <= STATE3;
                        end
                        else begin
                            state <= STATE5;
                        end
                    end
                STATE3:
                    begin
                        corrsum <= corrsum + sample_q * sync_q; // this product can share the multiplier as above
                        i       <= i + 1;
                        state   <= STATE4;
                    end
                STATE4:
                    begin
                        // The sign of a two's-complement number is its MSB:
                        // add corrsum as-is when non-negative, negated otherwise.
                        if (corrsum[31] == 1'b0)
                            abscorrsum <= abscorrsum + corrsum;
                        else
                            abscorrsum <= abscorrsum + (~corrsum + 1);
                        state <= STATE2;
                    end
                STATE5:
                    finish <= 1'b1;
            endcase
        end
    end
endmodule
```

Which can be tested with this simple test bench:

```module tb;
    // Test bench for corrdotprod: feeds four I/Q element pairs through the
    // module, prints its result, then recomputes the same sum behaviourally
    // and prints that for comparison.
    reg clk;
    reg rst;
    reg [31:0] sample_i[0:3];
    reg [31:0] sync_i[0:3];
    reg [31:0] sample_q[0:3];
    reg [31:0] sync_q[0:3];
    wire [31:0] i;
    wire [31:0] abscorrsum;
    wire finish;   // declared explicitly instead of relying on an implicit net

    // The module drives i; the memories are read combinationally at that index.
    corrdotprod #(.N(4)) uut  (clk, rst, i, sample_i[i], sync_i[i], sample_q[i], sync_q[i], abscorrsum, finish);

    integer tb_i, tb_corrsum, tb_abscorrsum;
    initial begin
        $dumpfile ("dump.vcd");
        $dumpvars (0, tb.uut);

        sample_i[0] = 1;
        sample_i[1] = 2;
        sample_i[2] = 3;
        sample_i[3] = 4;

        sync_i[0] = 2;
        sync_i[1] = -2;
        sync_i[2] = 2;
        sync_i[3] = -2;

        sample_q[0] = -1;
        sample_q[1] = -2;
        sample_q[2] = -3;
        sample_q[3] = -4;

        sync_q[0] = 3;
        sync_q[1] = -3;
        sync_q[2] = 3;
        sync_q[3] = -3;

        clk = 0;

        // Hold reset across a few clock edges, then let the FSM run.
        rst = 1;
        #30;
        rst = 0;
        wait (finish == 1);
        $display ("ABSCORRSUM    = %d\n", abscorrsum);

        // Testing result from module
        tb_abscorrsum = 0;
        for (tb_i = 0; tb_i < 4; tb_i = tb_i + 1) begin
            tb_corrsum = sample_i[tb_i] * sync_i[tb_i] + sample_q[tb_i] * sync_q[tb_i];
            if (tb_corrsum<0)
                tb_corrsum = -tb_corrsum;
            tb_abscorrsum = tb_abscorrsum + tb_corrsum;
        end
        $display ("TB_ABSCORRSUM = %d\n", tb_abscorrsum);

        $finish;
    end

    // Free-running clock, period 10 time units.
    always begin
        clk = #5 !clk;
    end
endmodule
```