|
- #define AP_INT_MAX_W 10240
- #include "stream_tools.h"
- #include "sliding_window_unit.h"
- #include <ap_fixed.h>
- #include <stdint.h>
- //#include <hls_video.h>
- #include "pool2d.h"
- #include "function.h"
- #include "atss_0506_lossv2.h"
- #include "atss_0506_lossv2_config.h"
-
- #include <ap_int.h>
- #include "math.h"
-
-
- //#define DEBUG
- using namespace hls;
- using namespace std;
-
- /************************ Image Normalization ************************/
-
- // Per-channel multiplier handed to img_norm_calc by img_norm (result = in * weight + bias).
- // Each entry is the reciprocal of that channel's divisor, quantised to ap_fixed<16, 3, AP_RND>.
- // NOTE(review): 58.395 / 57.12 / 57.375 look like the usual ImageNet per-channel std values - confirm.
- const ap_fixed<16, 3, AP_RND> img_norm_weight[3] = {
-     ap_fixed<16, 3, AP_RND>(1.0/58.395),
-     ap_fixed<16, 3, AP_RND>(1.0/57.12),
-     ap_fixed<16, 3, AP_RND>(1.0/57.375)
- };
- // Per-channel offset handed to img_norm_calc by img_norm (result = in * weight + bias).
- // Each entry is -(offset / divisor), so in * (1/divisor) + bias equals (in - offset) / divisor.
- // NOTE(review): 123.675 / 116.28 / 103.53 look like the usual ImageNet per-channel mean values - confirm.
- const ap_fixed<16, 3, AP_RND> img_norm_bias[3] = {
-     ap_fixed<16, 3, AP_RND>(-123.675/58.395),
-     ap_fixed<16, 3, AP_RND>(-116.28/57.12),
-     ap_fixed<16, 3, AP_RND>(-103.53/57.375)
- };
-
- // template <int IL_IN, int FL_IN, int IL_OUT, int FL_OUT>
- // ap_uint<(IL_OUT+FL_OUT)> truncate_img_norm(
- // ap_uint<(IL_IN+FL_IN)> in) {
-
- // ap_int<(IL_OUT+FL_OUT)> out;
- // ap_int<(IL_OUT+FL_OUT+1)> out_tmp;
-
- // out_tmp = in >> (IL_IN+FL_IN-IL_OUT-FL_OUT-1);
-
- // if(out_tmp > 0){
- // if(out_tmp < ((1<<(IL_OUT+FL_OUT))-1)){
- // out_tmp += 1;
- // }
- // }
- // // else{
- // // if(out != -(1<<(IL_OUT+FL_OUT))){
- // // out -= 1;
- // // }
- // // }
-
- // out = out_tmp >> 1;
-
- // return out;
- // }
-
- // Keep the (IL_OUT+FL_OUT) most significant bits of an (IL_IN+FL_IN)-bit word, with rounding.
- // The MSB of `in` is treated as a sign bit:
- //   * MSB == 0: the first dropped bit is added to the kept bits;
- //   * MSB == 1: one is added only when the first dropped bit is set AND at least one lower
- //     dropped bit is set (an exact half keeps the truncated value).
- // The sum is stored in an ap_int of the output width; no saturation is applied.
- // Bit IL_IN+FL_IN-IL_OUT-FL_OUT-2 is addressed, so the input is expected to be at least
- // two bits wider than the output.
- template <int IL_IN, int FL_IN, int IL_OUT, int FL_OUT>
- ap_uint<(IL_OUT+FL_OUT)> truncate_img_norm(
-         ap_uint<(IL_IN+FL_IN)> in){
-     const int TOP_BIT   = IL_IN+FL_IN-1;               // sign position
-     const int KEEP_LSB  = IL_IN+FL_IN-IL_OUT-FL_OUT;   // lowest bit that survives
-     const int ROUND_BIT = KEEP_LSB-1;                  // first dropped bit
-
-     ap_uint<1> round_up;
-     if(in(TOP_BIT, TOP_BIT) == (ap_uint<1>)0){
-         round_up = in(ROUND_BIT, ROUND_BIT);
-     }
-     else{
-         round_up = (in(ROUND_BIT, ROUND_BIT) && in(ROUND_BIT-1, 0).or_reduce());
-     }
-
-     ap_int<(IL_OUT+FL_OUT)> out = in(TOP_BIT, KEEP_LSB) + round_up;
-     return out;
- }
-
-
- // Reduce an unsigned fixed-point word from FL_IN to FL_OUT fractional bits with rounding
- // and an upper clamp.
- // Steps: shift right so that exactly one guard bit remains below the output LSB; add one to
- // that guarded value unless it has already reached 2^(IL_OUT+FL_OUT+1) - 1, in which case it
- // is set to that ceiling; finally shift the guard bit away.
- template <int IL_IN, int FL_IN, int IL_OUT, int FL_OUT>
- ap_uint<(IL_OUT+FL_OUT)> truncate_unsigned(
-         ap_uint<(IL_IN+FL_IN)> in) {
-     const ap_uint<1> one = 1;
-     const ap_uint<IL_OUT+FL_OUT+2> ceiling = (1<<(IL_OUT+FL_OUT+1)) - 1;
-
-     // Value with one guard bit kept below the output LSB.
-     ap_uint<(IL_IN+FL_OUT+1)> guarded = in >> (FL_IN-FL_OUT-1);
-
-     if(guarded >= ceiling){
-         guarded = ceiling;      // clamp
-     }
-     else{
-         guarded += one;         // add half an output LSB
-     }
-
-     ap_uint<(IL_OUT+FL_OUT)> out = guarded >> one;
-     return out;
- }
-
-
- // Normalise the SIMD lanes packed in `in`.
- // Lane i (BIT_IN bits, read as unsigned) becomes in_i * weights[i] + bias[i], evaluated in
- // ap_fixed<16, 3, AP_RND>; the raw 16-bit pattern is then reduced to 8 bits by
- // truncate_img_norm<3, 13, 3, 5> and stored in lane i of the result.
- // weights/bias hold three entries and are indexed by lane, so SIMD must not exceed 3.
- // Returns the packed word of normalised lanes.
- template <int BIT_IN, int SIMD>
- ap_uint<BIT_IN*SIMD> img_norm_calc(
-         ap_uint<BIT_IN*SIMD> in,
-         const ap_fixed<16, 3, AP_RND> weights[3],
-         const ap_fixed<16, 3, AP_RND> bias[3]){
-
-     ap_uint<BIT_IN*SIMD> res_out = 0;
-
-     for(int i=0; i<SIMD; i++){
-         #pragma HLS UNROLL
-         ap_uint<BIT_IN> temp_in = in(BIT_IN*(i+1)-1, BIT_IN*i);
-         ap_fixed<16, 3, AP_RND> temp_res = temp_in*weights[i] + bias[i];
-         // Read the raw bit pattern through the bit-range accessor rather than by
-         // reinterpreting the object's storage through a pointer cast.
-         ap_uint<16> temp_res_uint;
-         temp_res_uint = temp_res.range(15, 0);
-         ap_uint<BIT_IN> res_truncated = truncate_img_norm<3, 13, 3, 5>(temp_res_uint);
-         res_out(BIT_IN*(i+1)-1, BIT_IN*i) = res_truncated;
-     }
-
-     return res_out;
- }
-
- // Requantise an image stream from BIT_IN to BIT_OUT bits per lane without mean/scale
- // normalisation, and append blank rows.
- // For each of `reps` images:
- //   * IMG_ROW*IMG_COL words are read; every lane keeps its BIT_OUT most significant bits and
- //     the first dropped bit is added as rounding, unless the lane is already at the largest
- //     BIT_OUT-bit value (so rounding cannot wrap around);
- //   * afterwards 4*IMG_COL all-zero words are written.
- // BIT_OUT must be smaller than BIT_IN because bit BIT_IN-BIT_OUT-1 is read as the rounding bit.
- template <int BIT_IN, int SIMD, int IMG_ROW, int IMG_COL, int BIT_OUT >
- void img_norm_none( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-         hls::stream<ap_uint<BIT_OUT*SIMD> >& out,ap_uint<10> reps){
-
-     // Saturation level derived from the lane width (equals 127 for BIT_OUT == 7).
-     const ap_uint<BIT_OUT> dac_max = ~ap_uint<BIT_OUT>(0);
-
-     for(int mm = 0;mm<reps;mm++){
-         for(int i=0; i<IMG_ROW; i++){
-             looppp:for(int m = 0;m < IMG_COL; m++ ){
-                 #pragma HLS PIPELINE II=1
-                 ap_uint<BIT_IN*SIMD> in_read = in.read();
-                 ap_uint<BIT_OUT*SIMD> out_buf;
-                 loopaaa:for(int j=0;j<SIMD;j++){
-                     ap_uint<BIT_IN> in_temp;
-                     in_temp = in_read(BIT_IN*(j+1)-1,BIT_IN * j);
-                     ap_uint<BIT_OUT>dac = (in_temp>>(BIT_IN-BIT_OUT));
-                     // Round with the first dropped bit, but never past the maximum.
-                     if(dac != dac_max)dac = dac+in_temp(BIT_IN-BIT_OUT-1,BIT_IN-BIT_OUT-1);
-                     out_buf(BIT_OUT*(j+1)-1,BIT_OUT * j) = dac;
-                 }
-                 out.write(out_buf);
-             }
-         }
-         // Four rows of zero words follow every image.
-         for(int i = 0;i<4;i++){
-             for(int m = 0;m < IMG_COL; m++ ){
-                 #pragma HLS PIPELINE II=1
-                 ap_uint<BIT_OUT*SIMD> out_buf = 0;
-                 out.write(out_buf);
-             }
-         }
-     }
-
- }
-
-
-
- // Dot product over SIMD lanes that obtains two lane products from one wide multiplication.
- // For each lane pair (lane, lane+1):
- //   * a weight whose sign bit is set is replaced by its two's-complement negation, and the
- //     sign is remembered;
- //   * the two activations are placed in one operand (low lane at bit 0, high lane starting at
- //     bit BIT_W+BIT_IN) and the two weights likewise in the other operand;
- //   * the product's lowest (BIT_W+BIT_IN)-bit field and the field starting at bit
- //     2*(BIT_W+BIT_IN) are taken as the two lane products and negated again where the
- //     corresponding weight was negative.
- // The loop advances by two lanes, so SIMD is expected to be even.
- // NOTE(review): activation lanes are copied bit-for-bit into the packed operand, i.e. they are
- // treated as unsigned there - confirm callers only pass non-negative activations.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<BIT_ACC> tioo_mutiplier(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     const int FIELD = BIT_W+BIT_IN;   // width of one product field
-     ap_int<BIT_ACC> sum = 0;
-
-     for(int lane=0; lane<SIMD; lane+=2){
-         #pragma HLS UNROLL
-         // Unpack the activations and weights of this lane pair.
-         ap_int<BIT_IN> act_lo = in((lane+1)*BIT_IN-1, lane*BIT_IN);
-         ap_int<BIT_IN> act_hi = in((lane+2)*BIT_IN-1, (lane+1)*BIT_IN);
-         ap_int<BIT_W> wgt_lo = weight((lane+1)*BIT_W-1, lane*BIT_W);
-         ap_int<BIT_W> wgt_hi = weight((lane+2)*BIT_W-1, (lane+1)*BIT_W);
-
-         // Remember the weight signs, then continue with the negated value where negative.
-         ap_uint<1> neg_lo = wgt_lo(BIT_W - 1, BIT_W - 1);
-         ap_uint<1> neg_hi = wgt_hi(BIT_W - 1, BIT_W - 1);
-         if(neg_lo) wgt_lo = (ap_int<BIT_W>)(~wgt_lo+1);
-         if(neg_hi) wgt_hi = (ap_int<BIT_W>)(~wgt_hi+1);
-
-         // Pack both activations into one operand and both weights into the other.
-         ap_uint<2*BIT_IN+BIT_W> packed_act = 0;
-         packed_act(BIT_IN-1,0) = act_lo;
-         packed_act(2*BIT_IN+BIT_W-1,FIELD) = act_hi;
-
-         ap_uint<2*BIT_W+BIT_IN> packed_wgt = 0;
-         packed_wgt(BIT_W-1,0) = wgt_lo;
-         packed_wgt(2*BIT_W+BIT_IN-1,FIELD) = wgt_hi;
-
-         ap_int<3*(BIT_W+BIT_IN)> wide = packed_act * packed_wgt;
-
-         // Bottom field = low-lane product, top field = high-lane product.
-         ap_int<(BIT_W+BIT_IN)> prod_lo = wide(FIELD-1,0);
-         ap_int<(BIT_W+BIT_IN)> prod_hi = wide(3*FIELD-1,2*FIELD);
-         if(neg_lo) prod_lo = (ap_int<BIT_W+BIT_IN>)(~prod_lo+1);
-         if(neg_hi) prod_hi = (ap_int<BIT_W+BIT_IN>)(~prod_hi+1);
-
-         sum = sum + prod_lo + prod_hi;
-     }
-     return sum;
- }
-
-
-
- // Dot product over SIMD lanes, processed two lanes per iteration.
- // Lanes of both `in` and `weight` are read as signed ap_int values. The first product of each
- // pair is bound to the Mul_LUT core through the RESOURCE pragma; the second product is added
- // to it and the pair sum is accumulated into a BIT_ACC-bit signed result.
- // The loop advances by two lanes, so SIMD is expected to be even.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<BIT_ACC> tioo_mutiplier_sb(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     ap_int<BIT_ACC> sum = 0;
-
-     for(int lane=0; lane<SIMD; lane+=2){
-         #pragma HLS UNROLL
-         ap_int<BIT_W> wgt_a = weight((lane+1)*BIT_W-1, lane*BIT_W);
-         ap_int<BIT_W> wgt_b = weight((lane+2)*BIT_W-1, (lane+1)*BIT_W);
-
-         ap_int<BIT_IN> act_a = in((lane+1)*BIT_IN-1, lane*BIT_IN);
-         ap_int<BIT_IN> act_b = in((lane+2)*BIT_IN-1, (lane+1)*BIT_IN);
-
-         ap_int<(BIT_W+BIT_IN)> prod_a = wgt_a * act_a;
-         #pragma HLS RESOURCE variable=prod_a core=Mul_LUT
-
-         // Two extra bits on top of one product's width for the pair sum.
-         ap_int<2+(BIT_W+BIT_IN)> pair_sum = wgt_b * act_b + prod_a;
-         sum += pair_sum;
-     }
-     return sum;
- }
-
-
- // Streaming wrapper around img_norm_calc.
- // Reads IMG_ROW*IMG_COL*reps words from `in`, normalises each one with the file-level
- // img_norm_weight / img_norm_bias tables and writes the result to `out`. The loop requests
- // a pipeline initiation interval of 1.
- template <int BIT_IN, int SIMD, int IMG_ROW, int IMG_COL>
- void img_norm( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-         hls::stream<ap_uint<BIT_IN*SIMD> >& out,
-         ap_uint<10> reps ){
-
-     const unsigned total_words = IMG_ROW*IMG_COL*reps;
-
-     for(int idx=0; idx<total_words; idx++){
-         #pragma HLS PIPELINE II=1
-         ap_uint<BIT_IN*SIMD> word = in.read();
-         out.write(img_norm_calc<BIT_IN, SIMD>(word, img_norm_weight, img_norm_bias));
-     }
- }
-
- /************************ Image Normalization ************************/
-
- // template < int BIT_IN,
- // int BIT_W,
- // int BIT_ACC,
- // int SIMD>
- // ap_int<BIT_ACC> chi_vector_dot_product(
- // ap_uint<BIT_IN*SIMD> in,
- // ap_uint<BIT_W*SIMD> weight){
-
- // ap_int<BIT_ACC> accumulation = 0;
- // ap_uint<(2*BIT_IN+BIT_W)> temp_in;
- // ap_uint<(2*BIT_W+BIT_IN)> temp_w;
-
- // ap_uint<BIT_IN> temp_in_true_0;
- // ap_uint<BIT_IN> temp_in_true_1;
- // ap_uint<BIT_W> temp_weight_true_0;
- // ap_uint<BIT_W> temp_weight_true_1;
-
- // ap_uint<BIT_IN+BIT_W-1> res_complement_0;
- // ap_uint<BIT_IN+BIT_W-1> res_complement_1;
-
- // ap_uint<1> res_sign_0;
- // ap_uint<1> res_sign_1;
-
- // ap_int<BIT_IN+BIT_W> res_0;
- // ap_int<BIT_IN+BIT_W> res_1;
-
- // for(int i=0; i<(SIMD/2)+1; i++){
- // #pragma HLS UNROLL
- // if(i==0){
- // ap_int<BIT_IN> temp_in_0 = in((i+1)*BIT_IN-1, i*BIT_IN);
- // ap_int<BIT_W> temp_w_0 = weight((i+1)*BIT_W-1, i*BIT_W);
- // ap_int<BIT_IN> temp_in_1 = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);
- // ap_int<BIT_W> temp_w_1 = weight((i+2)*BIT_W-1, (i+1)*BIT_W);
-
- // /* get sign bit */
- // ap_uint<1> temp_in_sign_0 = temp_in_0(BIT_IN-1, BIT_IN-1);
- // ap_uint<1> temp_weight_sign_0 = temp_w_0(BIT_W-1, BIT_W-1);
- // ap_uint<1> temp_in_sign_1 = temp_in_1(BIT_IN-1, BIT_IN-1);
- // ap_uint<1> temp_weight_sign_1 = temp_w_1(BIT_W-1, BIT_W-1);
-
- // if(temp_in_sign_0==1){
- // if(temp_in_0==-(1<<(BIT_IN-1))){
- // temp_in_true_0 = 1<<(BIT_IN-1);
- // }
- // else{
- // temp_in_true_0(BIT_IN-1, BIT_IN-1) = 0;
- // temp_in_true_0(BIT_IN-2, 0) = ~(temp_in_0(BIT_IN-2, 0)-1);
- // }
- // }
- // else{
- // temp_in_true_0 = temp_in_0;
- // }
-
- // if(temp_weight_sign_0==1){
- // if(temp_w_0==-(1<<(BIT_W-1))){
- // temp_weight_true_0 = 1<<(BIT_W-1);
- // }
- // else{
- // temp_weight_true_0(BIT_W-1, BIT_W-1) = 0;
- // temp_weight_true_0(BIT_W-2, 0) = ~(temp_w_0(BIT_W-2, 0)-1);
- // }
- // }
- // else{
- // temp_weight_true_0 = temp_w_0;
- // }
-
- // if(temp_in_sign_1==1){
- // if(temp_in_1==-(1<<(BIT_IN-1))){
- // temp_in_true_1 = 1<<(BIT_IN-1);
- // }
- // else{
- // temp_in_true_1(BIT_IN-1, BIT_IN-1) = 0;
- // temp_in_true_1(BIT_IN-2, 0) = ~(temp_in_1(BIT_IN-2, 0)-1);
- // }
- // }
- // else{
- // temp_in_true_1 = temp_in_1;
- // }
-
- // if(temp_weight_sign_1==1){
- // if(temp_w_1==-(1<<(BIT_W-1))){
- // temp_weight_true_1 = 1<<(BIT_W-1);
- // }
- // else{
- // temp_weight_true_1(BIT_W-1, BIT_W-1) = 0;
- // temp_weight_true_1(BIT_W-2, 0) = ~(temp_w_1(BIT_W-2, 0)-1);
- // }
- // }
- // else{
- // temp_weight_true_1 = temp_w_1;
- // }
-
- // temp_in(BIT_IN+BIT_W-1, 0) = temp_in_true_0;
- // temp_in(2*BIT_IN+BIT_W-1, BIT_IN+BIT_W) = temp_in_true_1;
- // temp_w(BIT_IN+BIT_W-1, 0) = temp_weight_true_0;
- // temp_w(BIT_IN+2*BIT_W-1, BIT_IN+BIT_W) = temp_weight_true_1;
-
- // ap_uint<(3*(BIT_IN+BIT_W))> result = temp_in * temp_w;
- // #pragma HLS RESOURCE variable=result core=DSP48
-
- // if(temp_in_0!=0 && temp_w_0!=0){
- // res_sign_0 = temp_in_sign_0 ^ temp_weight_sign_0;
- // }
- // else{
- // res_sign_0 = 0;
- // }
- // res_0(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_0;
-
- // if(temp_in_1!=0 && temp_w_1!=0){
- // res_sign_1 = temp_in_sign_1 ^ temp_weight_sign_1;
- // }
- // else{
- // res_sign_1 = 0;
- // }
- // res_1(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_1;
-
- // res_complement_0 = result(BIT_IN+BIT_W-2, 0);
- // if(res_sign_0==1){
- // res_complement_0 = ~res_complement_0 + 1;
- // }
- // res_0(BIT_IN+BIT_W-2, 0) = res_complement_0;
-
- // res_complement_1 = result(3*(BIT_IN+BIT_W)-2, 2*(BIT_IN+BIT_W));
- // if(res_sign_1==1){
- // res_complement_1 = ~res_complement_1 + 1;
- // }
- // res_1(BIT_IN+BIT_W-2, 0) = res_complement_1;
-
- // accumulation += (res_0 + res_1);
-
- // // cout << "temp_in_0: " << temp_in_0 << endl;
- // // cout << "temp_w_0: " << temp_w_0 << endl;
- // // cout << "temp_in_1: " << temp_in_1 << endl;
- // // cout << "temp_w_1: " << temp_w_1 << endl;
- // // cout << "res_0: " << res_0 << endl;
- // // cout << "res_1: " << res_1 << endl;
- // // cout << "accumulation: " << accumulation << endl;
- // }
- // else{
- // ap_int<BIT_W> temp_w = weight((i+2)*BIT_W-1, (i+1)*BIT_W);
- // ap_int<BIT_IN> temp_in = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);
- // ap_int<(BIT_W+BIT_IN)> result = temp_in * temp_w;
- // #pragma HLS RESOURCE variable=result core=DSP48
- // accumulation += result;
-
- // // cout << "temp_in: " << temp_in << endl;
- // // cout << "temp_w: " << temp_w << endl;
- // // cout << "res: " << result << endl;
- // // cout << "accumulation: " << accumulation << endl;
- // }
- // }
- // return accumulation;
- // }
-
- // template < int BIT_IN,
- // int BIT_W,
- // int BIT_ACC,
- // int SIMD>
- // ap_int<BIT_ACC> chi_vector_dot_product_unsigned(
- // ap_uint<BIT_IN*SIMD> in,
- // ap_uint<BIT_W*SIMD> weight){
-
- // ap_int<BIT_ACC> accumulation = 0;
-
- // ap_uint<(2*BIT_IN+BIT_W)> temp_in;
- // ap_uint<(2*BIT_W+BIT_IN)> temp_w;
-
- // ap_uint<BIT_W> temp_weight_true_0;
- // ap_uint<BIT_W> temp_weight_true_1;
-
- // ap_uint<BIT_IN+BIT_W-1> res_complement_0;
- // ap_uint<BIT_IN+BIT_W-1> res_complement_1;
-
- // ap_uint<1> res_sign_0;
- // ap_uint<1> res_sign_1;
-
- // ap_int<BIT_IN+BIT_W> res_0;
- // ap_int<BIT_IN+BIT_W> res_1;
-
- // for(int i=0; i<SIMD; i+=2){
- // #pragma HLS UNROLL
- // ap_int<BIT_W> temp_w_0 = weight((i+1)*BIT_W-1, i*BIT_W);
- // ap_int<BIT_W> temp_w_1 = weight((i+2)*BIT_W-1, (i+1)*BIT_W);
- // ap_uint<BIT_IN> temp_in_0 = in((i+1)*BIT_IN-1, i*BIT_IN);
- // ap_uint<BIT_IN> temp_in_1 = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);
-
- // // ap_uint<1> temp_weight_sign_0 = weight((i+1)*BIT_W-1, (i+1)*BIT_W-1);
- // // ap_uint<1> temp_weight_sign_1 = weight((i+2)*BIT_W-1, (i+2)*BIT_W-1);
-
- // if(weight((i+1)*BIT_W-1, (i+1)*BIT_W-1) == 1){
- // // if(temp_w_0==-(1<<(BIT_W-1))){
- // // temp_weight_true_0 = 1<<(BIT_W-1);
- // // }
- // // else{
- // temp_weight_true_0(BIT_W-1, BIT_W-1) = 0;
- // temp_weight_true_0(BIT_W-2, 0) = ~(temp_w_0(BIT_W-2, 0)-1);
- // // }
- // }
- // else{
- // temp_weight_true_0 = temp_w_0;
- // }
-
- // if(weight((i+2)*BIT_W-1, (i+2)*BIT_W-1) == 1){
- // // if(temp_w_1==-(1<<(BIT_W-1))){
- // // temp_weight_true_1 = 1<<(BIT_W-1);
- // // }
- // // else{
- // temp_weight_true_1(BIT_W-1, BIT_W-1) = 0;
- // temp_weight_true_1(BIT_W-2, 0) = ~(temp_w_1(BIT_W-2, 0)-1);
- // // }
- // }
- // else{
- // temp_weight_true_1 = temp_w_1;
- // }
-
- // if(temp_in_0 != 0){
- // res_sign_0 = weight((i+1)*BIT_W-1, (i+1)*BIT_W-1);
- // }
- // else{
- // res_sign_0 = 0;
- // }
-
- // if(temp_in_1 != 0){
- // res_sign_1 = weight((i+2)*BIT_W-1, (i+2)*BIT_W-1);
- // }
- // else{
- // res_sign_1 = 0;
- // }
-
- // temp_in(BIT_IN+BIT_W-1, 0) = temp_in_0;
- // temp_in(2*BIT_IN+BIT_W-1, BIT_IN+BIT_W) = temp_in_1;
- // temp_w(BIT_IN+BIT_W-1, 0) = temp_weight_true_0;
- // temp_w(BIT_IN+2*BIT_W-1, BIT_IN+BIT_W) = temp_weight_true_1;
-
- // ap_uint<(3*(BIT_W+BIT_IN))> result = temp_w * temp_in;
- // #pragma HLS RESOURCE variable=result core=DSP48
- // // #pragma HLS RESOURCE variable=result core=MulnS
-
- // res_complement_0 = result(BIT_IN+BIT_W-2, 0);
- // if(res_sign_0==1){
- // res_complement_0 = ~res_complement_0 + 1;
- // }
- // res_0(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_0;
- // res_0(BIT_IN+BIT_W-2, 0) = res_complement_0;
-
- // res_complement_1 = result(3*(BIT_IN+BIT_W)-2, 2*(BIT_IN+BIT_W));
- // if(res_sign_1==1){
- // res_complement_1 = ~res_complement_1 + 1;
- // }
- // res_1(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_1;
- // res_1(BIT_IN+BIT_W-2, 0) = res_complement_1;
-
- // accumulation += (res_0 + res_1);
- // }
- // return accumulation;
- // }
-
- // Signed x signed dot product of SIMD packed lanes.
- // Lane k of `in` (BIT_IN bits) and lane k of `weight` (BIT_W bits) are both read as signed
- // values; their products are accumulated into a BIT_ACC-bit signed sum, which is returned.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<BIT_ACC> chi_vector_dot_product(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     ap_int<BIT_ACC> sum = 0;
-
-     for(int lane=0; lane<SIMD; lane++){
-         #pragma HLS UNROLL
-         const int act_lsb = lane*BIT_IN;
-         const int wgt_lsb = lane*BIT_W;
-         ap_int<BIT_W>  wgt = weight(wgt_lsb+BIT_W-1, wgt_lsb);
-         ap_int<BIT_IN> act = in(act_lsb+BIT_IN-1, act_lsb);
-         ap_int<(BIT_W+BIT_IN)> prod = act * wgt;
-         sum += prod;
-     }
-     return sum;
- }
-
- // Unsigned-activation x signed-weight dot product of SIMD packed lanes.
- // Lane k of `in` is read as an unsigned BIT_IN-bit value and lane k of `weight` as a signed
- // BIT_W-bit value; the products are accumulated into a BIT_ACC-bit signed sum.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<BIT_ACC> chi_vector_dot_product_unsigned(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     ap_int<BIT_ACC> sum = 0;
-
-     for(int lane=0; lane<SIMD; lane++){
-         #pragma HLS UNROLL
-         const int act_lsb = lane*BIT_IN;
-         const int wgt_lsb = lane*BIT_W;
-         ap_int<BIT_W>   wgt = weight(wgt_lsb+BIT_W-1, wgt_lsb);
-         ap_uint<BIT_IN> act = in(act_lsb+BIT_IN-1, act_lsb);
-         ap_int<(BIT_W+BIT_IN)> prod = act * wgt;
-         sum += prod;
-     }
-     return sum;
- }
-
- // Lane-wise products, returned packed instead of summed.
- // Lane k of the result (BIT_W+BIT_IN bits wide) holds unsigned in[k] * signed weight[k].
- // Each multiplication is bound to the Mul_LUT core through the RESOURCE pragma.
- // BIT_ACC is not used by this function.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<(BIT_W+BIT_IN)*SIMD> desperate_vector_pot_product_signed(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     const int PROD_W = BIT_W+BIT_IN;
-     ap_int<(BIT_W+BIT_IN)*SIMD> packed = 0;
-
-     for(int lane=0; lane<SIMD; lane++){
-         #pragma HLS UNROLL
-         ap_int<BIT_W>   wgt = weight((lane+1)*BIT_W-1, lane*BIT_W);
-         ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
-         ap_int<(BIT_W+BIT_IN)> prod = act * wgt;
-         #pragma HLS RESOURCE variable=prod core=Mul_LUT
-         packed(PROD_W*(lane+1)-1, PROD_W*lane) = prod;
-     }
-     return packed;
- }
-
-
- // Lane-wise products written into a BIT_ACC-bit result at a stride of BIT_W+BIT_IN bits.
- // Lane k of `in` is read as unsigned, lane k of `weight` as signed; each multiplication is
- // bound to the Mul_LUT core through the RESOURCE pragma.
- // NOTE(review): the products are packed side by side, not added, and the written range
- // reaches bit (BIT_W+BIT_IN)*SIMD-1 - confirm BIT_ACC is at least that wide at every call site.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<BIT_ACC> desperate_vector_pot_product(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     const int PROD_W = BIT_W+BIT_IN;
-     ap_int<BIT_ACC> packed = 0;
-
-     for(int lane=0; lane<SIMD; lane++){
-         #pragma HLS UNROLL
-         ap_int<BIT_W>   wgt = weight((lane+1)*BIT_W-1, lane*BIT_W);
-         ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
-         ap_int<(BIT_W+BIT_IN)> prod = act * wgt;
-         #pragma HLS RESOURCE variable=prod core=Mul_LUT
-         packed(PROD_W*(lane+1)-1, PROD_W*lane) = prod;
-     }
-     return packed;
- }
-
-
-
- // Lane-wise products written into a BIT_ACC-bit result at a stride of BIT_W+BIT_IN bits.
- // Lane k of `in` is read as unsigned, lane k of `weight` as signed. No RESOURCE binding is
- // requested for the multiplications.
- // NOTE(review): the products are packed side by side, not added, and the written range
- // reaches bit (BIT_W+BIT_IN)*SIMD-1 - confirm BIT_ACC is at least that wide at every call site.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_int<BIT_ACC> tito_muler(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight){
-     const int PROD_W = BIT_W+BIT_IN;
-     ap_int<BIT_ACC> packed = 0;
-
-     for(int lane=0; lane<SIMD; lane++){
-         #pragma HLS UNROLL
-         ap_int<BIT_W>   wgt = weight((lane+1)*BIT_W-1, lane*BIT_W);
-         ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
-         ap_int<(BIT_W+BIT_IN)> prod = act * wgt;
-         packed(PROD_W*(lane+1)-1, PROD_W*lane) = prod;
-     }
-     return packed;
- }
-
-
- // Multiply one unsigned activation by three signed weights with a single wide multiplication,
- // handling the signs separately.
- //   1. The sign bit of each weight is recorded; a weight with the sign bit set is replaced by
- //      its two's-complement negation.
- //   2. The low BIT_W-1 bits of the three weights are packed into one operand at a stride of
- //      BIT_W+BIT_IN-1 bits and multiplied by in_0.
- //   3. Each (BIT_W+BIT_IN-1)-bit field of the product is written to out_0/out_1/out_2 and
- //      negated when the corresponding weight was negative.
- // BIT_ACC and SIMD are not used by this function.
- // NOTE(review): only BIT_W-1 bits of the negated weight are packed, so the most negative
- // weight value loses its top magnitude bit - confirm that value never occurs.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- void thwioo_tuple_sb(
-         ap_uint<BIT_IN>in_0,
-         ap_int<BIT_W> weight_0,
-         ap_int<BIT_W> weight_1,
-         ap_int<BIT_W> weight_2,
-         ap_int<(BIT_W+BIT_IN)>&out_0,
-         ap_int<(BIT_W+BIT_IN)>&out_1,
-         ap_int<(BIT_W+BIT_IN)>&out_2
-         ){
-     const int FIELD = BIT_W+BIT_IN-1;   // width of one product field
-     const ap_uint<1> real_one = 1;
-
-     ap_uint<1> neg_0 = weight_0(BIT_W - 1, BIT_W - 1);
-     ap_uint<1> neg_1 = weight_1(BIT_W - 1, BIT_W - 1);
-     ap_uint<1> neg_2 = weight_2(BIT_W - 1, BIT_W - 1);
-
-     // Bitwise complements; complement + 1 is the two's-complement negation.
-     ap_uint<BIT_W> inv_0 = (~weight_0);
-     ap_uint<BIT_W> inv_1 = (~weight_1);
-     ap_uint<BIT_W> inv_2 = (~weight_2);
-
-     if(neg_0) weight_0 = inv_0+real_one;
-     if(neg_1) weight_1 = inv_1+real_one;
-     if(neg_2) weight_2 = inv_2+real_one;
-
-     ap_uint<3*(BIT_W-1)+2*BIT_IN> packed_w = 0;
-     packed_w(BIT_W-2,0) = weight_0;
-     packed_w(2*(BIT_W-1)+BIT_IN-1,FIELD) = weight_1;
-     packed_w(3*(BIT_W-1)+2*BIT_IN-1,2*FIELD) = weight_2;
-
-     ap_int<3*(BIT_W+BIT_IN-1)> wide = in_0 * packed_w;
-
-     out_0 = wide(1*FIELD-1,0*FIELD);
-     out_1 = wide(2*FIELD-1,1*FIELD);
-     out_2 = wide(3*FIELD-1,2*FIELD);
-
-     // Restore the signs.
-     if(neg_0) out_0 = (ap_int<BIT_W+BIT_IN>)(~out_0+1);
-     if(neg_1) out_1 = (ap_int<BIT_W+BIT_IN>)(~out_1+1);
-     if(neg_2) out_2 = (ap_int<BIT_W+BIT_IN>)(~out_2+1);
-
- }
-
- // Multiply one unsigned activation by three signed weights with a single wide multiplication.
- // Method, as implemented below:
- //   1. The low BIT_W-1 bits of each weight (sign bit stripped) are packed into temp_w at a
- //      stride of BIT_W+BIT_IN-1 bits.
- //   2. in_0 * temp_w yields, in each (BIT_W+BIT_IN-1)-bit field, in_0 times the stripped weight.
- //   3. A two's-complement weight with its sign bit set equals its low bits minus 2^(BIT_W-1),
- //      so for each such weight (-in_0) << (BIT_W-1) is added to the field value
- //      (arithmetic wraps at BIT_W+BIT_IN bits).
- // Results are returned through out_0/out_1/out_2. BIT_ACC and SIMD are not used here.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- void thwioo_tuple(
-         ap_uint<BIT_IN>in_0,
-         ap_int<BIT_W> weight_0,
-         ap_int<BIT_W> weight_1,
-         ap_int<BIT_W> weight_2,
-         ap_int<(BIT_W+BIT_IN)>&out_0,
-         ap_int<(BIT_W+BIT_IN)>&out_1,
-         ap_int<(BIT_W+BIT_IN)>&out_2
-         ){
-
-     ap_int<3*(BIT_W+BIT_IN-1)>result;
-
-     ap_uint<1> sign_0 = weight_0(BIT_W - 1, BIT_W - 1);
-     ap_uint<1> sign_1 = weight_1(BIT_W - 1, BIT_W - 1);
-     ap_uint<1> sign_2 = weight_2(BIT_W - 1, BIT_W - 1);
-
-     // Replicate every sign bit across a full output word so it can serve as an AND mask.
-     ap_uint<BIT_W+BIT_IN> all_sign0;
-     ap_uint<BIT_W+BIT_IN> all_sign1;
-     ap_uint<BIT_W+BIT_IN> all_sign2;
-
-     for(int i=0;i<BIT_W+BIT_IN;i++){
-         #pragma hls unroll
-         all_sign0(i,i) = sign_0;
-         all_sign1(i,i) = sign_1;
-         all_sign2(i,i) = sign_2;
-     }
-
-     ap_uint<BIT_W+BIT_IN> in_neg;
-
-     ap_uint<BIT_W+BIT_IN> in_mod_0;
-     ap_uint<BIT_W+BIT_IN> in_mod_1;
-     ap_uint<BIT_W+BIT_IN> in_mod_2;
-
-     // All-ones mask derived from the word width (rather than a fixed literal) so that the
-     // negation below is correct for any BIT_W+BIT_IN; it equals 4095 for a 12-bit word.
-     const ap_uint<BIT_W+BIT_IN> full = ~ap_uint<BIT_W+BIT_IN>(0);
-
-     // Two's-complement negation of in_0 in BIT_W+BIT_IN bits.
-     in_neg = (in_0 ^ full) + 1;
-
-     ap_uint<3*(BIT_W-1)+2*BIT_IN> temp_w = 0;
-
-     temp_w(BIT_W-2,0) = weight_0;
-     temp_w(2*(BIT_W-1)+BIT_IN-1,BIT_W-1+BIT_IN) = weight_1;
-     temp_w(3*(BIT_W-1)+2*BIT_IN-1,2*(BIT_W-1+BIT_IN)) = weight_2;
-
-     result = in_0 * temp_w;
-
-     out_0 = result(1*(BIT_W+BIT_IN-1)-1,0*(BIT_W+BIT_IN-1));
-     out_1 = result(2*(BIT_W+BIT_IN-1)-1,1*(BIT_W+BIT_IN-1));
-     out_2 = result(3*(BIT_W+BIT_IN-1)-1,2*(BIT_W+BIT_IN-1));
-
-     // Correction term for weights whose sign bit is set: -in_0 * 2^(BIT_W-1).
-     in_neg = in_neg << (BIT_W - 1);
-
-     in_mod_0 = in_neg & all_sign0;
-     in_mod_1 = in_neg & all_sign1;
-     in_mod_2 = in_neg & all_sign2;
-
-     out_0 += in_mod_0;
-     out_1 += in_mod_1;
-     out_2 += in_mod_2;
-
- }
- // Multiply one unsigned activation by three signed weights using three separate products.
- // Each product is bound to the Mul_LUT core through a RESOURCE pragma, and inlining of this
- // function is switched off. Results are returned through out_0/out_1/out_2.
- // BIT_ACC and SIMD are not used by this function.
- // (The packed single-multiplier formulation of the same operation is thwioo_tuple_sb.)
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- void thwioo_tuple_dcm(
-         ap_uint<BIT_IN>in_0,
-         ap_int<BIT_W> weight_0,
-         ap_int<BIT_W> weight_1,
-         ap_int<BIT_W> weight_2,
-         ap_int<(BIT_W+BIT_IN)>&out_0,
-         ap_int<(BIT_W+BIT_IN)>&out_1,
-         ap_int<(BIT_W+BIT_IN)>&out_2
-         ){
-     #pragma hls INLINE off
-
-     out_0 = in_0 * weight_0;
-     #pragma HLS RESOURCE variable=out_0 core=Mul_LUT
-
-     out_1 = in_0 * weight_1;
-     #pragma HLS RESOURCE variable=out_1 core=Mul_LUT
-
-     out_2 = in_0 * weight_2;
-     #pragma HLS RESOURCE variable=out_2 core=Mul_LUT
- }
-
-
- // Three dot products that share one activation vector.
- // For every lane, thwioo_tuple multiplies the lane's unsigned activation by the matching
- // signed lane of weight_0, weight_1 and weight_2; the products are accumulated in three
- // BIT_ACC-bit signed sums.
- // Returns the sums packed into one word: bits [BIT_ACC-1:0] for weight_0,
- // [2*BIT_ACC-1:BIT_ACC] for weight_1 and [3*BIT_ACC-1:2*BIT_ACC] for weight_2.
- template <int BIT_IN, int BIT_W, int BIT_ACC, int SIMD>
- ap_uint<BIT_ACC*3> tihoo_mutiplier(
-         ap_uint<BIT_IN*SIMD> in,
-         ap_uint<BIT_W*SIMD> weight_0,
-         ap_uint<BIT_W*SIMD> weight_1,
-         ap_uint<BIT_W*SIMD> weight_2
-         ){
-     ap_int<BIT_ACC> sum_0 = 0;
-     ap_int<BIT_ACC> sum_1 = 0;
-     ap_int<BIT_ACC> sum_2 = 0;
-
-     for(int lane=0; lane<SIMD; lane++){
-         #pragma HLS UNROLL
-         const int w_msb = (lane+1)*BIT_W-1;
-         const int w_lsb = lane*BIT_W;
-
-         ap_int<BIT_W> wgt_0 = weight_0(w_msb, w_lsb);
-         ap_int<BIT_W> wgt_1 = weight_1(w_msb, w_lsb);
-         ap_int<BIT_W> wgt_2 = weight_2(w_msb, w_lsb);
-
-         ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
-
-         ap_int<(BIT_W+BIT_IN)> prod_0;
-         ap_int<(BIT_W+BIT_IN)> prod_1;
-         ap_int<(BIT_W+BIT_IN)> prod_2;
-         thwioo_tuple< BIT_IN,BIT_W,BIT_ACC,SIMD>(act,wgt_0,wgt_1,wgt_2,prod_0,prod_1,prod_2);
-
-         sum_0 += prod_0;
-         sum_1 += prod_1;
-         sum_2 += prod_2;
-     }
-
-     ap_uint<BIT_ACC*3> packed;
-     packed(BIT_ACC - 1,0) = sum_0;
-     packed(2*BIT_ACC - 1,BIT_ACC) = sum_1;
-     packed(3*BIT_ACC - 1,2*BIT_ACC) = sum_2;
-
-     return packed;
-
- }
-
- // Matrix-vector core of a 3x3 convolution operating on a stream of window words.
- //
- // For each of VECT_NUMS*reps output positions and each output fold (PE output channels):
- //   * on the first output fold, 9*INPUT_FOLD words are read from `in` (tap index outer,
- //     input fold inner) and cached in input_temp_arr; later folds replay the cached words;
- //   * every word is multiplied lane-wise with the matching weights of each PE by
- //     chi_vector_dot_product_unsigned and accumulated in a 20-bit accumulator per PE;
- //   * after all taps and input folds, acc * alpha + bias is formed per PE in BIT_TMP bits and
- //     its low BIT_OUT bits are placed in that PE's lane of the output word.
- //
- // weights is indexed [output fold][input fold][tap]; alpha and bias are indexed
- // [output fold][PE]. FL_IN, FL_OUT and FL_ALPHA are not used by this function.
- template < int NUM_IN,
-         int NUM_OUT,
-
-         int BIT_IN,
-         int FL_IN,
-         int BIT_OUT,
-         int FL_OUT,
-
-         int BIT_W,
-         int BIT_ALPHA,
-         int FL_ALPHA,
-         int BIT_BIAS,
-         int BIT_TMP,
-
-         int SIMD,
-         int PE,
-         int VECT_NUMS>
- void conv3x3( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-         const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/9/SIMD)][9],
-         const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-         const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-         hls::stream<ap_uint<BIT_OUT*PE> >& out,
-         ap_uint<10> reps){
-
-     #pragma HLS DATAFLOW
-
-     const unsigned INPUT_FOLD = NUM_IN/SIMD/9;  /* NUM_IN / SIMD / 9 window taps */
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;    /* NUM_OUT / PE */
-     const unsigned BIT_ACC = 20;
-
-     // Cache of the current window so that output folds after the first can replay it.
-     ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD][9];
-     #pragma HLS ARRAY_PARTITION variable=input_temp_arr complete dim=2
-
-     ap_uint<BIT_IN*SIMD> input_temp[9];
-     #pragma HLS ARRAY_PARTITION variable=input_temp complete dim=0
-     ap_int<BIT_ACC> acc[PE];
-     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
-
-     for(int i=0; i<VECT_NUMS * reps; i++){
-         for(int out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
-             for(int ay = 0;ay < 9;ay++){
-                 for(int in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
-                     #pragma HLS PIPELINE II=1
-                     if(out_fold_cnt == 0){
-                         // First output fold: consume the stream and fill the cache.
-                         input_temp[ay] = in.read();
-                         input_temp_arr[in_fold_cnt][ay] = input_temp[ay];
-                     }else{
-                         // Later folds: reload all nine taps of this input fold from the cache.
-                         for(int tap = 0;tap<9;tap++){
-                             input_temp[tap] = input_temp_arr[in_fold_cnt][tap];
-                         }
-                     }
-
-                     // Start of a new output word: clear the accumulators.
-                     if((in_fold_cnt == 0)&&(ay == 0)){
-                         for(int m=0; m<PE; m++){
-                             #pragma HLS UNROLL
-                             acc[m] = 0;
-                         }
-                     }
-                     ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt][ay];
-                     for(int pe = 0 ; pe < PE; pe++){
-                         #pragma HLS UNROLL
-                         ap_uint <BIT_W*SIMD> weight_pe_loop = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
-                         acc[pe] += chi_vector_dot_product_unsigned<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp[ay], weight_pe_loop);
-                     }
-
-                 }
-             }
-             ap_uint<BIT_OUT*PE> out_buf;
-             for(int p=0; p<PE; p++){
-                 #pragma HLS UNROLL
-                 ap_int<BIT_TMP> output_temp = acc[p] * alpha[out_fold_cnt][p] + bias[out_fold_cnt][p];
-                 out_buf((p+1)*BIT_OUT-1, p*BIT_OUT) = output_temp;
-             }
-
-             out.write(out_buf);
-         }
-     }
- }
-
-
- // Complete 3x3 convolution layer assembled from three streaming stages under DATAFLOW:
- //   padding<..., 1>  ->  SWU_DW<3, 1, ...> sliding-window unit  ->  conv3x3.
- // The sliding-window unit is instantiated for a (ROW_IN+2) x (COL_IN+2) frame and conv3x3 for
- // ROW_IN*COL_IN output positions with CH_IN*3*3 inputs per position.
- // weights, alpha, bias and reps are forwarded unchanged to the stages that take them.
- template < int ROW_IN,
-         int COL_IN,
-         int CH_IN,
-         int BIT_IN,
-         int FL_IN,
-
-         int CH_OUT,
-         int BIT_OUT,
-         int FL_OUT,
-
-         int BIT_W,
-         int BIT_ALPHA,
-         int FL_ALPHA,
-         int BIT_BIAS,
-         int BIT_TMP,
-
-         int SIMD,
-         int PE>
- void conv3x3_layer_crc( hls::stream<ap_uint<SIMD*BIT_IN> >& in,
-         const ap_uint<BIT_W * SIMD * PE> weights[CH_OUT / PE][CH_IN / SIMD][9],
-         const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-         const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-         hls::stream<ap_uint<BIT_OUT*PE> >& out,
-         ap_uint<10> reps){
-
-     #pragma HLS DATAFLOW
-
-     // Stage 1: padding.
-     stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
-     padding<ROW_IN, COL_IN, CH_IN, SIMD, BIT_IN, 1>(in, padding_out, reps);
-
-     // Stage 2: sliding-window unit over the enlarged frame.
-     stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
-     SWU_DW<3, 1, ROW_IN + 2, COL_IN + 2, CH_IN, SIMD, SIMD, BIT_IN> (padding_out, swu_out, reps);
-
-     // Stage 3: multiply-accumulate core writing straight to the layer output.
-     conv3x3<CH_IN*3*3, CH_OUT, BIT_IN, FL_IN, BIT_OUT, FL_OUT, BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP, SIMD, PE, ROW_IN*COL_IN >(
-         swu_out, weights, alpha, bias, out, reps);
- }
-
-
- // 1x1 convolution core that evaluates three PEs per tihoo_mutiplier call and streams out
- // the raw accumulators.
- //
- // For each of VECT_NUMS*reps positions and each output fold:
- //   * on the first output fold, INPUT_FOLD words are read from `in` and cached; later folds
- //     replay the cache;
- //   * PEs are processed in groups of three: the weight slices of PEs pe, pe+1, pe+2 are passed
- //     to tihoo_mutiplier together with the input word and the three BIT_ACC-bit partial sums
- //     are added to acc[pe], acc[pe+1], acc[pe+2];
- //   * after the last input fold, acc[0..PE-1] are packed (BIT_ACC bits each) and written.
- // Despite the name, no scaling, bias or clamping is applied in this function.
- //
- // acc has PE+2 entries so that acc[pe+1]/acc[pe+2] stay in bounds in the last group when PE is
- // not a multiple of three; only acc[0..PE-1] are cleared and written out.
- // FL_IN, BIT_OUT, FL_OUT, BIT_ALPHA, FL_ALPHA, BIT_BIAS and BIT_TMP are not used here.
- template < int NUM_IN,
-         int NUM_OUT,
-
-         int BIT_IN,
-         int FL_IN,
-         int BIT_OUT,
-         int FL_OUT,
-
-         int BIT_W,
-         int BIT_ALPHA,
-         int FL_ALPHA,
-         int BIT_BIAS,
-         int BIT_TMP,
-
-         int SIMD,
-         int PE,
-         int VECT_NUMS ,
-         int BIT_ACC>
- void conv1x1_relu_t3_pf( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-         const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/SIMD)],
-         hls::stream<ap_uint<BIT_ACC*PE> >& out,
-         ap_uint<10> reps){
-
-     #pragma HLS DATAFLOW
-
-     const unsigned INPUT_FOLD = NUM_IN/SIMD;   /* NUM_IN / SIMD */
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;   /* NUM_OUT / PE */
-
-     ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
-     #pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_LUTRAM
-
-     ap_uint<BIT_IN*SIMD> input_temp;
-     ap_int<BIT_ACC> acc[PE+2];
-
-     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
-
-     for(ap_uint<25> i=0; i<VECT_NUMS * reps; i++){
-         for(ap_uint<10> out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
-             afwqe:for(ap_uint<10> in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
-                 #pragma HLS PIPELINE II=1
-                 if(out_fold_cnt == 0){
-                     input_temp = in.read();
-                     input_temp_arr[in_fold_cnt] = input_temp;
-                 }else{
-                     input_temp = input_temp_arr[in_fold_cnt];
-                 }
-
-                 if(in_fold_cnt == 0){
-                     for(ap_uint<8> m=0; m<PE; m++){
-                         #pragma HLS UNROLL
-                         acc[m] = 0;
-                     }
-                 }
-
-                 ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt];
-                 loopqwe:for(ap_uint<7> pe = 0 ; pe < PE; pe +=3){
-                     #pragma HLS UNROLL
- #ifdef DEBUG
-                     //cout<<"PE:"<<pe<<endl;
- #endif
-                     ap_uint <BIT_W*SIMD> weight_pe_loop_0 = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
-                     // Zero-initialised so that a group extending past the last PE feeds defined
-                     // (zero) weights to the multiplier instead of indeterminate values.
-                     ap_uint <BIT_W*SIMD> weight_pe_loop_1 = 0;
-                     ap_uint <BIT_W*SIMD> weight_pe_loop_2 = 0;
-                     if(pe+1<PE) weight_pe_loop_1 = weight_temp((pe+2)*BIT_W*SIMD-1,(pe+1)*BIT_W*SIMD);
-                     if(pe+2<PE) weight_pe_loop_2 = weight_temp((pe+3)*BIT_W*SIMD-1,(pe+2)*BIT_W*SIMD);
-                     ap_int<BIT_ACC*3> o;
-
-                     o = tihoo_mutiplier<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_pe_loop_0,weight_pe_loop_1,weight_pe_loop_2);
-                     acc[pe] += o(BIT_ACC - 1,0);
-                     acc[pe+1] += o(2*BIT_ACC - 1,BIT_ACC);
-                     acc[pe+2] += o(3*BIT_ACC - 1,2*BIT_ACC);
-
-                 }
-             }
-             ap_uint<BIT_ACC*PE> out_buf;
-             for(ap_uint<8> p=0; p<PE; p++){
-                 #pragma HLS UNROLL
-                 out_buf((p+1)*BIT_ACC-1, p*BIT_ACC) = acc[p];
-             }
- #ifdef DEBUG
-             if(i==3*320){
-                 cout<< out_buf << endl;
-             }
- #endif
-             out.write(out_buf);
-         }
-     }
- }
-
-
- // 1x1 convolution core with per-channel scale/bias, ReLU and requantisation.
- //
- // For each of VECT_NUMS*reps positions and each output fold:
- //   * on the first output fold, INPUT_FOLD words are read from `in` and cached; later folds
- //     replay the cache;
- //   * each PE accumulates tioo_mutiplier_sb(input word, its weight slice) in a 20-bit sum;
- //   * after the last input fold, acc * alpha + bias is formed per PE in BIT_TMP bits, values
- //     with the top bit set are replaced by zero, and truncate_unsigned reduces the result
- //     from FL_IN+FL_ALPHA to FL_OUT fractional bits before it is packed into the output word.
- // weights is indexed [output fold][input fold]; alpha and bias are indexed [output fold][PE].
- template < int NUM_IN,
-         int NUM_OUT,
-
-         int BIT_IN,
-         int FL_IN,
-         int BIT_OUT,
-         int FL_OUT,
-
-         int BIT_W,
-         int BIT_ALPHA,
-         int FL_ALPHA,
-         int BIT_BIAS,
-         int BIT_TMP,
-
-         int SIMD,
-         int PE,
-         int VECT_NUMS >
- void conv1x1_relu( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-         const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/SIMD)],
-         const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-         const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-         hls::stream<ap_uint<BIT_OUT*PE> >& out,
-         ap_uint<10> reps ){
-
-     #pragma HLS DATAFLOW
-
-     const unsigned INPUT_FOLD = NUM_IN/SIMD;   /* NUM_IN / SIMD */
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;   /* NUM_OUT / PE */
-     const unsigned BIT_ACC = 20;
-
-     ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
-     #pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_LUTRAM
-
-     ap_uint<BIT_IN*SIMD> cur_word;
-     ap_int<BIT_ACC> acc[PE];
-
-     for(int vec=0; vec<VECT_NUMS * reps; vec++){
-         for(int ofold=0;ofold<OUTPUT_FOLD;ofold++){
-             for(int ifold=0;ifold<INPUT_FOLD;ifold++){
-                 #pragma HLS PIPELINE II=1
-                 if(ofold != 0){
-                     // Replay the cached word.
-                     cur_word = input_temp_arr[ifold];
-                 }else{
-                     // First output fold: consume the stream and fill the cache.
-                     cur_word = in.read();
-                     input_temp_arr[ifold] = cur_word;
-                 }
-
-                 // Start of a new output word: clear the accumulators.
-                 if(ifold == 0){
-                     for(int m=0; m<PE; m++){
-                         #pragma HLS UNROLL
-                         acc[m] = 0;
-                     }
-                 }
-
-                 ap_uint<BIT_W*SIMD*PE> weight_row = weights[ofold][ifold];
-                 for(int pe = 0 ; pe < PE; pe++){
-                     #pragma HLS UNROLL
-                     ap_uint <BIT_W*SIMD> weight_slice = weight_row((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
-                     acc[pe] += tioo_mutiplier_sb<BIT_IN, BIT_W, BIT_ACC, SIMD>(cur_word, weight_slice);
-                 }
-             }
-
-             ap_uint<BIT_OUT*PE> out_buf;
-             for(int p=0; p<PE; p++){
-                 #pragma HLS UNROLL
-                 ap_int<BIT_TMP> output_temp = acc[p] * alpha[ofold][p] + bias[ofold][p];
-                 #pragma HLS RESOURCE variable=output_temp core=DSP48
-
-                 // ReLU: a set top bit means negative, which is replaced by zero.
-                 const bool non_negative = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0);
-                 if(!non_negative){
-                     output_temp = (ap_int<BIT_TMP>)0;
-                 }
-
-                 ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
-                 out_buf((p+1)*BIT_OUT-1, p*BIT_OUT) = output_uint;
-             }
-
-             out.write(out_buf);
-         }
-     }
- }
-
-
-
-
- /**
-  * 1x1 convolution that emits the raw accumulators instead of finished activations.
-  *
-  * Same loop structure as a folded matrix-vector product: for each of the
-  * VECT_NUMS * reps input vectors, NUM_IN/SIMD words are read from `in` during the
-  * first output fold, cached, and replayed for the other folds. After each output
-  * fold the PE accumulators (BIT_ACC bits each) are packed, lane 0 in the lowest
-  * bits, and written to `out`. No scale, offset or ReLU is applied here.
-  *
-  * @param in      input stream, one word of BIT_IN*SIMD bits per input fold
-  * @param weights packed weights, indexed [output fold][input fold]
-  * @param out     output stream, one word of BIT_ACC*PE bits per output fold
-  * @param reps    multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void conv1x1_relu_pf(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                      const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/SIMD)],
-                      hls::stream<ap_uint<BIT_ACC*PE> >& out,
-                      ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INPUT_FOLD  = NUM_IN/SIMD;   // words per input vector
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;    // words per output vector
-
-     // Cache of the current input vector, filled during output fold 0.
-     ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
-     #pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_BRAM
-
-     unsigned tile = 0;
-
-     ap_uint<BIT_IN*SIMD> cur_word;
-     ap_int<BIT_ACC> lane_acc[PE];
-
-     for (int vec = 0; vec < VECT_NUMS * reps; ++vec) {
-         for (int of = 0; of < OUTPUT_FOLD; ++of) {
-             for (int nf = 0; nf < INPUT_FOLD; ++nf) {
-                 #pragma HLS PIPELINE II=1
-                 // The stream is consumed only once per vector.
-                 if (of != 0) {
-                     cur_word = input_temp_arr[nf];
-                 } else {
-                     cur_word = in.read();
-                     input_temp_arr[nf] = cur_word;
-                 }
-
-                 // A new output fold starts with cleared accumulators.
-                 if (nf == 0) {
-                     for (int lane = 0; lane < PE; ++lane) {
-                         #pragma HLS UNROLL
-                         lane_acc[lane] = 0;
-                     }
-                 }
-
-                 ap_uint<BIT_W*SIMD*PE> fold_weights = weights[of][nf];
-                 for (int lane = 0; lane < PE; ++lane) {
-                     #pragma HLS UNROLL
-                     const int w_lo = lane*BIT_W*SIMD;
-                     ap_uint<BIT_W*SIMD> lane_weights = fold_weights(w_lo + BIT_W*SIMD - 1, w_lo);
-                     lane_acc[lane] += tioo_mutiplier_sb<BIT_IN, BIT_W, BIT_ACC, SIMD>(cur_word, lane_weights);
-                 }
-             }
-
-             // Pack the accumulators, lane 0 in the least significant bits.
-             ap_uint<BIT_ACC*PE> packed_acc;
-             for (int lane = 0; lane < PE; ++lane) {
-                 #pragma HLS UNROLL
-                 const int a_lo = lane*BIT_ACC;
-                 packed_acc(a_lo + BIT_ACC - 1, a_lo) = lane_acc[lane];
-             }
-
-             out.write(packed_acc);
-         }
-     }
- }
-
- /**
-  * Post-processing of packed accumulators: scale, offset, ReLU and narrowing.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) words from `in`. Each word carries PE
-  * signed accumulators of BIT_ACC bits, lane 0 in the lowest bits. Every lane is
-  * turned into acc * alpha + bias, negative values become zero, and the result is
-  * narrowed with truncate_unsigned to BIT_OUT bits. The PE results are packed in
-  * the same lane order and written to `out`. The lane loop is pipelined.
-  *
-  * @param in    stream of packed accumulators (BIT_ACC*PE bits per word)
-  * @param alpha multiplier per lane, indexed [output fold][lane]
-  * @param bias  addend per lane, indexed [output fold][lane]
-  * @param out   stream of packed results (BIT_OUT*PE bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tuple(hls::stream<ap_uint<BIT_ACC*PE> >& in,
-                     const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                     const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                     hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                     ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> fold = 0;
-
-     for (int vec = 0; vec < VECT_NUMS * reps; ++vec) {
-         for (fold = 0; fold < OUTPUT_FOLD; fold++) {
-             ap_uint<BIT_ACC*PE> packed_acc = in.read();
-             ap_uint<BIT_OUT*PE> packed_out;
-             for (int lane = 0; lane < PE; ++lane) {
-                 #pragma HLS PIPELINE
-                 const int a_lo = lane*BIT_ACC;
-                 const int o_lo = lane*BIT_OUT;
-
-                 ap_int<BIT_ACC> lane_acc = packed_acc(a_lo + BIT_ACC - 1, a_lo);
-                 ap_int<BIT_TMP> output_temp_0 = lane_acc * alpha[fold][lane] + bias[fold][lane];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-
-                 // ReLU: a set sign bit means the value is negative.
-                 ap_int<BIT_TMP> rectified = output_temp_0;
-                 if (output_temp_0[BIT_TMP-1]) {
-                     rectified = 0;
-                 }
-
-                 ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(rectified);
-                 packed_out(o_lo + BIT_OUT - 1, o_lo) = narrowed;
-             }
-             out.write(packed_out);
-         }
-     }
- }
-
- /**
-  * Post-processing of packed accumulators using a shift register instead of
-  * per-lane range selects: scale, offset, ReLU and narrowing.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) words from `in`. Each word carries PE
-  * signed accumulators of BIT_ACC bits. In every lane iteration the lowest
-  * BIT_ACC bits are taken as the current accumulator and the word is then shifted
-  * right by BIT_ACC, so iteration `pe` processes lane `pe`. Each lane is turned
-  * into acc * alpha + bias, negative values become zero, and the result is
-  * narrowed with truncate_unsigned to BIT_OUT bits before being packed into `out`.
-  *
-  * Fix: the original statement `in_buf >> BIT_ACC;` discarded its result, so the
-  * word was never advanced and every lane reused lane 0's accumulator.
-  *
-  * @param in    stream of packed accumulators (BIT_ACC*PE bits per word)
-  * @param alpha multiplier per lane, indexed [output fold][lane]
-  * @param bias  addend per lane, indexed [output fold][lane]
-  * @param out   stream of packed results (BIT_OUT*PE bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tuple_sh(hls::stream<ap_uint<BIT_ACC*PE> >& in,
-                        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                        hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                        ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> out_fold_cnt = 0;
-
-     for(int i=0; i<VECT_NUMS * reps; i++){
-         for(out_fold_cnt = 0; out_fold_cnt < OUTPUT_FOLD; out_fold_cnt++){
-             ap_uint<BIT_ACC*PE> in_buf;
-             in_buf = in.read();
-             ap_uint<BIT_OUT*PE> out_buf;
-             for(int pe=0; pe<PE; pe++){
-                 #pragma HLS PIPELINE
-                 // Current lane sits in the low bits; advance to the next lane afterwards.
-                 ap_int<BIT_ACC> in_single = in_buf(BIT_ACC-1, 0);
-                 in_buf >>= BIT_ACC;
-
-                 ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-                 // ReLU: keep the value when the sign bit is clear, otherwise output zero.
-                 ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
-                 ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);
-
-                 out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
-             }
-             out.write(out_buf);
-         }
-     }
- }
-
- /**
-  * Post-processing of accumulators that arrive one per stream word.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) * PE words from `in`. Each word is
-  * interpreted as a signed BIT_ACC-bit accumulator, turned into
-  * acc * alpha + bias, clamped to zero when negative, narrowed with
-  * truncate_unsigned to BIT_OUT bits and written to `out` as its own word.
-  *
-  * @param in    stream of single accumulators (BIT_ACC bits per word)
-  * @param alpha multiplier, indexed [output fold][lane]
-  * @param bias  addend, indexed [output fold][lane]
-  * @param out   stream of single results (BIT_OUT bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tuple_one(hls::stream<ap_uint<BIT_ACC> >& in,
-                         const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                         const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                         hls::stream<ap_uint<BIT_OUT> >& out,
-                         ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> fold = 0;
-     ap_int<BIT_ACC> lane_acc;
-
-     for (int vec = 0; vec < VECT_NUMS * reps; ++vec) {
-         for (fold = 0; fold < OUTPUT_FOLD; fold++) {
-             for (ap_uint<7> lane = 0; lane < PE; lane++) {
-                 #pragma HLS PIPELINE
-                 lane_acc = in.read();
-                 ap_int<BIT_TMP> output_temp_0 = lane_acc * alpha[fold][lane] + bias[fold][lane];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-
-                 // ReLU: a set sign bit means the value is negative.
-                 ap_int<BIT_TMP> rectified = output_temp_0;
-                 if (output_temp_0[BIT_TMP-1]) {
-                     rectified = 0;
-                 }
-
-                 ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(rectified);
-                 out.write(narrowed);
-             }
-         }
-     }
- }
- /**
-  * Post-processing of packed accumulators, two lanes per unrolled step.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) words from `in`. Each word carries PE
-  * signed accumulators of BIT_ACC bits, lane 0 in the lowest bits. Every lane is
-  * turned into acc * alpha + bias, negative values become zero, and the result is
-  * narrowed with truncate_unsigned to BIT_OUT bits and packed into `out`.
-  * The fold loop requests an initiation interval of PE/2 and the lane loop is
-  * unrolled by a factor of 2.
-  *
-  * @param in    stream of packed accumulators (BIT_ACC*PE bits per word)
-  * @param alpha multiplier per lane, indexed [output fold][lane]
-  * @param bias  addend per lane, indexed [output fold][lane]
-  * @param out   stream of packed results (BIT_OUT*PE bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tup2(hls::stream<ap_uint<BIT_ACC*PE> >& in,
-                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                    const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                    ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> fold = 0;
-
-     for (int vec = 0; vec < VECT_NUMS * reps; ++vec) {
-         for (fold = 0; fold < OUTPUT_FOLD; fold++) {
-             #pragma HLS PIPELINE II=PE/2
-             ap_uint<BIT_ACC*PE> packed_acc = in.read();
-             ap_uint<BIT_OUT*PE> packed_out;
-             for (int lane = 0; lane < PE; ++lane) {
-                 #pragma HLS UNROLL factor=2
-                 const int a_lo = lane*BIT_ACC;
-                 const int o_lo = lane*BIT_OUT;
-
-                 ap_int<BIT_ACC> lane_acc = packed_acc(a_lo + BIT_ACC - 1, a_lo);
-                 ap_int<BIT_TMP> output_temp_0 = lane_acc * alpha[fold][lane] + bias[fold][lane];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-
-                 // ReLU: a set sign bit means the value is negative.
-                 ap_int<BIT_TMP> rectified = output_temp_0;
-                 if (output_temp_0[BIT_TMP-1]) {
-                     rectified = 0;
-                 }
-
-                 ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(rectified);
-                 packed_out(o_lo + BIT_OUT - 1, o_lo) = narrowed;
-             }
-             out.write(packed_out);
-         }
-     }
- }
-
- /**
-  * Post-processing of packed accumulators, six lanes per unrolled step.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) words from `in`. Each word carries PE
-  * signed accumulators of BIT_ACC bits, lane 0 in the lowest bits. Every lane is
-  * turned into acc * alpha + bias, negative values become zero, and the result is
-  * narrowed with truncate_unsigned to BIT_OUT bits and packed into `out`.
-  * The fold loop requests an initiation interval of PE/6 and the lane loop is
-  * unrolled by a factor of 6.
-  *
-  * @param in    stream of packed accumulators (BIT_ACC*PE bits per word)
-  * @param alpha multiplier per lane, indexed [output fold][lane]
-  * @param bias  addend per lane, indexed [output fold][lane]
-  * @param out   stream of packed results (BIT_OUT*PE bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tup6(hls::stream<ap_uint<BIT_ACC*PE> >& in,
-                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                    const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                    ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> fold = 0;
-
-     for (int vec = 0; vec < VECT_NUMS * reps; ++vec) {
-         for (fold = 0; fold < OUTPUT_FOLD; fold++) {
-             #pragma HLS PIPELINE II=PE/6
-             ap_uint<BIT_ACC*PE> packed_acc = in.read();
-             ap_uint<BIT_OUT*PE> packed_out;
-             for (int lane = 0; lane < PE; ++lane) {
-                 #pragma HLS UNROLL factor=6
-                 const int a_lo = lane*BIT_ACC;
-                 const int o_lo = lane*BIT_OUT;
-
-                 ap_int<BIT_ACC> lane_acc = packed_acc(a_lo + BIT_ACC - 1, a_lo);
-                 ap_int<BIT_TMP> output_temp_0 = lane_acc * alpha[fold][lane] + bias[fold][lane];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-
-                 // ReLU: a set sign bit means the value is negative.
-                 ap_int<BIT_TMP> rectified = output_temp_0;
-                 if (output_temp_0[BIT_TMP-1]) {
-                     rectified = 0;
-                 }
-
-                 ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(rectified);
-                 packed_out(o_lo + BIT_OUT - 1, o_lo) = narrowed;
-             }
-             out.write(packed_out);
-         }
-     }
- }
-
- /**
-  * Post-processing of packed accumulators, three lanes per unrolled step.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) words from `in`. Each word carries PE
-  * signed accumulators of BIT_ACC bits, lane 0 in the lowest bits. Every lane is
-  * turned into acc * alpha + bias, negative values become zero, and the result is
-  * narrowed with truncate_unsigned to BIT_OUT bits and packed into `out`.
-  * The fold loop requests an initiation interval of PE/3 and the lane loop is
-  * unrolled by a factor of 3. With DEBUG defined, the packed word produced for
-  * vector index 3*320 is printed.
-  *
-  * @param in    stream of packed accumulators (BIT_ACC*PE bits per word)
-  * @param alpha multiplier per lane, indexed [output fold][lane]
-  * @param bias  addend per lane, indexed [output fold][lane]
-  * @param out   stream of packed results (BIT_OUT*PE bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tup3(hls::stream<ap_uint<BIT_ACC*PE> >& in,
-                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                    const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                    ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> fold = 0;
-
-     for (int i = 0; i < VECT_NUMS * reps; ++i) {
-         for (fold = 0; fold < OUTPUT_FOLD; fold++) {
-             ap_uint<BIT_ACC*PE> packed_acc = in.read();
-             ap_uint<BIT_OUT*PE> packed_out;
-             #pragma HLS PIPELINE II=PE/3
-             for (int lane = 0; lane < PE; ++lane) {
-                 #pragma HLS UNROLL factor=3
-                 const int a_lo = lane*BIT_ACC;
-                 const int o_lo = lane*BIT_OUT;
-
-                 ap_int<BIT_ACC> lane_acc = packed_acc(a_lo + BIT_ACC - 1, a_lo);
-                 ap_int<BIT_TMP> output_temp_0 = lane_acc * alpha[fold][lane] + bias[fold][lane];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-
-                 // ReLU: a set sign bit means the value is negative.
-                 ap_int<BIT_TMP> rectified = output_temp_0;
-                 if (output_temp_0[BIT_TMP-1]) {
-                     rectified = 0;
-                 }
-
-                 ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(rectified);
-                 packed_out(o_lo + BIT_OUT - 1, o_lo) = narrowed;
-             }
- #ifdef DEBUG
-             if (i == 3*320) {
-                 cout << packed_out << endl;
-             }
- #endif
-             out.write(packed_out);
-         }
-     }
- }
-
-
- /**
-  * Post-processing of packed accumulators, four lanes per unrolled step.
-  *
-  * Reads (VECT_NUMS * reps) * (NUM_OUT/PE) words from `in`. Each word carries PE
-  * signed accumulators of BIT_ACC bits, lane 0 in the lowest bits. Every lane is
-  * turned into acc * alpha + bias, negative values become zero, and the result is
-  * narrowed with truncate_unsigned to BIT_OUT bits and packed into `out`.
-  * The fold loop requests an initiation interval of PE/4 and the lane loop is
-  * unrolled by a factor of 4.
-  *
-  * @param in    stream of packed accumulators (BIT_ACC*PE bits per word)
-  * @param alpha multiplier per lane, indexed [output fold][lane]
-  * @param bias  addend per lane, indexed [output fold][lane]
-  * @param out   stream of packed results (BIT_OUT*PE bits per word)
-  * @param reps  multiplies VECT_NUMS to give the number of processed vectors
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void pw_11_pf_tup4(hls::stream<ap_uint<BIT_ACC*PE> >& in,
-                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
-                    const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
-                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                    ap_uint<10> reps)
- {
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;
-     ap_uint<7> fold = 0;
-
-     for (int vec = 0; vec < VECT_NUMS * reps; ++vec) {
-         for (fold = 0; fold < OUTPUT_FOLD; fold++) {
-             #pragma HLS PIPELINE II=PE/4
-             ap_uint<BIT_ACC*PE> packed_acc = in.read();
-             ap_uint<BIT_OUT*PE> packed_out;
-             for (int lane = 0; lane < PE; ++lane) {
-                 #pragma HLS UNROLL factor=4
-                 const int a_lo = lane*BIT_ACC;
-                 const int o_lo = lane*BIT_OUT;
-
-                 ap_int<BIT_ACC> lane_acc = packed_acc(a_lo + BIT_ACC - 1, a_lo);
-                 ap_int<BIT_TMP> output_temp_0 = lane_acc * alpha[fold][lane] + bias[fold][lane];
-                 #pragma HLS RESOURCE variable=output_temp_0 core=DSP48
-
-                 // ReLU: a set sign bit means the value is negative.
-                 ap_int<BIT_TMP> rectified = output_temp_0;
-                 if (output_temp_0[BIT_TMP-1]) {
-                     rectified = 0;
-                 }
-
-                 ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(rectified);
-                 packed_out(o_lo + BIT_OUT - 1, o_lo) = narrowed;
-             }
-             out.write(packed_out);
-         }
-     }
- }
-
-
- /**
-  * Layer-level entry point for a 1x1 convolution with fused scale/offset and ReLU.
-  *
-  * Forwards all arguments to conv1x1_relu, using ROW_IN * COL_IN as the number
-  * of vectors per repetition (a 1x1 kernel keeps the spatial size unchanged).
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to conv1x1_relu
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                    ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // No padding is involved, so all four sizes equal the input size.
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-
-     conv1x1_relu<CH_IN, CH_OUT,
-                  BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                  BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                  SIMD, PE, ROW_OUT*COL_OUT>(in, weights, alpha, bias, out, reps);
- }
-
-
- /**
-  * 1x1 convolution layer split into two stages connected by a stream.
-  *
-  * Stage 1 (conv1x1_relu_pf) produces packed 20-bit accumulators; stage 2
-  * (pw_11_pf_tuple_sh) applies scale, offset, ReLU and narrowing. Both stages
-  * process ROW_IN * COL_IN vectors per repetition. The first template argument
-  * handed to stage 2 is CH_IN*3*3; pw_11_pf_tuple_sh does not use that parameter.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to both stages
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer_pf(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                       const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                       const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                       const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                       hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                       ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-     const unsigned BIT_ACC   = 20;   // accumulator width shared by both stages
-
-     // Carries the packed accumulators from stage 1 to stage 2.
-     hls::stream<ap_uint<BIT_ACC*PE> > temp_out;
-
-     conv1x1_relu_pf<CH_IN, CH_OUT,
-                     BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                     BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                     SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(in, weights, temp_out, reps);
-
-     pw_11_pf_tuple_sh<CH_IN*3*3, CH_OUT,
-                       BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                       BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                       SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(temp_out, alpha, bias, out, reps);
- }
-
- /**
-  * 1x1 convolution layer split into two stages connected by a stream.
-  *
-  * Stage 1 is conv1x1_relu_t3_pf (defined outside this part of the file), which
-  * writes packed 20-bit accumulators; stage 2 is pw_11_pf_tuple, which applies
-  * scale, offset, ReLU and narrowing. Both stages are instantiated for
-  * ROW_IN * COL_IN vectors per repetition. The first template argument handed to
-  * stage 2 is CH_IN*3*3; pw_11_pf_tuple does not use that parameter.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to both stages
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer_pf_t3(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                          const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                          const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                          const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                          hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                          ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-     const unsigned BIT_ACC   = 20;   // accumulator width shared by both stages
-
-     // Carries the packed accumulators from stage 1 to stage 2.
-     hls::stream<ap_uint<BIT_ACC*PE> > temp_out;
-
-     conv1x1_relu_t3_pf<CH_IN, CH_OUT,
-                        BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                        BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                        SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(in, weights, temp_out, reps);
-
-     pw_11_pf_tuple<CH_IN*3*3, CH_OUT,
-                    BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                    BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                    SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(temp_out, alpha, bias, out, reps);
- }
-
- /**
-  * 1x1 convolution layer split into two stages connected by a stream.
-  *
-  * Stage 1 is conv1x1_relu_t3_pf (defined outside this part of the file), which
-  * writes packed 20-bit accumulators; stage 2 is pw_11_pf_tup2, the
-  * post-processing variant whose lane loop is unrolled by 2. Both stages are
-  * instantiated for ROW_IN * COL_IN vectors per repetition. The first template
-  * argument handed to stage 2 is CH_IN*3*3; pw_11_pf_tup2 does not use it.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to both stages
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer_pf_t3_2(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                            const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                            const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                            const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                            hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                            ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-     const unsigned BIT_ACC   = 20;   // accumulator width shared by both stages
-
-     // Carries the packed accumulators from stage 1 to stage 2.
-     hls::stream<ap_uint<BIT_ACC*PE> > temp_out;
-
-     conv1x1_relu_t3_pf<CH_IN, CH_OUT,
-                        BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                        BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                        SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(in, weights, temp_out, reps);
-
-     pw_11_pf_tup2<CH_IN*3*3, CH_OUT,
-                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                   SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(temp_out, alpha, bias, out, reps);
- }
-
- /**
-  * 1x1 convolution layer split into two stages connected by a stream.
-  *
-  * Stage 1 is conv1x1_relu_t3_pf (defined outside this part of the file), which
-  * writes packed 20-bit accumulators; stage 2 is pw_11_pf_tup6, the
-  * post-processing variant whose lane loop is unrolled by 6. Both stages are
-  * instantiated for ROW_IN * COL_IN vectors per repetition. The first template
-  * argument handed to stage 2 is CH_IN*3*3; pw_11_pf_tup6 does not use it.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to both stages
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer_pf_t3_6(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                            const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                            const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                            const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                            hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                            ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-     const unsigned BIT_ACC   = 20;   // accumulator width shared by both stages
-
-     // Carries the packed accumulators from stage 1 to stage 2.
-     hls::stream<ap_uint<BIT_ACC*PE> > temp_out;
-
-     conv1x1_relu_t3_pf<CH_IN, CH_OUT,
-                        BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                        BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                        SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(in, weights, temp_out, reps);
-
-     pw_11_pf_tup6<CH_IN*3*3, CH_OUT,
-                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                   SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(temp_out, alpha, bias, out, reps);
- }
-
- /**
-  * 1x1 convolution layer split into two stages connected by a stream.
-  *
-  * Stage 1 is conv1x1_relu_t3_pf (defined outside this part of the file), which
-  * writes packed 20-bit accumulators; stage 2 is pw_11_pf_tup3, the
-  * post-processing variant whose lane loop is unrolled by 3. Both stages are
-  * instantiated for ROW_IN * COL_IN vectors per repetition. The first template
-  * argument handed to stage 2 is CH_IN*3*3; pw_11_pf_tup3 does not use it.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to both stages
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer_pf_t3_3(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                            const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                            const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                            const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                            hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                            ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-     const unsigned BIT_ACC   = 20;   // accumulator width shared by both stages
-
-     // Carries the packed accumulators from stage 1 to stage 2.
-     hls::stream<ap_uint<BIT_ACC*PE> > temp_out;
-
-     conv1x1_relu_t3_pf<CH_IN, CH_OUT,
-                        BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                        BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                        SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(in, weights, temp_out, reps);
-
-     pw_11_pf_tup3<CH_IN*3*3, CH_OUT,
-                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                   SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(temp_out, alpha, bias, out, reps);
- }
-
-
- /**
-  * 1x1 convolution layer split into two stages connected by a stream.
-  *
-  * Stage 1 is conv1x1_relu_t3_pf (defined outside this part of the file), which
-  * writes packed 20-bit accumulators; stage 2 is pw_11_pf_tup4, the
-  * post-processing variant whose lane loop is unrolled by 4. Both stages are
-  * instantiated for ROW_IN * COL_IN vectors per repetition. The first template
-  * argument handed to stage 2 is CH_IN*3*3; pw_11_pf_tup4 does not use it.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/PE][CH_IN/SIMD]
-  * @param alpha   multiplier per output lane, indexed [CH_OUT/PE][PE]
-  * @param bias    addend per output lane, indexed [CH_OUT/PE][PE]
-  * @param out     output stream, PE values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to both stages
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void conv1x1_layer_pf_t3_4(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                            const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
-                            const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
-                            const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
-                            hls::stream<ap_uint<BIT_OUT*PE> >& out,
-                            ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned INTER_ROW = ROW_IN;
-     const unsigned INTER_COL = COL_IN;
-     const unsigned ROW_OUT   = ROW_IN;
-     const unsigned COL_OUT   = COL_IN;
-     const unsigned BIT_ACC   = 20;   // accumulator width shared by both stages
-
-     // Carries the packed accumulators from stage 1 to stage 2.
-     hls::stream<ap_uint<BIT_ACC*PE> > temp_out;
-
-     conv1x1_relu_t3_pf<CH_IN, CH_OUT,
-                        BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                        BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                        SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(in, weights, temp_out, reps);
-
-     pw_11_pf_tup4<CH_IN*3*3, CH_OUT,
-                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                   SIMD, PE, ROW_OUT*COL_OUT, BIT_ACC>(temp_out, alpha, bias, out, reps);
- }
-
- /**
-  * Depthwise 3x3 convolution followed by scale, offset, ReLU and narrowing.
-  *
-  * For each of the VECT_NUMS * reps output positions the function
-  *   1. clears a [NUM_OUT/SIMD][SIMD] array of 30-bit accumulators,
-  *   2. reads 9 * (NUM_OUT/SIMD) words from `in`, kernel tap outermost and
-  *      channel fold innermost, and adds the SIMD lane values returned by
-  *      desperate_vector_pot_product (each BIT_IN+BIT_W bits, signed) to the
-  *      accumulators of that fold,
-  *   3. writes NUM_OUT/SIMD words to `out`, each lane being
-  *      acc * alpha + bias, clamped to zero when negative and narrowed with
-  *      truncate_unsigned to BIT_OUT bits.
-  *
-  * Fix: the weight temporary is now BIT_W*SIMD bits wide, matching the declared
-  * width of the weight words (it was declared BIT_IN*SIMD bits wide, which
-  * truncates the weights whenever BIT_W > BIT_IN). Unused locals were removed.
-  *
-  * @param in      window stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [channel fold][kernel tap]
-  * @param alpha   multiplier per channel, indexed [channel fold][lane]
-  * @param bias    addend per channel, indexed [channel fold][lane]
-  * @param out     output stream, SIMD values of BIT_OUT bits per word
-  * @param reps    multiplies VECT_NUMS to give the number of output positions
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS>
- void dw_conv3x3_relu(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                      const ap_uint<BIT_W*SIMD> weights[NUM_OUT/SIMD][9],
-                      const ap_uint<BIT_ALPHA> alpha[NUM_OUT/SIMD][SIMD],
-                      const ap_int<BIT_BIAS> bias[NUM_OUT/SIMD][SIMD],
-                      hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-                      ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // Width of the packed product word returned by desperate_vector_pot_product.
-     const unsigned BIT_ACC = SIMD*(BIT_IN+BIT_W);
-
-     ap_uint<BIT_IN*SIMD> input_temp;
-
-     ap_int<30> acc_reordering[NUM_OUT/SIMD][SIMD];
-     #pragma HLS RESOURCE variable=acc_reordering core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=acc_reordering complete dim=2
-
-     ap_int<(BIT_IN+BIT_W)*SIMD> acc_temp;
-
-     for(int i=0; i<VECT_NUMS * reps; i++){
-         // Step 1: clear every accumulator before a new output position.
-         for(int out_fold_cnt=0; out_fold_cnt<NUM_OUT/SIMD; out_fold_cnt++){
-             for(int simd=0; simd<SIMD; simd++){
-                 #pragma HLS PIPELINE II=1
-                 acc_reordering[out_fold_cnt][simd] = 0;
-             }
-         }
-
-         // Step 2: accumulate the nine kernel taps for every channel fold.
-         for(int j=0; j<9; j++){
-             for(int k=0; k<NUM_OUT/SIMD; k++){
-                 #pragma HLS PIPELINE II=1
-                 ap_uint<BIT_W*SIMD> weight_temp = weights[k][j];
-                 input_temp = in.read();
-                 acc_temp = desperate_vector_pot_product<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_temp);
-                 for(int s=0; s<SIMD; s++){
-                     #pragma HLS UNROLL
-                     // Each lane result is a signed field of BIT_IN+BIT_W bits.
-                     ap_int<BIT_IN + BIT_W> signed_temp = acc_temp((s+1)*(BIT_IN + BIT_W)-1, s*(BIT_IN + BIT_W));
-                     acc_reordering[k][s] += signed_temp;
-                 }
-             }
-         }
-
-         // Step 3: scale, offset, ReLU, narrow and emit one word per channel fold.
-         for(int out_fold_cnt=0; out_fold_cnt<NUM_OUT/SIMD; out_fold_cnt++){
-             #pragma HLS PIPELINE II=1
-             ap_uint<BIT_OUT*SIMD> out_buf;
-             for(int simd=0; simd<SIMD; simd++){
-                 ap_int<BIT_TMP> output_temp = acc_reordering[out_fold_cnt][simd] * alpha[out_fold_cnt][simd] + bias[out_fold_cnt][simd];
-                 #pragma HLS RESOURCE variable=output_temp core=DSP48
-                 // ReLU: keep the value when the sign bit is clear, otherwise output zero.
-                 output_temp = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp : (ap_int<BIT_TMP>)0;
-                 ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
-                 out_buf((simd+1)*BIT_OUT-1, simd*BIT_OUT) = output_uint;
-             }
-             out.write(out_buf);
-         }
-     }
- }
-
-
- /**
-  * Depthwise 3x3 convolution with scale, offset, ReLU and narrowing fused into
-  * the accumulation loop.
-  *
-  * For each of the VECT_NUMS * reps output positions, 9 * (NUM_OUT/SIMD) words
-  * are read from `in`, kernel tap outermost and channel fold innermost. The SIMD
-  * lane values returned by desperate_vector_pot_product (each BIT_IN+BIT_W bits,
-  * signed) overwrite the accumulators on tap 0 and are added on taps 1..8. On
-  * the last tap each lane is turned into acc * alpha + bias, clamped to zero
-  * when negative, narrowed with truncate_unsigned to BIT_OUT bits, and one
-  * packed word per channel fold is written to `out`.
-  *
-  * Fix: the RESOURCE pragma for acc_reordering was missing the `=` after
-  * `variable`; it now uses the same form as the other pragmas in this file.
-  * Unused locals were removed.
-  *
-  * @param in      window stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [channel fold][kernel tap]
-  * @param alpha   multiplier per channel, indexed [channel fold][lane]
-  * @param bias    addend per channel, indexed [channel fold][lane]
-  * @param out     output stream, SIMD values of BIT_OUT bits per word
-  * @param reps    multiplies VECT_NUMS to give the number of output positions
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS>
- void dw_conv3x3_relu_signed(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                             const ap_uint<BIT_W*SIMD> weights[NUM_OUT/SIMD][9],
-                             const ap_uint<BIT_ALPHA> alpha[NUM_OUT/SIMD][SIMD],
-                             const ap_int<BIT_BIAS> bias[NUM_OUT/SIMD][SIMD],
-                             hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-                             ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // Width of the packed product word returned by desperate_vector_pot_product.
-     const unsigned BIT_ACC = SIMD*(BIT_IN+BIT_W);
-
-     ap_uint<BIT_IN*SIMD> input_temp;
-
-     ap_int<BIT_TMP> acc_reordering[NUM_OUT/SIMD][SIMD];
-     #pragma HLS ARRAY_PARTITION variable=acc_reordering complete dim=2
-     #pragma HLS RESOURCE variable=acc_reordering core=RAM_2P_LUTRAM
-
-     ap_int<(BIT_IN+BIT_W)*SIMD> acc_temp;
-
-     aplp:for(int i=0; i<VECT_NUMS * reps; i++){
-         etfd:for(int j=0; j<9; j++){
-             hfs:for(int k=0; k<NUM_OUT/SIMD; k++){
-                 #pragma HLS PIPELINE II=1
-                 ap_uint<BIT_W*SIMD> weight_temp = weights[k][j];
-                 input_temp = in.read();
-                 acc_temp = desperate_vector_pot_product<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_temp);
-                 // Only filled and written on the last kernel tap (j == 8).
-                 ap_uint<BIT_OUT*SIMD> out_buf;
-                 nmmsl:for(int s=0; s<SIMD; s++){
-                     #pragma HLS UNROLL
-                     // Each lane result is a signed field of BIT_IN+BIT_W bits.
-                     ap_int<BIT_IN + BIT_W> signed_temp = acc_temp((s+1)*(BIT_IN + BIT_W)-1, s*(BIT_IN + BIT_W));
-
-                     // Tap 0 starts a fresh sum, so no separate clearing pass is needed.
-                     if(j==0) acc_reordering[k][s] = signed_temp;
-                     else {
-                         acc_reordering[k][s] += signed_temp;
-                     }
-
-                     ap_int<BIT_TMP> output_temp;
-                     ap_uint<BIT_OUT> output_uint;
-                     if(j==8){
-                         output_temp = acc_reordering[k][s] * alpha[k][s] + bias[k][s];
-                         // ReLU: keep the value when the sign bit is clear, otherwise output zero.
-                         output_temp = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp : (ap_int<BIT_TMP>)0;
-                         output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
-                         out_buf((s+1)*BIT_OUT-1, s*BIT_OUT) = output_uint;
-                     }
-                 }
-                 if(j==8)out.write(out_buf);
-             }
-         }
- #ifdef DEBUG
-         // Self-assignment with no effect; presumably a breakpoint anchor for debugging.
-         if(i==3*320){
-             i = 3*320;
-         }
- #endif
-     }
- }
-
-
-
- /**
-  * Depthwise 3x3 convolution for a single channel fold, with scale, offset,
-  * ReLU and narrowing applied on the last kernel tap.
-  *
-  * Only weights[0], alpha[0] and bias[0] are used. For each of the
-  * VECT_NUMS * reps output positions, 9 words are read from `in`, one per kernel
-  * tap. Partial sums are kept in a chain acc_reordering[lane][0..9]: slot 0 is
-  * zero and slot j+1 is slot j plus the lane value of tap j returned by
-  * desperate_vector_pot_product. After tap 8, slot 9 is turned into
-  * acc * alpha + bias, clamped to zero when negative, narrowed with
-  * truncate_unsigned to BIT_OUT bits, and one packed word is written to `out`.
-  *
-  * @param in      window stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights; only row 0 is read, indexed by kernel tap
-  * @param alpha   multiplier per lane; only row 0 is read
-  * @param bias    addend per lane; only row 0 is read
-  * @param out     output stream, SIMD values of BIT_OUT bits per word
-  * @param reps    multiplies VECT_NUMS to give the number of output positions
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS>
- void dw_conv3x3_relu_oly(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                          const ap_uint<BIT_W*SIMD> weights[NUM_OUT/SIMD][9],
-                          const ap_uint<BIT_ALPHA> alpha[NUM_OUT/SIMD][SIMD],
-                          const ap_int<BIT_BIAS> bias[NUM_OUT/SIMD][SIMD],
-                          hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-                          ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     const unsigned BIT_ACC = SIMD*(BIT_IN+BIT_W);   // width of the packed product word
-     const unsigned BIT_SUM = 14;
-
-     unsigned in_fold_cnt = 0;
-     unsigned tile = 0;
-
-     ap_uint<BIT_IN*SIMD> window_word;
-
-     // Partial-sum chain per lane: slot 0 stays zero, slot 9 is the full sum.
-     ap_int<BIT_TMP> acc_reordering[SIMD][10];
-     #pragma HLS ARRAY_PARTITION variable=acc_reordering complete dim=0
-
-     ap_int<(BIT_IN+BIT_W)*SIMD> products;
-
-     for (int lane = 0; lane < SIMD; ++lane) {
-         acc_reordering[lane][0] = 0;
-     }
-
-     aplp:for (int i = 0; i < VECT_NUMS * reps; ++i) {
-         etfd:for (int j = 0; j < 9; ++j) {
-             #pragma HLS PIPELINE II=1
-             const bool last_tap = (j == 8);
-
-             ap_uint<BIT_W*SIMD> tap_weights = weights[0][j];
-             window_word = in.read();
-             products = desperate_vector_pot_product<BIT_IN, BIT_W, BIT_ACC, SIMD>(window_word, tap_weights);
-
-             ap_uint<BIT_OUT*SIMD> packed_out;
-             nmmsl:for (int s = 0; s < SIMD; ++s) {
-                 #pragma HLS UNROLL
-                 const int p_lo = s*(BIT_IN + BIT_W);
-                 ap_int<BIT_IN + BIT_W> lane_product = products(p_lo + (BIT_IN + BIT_W) - 1, p_lo);
-
-                 acc_reordering[s][j+1] = lane_product + acc_reordering[s][j];
-
-                 if (last_tap) {
-                     ap_int<BIT_TMP> scaled = acc_reordering[s][9] * alpha[0][s] + bias[0][s];
-                     // ReLU: a set sign bit means the value is negative.
-                     if (scaled[BIT_TMP-1]) {
-                         scaled = 0;
-                     }
-                     ap_uint<BIT_OUT> narrowed = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(scaled);
-                     packed_out((s+1)*BIT_OUT - 1, s*BIT_OUT) = narrowed;
-                 }
-             }
-             if (last_tap) {
-                 out.write(packed_out);
-             }
-         }
- #ifdef DEBUG
-         // Self-assignment with no effect; presumably a breakpoint anchor for debugging.
-         if (i == 3*320) {
-             i = 3*320;
-         }
- #endif
-     }
- }
-
-
- /**
-  * Depthwise 3x3 convolution layer: padding, sliding-window unit, compute.
-  *
-  * Chains three stages through streams:
-  *   padding                -> "padding_out"
-  *   SWU_DW                 -> "swu_out" (FIFO depth 512)
-  *   dw_conv3x3_relu_signed -> `out`
-  * The sliding-window stage is instantiated for (ROW_IN+2) x (COL_IN+2) inputs
-  * and the compute stage for ROW_IN * COL_IN output positions per repetition.
-  * With DEBUG defined, a marker line is printed after the first two stages.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/SIMD][kernel tap]
-  * @param alpha   multiplier per channel, indexed [CH_OUT/SIMD][SIMD]
-  * @param bias    addend per channel, indexed [CH_OUT/SIMD][SIMD]
-  * @param out     output stream, SIMD values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to every stage
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void dw_conv3x3_layer_signed(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                              const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
-                              const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
-                              const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
-                              hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-                              ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // Size seen by the sliding-window stage (input size plus 2 in each dimension).
-     const unsigned INTER_ROW = ROW_IN + 2;
-     const unsigned INTER_COL = COL_IN + 2;
-     // The output has the same spatial size as the input.
-     const unsigned ROW_OUT = ROW_IN;
-     const unsigned COL_OUT = COL_IN;
-
-     // Stage 1: padding.
-     hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
-     padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);
-
- #ifdef DEBUG
-     printf("nnn\n\n");
- #endif
-
-     // Stage 2: sliding-window unit.
-     hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
-     #pragma HLS STREAM variable=swu_out depth=512 dim=1
-     SWU_DW<3, 1, INTER_ROW, INTER_COL, CH_OUT, SIMD, SIMD, BIT_IN>(padding_out, swu_out, reps);
-
- #ifdef DEBUG
-     printf("fff\n\n");
- #endif
-
-     // Stage 3: multiply-accumulate with fused scale, offset and ReLU.
-     dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT,
-                            BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                            BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                            SIMD, PE, ROW_OUT*COL_OUT>(swu_out, weights, alpha, bias, out, reps);
- }
-
-
-
- /**
-  * Depthwise 3x3 convolution layer built from the single-fold ("oly") stages.
-  *
-  * Chains three stages through streams:
-  *   padding             -> "padding_out"
-  *   SWU_DW_3x3_oly      -> "swu_out" (FIFO depth 512)
-  *   dw_conv3x3_relu_oly -> `out`
-  * The sliding-window stage is instantiated for (ROW_IN+2) x (COL_IN+2) inputs
-  * and the compute stage for ROW_IN * COL_IN output positions per repetition.
-  * With DEBUG defined, a marker line is printed after the first two stages.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/SIMD][kernel tap]
-  * @param alpha   multiplier per channel, indexed [CH_OUT/SIMD][SIMD]
-  * @param bias    addend per channel, indexed [CH_OUT/SIMD][SIMD]
-  * @param out     output stream, SIMD values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to every stage
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void dw_conv3x3_laoyer(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                        const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
-                        const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
-                        const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
-                        hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-                        ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // Size seen by the sliding-window stage (input size plus 2 in each dimension).
-     const unsigned INTER_ROW = ROW_IN + 2;
-     const unsigned INTER_COL = COL_IN + 2;
-     // The output has the same spatial size as the input.
-     const unsigned ROW_OUT = ROW_IN;
-     const unsigned COL_OUT = COL_IN;
-
-     // Stage 1: padding.
-     hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
-     padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);
-
- #ifdef DEBUG
-     printf("nnn\n\n");
- #endif
-
-     // Stage 2: sliding-window unit.
-     hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
-     #pragma HLS STREAM variable=swu_out depth=512 dim=1
-     SWU_DW_3x3_oly<3, 1, INTER_ROW, INTER_COL, CH_OUT, SIMD, SIMD, BIT_IN>(padding_out, swu_out, reps);
-
- #ifdef DEBUG
-     printf("fff\n\n");
- #endif
-
-     // Stage 3: multiply-accumulate with fused scale, offset and ReLU.
-     dw_conv3x3_relu_oly<CH_OUT*3*3, CH_OUT,
-                         BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                         BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                         SIMD, PE, ROW_OUT*COL_OUT>(swu_out, weights, alpha, bias, out, reps);
- }
-
- /**
-  * Depthwise 3x3 convolution layer using the SWU_DW_3x3_new sliding-window unit.
-  *
-  * Chains three stages through streams:
-  *   padding                -> "padding_out"
-  *   SWU_DW_3x3_new         -> "swu_out" (FIFO depth 512)
-  *   dw_conv3x3_relu_signed -> `out`
-  * The sliding-window stage is instantiated for (ROW_IN+2) x (COL_IN+2) inputs
-  * and the compute stage for ROW_IN * COL_IN output positions per repetition.
-  * With DEBUG defined, a marker line is printed after the first two stages.
-  *
-  * @param in      input stream, SIMD values of BIT_IN bits per word
-  * @param weights packed weights, indexed [CH_OUT/SIMD][kernel tap]
-  * @param alpha   multiplier per channel, indexed [CH_OUT/SIMD][SIMD]
-  * @param bias    addend per channel, indexed [CH_OUT/SIMD][SIMD]
-  * @param out     output stream, SIMD values of BIT_OUT bits per word
-  * @param reps    repetition count forwarded to every stage
-  */
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void dw_conv3x3_layer_lidea(hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                             const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
-                             const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
-                             const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
-                             hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-                             ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // Size seen by the sliding-window stage (input size plus 2 in each dimension).
-     const unsigned INTER_ROW = ROW_IN + 2;
-     const unsigned INTER_COL = COL_IN + 2;
-     // The output has the same spatial size as the input.
-     const unsigned ROW_OUT = ROW_IN;
-     const unsigned COL_OUT = COL_IN;
-
-     // Stage 1: padding.
-     hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
-     padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);
-
- #ifdef DEBUG
-     printf("nnn\n\n");
- #endif
-
-     // Stage 2: sliding-window unit.
-     hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
-     #pragma HLS STREAM variable=swu_out depth=512 dim=1
-     SWU_DW_3x3_new<3, 1, INTER_ROW, INTER_COL, CH_OUT, SIMD, SIMD, BIT_IN>(padding_out, swu_out, reps);
-
- #ifdef DEBUG
-     printf("fff\n\n");
- #endif
-
-     // Stage 3: multiply-accumulate with fused scale, offset and ReLU.
-     dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT,
-                            BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                            BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                            SIMD, PE, ROW_OUT*COL_OUT>(swu_out, weights, alpha, bias, out, reps);
- }
-
-
-
- /**
-  * Scans a ROW_IN x COL_IN grid and reports the location that holds the largest
-  * score, together with the four words that follow that location's scores.
-  *
-  * Per location the input stream delivers 10 score words followed by 4 further
-  * words (presumably class scores and box values; confirm against the producer).
-  * Words are reinterpreted as signed 32-bit values. Whenever a score is strictly
-  * greater than the running maximum, that location and its 4 trailing words
-  * become the current result.
-  *
-  * After each repetition six words are written to `out`:
-  *   row, col, anchor_0, anchor_1, anchor_2, anchor_3.
-  *
-  * The running maximum restarts at -268435455 for every repetition; scores at or
-  * below that value never win. The result registers start at 0 and are not
-  * cleared between repetitions, so a repetition without any winning score
-  * re-emits the previous result (or zeros on the first one) instead of
-  * uninitialised data.
-  *
-  * in   : 14 words per grid location, row-major
-  * out  : 6 words per repetition
-  * reps : number of grids to process
-  */
- template <int ROW_IN, int COL_IN>
- void pooping(   hls::stream<ap_uint<32> >& in,
-                 hls::stream<ap_uint<32> >& out,
-                 ap_uint<10> reps){
-
-     const int NUM_SCORES = 10;                  // score words per location
-     const ap_int<32> SCORE_FLOOR = -268435455;  // starting value of the running maximum
-
-     // Result registers; initialised so that `out` never carries undefined values.
-     ap_int<32> row_need = 0;
-     ap_int<32> col_need = 0;
-     ap_int<32> anchor_0_need = 0;
-     ap_int<32> anchor_1_need = 0;
-     ap_int<32> anchor_2_need = 0;
-     ap_int<32> anchor_3_need = 0;
-
-     loopLv:for(unsigned i=0; i<reps; i++){
-         ap_int<32> best_score = SCORE_FLOOR;
-         for(int row = 0; row < ROW_IN; row++){
-             for(int col = 0; col < COL_IN; col++){
-                 // True when one of this location's scores raised the maximum.
-                 bool found = false;
-                 for(int ch = 0; ch < NUM_SCORES; ch++){
-                     ap_int<32> in_temp = in.read();
-                     if(in_temp > best_score){
-                         best_score = in_temp;
-                         found = true;
-                     }
-                 }
-                 // The four trailing words must be consumed even when they are not kept.
-                 ap_int<32> anchor_0 = in.read();
-                 ap_int<32> anchor_1 = in.read();
-                 ap_int<32> anchor_2 = in.read();
-                 ap_int<32> anchor_3 = in.read();
-                 if(found){
-                     row_need = row;
-                     col_need = col;
-                     anchor_0_need = anchor_0;
-                     anchor_1_need = anchor_1;
-                     anchor_2_need = anchor_2;
-                     anchor_3_need = anchor_3;
-                 }
-             }
-         }
-
-         out.write(row_need);
-         out.write(col_need);
-         out.write(anchor_0_need);
-         out.write(anchor_1_need);
-         out.write(anchor_2_need);
-         out.write(anchor_3_need);
-     }
- }
-
- /**
-  * Accumulating 3x3 convolution core. Each call to tihoo_mutiplier produces the
-  * partial sums of up to three output channels; the raw BIT_ACC-wide
-  * accumulators are written to `out`. No scaling, bias or activation is applied
-  * in this body.
-  *
-  * For each of the VECT_NUMS * reps output vectors the function walks
-  * OUTPUT_FOLD = NUM_OUT/PE output folds. During the first fold it reads
-  * 9 * INPUT_FOLD words from `in` (tap-major: all input folds of tap 0, then
-  * tap 1, ...) and stores them in a local buffer; later folds replay the
-  * buffered words, so every input word is read from the stream once.
-  * After the last tap of the last input fold the PE accumulators are packed
-  * (PE 0 in the lowest bits) and written as one word.
-  *
-  * in      : one word of SIMD activations (BIT_IN bits each) per read
-  * weights : [output fold][input fold][tap]; each entry holds PE slices of
-  *           BIT_W*SIMD bits, slice 0 in the lowest bits
-  * out     : one word of PE accumulators (BIT_ACC bits each) per output fold
-  * reps    : repetition count, multiplied with VECT_NUMS
-  *
-  * FL_IN, BIT_OUT, FL_OUT, BIT_ALPHA, FL_ALPHA, BIT_BIAS and BIT_TMP are part of
-  * the signature but are not referenced in this body.
-  */
- template <int NUM_IN, int NUM_OUT,
-           int BIT_IN, int FL_IN, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE, int VECT_NUMS, int BIT_ACC>
- void conv3x3_relu_3t_pf(    hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-                             const ap_uint<BIT_W * SIMD * PE> weights[(NUM_OUT/PE)][(NUM_IN/9/SIMD)][9],
-                             hls::stream<ap_uint<BIT_ACC*PE> >& out,
-                             ap_uint<10> reps){
-
-     // NOTE(review): the body is a single loop nest; confirm that DATAFLOW is intended here.
-     #pragma HLS DATAFLOW
-
-     const unsigned INPUT_FOLD = NUM_IN/SIMD/9;  /* input_channel / simd */
-     const unsigned OUTPUT_FOLD = NUM_OUT/PE;    /* output_channel / pe */
-
-     // Input words captured during output fold 0, replayed for the other folds.
-     ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD][9];
-     #pragma HLS ARRAY_PARTITION variable=input_temp_arr complete dim=2
-
-     // Two spare entries absorb the writes of the last group of three when PE is
-     // not a multiple of 3; they are never copied to the output.
-     ap_int<BIT_ACC> acc[PE+2];
-     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
-
-     loopLv:for(unsigned i=0; i<VECT_NUMS * reps; i++){
-         loopsd:for(ap_uint<7> out_fold_cnt=0; out_fold_cnt<OUTPUT_FOLD; out_fold_cnt++){
-             // Clear every accumulator, spare entries included, so none is read uninitialised.
-             for(int m=0; m<PE+2; m++){
-                 #pragma HLS UNROLL
-                 acc[m] = 0;
-             }
-             loopasd:for(ap_uint<5> ay = 0; ay < 9; ay++){
-                 loopyy:for(ap_uint<7> in_fold_cnt=0; in_fold_cnt<INPUT_FOLD; in_fold_cnt++){
-                     #pragma HLS PIPELINE II=1
-                     // Fetch the activation word: from the stream on the first output fold,
-                     // from the local buffer afterwards.
-                     ap_uint<BIT_IN*SIMD> in_word;
-                     if(out_fold_cnt == 0){
-                         in_word = in.read();
-                         input_temp_arr[in_fold_cnt][ay] = in_word;
-                     }else{
-                         in_word = input_temp_arr[in_fold_cnt][ay];
-                     }
-
-                     ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt][ay];
-                     loopqwe:for(ap_uint<7> pe = 0 ; pe < PE; pe +=3){
-                         #pragma HLS UNROLL
-                         ap_uint <BIT_W*SIMD> weight_pe_loop_0 = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
-                         // Slices beyond PE stay zero instead of carrying undefined bits.
-                         ap_uint <BIT_W*SIMD> weight_pe_loop_1 = 0;
-                         ap_uint <BIT_W*SIMD> weight_pe_loop_2 = 0;
-                         if(pe+1<PE) weight_pe_loop_1 = weight_temp((pe+2)*BIT_W*SIMD-1,(pe+1)*BIT_W*SIMD);
-                         if(pe+2<PE) weight_pe_loop_2 = weight_temp((pe+3)*BIT_W*SIMD-1,(pe+2)*BIT_W*SIMD);
-
-                         // Three BIT_ACC-wide partial sums packed into one result word.
-                         ap_int<BIT_ACC*3> o;
-                         o = tihoo_mutiplier<BIT_IN, BIT_W, BIT_ACC, SIMD>(in_word, weight_pe_loop_0,weight_pe_loop_1,weight_pe_loop_2);
-                         acc[pe]   += o(BIT_ACC - 1,0);
-                         acc[pe+1] += o(2*BIT_ACC - 1,BIT_ACC);
-                         acc[pe+2] += o(3*BIT_ACC - 1,2*BIT_ACC);
-                     }
-
-                     // Emit once per output fold, after the final tap of the final input fold.
-                     if((ay == 8) && (in_fold_cnt == INPUT_FOLD - 1)){
-                         ap_uint<BIT_ACC*PE> out_buf = 0;
-                         loopqec:for(ap_uint<7> pe = 0 ; pe < PE; pe ++){
-                             #pragma HLS UNROLL
-                             out_buf((pe+1)*BIT_ACC-1, pe*BIT_ACC) = acc[pe];
-                         }
-                         out.write(out_buf);
-                     }
-                 }
-             }
-         }
-     }
- }
-
-
- // Depthwise 3x3 convolution stage: padding -> SWU_DW -> dw_conv3x3_relu_signed,
- // all three running inside one DATAFLOW region.
- //
- //   in      : input stream, one word of SIMD values (BIT_IN bits each) per read
- //   weights : packed kernel words, 9 entries per channel fold; forwarded unchanged
- //   alpha   : table forwarded unchanged to dw_conv3x3_relu_signed
- //   bias    : table forwarded unchanged to dw_conv3x3_relu_signed
- //   out     : result stream, one word of SIMD values (BIT_OUT bits each) per write
- //   reps    : repetition count handed to every stage
- //
- // CH_IN is part of the signature but is not referenced in this body.
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void dw_conv3x3_layer(
-     hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-     const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
-     const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
-     const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
-     hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-     ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // The sliding-window unit is sized for the frame plus two extra rows and columns.
-     const unsigned PADDED_ROWS = ROW_IN + 2;
-     const unsigned PADDED_COLS = COL_IN + 2;
-     // The output frame has the same row/column count as the input frame.
-     const unsigned PIXELS_OUT = static_cast<unsigned>(ROW_IN) * static_cast<unsigned>(COL_IN);
-
-     // Stage 1: padding
-     hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
-     padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);
-
-     // Stage 2: window generation
-     hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
-     #pragma HLS STREAM variable=swu_out depth=512 dim=1
-     SWU_DW<3, 1, PADDED_ROWS, PADDED_COLS, CH_OUT, SIMD, SIMD, BIT_IN>(padding_out, swu_out, reps);
-
-     // Stage 3: depthwise multiply-accumulate, writes straight to the caller's stream
-     dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT, BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                            BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                            SIMD, PE, PIXELS_OUT>(
-         swu_out, weights, alpha, bias, out, reps);
- }
-
- // Depthwise 3x3 convolution stage that uses the SWU_DW_3x3_new_BRAM window unit:
- // padding -> SWU_DW_3x3_new_BRAM -> dw_conv3x3_relu_signed, inside one DATAFLOW region.
- //
- //   in      : input stream, one word of SIMD values (BIT_IN bits each) per read
- //   weights : packed kernel words, 9 entries per channel fold; forwarded unchanged
- //   alpha   : table forwarded unchanged to dw_conv3x3_relu_signed
- //   bias    : table forwarded unchanged to dw_conv3x3_relu_signed
- //   out     : result stream, one word of SIMD values (BIT_OUT bits each) per write
- //   reps    : repetition count handed to every stage
- //
- // CH_IN is part of the signature but is not referenced in this body.
- // With DEBUG_2 defined the number of words left in `out` is printed at the end.
- template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
-           int CH_OUT, int BIT_OUT, int FL_OUT,
-           int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
-           int SIMD, int PE>
- void dw_conv3x3_layer_BRAM(
-     hls::stream<ap_uint<BIT_IN*SIMD> >& in,
-     const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
-     const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
-     const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
-     hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
-     ap_uint<10> reps)
- {
-     #pragma HLS DATAFLOW
-
-     // The sliding-window unit is sized for the frame plus two extra rows and columns.
-     const unsigned PADDED_ROWS = ROW_IN + 2;
-     const unsigned PADDED_COLS = COL_IN + 2;
-     // The output frame has the same row/column count as the input frame.
-     const unsigned PIXELS_OUT = static_cast<unsigned>(ROW_IN) * static_cast<unsigned>(COL_IN);
-
-     // Stage 1: padding
-     hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
-     padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);
-
-     // Stage 2: window generation
-     hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
-     #pragma HLS STREAM variable=swu_out depth=512 dim=1
-     SWU_DW_3x3_new_BRAM<3, 1, PADDED_ROWS, PADDED_COLS, CH_OUT, SIMD, SIMD, BIT_IN>(padding_out, swu_out, reps);
-
-     // Stage 3: depthwise multiply-accumulate, writes straight to the caller's stream
-     dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT, BIT_IN, FL_IN, BIT_OUT, FL_OUT,
-                            BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
-                            SIMD, PE, PIXELS_OUT>(
-         swu_out, weights, alpha, bias, out, reps);
-
- #ifdef DEBUG_2
-     cout<<"out size :"<<out.size()<<endl;
- #endif
- }
-
- // Copies every word of `in` to `out` and additionally re-emits the same data on
- // `out_bypass`, regrouped into 2x2 pixel blocks.
- //
- // Rows are processed in pairs, (IN_ROW/2)*reps pairs in total. Both rows of a pair
- // are forwarded to `out` while being buffered. Then, for every column pair
- // (c, c+1), the buffered pixels are written to `out_bypass` in the order
- // (row 0, c), (row 0, c+1), (row 1, c), (row 1, c+1); each pixel contributes
- // IN_CH/IN_tran words.
- //
- // NOTE(review): the bounds use IN_ROW/2 and a column step of 2, so IN_ROW and
- // IN_COL are presumably expected to be even; confirm with the callers.
- template <unsigned IN_ROW,
-           unsigned IN_COL,
-           unsigned IN_CH,
-           unsigned IN_tran,
-           unsigned IN_BIT>
- void stream_bypass(
-     stream<ap_uint<IN_tran*IN_BIT> >& in,
-     stream<ap_uint<IN_tran*IN_BIT> >& out,
-     stream<ap_uint<IN_tran*IN_BIT> >& out_bypass,
-     unsigned reps)
- {
-     // Two-row buffer: [row within pair][column][channel fold]
-     ap_uint<IN_tran * IN_BIT> row_buf[2][IN_COL][IN_CH/IN_tran];
-
-     for(int pair=0; pair<(IN_ROW/2)*reps; pair++){
-         // Pass 1: forward both rows unchanged and remember them.
-         for(int r=0; r<2; r++){
-             for(int c=0; c<IN_COL; c++){
-                 for(int f=0; f<IN_CH/IN_tran; f++){
-                     #pragma HLS pipeline II = 1
-                     ap_uint<IN_tran * IN_BIT> word = in.read();
-                     out.write(word);
-                     row_buf[r][c][f] = word;
-                 }
-             }
-         }
-         // Pass 2: replay the pair as 2x2 blocks, upper row first, left column first.
-         for(int c0=0; c0<IN_COL; c0+=2){
-             for(int r=0; r<2; r++){
-                 for(int dc=0; dc<2; dc++){
-                     for(int f=0; f<IN_CH/IN_tran; f++){
-                         #pragma HLS pipeline II = 1
-                         ap_uint<IN_tran * IN_BIT> word = row_buf[r][c0+dc][f];
-                         out_bypass.write(word);
-                     }
-                 }
-             }
-         }
-     }
- }
-
-
- // Concatenates, for every pixel, words from the bypass stream followed by words
- // from the main stream into one output stream.
- //
- // For each of IN_ROW*reps rows and IN_COL columns, (IN_CH+BYPASS_CH)/OUT_tran
- // words are written to `out`: the first BYPASS_CH/OUT_tran are taken from
- // `in_bypass`, the remaining ones from `in`.
- //
- // Each source word is read at its own stream width and then converted to the
- // output width, so a bypass word wider than the main-stream word is no longer
- // cut down to IN_tran*IN_BIT bits on the way through.
- //
- // NOTE(review): the per-pixel word counts are derived with OUT_tran for both
- // sources, which presumably means IN_tran == BYPASS_tran == OUT_tran in every
- // instantiation; confirm with the callers.
- template <unsigned IN_ROW,
-           unsigned IN_COL,
-           unsigned IN_CH,
-           unsigned IN_tran,
-           unsigned BYPASS_CH,
-           unsigned BYPASS_tran,
-           unsigned OUT_tran,
-           unsigned IN_BIT>
- void stream_reorganize(
-     stream<ap_uint<IN_tran*IN_BIT> >& in,
-     stream<ap_uint<BYPASS_tran*IN_BIT> >& in_bypass,
-     stream<ap_uint<OUT_tran*IN_BIT> >& out,
-     unsigned reps)
- {
-     const unsigned WORDS_PER_PIXEL = (IN_CH+BYPASS_CH)/OUT_tran;
-     const unsigned BYPASS_WORDS = BYPASS_CH/OUT_tran;
-
-     for(unsigned i=0; i<IN_ROW * reps; i++){
-         for(unsigned j=0; j<IN_COL; j++){
-             for(unsigned k=0; k<WORDS_PER_PIXEL; k++){
-                 ap_uint<OUT_tran * IN_BIT> word;
-                 if(k<BYPASS_WORDS){
-                     ap_uint<BYPASS_tran * IN_BIT> bypass_word = in_bypass.read();
-                     word = bypass_word;
-                 }else{
-                     ap_uint<IN_tran * IN_BIT> main_word = in.read();
-                     word = main_word;
-                 }
-                 out.write(word);
-             }
-         }
-     }
- }
-
- /**
-  * Top-level dataflow pipeline of the accelerator.
-  *
-  * Stages, in stream order:
-  *   ExtractPixels -> Widthmuler/Widthdiver to 24-bit words -> avg2x2_new -> img_norm_none
-  *   -> backbone layers p1_0_0 ... p3_1 (depthwise 3x3 and 1x1 layers, with
-  *      max_pool2x2_new after p1_0_3, p1_2_3 and p1_4_3)
-  *   -> bbox_head_atss_cls_reg_center (conv3x3_layer_crc)
-  *   -> pooping<23,40> -> Widthmuler<32, 64, 6> -> AddLast<6/2>.
-  * Widthdiver / Widthmuler calls between layers only re-pack the stream width.
-  *
-  * in    : input AXI-Stream
-  * out   : output AXI-Stream, fed by AddLast<6/2>
-  * repst : not read by this body; the repetition count is the local `reps`
-  *         (50, or 2 when DEBUG is defined).
-  */
- void atss_0506_lossv2(  hls::stream<my_ap_axis >& in,
-                         hls::stream<my_ap_axis >& out,
-                         const unsigned repst){
-
-     #pragma HLS DATAFLOW
-     #pragma HLS INTERFACE axis register both port=out
-     #pragma HLS INTERFACE axis register both port=in
-     // NOTE(review): this pragma names `reps`, which is a local variable below; the
-     // function argument is `repst`. Left unchanged because retargeting it would
-     // alter the generated control interface. Confirm the intended port.
-     #pragma HLS INTERFACE s_axilite port=reps bundle=control
-     #pragma HLS INTERFACE s_axilite port=return bundle=control
-
-
-     #pragma HLS RESOURCE variable backbone_model_p1_0_0_weight_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_0_weight_q complete dim = 0
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_0_alpha_q complete dim = 0
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_0_bias_q complete dim = 0
-
-     #pragma HLS RESOURCE variable backbone_model_p1_0_3_weight_q core=RAM_2P_LUTRAM
-     #pragma HLS RESOURCE variable backbone_model_p1_0_3_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_3_alpha_q complete dim=2
-     #pragma HLS RESOURCE variable backbone_model_p1_0_3_bias_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_3_bias_q complete dim=2
-
-     #pragma HLS RESOURCE variable backbone_model_p1_2_0_weight_q core=RAM_2P_LUTRAM
-     #pragma HLS RESOURCE variable backbone_model_p1_2_0_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_2_0_alpha_q complete dim=2
-     #pragma HLS RESOURCE variable backbone_model_p1_2_0_bias_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_2_0_bias_q complete dim=2
-
-     #pragma HLS RESOURCE variable backbone_model_p1_2_3_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS RESOURCE variable backbone_model_p1_2_3_bias_q core=RAM_2P_LUTRAM
-
-     #pragma HLS RESOURCE variable backbone_model_p1_4_0_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_4_0_alpha_q complete dim=2
-     #pragma HLS RESOURCE variable backbone_model_p1_4_0_bias_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p1_4_0_bias_q complete dim=2
-
-     #pragma HLS RESOURCE variable backbone_model_p2_1_0_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p2_1_0_alpha_q complete dim=2
-     #pragma HLS RESOURCE variable backbone_model_p2_1_0_bias_q core=RAM_2P_LUTRAM
-     #pragma HLS ARRAY_PARTITION variable=backbone_model_p2_1_0_bias_q complete dim=2
-
-     #pragma HLS RESOURCE variable backbone_model_p3_0_3_conv_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS RESOURCE variable backbone_model_p3_0_3_conv_bias_q core=RAM_2P_LUTRAM
-
-     #pragma HLS RESOURCE variable backbone_model_p3_1_conv_alpha_q core=RAM_2P_LUTRAM
-     #pragma HLS RESOURCE variable backbone_model_p3_1_conv_bias_q core=RAM_2P_LUTRAM
-
-
-     // Repetition count handed to every stage.
-     ap_uint<8> reps = 50;
-
- #ifdef DEBUG
-     reps = 2;
- #endif
-     // 64-bit words per repetition: 640 x 360 pixels, 3 channels, 8 bits each.
-     const unsigned int num_per_rep = 640 * 360 * 3 * 8 / 64;
-     // Bit width used for the activation streams between layers.
-     const unsigned int BIT_ALL = 7;
-
-     /* ---------------- input unpacking and pre-processing ---------------- */
-
-     hls::stream<ap_uint<64> > in_stream_extract("in_stream_extract");
-     #pragma HLS STREAM variable=in_stream_extract depth=512 dim=1
-     ExtractPixels<64, num_per_rep> (in, in_stream_extract, reps);
-
-     hls::stream<ap_uint<192> > in_stream0("in_stream0");
-     #pragma HLS STREAM variable=in_stream0 depth=32 dim=1
-     Widthmuler<64, 192, num_per_rep>(in_stream_extract, in_stream0, reps);
-
-     hls::stream<ap_uint<8 * 3> > in_stream1("in_stream1");
-     #pragma HLS STREAM variable=in_stream1 depth=512 dim=1
-     Widthdiver<192, 8 * 3, num_per_rep / 3> (in_stream0, in_stream1, reps);
-
-     hls::stream<ap_uint<8 * 3> > r_stream1("r_stream1");
-     // Depth applies to r_stream1 (the pragma previously repeated in_stream1).
-     #pragma HLS STREAM variable=r_stream1 depth=512 dim=1
-     avg2x2_new<2, 360, 640, 3, 3, 8>(in_stream1, r_stream1, reps);
-
- #ifdef DEBUG
-     cout << r_stream1.size()<<endl;
- #endif
-
-     hls::stream<ap_uint<7 * 3> > in_stream2("in_stream2");
-     #pragma HLS STREAM variable=in_stream2 depth=512 dim=1
-     img_norm_none<8, 3, 180, 320 , 7>(r_stream1, in_stream2, reps);
-
- #ifdef DEBUG
-     printf("start\n\n");
-     printf("mmm\n\n");
- #endif
-
-     /* ---------------- backbone p1_0_0 ---------------- */
-
-     hls::stream<ap_uint<backbone_model_p1_0_0_BIT_OUT * backbone_model_p1_0_0_SIMD> > p100_out("p100_out");
-     #pragma HLS STREAM variable=p100_out depth=512 dim=1
-     dw_conv3x3_laoyer<
-         backbone_model_p1_0_0_ROW_IN, backbone_model_p1_0_0_COL_IN, backbone_model_p1_0_0_CH_IN,
-         backbone_model_p1_0_0_BIT_IN, backbone_model_p1_0_0_FL_IN,
-         backbone_model_p1_0_0_CH_OUT, backbone_model_p1_0_0_BIT_OUT, backbone_model_p1_0_0_FL_OUT,
-         backbone_model_p1_0_0_BIT_W, backbone_model_p1_0_0_BIT_ALPHA, backbone_model_p1_0_0_FL_ALPHA,
-         backbone_model_p1_0_0_BIT_BIAS, backbone_model_p1_0_0_BIT_TMP,
-         backbone_model_p1_0_0_SIMD, backbone_model_p1_0_0_PE>(
-         in_stream2, backbone_model_p1_0_0_weight_q, backbone_model_p1_0_0_alpha_q, backbone_model_p1_0_0_bias_q,
-         p100_out, reps);
-
-     /* ---------------- backbone p1_0_3 + pooling ---------------- */
-
-     hls::stream<ap_uint<BIT_ALL> > p103_converting("p103_converting");
-     #pragma HLS STREAM variable=p103_converting depth=512 dim=1
-     Widthdiver<BIT_ALL * 3 , BIT_ALL, backbone_model_p1_0_3_ROW_IN * backbone_model_p1_0_3_COL_IN>(
-         p100_out, p103_converting , reps);
-
-     hls::stream<ap_uint<backbone_model_p1_0_3_BIT_OUT * backbone_model_p1_0_3_PE> > p103_out("p103_out");
-     #pragma HLS STREAM variable=p103_out depth=512 dim=1
-     conv1x1_layer_pf_t3_6<
-         backbone_model_p1_0_3_ROW_IN, backbone_model_p1_0_3_COL_IN, backbone_model_p1_0_3_CH_IN,
-         backbone_model_p1_0_3_BIT_IN, backbone_model_p1_0_3_FL_IN,
-         backbone_model_p1_0_3_CH_OUT, backbone_model_p1_0_3_BIT_OUT, backbone_model_p1_0_3_FL_OUT,
-         backbone_model_p1_0_3_BIT_W, backbone_model_p1_0_3_BIT_ALPHA, backbone_model_p1_0_3_FL_ALPHA,
-         backbone_model_p1_0_3_BIT_BIAS, backbone_model_p1_0_3_BIT_TMP,
-         backbone_model_p1_0_3_SIMD, backbone_model_p1_0_3_PE>(
-         p103_converting, backbone_model_p1_0_3_weight_q, backbone_model_p1_0_3_alpha_q, backbone_model_p1_0_3_bias_q,
-         p103_out, reps);
-
-     hls::stream<ap_uint<12 * BIT_ALL> > pool_p103_out("pool_p103_out");
-     #pragma HLS STREAM variable=pool_p103_out depth=512 dim=1
-     max_pool2x2_new<2, 184, 320, 48, 12, BIT_ALL>(p103_out, pool_p103_out, reps);
-
- #ifdef DEBUG
-     cout << pool_p103_out.size()<<endl;
- #endif
-
-     /* ---------------- backbone p1_2_0 ---------------- */
-
-     hls::stream<ap_uint<2 * BIT_ALL> > p120_converting("p120_converting");
-     #pragma HLS STREAM variable=p120_converting depth=512 dim=1
-     Widthdiver<BIT_ALL * 12 , BIT_ALL * 2, 92 * 160 * 4>(
-         pool_p103_out, p120_converting, reps);
-
- #ifdef DEBUG
-     cout << p120_converting.size()<<endl;
- #endif
-
-     hls::stream<ap_uint<12 * BIT_ALL> > p120_converted("p120_converted");
-     #pragma HLS STREAM variable=p120_converted depth=512 dim=1
-     Widthmuler<BIT_ALL * 2 , BIT_ALL * 12, 92 * 160 * 24 >(
-         p120_converting, p120_converted, reps);
-
- #ifdef DEBUG
-     cout << p120_converted.size()<<endl;
- #endif
-
-     hls::stream<ap_uint<backbone_model_p1_2_0_SIMD * backbone_model_p1_2_0_BIT_OUT> > p120_out("p120_out");
-     #pragma HLS STREAM variable=p120_out depth=512 dim=1
-     dw_conv3x3_layer_lidea<
-         backbone_model_p1_2_0_ROW_IN, backbone_model_p1_2_0_COL_IN, backbone_model_p1_2_0_CH_IN,
-         backbone_model_p1_2_0_BIT_IN, backbone_model_p1_2_0_FL_IN,
-         backbone_model_p1_2_0_CH_OUT, backbone_model_p1_2_0_BIT_OUT, backbone_model_p1_2_0_FL_OUT,
-         backbone_model_p1_2_0_BIT_W, backbone_model_p1_2_0_BIT_ALPHA, backbone_model_p1_2_0_FL_ALPHA,
-         backbone_model_p1_2_0_BIT_BIAS, backbone_model_p1_2_0_BIT_TMP,
-         backbone_model_p1_2_0_SIMD, backbone_model_p1_2_0_PE>(
-         p120_converted, backbone_model_p1_2_0_weight_q, backbone_model_p1_2_0_alpha_q, backbone_model_p1_2_0_bias_q,
-         p120_out, reps);
-
- #ifdef DEBUG
-     cout << p120_out.size()<<endl;
- #endif
-
-     /* ---------------- backbone p1_2_3 + pooling ---------------- */
-
-     hls::stream<ap_uint<2 * BIT_ALL> > p123_converting("p123_converting");
-     #pragma HLS STREAM variable=p123_converting depth=512 dim=1
-     Widthdiver<12 * BIT_ALL , BIT_ALL * 2, 92 * 160 * 4 >(
-         p120_out, p123_converting, reps);
-
- #ifdef DEBUG
-     cout << p123_converting.size()<<endl;
-     cout<< "p123_converting"<<endl;
- #endif
-
-     hls::stream<ap_uint<8 * BIT_ALL> > p123_converted("p123_converted");
-     #pragma HLS STREAM variable=p123_converted depth=512 dim=1
-     Widthmuler<2 * BIT_ALL , BIT_ALL * 8, 92 * 160 * 24 >(
-         p123_converting, p123_converted, reps);
-
- #ifdef DEBUG
-     cout << p123_converted.size()<<endl;
-     cout<< "p123_converted"<<endl;
- #endif
-
-     hls::stream<ap_uint<backbone_model_p1_2_3_BIT_OUT * backbone_model_p1_2_3_PE> > p123_out("p123_out");
-     #pragma HLS STREAM variable=p123_out depth=512
-     conv1x1_layer_pf_t3_2<
-         backbone_model_p1_2_3_ROW_IN, backbone_model_p1_2_3_COL_IN, backbone_model_p1_2_3_CH_IN,
-         backbone_model_p1_2_3_BIT_IN, backbone_model_p1_2_3_FL_IN,
-         backbone_model_p1_2_3_CH_OUT, backbone_model_p1_2_3_BIT_OUT, backbone_model_p1_2_3_FL_OUT,
-         backbone_model_p1_2_3_BIT_W, backbone_model_p1_2_3_BIT_ALPHA, backbone_model_p1_2_3_FL_ALPHA,
-         backbone_model_p1_2_3_BIT_BIAS, backbone_model_p1_2_3_BIT_TMP,
-         backbone_model_p1_2_3_SIMD, backbone_model_p1_2_3_PE>(
-         p123_converted, backbone_model_p1_2_3_weight_q, backbone_model_p1_2_3_alpha_q, backbone_model_p1_2_3_bias_q,
-         p123_out, reps);
-
-     hls::stream<ap_uint<12 * 7> > pool_p123_out("pool_p123_out");
-     #pragma HLS STREAM variable=pool_p123_out depth=128
-     max_pool2x2_new<2, 92, 160, 96, 12, 7>(p123_out, pool_p123_out, reps);
-
- #ifdef DEBUG
-     cout << pool_p123_out.size()<<endl;
-     cout<< "pool_p123_out"<<endl;
- #endif
-
-     /* ---------------- backbone p1_4_0 ---------------- */
-
-     hls::stream<ap_uint<6 * BIT_ALL> > p140_coneded("p140_coneded");
-     #pragma HLS STREAM variable=p140_coneded depth=512 dim=1
-     Widthdiver<12 * BIT_ALL , BIT_ALL * 6, 46 * 80 * 8 >(
-         pool_p123_out, p140_coneded, reps);
-
-     hls::stream<ap_uint<backbone_model_p1_4_0_SIMD * backbone_model_p1_4_0_BIT_OUT> > p140_out("p140_out");
-     #pragma HLS STREAM variable=p140_out depth=512
-     dw_conv3x3_layer_lidea<
-         backbone_model_p1_4_0_ROW_IN, backbone_model_p1_4_0_COL_IN, backbone_model_p1_4_0_CH_IN,
-         backbone_model_p1_4_0_BIT_IN, backbone_model_p1_4_0_FL_IN,
-         backbone_model_p1_4_0_CH_OUT, backbone_model_p1_4_0_BIT_OUT, backbone_model_p1_4_0_FL_OUT,
-         backbone_model_p1_4_0_BIT_W, backbone_model_p1_4_0_BIT_ALPHA, backbone_model_p1_4_0_FL_ALPHA,
-         backbone_model_p1_4_0_BIT_BIAS, backbone_model_p1_4_0_BIT_TMP,
-         backbone_model_p1_4_0_SIMD, backbone_model_p1_4_0_PE>(
-         p140_coneded, backbone_model_p1_4_0_weight_q, backbone_model_p1_4_0_alpha_q, backbone_model_p1_4_0_bias_q,
-         p140_out, reps);
-
- #ifdef DEBUG
-     cout << p140_out.size()<<endl;
-     cout<< "p140_out"<<endl;
- #endif
-
-     /* ---------------- backbone p1_4_3 + pooling ---------------- */
-
-     hls::stream<ap_uint<2 * BIT_ALL> > p143_converting("p143_converting");
-     #pragma HLS STREAM variable=p143_converting depth=512
-     Widthdiver<6 * BIT_ALL , BIT_ALL * 2, 46 * 80 * 16>(
-         p140_out, p143_converting, reps );
-
-     hls::stream<ap_uint<8 * BIT_ALL> > p143_converted("p143_converted");
-     #pragma HLS STREAM variable=p143_converted depth=512
-     Widthmuler<2 * BIT_ALL , BIT_ALL * 8, 46 * 80 * 48>(
-         p143_converting, p143_converted, reps );
-
-     hls::stream<ap_uint<backbone_model_p1_4_3_BIT_OUT * backbone_model_p1_4_3_PE> > p143_out("p143_out");
-     #pragma HLS STREAM variable=p143_out depth=512
-     conv1x1_layer_pf_t3_2<
-         backbone_model_p1_4_3_ROW_IN, backbone_model_p1_4_3_COL_IN, backbone_model_p1_4_3_CH_IN,
-         backbone_model_p1_4_3_BIT_IN, backbone_model_p1_4_3_FL_IN,
-         backbone_model_p1_4_3_CH_OUT, backbone_model_p1_4_3_BIT_OUT, backbone_model_p1_4_3_FL_OUT,
-         backbone_model_p1_4_3_BIT_W, backbone_model_p1_4_3_BIT_ALPHA, backbone_model_p1_4_3_FL_ALPHA,
-         backbone_model_p1_4_3_BIT_BIAS, backbone_model_p1_4_3_BIT_TMP,
-         backbone_model_p1_4_3_SIMD, backbone_model_p1_4_3_PE>(
-         p143_converted, backbone_model_p1_4_3_weight_q, backbone_model_p1_4_3_alpha_q, backbone_model_p1_4_3_bias_q,
-         p143_out, reps);
-
- #ifdef DEBUG
-     cout << p143_out.size()<<endl;
-     cout<< "p143_out"<<endl;
- #endif
-
-     hls::stream<ap_uint<12 * 7> > pool_p143_out("pool_p143_out");
-     #pragma HLS STREAM variable=pool_p143_out depth=256
-     max_pool2x2_new<2, 46, 80, 192, 12, 7>(p143_out, pool_p143_out, reps);
-
-     /* ---------------- backbone p2_1_0 ---------------- */
-
-     hls::stream<ap_uint<backbone_model_p2_1_0_SIMD * 7> > p210_converting("p210_converting");
-     #pragma HLS STREAM variable=p210_converting depth=512
-     Widthdiver<12 * 7, 3 * 7, 40*23*192/12 >(
-         pool_p143_out, p210_converting, reps);
-
- #ifdef DEBUG
-     cout << p210_converting.size()<<endl;
-     cout<< "p210_converting"<<endl;
- #endif
-
-     hls::stream<ap_uint<backbone_model_p2_1_0_SIMD * backbone_model_p2_1_0_BIT_OUT> > p210_out("p210_out");
-     #pragma HLS STREAM variable=p210_out depth=512
-     dw_conv3x3_layer_lidea<
-         backbone_model_p2_1_0_ROW_IN, backbone_model_p2_1_0_COL_IN, backbone_model_p2_1_0_CH_IN,
-         backbone_model_p2_1_0_BIT_IN, backbone_model_p2_1_0_FL_IN,
-         backbone_model_p2_1_0_CH_OUT, backbone_model_p2_1_0_BIT_OUT, backbone_model_p2_1_0_FL_OUT,
-         backbone_model_p2_1_0_BIT_W, backbone_model_p2_1_0_BIT_ALPHA, backbone_model_p2_1_0_FL_ALPHA,
-         backbone_model_p2_1_0_BIT_BIAS, backbone_model_p2_1_0_BIT_TMP,
-         backbone_model_p2_1_0_SIMD, backbone_model_p2_1_0_PE>(
-         p210_converting, backbone_model_p2_1_0_weight_q, backbone_model_p2_1_0_alpha_q, backbone_model_p2_1_0_bias_q,
-         p210_out, reps);
-
- #ifdef DEBUG
-     cout << p210_out.size()<<endl;
-     cout<< "p210_out"<<endl;
- #endif
-
-     /* ---------------- backbone p2_1_3 ---------------- */
-
-     hls::stream<ap_uint<1 * 7> > p213_converting("p213_converting");
-     #pragma HLS STREAM variable=p213_converting depth=512
-     Widthdiver<3 * 7, 1 * 7, 40*23*192/3 >(
-         p210_out, p213_converting, reps);
-
-     hls::stream<ap_uint<backbone_model_p2_1_3_SIMD * 7> > p213_converted("p213_converted");
-     #pragma HLS STREAM variable=p213_converted depth=1024
-     Widthmuler<1*7, backbone_model_p2_1_3_SIMD * 7, 40*23*192 >(
-         p213_converting, p213_converted, reps);
-
- #ifdef DEBUG
-     cout << p213_converting.size()<<endl;
- #endif
-
-     hls::stream<ap_uint<backbone_model_p2_1_3_BIT_OUT * backbone_model_p2_1_3_PE> > p213_out("p213_out");
-     #pragma HLS STREAM variable=p213_out depth=512
-     conv1x1_layer_pf_t3<
-         backbone_model_p2_1_3_ROW_IN, backbone_model_p2_1_3_COL_IN, backbone_model_p2_1_3_CH_IN,
-         backbone_model_p2_1_3_BIT_IN, backbone_model_p2_1_3_FL_IN,
-         backbone_model_p2_1_3_CH_OUT, backbone_model_p2_1_3_BIT_OUT, backbone_model_p2_1_3_FL_OUT,
-         backbone_model_p2_1_3_BIT_W, backbone_model_p2_1_3_BIT_ALPHA, backbone_model_p2_1_3_FL_ALPHA,
-         backbone_model_p2_1_3_BIT_BIAS, backbone_model_p2_1_3_BIT_TMP,
-         backbone_model_p2_1_3_SIMD, backbone_model_p2_1_3_PE>(
-         p213_converted, backbone_model_p2_1_3_weight_q, backbone_model_p2_1_3_alpha_q, backbone_model_p2_1_3_bias_q,
-         p213_out, reps);
-
-     /* ---------------- backbone p2_2_0 ---------------- */
-
-     hls::stream<ap_uint<6 * 7> > p220_converting("p220_converting");
-     #pragma HLS STREAM variable=p220_converting depth=512
-     Widthdiver<12 * 7, 6 * 7, 40*23*384/12>(
-         p213_out, p220_converting , reps);
-
-     hls::stream<ap_uint<backbone_model_p2_2_0_SIMD * backbone_model_p2_2_0_BIT_OUT> > p220_out("p220_out");
-     #pragma HLS STREAM variable=p220_out depth=512 dim=1
-     dw_conv3x3_layer_BRAM<
-         backbone_model_p2_2_0_ROW_IN, backbone_model_p2_2_0_COL_IN, backbone_model_p2_2_0_CH_IN,
-         backbone_model_p2_2_0_BIT_IN, backbone_model_p2_2_0_FL_IN,
-         backbone_model_p2_2_0_CH_OUT, backbone_model_p2_2_0_BIT_OUT, backbone_model_p2_2_0_FL_OUT,
-         backbone_model_p2_2_0_BIT_W, backbone_model_p2_2_0_BIT_ALPHA, backbone_model_p2_2_0_FL_ALPHA,
-         backbone_model_p2_2_0_BIT_BIAS, backbone_model_p2_2_0_BIT_TMP,
-         backbone_model_p2_2_0_SIMD, backbone_model_p2_2_0_PE>(
-         p220_converting, backbone_model_p2_2_0_weight_q, backbone_model_p2_2_0_alpha_q, backbone_model_p2_2_0_bias_q,
-         p220_out, reps);
-
- #ifdef DEBUG
-     cout << p220_out.size()<<endl;
-     cout<< "p220_out"<<endl;
- #endif
-
-     /* ---------------- backbone p2_2_3 ---------------- */
-
-     hls::stream<ap_uint<2 * 7> > p223_converting("p223_converting");
-     #pragma HLS STREAM variable=p223_converting depth=512
-     Widthdiver<6 * 7, 2 * 7, 40*23*384/6>(
-         p220_out, p223_converting , reps);
-
-     hls::stream<ap_uint<16 * 7> > p223_converted("p223_converted");
-     #pragma HLS STREAM variable=p223_converted depth=512
-     Widthmuler<2 * 7, 16 * 7, 40*23*384/2>(
-         p223_converting, p223_converted , reps);
-
-     hls::stream<ap_uint<backbone_model_p2_2_3_BIT_OUT * backbone_model_p2_2_3_PE> > p223_out("p223_out");
-     #pragma HLS STREAM variable=p223_out depth=512
-     conv1x1_layer_pf_t3<
-         backbone_model_p2_2_3_ROW_IN, backbone_model_p2_2_3_COL_IN, backbone_model_p2_2_3_CH_IN,
-         backbone_model_p2_2_3_BIT_IN, backbone_model_p2_2_3_FL_IN,
-         backbone_model_p2_2_3_CH_OUT, backbone_model_p2_2_3_BIT_OUT, backbone_model_p2_2_3_FL_OUT,
-         backbone_model_p2_2_3_BIT_W, backbone_model_p2_2_3_BIT_ALPHA, backbone_model_p2_2_3_FL_ALPHA,
-         backbone_model_p2_2_3_BIT_BIAS, backbone_model_p2_2_3_BIT_TMP,
-         backbone_model_p2_2_3_SIMD, backbone_model_p2_2_3_PE>(
-         p223_converted, backbone_model_p2_2_3_weight_q, backbone_model_p2_2_3_alpha_q, backbone_model_p2_2_3_bias_q,
-         p223_out, reps);
-
- #ifdef DEBUG
-     cout << p223_out.size()<<endl;
-     cout<< "p223_out"<<endl;
- #endif
-
-     /* ---------------- backbone p3_0_0 ---------------- */
-
-     hls::stream<ap_uint<8 * 7> > p300_converting("p300_converting");
-     #pragma HLS STREAM variable=p300_converting depth=512
-     Widthdiver<16 * 7, 8 * 7, 40*23*512/16>(
-         p223_out, p300_converting , reps);
-
-     hls::stream<ap_uint<backbone_model_p3_0_0_SIMD * backbone_model_p3_0_0_BIT_OUT> > p300_out("p300_out");
-     #pragma HLS STREAM variable=p300_out depth=512 dim=1
-     dw_conv3x3_layer_BRAM<
-         backbone_model_p3_0_0_ROW_IN, backbone_model_p3_0_0_COL_IN, backbone_model_p3_0_0_CH_IN,
-         backbone_model_p3_0_0_BIT_IN, backbone_model_p3_0_0_FL_IN,
-         backbone_model_p3_0_0_CH_OUT, backbone_model_p3_0_0_BIT_OUT, backbone_model_p3_0_0_FL_OUT,
-         backbone_model_p3_0_0_BIT_W, backbone_model_p3_0_0_BIT_ALPHA, backbone_model_p3_0_0_FL_ALPHA,
-         backbone_model_p3_0_0_BIT_BIAS, backbone_model_p3_0_0_BIT_TMP,
-         backbone_model_p3_0_0_SIMD, backbone_model_p3_0_0_PE>(
-         p300_converting, backbone_model_p3_0_0_weight_q, backbone_model_p3_0_0_alpha_q, backbone_model_p3_0_0_bias_q,
-         p300_out, reps);
-
- #ifdef DEBUG
-     cout<< p300_out.size() <<endl;
- #endif
-
-     /* ---------------- backbone p3_0_3 ---------------- */
-
-     hls::stream<ap_uint<backbone_model_p3_0_3_BIT_OUT * backbone_model_p3_0_3_PE> > p303_out("p303_out");
-     #pragma HLS STREAM variable=p303_out depth=512 dim=1
-     conv1x1_layer_pf_t3<
-         backbone_model_p3_0_3_ROW_IN, backbone_model_p3_0_3_COL_IN, backbone_model_p3_0_3_CH_IN,
-         backbone_model_p3_0_3_BIT_IN, backbone_model_p3_0_3_FL_IN,
-         backbone_model_p3_0_3_CH_OUT, backbone_model_p3_0_3_BIT_OUT, backbone_model_p3_0_3_FL_OUT,
-         backbone_model_p3_0_3_BIT_W, backbone_model_p3_0_3_BIT_ALPHA, backbone_model_p3_0_3_FL_ALPHA,
-         backbone_model_p3_0_3_BIT_BIAS, backbone_model_p3_0_3_BIT_TMP,
-         backbone_model_p3_0_3_SIMD, backbone_model_p3_0_3_PE>(
-         p300_out, backbone_model_p3_0_3_weight_q, backbone_model_p3_0_3_alpha_q, backbone_model_p3_0_3_bias_q,
-         p303_out, reps);
-
- #ifdef DEBUG
-     cout<< p303_out.size() <<endl;
- #endif
-
-     /* ---------------- backbone p3_1 ---------------- */
-
-     hls::stream<ap_uint<7 * 2> > p31_converting("p31_converting");
-     #pragma HLS STREAM variable=p31_converting depth=512
-     Widthdiver<8 *7 , 2 * 7, 40*23*96/8>(p303_out, p31_converting, reps );
-
-     hls::stream<ap_uint<backbone_model_p3_1_conv_BIT_OUT * backbone_model_p3_1_conv_PE> > p31_out("p31_out");
-     #pragma HLS STREAM variable=p31_out depth=512 dim=1
-     conv1x1_layer_pf_t3<
-         backbone_model_p3_1_conv_ROW_IN, backbone_model_p3_1_conv_COL_IN, backbone_model_p3_1_conv_CH_IN,
-         backbone_model_p3_1_conv_BIT_IN, backbone_model_p3_1_conv_FL_IN,
-         backbone_model_p3_1_conv_CH_OUT, backbone_model_p3_1_conv_BIT_OUT, backbone_model_p3_1_conv_FL_OUT,
-         backbone_model_p3_1_conv_BIT_W, backbone_model_p3_1_conv_BIT_ALPHA, backbone_model_p3_1_conv_FL_ALPHA,
-         backbone_model_p3_1_conv_BIT_BIAS, backbone_model_p3_1_conv_BIT_TMP,
-         backbone_model_p3_1_conv_SIMD, backbone_model_p3_1_conv_PE>(
-         p31_converting, backbone_model_p3_1_conv_weight_q, backbone_model_p3_1_conv_alpha_q, backbone_model_p3_1_conv_bias_q,
-         p31_out, reps);
-
- #ifdef DEBUG
-     cout<< p31_out.size() <<endl;
- #endif
-
-     /* ---------------- detection head ---------------- */
-
-     hls::stream<ap_uint<4 * 7> > pf_converting("pf_converting");
-     #pragma HLS STREAM variable=pf_converting depth=512
-     Widthmuler<2 * 7, 4 * 7, 40*23*32/2>(
-         p31_out, pf_converting , reps);
-
-     hls::stream<ap_uint<bbox_head_atss_cls_reg_center_BIT_OUT * bbox_head_atss_cls_reg_center_PE> > final_conv("final_conv");
-     // Depth applies to final_conv (the pragma previously repeated p31_out).
-     #pragma HLS STREAM variable=final_conv depth=512 dim=1
-     conv3x3_layer_crc<
-         bbox_head_atss_cls_reg_center_ROW_IN, bbox_head_atss_cls_reg_center_COL_IN, bbox_head_atss_cls_reg_center_CH_IN,
-         bbox_head_atss_cls_reg_center_BIT_IN, bbox_head_atss_cls_reg_center_FL_IN,
-         bbox_head_atss_cls_reg_center_CH_OUT, bbox_head_atss_cls_reg_center_BIT_OUT, bbox_head_atss_cls_reg_center_FL_OUT,
-         bbox_head_atss_cls_reg_center_BIT_W, bbox_head_atss_cls_reg_center_BIT_ALPHA, bbox_head_atss_cls_reg_center_FL_ALPHA,
-         bbox_head_atss_cls_reg_center_BIT_BIAS, bbox_head_atss_cls_reg_center_BIT_TMP,
-         bbox_head_atss_cls_reg_center_SIMD, bbox_head_atss_cls_reg_center_PE>(
-         pf_converting, bbox_head_atss_cls_reg_center_weight_q, bbox_head_atss_cls_reg_center_alpha_q, bbox_head_atss_cls_reg_center_bias_q,
-         final_conv, reps);
-
-     /* ---------------- result selection and output ---------------- */
-
-     hls::stream<ap_uint<32> > pconverting("pconverting");
-     #pragma HLS STREAM variable=pconverting depth=512
-     Widthdiver<64, 32, 40*23*14/2>(final_conv, pconverting , reps);
-
-     // pooping writes six 32-bit words per repetition.
-     hls::stream<ap_uint<32> > chang_2("chang_2");
-     pooping<23,40>(pconverting,chang_2,reps);
-
-     hls::stream<ap_uint<64> > ff("ff");
-     #pragma HLS STREAM variable=ff depth=256
-     Widthmuler<32, 64, 6>(chang_2, ff , reps);
-
-     AddLast<6/2>(ff, out, reps);
- }
- #ifdef DEBUG
- #include <hls_stream.h>
- #include <iostream>
- #include <fstream>
-
- // Reads exactly `size` bytes from the binary file at `path` into `ptr`.
- // Prints a message and terminates the process with a non-zero exit status when
- // the file cannot be opened or holds fewer than `size` bytes.
- void load_data(const char *path, char *ptr, unsigned int size)
- {
-     std::ifstream f(path, std::ios::in | std::ios::binary);
-     if (!f)
-     {
-         std::cout << "no such file,please check the file name!\n";
-         exit(1);
-     }
-     f.read(ptr, size);
-     // A short file would otherwise leave the tail of the buffer undefined.
-     if (f.gcount() != static_cast<std::streamsize>(size))
-     {
-         std::cout << "short read: expected " << size << " bytes, got " << f.gcount() << "\n";
-         exit(1);
-     }
-     f.close();
- }
-
- // Write `size` bytes from the buffer `ptr` to the binary file at `path`.
- //
- // path : file to create/overwrite for binary writing.
- // ptr  : source buffer holding at least `size` bytes.
- // size : number of bytes to write.
- //
- // If the file cannot be opened, a message is printed and the process exits
- // with a failure status. A failed write or flush is reported with a warning.
- void write_data(const char *path, char *ptr, unsigned int size)
- {
-     std::ofstream f(path, std::ios::out | std::ios::binary);
-     if (!f)
-     {
-         std::cout << "write no such file,please check the file name!\n";
-         // Non-zero status so the caller does not mistake this for success.
-         exit(EXIT_FAILURE);
-     }
-     f.write(ptr, size);
-     // Close explicitly so that a failed flush shows up in the stream state.
-     f.close();
-     if (!f)
-     {
-         std::cout << "warning: failed to write " << size << " bytes\n";
-     }
- }
-
-
- // Test bench entry point (only compiled when DEBUG is defined).
- //
- // Loads a raw 8-bit image from disk, packs it eight bytes per 64-bit stream
- // word, runs atss_0506_lossv2 on it once, and prints every output word as two
- // signed 32-bit values.
- int main(){
-     // Constant dimensions make `img` an ordinary fixed-size array rather than
-     // a compiler-extension variable-length array.
-     const uint16_t img_h = 360;
-     const uint16_t img_w = 640;
-     const uint16_t img_ch = 3;
-
-     printf("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n");
-     uint8_t img[img_h][img_w][img_ch];
-     load_data("/home/lq/skynet_0506/1_full.bin", reinterpret_cast<char *>(img), sizeof(img));
-
-     // Each stream word carries 8 bytes (64 bits) of the image.
-     const unsigned int data_points_per_line = 8;
-     const unsigned int nums_line_pre_img = img_h * img_w * img_ch * 8 / 64;
-     const uint8_t *data = &img[0][0][0];
-
-     // Dump a 3x3 pixel patch (all channels, in hex) as a quick check of the
-     // loaded data.
-     for(int row = 100; row < 103; row++){
-         for(int col = 100; col < 103; col++){
-             for(int ch = 0; ch < img_ch; ch++){
-                 printf(" %3x", img[row][col][ch]);
-             }
-             cout << endl;
-         }
-     }
-
-     ap_uint<10> reps = 1;
-     hls::stream<my_ap_axis > input_stream("input stream");
-     for(int mm = 0; mm < reps; mm++){
-         for(unsigned int i = 0; i < nums_line_pre_img; i++){
-             my_ap_axis temp;
-             // Byte j of the group occupies bits [8*j+7 : 8*j] of the word.
-             for(unsigned int j = 0; j < data_points_per_line; j++){
-                 temp.data( 8*(j+1)-1, 8*j ) = data[i * data_points_per_line + j];
-             }
-             input_stream.write(temp);
-
-             //cout<< hex <<temp.data << " ";
-             //if(i%3 == 0) cout<<endl;
-         }
-     }
-
-     cout << "input size :" << input_stream.size() << endl;
-     cout << "start ..... " << endl;
-
-     hls::stream<my_ap_axis > out_stream("out_stream");
-
-     atss_0506_lossv2(input_stream, out_stream,reps);
-
-     // Print the low and high 32-bit halves of every output word, with a line
-     // break after every 14 words read.
-     unsigned int words_on_line = 0;
-     while(!out_stream.empty()){
-         my_ap_axis out_read = out_stream.read();
-         ap_int<64> value_tmp = out_read.data;
-         // Alternative decodings kept for reference:
-         // ap_fixed<32, 12, AP_RND> value = *(ap_fixed<32, 12, AP_RND>*)&value_tmp;
-         // ap_fixed<8, 4, AP_RND> value = *(ap_fixed<8, 4, AP_RND>*)&value_tmp;
-
-         cout << (ap_int<32>)value_tmp(31,0) << " " ;
-         cout << (ap_int<32>)value_tmp(63,32) << " " ;
-         if(++words_on_line == 14){
-             words_on_line = 0;
-             cout << endl;
-         }
-     }
-
-     return 0;
- }
- #endif
|