JDOpenISCT
/
基于FPGA的deep_wise深度可分离卷积实现方案

 
			
							#define AP_INT_MAX_W 10240
#include "stream_tools.h"
#include "sliding_window_unit.h"
#include <ap_fixed.h>
#include <stdint.h>
//#include <hls_video.h>
#include "pool2d.h"
#include "function.h"
#include "atss_0506_lossv2.h"
#include "atss_0506_lossv2_config.h"

#include <ap_int.h>
#include "math.h"


//#define DEBUG
using namespace hls;
using namespace std;

/************************ Image Normalization ************************/

// Per-channel multipliers used by the image normalisation stage, stored as
// 16-bit fixed point with 3 integer bits and AP_RND rounding. Each entry is
// the reciprocal of a per-channel divisor.
// NOTE(review): 58.395 / 57.12 / 57.375 look like the usual ImageNet
// per-channel standard deviations on a 0..255 scale -- confirm channel order.
const ap_fixed<16, 3, AP_RND> img_norm_weight[3] = {
    (ap_fixed<16, 3, AP_RND>)(1.0/58.395),
    (ap_fixed<16, 3, AP_RND>)(1.0/57.12),
    (ap_fixed<16, 3, AP_RND>)(1.0/57.375)
};
// Per-channel offsets added after the multiplication above. Each entry is
// -(offset / divisor), using the same divisors as img_norm_weight, so that
// x*weight + bias equals (x - offset) / divisor.
// NOTE(review): 123.675 / 116.28 / 103.53 look like the matching ImageNet
// per-channel means -- confirm.
const ap_fixed<16, 3, AP_RND> img_norm_bias[3] = {
    (ap_fixed<16, 3, AP_RND>)(-123.675/58.395),
    (ap_fixed<16, 3, AP_RND>)(-116.28/57.12),
    (ap_fixed<16, 3, AP_RND>)(-103.53/57.375)
};

// template <int IL_IN, int FL_IN, int IL_OUT, int FL_OUT>
// ap_uint<(IL_OUT+FL_OUT)> truncate_img_norm(
//     ap_uint<(IL_IN+FL_IN)> in) {

//     ap_int<(IL_OUT+FL_OUT)> out;
//     ap_int<(IL_OUT+FL_OUT+1)> out_tmp;

//     out_tmp = in >> (IL_IN+FL_IN-IL_OUT-FL_OUT-1);

//     if(out_tmp > 0){
//         if(out_tmp < ((1<<(IL_OUT+FL_OUT))-1)){
//             out_tmp += 1;
//         }
//     }
//     // else{
//     //     if(out != -(1<<(IL_OUT+FL_OUT))){
//     //         out -= 1;
//     //     }
//     // }

//     out = out_tmp >> 1;

//     return out;
// }

// Narrows a raw (IL_IN+FL_IN)-bit word to its top (IL_OUT+FL_OUT) bits and
// rounds the result using the discarded low bits.
//
// The most significant bit of `in` is treated as the sign:
//   * sign clear: the first discarded bit is added to the kept bits;
//   * sign set:   one is added only when the first discarded bit is set AND
//                 at least one of the bits below it is set.
// The sum is stored in a signed (IL_OUT+FL_OUT)-bit value with no saturation,
// so a carry out of the kept field wraps around.
//
// The sign-set path reads bits (IL_IN+FL_IN-IL_OUT-FL_OUT-2 .. 0), so the
// template arguments must leave at least two discarded bits.
template <int IL_IN, int FL_IN, int IL_OUT, int FL_OUT>
ap_uint<(IL_OUT+FL_OUT)> truncate_img_norm(
    ap_uint<(IL_IN+FL_IN)> in){

    const int top_bit   = IL_IN+FL_IN-1;                  // MSB of the input word
    const int keep_lsb  = IL_IN+FL_IN-IL_OUT-FL_OUT;      // lowest bit that survives
    const int round_bit = keep_lsb-1;                     // first discarded bit

    ap_uint<1> sign_bit = in(top_bit, top_bit);
    ap_uint<(IL_OUT+FL_OUT)> kept = in(top_bit, keep_lsb);

    // Decide whether the kept field is bumped by one.
    ap_uint<1> round_up;
    if(sign_bit == (ap_uint<1>)0){
        round_up = in(round_bit, round_bit);
    }
    else{
        round_up = (in(round_bit, round_bit) && in(round_bit-1, 0).or_reduce());
    }

    // Assigning into the signed result wraps on overflow of the kept field.
    ap_int<(IL_OUT+FL_OUT)> out = kept + round_up;

    return out;
}


// Reduces an unsigned fixed-point word from FL_IN to FL_OUT fraction bits,
// rounding to nearest and clamping to the output width.
//
// The input is shifted right so that exactly one guard bit remains below the
// output LSB. Unless the value has already reached the clamp limit, one is
// added to the guard position; the guard bit is then shifted out.
template <int IL_IN, int FL_IN, int IL_OUT, int FL_OUT>
ap_uint<(IL_OUT+FL_OUT)> truncate_unsigned(
    ap_uint<(IL_IN+FL_IN)> in) {

    // All-ones pattern covering the output bits plus the guard bit.
    const ap_uint<IL_OUT+FL_OUT+2> ceiling = (1<<(IL_OUT+FL_OUT+1)) - 1;

    // Keep the integer part, FL_OUT fraction bits and one guard bit.
    ap_uint<(IL_IN+FL_OUT+1)> scaled = in >> (FL_IN-FL_OUT-1);

    if(scaled >= ceiling){
        // At or beyond the limit: clamp instead of rounding.
        scaled = ceiling;
    }
    else{
        // Adding one at the guard position rounds half upwards.
        scaled += 1;
    }

    // Drop the guard bit.
    ap_uint<(IL_OUT+FL_OUT)> out = scaled >> 1;

    return out;
}


// Normalises one packed word of SIMD lanes, each BIT_IN bits wide.
//
// For every lane the unsigned value is multiplied by weights[lane] and
// bias[lane] is added, producing a 16-bit fixed-point value with 3 integer
// bits. Its raw bit pattern is then reduced to 8 bits (3 integer, 5 fraction)
// by truncate_img_norm and written back into the lane's BIT_IN-bit field.
//
// in      : SIMD lanes packed LSB-first, BIT_IN bits each.
// weights : per-lane multipliers; declared with 3 entries and indexed by the
//           lane number, so SIMD must not exceed 3.
// bias    : per-lane offsets, same indexing as weights.
// returns : packed word with the same lane layout as `in`.
template <int BIT_IN, int SIMD>
ap_uint<BIT_IN*SIMD> img_norm_calc(
    ap_uint<BIT_IN*SIMD> in,
    const ap_fixed<16, 3, AP_RND> weights[3],
    const ap_fixed<16, 3, AP_RND> bias[3]){

    ap_uint<BIT_IN*SIMD> res_out = 0;

    for(int i=0; i<SIMD; i++){
    #pragma HLS UNROLL
        ap_uint<BIT_IN> temp_in = in(BIT_IN*(i+1)-1, BIT_IN*i);
        ap_fixed<16, 3, AP_RND> temp_res = temp_in*weights[i] + bias[i];

        // Read the raw 16-bit pattern through the fixed-point type's own
        // bit-range accessor rather than reinterpreting the object through a
        // pointer cast, which depends on the class layout.
        ap_uint<16> temp_res_uint;
        temp_res_uint = temp_res.range(15, 0);

        ap_uint<BIT_IN> res_truncated = truncate_img_norm<3, 13, 3, 5>(temp_res_uint);
        res_out(BIT_IN*(i+1)-1, BIT_IN*i) = res_truncated;
    }

    return res_out;
}

// Re-quantises a stream of packed pixels from BIT_IN to BIT_OUT bits per lane
// without any mean/scale normalisation.
//
// For each of `reps` frames, IMG_ROW*IMG_COL words are read. Every lane is
// shifted right by (BIT_IN-BIT_OUT) and the first dropped bit is added back
// as a rounding increment, except when the shifted value equals 127. After
// the frame, four extra rows of IMG_COL all-zero words are written.
//
// in   : input stream, SIMD lanes of BIT_IN bits per word.
// out  : output stream, SIMD lanes of BIT_OUT bits per word.
// reps : number of frames to process.
template <int BIT_IN, int SIMD, int IMG_ROW, int IMG_COL, int BIT_OUT >
void img_norm_none(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                hls::stream<ap_uint<BIT_OUT*SIMD> >& out,ap_uint<10> reps){

// #pragma HLS DATAFLOW

    const int drop = BIT_IN-BIT_OUT;   // number of low bits discarded per lane

    for(int frame = 0; frame < reps; frame++){
        // Image body: one output word per input word.
        for(int row = 0; row < IMG_ROW; row++){
            looppp:for(int col = 0; col < IMG_COL; col++){
#pragma HLS PIPELINE II=1
                ap_uint<BIT_IN*SIMD> word_in = in.read();
                ap_uint<BIT_OUT*SIMD> word_out;
                loopaaa:for(int lane = 0; lane < SIMD; lane++){
                    ap_uint<BIT_IN> px = word_in(BIT_IN*(lane+1)-1, BIT_IN*lane);
                    ap_uint<BIT_OUT> q = (px >> drop);
                    ap_uint<1> guard = px(drop-1, drop-1);
                    // 127 is left untouched so the rounding step cannot push it higher.
                    if(q != 127){
                        q = q + guard;
                    }
                    word_out(BIT_OUT*(lane+1)-1, BIT_OUT*lane) = q;
                }
                out.write(word_out);
            }
        }
        // Trailer: four rows of zero words appended after every frame.
        for(int row = 0; row < 4; row++){
            for(int col = 0; col < IMG_COL; col++){
#pragma HLS PIPELINE II=1
                ap_uint<BIT_OUT*SIMD> zero_word = 0;
                out.write(zero_word);
            }
        }
    }

}


// Dot product over SIMD packed lanes that obtains two lane products from a
// single wide multiplication per loop iteration.
//
// Lanes are consumed in pairs (i, i+1), so SIMD must be even. For each pair:
//   1. the sign bit of each weight is saved and the weight is replaced by its
//      two's-complement negation when that bit is set;
//   2. both weights are placed in one wide word and both inputs in another,
//      the second element of each pair starting at bit (BIT_W+BIT_IN);
//   3. the two wide words are multiplied once; the lowest (BIT_W+BIT_IN) bits
//      of the product and the field starting at bit 2*(BIT_W+BIT_IN) are read
//      back as the two lane products (the cross products fall in between);
//   4. each lane product is negated again when its weight's sign bit was set.
//
// in      : SIMD lanes of BIT_IN bits, packed LSB-first.
// weight  : SIMD lanes of BIT_W bits, packed LSB-first, two's complement.
// returns : sum of all lane products in BIT_ACC bits.
//
// NOTE(review): in_0/in_1 are declared signed, but their raw bits are copied
// into an unsigned word, so the inputs appear to be treated as unsigned --
// confirm against the callers.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<BIT_ACC> tioo_mutiplier(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){
    ap_int<BIT_ACC> accumulation = 0;


    for(int i=0; i<SIMD; i+=2){
#pragma HLS UNROLL

    	ap_int<BIT_IN>in_0;
    	ap_int<BIT_IN>in_1;

    	ap_int<BIT_W>w_0;
    	ap_int<BIT_W>w_1;

    	// Full-width product of the two packed operands.
    	ap_int<3*(BIT_W+BIT_IN)>result;

    	ap_int<(BIT_W+BIT_IN)>out_0;
    	ap_int<(BIT_W+BIT_IN)>out_1;

    	// Unpack the weight and input of lanes i and i+1.
    	w_0 = weight((i+1)*BIT_W-1, i*BIT_W);
    	w_1 = weight((i+2)*BIT_W-1, (i+1)*BIT_W);

    	in_0 = in((i+1)*BIT_IN-1, i*BIT_IN);
    	in_1 = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);

    	// Remember the weight signs, then work with the negated value when set.
    	ap_uint<1> sign_0 = w_0(BIT_W - 1, BIT_W - 1);
    	ap_uint<1> sign_1 = w_1(BIT_W - 1, BIT_W - 1);

    	w_0 = sign_0?(ap_int<BIT_W>)(~w_0+1):w_0;
    	w_1 = sign_1?(ap_int<BIT_W>)(~w_1+1):w_1;

    	// Packed operands; untouched bits stay zero.
    	ap_uint<2*BIT_W+BIT_IN> temp_w = 0;
    	ap_uint<2*BIT_IN+BIT_W> temp_in = 0;

    	temp_w(BIT_W-1,0) = w_0;
    	temp_w(2*BIT_W+BIT_IN-1,BIT_W+BIT_IN) = w_1;

    	temp_in(BIT_IN-1,0) = in_0;
    	temp_in(2*BIT_IN+BIT_W-1,1*(BIT_W+BIT_IN)) = in_1;

        // One multiplication yields both lane products in separate fields.
        result = temp_in * temp_w;

        out_0 = result((BIT_W+BIT_IN)-1,0);
        out_1 = result(3*(BIT_W+BIT_IN)-1,2*(BIT_W+BIT_IN));

    	// Restore the sign that was stripped from the weights.
    	out_0 = sign_0?(ap_int<BIT_W+BIT_IN>)(~out_0+1):out_0;
    	out_1 = sign_1?(ap_int<BIT_W+BIT_IN>)(~out_1+1):out_1;

        accumulation = accumulation + out_0 + out_1;
    }
    return accumulation;
}


// Signed dot product over SIMD packed lanes, processed two lanes at a time.
//
// Both the input lanes and the weight lanes are read as two's-complement
// values. The product of the even lane is bound to the Mul_LUT core via the
// RESOURCE pragma; the odd lane's product is added to it before the pair sum
// is accumulated. SIMD must be even because lane i+1 is always read.
//
// in      : SIMD lanes of BIT_IN bits, packed LSB-first.
// weight  : SIMD lanes of BIT_W bits, packed LSB-first.
// returns : sum of all lane products in BIT_ACC bits.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<BIT_ACC> tioo_mutiplier_sb(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){
    ap_int<BIT_ACC> total = 0;

    for(int lane=0; lane<SIMD; lane+=2){
#pragma HLS UNROLL

        // Even lane of the pair.
        ap_int<BIT_W>  wt_even  = weight((lane+1)*BIT_W-1, lane*BIT_W);
        ap_int<BIT_IN> act_even = in((lane+1)*BIT_IN-1, lane*BIT_IN);

        // Odd lane of the pair.
        ap_int<BIT_W>  wt_odd   = weight((lane+2)*BIT_W-1, (lane+1)*BIT_W);
        ap_int<BIT_IN> act_odd  = in((lane+2)*BIT_IN-1, (lane+1)*BIT_IN);

        ap_int<(BIT_W+BIT_IN)> prod_even;
        prod_even = wt_even * act_even;
#pragma HLS RESOURCE variable=prod_even core=Mul_LUT

        // Two extra bits of headroom for the sum of the two products.
        ap_int<2+(BIT_W+BIT_IN)> pair_sum;
        pair_sum = wt_odd * act_odd + prod_even;

        total += pair_sum;
    }
    return total;
}


// Streaming image normalisation: reads IMG_ROW*IMG_COL*reps packed words,
// passes each through img_norm_calc with the file-level img_norm_weight and
// img_norm_bias tables, and writes one result word per input word.
//
// in   : input stream, SIMD lanes of BIT_IN bits per word.
// out  : output stream with the same word layout.
// reps : number of frames to process.
template <int BIT_IN, int SIMD, int IMG_ROW, int IMG_COL>
void img_norm(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                hls::stream<ap_uint<BIT_IN*SIMD> >& out,
                ap_uint<10> reps ){

// #pragma HLS DATAFLOW

    // Total number of words across all frames.
    const unsigned word_count = IMG_ROW*IMG_COL*reps;

    for(unsigned w = 0; w < word_count; w++){
        #pragma HLS PIPELINE II=1
        ap_uint<BIT_IN*SIMD> raw = in.read();
        out.write(img_norm_calc<BIT_IN, SIMD>(raw, img_norm_weight, img_norm_bias));
    }
}

/************************ Image Normalization ************************/

// template <  int BIT_IN,
//             int BIT_W,
//             int BIT_ACC,
//             int SIMD>
// ap_int<BIT_ACC> chi_vector_dot_product(
//     ap_uint<BIT_IN*SIMD> in,
//     ap_uint<BIT_W*SIMD> weight){

//     ap_int<BIT_ACC> accumulation = 0;
//     ap_uint<(2*BIT_IN+BIT_W)> temp_in;
//     ap_uint<(2*BIT_W+BIT_IN)> temp_w;

//     ap_uint<BIT_IN> temp_in_true_0;
//     ap_uint<BIT_IN> temp_in_true_1;
//     ap_uint<BIT_W> temp_weight_true_0;
//     ap_uint<BIT_W> temp_weight_true_1;

//     ap_uint<BIT_IN+BIT_W-1> res_complement_0;
//     ap_uint<BIT_IN+BIT_W-1> res_complement_1;

//     ap_uint<1> res_sign_0;
//     ap_uint<1> res_sign_1;

//     ap_int<BIT_IN+BIT_W> res_0;
//     ap_int<BIT_IN+BIT_W> res_1;

//     for(int i=0; i<(SIMD/2)+1; i++){
//     #pragma HLS UNROLL
//         if(i==0){
//             ap_int<BIT_IN> temp_in_0 = in((i+1)*BIT_IN-1, i*BIT_IN);
//             ap_int<BIT_W> temp_w_0 = weight((i+1)*BIT_W-1, i*BIT_W);
//             ap_int<BIT_IN> temp_in_1 = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);
//             ap_int<BIT_W> temp_w_1 = weight((i+2)*BIT_W-1, (i+1)*BIT_W);

//             /* get sign bit */
//             ap_uint<1> temp_in_sign_0 = temp_in_0(BIT_IN-1, BIT_IN-1);
//             ap_uint<1> temp_weight_sign_0 = temp_w_0(BIT_W-1, BIT_W-1);
//             ap_uint<1> temp_in_sign_1 = temp_in_1(BIT_IN-1, BIT_IN-1);
//             ap_uint<1> temp_weight_sign_1 = temp_w_1(BIT_W-1, BIT_W-1);

//             if(temp_in_sign_0==1){
//                 if(temp_in_0==-(1<<(BIT_IN-1))){
//                     temp_in_true_0 = 1<<(BIT_IN-1);
//                 }
//                 else{
//                     temp_in_true_0(BIT_IN-1, BIT_IN-1) = 0;
//                     temp_in_true_0(BIT_IN-2, 0) = ~(temp_in_0(BIT_IN-2, 0)-1);
//                 }
//             }
//             else{
//                 temp_in_true_0 = temp_in_0;
//             }

//             if(temp_weight_sign_0==1){
//                 if(temp_w_0==-(1<<(BIT_W-1))){
//                     temp_weight_true_0 = 1<<(BIT_W-1);
//                 }
//                 else{
//                     temp_weight_true_0(BIT_W-1, BIT_W-1) = 0;
//                     temp_weight_true_0(BIT_W-2, 0) = ~(temp_w_0(BIT_W-2, 0)-1);
//                 }
//             }
//             else{
//                 temp_weight_true_0 = temp_w_0;
//             }

//             if(temp_in_sign_1==1){
//                 if(temp_in_1==-(1<<(BIT_IN-1))){
//                     temp_in_true_1 = 1<<(BIT_IN-1);
//                 }
//                 else{
//                     temp_in_true_1(BIT_IN-1, BIT_IN-1) = 0;
//                     temp_in_true_1(BIT_IN-2, 0) = ~(temp_in_1(BIT_IN-2, 0)-1);
//                 }
//             }
//             else{
//                 temp_in_true_1 = temp_in_1;
//             }

//             if(temp_weight_sign_1==1){
//                 if(temp_w_1==-(1<<(BIT_W-1))){
//                     temp_weight_true_1 = 1<<(BIT_W-1);
//                 }
//                 else{
//                     temp_weight_true_1(BIT_W-1, BIT_W-1) = 0;
//                     temp_weight_true_1(BIT_W-2, 0) = ~(temp_w_1(BIT_W-2, 0)-1);
//                 }
//             }
//             else{
//                 temp_weight_true_1 = temp_w_1;
//             }

//             temp_in(BIT_IN+BIT_W-1, 0) = temp_in_true_0;
//             temp_in(2*BIT_IN+BIT_W-1, BIT_IN+BIT_W) = temp_in_true_1;
//             temp_w(BIT_IN+BIT_W-1, 0) = temp_weight_true_0;
//             temp_w(BIT_IN+2*BIT_W-1, BIT_IN+BIT_W) = temp_weight_true_1;

//             ap_uint<(3*(BIT_IN+BIT_W))> result = temp_in * temp_w;
//             #pragma HLS RESOURCE variable=result core=DSP48

//             if(temp_in_0!=0 && temp_w_0!=0){
//                 res_sign_0 = temp_in_sign_0 ^ temp_weight_sign_0;
//             }
//             else{
//                 res_sign_0 = 0;
//             }
//             res_0(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_0;

//             if(temp_in_1!=0 && temp_w_1!=0){
//                 res_sign_1 = temp_in_sign_1 ^ temp_weight_sign_1;
//             }
//             else{
//                 res_sign_1 = 0;
//             }
//             res_1(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_1;

//             res_complement_0 = result(BIT_IN+BIT_W-2, 0);
//             if(res_sign_0==1){
//                 res_complement_0 = ~res_complement_0 + 1;
//             }
//             res_0(BIT_IN+BIT_W-2, 0) = res_complement_0;

//             res_complement_1 = result(3*(BIT_IN+BIT_W)-2, 2*(BIT_IN+BIT_W));
//             if(res_sign_1==1){
//                 res_complement_1 = ~res_complement_1 + 1;
//             }
//             res_1(BIT_IN+BIT_W-2, 0) = res_complement_1;

//             accumulation += (res_0 + res_1);

//             // cout << "temp_in_0: " << temp_in_0 << endl;
//             // cout << "temp_w_0: " << temp_w_0 << endl;
//             // cout << "temp_in_1: " << temp_in_1 << endl;
//             // cout << "temp_w_1: " << temp_w_1 << endl;
//             // cout << "res_0: " << res_0 << endl;
//             // cout << "res_1: " << res_1 << endl;
//             // cout << "accumulation: " << accumulation << endl;
//         }
//         else{
//             ap_int<BIT_W> temp_w = weight((i+2)*BIT_W-1, (i+1)*BIT_W);
//             ap_int<BIT_IN> temp_in = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);
//             ap_int<(BIT_W+BIT_IN)> result = temp_in * temp_w;
//             #pragma HLS RESOURCE variable=result core=DSP48
//             accumulation += result;

//             // cout << "temp_in: " << temp_in << endl;
//             // cout << "temp_w: " << temp_w << endl;
//             // cout << "res: " << result << endl;
//             // cout << "accumulation: " << accumulation << endl;
//         }
//     }
//     return accumulation;
// }

// template <  int BIT_IN,
//             int BIT_W,
//             int BIT_ACC,
//             int SIMD>
// ap_int<BIT_ACC> chi_vector_dot_product_unsigned(
//     ap_uint<BIT_IN*SIMD> in,
//     ap_uint<BIT_W*SIMD> weight){

//     ap_int<BIT_ACC> accumulation = 0;

//     ap_uint<(2*BIT_IN+BIT_W)> temp_in;
//     ap_uint<(2*BIT_W+BIT_IN)> temp_w;

//     ap_uint<BIT_W> temp_weight_true_0;
//     ap_uint<BIT_W> temp_weight_true_1;

//     ap_uint<BIT_IN+BIT_W-1> res_complement_0;
//     ap_uint<BIT_IN+BIT_W-1> res_complement_1;

//     ap_uint<1> res_sign_0;
//     ap_uint<1> res_sign_1;

//     ap_int<BIT_IN+BIT_W> res_0;
//     ap_int<BIT_IN+BIT_W> res_1;

//     for(int i=0; i<SIMD; i+=2){
//     #pragma HLS UNROLL
//         ap_int<BIT_W> temp_w_0 = weight((i+1)*BIT_W-1, i*BIT_W);
//         ap_int<BIT_W> temp_w_1 = weight((i+2)*BIT_W-1, (i+1)*BIT_W);
//         ap_uint<BIT_IN> temp_in_0 = in((i+1)*BIT_IN-1, i*BIT_IN);
//         ap_uint<BIT_IN> temp_in_1 = in((i+2)*BIT_IN-1, (i+1)*BIT_IN);

//         // ap_uint<1> temp_weight_sign_0 = weight((i+1)*BIT_W-1, (i+1)*BIT_W-1);
//         // ap_uint<1> temp_weight_sign_1 = weight((i+2)*BIT_W-1, (i+2)*BIT_W-1);

//         if(weight((i+1)*BIT_W-1, (i+1)*BIT_W-1) == 1){
//             // if(temp_w_0==-(1<<(BIT_W-1))){
//             //     temp_weight_true_0 = 1<<(BIT_W-1);
//             // }
//             // else{
//                 temp_weight_true_0(BIT_W-1, BIT_W-1) = 0;
//                 temp_weight_true_0(BIT_W-2, 0) = ~(temp_w_0(BIT_W-2, 0)-1);
//             // }
//         }
//         else{
//             temp_weight_true_0 = temp_w_0;
//         }

//         if(weight((i+2)*BIT_W-1, (i+2)*BIT_W-1) == 1){
//             // if(temp_w_1==-(1<<(BIT_W-1))){
//             //     temp_weight_true_1 = 1<<(BIT_W-1);
//             // }
//             // else{
//                 temp_weight_true_1(BIT_W-1, BIT_W-1) = 0;
//                 temp_weight_true_1(BIT_W-2, 0) = ~(temp_w_1(BIT_W-2, 0)-1);
//             // }
//         }
//         else{
//             temp_weight_true_1 = temp_w_1;
//         }

//         if(temp_in_0 != 0){
//             res_sign_0 = weight((i+1)*BIT_W-1, (i+1)*BIT_W-1);
//         }
//         else{
//             res_sign_0 = 0;
//         }

//         if(temp_in_1 != 0){
//             res_sign_1 = weight((i+2)*BIT_W-1, (i+2)*BIT_W-1);
//         }
//         else{
//             res_sign_1 = 0;
//         }

//         temp_in(BIT_IN+BIT_W-1, 0) = temp_in_0;
//         temp_in(2*BIT_IN+BIT_W-1, BIT_IN+BIT_W) = temp_in_1;
//         temp_w(BIT_IN+BIT_W-1, 0) = temp_weight_true_0;
//         temp_w(BIT_IN+2*BIT_W-1, BIT_IN+BIT_W) = temp_weight_true_1;

//         ap_uint<(3*(BIT_W+BIT_IN))> result = temp_w * temp_in;
//         #pragma HLS RESOURCE variable=result core=DSP48
//         // #pragma HLS RESOURCE variable=result core=MulnS

//         res_complement_0 = result(BIT_IN+BIT_W-2, 0);
//         if(res_sign_0==1){
//             res_complement_0 = ~res_complement_0 + 1;
//         }
//         res_0(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_0;
//         res_0(BIT_IN+BIT_W-2, 0) = res_complement_0;

//         res_complement_1 = result(3*(BIT_IN+BIT_W)-2, 2*(BIT_IN+BIT_W));
//         if(res_sign_1==1){
//             res_complement_1 = ~res_complement_1 + 1;
//         }
//         res_1(BIT_IN+BIT_W-1, BIT_IN+BIT_W-1) = res_sign_1;
//         res_1(BIT_IN+BIT_W-2, 0) = res_complement_1;

//         accumulation += (res_0 + res_1);
//     }
//     return accumulation;
// }

// Signed dot product of two packed vectors.
//
// Both operands hold SIMD lanes packed LSB-first; every input lane and every
// weight lane is read as a two's-complement value. All lane products are
// summed into a BIT_ACC-bit signed accumulator.
//
// in      : SIMD lanes of BIT_IN bits.
// weight  : SIMD lanes of BIT_W bits.
// returns : sum of the lane products.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<BIT_ACC> chi_vector_dot_product(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){

    ap_int<BIT_ACC> sum = 0;

    for(int lane=0; lane<SIMD; lane++){
        #pragma HLS UNROLL
        const int act_lsb = lane*BIT_IN;
        const int wt_lsb  = lane*BIT_W;

        ap_int<BIT_IN> act = in(act_lsb+BIT_IN-1, act_lsb);
        ap_int<BIT_W>  wt  = weight(wt_lsb+BIT_W-1, wt_lsb);

        ap_int<(BIT_W+BIT_IN)> prod = act * wt;
        sum += prod;
    }
    return sum;
}

// Dot product of an unsigned packed input vector with a signed packed weight
// vector.
//
// Both operands hold SIMD lanes packed LSB-first. Input lanes are read as
// unsigned values, weight lanes as two's-complement values. All lane products
// are summed into a BIT_ACC-bit signed accumulator.
//
// in      : SIMD lanes of BIT_IN bits (unsigned).
// weight  : SIMD lanes of BIT_W bits (signed).
// returns : sum of the lane products.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<BIT_ACC> chi_vector_dot_product_unsigned(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){

    ap_int<BIT_ACC> sum = 0;

    for(int lane=0; lane<SIMD; lane++){
        #pragma HLS UNROLL
        const int act_lsb = lane*BIT_IN;
        const int wt_lsb  = lane*BIT_W;

        ap_uint<BIT_IN> act = in(act_lsb+BIT_IN-1, act_lsb);
        ap_int<BIT_W>   wt  = weight(wt_lsb+BIT_W-1, wt_lsb);

        ap_int<(BIT_W+BIT_IN)> prod = act * wt;
        sum += prod;
    }
    return sum;
}

// Lane-wise multiply without reduction.
//
// Each unsigned input lane is multiplied by the matching signed weight lane
// and the (BIT_W+BIT_IN)-bit product is stored in its own field of the
// returned word, lane 0 in the least significant field. Nothing is summed.
// Every multiplication is bound to the Mul_LUT core via the RESOURCE pragma.
//
// in      : SIMD lanes of BIT_IN bits (unsigned), packed LSB-first.
// weight  : SIMD lanes of BIT_W bits (signed), packed LSB-first.
// returns : SIMD products of (BIT_W+BIT_IN) bits each, packed LSB-first.
// BIT_ACC is not used by this function.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<(BIT_W+BIT_IN)*SIMD> desperate_vector_pot_product_signed(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){

    const int slot = BIT_W+BIT_IN;          // width of one product field
    ap_int<(BIT_W+BIT_IN)*SIMD> packed = 0;

    for(int lane=0; lane<SIMD; lane++){
#pragma HLS UNROLL
        ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
        ap_int<BIT_W>   wt  = weight((lane+1)*BIT_W-1, lane*BIT_W);
        ap_int<(BIT_W+BIT_IN)> prod = act * wt;
#pragma HLS RESOURCE variable=prod core=Mul_LUT
        packed(slot*(lane+1)-1, slot*lane) = prod;
    }
    return packed;
}


// Lane-wise multiply without reduction, returned in a BIT_ACC-bit word.
//
// Each unsigned input lane is multiplied by the matching signed weight lane
// and the (BIT_W+BIT_IN)-bit product is stored in its own field of the
// result, lane 0 in the least significant field. Nothing is summed, so
// BIT_ACC has to cover (BIT_W+BIT_IN)*SIMD bits for every field to exist.
// Every multiplication is bound to the Mul_LUT core via the RESOURCE pragma.
//
// in      : SIMD lanes of BIT_IN bits (unsigned), packed LSB-first.
// weight  : SIMD lanes of BIT_W bits (signed), packed LSB-first.
// returns : the packed per-lane products.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<BIT_ACC> desperate_vector_pot_product(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){

    const int slot = BIT_W+BIT_IN;          // width of one product field
    ap_int<BIT_ACC> packed = 0;

    for(int lane=0; lane<SIMD; lane++){
#pragma HLS UNROLL
        ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
        ap_int<BIT_W>   wt  = weight((lane+1)*BIT_W-1, lane*BIT_W);
        ap_int<(BIT_W+BIT_IN)> prod = act * wt;
#pragma HLS RESOURCE variable=prod core=Mul_LUT
        packed(slot*(lane+1)-1, slot*lane) = prod;
    }
    return packed;
}


// Lane-wise multiply without reduction and without a multiplier-core binding.
//
// Each unsigned input lane is multiplied by the matching signed weight lane
// and the (BIT_W+BIT_IN)-bit product is stored in its own field of the
// BIT_ACC-bit result, lane 0 in the least significant field. Nothing is
// summed, so BIT_ACC has to cover (BIT_W+BIT_IN)*SIMD bits.
//
// in      : SIMD lanes of BIT_IN bits (unsigned), packed LSB-first.
// weight  : SIMD lanes of BIT_W bits (signed), packed LSB-first.
// returns : the packed per-lane products.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_int<BIT_ACC> tito_muler(
    ap_uint<BIT_IN*SIMD> in,
    ap_uint<BIT_W*SIMD> weight){

    const int slot = BIT_W+BIT_IN;          // width of one product field
    ap_int<BIT_ACC> packed = 0;

    for(int lane=0; lane<SIMD; lane++){
#pragma HLS UNROLL
        ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);
        ap_int<BIT_W>   wt  = weight((lane+1)*BIT_W-1, lane*BIT_W);
        ap_int<(BIT_W+BIT_IN)> prod = act * wt;
        packed(slot*(lane+1)-1, slot*lane) = prod;
    }
    return packed;
}


// Multiplies one unsigned input by three signed weights using a single wide
// multiplication (sign/magnitude variant).
//
// Steps:
//   1. save each weight's sign bit and replace the weight by its
//      two's-complement negation when that bit is set;
//   2. pack the low (BIT_W-1) bits of the three weights into one word, one
//      field every (BIT_W-1+BIT_IN) bits;
//   3. multiply the packed word by the input once and read the three
//      (BIT_W+BIT_IN-1)-bit fields of the product back as the results;
//   4. negate each result whose weight had its sign bit set.
//
// in_0          : unsigned input of BIT_IN bits.
// weight_0..2   : signed weights of BIT_W bits (taken by value, modified
//                 locally).
// out_0..2      : receive in_0 * weight_k.
// BIT_ACC and SIMD are not used by this function.
//
// NOTE(review): only the low (BIT_W-1) bits of each negated weight are packed,
// so the most negative weight (sign bit set, all other bits clear) appears to
// contribute a zero magnitude -- confirm that such weights never occur.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
void thwioo_tuple_sb(
    ap_uint<BIT_IN>in_0,
    ap_int<BIT_W> weight_0,
    ap_int<BIT_W> weight_1,
    ap_int<BIT_W> weight_2,
	ap_int<(BIT_W+BIT_IN)>&out_0,
	ap_int<(BIT_W+BIT_IN)>&out_1,
	ap_int<(BIT_W+BIT_IN)>&out_2
	){

    // Holds the three product fields side by side.
    ap_int<3*(BIT_W+BIT_IN-1)>result;

    ap_uint<BIT_W> real_weight_0;
    ap_uint<BIT_W> real_weight_1;
    ap_uint<BIT_W> real_weight_2;

    const ap_uint<1> real_one = 1;

    // Sign bits are captured before the weights are overwritten.
    ap_uint<1> sign_0 = weight_0(BIT_W - 1, BIT_W - 1);
    ap_uint<1> sign_1 = weight_1(BIT_W - 1, BIT_W - 1);
    ap_uint<1> sign_2 = weight_2(BIT_W - 1, BIT_W - 1);

    // Bitwise complement; adding one below completes the negation.
    real_weight_0 = (~weight_0);
    real_weight_1 = (~weight_1);
    real_weight_2 = (~weight_2);

    if(sign_0) weight_0 = real_weight_0+real_one;
    if(sign_1) weight_1 = real_weight_1+real_one;
    if(sign_2) weight_2 = real_weight_2+real_one;
/*
    weight_0 = sign_0?(ap_int<BIT_W>)(real_weight_0+real_one):weight_0;
    weight_1 = sign_1?(ap_int<BIT_W>)(real_weight_1+real_one):weight_1;
    weight_2 = sign_2?(ap_int<BIT_W>)(real_weight_2+real_one):weight_2;
*/
    // Packed weights: three (BIT_W-1)-bit fields separated by BIT_IN zero bits.
    ap_uint<3*(BIT_W-1)+2*BIT_IN> temp_w = 0;

    temp_w(BIT_W-2,0) = weight_0;
    temp_w(2*(BIT_W-1)+BIT_IN-1,BIT_W-1+BIT_IN) = weight_1;
    temp_w(3*(BIT_W-1)+2*BIT_IN-1,2*(BIT_W-1+BIT_IN)) = weight_2;

    // One multiplication produces all three partial results.
    result = in_0 * temp_w;

    out_0 = result(1*(BIT_W+BIT_IN-1)-1,0*(BIT_W+BIT_IN-1));
    out_1 = result(2*(BIT_W+BIT_IN-1)-1,1*(BIT_W+BIT_IN-1));
    out_2 = result(3*(BIT_W+BIT_IN-1)-1,2*(BIT_W+BIT_IN-1));

	// Re-apply the sign of each weight.
	out_0 = sign_0?(ap_int<BIT_W+BIT_IN>)(~out_0+1):out_0;
	out_1 = sign_1?(ap_int<BIT_W+BIT_IN>)(~out_1+1):out_1;
	out_2 = sign_2?(ap_int<BIT_W+BIT_IN>)(~out_2+1):out_2;

}

// Multiplies one unsigned input by three signed weights using a single wide
// multiplication plus a sign-bit correction.
//
// A BIT_W-bit two's-complement weight equals
//     low - sign * 2^(BIT_W-1)
// where `low` is its low (BIT_W-1) bits and `sign` is its top bit. The low
// parts of the three weights are packed into one word, one field every
// (BIT_W-1+BIT_IN) bits, and multiplied by the input once; each
// (BIT_W+BIT_IN-1)-bit field of the product is in_0 * low_k. For every weight
// whose sign bit is set, (-in_0) << (BIT_W-1) is then added, all arithmetic
// being modulo 2^(BIT_W+BIT_IN).
//
// in_0          : unsigned input of BIT_IN bits.
// weight_0..2   : signed weights of BIT_W bits.
// out_0..2      : receive in_0 * weight_k in (BIT_W+BIT_IN) bits.
// BIT_ACC and SIMD are not used by this function.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
void thwioo_tuple(
    ap_uint<BIT_IN>in_0,
    ap_int<BIT_W> weight_0,
    ap_int<BIT_W> weight_1,
    ap_int<BIT_W> weight_2,
	ap_int<(BIT_W+BIT_IN)>&out_0,
	ap_int<(BIT_W+BIT_IN)>&out_1,
	ap_int<(BIT_W+BIT_IN)>&out_2
	){

    // Holds the three product fields side by side.
    ap_int<3*(BIT_W+BIT_IN-1)>result;

    ap_uint<1> sign_0 = weight_0(BIT_W - 1, BIT_W - 1);
    ap_uint<1> sign_1 = weight_1(BIT_W - 1, BIT_W - 1);
    ap_uint<1> sign_2 = weight_2(BIT_W - 1, BIT_W - 1);

    // Each sign bit replicated across the full result width, used as a mask.
    ap_uint<BIT_W+BIT_IN> all_sign0;
    ap_uint<BIT_W+BIT_IN> all_sign1;
    ap_uint<BIT_W+BIT_IN> all_sign2;

    for(int i=0;i<BIT_W+BIT_IN;i++){
#pragma hls unroll
    	all_sign0(i,i) = sign_0;
    	all_sign1(i,i) = sign_1;
    	all_sign2(i,i) = sign_2;
    }

    ap_uint<BIT_W+BIT_IN> in_neg;

    ap_uint<BIT_W+BIT_IN> in_mod_0;
    ap_uint<BIT_W+BIT_IN> in_mod_1;
    ap_uint<BIT_W+BIT_IN> in_mod_2;

    // All-ones mask sized from the template parameters. A fixed 12-bit
    // literal here would only negate correctly when BIT_W+BIT_IN <= 12.
    const ap_uint<BIT_W+BIT_IN> zero = 0;
    ap_uint<BIT_W+BIT_IN> full = ~zero;

    // Two's-complement negation of the input in (BIT_W+BIT_IN) bits.
    in_neg = (in_0 ^ full) + 1;

    // Packed low parts of the weights; untouched bits stay zero.
    ap_uint<3*(BIT_W-1)+2*BIT_IN> temp_w = 0;

    temp_w(BIT_W-2,0) = weight_0;
    temp_w(2*(BIT_W-1)+BIT_IN-1,BIT_W-1+BIT_IN) = weight_1;
    temp_w(3*(BIT_W-1)+2*BIT_IN-1,2*(BIT_W-1+BIT_IN)) = weight_2;

    // One multiplication produces in_0 * low_k for all three weights.
    result = in_0 * temp_w;

    out_0 = result(1*(BIT_W+BIT_IN-1)-1,0*(BIT_W+BIT_IN-1));
    out_1 = result(2*(BIT_W+BIT_IN-1)-1,1*(BIT_W+BIT_IN-1));
    out_2 = result(3*(BIT_W+BIT_IN-1)-1,2*(BIT_W+BIT_IN-1));

    // Weight of the sign bit: -in_0 * 2^(BIT_W-1).
    in_neg = in_neg << (BIT_W - 1);

    // Apply the correction only where the weight's sign bit is set.
    in_mod_0 = in_neg & all_sign0;
    in_mod_1 = in_neg & all_sign1;
    in_mod_2 = in_neg & all_sign2;

	out_0 += in_mod_0;
	out_1 += in_mod_1;
	out_2 += in_mod_2;

}
// Multiplies one unsigned input by three signed weights with three separate
// multiplications, each bound to the Mul_LUT core via a RESOURCE pragma.
// The function is kept as its own hardware module (INLINE off).
//
// in_0          : unsigned input of BIT_IN bits.
// weight_0..2   : signed weights of BIT_W bits.
// out_0..2      : receive in_0 * weight_k in (BIT_W+BIT_IN) bits.
// BIT_ACC and SIMD are not used by this function.
//
// The block comment below is a disabled packed-multiplier implementation;
// `result` is only referenced by that disabled code.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
void thwioo_tuple_dcm(
    ap_uint<BIT_IN>in_0,
    ap_int<BIT_W> weight_0,
    ap_int<BIT_W> weight_1,
    ap_int<BIT_W> weight_2,
	ap_int<(BIT_W+BIT_IN)>&out_0,
	ap_int<(BIT_W+BIT_IN)>&out_1,
	ap_int<(BIT_W+BIT_IN)>&out_2
	){
#pragma hls INLINE off
    ap_int<3*(BIT_W+BIT_IN-1)>result;
/*
    ap_uint<BIT_W-1> real_weight_0;
    ap_uint<BIT_W-1> real_weight_1;
    ap_uint<BIT_W-1> real_weight_2;

    ap_uint<1> sign_0 = weight_0(BIT_W - 1, BIT_W - 1);
    ap_uint<1> sign_1 = weight_1(BIT_W - 1, BIT_W - 1);
    ap_uint<1> sign_2 = weight_2(BIT_W - 1, BIT_W - 1);

    weight_0 = sign_0?(ap_int<BIT_W>)(~weight_0+1):weight_0;
    weight_1 = sign_1?(ap_int<BIT_W>)(~weight_1+1):weight_1;
    weight_2 = sign_2?(ap_int<BIT_W>)(~weight_2+1):weight_2;

    ap_uint<3*(BIT_W-1)+2*BIT_IN> temp_w = 0;

    temp_w(BIT_W-2,0) = weight_0;
    temp_w(2*(BIT_W-1)+BIT_IN-1,BIT_W-1+BIT_IN) = weight_1;
    temp_w(3*(BIT_W-1)+2*BIT_IN-1,2*(BIT_W-1+BIT_IN)) = weight_2;

    result = in_0 * temp_w;

    out_0 = result(1*(BIT_W+BIT_IN-1)-1,0*(BIT_W+BIT_IN-1));
    out_1 = result(2*(BIT_W+BIT_IN-1)-1,1*(BIT_W+BIT_IN-1));
    out_2 = result(3*(BIT_W+BIT_IN-1)-1,2*(BIT_W+BIT_IN-1));

	out_0 = sign_0?(ap_int<BIT_W+BIT_IN>)(~out_0+1):out_0;
	out_1 = sign_1?(ap_int<BIT_W+BIT_IN>)(~out_1+1):out_1;
	out_2 = sign_2?(ap_int<BIT_W+BIT_IN>)(~out_2+1):out_2;
*/
    // Direct products; each pragma applies to the assignment just above it.
    out_0 = in_0 * weight_0;
#pragma HLS RESOURCE variable=out_0 core=Mul_LUT
    out_1 = in_0 * weight_1;
#pragma HLS RESOURCE variable=out_1 core=Mul_LUT
    out_2 = in_0 * weight_2;
#pragma HLS RESOURCE variable=out_2 core=Mul_LUT
}


// Computes three dot products of the same packed input vector against three
// packed weight vectors.
//
// For every lane the unsigned input element and the three signed weight
// elements are handed to thwioo_tuple, which returns the three products.
// Each product is added to its own BIT_ACC-bit signed accumulator. The three
// sums are returned side by side: weight_0's sum in the lowest BIT_ACC bits,
// then weight_1's, then weight_2's.
//
// in           : SIMD lanes of BIT_IN bits, packed LSB-first.
// weight_0..2  : SIMD lanes of BIT_W bits each, packed LSB-first.
// returns      : three BIT_ACC-bit sums packed into one word.
template <  int BIT_IN,
            int BIT_W,
            int BIT_ACC,
            int SIMD>
ap_uint<BIT_ACC*3> tihoo_mutiplier(
    ap_uint<BIT_IN*SIMD> in,

    ap_uint<BIT_W*SIMD> weight_0,
    ap_uint<BIT_W*SIMD> weight_1,
    ap_uint<BIT_W*SIMD> weight_2

){

    ap_int<BIT_ACC> sum_a = 0;
    ap_int<BIT_ACC> sum_b = 0;
    ap_int<BIT_ACC> sum_c = 0;

    for(int lane=0; lane<SIMD; lane++){
#pragma HLS UNROLL

        // Bit span of this lane inside each weight word.
        const int wt_msb = (lane+1)*BIT_W-1;
        const int wt_lsb = lane*BIT_W;

        ap_int<BIT_W> wt_a = weight_0(wt_msb, wt_lsb);
        ap_int<BIT_W> wt_b = weight_1(wt_msb, wt_lsb);
        ap_int<BIT_W> wt_c = weight_2(wt_msb, wt_lsb);

        ap_uint<BIT_IN> act = in((lane+1)*BIT_IN-1, lane*BIT_IN);

        ap_int<(BIT_W+BIT_IN)> prod_a;
        ap_int<(BIT_W+BIT_IN)> prod_b;
        ap_int<(BIT_W+BIT_IN)> prod_c;

        thwioo_tuple< BIT_IN,BIT_W,BIT_ACC,SIMD>(act, wt_a, wt_b, wt_c, prod_a, prod_b, prod_c);
/*
        Disabled alternative using three separate multipliers:

        out_0 = in_temp * w_0;
#pragma HLS RESOURCE variable=out_0 core=Mul_LUT
        out_1 = in_temp * w_1;
#pragma HLS RESOURCE variable=out_1 core=Mul_LUT
        out_2 = in_temp * w_2;
#pragma HLS RESOURCE variable=out_2 core=Mul_LUT
*/
        sum_a += prod_a;
        sum_b += prod_b;
        sum_c += prod_c;
    }

    // Concatenate the three sums, weight_0's sum in the low field.
    ap_uint<BIT_ACC*3> packed;
    packed(BIT_ACC - 1,0) = sum_a;
    packed(2*BIT_ACC - 1,BIT_ACC) = sum_b;
    packed(3*BIT_ACC - 1,2*BIT_ACC) = sum_c;

    return packed;

}

// Matrix-vector engine for a 3x3 convolution fed by a sliding-window stream.
//
// For each of VECT_NUMS*reps output positions the input stream delivers
// 9*INPUT_FOLD words, ordered tap-major: all INPUT_FOLD words of tap 0, then
// of tap 1, and so on. The words are consumed while the first group of PE
// output channels is computed and cached in input_temp_arr, then replayed
// from the cache for the remaining OUTPUT_FOLD-1 groups.
//
// Each PE lane accumulates chi_vector_dot_product_unsigned(input, weight)
// over all taps and input folds in a 20-bit signed accumulator. The result
// written per lane is acc * alpha + bias, stored through a BIT_TMP-bit signed
// temporary into a BIT_OUT-bit field.
//
// in      : window stream, SIMD lanes of BIT_IN bits per word.
// weights : [output fold][input fold][tap], PE*SIMD lanes of BIT_W bits.
// alpha   : per-output-channel multiplier, [output fold][PE lane].
// bias    : per-output-channel offset, [output fold][PE lane].
// out     : one word of PE lanes (BIT_OUT bits each) per output fold.
// reps    : number of frames to process.
// FL_IN, FL_OUT and FL_ALPHA are not referenced in this function.
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS>
void conv3x3(   hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/9/SIMD)][9],
                const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
                const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
                hls::stream<ap_uint<BIT_OUT*PE> >& out,
				ap_uint<10> reps){

#pragma HLS DATAFLOW

    const unsigned INPUT_FOLD = NUM_IN/SIMD/9;    /* input_channel / simd */
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;     /* output_channel / pe */
    const unsigned BIT_ACC = 20;

    // Cache of the current window so it can be replayed for every output fold.
    ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD][9];
#pragma HLS ARRAY_PARTITION variable=input_temp_arr complete dim=2

    ap_int<BIT_ACC> acc[PE];
#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

    for(int i=0; i<VECT_NUMS * reps; i++){
    	for(int out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
            for(int ay = 0;ay < 9;ay++){
				for(int in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
	#pragma HLS PIPELINE II=1
					// Only the word for the current tap and input fold is
					// needed: read it from the stream on the first output
					// fold, from the cache afterwards.
					ap_uint<BIT_IN*SIMD> input_temp;
					if(out_fold_cnt == 0){
						input_temp = in.read();
						input_temp_arr[in_fold_cnt][ay] = input_temp;
					}else{
						input_temp = input_temp_arr[in_fold_cnt][ay];
					}

					// Start of a new output fold: clear all accumulators.
					if((in_fold_cnt == 0)&&(ay == 0)){
						for(int m=0; m<PE; m++){
						#pragma HLS UNROLL
							acc[m] = 0;
						}
					}

					ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt][ay];
					for(int pe = 0 ; pe < PE; pe++){
	#pragma HLS UNROLL
						ap_uint <BIT_W*SIMD> weight_pe_loop = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
						acc[pe] += chi_vector_dot_product_unsigned<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_pe_loop);
					}

				}
    		}

    		// Scale, offset and pack the PE results of this output fold.
    		ap_uint<BIT_OUT*PE> out_buf;
            for(int p=0; p<PE; p++){
            #pragma HLS UNROLL
                ap_int<BIT_TMP> output_temp = acc[p] * alpha[out_fold_cnt][p] + bias[out_fold_cnt][p];
                out_buf((p+1)*BIT_OUT-1, p*BIT_OUT) = output_temp;
            }

            out.write(out_buf);
    	}
    }
}


// Complete 3x3 convolution layer built as a three-stage dataflow pipeline:
//   1. padding   : enlarges the frame (the intermediate size used below is
//                  ROW_IN+2 by COL_IN+2);
//   2. SWU_DW    : sliding-window unit producing the 3x3 windows;
//   3. conv3x3   : multiply-accumulate with per-channel alpha and bias.
// The output frame has the same ROW_IN x COL_IN size as the input.
// NOTE(review): padding and SWU_DW are defined outside this file section;
// the meaning of their template arguments (e.g. the trailing 1 for padding,
// the 3 and 1 for SWU_DW) is presumably pad width, kernel size and stride --
// confirm against their definitions.
//
// in      : input stream, SIMD lanes of BIT_IN bits per word.
// weights : [CH_OUT/PE][CH_IN/SIMD][9] packed weights for conv3x3.
// alpha   : per-output-channel multiplier.
// bias    : per-output-channel offset.
// out     : output stream, PE lanes of BIT_OUT bits per word.
// reps    : number of frames to process.
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv3x3_layer_crc( hls::stream<ap_uint<SIMD*BIT_IN> >& in,
				const ap_uint<BIT_W * SIMD * PE> weights[CH_OUT / PE][CH_IN / SIMD][9],
                const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
				hls::stream<ap_uint<BIT_OUT*PE> >& out,
				ap_uint<10> reps){

#pragma HLS DATAFLOW

    // Frame size after padding and frame size produced by the layer.
    const unsigned INTER_ROW = ROW_IN + 2;
	const unsigned INTER_COL = COL_IN + 2;
    const unsigned ROW_OUT = ROW_IN;
	const unsigned COL_OUT = COL_IN;

    // Stage 1: pad the incoming frame.
    stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
	padding<ROW_IN, COL_IN, CH_IN, SIMD, BIT_IN, 1>(in, padding_out, reps);

	// Stage 2: turn the padded frame into a stream of 3x3 windows.
	stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
	SWU_DW<3, 1, INTER_ROW, INTER_COL, CH_IN, SIMD, SIMD, BIT_IN> (padding_out, swu_out, reps);

    // hls::stream<ap_uint<BIT_OUT*PE> > conv_out("conv_out");

    // Stage 3: multiply-accumulate; NUM_IN is CH_IN*9 and one output vector is
    // produced per output pixel.
    conv3x3<CH_IN*3*3, CH_OUT, BIT_IN, FL_IN, BIT_OUT, FL_OUT, BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP, SIMD, PE, ROW_OUT*COL_OUT >(
    		swu_out, weights, alpha, bias, out, reps);

    // StreamingDataWidthConverter_Batch<PE*BIT_OUT, CH_OUT*BIT_OUT, ROW_OUT * COL_OUT * CH_OUT / PE>(
    //     conv_out, out, reps );
}


/**
 * Matrix-vector stage of a 1x1 convolution that processes output lanes three
 * at a time and emits raw (unscaled) accumulators.
 *
 * For each of VECT_NUMS*reps input vectors, NUM_IN/SIMD words are read from
 * `in` while the first output fold is computed and cached in a local buffer;
 * the remaining output folds replay the cached words. For every output fold
 * the PE accumulators are cleared, updated once per input fold, then packed
 * (lane p in bits [(p+1)*BIT_ACC-1 : p*BIT_ACC]) and written to `out`.
 *
 * Template parameters used by the body: NUM_IN, NUM_OUT, BIT_IN, BIT_W, SIMD,
 * PE, VECT_NUMS, BIT_ACC. The remaining parameters are not referenced here.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [output fold][input fold], each holding
 *                PE slices of BIT_W*SIMD bits
 * @param out     stream of PE*BIT_ACC-bit packed accumulator words
 * @param reps    repetition count multiplying VECT_NUMS
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS ,
			int BIT_ACC>
void conv1x1_relu_t3_pf(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/SIMD)],
                    hls::stream<ap_uint<BIT_ACC*PE> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

    const unsigned INPUT_FOLD = NUM_IN/SIMD;    /* number of input words per vector */
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;     /* output_channel / pe */

    // Cache of the current input vector, filled during output fold 0.
    ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
    #pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_LUTRAM

    ap_uint<BIT_IN*SIMD> input_temp;
    // Two padding entries so the 3-lane groups below can always write
    // acc[pe+1] and acc[pe+2]; the padding entries are never sent to `out`.
    ap_int<BIT_ACC> acc[PE+2];

#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

    for(ap_uint<25> i=0; i<VECT_NUMS * reps; i++){
    	for(ap_uint<10> out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
    		afwqe:for(ap_uint<10>  in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
#pragma HLS PIPELINE II=1
    			if(out_fold_cnt == 0){
    	            input_temp = in.read();
    	            input_temp_arr[in_fold_cnt] = input_temp;
    			}else{
    	            input_temp = input_temp_arr[in_fold_cnt];
    	        }

    			// Start of a new dot product: clear the PE real accumulators.
    			if(in_fold_cnt == 0){
    			    for(ap_uint<8> m=0; m<PE; m++){
#pragma HLS UNROLL
    			    	acc[m] = 0;
    			    }
    			}

    			ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt];
    			loopqwe:for(ap_uint<7> pe = 0 ; pe < PE; pe +=3){
#pragma HLS UNROLL
#ifdef DEBUG
    				//cout<<"PE:"<<pe<<endl;
#endif
    				ap_uint <BIT_W*SIMD> weight_pe_loop_0 = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
    				// Lanes beyond PE (last group when PE is not a multiple of 3)
    				// get an all-zero weight slice instead of an indeterminate value.
    				ap_uint <BIT_W*SIMD> weight_pe_loop_1 = 0;
    				ap_uint <BIT_W*SIMD> weight_pe_loop_2 = 0;
    				if(pe+1<PE) weight_pe_loop_1 = weight_temp((pe+2)*BIT_W*SIMD-1,(pe+1)*BIT_W*SIMD);
    				if(pe+2<PE) weight_pe_loop_2 = weight_temp((pe+3)*BIT_W*SIMD-1,(pe+2)*BIT_W*SIMD);
    				ap_int<BIT_ACC*3> o;

    				// The result is consumed as three BIT_ACC-bit fields, one per weight slice.
    				o = tihoo_mutiplier<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_pe_loop_0,weight_pe_loop_1,weight_pe_loop_2);
    				acc[pe] += o(BIT_ACC - 1,0);
    				acc[pe+1] += o(2*BIT_ACC - 1,BIT_ACC);
    				acc[pe+2] += o(3*BIT_ACC - 1,2*BIT_ACC);

    			}
    		}
			ap_uint<BIT_ACC*PE> out_buf;
            for(ap_uint<8>  p=0; p<PE; p++){
            #pragma HLS UNROLL
            	out_buf((p+1)*BIT_ACC-1, p*BIT_ACC) = acc[p];
            }
#ifdef DEBUG
            if(i==3*320){
            	cout<< out_buf << endl;
            }
#endif
            out.write(out_buf);
    	}
    }
}


/**
 * Single-stage 1x1 convolution: dot product, per-channel scale/bias, ReLU and
 * output truncation.
 *
 * For each of VECT_NUMS*reps input vectors, NUM_IN/SIMD words are read from
 * `in` during output fold 0 and cached; later output folds replay the cache.
 * Each output fold accumulates PE dot products in 20-bit signed accumulators
 * (BIT_ACC is fixed to 20 below), then computes acc*alpha+bias, replaces
 * negative results with zero, narrows with truncate_unsigned and writes one
 * PE*BIT_OUT-bit word to `out`.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [output fold][input fold]
 * @param alpha   per-channel multipliers indexed [output fold][pe]
 * @param bias    per-channel offsets indexed [output fold][pe]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count multiplying VECT_NUMS
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS >
void conv1x1_relu(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/SIMD)],
                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
		            ap_uint<10> reps ){

#pragma HLS DATAFLOW

    const unsigned INPUT_FOLD = NUM_IN/SIMD;    /* input_channel / simd * kernel_size^2 */
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;     /* output_channel / pe */
    // Accumulator width is hard-coded here rather than taken from a template parameter.
    const unsigned BIT_ACC = 20;

    //const unsigned total_loop_num = INPUT_FOLD * OUTPUT_FOLD * VECT_NUMS * reps;

    // Cache of the current input vector, filled while output fold 0 is computed.
    ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
    #pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_LUTRAM

	// NOTE(review): `tile` is never read in this function.
	unsigned tile = 0;

    ap_uint<BIT_IN*SIMD> input_temp;
    ap_int<BIT_ACC> acc[PE];

    for(int i=0; i<VECT_NUMS * reps; i++){
    	for(int out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
    		for(int in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
#pragma HLS PIPELINE II=1
    			// Fold 0 consumes the stream and fills the cache; later folds reuse it.
    			if(out_fold_cnt == 0){
    	            input_temp = in.read();
    	            input_temp_arr[in_fold_cnt] = input_temp;
    			}else{
    	            input_temp = input_temp_arr[in_fold_cnt];
    	        }

    			// Start of a new dot product: clear all accumulators.
    			if(in_fold_cnt == 0){
    			    for(int m=0; m<PE; m++){
    			    #pragma HLS UNROLL
    			    	acc[m] = 0;
    			    }
    			}

    			ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt];
    			for(int pe = 0 ; pe < PE; pe++){
		#pragma HLS UNROLL
    				// Lane pe uses bits [(pe+1)*BIT_W*SIMD-1 : pe*BIT_W*SIMD] of the weight word.
    				ap_uint <BIT_W*SIMD> weight_pe_loop = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
    				//ap_uint <BIT_IN*SIMD> in_pe_loop = input_temp((pe+1)*BIT_IN*SIMD-1,pe*BIT_IN*SIMD);
    				acc[pe] += tioo_mutiplier_sb<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_pe_loop);
    			}
    		}
    		ap_uint<BIT_OUT*PE> out_buf;
            for(int p=0; p<PE; p++){
            #pragma HLS UNROLL
                ap_int<BIT_TMP> output_temp = acc[p] * alpha[out_fold_cnt][p] + bias[out_fold_cnt][p];
                #pragma HLS RESOURCE variable=output_temp core=DSP48
                // ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
                output_temp = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp : (ap_int<BIT_TMP>)0;

                // truncate_unsigned is defined elsewhere; presumably its arguments are
                // <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
                ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
                out_buf((p+1)*BIT_OUT-1, p*BIT_OUT) = output_uint;
            }

            out.write(out_buf);
    	}
    }
}


/**
 * Matrix-vector stage of a 1x1 convolution that emits raw accumulators (no
 * scale, bias, ReLU or truncation is applied in this function).
 *
 * For each of VECT_NUMS*reps input vectors, NUM_IN/SIMD words are read from
 * `in` during output fold 0 and cached; later output folds replay the cache.
 * Each output fold accumulates PE dot products in BIT_ACC-bit signed
 * accumulators and writes them packed (lane p in bits
 * [(p+1)*BIT_ACC-1 : p*BIT_ACC]) to `out`.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [output fold][input fold]
 * @param out     stream of PE*BIT_ACC-bit packed accumulator words
 * @param reps    repetition count multiplying VECT_NUMS
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS,
			int BIT_ACC>
void conv1x1_relu_pf(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[(NUM_OUT/PE)][(NUM_IN/SIMD)],
                    hls::stream<ap_uint<BIT_ACC*PE> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

    const unsigned INPUT_FOLD = NUM_IN/SIMD;    /* input_channel / simd * kernel_size^2 */
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;     /* output_channel / pe */

    //const unsigned total_loop_num = INPUT_FOLD * OUTPUT_FOLD * VECT_NUMS * reps;

    // Cache of the current input vector, filled while output fold 0 is computed.
    ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
    #pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_BRAM

	// NOTE(review): `tile` is never read in this function.
	unsigned tile = 0;

    ap_uint<BIT_IN*SIMD> input_temp;
    ap_int<BIT_ACC> acc[PE];
    // ap_int<BIT_TMP> output_temp[PE];
    // ap_uint<BIT_OUT> output_uint[PE];

    for(int i=0; i<VECT_NUMS * reps; i++){
    	for(int out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
    		for(int in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
#pragma HLS PIPELINE II=1
    			// Fold 0 consumes the stream and fills the cache; later folds reuse it.
    			if(out_fold_cnt == 0){
    	            input_temp = in.read();
    	            input_temp_arr[in_fold_cnt] = input_temp;
    			}else{
    	            input_temp = input_temp_arr[in_fold_cnt];
    	        }

    			// Start of a new dot product: clear all accumulators.
    			if(in_fold_cnt == 0){
    			    for(int m=0; m<PE; m++){
    			    #pragma HLS UNROLL
    			    	acc[m] = 0;
    			    }
    			}

    			ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt];
    			for(int pe = 0 ; pe < PE; pe++){
		#pragma HLS UNROLL
    				// Lane pe uses bits [(pe+1)*BIT_W*SIMD-1 : pe*BIT_W*SIMD] of the weight word.
    				ap_uint <BIT_W*SIMD> weight_pe_loop = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
    				//ap_uint <BIT_IN*SIMD> in_pe_loop = input_temp((pe+1)*BIT_IN*SIMD-1,pe*BIT_IN*SIMD);
    				acc[pe] += tioo_mutiplier_sb<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_pe_loop);
    			}
    		}
    		// Pack the PE finished accumulators into one output word.
    		ap_uint<BIT_ACC*PE> out_buf;
            for(int p=0; p<PE; p++){
            #pragma HLS UNROLL
            	out_buf((p+1)*BIT_ACC-1, p*BIT_ACC) = acc[p];
            }

            out.write(out_buf);
    	}
    }
}

/**
 * Post-processing stage for packed raw accumulators: per-channel scale/bias,
 * ReLU and output truncation, one lane per pipelined loop iteration.
 *
 * For each of VECT_NUMS*reps vectors and each of NUM_OUT/PE output folds, one
 * word of PE packed BIT_ACC-bit accumulators is read from `in`. Lane pe is
 * taken as a signed value, multiplied by alpha[fold][pe], offset by
 * bias[fold][pe], replaced by zero when negative, narrowed with
 * truncate_unsigned and stored in bits [(pe+1)*BIT_OUT-1 : pe*BIT_OUT] of the
 * word written to `out`.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of PE*BIT_ACC-bit packed accumulator words
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of PE*BIT_OUT-bit packed results
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tuple(hls::stream<ap_uint<BIT_ACC*PE> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT*PE> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;


	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
			ap_uint<BIT_ACC*PE> in_buf;
			in_buf = in.read();
			ap_uint<BIT_OUT*PE> out_buf;
			for(int pe=0;pe<PE;pe++){
#pragma HLS PIPELINE
				// Extract lane pe and reinterpret its bits as a signed accumulator.
				ap_int<BIT_ACC> in_single = in_buf((pe+1)*BIT_ACC-1,pe*BIT_ACC);
				ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);

				out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
			}
			out.write(out_buf);
		}

	}
}

/**
 * Post-processing stage for packed raw accumulators (shift-register variant):
 * per-channel scale/bias, ReLU and output truncation.
 *
 * For each of VECT_NUMS*reps vectors and each of NUM_OUT/PE output folds, one
 * word of PE packed BIT_ACC-bit accumulators is read from `in`. On every lane
 * iteration the lowest BIT_ACC bits are consumed and the word is shifted right
 * by BIT_ACC, so lane pe sees the pe-th accumulator. Each accumulator is
 * multiplied by alpha[fold][pe], offset by bias[fold][pe], replaced by zero
 * when negative, narrowed with truncate_unsigned and stored in bits
 * [(pe+1)*BIT_OUT-1 : pe*BIT_OUT] of the word written to `out`.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of PE*BIT_ACC-bit packed accumulator words
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of PE*BIT_OUT-bit packed results
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tuple_sh(hls::stream<ap_uint<BIT_ACC*PE> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT*PE> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;


	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
			ap_uint<BIT_ACC*PE> in_buf;
			in_buf = in.read();
			ap_uint<BIT_OUT*PE> out_buf;
			for(int pe=0;pe<PE;pe++){
#pragma HLS PIPELINE
				// Consume the lowest lane and reinterpret its bits as a signed accumulator.
				ap_int<BIT_ACC> in_single = in_buf(BIT_ACC-1,0);
				// Advance to the next lane. The shift result must be stored back:
				// a bare `in_buf >> BIT_ACC;` would leave in_buf unchanged and make
				// every lane reuse accumulator 0.
				in_buf = in_buf >> BIT_ACC;
				ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);

				out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
			}
			out.write(out_buf);
		}

	}
}

/**
 * Post-processing stage operating on one accumulator per stream word:
 * per-channel scale/bias, ReLU and output truncation.
 *
 * For each of VECT_NUMS*reps vectors, each of NUM_OUT/PE output folds and each
 * of PE lanes, one BIT_ACC-bit value is read from `in`, taken as signed,
 * multiplied by alpha[fold][pe], offset by bias[fold][pe], replaced by zero
 * when negative, narrowed with truncate_unsigned and written to `out` as a
 * single BIT_OUT-bit word.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of BIT_ACC-bit accumulators (one lane per word)
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of BIT_OUT-bit results (one lane per word)
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tuple_one(hls::stream<ap_uint<BIT_ACC> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;
	// Signed holder for the unsigned stream word that is read each iteration.
	ap_int<BIT_ACC> in_buf;
	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
			for(ap_uint<7> pe=0;pe<PE;pe++){
#pragma HLS PIPELINE
				in_buf = in.read();
				ap_int<BIT_TMP> output_temp_0 = in_buf * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);
				out.write(output_uint);
			}
		}
	}
}
/**
 * Post-processing stage for packed raw accumulators, lane loop unrolled by 2:
 * per-channel scale/bias, ReLU and output truncation.
 *
 * For each of VECT_NUMS*reps vectors and each of NUM_OUT/PE output folds, one
 * word of PE packed BIT_ACC-bit accumulators is read from `in`. Lane pe is
 * taken as a signed value, multiplied by alpha[fold][pe], offset by
 * bias[fold][pe], replaced by zero when negative, narrowed with
 * truncate_unsigned and stored in bits [(pe+1)*BIT_OUT-1 : pe*BIT_OUT] of the
 * word written to `out`. The fold loop requests a pipeline II of PE/2.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of PE*BIT_ACC-bit packed accumulator words
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of PE*BIT_OUT-bit packed results
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tup2(hls::stream<ap_uint<BIT_ACC*PE> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT*PE> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;


	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
#pragma HLS PIPELINE II=PE/2
			ap_uint<BIT_ACC*PE> in_buf;
			in_buf = in.read();
			ap_uint<BIT_OUT*PE> out_buf;
			for(int pe=0;pe<PE;pe++){
#pragma HLS UNROLL factor=2
				// Extract lane pe and reinterpret its bits as a signed accumulator.
				ap_int<BIT_ACC> in_single = in_buf((pe+1)*BIT_ACC-1,pe*BIT_ACC);
				ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);

				out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
			}
			out.write(out_buf);
		}

	}
}

/**
 * Post-processing stage for packed raw accumulators, lane loop unrolled by 6:
 * per-channel scale/bias, ReLU and output truncation.
 *
 * For each of VECT_NUMS*reps vectors and each of NUM_OUT/PE output folds, one
 * word of PE packed BIT_ACC-bit accumulators is read from `in`. Lane pe is
 * taken as a signed value, multiplied by alpha[fold][pe], offset by
 * bias[fold][pe], replaced by zero when negative, narrowed with
 * truncate_unsigned and stored in bits [(pe+1)*BIT_OUT-1 : pe*BIT_OUT] of the
 * word written to `out`. The fold loop requests a pipeline II of PE/6.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of PE*BIT_ACC-bit packed accumulator words
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of PE*BIT_OUT-bit packed results
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tup6(hls::stream<ap_uint<BIT_ACC*PE> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT*PE> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;


	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
#pragma HLS PIPELINE II=PE/6
			ap_uint<BIT_ACC*PE> in_buf;
			in_buf = in.read();
			ap_uint<BIT_OUT*PE> out_buf;
			for(int pe=0;pe<PE;pe++){
#pragma HLS UNROLL factor=6
				// Extract lane pe and reinterpret its bits as a signed accumulator.
				ap_int<BIT_ACC> in_single = in_buf((pe+1)*BIT_ACC-1,pe*BIT_ACC);
				ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);

				out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
			}
			out.write(out_buf);
		}

	}
}

/**
 * Post-processing stage for packed raw accumulators, lane loop unrolled by 3:
 * per-channel scale/bias, ReLU and output truncation.
 *
 * For each of VECT_NUMS*reps vectors and each of NUM_OUT/PE output folds, one
 * word of PE packed BIT_ACC-bit accumulators is read from `in`. Lane pe is
 * taken as a signed value, multiplied by alpha[fold][pe], offset by
 * bias[fold][pe], replaced by zero when negative, narrowed with
 * truncate_unsigned and stored in bits [(pe+1)*BIT_OUT-1 : pe*BIT_OUT] of the
 * word written to `out`. A pipeline II of PE/3 is requested in the fold loop.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of PE*BIT_ACC-bit packed accumulator words
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of PE*BIT_OUT-bit packed results
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tup3(hls::stream<ap_uint<BIT_ACC*PE> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT*PE> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;


	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
			ap_uint<BIT_ACC*PE> in_buf;
			in_buf = in.read();
			ap_uint<BIT_OUT*PE> out_buf;
			// NOTE(review): this pragma sits in the fold-loop body after the
			// declarations, not at the top of the loop as in the tup2/tup4/tup6
			// variants - confirm the tool attaches it to the fold loop.
#pragma HLS PIPELINE II=PE/3
			for(int pe=0;pe<PE;pe++){
#pragma HLS UNROLL factor=3
				// Extract lane pe and reinterpret its bits as a signed accumulator.
				ap_int<BIT_ACC> in_single = in_buf((pe+1)*BIT_ACC-1,pe*BIT_ACC);
				ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);

				out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
			}
#ifdef DEBUG
            // Debug aid: print the packed output word for iteration i == 3*320.
            if(i==3*320){
            	cout<< out_buf << endl;
            }
#endif
			out.write(out_buf);
		}

	}
}


/**
 * Post-processing stage for packed raw accumulators, lane loop unrolled by 4:
 * per-channel scale/bias, ReLU and output truncation.
 *
 * For each of VECT_NUMS*reps vectors and each of NUM_OUT/PE output folds, one
 * word of PE packed BIT_ACC-bit accumulators is read from `in`. Lane pe is
 * taken as a signed value, multiplied by alpha[fold][pe], offset by
 * bias[fold][pe], replaced by zero when negative, narrowed with
 * truncate_unsigned and stored in bits [(pe+1)*BIT_OUT-1 : pe*BIT_OUT] of the
 * word written to `out`. The fold loop requests a pipeline II of PE/4.
 *
 * NUM_IN, BIT_IN, BIT_W and SIMD are not referenced by this function.
 *
 * @param in    stream of PE*BIT_ACC-bit packed accumulator words
 * @param alpha per-channel multipliers indexed [output fold][pe]
 * @param bias  per-channel offsets indexed [output fold][pe]
 * @param out   stream of PE*BIT_OUT-bit packed results
 * @param reps  repetition count multiplying VECT_NUMS
 */
template <    int NUM_IN,
			int NUM_OUT,

			int BIT_IN,
			int FL_IN,
			int BIT_OUT,
			int FL_OUT,

			int BIT_W,
			int BIT_ALPHA,
			int FL_ALPHA,
			int BIT_BIAS,
			int BIT_TMP,

			int SIMD,
			int PE,
			int VECT_NUMS,
			int BIT_ACC>
void pw_11_pf_tup4(hls::stream<ap_uint<BIT_ACC*PE> >& in,
        const ap_uint<BIT_ALPHA> alpha[NUM_OUT/PE][PE],
        const ap_int<BIT_BIAS> bias[NUM_OUT/PE][PE],
        hls::stream<ap_uint<BIT_OUT*PE> >& out,
		ap_uint<10> reps
){
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;
	ap_uint<7> out_fold_cnt = 0;


	for(int i=0;i<VECT_NUMS * reps;i++){
		for(out_fold_cnt = 0;out_fold_cnt < OUTPUT_FOLD;out_fold_cnt++){
#pragma HLS PIPELINE II=PE/4
			ap_uint<BIT_ACC*PE> in_buf;
			in_buf = in.read();
			ap_uint<BIT_OUT*PE> out_buf;
			for(int pe=0;pe<PE;pe++){
#pragma HLS UNROLL factor=4
				// Extract lane pe and reinterpret its bits as a signed accumulator.
				ap_int<BIT_ACC> in_single = in_buf((pe+1)*BIT_ACC-1,pe*BIT_ACC);
				ap_int<BIT_TMP> output_temp_0 = in_single * alpha[out_fold_cnt][pe] + bias[out_fold_cnt][pe];
#pragma HLS RESOURCE variable=output_temp_0 core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				ap_int<BIT_TMP> output_temp_1 = (output_temp_0(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp_0 : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp_1);

				out_buf((pe+1)*BIT_OUT-1, pe*BIT_OUT) = output_uint;
			}
			out.write(out_buf);
		}

	}
}


/**
 * 1x1 convolution layer: forwards the stream, parameters and repetition count
 * to conv1x1_relu, with one vector per pixel of a ROW_IN x COL_IN grid.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed through to conv1x1_relu
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv1x1_layer( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

    // The output grid has the same dimensions as the input grid.
    const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

    conv1x1_relu<CH_IN, CH_OUT,
                 BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                 BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                 SIMD, PE, VECTORS_PER_FRAME>(in, weights, alpha, bias, out, reps);
}


/**
 * Two-stage 1x1 convolution layer: conv1x1_relu_pf produces raw 20-bit
 * accumulators, which pw_11_pf_tuple_sh scales, biases, rectifies and
 * truncates. Both stages process ROW_IN*COL_IN vectors per repetition.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed to both stages
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv1x1_layer_pf( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

	// Width of one raw accumulator lane carried between the two stages.
	const unsigned BIT_ACC = 20;
	// The output grid has the same dimensions as the input grid.
	const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

	// Link between the stages: PE packed accumulators per word.
	hls::stream<ap_uint<BIT_ACC*PE> > temp_out;

	// Stage 1: dot products only.
    conv1x1_relu_pf<CH_IN, CH_OUT,
                    BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                    BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                    SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(in, weights, temp_out, reps);

	// Stage 2: scale, bias, ReLU, truncate.
	// NOTE(review): the first argument is CH_IN*3*3 although this is a 1x1
	// layer; pw_11_pf_tuple_sh does not reference its NUM_IN parameter.
    pw_11_pf_tuple_sh<CH_IN*3*3, CH_OUT,
                      BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                      BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                      SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(temp_out, alpha, bias, out, reps);
}

/**
 * Two-stage 1x1 convolution layer: conv1x1_relu_t3_pf produces raw 20-bit
 * accumulators, which pw_11_pf_tuple scales, biases, rectifies and truncates.
 * Both stages process ROW_IN*COL_IN vectors per repetition.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed to both stages
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv1x1_layer_pf_t3( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

	// Width of one raw accumulator lane carried between the two stages.
	const unsigned BIT_ACC = 20;
	// The output grid has the same dimensions as the input grid.
	const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

	// Link between the stages: PE packed accumulators per word.
	hls::stream<ap_uint<BIT_ACC*PE> > temp_out;

	// Stage 1: dot products only.
	conv1x1_relu_t3_pf<CH_IN, CH_OUT,
	                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
	                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
	                   SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(in, weights, temp_out, reps);

	// Stage 2: scale, bias, ReLU, truncate.
	// NOTE(review): the first argument is CH_IN*3*3 although this is a 1x1
	// layer; pw_11_pf_tuple does not reference its NUM_IN parameter.
    pw_11_pf_tuple<CH_IN*3*3, CH_OUT,
                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                   SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(temp_out, alpha, bias, out, reps);
}

/**
 * Two-stage 1x1 convolution layer: conv1x1_relu_t3_pf produces raw 20-bit
 * accumulators, which pw_11_pf_tup2 scales, biases, rectifies and truncates.
 * Both stages process ROW_IN*COL_IN vectors per repetition.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed to both stages
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv1x1_layer_pf_t3_2( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
		            ap_uint<10> reps ){

#pragma HLS DATAFLOW

	// Width of one raw accumulator lane carried between the two stages.
	const unsigned BIT_ACC = 20;
	// The output grid has the same dimensions as the input grid.
	const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

	// Link between the stages: PE packed accumulators per word.
	hls::stream<ap_uint<BIT_ACC*PE> > temp_out;

	// Stage 1: dot products only.
	conv1x1_relu_t3_pf<CH_IN, CH_OUT,
	                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
	                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
	                   SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(in, weights, temp_out, reps);

	// Stage 2: scale, bias, ReLU, truncate (lane loop unrolled by 2).
	// NOTE(review): the first argument is CH_IN*3*3 although this is a 1x1
	// layer; pw_11_pf_tup2 does not reference its NUM_IN parameter.
    pw_11_pf_tup2<CH_IN*3*3, CH_OUT,
                  BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                  BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                  SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(temp_out, alpha, bias, out, reps);
}

/**
 * Two-stage 1x1 convolution layer: conv1x1_relu_t3_pf produces raw 20-bit
 * accumulators, which pw_11_pf_tup6 scales, biases, rectifies and truncates.
 * Both stages process ROW_IN*COL_IN vectors per repetition.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed to both stages
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv1x1_layer_pf_t3_6( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
		            ap_uint<10> reps ){

#pragma HLS DATAFLOW

	// Width of one raw accumulator lane carried between the two stages.
	const unsigned BIT_ACC = 20;
	// The output grid has the same dimensions as the input grid.
	const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

	// Link between the stages: PE packed accumulators per word.
	hls::stream<ap_uint<BIT_ACC*PE> > temp_out;

	// Stage 1: dot products only.
	conv1x1_relu_t3_pf<CH_IN, CH_OUT,
	                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
	                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
	                   SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(in, weights, temp_out, reps);

	// Stage 2: scale, bias, ReLU, truncate (lane loop unrolled by 6).
	// NOTE(review): the first argument is CH_IN*3*3 although this is a 1x1
	// layer; pw_11_pf_tup6 does not reference its NUM_IN parameter.
    pw_11_pf_tup6<CH_IN*3*3, CH_OUT,
                  BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                  BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                  SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(temp_out, alpha, bias, out, reps);
}

/**
 * Two-stage 1x1 convolution layer: conv1x1_relu_t3_pf produces raw 20-bit
 * accumulators, which pw_11_pf_tup3 scales, biases, rectifies and truncates.
 * Both stages process ROW_IN*COL_IN vectors per repetition.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed to both stages
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE>
void conv1x1_layer_pf_t3_3( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

	// Width of one raw accumulator lane carried between the two stages.
	const unsigned BIT_ACC = 20;
	// The output grid has the same dimensions as the input grid.
	const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

	// Link between the stages: PE packed accumulators per word.
	hls::stream<ap_uint<BIT_ACC*PE> > temp_out;

	// Stage 1: dot products only.
	conv1x1_relu_t3_pf<CH_IN, CH_OUT,
	                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
	                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
	                   SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(in, weights, temp_out, reps);

	// Stage 2: scale, bias, ReLU, truncate (lane loop unrolled by 3).
	// NOTE(review): the first argument is CH_IN*3*3 although this is a 1x1
	// layer; pw_11_pf_tup3 does not reference its NUM_IN parameter.
    pw_11_pf_tup3<CH_IN*3*3, CH_OUT,
                  BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                  BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                  SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(temp_out, alpha, bias, out, reps);
}


/**
 * Two-stage 1x1 convolution layer: conv1x1_relu_t3_pf produces raw 20-bit
 * accumulators, which pw_11_pf_tup4 scales, biases, rectifies and truncates.
 * Both stages process ROW_IN*COL_IN vectors per repetition.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [CH_OUT/PE][CH_IN/SIMD]
 * @param alpha   per-channel multipliers indexed [CH_OUT/PE][PE]
 * @param bias    per-channel offsets indexed [CH_OUT/PE][PE]
 * @param out     stream of PE*BIT_OUT-bit packed results
 * @param reps    repetition count passed to both stages
 */
template <  int ROW_IN,
            int COL_IN,
            int CH_IN,
            int BIT_IN,
            int FL_IN,

            int CH_OUT,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE >
void conv1x1_layer_pf_t3_4( hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD*PE> weights[CH_OUT/PE][CH_IN/SIMD],
                    const ap_uint<BIT_ALPHA> alpha[CH_OUT/PE][PE],
                    const ap_int<BIT_BIAS> bias[CH_OUT/PE][PE],
                    hls::stream<ap_uint<BIT_OUT*PE> >& out,
		            ap_uint<10> reps){

#pragma HLS DATAFLOW

	// Width of one raw accumulator lane carried between the two stages.
	const unsigned BIT_ACC = 20;
	// The output grid has the same dimensions as the input grid.
	const unsigned VECTORS_PER_FRAME = ROW_IN * COL_IN;

	// Link between the stages: PE packed accumulators per word.
	hls::stream<ap_uint<BIT_ACC*PE> > temp_out;

	// Stage 1: dot products only.
	conv1x1_relu_t3_pf<CH_IN, CH_OUT,
	                   BIT_IN, FL_IN, BIT_OUT, FL_OUT,
	                   BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
	                   SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(in, weights, temp_out, reps);

	// Stage 2: scale, bias, ReLU, truncate (lane loop unrolled by 4).
	// NOTE(review): the first argument is CH_IN*3*3 although this is a 1x1
	// layer; pw_11_pf_tup4 does not reference its NUM_IN parameter.
    pw_11_pf_tup4<CH_IN*3*3, CH_OUT,
                  BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                  BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                  SIMD, PE, VECTORS_PER_FRAME, BIT_ACC>(temp_out, alpha, bias, out, reps);
}

/**
 * Depthwise 3x3 convolution with per-channel scale/bias, ReLU and truncation,
 * using a separate clear / accumulate / emit phase per output pixel.
 *
 * For each of VECT_NUMS*reps output pixels:
 *   1. all NUM_OUT/SIMD x SIMD accumulators are cleared;
 *   2. 9 * (NUM_OUT/SIMD) words are read from `in`, kernel tap j in the outer
 *      loop and channel group k in the inner loop; each word is combined with
 *      weights[k][j] by desperate_vector_pot_product and the SIMD resulting
 *      (BIT_IN+BIT_W)-bit signed fields are added to the accumulators of
 *      group k;
 *   3. for each channel group, acc*alpha+bias is computed, negative values are
 *      replaced by zero, the result is narrowed with truncate_unsigned, and
 *      one SIMD*BIT_OUT-bit word is written to `out`.
 *
 * NUM_IN and PE are not referenced by this function.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [channel group][kernel tap]
 * @param alpha   per-channel multipliers indexed [channel group][simd lane]
 * @param bias    per-channel offsets indexed [channel group][simd lane]
 * @param out     stream of SIMD*BIT_OUT-bit packed results
 * @param reps    repetition count multiplying VECT_NUMS
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS  >
void dw_conv3x3_relu(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD> weights[NUM_OUT/SIMD][9],
                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/SIMD][SIMD],
                    const ap_int<BIT_BIAS> bias[NUM_OUT/SIMD][SIMD],
                    hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

    // Total width of the packed per-lane products returned by the helper.
    const unsigned BIT_ACC = SIMD*(BIT_IN+BIT_W);

    ap_uint<BIT_IN*SIMD> input_temp;

    // Per-channel running sums over the 9 kernel taps.
    ap_int<30> acc_reordering[NUM_OUT/SIMD][SIMD];
#pragma HLS RESOURCE variable=acc_reordering core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=acc_reordering complete dim=2

    ap_int<(BIT_IN+BIT_W)*SIMD> acc_temp;

    for(int i=0 ;i<VECT_NUMS * reps;i++){
    	// Phase 1: clear the accumulators for this output pixel.
    	for(int out_fold_cnt = 0;out_fold_cnt<NUM_OUT/SIMD;out_fold_cnt++){
    		for(int simd=0;simd<SIMD;simd++){
#pragma HLS PIPELINE II=1
        		acc_reordering[out_fold_cnt][simd] = 0;
    		}
    	}
    	// Phase 2: accumulate the 9 kernel taps, tap-major over channel groups.
    	for(int j = 0 ; j < 9; j++){
    		for(int k=0 ; k < NUM_OUT/SIMD;k++){
#pragma HLS PIPELINE II=1
    			// The weight word is SIMD slices of BIT_W bits, matching the
    			// element type of `weights` (a BIT_IN*SIMD holder would drop
    			// weight bits whenever BIT_IN < BIT_W).
    			ap_uint<BIT_W*SIMD> weight_temp = weights[k][j];
    			input_temp = in.read();
    			acc_temp = desperate_vector_pot_product<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_temp);
    			for(int s = 0;s<SIMD;s++){
#pragma HLS UNROLL
    				// Field s of the packed result, reinterpreted as signed.
    				ap_int<BIT_IN + BIT_W> signed_temp = acc_temp((s+1)*(BIT_IN + BIT_W)-1 , s*(BIT_IN + BIT_W));
    				acc_reordering[k][s] += signed_temp;
    			}
    		}
    	}

		// Phase 3: scale, bias, ReLU, truncate and emit one word per channel group.
		for(int out_fold_cnt = 0;out_fold_cnt<NUM_OUT/SIMD;out_fold_cnt++){
#pragma HLS PIPELINE II=1
			ap_uint<BIT_OUT*SIMD> out_buf;
			for(int simd=0;simd<SIMD;simd++){
				ap_int<BIT_TMP> output_temp = acc_reordering[out_fold_cnt][simd] * alpha[out_fold_cnt][simd] + bias[out_fold_cnt][simd];
				#pragma HLS RESOURCE variable=output_temp core=DSP48
				// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
				output_temp = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp : (ap_int<BIT_TMP>)0;
				// truncate_unsigned is defined elsewhere; presumably its arguments are
				// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
				ap_uint<BIT_OUT> output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
				out_buf((simd+1)*BIT_OUT-1, simd*BIT_OUT) = output_uint;
			}
			out.write(out_buf);
		}
    }
}


/**
 * Depthwise 3x3 convolution with per-channel scale/bias, ReLU and truncation,
 * fused into a single pipelined loop nest.
 *
 * For each of VECT_NUMS*reps output pixels, 9 * (NUM_OUT/SIMD) words are read
 * from `in`, kernel tap j in the outer loop and channel group k in the inner
 * loop. Each word is combined with weights[k][j] by
 * desperate_vector_pot_product and split into SIMD signed
 * (BIT_IN+BIT_W)-bit fields. Tap 0 overwrites the accumulators of group k
 * (so no separate clear pass is needed), taps 1..8 add to them. On tap 8 each
 * lane computes acc*alpha+bias, replaces negative values with zero, narrows
 * with truncate_unsigned, and one SIMD*BIT_OUT-bit word per channel group is
 * written to `out`.
 *
 * NUM_IN and PE are not referenced by this function.
 *
 * @param in      stream of SIMD*BIT_IN-bit input words
 * @param weights weight words indexed [channel group][kernel tap]
 * @param alpha   per-channel multipliers indexed [channel group][simd lane]
 * @param bias    per-channel offsets indexed [channel group][simd lane]
 * @param out     stream of SIMD*BIT_OUT-bit packed results
 * @param reps    repetition count multiplying VECT_NUMS
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS >
void dw_conv3x3_relu_signed(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD> weights[NUM_OUT/SIMD][9],
                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/SIMD][SIMD],
                    const ap_int<BIT_BIAS> bias[NUM_OUT/SIMD][SIMD],
                    hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

    // Total width of the packed per-lane products returned by the helper.
    const unsigned BIT_ACC = SIMD*(BIT_IN+BIT_W);

    ap_uint<BIT_IN*SIMD> input_temp;

    // Per-channel running sums over the 9 kernel taps.
    ap_int<BIT_TMP> acc_reordering[NUM_OUT/SIMD][SIMD];
#pragma HLS ARRAY_PARTITION variable=acc_reordering complete dim=2
    // RESOURCE takes `variable=<name>`; without the '=' the directive is malformed.
#pragma HLS RESOURCE variable=acc_reordering core=RAM_2P_LUTRAM

    ap_int<(BIT_IN+BIT_W)*SIMD> acc_temp;

    aplp:for(int i=0 ;i<VECT_NUMS * reps;i++){
    	etfd:for(int j = 0 ; j < 9; j++){
    		hfs:for(int k=0 ; k < NUM_OUT/SIMD;k++){
#pragma HLS PIPELINE II=1
				ap_uint<BIT_W*SIMD> weight_temp = weights[k][j];
				input_temp = in.read();
				acc_temp = desperate_vector_pot_product<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_temp);
				// Only fully populated (and only written out) on the last tap.
				ap_uint<BIT_OUT*SIMD> out_buf;
				nmmsl:for(int s = 0;s<SIMD;s++){
#pragma HLS UNROLL
					// Field s of the packed result, reinterpreted as signed.
					ap_int<BIT_IN + BIT_W> signed_temp = acc_temp((s+1)*(BIT_IN + BIT_W)-1 , s*(BIT_IN + BIT_W));

	    			// First tap restarts the sum; later taps accumulate.
	    			if(j==0) acc_reordering[k][s] = signed_temp;
	    			else {
	    				acc_reordering[k][s] += signed_temp;

	    			}

					ap_int<BIT_TMP> output_temp;
					ap_uint<BIT_OUT> output_uint;
					if(j==8){
						output_temp = acc_reordering[k][s] * alpha[k][s] + bias[k][s];
						// ReLU: a set sign bit (MSB) means negative, which is replaced by zero.
						output_temp = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp : (ap_int<BIT_TMP>)0;
						// truncate_unsigned is defined elsewhere; presumably its arguments are
						// <int bits in, frac bits in, int bits out, frac bits out> - TODO confirm.
						output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
						out_buf((s+1)*BIT_OUT-1, s*BIT_OUT) = output_uint;
					}
				}
				if(j==8)out.write(out_buf);
			}
		}
#ifdef DEBUG
    // Self-assignment with no effect; presumably a breakpoint anchor for i == 3*320.
    if(i==3*320){
    	i = 3*320;
    }
#endif
    }
}


/**
 * 3x3 depthwise multiply-accumulate followed by per-lane scale, bias and ReLU,
 * using only channel fold 0 of the parameter tables.
 *
 * For each of VECT_NUMS * reps output positions the function reads nine
 * SIMD-wide words from `in` (one per kernel tap j = 0..8), multiplies each by
 * weights[0][j] through desperate_vector_pot_product, and accumulates the
 * per-lane products. After the ninth tap every lane is scaled by alpha[0][s],
 * offset by bias[0][s], clamped to zero when negative, passed through
 * truncate_unsigned and packed into one BIT_OUT*SIMD-bit word written to `out`.
 *
 * Only index 0 of the first dimension of weights/alpha/bias is ever accessed.
 * NOTE(review): this presumably assumes NUM_OUT/SIMD == 1 - confirm with callers.
 * NUM_IN and PE are not referenced by the body.
 *
 * @param in      input stream, nine words per output position
 * @param weights packed tap weights, indexed [fold][tap]
 * @param alpha   per-lane multiplier, indexed [fold][lane]
 * @param bias    per-lane offset, indexed [fold][lane]
 * @param out     output stream, one packed word per output position
 * @param reps    repetition count; the outer loop runs VECT_NUMS * reps times
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS >
void dw_conv3x3_relu_oly(  hls::stream<ap_uint<BIT_IN*SIMD> >& in,
                    const ap_uint<BIT_W*SIMD> weights[NUM_OUT/SIMD][9],
                    const ap_uint<BIT_ALPHA> alpha[NUM_OUT/SIMD][SIMD],
                    const ap_int<BIT_BIAS> bias[NUM_OUT/SIMD][SIMD],
                    hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
					ap_uint<10> reps){

#pragma HLS DATAFLOW

    //const unsigned INPUT_FOLD = NUM_OUT/SIMD;    /* input_channel / simd * kernel_size^2 */
	//const unsigned OUTPUT_FOLD = NUM_OUT/PE;     /* output_channel / pe */
    // Width of the packed product word: one (BIT_IN+BIT_W)-bit field per lane.
    const unsigned BIT_ACC = SIMD*(BIT_IN+BIT_W);

    // Not referenced anywhere below.
    const unsigned BIT_SUM = 14;

    //const unsigned total_loop_num = INPUT_FOLD * VECT_NUMS * reps; // right and differ from


    //ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD];
    //#pragma HLS RESOURCE variable=input_temp_arr core=RAM_2P_BRAM

    // in_fold_cnt and tile are not referenced anywhere below.
    unsigned in_fold_cnt = 0;
	unsigned tile = 0;

    ap_uint<BIT_IN*SIMD> input_temp;

    // Running sums per lane: column 0 is the zero seed, column j+1 holds the
    // sum after tap j, so column 9 is the complete 3x3 accumulation.
    ap_int<BIT_TMP> acc_reordering[SIMD][10];
#pragma HLS ARRAY_PARTITION variable=acc_reordering complete dim=0
    //ap_int<BIT_TMP> output_temp[PE];
    //ap_uint<BIT_OUT> output_uint[PE];

    ap_int<(BIT_IN+BIT_W)*SIMD> acc_temp;
    //int ch;

	// Seed the accumulation chain once; column 0 is never overwritten later.
	for(int s = 0;s<SIMD;s++){
		acc_reordering[s][0] = 0;
	}
    aplp:for(int i=0 ;i<VECT_NUMS * reps;i++){
    	etfd:for(int j = 0 ; j < 9; j++){
#pragma HLS PIPELINE II=1
				ap_uint<BIT_W*SIMD> weight_temp = weights[0][j];
				input_temp = in.read();
				acc_temp = desperate_vector_pot_product<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp, weight_temp);
				// Only assigned (and only written out) on the last tap, j == 8.
				ap_uint<BIT_OUT*SIMD> out_buf;
				nmmsl:for(int s = 0;s<SIMD;s++){
#pragma HLS UNROLL
					// Extract lane s of the packed product word as a signed value.
					ap_int<BIT_IN + BIT_W> signed_temp = acc_temp((s+1)*(BIT_IN + BIT_W)-1 , s*(BIT_IN + BIT_W));

	    				acc_reordering[s][j+1] = signed_temp + acc_reordering[s][j];


					ap_int<BIT_TMP> output_temp;
					ap_uint<BIT_OUT> output_uint;
					if(j==8){
						output_temp = acc_reordering[s][9] * alpha[0][s] + bias[0][s];
						// ReLU: a set sign bit (negative value) is replaced by zero.
						output_temp = (output_temp(BIT_TMP-1, BIT_TMP-1) == (ap_uint<1>)0) ? output_temp : (ap_int<BIT_TMP>)0;
						output_uint = truncate_unsigned<(BIT_TMP-FL_IN-FL_ALPHA), (FL_IN+FL_ALPHA), (BIT_OUT-FL_OUT), FL_OUT>(output_temp);
						out_buf((s+1)*BIT_OUT-1, s*BIT_OUT) = output_uint;
					}
				}
				if(j==8)out.write(out_buf);

		}
#ifdef DEBUG
    // Self-assignment with no effect; presumably a breakpoint anchor.
    if(i==3*320){
    	i = 3*320;
    }
#endif
    }
}


/**
 * Depthwise 3x3 layer built from three streaming stages:
 * padding -> SWU_DW -> dw_conv3x3_relu_signed.
 *
 * The padded feature map handed to the sliding-window unit is
 * (ROW_IN + 2) x (COL_IN + 2); the convolution stage is instantiated for
 * ROW_IN * COL_IN output positions. CH_OUT is used as the channel count of
 * every stage; CH_IN is not referenced.
 *
 * @param in      input stream, SIMD lanes of BIT_IN bits per word
 * @param weights packed tap weights, forwarded to the convolution stage
 * @param alpha   per-lane multipliers, forwarded to the convolution stage
 * @param bias    per-lane offsets, forwarded to the convolution stage
 * @param out     output stream, SIMD lanes of BIT_OUT bits per word
 * @param reps    repetition count forwarded to every stage
 */
template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
          int CH_OUT, int BIT_OUT, int FL_OUT,
          int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
          int SIMD, int PE>
void dw_conv3x3_layer_signed(
        hls::stream<ap_uint<BIT_IN*SIMD> >& in,
        const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
        const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
        const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
        hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
        ap_uint<10> reps)
{
#pragma HLS DATAFLOW

    // Geometry of the intermediate (padded) map and of the output map.
    const unsigned kPaddedRows = ROW_IN + 2;
    const unsigned kPaddedCols = COL_IN + 2;
    const unsigned kOutPixels  = unsigned(ROW_IN) * unsigned(COL_IN);

    // Stage 1: border padding.
    hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
    padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);

#ifdef DEBUG
    printf("nnn\n\n");
#endif

    // Stage 2: sliding-window unit feeding the nine taps of each window.
    hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
#pragma HLS STREAM variable=swu_out depth=512 dim=1
    SWU_DW<3, 1, kPaddedRows, kPaddedCols, CH_OUT, SIMD, SIMD, BIT_IN>(
            padding_out, swu_out, reps);

#ifdef DEBUG
    printf("fff\n\n");
#endif

    // Stage 3: depthwise MAC, scale/bias and ReLU, written straight to `out`.
    dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT,
                           BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                           BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                           SIMD, PE, kOutPixels>(
            swu_out, weights, alpha, bias, out, reps);
}


/**
 * Depthwise 3x3 layer built from three streaming stages:
 * padding -> SWU_DW_3x3_oly -> dw_conv3x3_relu_oly.
 *
 * The padded feature map handed to the sliding-window unit is
 * (ROW_IN + 2) x (COL_IN + 2); the convolution stage is instantiated for
 * ROW_IN * COL_IN output positions. CH_OUT is used as the channel count of
 * every stage; CH_IN is not referenced.
 *
 * @param in      input stream, SIMD lanes of BIT_IN bits per word
 * @param weights packed tap weights, forwarded to the convolution stage
 * @param alpha   per-lane multipliers, forwarded to the convolution stage
 * @param bias    per-lane offsets, forwarded to the convolution stage
 * @param out     output stream, SIMD lanes of BIT_OUT bits per word
 * @param reps    repetition count forwarded to every stage
 */
template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
          int CH_OUT, int BIT_OUT, int FL_OUT,
          int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
          int SIMD, int PE>
void dw_conv3x3_laoyer(
        hls::stream<ap_uint<BIT_IN*SIMD> >& in,
        const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
        const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
        const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
        hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
        ap_uint<10> reps)
{
#pragma HLS DATAFLOW

    // Geometry of the intermediate (padded) map and of the output map.
    const unsigned kPaddedRows = ROW_IN + 2;
    const unsigned kPaddedCols = COL_IN + 2;
    const unsigned kOutPixels  = unsigned(ROW_IN) * unsigned(COL_IN);

    // Stage 1: border padding.
    hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
    padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);

#ifdef DEBUG
    printf("nnn\n\n");
#endif

    // Stage 2: sliding-window unit feeding the nine taps of each window.
    hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
#pragma HLS STREAM variable=swu_out depth=512 dim=1
    SWU_DW_3x3_oly<3, 1, kPaddedRows, kPaddedCols, CH_OUT, SIMD, SIMD, BIT_IN>(
            padding_out, swu_out, reps);

#ifdef DEBUG
    printf("fff\n\n");
#endif

    // Stage 3: depthwise MAC, scale/bias and ReLU, written straight to `out`.
    dw_conv3x3_relu_oly<CH_OUT*3*3, CH_OUT,
                        BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                        BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                        SIMD, PE, kOutPixels>(
            swu_out, weights, alpha, bias, out, reps);
}

/**
 * Depthwise 3x3 layer built from three streaming stages:
 * padding -> SWU_DW_3x3_new -> dw_conv3x3_relu_signed.
 *
 * The padded feature map handed to the sliding-window unit is
 * (ROW_IN + 2) x (COL_IN + 2); the convolution stage is instantiated for
 * ROW_IN * COL_IN output positions. CH_OUT is used as the channel count of
 * every stage; CH_IN is not referenced.
 *
 * @param in      input stream, SIMD lanes of BIT_IN bits per word
 * @param weights packed tap weights, forwarded to the convolution stage
 * @param alpha   per-lane multipliers, forwarded to the convolution stage
 * @param bias    per-lane offsets, forwarded to the convolution stage
 * @param out     output stream, SIMD lanes of BIT_OUT bits per word
 * @param reps    repetition count forwarded to every stage
 */
template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
          int CH_OUT, int BIT_OUT, int FL_OUT,
          int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
          int SIMD, int PE>
void dw_conv3x3_layer_lidea(
        hls::stream<ap_uint<BIT_IN*SIMD> >& in,
        const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
        const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
        const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
        hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
        ap_uint<10> reps)
{
#pragma HLS DATAFLOW

    // Geometry of the intermediate (padded) map and of the output map.
    const unsigned kPaddedRows = ROW_IN + 2;
    const unsigned kPaddedCols = COL_IN + 2;
    const unsigned kOutPixels  = unsigned(ROW_IN) * unsigned(COL_IN);

    // Stage 1: border padding.
    hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
    padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);

#ifdef DEBUG
    printf("nnn\n\n");
#endif

    // Stage 2: sliding-window unit feeding the nine taps of each window.
    hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
#pragma HLS STREAM variable=swu_out depth=512 dim=1
    SWU_DW_3x3_new<3, 1, kPaddedRows, kPaddedCols, CH_OUT, SIMD, SIMD, BIT_IN>(
            padding_out, swu_out, reps);

#ifdef DEBUG
    printf("fff\n\n");
#endif

    // Stage 3: depthwise MAC, scale/bias and ReLU, written straight to `out`.
    dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT,
                           BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                           BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                           SIMD, PE, kOutPixels>(
            swu_out, weights, alpha, bias, out, reps);
}


/**
 * Streaming arg-max over a ROW_IN x COL_IN grid of cells.
 *
 * Input layout per cell (14 words on `in`): ten score words followed by four
 * "anchor" words. Scores are compared as signed 32-bit values against a
 * running maximum that is reset to -268435455 at the start of every
 * repetition. A cell becomes the current winner when any of its ten scores is
 * strictly greater than the running maximum, so on equal scores the earliest
 * cell is kept.
 *
 * Output per repetition (6 words on `out`): row, col and the four anchor
 * words of the winning cell.
 *
 * The result registers are zero-initialised so that defined words are emitted
 * even if no score ever exceeds the floor value. They are not cleared between
 * repetitions, so in that case a later repetition repeats the previous
 * winner.
 *
 * @param in   input stream, 14 words per cell in row-major cell order
 * @param out  output stream, 6 words per repetition
 * @param reps number of grids to process
 */
template <  int ROW_IN,
            int COL_IN
>
void pooping( hls::stream<ap_uint<32> >& in,
		hls::stream<ap_uint<32> >& out,
		ap_uint<10> reps){

	// Scores at or below this value can never be selected.
	const int SCORE_FLOOR = -268435455;

	ap_int<32> in_temp;
	ap_int<32> max;
	ap_int<32> anchor_0;
	ap_int<32> anchor_1;
	ap_int<32> anchor_2;
	ap_int<32> anchor_3;

	// Current winner; initialised so the writes below never emit
	// indeterminate values.
	ap_int<32> row_need = 0;
	ap_int<32> col_need = 0;
	ap_int<32> anchor_0_need = 0;
	ap_int<32> anchor_1_need = 0;
	ap_int<32> anchor_2_need = 0;
	ap_int<32> anchor_3_need = 0;

	loopLv:for(unsigned i=0; i<reps; i++){
		max = SCORE_FLOOR;
		for(int row = 0;row < ROW_IN;row++){
			for(int col = 0;col < COL_IN;col++){
				// Set when this cell raised the running maximum.
				bool found = false;
				for(int ch = 0;ch < 10;ch++){
					in_temp = in.read();
					if(in_temp>max) {
						max = in_temp;
						found = true;
					}
				}
				// The anchor words are always consumed to keep the stream
				// aligned, whether or not this cell wins.
				anchor_0 = in.read();
				anchor_1 = in.read();
				anchor_2 = in.read();
				anchor_3 = in.read();
				if(found){
					row_need = row;
					col_need = col;
					anchor_0_need = anchor_0;
					anchor_1_need = anchor_1;
					anchor_2_need = anchor_2;
					anchor_3_need = anchor_3;
				}
			}
		}

		out.write(row_need);
		out.write(col_need);
		out.write(anchor_0_need);
		out.write(anchor_1_need);
		out.write(anchor_2_need);
		out.write(anchor_3_need);

	}
}

/**
 * 3x3 convolution multiply-accumulate stage that emits raw accumulators.
 *
 * For each of VECT_NUMS * reps input vectors and each output fold
 * (NUM_OUT/PE of them) the function accumulates over nine kernel taps and
 * NUM_IN/SIMD/9 input folds, then writes one word holding PE accumulators of
 * BIT_ACC bits each. Input words are read from `in` only while processing
 * output fold 0 and are cached in input_temp_arr for the remaining folds.
 *
 * PE slices of the weight word are processed three at a time through
 * tihoo_mutiplier, whose result is sliced as three BIT_ACC-bit fields.
 *
 * No scale, bias or activation is applied in this body; FL_IN, BIT_OUT,
 * FL_OUT, BIT_ALPHA, FL_ALPHA, BIT_BIAS and BIT_TMP are not referenced.
 *
 * NOTE(review): all fold/PE loop counters are ap_uint<7>, so this presumably
 * requires OUTPUT_FOLD, INPUT_FOLD and PE to stay below 128 - confirm.
 *
 * @param in      input stream, SIMD lanes of BIT_IN bits per word
 * @param weights packed weights, indexed [output fold][input fold][tap]
 * @param out     output stream, PE accumulators of BIT_ACC bits per word
 * @param reps    repetition count; the outer loop runs VECT_NUMS * reps times
 */
template <  int NUM_IN,
            int NUM_OUT,

            int BIT_IN,
            int FL_IN,
            int BIT_OUT,
            int FL_OUT,

            int BIT_W,
            int BIT_ALPHA,
            int FL_ALPHA,
            int BIT_BIAS,
            int BIT_TMP,

            int SIMD,
            int PE,
            int VECT_NUMS,
			int BIT_ACC>
void conv3x3_relu_3t_pf(   hls::stream<ap_uint<BIT_IN*SIMD> >& in,
        const ap_uint<BIT_W * SIMD * PE> weights[(NUM_OUT/PE)][(NUM_IN/9/SIMD)][9],
        hls::stream<ap_uint<BIT_ACC*PE> >& out,
		ap_uint<10> reps){

#pragma HLS DATAFLOW

    const unsigned INPUT_FOLD = NUM_IN/SIMD/9;    /* NUM_IN / SIMD / 9: input folds per kernel tap */
	const unsigned OUTPUT_FOLD = NUM_OUT/PE;     /* output_channel / pe */


    // Cache of the words read during output fold 0, indexed [input fold][tap].
    ap_uint<BIT_IN*SIMD> input_temp_arr[INPUT_FOLD][9];
#pragma HLS ARRAY_PARTITION variable=input_temp_arr complete dim=2
    // These two function-scope counters are shadowed by the loop counters of
    // the same name below and are not otherwise used.
    ap_uint<7> in_fold_cnt = 0;
    ap_uint<7> out_fold_cnt = 0;

    ap_uint<BIT_IN*SIMD> input_temp[9];
#pragma HLS ARRAY_PARTITION variable=input_temp complete dim=0
    // Two spare slots beyond PE absorb the acc[pe+1]/acc[pe+2] writes of the
    // last group when PE is not a multiple of three.
    ap_int<BIT_ACC> acc[PE+2];
#pragma HLS ARRAY_PARTITION variable=acc complete dim=0
    // ap_int<BIT_TMP> output_temp[PE];
    // ap_uint<BIT_OUT> output_uint[PE];

    loopLv:for(unsigned i=0; i<VECT_NUMS * reps; i++){
    	loopsd:for(ap_uint<7> out_fold_cnt=0;out_fold_cnt<OUTPUT_FOLD;out_fold_cnt++){
    		// Clear the PE accumulators that are written out.
    		// NOTE(review): acc[PE] and acc[PE+1] are not cleared here; they are
    		// accumulated into below but never packed into out_buf.
    		for(ap_uint<7> m=0; m<PE; m++){
#pragma HLS UNROLL
    			acc[m] = 0;
    		}
            loopasd:for(ap_uint<5> ay = 0;ay < 9;ay++){
				loopyy:for(ap_uint<7> in_fold_cnt=0;in_fold_cnt<INPUT_FOLD;in_fold_cnt++){
	#pragma HLS PIPELINE II=1
					if(out_fold_cnt == 0){
						// First output fold: consume a fresh word and cache it.
						input_temp[ay] = in.read();
						input_temp_arr[in_fold_cnt][ay] = input_temp[ay];

					}else{
						// Later output folds: reload all nine cached taps of this
						// input fold (this inner `ay` shadows the tap counter).
						looqep:for(ap_uint<7> ay = 0;ay<9;ay++){
							input_temp[ay] = input_temp_arr[in_fold_cnt][ay];
						}

					}

					// Empty branch with no effect.
					if((in_fold_cnt == 0)&&(ay == 0)){

					}
					ap_uint<BIT_W*SIMD*PE> weight_temp = weights[out_fold_cnt][in_fold_cnt][ay];
					// Only assigned (and only written out) on the final tap of
					// the final input fold.
					ap_uint<BIT_ACC*PE> out_buf;
					loopqwe:for(ap_uint<7> pe = 0 ; pe < PE; pe +=3){
	#pragma HLS UNROLL
						ap_uint <BIT_W*SIMD> weight_pe_loop_0 = weight_temp((pe+1)*BIT_W*SIMD-1,pe*BIT_W*SIMD);
						// NOTE(review): the next two stay unassigned when pe+1 or
						// pe+2 reaches PE, yet are still passed to the multiplier.
						ap_uint <BIT_W*SIMD> weight_pe_loop_1;
						ap_uint <BIT_W*SIMD> weight_pe_loop_2;
						if(pe+1<PE) weight_pe_loop_1 = weight_temp((pe+2)*BIT_W*SIMD-1,(pe+1)*BIT_W*SIMD);
						if(pe+2<PE) weight_pe_loop_2 = weight_temp((pe+3)*BIT_W*SIMD-1,(pe+2)*BIT_W*SIMD);
						ap_int<BIT_ACC*3> o;
						//ap_uint <BIT_IN*SIMD> in_pe_loop = input_temp((pe+1)*BIT_IN*SIMD-1,pe*BIT_IN*SIMD);

						o = tihoo_mutiplier<BIT_IN, BIT_W, BIT_ACC, SIMD>(input_temp[ay], weight_pe_loop_0,weight_pe_loop_1,weight_pe_loop_2);
						// Split the packed result into the three per-PE partial sums.
						acc[pe] += o(BIT_ACC - 1,0);
						acc[pe+1] += o(2*BIT_ACC - 1,BIT_ACC);
						acc[pe+2] += o(3*BIT_ACC - 1,2*BIT_ACC);

					}
					loopqec:for(ap_uint<7> pe = 0 ; pe < PE; pe ++){
						#pragma HLS UNROLL
						if((ay == 8) && (in_fold_cnt ==INPUT_FOLD - 1)){
							out_buf((pe+1)*BIT_ACC-1, pe*BIT_ACC) = acc[pe];
						}
					}
					if((ay == 8) && (in_fold_cnt ==INPUT_FOLD - 1)){
						out.write(out_buf);
					}
				}
    		}

    	}
    }
}


/**
 * Depthwise 3x3 layer built from three streaming stages:
 * padding -> SWU_DW -> dw_conv3x3_relu_signed.
 *
 * The padded feature map handed to the sliding-window unit is
 * (ROW_IN + 2) x (COL_IN + 2); the convolution stage is instantiated for
 * ROW_IN * COL_IN output positions. CH_OUT is used as the channel count of
 * every stage; CH_IN is not referenced.
 *
 * @param in      input stream, SIMD lanes of BIT_IN bits per word
 * @param weights packed tap weights, forwarded to the convolution stage
 * @param alpha   per-lane multipliers, forwarded to the convolution stage
 * @param bias    per-lane offsets, forwarded to the convolution stage
 * @param out     output stream, SIMD lanes of BIT_OUT bits per word
 * @param reps    repetition count forwarded to every stage
 */
template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
          int CH_OUT, int BIT_OUT, int FL_OUT,
          int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
          int SIMD, int PE>
void dw_conv3x3_layer(
        hls::stream<ap_uint<BIT_IN*SIMD> >& in,
        const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
        const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
        const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
        hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
        ap_uint<10> reps)
{
#pragma HLS DATAFLOW

    // Geometry of the intermediate (padded) map and of the output map.
    const unsigned kPaddedRows = ROW_IN + 2;
    const unsigned kPaddedCols = COL_IN + 2;
    const unsigned kOutPixels  = unsigned(ROW_IN) * unsigned(COL_IN);

    // Stage 1: border padding.
    hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
    padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);

    // Stage 2: sliding-window unit feeding the nine taps of each window.
    hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
#pragma HLS STREAM variable=swu_out depth=512 dim=1
    SWU_DW<3, 1, kPaddedRows, kPaddedCols, CH_OUT, SIMD, SIMD, BIT_IN>(
            padding_out, swu_out, reps);

    // Stage 3: depthwise MAC, scale/bias and ReLU, written straight to `out`.
    dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT,
                           BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                           BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                           SIMD, PE, kOutPixels>(
            swu_out, weights, alpha, bias, out, reps);
}

/**
 * Depthwise 3x3 layer built from three streaming stages:
 * padding -> SWU_DW_3x3_new_BRAM -> dw_conv3x3_relu_signed.
 *
 * The padded feature map handed to the sliding-window unit is
 * (ROW_IN + 2) x (COL_IN + 2); the convolution stage is instantiated for
 * ROW_IN * COL_IN output positions. CH_OUT is used as the channel count of
 * every stage; CH_IN is not referenced. When DEBUG_2 is defined the number of
 * words queued on `out` is printed after the last stage.
 *
 * @param in      input stream, SIMD lanes of BIT_IN bits per word
 * @param weights packed tap weights, forwarded to the convolution stage
 * @param alpha   per-lane multipliers, forwarded to the convolution stage
 * @param bias    per-lane offsets, forwarded to the convolution stage
 * @param out     output stream, SIMD lanes of BIT_OUT bits per word
 * @param reps    repetition count forwarded to every stage
 */
template <int ROW_IN, int COL_IN, int CH_IN, int BIT_IN, int FL_IN,
          int CH_OUT, int BIT_OUT, int FL_OUT,
          int BIT_W, int BIT_ALPHA, int FL_ALPHA, int BIT_BIAS, int BIT_TMP,
          int SIMD, int PE>
void dw_conv3x3_layer_BRAM(
        hls::stream<ap_uint<BIT_IN*SIMD> >& in,
        const ap_uint<BIT_W*SIMD> weights[CH_OUT/SIMD][9],
        const ap_uint<BIT_ALPHA> alpha[CH_OUT/SIMD][SIMD],
        const ap_int<BIT_BIAS> bias[CH_OUT/SIMD][SIMD],
        hls::stream<ap_uint<BIT_OUT*SIMD> >& out,
        ap_uint<10> reps)
{
#pragma HLS DATAFLOW

    // Geometry of the intermediate (padded) map and of the output map.
    const unsigned kPaddedRows = ROW_IN + 2;
    const unsigned kPaddedCols = COL_IN + 2;
    const unsigned kOutPixels  = unsigned(ROW_IN) * unsigned(COL_IN);

    // Stage 1: border padding.
    hls::stream<ap_uint<SIMD*BIT_IN> > padding_out("padding_out");
    padding<ROW_IN, COL_IN, CH_OUT, SIMD, BIT_IN, 1>(in, padding_out, reps);

    // Stage 2: sliding-window unit feeding the nine taps of each window.
    hls::stream<ap_uint<SIMD*BIT_IN> > swu_out("swu_out");
#pragma HLS STREAM variable=swu_out depth=512 dim=1
    SWU_DW_3x3_new_BRAM<3, 1, kPaddedRows, kPaddedCols, CH_OUT, SIMD, SIMD, BIT_IN>(
            padding_out, swu_out, reps);

    // Stage 3: depthwise MAC, scale/bias and ReLU, written straight to `out`.
    dw_conv3x3_relu_signed<CH_OUT*3*3, CH_OUT,
                           BIT_IN, FL_IN, BIT_OUT, FL_OUT,
                           BIT_W, BIT_ALPHA, FL_ALPHA, BIT_BIAS, BIT_TMP,
                           SIMD, PE, kOutPixels>(
            swu_out, weights, alpha, bias, out, reps);

#ifdef DEBUG_2
    cout<<"out size :"<<out.size()<<endl;
#endif
}

/**
 * Forwards a feature-map stream unchanged to `out` and, in parallel, emits
 * the same data on `out_bypass` regrouped into 2x2 pixel blocks.
 *
 * Rows are handled in pairs ((IN_ROW/2) * reps pairs in total). Both rows of
 * a pair are copied word by word from `in` to `out` while being stored in a
 * two-row cache. The cache is then replayed onto `out_bypass`: for every even
 * column c the words of (row 0, c), (row 0, c+1), (row 1, c), (row 1, c+1)
 * are written in that order, IN_CH/IN_tran words per pixel.
 *
 * NOTE(review): the replay indexes column c+1, so IN_COL is presumably
 * expected to be even - confirm with callers.
 *
 * @param in         input stream, IN_CH/IN_tran words per pixel
 * @param out        pass-through copy of `in`
 * @param out_bypass 2x2-block reordered copy of `in`
 * @param reps       number of feature maps to process
 */
template <unsigned IN_ROW,
          unsigned IN_COL,
          unsigned IN_CH,
          unsigned IN_tran,
          unsigned IN_BIT>
void stream_bypass(
	stream<ap_uint<IN_tran*IN_BIT> >& in,
	stream<ap_uint<IN_tran*IN_BIT> >& out,
	stream<ap_uint<IN_tran*IN_BIT> >& out_bypass,
	unsigned reps)
{
	ap_uint<IN_tran * IN_BIT> word;
	ap_uint<IN_tran * IN_BIT> replay_word;
	// Two-row buffer indexed [row within pair][column][word within pixel].
	ap_uint<IN_tran * IN_BIT> cache[2][IN_COL][IN_CH/IN_tran];

	for (int pair = 0; pair < (IN_ROW/2)*reps; pair++) {

		// Capture phase: pass both rows through and remember them.
		for (int r = 0; r < 2; r++) {
			for (int c = 0; c < IN_COL; c++) {
				for (int w = 0; w < IN_CH/IN_tran; w++) {
#pragma HLS pipeline II = 1
					word = in.read();
					out.write(word);
					cache[r][c][w] = word;
				}
			}
		}

		// Replay phase: one 2x2 block per even column, row-major inside it.
		for (int c = 0; c < IN_COL; c += 2) {
			for (int r = 0; r < 2; r++) {
				for (int dc = 0; dc < 2; dc++) {
					for (int w = 0; w < IN_CH/IN_tran; w++) {
#pragma HLS pipeline II = 1
						replay_word = cache[r][c + dc][w];
						out_bypass.write(replay_word);
					}
				}
			}
		}
	}

}


/**
 * Interleaves a bypass stream and a main stream into one output stream.
 *
 * For each of IN_ROW * reps rows and each of IN_COL pixels,
 * (IN_CH + BYPASS_CH) / OUT_tran words are written to `out`: the first
 * BYPASS_CH / OUT_tran come from `in_bypass`, the remainder from `in`.
 *
 * Each word is staged in a temporary of the output width. The previous
 * version staged both sources in a temporary of the `in` width, which dropped
 * the upper bits of bypass words whenever BYPASS_tran exceeded IN_tran.
 * Results are unchanged when IN_tran, BYPASS_tran and OUT_tran are equal.
 *
 * @param in        main input stream
 * @param in_bypass bypass input stream, consumed first for every pixel
 * @param out       merged output stream
 * @param reps      number of feature maps to process
 */
template <               // kernel
			// unsigned S,                 // stride
			unsigned IN_ROW,
			unsigned IN_COL,
			unsigned IN_CH,
			unsigned IN_tran,
			unsigned BYPASS_CH,
			unsigned BYPASS_tran,
			unsigned OUT_tran,
			unsigned IN_BIT>
void stream_reorganize(
		//differ with vgg version
	stream<ap_uint<IN_tran*IN_BIT> >& in,
	stream<ap_uint<BYPASS_tran*IN_BIT> >& in_bypass,
	stream<ap_uint<OUT_tran*IN_BIT> >& out,
	unsigned reps)
{
	// Words per pixel taken from the bypass stream, and in total.
	const unsigned BYPASS_WORDS = BYPASS_CH/OUT_tran;
	const unsigned PIXEL_WORDS = (IN_CH+BYPASS_CH)/OUT_tran;

	// Staging word sized for the destination stream.
	ap_uint<OUT_tran * IN_BIT> temp;

	for(unsigned i=0;i<IN_ROW * reps;i++){
		for(unsigned j=0;j<IN_COL;j++){
			for(unsigned k=0;k<PIXEL_WORDS;k++){
				if(k<BYPASS_WORDS) temp = in_bypass.read();
				else temp = in.read();
				out.write(temp);
			}
		}
	}

}

void atss_0506_lossv2(  hls::stream<my_ap_axis >& in,
                    hls::stream<my_ap_axis >& out,
                    const unsigned repst){

#pragma HLS DATAFLOW
#pragma HLS INTERFACE axis register both port=out
#pragma HLS INTERFACE axis register both port=in
#pragma HLS INTERFACE s_axilite port=reps bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control


#pragma HLS RESOURCE variable backbone_model_p1_0_0_weight_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_0_weight_q complete dim = 0
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_0_alpha_q complete dim = 0
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_0_bias_q complete dim = 0


#pragma HLS RESOURCE variable backbone_model_p1_0_3_weight_q core=RAM_2P_LUTRAM
#pragma HLS RESOURCE variable backbone_model_p1_0_3_alpha_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_3_alpha_q complete dim=2
#pragma HLS RESOURCE variable backbone_model_p1_0_3_bias_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_0_3_bias_q complete dim=2


#pragma HLS RESOURCE variable backbone_model_p1_2_0_weight_q core=RAM_2P_LUTRAM
#pragma HLS RESOURCE variable backbone_model_p1_2_0_alpha_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_2_0_alpha_q complete dim=2
#pragma HLS RESOURCE variable backbone_model_p1_2_0_bias_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_2_0_bias_q complete dim=2

#pragma HLS RESOURCE variable backbone_model_p1_2_3_alpha_q core=RAM_2P_LUTRAM
#pragma HLS RESOURCE variable backbone_model_p1_2_3_bias_q core=RAM_2P_LUTRAM

#pragma HLS RESOURCE variable backbone_model_p1_4_0_alpha_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_4_0_alpha_q complete dim=2
#pragma HLS RESOURCE variable backbone_model_p1_4_0_bias_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p1_4_0_bias_q complete dim=2

#pragma HLS RESOURCE variable backbone_model_p2_1_0_alpha_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p2_1_0_alpha_q complete dim=2
#pragma HLS RESOURCE variable backbone_model_p2_1_0_bias_q core=RAM_2P_LUTRAM
#pragma HLS ARRAY_PARTITION variable=backbone_model_p2_1_0_bias_q complete dim=2

#pragma HLS RESOURCE variable backbone_model_p3_0_3_conv_alpha_q core=RAM_2P_LUTRAM
#pragma HLS RESOURCE variable backbone_model_p3_0_3_conv_bias_q core=RAM_2P_LUTRAM

#pragma HLS RESOURCE variable backbone_model_p3_1_conv_alpha_q core=RAM_2P_LUTRAM
#pragma HLS RESOURCE variable backbone_model_p3_1_conv_bias_q core=RAM_2P_LUTRAM


    ap_uint<8> reps = 50;

#ifdef DEBUG
    reps = 2;
#endif
//    const unsigned ap_uint<10> reps = 1;
    const unsigned int num_per_rep = 640 * 360 * 3 * 8 / 64;
    const unsigned int BIT_ALL = 7;
    hls::stream<ap_uint<64> > in_stream_extract("in_stream_extract");
    #pragma HLS STREAM variable=in_stream_extract depth=512 dim=1
	ExtractPixels<64, num_per_rep> (in, in_stream_extract, reps);


    hls::stream<ap_uint<192> > in_stream0("in_stream0");
    #pragma HLS STREAM variable=in_stream0 depth=32 dim=1
    Widthmuler<64, 192, num_per_rep>(in_stream_extract, in_stream0, reps);


    hls::stream<ap_uint<8 * 3> > in_stream1("in_stream1");
    #pragma HLS STREAM variable=in_stream1 depth=512 dim=1
    Widthdiver<192, 8 * 3, num_per_rep / 3> (in_stream0, in_stream1, reps);

    hls::stream<ap_uint<8 * 3> > r_stream1("r_stream1");
        #pragma HLS STREAM variable=in_stream1 depth=512 dim=1
    avg2x2_new<2, 360, 640, 3, 3, 8>(in_stream1, r_stream1, reps);

#ifdef DEBUG
    cout << r_stream1.size()<<endl;
#endif

    hls::stream<ap_uint<7 * 3> > in_stream2("in_stream2");
    #pragma HLS STREAM variable=in_stream2 depth=512 dim=1
    img_norm_none<8, 3, 180, 320 , 7>(r_stream1, in_stream2, reps);


#ifdef DEBUG
    printf("start\n\n");
#endif

#ifdef DEBUG
    printf("mmm\n\n");
#endif

    hls::stream<ap_uint<backbone_model_p1_0_0_BIT_OUT * backbone_model_p1_0_0_SIMD> > p100_out("p100_out");
    #pragma HLS STREAM variable=p100_out depth=512 dim=1
    dw_conv3x3_laoyer
	<  	backbone_model_p1_0_0_ROW_IN,
		backbone_model_p1_0_0_COL_IN,

		backbone_model_p1_0_0_CH_IN,
		backbone_model_p1_0_0_BIT_IN,
		backbone_model_p1_0_0_FL_IN,
		backbone_model_p1_0_0_CH_OUT,
		backbone_model_p1_0_0_BIT_OUT,
		backbone_model_p1_0_0_FL_OUT,
		backbone_model_p1_0_0_BIT_W,
		backbone_model_p1_0_0_BIT_ALPHA,
		backbone_model_p1_0_0_FL_ALPHA,
		backbone_model_p1_0_0_BIT_BIAS,
		backbone_model_p1_0_0_BIT_TMP,
		backbone_model_p1_0_0_SIMD,
		backbone_model_p1_0_0_PE>
    (in_stream2,backbone_model_p1_0_0_weight_q,backbone_model_p1_0_0_alpha_q,backbone_model_p1_0_0_bias_q,p100_out,
    		reps);


    hls::stream<ap_uint<BIT_ALL> > p103_converting("p103_converting");
#pragma HLS STREAM variable=p103_converting depth=512 dim=1

    Widthdiver<BIT_ALL * 3 , BIT_ALL, backbone_model_p1_0_3_ROW_IN * backbone_model_p1_0_3_COL_IN>(
    		p100_out, p103_converting , reps);

#ifdef DEBUG
#endif


    hls::stream<ap_uint<backbone_model_p1_0_3_BIT_OUT * backbone_model_p1_0_3_PE> > p103_out("p103_out");

#pragma HLS STREAM variable=p103_out depth=512 dim=1
    conv1x1_layer_pf_t3_6<backbone_model_p1_0_3_ROW_IN,
	backbone_model_p1_0_3_COL_IN,
	backbone_model_p1_0_3_CH_IN,
	backbone_model_p1_0_3_BIT_IN,
	backbone_model_p1_0_3_FL_IN,

	backbone_model_p1_0_3_CH_OUT,
	backbone_model_p1_0_3_BIT_OUT,
	backbone_model_p1_0_3_FL_OUT,

	backbone_model_p1_0_3_BIT_W,
	backbone_model_p1_0_3_BIT_ALPHA,
	backbone_model_p1_0_3_FL_ALPHA,
	backbone_model_p1_0_3_BIT_BIAS,
	backbone_model_p1_0_3_BIT_TMP,

	backbone_model_p1_0_3_SIMD,
	backbone_model_p1_0_3_PE>
    (p103_converting,backbone_model_p1_0_3_weight_q,backbone_model_p1_0_3_alpha_q,backbone_model_p1_0_3_bias_q,p103_out,
    		reps);


    hls::stream<ap_uint<12 * BIT_ALL> > pool_p103_out("pool_p103_out");
	#pragma HLS STREAM variable=pool_p103_out depth=512 dim=1
    max_pool2x2_new<2, 184, 320, 48, 12, BIT_ALL>(p103_out, pool_p103_out, reps);


#ifdef DEBUG
    cout << pool_p103_out.size()<<endl;
#endif
    hls::stream<ap_uint<2 * BIT_ALL> > p120_converting("p120_converting");
#pragma HLS STREAM variable=p120_converting depth=512 dim=1
    Widthdiver<BIT_ALL * 12 , BIT_ALL * 2, 92 * 160 * 4>(
    		pool_p103_out, p120_converting, reps);

    #ifdef DEBUG
    cout << p120_converting.size()<<endl;

#endif
    hls::stream<ap_uint<12 * BIT_ALL> > p120_converted("p120_converted");
#pragma HLS STREAM variable=p120_converted depth=512 dim=1
    Widthmuler<BIT_ALL * 2 , BIT_ALL * 12, 92 * 160 * 24 >(
    		p120_converting, p120_converted, reps);

#ifdef DEBUG
    cout << p120_converted.size()<<endl;

#endif
    hls::stream<ap_uint<backbone_model_p1_2_0_SIMD * backbone_model_p1_2_0_BIT_OUT> > p120_out("p120_out");
   	#pragma HLS STREAM variable=p120_out depth=512 dim=1
    dw_conv3x3_layer_lidea
    	<  	backbone_model_p1_2_0_ROW_IN,
		backbone_model_p1_2_0_COL_IN,
		backbone_model_p1_2_0_CH_IN,
		backbone_model_p1_2_0_BIT_IN,
		backbone_model_p1_2_0_FL_IN,
		backbone_model_p1_2_0_CH_OUT,
		backbone_model_p1_2_0_BIT_OUT,
		backbone_model_p1_2_0_FL_OUT,
		backbone_model_p1_2_0_BIT_W,
		backbone_model_p1_2_0_BIT_ALPHA,
		backbone_model_p1_2_0_FL_ALPHA,
		backbone_model_p1_2_0_BIT_BIAS,
		backbone_model_p1_2_0_BIT_TMP,
		backbone_model_p1_2_0_SIMD,
		backbone_model_p1_2_0_PE>
        (p120_converted,backbone_model_p1_2_0_weight_q,backbone_model_p1_2_0_alpha_q,backbone_model_p1_2_0_bias_q,p120_out,reps);
#ifdef DEBUG
    cout << p120_out.size()<<endl;
#endif


    hls::stream<ap_uint<2 * BIT_ALL> > p123_converting("p123_converting");
#pragma HLS STREAM variable=p123_converting depth=512 dim=1
    Widthdiver<12 * BIT_ALL , BIT_ALL * 2, 92 * 160 * 4 >(
    		p120_out, p123_converting, reps);
#ifdef DEBUG
    cout << p123_converting.size()<<endl;
    cout<< "p123_converting"<<endl;
#endif
    hls::stream<ap_uint<8 * BIT_ALL> > p123_converted("p123_converted");
#pragma HLS STREAM variable=p123_converted depth=512 dim=1
    Widthmuler<2 * BIT_ALL , BIT_ALL * 8, 92 * 160 * 24 >(
    		p123_converting, p123_converted, reps);
#ifdef DEBUG
    cout << p123_converted.size()<<endl;
    cout<< "p123_converted"<<endl;
#endif
        hls::stream<ap_uint<backbone_model_p1_2_3_BIT_OUT * backbone_model_p1_2_3_PE> > p123_out("p123_out");
        #pragma HLS STREAM variable=p123_out depth=512
        conv1x1_layer_pf_t3_2<backbone_model_p1_2_3_ROW_IN,
		backbone_model_p1_2_3_COL_IN,
		backbone_model_p1_2_3_CH_IN,
		backbone_model_p1_2_3_BIT_IN,
		backbone_model_p1_2_3_FL_IN,
		backbone_model_p1_2_3_CH_OUT,
		backbone_model_p1_2_3_BIT_OUT,
		backbone_model_p1_2_3_FL_OUT,
		backbone_model_p1_2_3_BIT_W,
		backbone_model_p1_2_3_BIT_ALPHA,
		backbone_model_p1_2_3_FL_ALPHA,
		backbone_model_p1_2_3_BIT_BIAS,
		backbone_model_p1_2_3_BIT_TMP,
		backbone_model_p1_2_3_SIMD,
		backbone_model_p1_2_3_PE>
        (p123_converted,backbone_model_p1_2_3_weight_q,backbone_model_p1_2_3_alpha_q,backbone_model_p1_2_3_bias_q,p123_out,reps);


        hls::stream<ap_uint<12 * 7> > pool_p123_out("pool_p123_out");
#pragma HLS STREAM variable=pool_p123_out depth=128
        max_pool2x2_new<2, 92, 160, 96, 12, 7>(p123_out, pool_p123_out, reps);

#ifdef DEBUG
    cout << pool_p123_out.size()<<endl;
    cout<< "pool_p123_out"<<endl;
#endif

    hls::stream<ap_uint<6 * BIT_ALL> > p140_coneded("p140_coneded");
#pragma HLS STREAM variable=p140_coneded depth=512 dim=1
    Widthdiver<12 * BIT_ALL , BIT_ALL * 6, 46 * 80 * 8 >(
    		pool_p123_out, p140_coneded, reps);

        hls::stream<ap_uint<backbone_model_p1_4_0_SIMD * backbone_model_p1_4_0_BIT_OUT> > p140_out("p140_out");
           	#pragma HLS STREAM variable=p140_out depth=512
        dw_conv3x3_layer_lidea
            	<  	backbone_model_p1_4_0_ROW_IN,
        		backbone_model_p1_4_0_COL_IN,
        		backbone_model_p1_4_0_CH_IN,
        		backbone_model_p1_4_0_BIT_IN,
        		backbone_model_p1_4_0_FL_IN,
        		backbone_model_p1_4_0_CH_OUT,
        		backbone_model_p1_4_0_BIT_OUT,
        		backbone_model_p1_4_0_FL_OUT,
        		backbone_model_p1_4_0_BIT_W,
        		backbone_model_p1_4_0_BIT_ALPHA,
        		backbone_model_p1_4_0_FL_ALPHA,
        		backbone_model_p1_4_0_BIT_BIAS,
        		backbone_model_p1_4_0_BIT_TMP,
        		backbone_model_p1_4_0_SIMD,
        		backbone_model_p1_4_0_PE>
                (p140_coneded,backbone_model_p1_4_0_weight_q,backbone_model_p1_4_0_alpha_q,backbone_model_p1_4_0_bias_q,p140_out,reps);

#ifdef DEBUG
    cout << p140_out.size()<<endl;
    cout<< "p140_out"<<endl;
#endif


    hls::stream<ap_uint<2 * BIT_ALL> > p143_converting("p143_converting");
#pragma HLS STREAM variable=p143_converting depth=512
    Widthdiver<6 * BIT_ALL , BIT_ALL * 2, 46 * 80 * 16>(
    		p140_out, p143_converting, reps );


            hls::stream<ap_uint<8 * BIT_ALL> > p143_converted("p143_converted");
	#pragma HLS STREAM variable=p143_converted depth=512
            Widthmuler<2 * BIT_ALL , BIT_ALL * 8, 46 * 80 * 48>(
            		p143_converting, p143_converted, reps );

                hls::stream<ap_uint<backbone_model_p1_4_3_BIT_OUT * backbone_model_p1_4_3_PE> > p143_out("p143_out");
                #pragma HLS STREAM variable=p143_out depth=512
                conv1x1_layer_pf_t3_2<backbone_model_p1_4_3_ROW_IN,
        		backbone_model_p1_4_3_COL_IN,
        		backbone_model_p1_4_3_CH_IN,
        		backbone_model_p1_4_3_BIT_IN,
        		backbone_model_p1_4_3_FL_IN,
        		backbone_model_p1_4_3_CH_OUT,
        		backbone_model_p1_4_3_BIT_OUT,
        		backbone_model_p1_4_3_FL_OUT,
        		backbone_model_p1_4_3_BIT_W,
        		backbone_model_p1_4_3_BIT_ALPHA,
        		backbone_model_p1_4_3_FL_ALPHA,
        		backbone_model_p1_4_3_BIT_BIAS,
        		backbone_model_p1_4_3_BIT_TMP,
        		backbone_model_p1_4_3_SIMD,
        		backbone_model_p1_4_3_PE >
                (p143_converted,backbone_model_p1_4_3_weight_q,backbone_model_p1_4_3_alpha_q,backbone_model_p1_4_3_bias_q,p143_out, reps);


#ifdef DEBUG
    cout << p143_out.size()<<endl;
    cout<< "p143_out"<<endl;
#endif

    hls::stream<ap_uint<12 * 7> > pool_p143_out("pool_p143_out");
#pragma HLS STREAM variable=pool_p143_out depth=256
    max_pool2x2_new<2, 46, 80, 192, 12, 7>(p143_out, pool_p143_out, reps);

    hls::stream<ap_uint<backbone_model_p2_1_0_SIMD * 7> > p210_converting("p210_converting");
#pragma HLS STREAM variable=p210_converting depth=512
    Widthdiver<12 * 7, 3 * 7, 40*23*192/12 >(
    		pool_p143_out, p210_converting, reps);

#ifdef DEBUG
    cout << p210_converting.size()<<endl;
    cout<< "p210_converting"<<endl;
#endif

    hls::stream<ap_uint<backbone_model_p2_1_0_SIMD * backbone_model_p2_1_0_BIT_OUT> > p210_out("p210_out");
#pragma HLS STREAM variable=p210_out depth=512

    dw_conv3x3_layer_lidea
	<  	backbone_model_p2_1_0_ROW_IN,
	backbone_model_p2_1_0_COL_IN,
	backbone_model_p2_1_0_CH_IN,
	backbone_model_p2_1_0_BIT_IN,
	backbone_model_p2_1_0_FL_IN,
	backbone_model_p2_1_0_CH_OUT,
	backbone_model_p2_1_0_BIT_OUT,
	backbone_model_p2_1_0_FL_OUT,
	backbone_model_p2_1_0_BIT_W,
	backbone_model_p2_1_0_BIT_ALPHA,
	backbone_model_p2_1_0_FL_ALPHA,
	backbone_model_p2_1_0_BIT_BIAS,
	backbone_model_p2_1_0_BIT_TMP,
	backbone_model_p2_1_0_SIMD,
	backbone_model_p2_1_0_PE>
    (p210_converting,backbone_model_p2_1_0_weight_q,backbone_model_p2_1_0_alpha_q,backbone_model_p2_1_0_bias_q,p210_out,reps);
#ifdef DEBUG
    cout << p210_out.size()<<endl;
    cout<< "p210_out"<<endl;
#endif


    hls::stream<ap_uint<1 * 7> > p213_converting("p213_converting");
#pragma HLS STREAM variable=p213_converting depth=512
    Widthdiver<3 * 7, 1 * 7, 40*23*192/3 >(
    		p210_out, p213_converting, reps);


    hls::stream<ap_uint<backbone_model_p2_1_3_SIMD * 7> > p213_converted("p213_converted");
#pragma HLS STREAM variable=p213_converted depth=1024
    Widthmuler<1*7, backbone_model_p2_1_3_SIMD * 7, 40*23*192 >(
    		p213_converting, p213_converted, reps);


#ifdef DEBUG
    cout << p213_converting.size()<<endl;
#endif

    hls::stream<ap_uint<backbone_model_p2_1_3_BIT_OUT * backbone_model_p2_1_3_PE> > p213_out("p213_out");
#pragma HLS STREAM variable=p213_out depth=512
    conv1x1_layer_pf_t3<backbone_model_p2_1_3_ROW_IN,
	backbone_model_p2_1_3_COL_IN,
	backbone_model_p2_1_3_CH_IN,
	backbone_model_p2_1_3_BIT_IN,
	backbone_model_p2_1_3_FL_IN,
	backbone_model_p2_1_3_CH_OUT,
	backbone_model_p2_1_3_BIT_OUT,
	backbone_model_p2_1_3_FL_OUT,
	backbone_model_p2_1_3_BIT_W,
	backbone_model_p2_1_3_BIT_ALPHA,
	backbone_model_p2_1_3_FL_ALPHA,
	backbone_model_p2_1_3_BIT_BIAS,
	backbone_model_p2_1_3_BIT_TMP,
	backbone_model_p2_1_3_SIMD,
	backbone_model_p2_1_3_PE>
    (p213_converted,backbone_model_p2_1_3_weight_q,backbone_model_p2_1_3_alpha_q,backbone_model_p2_1_3_bias_q,p213_out,reps);


    hls::stream<ap_uint<6 * 7> > p220_converting("p220_converting");
#pragma HLS STREAM variable=p220_converting depth=512
    Widthdiver<12 * 7, 6 * 7, 40*23*384/12>(
    		p213_out, p220_converting , reps);

    hls::stream<ap_uint<backbone_model_p2_2_0_SIMD * backbone_model_p2_2_0_BIT_OUT> > p220_out("p220_out");
#pragma HLS STREAM variable=p220_out depth=512 dim=1
    	dw_conv3x3_layer_BRAM
		<	backbone_model_p2_2_0_ROW_IN,
		backbone_model_p2_2_0_COL_IN,
		backbone_model_p2_2_0_CH_IN,
		backbone_model_p2_2_0_BIT_IN,
		backbone_model_p2_2_0_FL_IN,
		backbone_model_p2_2_0_CH_OUT,
		backbone_model_p2_2_0_BIT_OUT,
		backbone_model_p2_2_0_FL_OUT,
		backbone_model_p2_2_0_BIT_W,
		backbone_model_p2_2_0_BIT_ALPHA,
		backbone_model_p2_2_0_FL_ALPHA,
		backbone_model_p2_2_0_BIT_BIAS,
		backbone_model_p2_2_0_BIT_TMP,
		backbone_model_p2_2_0_SIMD,
		backbone_model_p2_2_0_PE>
    	(p220_converting,backbone_model_p2_2_0_weight_q,backbone_model_p2_2_0_alpha_q,backbone_model_p2_2_0_bias_q,p220_out,reps);


#ifdef DEBUG
    cout << p220_out.size()<<endl;
    cout<< "p220_out"<<endl;
#endif

    hls::stream<ap_uint<2 * 7> > p223_converting("p223_converting");
#pragma HLS STREAM variable=p223_converting depth=512
    Widthdiver<6 * 7, 2 * 7, 40*23*384/6>(
    		p220_out, p223_converting , reps);

    hls::stream<ap_uint<16 * 7> > p223_converted("p223_converted");
#pragma HLS STREAM variable=p223_converted depth=512
    Widthmuler<2 * 7, 16 * 7, 40*23*384/2>(
    		p223_converting, p223_converted , reps);


    hls::stream<ap_uint<backbone_model_p2_2_3_BIT_OUT * backbone_model_p2_2_3_PE> > p223_out("p223_out");
#pragma HLS STREAM variable=p223_out depth=512
    conv1x1_layer_pf_t3<backbone_model_p2_2_3_ROW_IN,
	backbone_model_p2_2_3_COL_IN,
	backbone_model_p2_2_3_CH_IN,
	backbone_model_p2_2_3_BIT_IN,
	backbone_model_p2_2_3_FL_IN,
	backbone_model_p2_2_3_CH_OUT,
	backbone_model_p2_2_3_BIT_OUT,
	backbone_model_p2_2_3_FL_OUT,
	backbone_model_p2_2_3_BIT_W,
	backbone_model_p2_2_3_BIT_ALPHA,
	backbone_model_p2_2_3_FL_ALPHA,
	backbone_model_p2_2_3_BIT_BIAS,
	backbone_model_p2_2_3_BIT_TMP,
	backbone_model_p2_2_3_SIMD,
	backbone_model_p2_2_3_PE>
    (p223_converted,backbone_model_p2_2_3_weight_q,backbone_model_p2_2_3_alpha_q,backbone_model_p2_2_3_bias_q,p223_out,reps);

#ifdef DEBUG
    cout << p223_out.size()<<endl;
    cout<< "p223_out"<<endl;
#endif

    hls::stream<ap_uint<8 * 7> > p300_converting("p300_converting");
#pragma HLS STREAM variable=p300_converting depth=512
    Widthdiver<16 * 7, 8 * 7, 40*23*512/16>(
    		p223_out, p300_converting , reps);


    hls::stream<ap_uint<backbone_model_p3_0_0_SIMD * backbone_model_p3_0_0_BIT_OUT> > p300_out("p300_out");
#pragma HLS STREAM variable=p300_out depth=512 dim=1
    dw_conv3x3_layer_BRAM
	<	backbone_model_p3_0_0_ROW_IN,
	backbone_model_p3_0_0_COL_IN,
	backbone_model_p3_0_0_CH_IN,
	backbone_model_p3_0_0_BIT_IN,
	backbone_model_p3_0_0_FL_IN,
	backbone_model_p3_0_0_CH_OUT,
	backbone_model_p3_0_0_BIT_OUT,
	backbone_model_p3_0_0_FL_OUT,
	backbone_model_p3_0_0_BIT_W,
	backbone_model_p3_0_0_BIT_ALPHA,
	backbone_model_p3_0_0_FL_ALPHA,
	backbone_model_p3_0_0_BIT_BIAS,
	backbone_model_p3_0_0_BIT_TMP,
	backbone_model_p3_0_0_SIMD,
	backbone_model_p3_0_0_PE >
    (p300_converting,backbone_model_p3_0_0_weight_q,backbone_model_p3_0_0_alpha_q,backbone_model_p3_0_0_bias_q,p300_out,reps);

#ifdef DEBUG
#endif


#ifdef DEBUG
    cout<< p300_out.size() <<endl;
#endif

    hls::stream<ap_uint<backbone_model_p3_0_3_BIT_OUT * backbone_model_p3_0_3_PE> > p303_out("p303_out");
#pragma HLS STREAM variable=p303_out depth=512 dim=1
    conv1x1_layer_pf_t3<backbone_model_p3_0_3_ROW_IN,
	backbone_model_p3_0_3_COL_IN,
	backbone_model_p3_0_3_CH_IN,
	backbone_model_p3_0_3_BIT_IN,
	backbone_model_p3_0_3_FL_IN,
	backbone_model_p3_0_3_CH_OUT,
	backbone_model_p3_0_3_BIT_OUT,
	backbone_model_p3_0_3_FL_OUT,
	backbone_model_p3_0_3_BIT_W,
	backbone_model_p3_0_3_BIT_ALPHA,
	backbone_model_p3_0_3_FL_ALPHA,
	backbone_model_p3_0_3_BIT_BIAS,
	backbone_model_p3_0_3_BIT_TMP,
	backbone_model_p3_0_3_SIMD,
	backbone_model_p3_0_3_PE>
    (p300_out,backbone_model_p3_0_3_weight_q,backbone_model_p3_0_3_alpha_q,backbone_model_p3_0_3_bias_q,p303_out,reps);

#ifdef DEBUG
    cout<< p303_out.size() <<endl;
#endif

    hls::stream<ap_uint<7 * 2> > p31_converting("p31_converting");
#pragma HLS STREAM variable=p31_converting depth=512
    Widthdiver<8 *7 , 2 * 7, 40*23*96/8>(p303_out, p31_converting, reps );


    hls::stream<ap_uint<backbone_model_p3_1_conv_BIT_OUT * backbone_model_p3_1_conv_PE> > p31_out("p31_out");
#pragma HLS STREAM variable=p31_out depth=512 dim=1
    conv1x1_layer_pf_t3<backbone_model_p3_1_conv_ROW_IN,
	backbone_model_p3_1_conv_COL_IN,
	backbone_model_p3_1_conv_CH_IN,
	backbone_model_p3_1_conv_BIT_IN,
	backbone_model_p3_1_conv_FL_IN,
	backbone_model_p3_1_conv_CH_OUT,
	backbone_model_p3_1_conv_BIT_OUT,
	backbone_model_p3_1_conv_FL_OUT,
	backbone_model_p3_1_conv_BIT_W,
	backbone_model_p3_1_conv_BIT_ALPHA,
	backbone_model_p3_1_conv_FL_ALPHA,
	backbone_model_p3_1_conv_BIT_BIAS,
	backbone_model_p3_1_conv_BIT_TMP,
	backbone_model_p3_1_conv_SIMD,
	backbone_model_p3_1_conv_PE >
    (p31_converting,backbone_model_p3_1_conv_weight_q,backbone_model_p3_1_conv_alpha_q,backbone_model_p3_1_conv_bias_q,p31_out,reps);


#ifdef DEBUG
    cout<< p31_out.size() <<endl;
#endif

    hls::stream<ap_uint<4 * 7> > pf_converting("pf_converting");
#pragma HLS STREAM variable=pf_converting depth=512
    Widthmuler<2 * 7, 4 * 7, 40*23*32/2>(
    		p31_out, pf_converting , reps);


    hls::stream<ap_uint<bbox_head_atss_cls_reg_center_BIT_OUT * bbox_head_atss_cls_reg_center_PE> > final_conv("final_conv");
#pragma HLS STREAM variable=p31_out depth=512 dim=1
    conv3x3_layer_crc<  bbox_head_atss_cls_reg_center_ROW_IN,
	bbox_head_atss_cls_reg_center_COL_IN,
	bbox_head_atss_cls_reg_center_CH_IN,
	bbox_head_atss_cls_reg_center_BIT_IN,
	bbox_head_atss_cls_reg_center_FL_IN,
	bbox_head_atss_cls_reg_center_CH_OUT,
	bbox_head_atss_cls_reg_center_BIT_OUT,
	bbox_head_atss_cls_reg_center_FL_OUT,
	bbox_head_atss_cls_reg_center_BIT_W,
	bbox_head_atss_cls_reg_center_BIT_ALPHA,
	bbox_head_atss_cls_reg_center_FL_ALPHA,
	bbox_head_atss_cls_reg_center_BIT_BIAS,
	bbox_head_atss_cls_reg_center_BIT_TMP,
	bbox_head_atss_cls_reg_center_SIMD,
	bbox_head_atss_cls_reg_center_PE>(
			pf_converting, bbox_head_atss_cls_reg_center_weight_q, bbox_head_atss_cls_reg_center_alpha_q, bbox_head_atss_cls_reg_center_bias_q, final_conv ,reps);


//     hls::stream<ap_uint<64> > chang_1("chang_1");
//#pragma HLS STREAM variable=chang_1 depth=512
//     Widthmuler<32, 64, 40*23*14 >(final_conv, chang_1, reps);
    hls::stream<ap_uint<32> > pconverting("pconverting");
#pragma HLS STREAM variable=pconverting depth=512
    Widthdiver<64, 32, 40*23*14/2>(final_conv, pconverting , reps);

     hls::stream<ap_uint<32> > chang_2("chang_2");
     //StreamingDataWidthConverter_Batch<448, 64, 200>(chang_1, chang_2, reps );

     pooping<23,40>(pconverting,chang_2,reps);


     hls::stream<ap_uint<64> > ff("ff");
 #pragma HLS STREAM variable=ff depth=256
     Widthmuler<32, 64, 6>(chang_2, ff , reps);

     AddLast<6/2>(ff, out, reps);
     /*
#define PE 1
    for(int i=0;i<20;i++){
    	for(int j=0;j<40;j++){
    		int CH = 14;
    		ap_int<PE * 32> temp;
    		for(int k=0;k<CH/PE;k++){
        		temp= final_conv.read();
        		for(int bit = 0; bit < PE ;bit ++ ){
        			if(i==0){
        				ap_int<32> temp2 = temp(bit*32+31,bit*32);
        				cout<< temp2 << "  " ;
        			}
        		}
        		if(i==0) cout << endl;
    		}
    	}
    }/*

#define PE 2
    for(int i=0;i<46;i++){
    	for(int j=0;j<80;j++){
    		int CH = 96;
    		ap_int<PE * 7> temp;
    		for(int k=0;k<CH/PE;k++){
        		temp= pool_p123_out.read();
        		for(int bit = 0; bit < PE ;bit ++ ){
        			if(i==3){
        				ap_uint<7> temp2 = temp(bit*7+6,bit*7);
        				cout<< temp2 << "  " ;
        			}
        		}
        		if(i==3) cout << endl;
    		}
    	}
    }/*
*/
}
#ifdef DEBUG
#include <hls_stream.h>
#include <iostream>
#include <fstream>

/**
 * @brief Read the first `size` bytes of a binary file into a caller buffer.
 *
 * @param path  Path of the file to open (opened in binary mode).
 * @param ptr   Destination buffer; must have room for at least `size` bytes.
 * @param size  Number of bytes to read.
 *
 * Terminates the process with a non-zero exit status when the file cannot be
 * opened, so a missing input is reported as a failure to the caller of the
 * program. When the file holds fewer than `size` bytes a warning is printed;
 * the bytes that were read are kept and the rest of the buffer is untouched.
 */
void load_data(const char *path, char *ptr, unsigned int size)
{
    std::ifstream f(path, std::ios::in | std::ios::binary);
    if (!f)
    {
        std::cout << "no such file,please check the file name!\n";
        exit(1);
    }
    f.read(ptr, size);
    // gcount() is the number of bytes the read() above actually delivered.
    if (f.gcount() != static_cast<std::streamsize>(size))
    {
        std::cout << "warning: " << path << " is shorter than requested (read "
                  << f.gcount() << " of " << size << " bytes)\n";
    }
    f.close();
}

/**
 * @brief Write `size` bytes from a buffer to a binary file, replacing its contents.
 *
 * @param path  Path of the file to create or overwrite (opened in binary mode).
 * @param ptr   Source buffer holding at least `size` bytes.
 * @param size  Number of bytes to write.
 *
 * Terminates the process with a non-zero exit status when the file cannot be
 * opened or when the data could not be written out completely.
 */
void write_data(const char *path, char *ptr, unsigned int size)
{
    std::ofstream f(path, std::ios::out | std::ios::binary);
    if (!f)
    {
        std::cout << "write no such file,please check the file name!\n";
        exit(1);
    }
    f.write(ptr, size);
    // close() flushes the buffer; a failure in write() or in the flush leaves
    // the stream in a failed state, which is checked right after.
    f.close();
    if (!f)
    {
        std::cout << "failed to write " << size << " bytes to " << path << "\n";
        exit(1);
    }
}


/**
 * C-simulation testbench, compiled only when DEBUG is defined.
 *
 * Loads one raw image from disk, packs it into 64-bit stream words, runs
 * atss_0506_lossv2 on it once, and prints every word of the output stream as
 * two signed 32-bit values.
 */
int main(){
    // Compile-time image geometry. Keeping these const makes `img` an ordinary
    // fixed-size array instead of a compiler-extension variable-length array.
    const uint16_t img_h = 360;
    const uint16_t img_w = 640;
    const uint16_t img_ch = 3;

    printf("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n");

    // Static storage keeps this 360*640*3-byte buffer off the stack and
    // guarantees it starts out zero-initialized.
    static uint8_t img[img_h][img_w][img_ch];
    load_data("/home/lq/skynet_0506/1_full.bin", (char *) img, sizeof(img));

    // Each stream word carries 8 image bytes (64 bits).
    const int data_points_per_line = 8;
    const int nums_line_pre_img = img_h * img_w * img_ch * 8 / 64;
    uint8_t * data = (uint8_t *) img;

    // Sanity dump: a 3x3 pixel patch starting at (100, 100), one pixel per line.
    for(int j=100; j<103; j++){
        for(int k=100; k<103; k++){
            for(int i=0; i<img_ch; i++){
                printf(" %3x", img[j][k][i]);
            }
            cout << endl;
        }
    }

    ap_uint<10> reps = 1;
    hls::stream<my_ap_axis > input_stream("input stream");
    for(int mm=0; mm<reps; mm++){
        for (unsigned int i = 0; i < nums_line_pre_img; i++) {
            // NOTE(review): only `data` is assigned; any other fields of
            // my_ap_axis are left as default-constructed - confirm that is intended.
            my_ap_axis temp;
            for (unsigned int j = 0; j < data_points_per_line; j++) {
                // Byte j of the word occupies bits [8*j+7 : 8*j].
                temp.data( 8*(j+1)-1, 8*j ) = data[i * data_points_per_line + j];
            }
            input_stream.write(temp);
        }
    }

    cout << "input size :" << input_stream.size() << endl;
    cout << "start ..... " << endl;

    hls::stream<my_ap_axis > out_stream("out_stream");

    atss_0506_lossv2(input_stream, out_stream,reps);

    // Drain the result stream; a line break is emitted after every 14 words
    // (28 printed values).
    uint8_t flag = 0;
    while(!out_stream.empty()){
        my_ap_axis out_read = out_stream.read();
        ap_int<64> value_tmp = out_read.data;

        cout << (ap_int<32>)value_tmp(31,0) << " " ;
        cout << (ap_int<32>)value_tmp(63,32) << " " ;
        if(++flag == 14){
            flag = 0;
            cout << endl;
        }
    }

    return 0;
}
#endif