/* IS_HLS is defined unconditionally here. It enables the "#pragma HLS"
 * directives and the DO_PRAGMA helper that are guarded by #ifdef IS_HLS
 * throughout this file. */
#define IS_HLS
#ifdef IS_HLS
/* Two-level expansion: DO_PRAGMA expands its argument first (so a macro such
 * as MAX_LAYERS is replaced by its value) and PRAGMA_SUB then stringifies the
 * result for _Pragma. */
#define PRAGMA_SUB(x) _Pragma (#x)
#define DO_PRAGMA(x) PRAGMA_SUB(x)
#endif

#define RX_ANT	      4  // First dimension of complexMatrix_t. If you change this value, the code must be changed as well.
#define MAX_LAYERS	  4  // Second dimension of complexMatrix_t.
#define SFIXED_MAX 		1  // Multiplier passed to cscale() when Regularize_Ri scales beta.
#define MAX_SC 			1200  // First dimension of the in/W_p arrays (presumably subcarriers - confirm).
#define TOOSMALL 10  // cholsolve_4xX_complex clears a row when its diagonal value is not above this.
#define SCALING_CENTER_POINT 9  // Offset subtracted from the bit count in matrix_a_a_hermite_plus_b_4xX_complex.
/* Function-like maximum. Each argument may be evaluated more than once, so
 * do not pass expressions with side effects. */
#define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
/* Fixed-point complex number with short int real and imaginary parts.
 * Note: the name `complex` is also a macro defined by <complex.h>, so this
 * typedef cannot coexist with that header. */
typedef struct {
  short int re;  /* real part */
  short int im;  /* imaginary part */
} complex;
#include <stdlib.h>
#include "math.h"

/* RX_ANT x MAX_LAYERS matrix of complex values; used for every matrix
 * argument in this file. */
typedef complex complexMatrix_t[RX_ANT][MAX_LAYERS];
/* One complex value per index 0..MAX_SC-1. */
typedef complex scData_t[MAX_SC];
/* Disabled alternative data layouts, kept for reference: */
//typedef scData_t rxData_t[RX_ANT];
//typedef rxData_t layerData_t[MAX_LAYERS];

/*
 * Integer square root: returns floor(sqrt(a)) for a > 0 and 0 for a <= 0.
 *
 * Despite its name this function does not square its argument. It walks the
 * perfect squares 1, 4, 9, ... by adding the successive odd numbers
 * 3, 5, 7, ... until the running square exceeds a; the root is then
 * recovered from the last odd step.
 *
 * The running square is held in an unsigned int. With a signed int the final
 * step overflows (undefined behavior) for a >= 46340*46340 on a 32-bit int,
 * because (floor(sqrt(a)) + 1)^2 no longer fits; it always fits in the
 * unsigned type of the same width.
 */
int intSquare(int a)
{
	unsigned int target;
	unsigned int square = 1;
	unsigned int delta = 3;

	/* No perfect square >= 1 is <= a, so the root is 0. */
	if (a <= 0)
		return 0;

	target = (unsigned int)a;
	while (square <= target)
		{
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=256
		#endif
		square += delta;	/* k^2 + (2k + 1) == (k + 1)^2 */
		delta += 2;
		}
	/* After r iterations delta == 2r + 3, hence r == delta/2 - 1. */
	return (int)(delta / 2 - 1);
}

/* Builds a complex value from the given real and imaginary parts. */
complex cmake(short int re, short int im) {
  complex value = { re, im };
  return value;
}
/*
 * Scales a complex value by an integer factor and a power of two.
 *
 * Each component of the result is (a * b.component) scaled by 2^(-exp) and
 * then converted to short int:
 *   - exp >= 0: the product is shifted right by exp bits (as before);
 *   - exp <  0: the product is multiplied by 2^(-exp). A shift by a negative
 *     count is undefined behavior in C, and callers in this file do pass
 *     negative exponents, so this case is handled explicitly. Multiplication
 *     is used instead of "<<" because left-shifting a negative value is
 *     undefined as well.
 *
 * The magnitude of exp must be smaller than the bit width of int. No
 * saturation is applied: results that do not fit in short int are truncated
 * by the conversion.
 */
complex cscale(int a, int exp, complex b)
{
	complex t;
	const int re = a * b.re;
	const int im = a * b.im;

	if (exp >= 0)
		{
		t.re = re >> exp;
		t.im = im >> exp;
		}
	else
		{
		const int gain = 1 << (-exp);
		t.re = re * gain;
		t.im = im * gain;
		}
	return t;
}
/* Component-wise sum a + b; each component is converted back to short int. */
complex cadd(complex a, complex b)
{
	complex sum = a;

	sum.re += b.re;
	sum.im += b.im;
	return sum;
}
/* Component-wise difference a - b; each component is converted back to short int. */
complex csub(complex a, complex b)
{
  complex diff = a;

  diff.re -= b.re;
  diff.im -= b.im;
  return diff;
}
/*
 * Complex product a * b.
 * The products are formed in int and the two results are converted to
 * short int on assignment; no rescaling or saturation is applied.
 */
complex cmul(complex a, complex b)
{
	const int real_part = a.re * b.re - a.im * b.im;
	const int imag_part = a.im * b.re + a.re * b.im;
	complex product;

	product.re = (short int)real_part;
	product.im = (short int)imag_part;
	return product;
}
/* Complex conjugate: keeps the real part and negates the imaginary part. */
complex cconj(complex a) {
	complex conjugate = a;

	conjugate.im = -a.im;
	return conjugate;
}
// Returns the squared magnitude re^2 + im^2, computed in int.
int cabs2(complex a) {
 return a.re * a.re + a.im * a.im;
}



	
	
	
/*
 * Fills the upper-triangular part of W, one row per layer, for rows 0..X-1.
 *
 * For each row `layer`:
 *   - a diagonal value is formed as |U[layer][layer]|^2 minus the squared
 *     magnitudes of U[r][layer] for r = 1 .. layer-1;
 *   - if that value is greater than TOOSMALL, intSquare() of it is used as a
 *     scale factor and W[layer][k] = factor * H[layer][k] is written for
 *     k = layer .. RX_ANT-1. After the diagonal element has been written the
 *     factor is multiplied by H[layer][layer].re (truncated to short int);
 *   - otherwise W[layer][k] is cleared for k = layer .. X-1.
 *
 * X  number of rows processed (loop bound)
 * W  output matrix
 * U  matrix the diagonal value is derived from
 * H  matrix whose entries are scaled into W
 *
 * NOTE(review): the subtraction loop starts at index 1, not 0, so row 0 of U
 * never contributes - this looks like a leftover from 1-based code; confirm.
 * NOTE(review): the cleared range ends at X while the scaled range ends at
 * RX_ANT; confirm this asymmetry is intended.
 */
void cholsolve_4xX_complex(int X, complexMatrix_t W, complexMatrix_t U, complexMatrix_t H) {
	int layer;

	for (layer = 0; layer < X; ++layer) {
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		int row, col;
		int diag = cabs2(U[layer][layer]);

		for (row = 1; row < layer; ++row) {
			#ifdef IS_HLS
			#pragma HLS PIPELINE
			#pragma HLS LOOP_TRIPCOUNT max=4
			#endif
			diag -= cabs2(U[row][layer]);
		}

		if (diag <= TOOSMALL) {
			/* Diagonal value too small: clear the row from the diagonal on. */
			for (col = layer; col < X; ++col) {
				#ifdef IS_HLS
				#pragma HLS PIPELINE
				#pragma HLS LOOP_TRIPCOUNT max=4
				#endif
				W[layer][col].re = 0;
				W[layer][col].im = 0;
			}
		} else {
			short int factor = (short int)intSquare(diag);

			W[layer][layer].re = factor;
			W[layer][layer].im = 0;

			for (col = layer; col < RX_ANT; ++col) {
				#ifdef IS_HLS
				#pragma HLS PIPELINE
				#pragma HLS LOOP_TRIPCOUNT max=4
				#endif
				W[layer][col] = cscale(factor, 0, H[layer][col]);
				/* Off-diagonal entries use the factor rescaled by the
				 * real part of the diagonal entry of H. */
				if (col == layer) {
					factor = (short int)(factor * H[layer][col].re);
				}
			}
		}
	}
}


/*
 * Produces two outputs from Ri and A, for rows 0..X-1:
 *
 * 1. Ri_regularized[i][j] = ((16000 + alpha) * v) >> 1 for j = 0..RX_ANT-1,
 *    where v is Ri[i][j] off the diagonal and Ri[i][j] + bias on it. The
 *    bias is the real value beta passed through
 *    cscale(SFIXED_MAX, -scaling_Ri, ...).
 *
 * 2. Sum[layer][k] = A[layer][k] - sum over j = 1..layer-1 of
 *    A[j][layer] * A[j][k], for k = layer..RX_ANT-1.
 *
 * NOTE(review): the exponent handed to cscale for the bias is -scaling_Ri,
 * which is negative whenever scaling_Ri is positive; confirm cscale is meant
 * to receive negative exponents.
 * NOTE(review): the inner accumulation starts at j = 1, not 0; confirm.
 */
void Regularize_Ri(
		int scaling_Ri,
		int alpha,
		int beta,
		int X,
		complexMatrix_t Ri_regularized,
        complexMatrix_t Ri,
		complexMatrix_t A,
		complexMatrix_t Sum)
{
	int row, col, inner;
	complex diag_bias = cscale(SFIXED_MAX, -scaling_Ri, cmake(beta, 0));
	int gain = 16000 + alpha; /* offset keeps the gain large enough to scale up at all */

	for (row = 0; row < X; ++row) {
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		for (col = 0; col < RX_ANT; ++col) {
			#ifdef IS_HLS
			#pragma HLS PIPELINE
			#pragma HLS LOOP_TRIPCOUNT max=4
			#endif
			complex entry = Ri[row][col];

			if (col == row) {
				entry = cadd(entry, diag_bias);
			}
			Ri_regularized[row][col] = cscale(gain, 1, entry);
		}
	}

	for (row = 0; row < X; ++row) {
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		for (col = row; col < RX_ANT; ++col) {
			#ifdef IS_HLS
			#pragma HLS PIPELINE
			#pragma HLS LOOP_TRIPCOUNT max=4
			#endif
			complex acc = A[row][col];

			for (inner = 1; inner < row; ++inner) {
				#ifdef IS_HLS
				#pragma HLS PIPELINE
				#pragma HLS LOOP_TRIPCOUNT max=4
				#endif
				acc = csub(acc, cmul(A[inner][row], A[inner][col]));
			}
			Sum[row][col] = acc;
		}
	}
}

/*
 * Forms C_Tmp = A * A^H (plus the accumulator start value), derives a
 * power-of-two scaling from its largest component, and writes the scaled
 * result to C. A is also copied unchanged to A2.
 *
 * X                   number of columns processed (j = 0..X-1); all RX_ANT
 *                     rows are processed
 * A                   input matrix
 * A2                  receives a copy of A[i][j] for the processed entries
 * C                   receives cscale(16000, shift + 1, C_Tmp[i][j]) where
 *                     shift = SCALING_CENTER_POINT - n (n defined below)
 * scaling_no_of_bits  output: n - SCALING_CENTER_POINT
 *
 * n is the smallest value in 0..14 such that max_element <= 2^(n+1), or 15
 * if there is none; max_element is the largest |re| or |im| in C_Tmp.
 *
 * NOTE(review): the accumulator starts at 1 + 0i although the comment says
 * "reset"; confirm that the bias of 1 is intended.
 * NOTE(review): the exponent passed to cscale is negative when
 * max_element > 0x800; confirm cscale is meant to receive negative exponents.
 */
void matrix_a_a_hermite_plus_b_4xX_complex(
		int X,
		complexMatrix_t A,
		complexMatrix_t A2,
		complexMatrix_t C,
		int *scaling_no_of_bits)
{
int i, j, k, n;
int max_element = 0;
int size_no_of_bits = 0;
int neg_scaling_no_of_bits;
complexMatrix_t C_Tmp;
complex c1;
c1.re = 0;
c1.im = 0;

for (i=0; i<RX_ANT; i++)
	{
	#ifdef IS_HLS
	#pragma HLS LOOP_TRIPCOUNT max=4
	#endif
	for (j=0; j<X; j++)
		{
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		complex temp_complex;
		int abs_re, abs_im;

		// reset accumulator
		temp_complex.re = 1;
		temp_complex.im = 0;
		for(k=0; k<RX_ANT; k++)
			{
			#ifdef IS_HLS
			#pragma HLS PIPELINE
			#pragma HLS LOOP_TRIPCOUNT max=4
			#endif
			// row i of A times the conjugate of row j of A
			temp_complex = cadd(temp_complex, cmul(A[i][k], cconj(A[j][k])));
			}
		C_Tmp[i][j] = cadd(c1, temp_complex);

		// Track the largest real or imaginary magnitude. abs() is evaluated
		// once per component here because max() expands its arguments
		// more than once.
		abs_re = abs(C_Tmp[i][j].re);
		abs_im = abs(C_Tmp[i][j].im);
		max_element = max(max_element, max(abs_re, abs_im));
		}
	}

// Calculate the scaling factor. The thresholds 2, 4, ..., 0x8000 are
// increasing, so counting how many of them max_element exceeds yields the
// smallest n with max_element <= 2^(n+1), or 15 if it exceeds all of them.
for (n=0; n<15; n++)
	{
	#ifdef IS_HLS
	#pragma HLS UNROLL
	#endif
	if (max_element > (2 << n))
		size_no_of_bits = size_no_of_bits + 1;
	}
*scaling_no_of_bits = size_no_of_bits - SCALING_CENTER_POINT;
neg_scaling_no_of_bits = SCALING_CENTER_POINT - size_no_of_bits;

/* perform the very scaling of C */
for (i=0; i<RX_ANT; i++)
	{
	#ifdef IS_HLS
	#pragma HLS LOOP_TRIPCOUNT max=4
	#endif
	for (j=0; j<X; j++)
		{
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		C[i][j] = cscale(16000, neg_scaling_no_of_bits + 1, C_Tmp[i][j]);
		A2[i][j] = A[i][j];
		}
	}
}


// C = B * A for the first X rows: every entry A[i][j] (i < X, j < RX_ANT)
// is multiplied by the integer B via cscale with a shift of 0.
void matrix_scale_4xX_complex_fixed(int X, complexMatrix_t A, int B,  complexMatrix_t C)
{
	int row, col;

	for (row = 0; row < X; ++row) {
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		for (col = 0; col < RX_ANT; ++col) {
			#ifdef IS_HLS
			#pragma HLS PIPELINE
			#pragma HLS LOOP_TRIPCOUNT max=4
			#endif
			complex scaled = cscale(B, 0, A[row][col]);

			C[row][col] = scaled;
		}
	}
}

/*
 * For every row i, sums the first X entries of row i of A, each multiplied
 * by the constant 1 + 1i, and stores that sum in every column of row i of C:
 *
 *   C[i][j] = sum over k = 0..X-1 of A[i][k] * (1 + 1i)   for all j < RX_ANT
 *
 * There is no second matrix operand; the multiplier is the constant c0.
 * X values 1..4 are handled. For any other X the result is zero (the
 * accumulator used to be left uninitialized in that case and was then
 * copied into C).
 */
void matrix_mult_4xX_complex(int X, complexMatrix_t A,  complexMatrix_t C)
{
int i,j;
complex c0;
c0.re= 1;
c0.im = 1;
complex temp, temp1, temp2, temp3, temp4, temp5, temp6;
for (i=0; i<RX_ANT; i++)
	{
	#ifdef IS_HLS
	#pragma HLS PIPELINE
	#pragma HLS LOOP_TRIPCOUNT max=4
	#endif
	for (j=0; j<RX_ANT; j++)
		{
		#ifdef IS_HLS
		#pragma HLS PIPELINE
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif

		/* Defined result when X is not one of the handled values. */
		temp.re = 0;
		temp.im = 0;

		if(X == 1)
			temp=cmul(A[i][0],c0);

		else if(X == 2)
			{
			temp1=cmul(A[i][0],c0);
			temp2=cmul(A[i][1],c0);
			temp=cadd(temp1,temp2);
			}
		else if(X == 3)
			{
			temp1=cmul(A[i][0],c0);
			temp2=cmul(A[i][1],c0);
			temp3=cmul(A[i][2],c0);
			temp4=cadd(temp1,temp2);
			temp=cadd(temp3,temp4);
			}
		else if(X == 4)
			{
			/* Pairwise (tree) addition of the four products. */
			temp1= cmul(A[i][0],c0);
			temp2= cmul(A[i][1],c0);
			temp3= cmul(A[i][2],c0);
			temp4= cmul(A[i][3],c0);
			temp5=cadd(temp1,temp2);
			temp6=cadd(temp3,temp4);
			temp=cadd(temp5,temp6);
			}
		C[i][j]=temp;
		}
	}
}


/*
 * Copies the matrix stored at in[current_sc] into H, transposed:
 * H[r][c] = in[current_sc][c][r] for r < RX_ANT and c < MAX_LAYERS.
 *
 * The column index c (bounded by MAX_LAYERS) is used for the RX_ANT-sized
 * dimension of `in`, which is within bounds only while RX_ANT and
 * MAX_LAYERS are equal. current_sc is not range-checked.
 */
void Compact_Input_Data(
        int current_sc,
        complex in[MAX_SC][RX_ANT][MAX_LAYERS],
        complexMatrix_t H)
{
    int r, c;

    for (r = 0; r < RX_ANT; ++r) {
        for (c = 0; c < MAX_LAYERS; ++c) {
            #ifdef IS_HLS
            DO_PRAGMA(HLS LOOP_TRIPCOUNT max=MAX_LAYERS)
            #pragma HLS PIPELINE
            #endif
            H[r][c] = in[current_sc][c][r];
        }
    }
}

/*
 * Writes the transpose of Hprim_transposed into Hprim:
 * Hprim[i][j] = Hprim_transposed[j][i] for i < RX_ANT and j < X.
 *
 * X is the number of columns of Hprim that are written; columns X and above
 * are left untouched. The HLS pragmas are wrapped in #ifdef IS_HLS like
 * everywhere else in this file, so non-HLS builds do not see them.
 */
void make_Hprim(
        int X,
		complexMatrix_t Hprim_transposed,
        complexMatrix_t Hprim)
{
	int i,j;
for (i=0; i<RX_ANT; i++) // RX_ANT should be equal to MAX_LAYERS!
	{
	#ifdef IS_HLS
	#pragma HLS LOOP_TRIPCOUNT max=4
	#endif
	for (j=0; j<X; j++)
		{
		#ifdef IS_HLS
		#pragma HLS PIPELINE enable_flush
		#pragma HLS LOOP_TRIPCOUNT max=4
		#endif
		Hprim[i][j]=Hprim_transposed[j][i];
		}
	}
}



/*
 * Stores the matrix H into slot current_sc of W_p without transposition:
 * W_p[current_sc][r][c] = H[r][c] for r < RX_ANT and c < MAX_LAYERS.
 * current_sc is not range-checked.
 */
void Compact_Output_Data(
        int current_sc,
        complex W_p[MAX_SC][RX_ANT][MAX_LAYERS],
        complexMatrix_t H)
{
    int r, c;

    for (r = 0; r < RX_ANT; ++r) {
        for (c = 0; c < MAX_LAYERS; ++c) {
            #ifdef IS_HLS
            DO_PRAGMA(HLS LOOP_TRIPCOUNT max=MAX_LAYERS)
            #pragma HLS PIPELINE
            #endif
            W_p[current_sc][r][c] = H[r][c];
        }
    }
}


/*
 * Top-level kernel: for each index current_sc in 0..no_of_sc-1, runs the
 * following chain on in[current_sc] and stores the result in W_p[current_sc]:
 *
 *   Compact_Input_Data                     -> H
 *   matrix_scale_4xX_complex_fixed(rho)    -> Hrho
 *   matrix_mult_4xX_complex                -> Hprim_transposed
 *   make_Hprim                             -> Hprim
 *   matrix_a_a_hermite_plus_b_4xX_complex  -> Hprim2, Ri, scaling_Ri
 *   Regularize_Ri(alpha, beta)             -> Ri_regularized, Sum
 *   cholsolve_4xX_complex                  -> EM
 *   Compact_Output_Data                    -> W_p[current_sc]
 *
 * X         loop bound forwarded to the helpers; they index matrix
 *           dimensions of size 4 with it, so it must not exceed 4
 * W_p       output array
 * in        input array
 * no_of_sc  number of slots to process; values above MAX_SC are clamped to
 *           MAX_SC because both arrays have only MAX_SC slots
 * rho       integer scale applied to the input matrix
 * beta      forwarded to Regularize_Ri
 * alpha     forwarded to Regularize_Ri
 */
void comb_w_calc(
		int X,
		complex W_p [MAX_SC][RX_ANT][MAX_LAYERS],
		complex in[MAX_SC][RX_ANT][MAX_LAYERS],
		int no_of_sc,
		int rho,
		int beta,
		int alpha)
{
#ifdef IS_HLS
#pragma HLS DATA_PACK variable=in
#pragma HLS DATA_PACK variable=W_p
#pragma HLS RESOURCE variable=in core=RAM_1P_BRAM
#pragma HLS RESOURCE variable=W_p core=RAM_1P_BRAM
#endif

// Never index past the MAX_SC slots of in / W_p.
int sc_count = (no_of_sc > MAX_SC) ? MAX_SC : no_of_sc;

for (int current_sc = 0; current_sc < sc_count; current_sc++)
	{
	#ifdef IS_HLS
	#pragma HLS DATAFLOW
	#pragma HLS LOOP_TRIPCOUNT max=1200
	#endif
	// local matrices, one set per iteration
	complexMatrix_t H, Ri, Hrho, Hprim, Hprim2, Sum, Ri_regularized, Hprim_transposed, EM;
	int scaling_Ri;
	Compact_Input_Data(current_sc, in, H);
	matrix_scale_4xX_complex_fixed(X, H, rho, Hrho);
	matrix_mult_4xX_complex(X, Hrho, Hprim_transposed);
	make_Hprim(X, Hprim_transposed, Hprim);
	matrix_a_a_hermite_plus_b_4xX_complex(X, Hprim, Hprim2, Ri, &scaling_Ri);
	Regularize_Ri(scaling_Ri, alpha, beta, X, Ri_regularized, Ri, Hprim2, Sum);
	cholsolve_4xX_complex(X, EM, Ri_regularized, Sum);
	Compact_Output_Data(current_sc, W_p, EM);
	}
}
/*
complex W_p [MAX_SC][RX_ANT][MAX_LAYERS];
complex in[MAX_SC][RX_ANT][MAX_LAYERS];
int main()
{
	int X=4;

	int no_of_sc=1200;
	int rho=3;
	int beta=3;
	int alpha=3;

	int i, j, k,c=0;
	for (i = 0; i < no_of_sc; i++)
		for (j=0; j<4; j++)
			for (k=0; k<4; k++){
				in[i][j][k].re=c;
				in[i][j][k].im=c;
				c++;
			}

	comb_w_calc(X,W_p,in,no_of_sc,rho,beta,alpha);

	for (i = 0; i < no_of_sc; i++)
		for (j=0; j<4; j++)
			for (k=0; k<4; k++)
				if((in[i][j][k].re!=W_p[i][j][k].re) || (in[i][j][k].im!=W_p[i][j][k].im))
					return 1;

	return 0;

}
*/
