typedef struct
{
	float c1, c2, c3, c4;
} sse_float __attribute__ ((aligned (16)));

#define _prefetch_b(b)	\
	__asm__ __volatile__("prefetcht0 %0"	\
			     :	\
			     :	\
			     "m" (b))

#define _su2m(c, a, b, d, e, f) \
	__asm__ __volatile__ (			\
		"movaps %1, %%xmm0 \n\t"	\
		"movaps %2, %%xmm1 \n\t"	\
		"mulps %%xmm0, %%xmm1 \n\t"	\
		"shufps $0xb1, %%xmm0, %%xmm0 \n\t" \
                "movaps %3, %%xmm2 \n\t"        \
		"mulps %%xmm0, %%xmm2 \n\t"	\
                "shufps $0x4e, %%xmm0, %%xmm0 \n\t" \
                "movaps %4, %%xmm3 \n\t"        \
		"mulps %%xmm0, %%xmm3 \n\t"	\
		"shufps $0xb1, %%xmm0, %%xmm0 \n\t" \
                "movaps %5, %%xmm4 \n\t"	\
		"mulps %%xmm0, %%xmm4 \n\t"	\
		"addps %%xmm1, %%xmm2 \n\t"	\
		"addps %%xmm2, %%xmm3 \n\t"	\
		"addps %%xmm3, %%xmm4 \n\t"	\
		"movaps %%xmm4, %0 \n\t"	\
		:				\
		"=m" (c)			\
		:				\
		"m" (a), 			\
		"m" (b), 			\
		"m" (d),			\
		"m" (e),			\
		"m" (f))
		
void xmult(float UA[], float S[], float T[], float UN[])
{
	int k;
static	sse_float a __attribute__ ((aligned (16)));
static	sse_float b __attribute__ ((aligned (16)));
static	sse_float c __attribute__ ((aligned (16)));
static  sse_float d __attribute__ ((aligned (16)));
static  sse_float e __attribute__ ((aligned (16)));  
static  sse_float f __attribute__ ((aligned (16)));
static  sse_float g __attribute__ ((aligned (16)));
static  sse_float h __attribute__ ((aligned (16)));
static  sse_float i __attribute__ ((aligned (16)));
static  sse_float j __attribute__ ((aligned (16)));
static  sse_float nc __attribute__ ((aligned (16)));

        (a).c1 = UA[0];
        (a).c2 = UA[1];
        (a).c3 = UA[2];
        (a).c4 = UA[3];

        (b).c1=S[0];
        (b).c2=S[0];
        (b).c3=S[0];
        (b).c4=S[0];

        (d).c1=-S[1]; 
        (d).c2=S[1];
        (d).c3=-S[1];
        (d).c4=S[1];

        (e).c1=-S[3]; 
        (e).c2=-S[3];
        (e).c3=S[3];
        (e).c4=S[3];

        (f).c1=-S[2]; 
        (f).c2=S[2];
        (f).c3=S[2];
        (f).c4=-S[2];

        (g).c1=T[0];
        (g).c2=T[0];
        (g).c3=T[0];
        (g).c4=T[0];

        (h).c1=-T[1];
        (h).c2=T[1];
        (h).c3=-T[1];
        (h).c4=T[1];

        (i).c1=-T[3];
        (i).c2=-T[3];
        (i).c3=T[3];
        (i).c4=T[3];

        (j).c1=-T[2];  
        (j).c2=T[2];   
        (j).c3=T[2];    
        (j).c4=-T[2];

	_su2m(c, a, b, d, e, f);
	_prefetch_b(g);
        _prefetch_b(h);
	_su2m(nc, c, g, h, i, j);

        UN[0]=(nc).c1;
        UN[1]=(nc).c2;
        UN[2]=(nc).c3;
        UN[3]=(nc).c4;
}

void xmat_mult_(float UA[], float S[], float T[], float UN[])
{
  xmult(UA,S,T,UN);   
}
        
void xmat_mult__(float UA[], float S[], float T[], float UN[])
{
  xmult(UA,S,T,UN);
}


