Readme:

This program reads two su(2) vectors "a" and "b" from a
data file "dat" and multiplies them together according to
the su(2) multiplication rules (quaternionic algebra)
and stores the output in the vector "c".
---------------------------------------------------------- 
#include <stdio.h>
#include <stdlib.h>

/*
 * Four single-precision floats laid out contiguously (c1 first, c4 last).
 * The typedef carries a 16-byte alignment request so that objects of this
 * type can be used as operands of aligned SSE loads/stores (movaps).
 */
typedef struct
{
	float c1;
	float c2;
	float c3;
	float c4;
} sse_float __attribute__ ((aligned (16)));

/*
 * _prefetch_b(b) -- emit an x86 "prefetcht0" instruction for the memory
 * operand b.  The statement has no outputs and declares no clobbers; b
 * must be an lvalue, since it is passed with the "m" constraint.
 */
#define _prefetch_b(b) \
	__asm__ __volatile__("prefetcht0 %0" : : "m" (b))

/*
 * _su2m(c, a, b, s1, s2) -- SSE kernel combining two 4-float vectors.
 *
 * All five arguments must be lvalues holding four packed floats; they are
 * accessed with movaps, which requires 16-byte alignment.  c is the output,
 * a, b, s1 and s2 are inputs.  c is only stored by the final instruction,
 * after every input has been loaded.
 *
 * With lanes numbered 0..3 in memory order, the result is
 *
 *   c[0] = s1[0]*a[0]*b[0] + s2[0]*(a[2]*b[1] + a[1]*b[3] + a[3]*b[2])
 *   c[1] = s1[1]*a[1]*b[1] + s2[1]*(a[3]*b[0] + a[0]*b[2] + a[2]*b[3])
 *   c[2] = s1[2]*a[2]*b[2] + s2[2]*(a[0]*b[3] + a[3]*b[1] + a[1]*b[0])
 *   c[3] = s1[3]*a[3]*b[3] + s2[3]*(a[1]*b[2] + a[2]*b[0] + a[0]*b[1])
 *
 * i.e. s1 scales the "diagonal" products and s2 scales the three shuffled
 * cross products.  The shufps immediates permute a register in place:
 * 0x4e swaps the two halves, 0xb1 swaps neighbours within each half and
 * 0x1b reverses all four lanes.
 *
 * xmm0-xmm5 are used as scratch registers and are declared as clobbers so
 * the compiler does not keep live values in them across this statement.
 */
#define _su2m(c, a, b, s1, s2) 			\
	__asm__ __volatile__ (			\
		"movaps %2, %%xmm1 \n\t"	\
		"movaps %3, %%xmm2 \n\t"	\
		"mulps %%xmm1, %%xmm2 \n\t"	\
		"movaps %1, %%xmm0 \n\t"	\
		"mulps %%xmm0, %%xmm2 \n\t"	\
		"movaps %4, %%xmm3 \n\t"	\
		"shufps $0x4e, %%xmm0, %%xmm0 \n\t" \
		"shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
		"mulps %%xmm1, %%xmm3 \n\t"	\
		"mulps %%xmm0, %%xmm3 \n\t"	\
		"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
		"movaps %4, %%xmm4 \n\t"	\
		"shufps $0x4e, %%xmm1, %%xmm1 \n\t" \
		"mulps %%xmm1, %%xmm4 \n\t"	\
		"mulps %%xmm0, %%xmm4 \n\t"	\
		"shufps $0x4e, %%xmm0, %%xmm0 \n\t" \
		"shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
		"movaps %4, %%xmm5 \n\t"	\
		"mulps %%xmm1, %%xmm5 \n\t"	\
		"mulps %%xmm0, %%xmm5 \n\t"	\
		"addps %%xmm2, %%xmm3 \n\t"	\
		"addps %%xmm3, %%xmm4 \n\t"	\
		"addps %%xmm4, %%xmm5 \n\t"	\
		"movaps %%xmm5, %0 \n\t"	\
		:				\
		"=m" (c)			\
		:				\
		"m" (a), 			\
		"m" (b), 			\
		"m" (s1),			\
		"m" (s2)			\
		:				\
		"xmm0", "xmm1", "xmm2",		\
		"xmm3", "xmm4", "xmm5")

/*
 * Reads eight floats from the file "dat" (four for a, then four for b),
 * runs _su2m(c, a, b, s1, s2) REPEAT_COUNT times on the same inputs and
 * prints the four components of c.
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE if "dat" cannot be
 * opened or does not contain eight parseable floating-point values.
 */
int main(void)
{
	/* How many times the identical product is recomputed. */
	enum { REPEAT_COUNT = 100000000 };

	/* Operands and result; 16-byte aligned because _su2m uses movaps. */
	static sse_float a __attribute__ ((aligned (16)));
	static sse_float b __attribute__ ((aligned (16)));
	static sse_float c __attribute__ ((aligned (16)));

	/* Per-lane sign masks handed to _su2m. */
	static const sse_float s1 __attribute__ ((aligned (16))) = {+1.0f, -1.0f, -1.0f, -1.0f};
	static const sse_float s2 __attribute__ ((aligned (16))) = {-1.0f, +1.0f, +1.0f, +1.0f};

	FILE *fpin;
	int got_a;
	int got_b;
	int j;

	fpin = fopen("dat", "r");
	if (fpin == NULL)
	{
		perror("dat");
		return EXIT_FAILURE;
	}

	got_a = fscanf(fpin,"%f%f%f%f\n",&(a).c1,&(a).c2,&(a).c3,&(a).c4);
	/*
	 * b is deliberately stored lane-permuted: the 1st..4th values in the
	 * file go to c1, c4, c2, c3.
	 * NOTE(review): presumably this matches the lane layout _su2m expects
	 * for its second operand -- confirm.
	 */
	got_b = fscanf(fpin,"%f%f%f%f\n",&(b).c1,&(b).c4,&(b).c2,&(b).c3);

	/* The stream is a FILE *, so it must be released with fclose(). */
	fclose(fpin);

	if (got_a != 4 || got_b != 4)
	{
		fprintf(stderr, "dat: expected 8 floating-point values\n");
		return EXIT_FAILURE;
	}

	for (j = 1; j <= REPEAT_COUNT; j++)
	{
		_su2m(c, a, b, s1, s2);
	}

	/* The result is printed in the lane order c1, c3, c4, c2. */
	printf("%f\t%f\t%f\t%f\n", (c).c1,(c).c3,(c).c4,(c).c2);

	return EXIT_SUCCESS;
}


