Readme :

This C program loads two sets of numbers into the registers
xmm0 and xmm1 and adds them. Each set consists of 4 floats,
and the pairwise addition is done by a single instruction, addps.
The "=m" constraint indicates that the result is stored in the memory
operand that follows it, (*c). The %0, %1 and %2 correspond, in the order
in which the operands appear, to c, a and b. The movups instruction is an
unaligned packed move, so its memory operand need not be 16-byte aligned.

-----------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>

/*
 * Four single-precision floats laid out back to back (16 bytes), i.e. the
 * amount of data one 128-bit SSE register holds.  The typedef requests
 * 16-byte alignment for objects of this type.
 */
typedef struct
{
	float c1;
	float c2;
	float c3;
	float c4;
} sse_float __attribute__((aligned(16)));

/*
 * _sse_add(c, a, b): element-wise add of four packed floats, *c = *a + *b.
 *
 * c, a and b are pointer expressions; each should point at an object that
 * is 16 bytes wide (e.g. sse_float), because movups transfers 128 bits.
 * movups is the unaligned move, so no particular alignment is required.
 *
 * Operand numbering: %0 is the "=m" output (*c), %1 and %2 are the "m"
 * inputs (*a) and (*b).
 *
 * xmm0 and xmm1 are used as scratch registers and are listed as clobbers
 * so the compiler does not keep live values in them across this statement.
 * Every macro argument is parenthesised so that expressions such as
 * `p + 1` can be passed safely.
 */
#define _sse_add(c, a, b)			\
	__asm__ __volatile__ (			\
		"movups %1, %%xmm0 \n\t"	\
		"movups %2, %%xmm1 \n\t"	\
		"addps %%xmm1, %%xmm0 \n\t"	\
		"movups %%xmm0, %0"		\
		:				\
		"=m" (*(c))			\
		:				\
		"m" (*(a)), "m" (*(b))		\
		:				\
		"xmm0", "xmm1")

/*
 * Demo driver: adds two 4-float vectors with _sse_add and prints the sum
 * of the four result lanes.
 *
 * The operands are real sse_float objects rather than float arrays viewed
 * through a cast pointer, so they carry the type's 16-byte alignment, and
 * the destination is actual storage instead of an uninitialized pointer.
 *
 * Returns 0.
 */
int main (void)
{
	sse_float a = {1.0f, 2.0f, 3.0f, 4.0f};
	sse_float b = {8.0f, 7.0f, 6.0f, 5.0f};
	sse_float c = {0.0f, 0.0f, 0.0f, 0.0f};
	float d;

	/* sizeof yields a size_t, which must be printed with %zu. */
	printf("%zu\n", sizeof(float));
	printf("%f\n", a.c2);
	_sse_add(&c, &a, &b);
	printf("Ok\n");
	d = c.c1 + c.c2 + c.c3 + c.c4;
	printf("%f\n", d);
	return 0;
}

