Readme:

shufps is a very versatile instruction. It allows one to permute
the contents of a register. The way it works is as follows:

The "$0x1b" controls the shuffling operation and the two registers 
following it are the source and destination registers. (If the source
and destination registers are the same then one gets a permutation).

The code is an 8-bit immediate, written here as a two-digit hex number.
The 4 words in each register are denoted
as :         register-1             register-2
        --------------------     -------------------
       | 11 | 10 | 01 | 00 |    | 11 | 10 | 01 | 00 |
        -------------------      -------------------
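Each hex digit of the code holds two 2-bit word indices. The first digit
selects the two higher words of the destination register (taken from the
source register), while the second digit selects the two lower words
(taken from the destination register's original contents).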

ex. shufps $0xb1, %%xmm0, %%xmm0 where xmm0 contains 1 2 3 4
(listed from word 11 down to word 00) should produce 2 1 4 3.
The way it works is as follows.
b=1011 so it picks up 2 (in 10) and 1 (in 11) and puts them 
in the higher words of %%xmm0.
1=0001 so it picks up 4 (in 00) and 3 (in 01) and puts them 
in the lower words of %%xmm0.
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>

/*
 * Four packed single-precision floats. The type is aligned to 16 bytes,
 * matching the alignment movaps needs for its memory operands.
 */
typedef struct
{
	float c1;
	float c2;
	float c3;
	float c4;
} sse_float __attribute__ ((aligned (16)));

/*
 * _sse_per(c, a): reverse the order of the four packed floats held in `a`
 * and store the result in `c`.
 *
 * Both arguments are used directly as memory operands of movaps, so each
 * must be a 16-byte lvalue aligned on a 16-byte boundary.
 *
 * The shuffle code 0x1b = 00 01 10 11 picks source words 3,2,1,0 for
 * destination words 0,1,2,3, i.e. a full reversal.
 *
 * xmm0 is used as scratch, so it is listed as a clobber; without that the
 * compiler is free to keep a live value in xmm0 across this statement.
 */
#define _sse_per(c, a) 				\
	__asm__ __volatile__ (			\
		"movaps %1, %%xmm0 \n\t"	\
		"shufps $0x1b, %%xmm0, %%xmm0 \n\t"	\
		"movaps %%xmm0, %0"		\
		:				\
		"=m" (c)			\
		:				\
		"m" (a)				\
		:				\
		"xmm0")

/*
 * Demo driver: prints sizeof(float) and a.c2, runs _sse_per to write the
 * shuffled copy of `a` into `c`, then prints both vectors.
 *
 * Returns 0 on completion.
 */
int main (void)
{
	/* movaps needs 16-byte aligned memory operands, hence the alignment. */
	static sse_float a __attribute__ ((aligned (16))) = {1.0f, 2.0f, 3.0f, 4.0f};
	static sse_float c __attribute__ ((aligned (16)));

	/* sizeof yields size_t; cast so the argument matches %d. */
	printf("%d\n", (int) sizeof(float));
	printf("%f\n", a.c2);

	_sse_per(c, a);
	printf("Ok\n");

	printf("%f%f%f%f\n", a.c1, a.c2, a.c3, a.c4);
	printf("%f%f%f%f\n", c.c1, c.c2, c.c3, c.c4);
	return 0;
}

