// SSE loop (32-bit x86, AT&T syntax) that combines floats taken from both ends
// of `data` with the sseSinCos1c / sseSinCos1d tables and scatters the results
// into `buf` at positions read from the bit_reverse_512 byte table.
//
// Per iteration, with v = the four floats gathered into xmm0:
//     xmm2 = sseSinCos1c[esi] * v  -  sseSinCos1d[esi] * swap_pairs(v)
// and the low / high 8 bytes of xmm2 are stored to buf + 8*table[0] and
// buf + 8*table[1] respectively.
//
// Register roles:
//     %0  (ebx) = data, read only
//     %1  (ecx) = buf, written
//     esi       = forward byte offset, 0, 16, ... (also indexes the sin/cos tables)
//     edi       = backward byte offset, 1008, 992, ..., 0
//     eax       = pointer into bit_reverse_512, advanced 2 bytes per iteration
//     edx, ebp  = the two destination indices loaded from that table
//
// The loop runs 64 times: it exits when `subl $16, %edi` borrows, i.e. right
// after the iteration that ran with edi == 0.
//
// ebp is used as a scratch register and saved/restored by hand with
// pushl/popl instead of being listed as a clobber. This is only safe because
// both operands live in registers ("b", "c"); an esp- or ebp-relative memory
// operand would be wrong between the push and the pop.
//
// "memory" is listed because the asm reads data[] and the tables and writes
// buf[] without any of that being expressed through operands; without it the
// compiler may cache or reorder accesses to those arrays around this statement.
// NOTE(review): xmm0-xmm2 are modified but not declared as clobbered; add them
// if the build is compiled with SSE code generation enabled -- confirm.
asm volatile(
	"xorl %%esi, %%esi			\n\t" // esi = 0
	"leal "MANGLE(bit_reverse_512)", %%eax	\n\t" // eax = &bit_reverse_512
	"movl $1008, %%edi			\n\t" // edi = last 16-byte offset
	"pushl %%ebp				\n\t" // use ebp without telling gcc
	".balign 16				\n\t"
	"1:					\n\t"
	"movlps (%0, %%esi), %%xmm0		\n\t" // XXXI
	"movhps 8(%0, %%edi), %%xmm0		\n\t" // RXXI
	"movlps 8(%0, %%esi), %%xmm1		\n\t" // XXXi
	"movhps (%0, %%edi), %%xmm1		\n\t" // rXXi
	"shufps $0x33, %%xmm1, %%xmm0		\n\t" // irIR
	"movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
	"mulps %%xmm0, %%xmm2			\n\t" // xmm2 = c * v
	"shufps $0xB1, %%xmm0, %%xmm0		\n\t" // riRI (swap adjacent lanes)
	"mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" // xmm0 = d * swap_pairs(v)
	"subps %%xmm0, %%xmm2			\n\t" // xmm2 = c*v - d*swap_pairs(v)
	"movzbl (%%eax), %%edx			\n\t" // destination index for low half
	"movzbl 1(%%eax), %%ebp			\n\t" // destination index for high half
	"movlps %%xmm2, (%1, %%edx,8)		\n\t"
	"movhps %%xmm2, (%1, %%ebp,8)		\n\t"
	"addl $16, %%esi			\n\t"
	"addl $2, %%eax				\n\t" // separate pointer bump: avoid complex addressing (slow on P4)
	"subl $16, %%edi			\n\t" // sets CF on the final borrow
	"jnc 1b					\n\t" // loop while no borrow occurred
	"popl %%ebp				\n\t" // restore ebp; gcc never sees the change
	:: "b" (data), "c" (buf)
	: "%esi", "%edi", "%eax", "%edx", "memory", "cc"
);