diff -urp jpeg-mmx-old/fdct_mmx.s jpeg-mmx/fdct_mmx.s --- jpeg-mmx-old/fdct_mmx.s 2003-08-02 06:31:28.000000000 +0200 +++ jpeg-mmx/fdct_mmx.s 2005-12-22 22:39:28.000000000 +0100 @@ -83,7 +83,12 @@ extern tab_frw_01234567 ; Defined in C ;; Offsets into table.. SECTION .text - + +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + global jpeg_fdct_ifast_mmx ;;; @@ -111,15 +116,18 @@ jpeg_fdct_ifast_mmx: push ebp ; save stack pointer mov ebp, esp ; link + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + push ebx push ecx push edx push edi - mov INP, [ebp+8]; ; input data is row 0 of blk[] + mov INP, [esp+24]; ; input data is row 0 of blk[] ;// transform the left half of the matrix (4 columns) - lea TABLEF, [fdct_tg_all_16]; + lea TABLEF, [ebp + fdct_tg_all_16 wrt ..gotoff]; mov OUT, INP; ; lea round_frw_col, [r_frw_col] @@ -127,34 +135,34 @@ jpeg_fdct_ifast_mmx: ; the for-loop is executed twice. We are better off unrolling the ; loop to avoid branch misprediction. 
mmx32_fdct_col03: - movq mm0, [x1] ; 0 ; x1 + movq mm0, [ebp + x1 wrt ..gotoff] ; 0 ; x1 ;; - movq mm1, [x6] ; 1 ; x6 + movq mm1, [ebp + x6 wrt ..gotoff] ; 1 ; x6 movq mm2, mm0 ; 2 ; x1 - movq mm3, [x2] ; 3 ; x2 + movq mm3, [ebp + x2 wrt ..gotoff] ; 3 ; x2 paddsw mm0, mm1 ; t1 = x[1] + x[6] - movq mm4, [x5] ; 4 ; x5 + movq mm4, [ebp + x5 wrt ..gotoff] ; 4 ; x5 psllw mm0, SHIFT_FRW_COL ; t1 - movq mm5, [x0] ; 5 ; x0 + movq mm5, [ebp + x0 wrt ..gotoff] ; 5 ; x0 paddsw mm4, mm3 ; t2 = x[2] + x[5] - paddsw mm5, [x7] ; t0 = x[0] + x[7] + paddsw mm5, [ebp + x7 wrt ..gotoff] ; t0 = x[0] + x[7] psllw mm4, SHIFT_FRW_COL ; t2 movq mm6, mm0 ; 6 ; t1 psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] - movq mm1, [tg_2_16] ; 1 ; tg_2_16 + movq mm1, [ebp + tg_2_16 wrt ..gotoff] ; 1 ; tg_2_16 psubsw mm0, mm4 ; tm12 = t1 - t2 - movq mm7, [x3] ; 7 ; x3 + movq mm7, [ebp + x3 wrt ..gotoff] ; 7 ; x3 pmulhw mm1, mm0 ; tm12*tg_2_16 - paddsw mm7, [x4] ; t3 = x[3] + x[4] + paddsw mm7, [ebp + x4 wrt ..gotoff] ; t3 = x[3] + x[4] psllw mm5, SHIFT_FRW_COL ; t0 paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 @@ -166,58 +174,58 @@ mmx32_fdct_col03: paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 - por mm1, [fdct_one_corr] ; correction y2 +0.5 + por mm1, [ebp + fdct_one_corr wrt ..gotoff] ; correction y2 +0.5 psllw mm2, SHIFT_FRW_COL+1 ; t6 - pmulhw mm5, [tg_2_16] ; tm03*tg_2_16 + pmulhw mm5, [ebp + tg_2_16 wrt ..gotoff] ; tm03*tg_2_16 movq mm7, mm4 ; 7 ; tp03 - psubsw mm3, [x5] ; t5 = x[2] - x[5] + psubsw mm3, [ebp + x5 wrt ..gotoff] ; t5 = x[2] - x[5] psubsw mm4, mm6 ; y4 = tp03 - tp12 - movq [y2], mm1 ; 1 ; save y2 + movq [ebp + y2 wrt ..gotoff], mm1 ; 1 ; save y2 paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 - movq mm1, [x3] ; 1 ; x3 + movq mm1, [ebp + x3 wrt ..gotoff] ; 1 ; x3 psllw mm3, SHIFT_FRW_COL+1 ; t5 - psubsw mm1, [x4] ; t4 = x[3] - x[4] + psubsw mm1, [ebp + x4 wrt ..gotoff] ; t4 = x[3] - x[4] movq mm6, mm2 ; 6 ; t6 - movq [y4], mm4 ; 4 ; save y4 + movq [ebp + y4 wrt ..gotoff], 
mm4 ; 4 ; save y4 paddsw mm2, mm3 ; t6 + t5 - pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16 + pmulhw mm2, [ebp + ocos_4_16 wrt ..gotoff] ; tp65 = (t6 + t5)*cos_4_16 psubsw mm6, mm3 ; 3 ; t6 - t5 - pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16 + pmulhw mm6, [ebp + ocos_4_16 wrt ..gotoff] ; tm65 = (t6 - t5)*cos_4_16 psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 - por mm5, [fdct_one_corr] ; correction y6 +0.5 + por mm5, [ebp + fdct_one_corr wrt ..gotoff] ; correction y6 +0.5 psllw mm1, SHIFT_FRW_COL ; t4 - por mm2, [fdct_one_corr] ; correction tp65 +0.5 + por mm2, [ebp + fdct_one_corr wrt ..gotoff] ; correction tp65 +0.5 movq mm4, mm1 ; 4 ; t4 - movq mm3, [x0] ; 3 ; x0 + movq mm3, [ebp + x0 wrt ..gotoff] ; 3 ; x0 paddsw mm1, mm6 ; tp465 = t4 + tm65 - psubsw mm3, [x7] ; t7 = x[0] - x[7] + psubsw mm3, [ebp + x7 wrt ..gotoff] ; t7 = x[0] - x[7] psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 - movq mm0, [tg_1_16] ; 0 ; tg_1_16 + movq mm0, [ebp + tg_1_16 wrt ..gotoff] ; 0 ; tg_1_16 psllw mm3, SHIFT_FRW_COL ; t7 - movq mm6, [tg_3_16] ; 6 ; tg_3_16 + movq mm6, [ebp + tg_3_16 wrt ..gotoff] ; 6 ; tg_3_16 pmulhw mm0, mm1 ; tp465*tg_1_16 - movq [y0], mm7 ; 7 ; save y0 + movq [ebp + y0 wrt ..gotoff], mm7 ; 7 ; save y0 pmulhw mm6, mm4 ; tm465*tg_3_16 - movq [y6], mm5 ; 5 ; save y6 + movq [ebp + y6 wrt ..gotoff], mm5 ; 5 ; save y6 movq mm7, mm3 ; 7 ; t7 - movq mm5, [tg_3_16] ; 5 ; tg_3_16 + movq mm5, [ebp + tg_3_16 wrt ..gotoff] ; 5 ; tg_3_16 psubsw mm7, mm2 ; tm765 = t7 - tp65 paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 @@ -226,55 +234,55 @@ mmx32_fdct_col03: paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 paddsw mm6, mm4 ; tm465*tg_3_16 - pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 + pmulhw mm3, [ebp + tg_1_16 wrt ..gotoff] ; tp765*tg_1_16 ;; - por mm0, [fdct_one_corr] ; correction y1 +0.5 + por mm0, [ebp + fdct_one_corr wrt ..gotoff] ; correction y1 +0.5 paddsw mm5, mm7 ; tm765*tg_3_16 psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 add INP, 0x08 ; ; increment pointer - movq [y1], mm0 
; 0 ; save y1 + movq [ebp + y1 wrt ..gotoff], mm0 ; 0 ; save y1 paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 - movq [y3], mm7 ; 7 ; save y3 + movq [ebp + y3 wrt ..gotoff], mm7 ; 7 ; save y3 psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 - movq [y5], mm5 ; 5 ; save y5 + movq [ebp + y5 wrt ..gotoff], mm5 ; 5 ; save y5 mmx32_fdct_col47: ; begin processing last four columns - movq mm0, [x1] ; 0 ; x1 + movq mm0, [ebp + x1 wrt ..gotoff] ; 0 ; x1 ;; - movq [y7], mm3 ; 3 ; save y7 (columns 0-4) + movq [ebp + y7 wrt ..gotoff], mm3 ; 3 ; save y7 (columns 0-4) ;; - movq mm1, [x6] ; 1 ; x6 + movq mm1, [ebp + x6 wrt ..gotoff] ; 1 ; x6 movq mm2, mm0 ; 2 ; x1 - movq mm3, [x2] ; 3 ; x2 + movq mm3, [ebp + x2 wrt ..gotoff] ; 3 ; x2 paddsw mm0, mm1 ; t1 = x[1] + x[6] - movq mm4, [x5] ; 4 ; x5 + movq mm4, [ebp + x5 wrt ..gotoff] ; 4 ; x5 psllw mm0, SHIFT_FRW_COL ; t1 - movq mm5, [x0] ; 5 ; x0 + movq mm5, [ebp + x0 wrt ..gotoff] ; 5 ; x0 paddsw mm4, mm3 ; t2 = x[2] + x[5] - paddsw mm5, [x7] ; t0 = x[0] + x[7] + paddsw mm5, [ebp + x7 wrt ..gotoff] ; t0 = x[0] + x[7] psllw mm4, SHIFT_FRW_COL ; t2 movq mm6, mm0 ; 6 ; t1 psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] - movq mm1, [tg_2_16] ; 1 ; tg_2_16 + movq mm1, [ebp + tg_2_16 wrt ..gotoff] ; 1 ; tg_2_16 psubsw mm0, mm4 ; tm12 = t1 - t2 - movq mm7, [x3] ; 7 ; x3 + movq mm7, [ebp + x3 wrt ..gotoff] ; 7 ; x3 pmulhw mm1, mm0 ; tm12*tg_2_16 - paddsw mm7, [x4] ; t3 = x[3] + x[4] + paddsw mm7, [ebp + x4 wrt ..gotoff] ; t3 = x[3] + x[4] psllw mm5, SHIFT_FRW_COL ; t0 paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 @@ -286,58 +294,58 @@ mmx32_fdct_col47: ; begin processing las paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 - por mm1, [fdct_one_corr] ; correction y2 +0.5 + por mm1, [ebp + fdct_one_corr wrt ..gotoff] ; correction y2 +0.5 psllw mm2, SHIFT_FRW_COL+1 ; t6 - pmulhw mm5, [tg_2_16] ; tm03*tg_2_16 + pmulhw mm5, [ebp + tg_2_16 wrt ..gotoff] ; tm03*tg_2_16 movq mm7, mm4 ; 7 ; tp03 - psubsw mm3, [x5] ; t5 = x[2] - x[5] + 
psubsw mm3, [ebp + x5 wrt ..gotoff] ; t5 = x[2] - x[5] psubsw mm4, mm6 ; y4 = tp03 - tp12 - movq [y2+8], mm1 ; 1 ; save y2 + movq [ebp + 8 + y2 wrt ..gotoff], mm1 ; 1 ; save y2 paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 - movq mm1, [x3] ; 1 ; x3 + movq mm1, [ebp + x3 wrt ..gotoff] ; 1 ; x3 psllw mm3, SHIFT_FRW_COL+1 ; t5 - psubsw mm1, [x4] ; t4 = x[3] - x[4] + psubsw mm1, [ebp + x4 wrt ..gotoff] ; t4 = x[3] - x[4] movq mm6, mm2 ; 6 ; t6 - movq [y4+8], mm4 ; 4 ; save y4 + movq [ebp + 8 + y4 wrt ..gotoff], mm4 ; 4 ; save y4 paddsw mm2, mm3 ; t6 + t5 - pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16 + pmulhw mm2, [ebp + ocos_4_16 wrt ..gotoff] ; tp65 = (t6 + t5)*cos_4_16 psubsw mm6, mm3 ; 3 ; t6 - t5 - pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16 + pmulhw mm6, [ebp + ocos_4_16 wrt ..gotoff] ; tm65 = (t6 - t5)*cos_4_16 psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 - por mm5, [fdct_one_corr] ; correction y6 +0.5 + por mm5, [ebp + fdct_one_corr wrt ..gotoff] ; correction y6 +0.5 psllw mm1, SHIFT_FRW_COL ; t4 - por mm2, [fdct_one_corr] ; correction tp65 +0.5 + por mm2, [ebp + fdct_one_corr wrt ..gotoff] ; correction tp65 +0.5 movq mm4, mm1 ; 4 ; t4 - movq mm3, [x0] ; 3 ; x0 + movq mm3, [ebp + x0 wrt ..gotoff] ; 3 ; x0 paddsw mm1, mm6 ; tp465 = t4 + tm65 - psubsw mm3, [x7] ; t7 = x[0] - x[7] + psubsw mm3, [ebp + x7 wrt ..gotoff] ; t7 = x[0] - x[7] psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 - movq mm0, [tg_1_16] ; 0 ; tg_1_16 + movq mm0, [ebp + tg_1_16 wrt ..gotoff] ; 0 ; tg_1_16 psllw mm3, SHIFT_FRW_COL ; t7 - movq mm6, [tg_3_16] ; 6 ; tg_3_16 + movq mm6, [ebp + tg_3_16 wrt ..gotoff] ; 6 ; tg_3_16 pmulhw mm0, mm1 ; tp465*tg_1_16 - movq [y0+8], mm7 ; 7 ; save y0 + movq [ebp + 8 + y0 wrt ..gotoff], mm7 ; 7 ; save y0 pmulhw mm6, mm4 ; tm465*tg_3_16 - movq [y6+8], mm5 ; 5 ; save y6 + movq [ebp + 8 + y6 wrt ..gotoff], mm5 ; 5 ; save y6 movq mm7, mm3 ; 7 ; t7 - movq mm5, [tg_3_16] ; 5 ; tg_3_16 + movq mm5, [ebp + tg_3_16 wrt ..gotoff] ; 5 ; tg_3_16 psubsw mm7, mm2 ; tm765 = 
t7 - tp65 paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 @@ -346,24 +354,24 @@ mmx32_fdct_col47: ; begin processing las paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 paddsw mm6, mm4 ; tm465*tg_3_16 - pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 + pmulhw mm3, [ebp + tg_1_16 wrt ..gotoff] ; tp765*tg_1_16 ;; - por mm0, [fdct_one_corr] ; correction y1 +0.5 + por mm0, [ebp + fdct_one_corr wrt ..gotoff] ; correction y1 +0.5 paddsw mm5, mm7 ; tm765*tg_3_16 psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 ;; - movq [y1+8], mm0 ; 0 ; save y1 + movq [ebp + 8 + y1 wrt ..gotoff], mm0 ; 0 ; save y1 paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 - movq [y3+8], mm7 ; 7 ; save y3 + movq [ebp + 8 + y3 wrt ..gotoff], mm7 ; 7 ; save y3 psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 - movq [y5+8], mm5 ; 5 ; save y5 + movq [ebp + 8 + y5 wrt ..gotoff], mm5 ; 5 ; save y5 - movq [y7+8], mm3 ; 3 ; save y7 + movq [ebp + 8 + y7 wrt ..gotoff], mm3 ; 3 ; save y7 ; emms; ; } ; end of forward_dct_col07() @@ -378,13 +386,13 @@ mmx32_fdct_col47: ; begin processing las ; ; The output is stored into blk[], which destroys the original ; input data. 
- mov INP, [ebp+8]; ;; row 0 + mov INP, [esp+24]; ;; row 0 mov edi, 0x08; ;x = 8 - lea TABLE, [tab_frw_01234567]; ; row 0 + lea TABLE, [ebp + tab_frw_01234567 wrt ..gotoff]; ; row 0 mov OUT, INP; - lea round_frw_row, [fdct_r_row]; + lea round_frw_row, [ebp + fdct_r_row wrt ..gotoff]; ; for ( x = 8; x > 0; --x ) ; transform one row per iteration ; ---------- loop begin @@ -510,4 +518,6 @@ mmx32_fdct_col47: ; begin processing las pop ebp ; restore stack pointer emms ret - \ No newline at end of file + +section .note.GNU-stack noalloc noexec nowrite progbits + diff -urp jpeg-mmx-old/jdcolor.c jpeg-mmx/jdcolor.c --- jpeg-mmx-old/jdcolor.c 2005-02-17 19:32:48.000000000 +0100 +++ jpeg-mmx/jdcolor.c 2005-10-29 23:26:17.000000000 +0200 @@ -262,13 +262,9 @@ ycc_rgb_convert_mmx (j_decompress_ptr ci #endif #if defined(HAVE_MMX_ATT_MNEMONICS) __asm__( - "pushl %%ebx\n" - "mov %1, %%eax\n" - "mov %2, %%ebx\n" - "mov %3, %%ecx\n" - "movd (%%eax),%%mm0\n" // mm0: 0 0 0 0 y3 y2 y1 y0 - 8 bit - "movd (%%ebx),%%mm1\n" // mm1: 0 0 0 0 cb3 cb2 cb1 cb0 - "movd (%%ecx),%%mm2\n" // mm2: 0 0 0 0 cr3 cr2 cr1 cr0 + "movd (%1),%%mm0\n" // mm0: 0 0 0 0 y3 y2 y1 y0 - 8 bit + "movd (%2),%%mm1\n" // mm1: 0 0 0 0 cb3 cb2 cb1 cb0 + "movd (%3),%%mm2\n" // mm2: 0 0 0 0 cr3 cr2 cr1 cr0 "pxor %%mm7,%%mm7\n" // mm7 = 0 "punpcklbw %%mm7,%%mm0\n" // mm0: y3 y2 y1 y0 - expand to 16 bit "punpcklbw %%mm7,%%mm1\n" // mm1: cb3 cb2 cb1 cb0 @@ -350,9 +346,9 @@ ycc_rgb_convert_mmx (j_decompress_ptr ci "movq %%mm3,6%0\n" // save two more RGB pixels :"=m"(outptr[0]) - :"m"(inptr0),"m"(inptr1),"m"(inptr2), //y cb cr + :"r"(inptr0),"r"(inptr1),"r"(inptr2), //y cb cr "m"(te0),"m"(te1),"m"(te2) - :"eax", "ebx", "ecx", "st"); + :"st"); #endif outptr+=12; diff -urp jpeg-mmx-old/jdmerge.c jpeg-mmx/jdmerge.c --- jpeg-mmx-old/jdmerge.c 2005-02-18 17:52:25.000000000 +0100 +++ jpeg-mmx/jdmerge.c 2005-12-22 21:36:38.000000000 +0100 @@ -39,7 +39,7 @@ #ifdef UPSAMPLE_MERGING_SUPPORTED #if defined(HAVE_MMX_INTEL_MNEMONICS) || 
defined(HAVE_MMX_ATT_MNEMONICS) -#define __int64 unsigned long long +#define __int64 static const unsigned long long __int64 const1 = 0x59BA0000D24B59BALL; // Cr_r Cr_b Cr_g Cr_r __int64 const2 = 0x00007168E9FA0000LL; // Cb-r Cb_b Cb_g Cb_r __int64 const5 = 0x0000D24B59BA0000LL; // Cr_b Cr_g Cr_r Cr_b @@ -816,29 +816,20 @@ do_next16: fprintf(stderr, "Using accelerated MMX code for merge !\n"); __asm__ ( - "pushl %%ebx \n\t" - - "movl %0, %%esi \n\t" - - "movl %1, %%eax \n\t" - - "movl %2, %%ebx \n\t" - - "movl %3, %%ecx \n\t" - - "movl %4, %%edi \n\t" - - "movl %5, %%edx \n\t" + "pushl %%ebx \n\t" + "pushl %2 \n\t" "do_next16: \n\t" - "movd (%%ebx),%%mm0 \n\t" // Cr7 Cr6.....Cr1 Cr0 + "movl (%%esp),%%ebx\n\t" + "movd (%%ebx),%%mm0 \n\t" // Cr7 Cr6.....Cr1 Cr0 + "movl 4(%%esp),%%ebx\n\t" "pxor %%mm6,%%mm6 \n\t" "punpcklbw %%mm0,%%mm0 \n\t" // Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0 - "movq const128,%%mm7 \n\t" + "movq %7,%%mm7 \n\t" "punpcklwd %%mm0,%%mm0 \n\t" // Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0 @@ -854,13 +845,13 @@ do_next16: "punpcklbw %%mm1,%%mm1 \n\t" // Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0 - "paddsw const05,%%mm0 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %8,%%mm0 \n\t" // add (one_half/fix(x)) << 2 "punpcklwd %%mm1,%%mm1 \n\t" // Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0 "movq %%mm1,%%mm5 \n\t" - "pmulhw const1,%%mm0 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %9,%%mm0 \n\t" // multiply by (fix(x) >> 1) "punpcklbw %%mm6,%%mm1 \n\t" // Cb0 Cb0 Cb0 Cb0 @@ -872,53 +863,53 @@ do_next16: "psllw $2,%%mm1 \n\t" // left shift by 2 bits - "paddsw const15,%%mm1 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %10,%%mm1 \n\t" // add (one_half/fix(x)) << 2 "psubsw %%mm7,%%mm4 \n\t" // Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128 "psubsw %%mm7,%%mm5 \n\t" // Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128 - "pmulhw const2,%%mm1 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %11,%%mm1 \n\t" // multiply by (fix(x) >> 1) "psllw $2,%%mm4 \n\t" // left shift by 2 bits "psllw $2,%%mm5 \n\t" // left shift by 2 bits - "paddsw 
const45,%%mm4 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %12,%%mm4 \n\t" // add (one_half/fix(x)) << 2 "movd (%%esi),%%mm7 \n\t" // Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0 - "pmulhw const5,%%mm4 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %13,%%mm4 \n\t" // multiply by (fix(x) >> 1) "movq %%mm7,%%mm6 \n\t" "punpcklbw %%mm7,%%mm7 \n\t" // Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0 - "paddsw const55,%%mm5 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %14,%%mm5 \n\t" // add (one_half/fix(x)) << 2 "paddsw %%mm1,%%mm0 \n\t" // cred0 cbl0 cgr0 cred0 "movq %%mm7,%%mm1 \n\t" - "pmulhw const6,%%mm5 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %15,%%mm5 \n\t" // multiply by (fix(x) >> 1) "movq %%mm0,%%mm2 \n\t" // cred0 cbl0 cgr0 cred0 "punpcklwd %%mm6,%%mm7 \n\t" // Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0 - "pand davemask,%%mm2 \n\t" // 0 cbl0 cgr0 0 + "pand %16,%%mm2 \n\t" // 0 cbl0 cgr0 0 "psrlq $16,%%mm1 \n\t" // 0 0 Y5 Y5 Y4 Y4 Y1 Y1 "psrlq $16,%%mm2 \n\t" // 0 0 cbl0 cgr0 - "punpcklbw empty,%%mm7 \n\t" // Y1 Y0 Y0 Y0 + "punpcklbw %17,%%mm7 \n\t" // Y1 Y0 Y0 Y0 "paddsw %%mm5,%%mm4 \n\t" // cbl1 cgr1 cred1 cbl1 "movq %%mm4,%%mm3 \n\t" // cbl1 cgr1 cred1 cbl1 - "pand davemask,%%mm3 \n\t" // 0 cgr1 cred1 0 + "pand %16,%%mm3 \n\t" // 0 cgr1 cred1 0 "paddsw %%mm0,%%mm7 \n\t" // r1 b0 g0 r0 @@ -928,7 +919,7 @@ do_next16: "por %%mm3,%%mm2 \n\t" // cgr1 cred1 cbl0 cgr0 - "punpcklbw empty,%%mm6 \n\t" // Y4 Y4 Y1 Y1 + "punpcklbw %17,%%mm6 \n\t" // Y4 Y4 Y1 Y1 "movd (%%eax),%%mm3 \n\t" // Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2 @@ -946,7 +937,7 @@ do_next16: "punpcklwd %%mm6,%%mm3 \n\t" // X X X X Y3 Y2 Y2 Y2 - "punpcklbw empty,%%mm3 \n\t" // Y3 Y2 Y2 Y2 + "punpcklbw %17,%%mm3 \n\t" // Y3 Y2 Y2 Y2 "psrlq $16,%%mm5 \n\t" // 0 0 Y7 Y7 Y6 Y6 Y3 Y3 @@ -958,7 +949,7 @@ do_next16: "punpckldq %%mm6,%%mm6 \n\t" // X X X X Y6 Y6 Y3 Y3 - "punpcklbw empty,%%mm6 \n\t" // Y6 Y6 Y3 Y3 + "punpcklbw %17,%%mm6 \n\t" // Y6 Y6 Y3 Y3 "psrlq $24,%%mm1 \n\t" // 0 0 0 0 0 Y5 Y5 Y4 @@ -976,19 +967,21 @@ do_next16: "psrlq $24,%%mm5 \n\t" // 0 0 0 0 0 Y7 Y7 Y6 - 
"movd (%%ebx),%%mm0 \n\t" // Cr9 Cr8.....Cr3 Cr2 + "movl (%%esp),%%ebx\n\t" + "movd (%%ebx),%%mm0 \n\t" // Cr9 Cr8.....Cr3 Cr2 + "movl 4(%%esp),%%ebx\n\t" "psrlq $32,%%mm2 \n\t" // 0 0 0 0 0 0 Y7 Y7 "psrlq $16,%%mm0 \n\t" - "punpcklbw empty,%%mm1 \n\t" // Y5 Y5 Y5 Y4 + "punpcklbw %17,%%mm1 \n\t" // Y5 Y5 Y5 Y4 "punpcklwd %%mm2,%%mm5 \n\t" // X X X X Y7 Y7 Y7 Y6 "paddsw %%mm4,%%mm1 \n\t" // b5 g5 r5 b4 - "punpcklbw empty,%%mm5 \n\t" // Y7 Y7 Y7 Y6 + "punpcklbw %17,%%mm5 \n\t" // Y7 Y7 Y7 Y6 "pxor %%mm6,%%mm6 \n\t" // clear mm6 registr @@ -1006,67 +999,67 @@ do_next16: "psrlq $16,%%mm3 \n\t" - "psubsw const128,%%mm0 \n\t" // Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128 + "psubsw %7,%%mm0 \n\t" // Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128 "punpcklbw %%mm3,%%mm3 \n\t" // X X X X Cb3 Cb3 Cb2 Cb2 "psllw $2,%%mm0 \n\t" // left shift by 2 bits - "paddsw const05,%%mm0 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %8,%%mm0 \n\t" // add (one_half/fix(x)) << 2 "punpcklwd %%mm3,%%mm3 \n\t" // Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2 "movq %%mm3,%%mm7 \n\t" - "pmulhw const1,%%mm0 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %9,%%mm0 \n\t" // multiply by (fix(x) >> 1) "punpcklbw %%mm6,%%mm3 \n\t" // Cb2 Cb2 Cb2 Cb2 - "psubsw const128,%%mm3 \n\t" // Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128 + "psubsw %7,%%mm3 \n\t" // Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128 "punpckhbw %%mm6,%%mm4 \n\t" // Cr3 Cr3 Cr3 Cr3 "psllw $2,%%mm3 \n\t" // left shift by 2 bits - "paddsw const15,%%mm3 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %10,%%mm3 \n\t" // add (one_half/fix(x)) << 2 "punpckhbw %%mm6,%%mm7 \n\t" // Cb3 Cb3 Cb3 Cb3 - "pmulhw const2,%%mm3 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %11,%%mm3 \n\t" // multiply by (fix(x) >> 1) - "psubsw const128,%%mm7 \n\t" // Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128 + "psubsw %7,%%mm7 \n\t" // Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128 "paddsw %%mm3,%%mm0 \n\t" // cred2 cbl2 cgr2 cred2 "psllw $2,%%mm7 \n\t" // left shift by 2 bits - "psubsw const128,%%mm4 \n\t" // Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128 + 
"psubsw %7,%%mm4 \n\t" // Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128 "movd 4(%%esi),%%mm3 \n\t" // Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8 "psllw $2,%%mm4 \n\t" // left shift by 2 bits - "paddsw const55,%%mm7 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %14,%%mm7 \n\t" // add (one_half/fix(x)) << 2 "movq %%mm3,%%mm6 \n\t" // Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8 "movq %%mm0,%%mm2 \n\t" - "pand davemask,%%mm2 \n\t" + "pand %16,%%mm2 \n\t" "punpcklbw %%mm3,%%mm3 \n\t" // Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8 "psrlq $16,%%mm2 \n\t" - "paddsw const45,%%mm4 \n\t" // add (one_half/fix(x)) << 2 + "paddsw %12,%%mm4 \n\t" // add (one_half/fix(x)) << 2 "punpcklwd %%mm6,%%mm3 \n\t" // X X X X Y9 Y8 Y8 Y8 - "pmulhw const5,%%mm4 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %13,%%mm4 \n\t" // multiply by (fix(x) >> 1) - "pmulhw const6,%%mm7 \n\t" // multiply by (fix(x) >> 1) + "pmulhw %15,%%mm7 \n\t" // multiply by (fix(x) >> 1) - "punpcklbw empty,%%mm3 \n\t" // Y9 Y8 Y8 Y8 + "punpcklbw %17,%%mm3 \n\t" // Y9 Y8 Y8 Y8 "paddsw %%mm7,%%mm4 \n\t" // cbl3 cgr3 cred3 cbl3 @@ -1078,7 +1071,7 @@ do_next16: "movd 4(%%eax),%%mm3 \n\t" // Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10 - "pand davemask,%%mm7 \n\t" + "pand %16,%%mm7 \n\t" "psrlq $8,%%mm6 \n\t" // 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9 @@ -1138,7 +1131,7 @@ do_next16: "psrlq $8,%%mm0 \n\t" // 0 0 Y23 Y22 Y19 Y18 Y15 Y14 - "punpcklbw empty,%%mm1 \n\t" // Y13 Y13 Y13 Y12 + "punpcklbw %17,%%mm1 \n\t" // Y13 Y13 Y13 Y12 "movq %%mm0,%%mm5 \n\t" // 0 0 Y23 Y22 Y19 Y18 Y15 Y14 @@ -1156,9 +1149,9 @@ do_next16: "addl $24,%%edx \n\t" - "punpcklbw empty,%%mm5 \n\t" // Y15 Y15 Y15 Y14 + "punpcklbw %17,%%mm5 \n\t" // Y15 Y15 Y15 Y14 - "addl $4,%%ebx \n\t" + "addl $4,(%%esp) \n\t" "paddsw %%mm4,%%mm5 \n\t" // b15 g15 r15 b14 @@ -1175,14 +1168,16 @@ do_next16: "jnz do_next16 \n\t" "emms \n\t" - - "popl %%ebx \n\t" + "popl %%ebx \n\t" + "popl %%ebx \n\t" : //"=m"(&cols_asm) - : "m"(inptr00), "m"(inptr01), "m"(inptr2), "m"(inptr1), "m"(outptr1), - "m"(outptr0),"m"(cols_asm) /* was (&cols_asm) */ - : 
"eax", "ecx", "edx", "edi", "esi", "st", "cc", "memory" + : "S"(inptr00), "a"(inptr01), "m"(inptr2), "c"(inptr1), "D"(outptr1), + "d"(outptr0), "m"(cols_asm), + "m"(const128), "m"(const05), "m"(const1), "m"(const15), "m"(const2), "m"(const45), + "m"(const5), "m"(const55), "m"(const6), "m"(davemask), "m"(empty) + : "st", "cc", "memory" ); #if 0 "movl $inptr00, %%esi \n\t" diff -urp jpeg-mmx-old/jdsample.c jpeg-mmx/jdsample.c --- jpeg-mmx-old/jdsample.c 2005-02-18 17:43:18.000000000 +0100 +++ jpeg-mmx/jdsample.c 2005-12-22 22:27:48.000000000 +0100 @@ -304,7 +304,7 @@ h2v2_upsample (j_decompress_ptr cinfo, j #define __int64 long long /* This won't work for Intel compilers - tell Gernot to help fixing ! */ /* I have no clue why it is written in that strange way, but ok, it works */ -union u1 { __int64 q; double align; } +static union u1 { __int64 q; double align; } mul3w={0x0003000300030003LL}, mul9w={0x0009000900090009LL}, mul9ws={0x000900090009000cLL}, mul3ws={0x0003000300030004LL}, bias7w={0x0007000700070007LL}, bias8w={0x0008000800080008LL}, @@ -544,19 +544,19 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "punpcklbw %%mm6,%%mm0 \n\t" // unpack lower values; inptr[0][1][2][3] "movq %%mm0,%%mm3 \n\t" // make a copy - "pmullw mul3w,%%mm0 \n\t" // multiply by 3 + "pmullw %3,%%mm0 \n\t" // multiply by 3 "psllq $8,%%mm1 \n\t" // shift 1 byte for previous values; inptr[-1][0][1][2] "movq %%mm7,%%mm5 \n\t" // copy original data - "pand mask2,%%mm5 \n\t" // mask out all but lower byte for "previous" state + "pand %4,%%mm5 \n\t" // mask out all but lower byte for "previous" state "paddb %%mm5,%%mm1 \n\t" // add in byte to quadword "psrlq $8,%%mm2 \n\t" // shift right for "next" state; inptr[1][2][3][4] "punpcklbw %%mm6,%%mm1 \n\t" // unpack "punpcklbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data - "paddw bias1w,%%mm1 \n\t" // add in bias + "paddw %5,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result 
from multiply to "next" data - "paddw bias2w,%%mm2 \n\t" // add in bias + "paddw %6,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave @@ -570,7 +570,7 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "movq %%mm7,%%mm3 \n\t" // copy input data "punpckhbw %%mm6,%%mm0 \n\t" // unpack hi data - "pmullw mul3w,%%mm0 \n\t" // multiply by 3 + "pmullw %3,%%mm0 \n\t" // multiply by 3 "psllq $8,%%mm1 \n\t" // shift 1 byte for previous values; inptr[-1][0][1][2] "psrlq $8,%%mm2 \n\t" // shift right for "next" state; inptr[1][2][3][4] "movq 8(%%esi),%%mm7 \n\t" // get next quadword from input buffer @@ -581,9 +581,9 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "punpckhbw %%mm6,%%mm1 \n\t" // unpack "punpckhbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data - "paddw bias1w,%%mm1 \n\t" // add in bias + "paddw %5,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data - "paddw bias2w,%%mm2 \n\t" // add in bias + "paddw %6,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave @@ -602,7 +602,7 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "movq %%mm7,%%mm1 \n\t" // copy input data "movq %%mm7,%%mm2 \n\t" // copy input data "punpcklbw %%mm6,%%mm0 \n\t" // unpack lo data - "pmullw mul3w,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] + "pmullw %3,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] "psllq $8,%%mm1 \n\t" // shift left to get previous byte "movq %%mm3,%%mm5 \n\t" // retrieve copy of "previous" state "psrlq $56,%%mm5 \n\t" // shift to get LSB @@ -611,9 +611,9 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "punpcklbw %%mm6,%%mm1 \n\t" // unpack "punpcklbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" 
data - "paddw bias1w,%%mm1 \n\t" // add in bias + "paddw %5,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data - "paddw bias2w,%%mm2 \n\t" // add in bias + "paddw %6,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave @@ -628,7 +628,7 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "movq %%mm7,%%mm3 \n\t" // copy input data "punpckhbw %%mm6,%%mm0 \n\t" // unpack hi data - "pmullw mul3w,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] + "pmullw %3,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] "psllq $8,%%mm1 \n\t" // shift left to get previous byte "psrlq $8,%%mm2 \n\t" // shift rt for "next" state "movq 8(%%esi),%%mm7 \n\t" // get next quadword from input buffer @@ -639,9 +639,9 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "punpckhbw %%mm6,%%mm1 \n\t" // unpack "punpckhbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data - "paddw bias1w,%%mm1 \n\t" // add in bias + "paddw %5,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data - "paddw bias2w,%%mm2 \n\t" // add in bias + "paddw %6,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte @@ -664,7 +664,7 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "punpcklbw %%mm6,%%mm0 \n\t" // unpack lo data - "pmullw mul3w,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] + "pmullw %3,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] "psllq $8,%%mm1 \n\t" // shift left to get previous byte @@ -676,9 +676,9 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "punpcklbw %%mm6,%%mm1 \n\t" // unpack "punpcklbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data - "paddw bias1w,%%mm1 \n\t" // add in bias + "paddw %5,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in 
result from multiply to "next" data - "paddw bias2w,%%mm2 \n\t" // add in bias + "paddw %6,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave @@ -691,17 +691,17 @@ h2v1_fancy_upsample_mmx (j_decompress_pt "movq %%mm7,%%mm2 \n\t" // copy input data "punpckhbw %%mm6,%%mm0 \n\t" // unpack hi data - "pmullw mul3w,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] + "pmullw %3,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] "psllq $8,%%mm1 \n\t" // shift left to get previous byte "psrlq $8,%%mm2 \n\t" // shift rt for "next" state - "pand mask1,%%mm7 \n\t" // mask out all but MSB + "pand %7,%%mm7 \n\t" // mask out all but MSB "paddb %%mm7,%%mm2 \n\t" // add in byte "punpckhbw %%mm6,%%mm1 \n\t" // unpack "punpckhbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data - "paddw bias1w,%%mm1 \n\t" // add in bias + "paddw %5,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data - "paddw bias2w,%%mm2 \n\t" // add in bias + "paddw %6,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave @@ -711,7 +711,8 @@ h2v1_fancy_upsample_mmx (j_decompress_pt : // no output regs // %0 %1 %2 %3 %4 - : "m"(hsize), "m"(inptr), "m"(outptr) + : "m"(hsize), "m"(inptr), "m"(outptr), "m"(mul3w), "m"(mask2), + "m"(bias1w), "m"(bias2w), "m"(mask1) : "eax", "ecx", "edx", "esi", "edi", "memory", "cc", "st" ); @@ -1306,21 +1307,21 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm0,%%mm4 \n\t"// save to process hi half of input0 "movq %%mm2,%%mm5 \n\t"// save to process hi half of input1 - "punpcklbw noval,%%mm0 \n\t"// process inptr0 + "punpcklbw %7,%%mm0 \n\t"// process inptr0 "movq %%mm0,%%mm1 \n\t"// copy inptr0 "psllq $16,%%mm1 \n\t"// shift for first column special 
case i0[-1] - "pmullw mul9ws,%%mm0 \n\t"// multiply by special case constant - "pmullw mul3w,%%mm1 \n\t"// multiply input1 by 3 - "punpcklbw noval,%%mm2 \n\t"// process inptr1 + "pmullw %8,%%mm0 \n\t"// multiply by special case constant + "pmullw %9,%%mm1 \n\t"// multiply input1 by 3 + "punpcklbw %7,%%mm2 \n\t"// process inptr1 "movq %%mm2,%%mm3 \n\t"// copy inptr0 "psllq $16,%%mm3 \n\t"// shift for first column special case i1[-1] - "pmullw mul3ws,%%mm2 \n\t"// multiply by special case constant + "pmullw %16,%%mm2 \n\t"// multiply by special case constant "paddw %%mm0,%%mm1 \n\t"// Add up results for "movq %%mm1,(%%eax) \n\t" "movq %%mm1,%%mm6 \n\t"// with the next results "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) // Part 2 of the output - process lo data for o2 o4 o6 o8 @@ -1329,21 +1330,21 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm0,%%mm1 \n\t"// copy inptr0 for unpack "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "punpcklbw noval,%%mm0 \n\t"// process inptr1 + "punpcklbw %7,%%mm0 \n\t"// process inptr1 "psrlq $8,%%mm1 \n\t"// shift right for i0[1][2][3][4] - "punpcklbw noval,%%mm1 \n\t"// process inptr1 - "pmullw mul9w,%%mm0 \n\t"// multiply by nearest point constant - "pmullw mul3w,%%mm1 \n\t"// multiply by next nearest constant + "punpcklbw %7,%%mm1 \n\t"// process inptr1 + "pmullw %14,%%mm0 \n\t"// multiply by nearest point constant + "pmullw %9,%%mm1 \n\t"// multiply by next nearest constant - "punpcklbw noval,%%mm2 \n\t"// process inptr1 + "punpcklbw %7,%%mm2 \n\t"// process inptr1 "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] - "punpcklbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by next nearest constant + "punpcklbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by next nearest 
constant "paddw %%mm1,%%mm0 \n\t"// Add up results for final o2 o4 o6 o8 "movq %%mm0,8(%%eax) \n\t" "paddw %%mm3,%%mm0 \n\t"// previous results for o1 o3 o5 o7 - "paddw bias7w,%%mm0 \n\t"// Add odd bias + "paddw %11,%%mm0 \n\t"// Add odd bias "paddw %%mm2,%%mm0 \n\t"// output to be interleaved with the "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1365,23 +1366,23 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm5,%%mm2 \n\t"// get data from input row 1 "movq %%mm0,%%mm1 \n\t"// copy inptr0 for unpack "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "movq %%mm0, input0 \n\t" - "movq %%mm2, input1 \n\t" + "movq %%mm0, %12 \n\t" + "movq %%mm2, %13 \n\t" - "punpckhbw noval,%%mm0 \n\t"// process inptr1[0] + "punpckhbw %7,%%mm0 \n\t"// process inptr1[0] "psllq $8,%%mm1 \n\t"// shift for inptr0[-1] - "punpckhbw noval,%%mm1 \n\t"// process inptr1[1] - "pmullw mul9w,%%mm0 \n\t"// multiply by special case constant - "pmullw mul3w,%%mm1 \n\t"// multiply inptr1 by 3 - "punpckhbw noval,%%mm2 \n\t"// process inptr1[0] + "punpckhbw %7,%%mm1 \n\t"// process inptr1[1] + "pmullw %14,%%mm0 \n\t"// multiply by special case constant + "pmullw %9,%%mm1 \n\t"// multiply inptr1 by 3 + "punpckhbw %7,%%mm2 \n\t"// process inptr1[0] "psllq $8,%%mm3 \n\t"// shift for inptr1[-1] - "punpckhbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by special case constant + "punpckhbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by special case constant "paddw %%mm0,%%mm1 \n\t"// Add up results for "movq %%mm1,(%%eax) \n\t" "movq %%mm1,%%mm6 \n\t"// with the next results - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) @@ -1397,8 +1398,8 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm4,%%mm7 \n\t" "psllq $56,%%mm7 \n\t"// shift 
for MSB "paddb %%mm7,%%mm1 \n\t"// add in MSB from next input0 column - "punpckhbw noval,%%mm1 \n\t"// process inptr0 - "pmullw mul3w,%%mm1 \n\t"// multiply by next nearest constant + "punpckhbw %7,%%mm1 \n\t"// process inptr0 + "pmullw %9,%%mm1 \n\t"// multiply by next nearest constant "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] "movq 8(%%esi),%%mm5 \n\t"// need to add in a byte from the next column @@ -1407,11 +1408,11 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm5,%%mm7 \n\t" "psllq $56,%%mm7 \n\t"// shift for MSB "paddb %%mm7,%%mm3 \n\t"// add in MSB from next input1 column - "punpckhbw noval,%%mm3 \n\t"// process inptr1 + "punpckhbw %7,%%mm3 \n\t"// process inptr1 "paddw %%mm1,%%mm0 \n\t"// Add odd bias "movq %%mm0,8(%%eax) \n\t" - "paddw bias7w,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 + "paddw %11,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 "paddw %%mm3,%%mm0 \n\t"// output to be interleaved with the "paddw %%mm2,%%mm0 \n\t"// previous results for o1 o3 o5 o7 "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1427,28 +1428,28 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm4,%%mm0 \n\t"// get data from input row 0 "movq %%mm5,%%mm2 \n\t"// get data from input row 1 - "punpcklbw noval,%%mm0 \n\t"// process inptr0 + "punpcklbw %7,%%mm0 \n\t"// process inptr0 "movq %%mm0,%%mm1 \n\t"// copy inptr0 "psllq $16,%%mm1 \n\t"// shift for first column special case i0[-1] - "movq input0,%%mm7 \n\t" + "movq %12,%%mm7 \n\t" "psrlq $56,%%mm7 \n\t" "paddw %%mm7,%%mm1 \n\t" - "pmullw mul9w,%%mm0 \n\t"// multiply by special case constant - "pmullw mul3w,%%mm1 \n\t"// multiply input1 by 3 - "punpcklbw noval,%%mm2 \n\t"// process intr1 + "pmullw %14,%%mm0 \n\t"// multiply by special case constant + "pmullw %9,%%mm1 \n\t"// multiply input1 by 3 + "punpcklbw %7,%%mm2 \n\t"// process intr1 "movq %%mm2,%%mm3 \n\t"// copy inptr0 "psllq $16,%%mm3 \n\t"// shift for first column special case i1[-1] - "movq input1,%%mm7 \n\t" + 
"movq %13,%%mm7 \n\t" "psrlq $56,%%mm7 \n\t" "paddw %%mm7,%%mm3 \n\t" - "pmullw mul3w,%%mm2 \n\t"// multiply by special case constant + "pmullw %9,%%mm2 \n\t"// multiply by special case constant "paddw %%mm0,%%mm1 \n\t"// Add up results for "movq %%mm1,16(%%eax) \n\t" "movq %%mm1,%%mm6 \n\t"// with the next results - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) @@ -1458,16 +1459,16 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm5,%%mm3 \n\t"// copy inptr1 for unpack "psrlq $8,%%mm1 \n\t"// shift right for i0[1][2][3][4] - "punpcklbw noval,%%mm1 \n\t"// process inptr1 - "pmullw mul3w,%%mm1 \n\t"// multiply by next nearest constant + "punpcklbw %7,%%mm1 \n\t"// process inptr1 + "pmullw %9,%%mm1 \n\t"// multiply by next nearest constant "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] - "punpcklbw noval,%%mm3 \n\t"// process inptr1 + "punpcklbw %7,%%mm3 \n\t"// process inptr1 "paddw %%mm1,%%mm0 \n\t"// Add up results for final o2 o4 o6 o8 "movq %%mm0,24(%%eax) \n\t" "paddw %%mm3,%%mm0 \n\t"// previous results for o1 o3 o5 o7 - "paddw bias7w,%%mm0 \n\t"// Add odd bias + "paddw %11,%%mm0 \n\t"// Add odd bias "paddw %%mm2,%%mm0 \n\t"// output to be interleaved with the "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1491,21 +1492,21 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm0,%%mm1 \n\t"// copy inptr0 for unpack "movq %%mm5,%%mm3 \n\t"// copy inptr1 for unpack - "punpckhbw noval,%%mm0 \n\t"// process inptr1[0] + "punpckhbw %7,%%mm0 \n\t"// process inptr1[0] "psllq $8,%%mm1 \n\t"// shift for inptr0[-1] - "punpckhbw noval,%%mm1 \n\t"// process inptr1[1] - "pmullw mul9w,%%mm0 \n\t"// multiply by special case constant - "pmullw mul3w,%%mm1 \n\t"// multiply inptr1 by 3 + "punpckhbw %7,%%mm1 \n\t"// process inptr1[1] + "pmullw %14,%%mm0 
\n\t"// multiply by special case constant + "pmullw %9,%%mm1 \n\t"// multiply inptr1 by 3 // punpckhbw mm2, noval ;// process inptr1[0] "psllq $8,%%mm3 \n\t"// shift for inptr1[-1] - "punpckhbw noval,%%mm3 \n\t"// process inptr1 + "punpckhbw %7,%%mm3 \n\t"// process inptr1 // pmullw mm2, mul3w ;// multiply by special case constant "paddw %%mm0,%%mm1 \n\t"// Add up results for "movq %%mm1,(%%eax) \n\t" "movq %%mm1,%%mm6 \n\t"// with the next results - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) @@ -1517,20 +1518,20 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "psrlq $8,%%mm1 \n\t"// shift right for i0[1][2][3][4] // load next inptr0 to mm4 for future use - "pand mask1,%%mm4 \n\t" + "pand %15,%%mm4 \n\t" "paddb %%mm4,%%mm1 \n\t"// add in MSB from next input0 column - "punpckhbw noval,%%mm1 \n\t"// process inptr0 - "pmullw mul3w,%%mm1 \n\t"// multiply by next nearest constan + "punpckhbw %7,%%mm1 \n\t"// process inptr0 + "pmullw %9,%%mm1 \n\t"// multiply by next nearest constant "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] // load next inptr1 to mm5 for future use - "pand mask1,%%mm5 \n\t" + "pand %15,%%mm5 \n\t" "paddb %%mm5,%%mm3 \n\t"// add in MSB from next input1 column - "punpckhbw noval,%%mm3 \n\t"// process inptr1 + "punpckhbw %7,%%mm3 \n\t"// process inptr1 "paddw %%mm1,%%mm0 \n\t"// Add odd bias "movq %%mm0,8(%%eax) \n\t" - "paddw bias7w,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 + "paddw %11,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 "paddw %%mm3,%%mm0 \n\t"// output to be interleaved with the "paddw %%mm2,%%mm0 \n\t"// previous results for o1 o3 o5 o7 @@ -1554,30 +1555,30 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq (%%esi),%%mm2 \n\t"// get data from input row 1 "movq %%mm2,%%mm5 \n\t"// save to process hi half of input1 - "punpcklbw 
noval,%%mm2 \n\t"// process inptr1 + "punpcklbw %7,%%mm2 \n\t"// process inptr1 "movq %%mm2,%%mm3 \n\t"// copy inptr0 "psllq $16,%%mm3 \n\t"// shift for first column special case i1[-1] - "pmullw mul3ws,%%mm2 \n\t"// multiply by special case constant + "pmullw %16,%%mm2 \n\t"// multiply by special case constant "movq (%%eax),%%mm6 \n\t"// Add up results for "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) // Part 2 of the output - process lo data for o2 o4 o6 o8 "movq %%mm5,%%mm2 \n\t"// get data from input row 1 "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "punpcklbw noval,%%mm2 \n\t"// process inptr1 + "punpcklbw %7,%%mm2 \n\t"// process inptr1 "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] - "punpcklbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by next nearest constant + "punpcklbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by next nearest constant "movq 8(%%eax),%%mm0 \n\t"// Add up results for final o2 o4 o6 o8 "paddw %%mm3,%%mm0 \n\t"// previous results for o1 o3 o5 o7 - "paddw bias7w,%%mm0 \n\t"// Add odd bias + "paddw %11,%%mm0 \n\t"// Add odd bias "paddw %%mm2,%%mm0 \n\t"// output to be interleaved with the "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1600,13 +1601,13 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack "movq %%mm2,%%mm1 \n\t" - "punpckhbw noval,%%mm2 \n\t"// process inptr1[0] + "punpckhbw %7,%%mm2 \n\t"// process inptr1[0] "psllq $8,%%mm3 \n\t"// shift for inptr1[-1] - "punpckhbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by special case constant + "punpckhbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by special case constant "movq (%%eax),%%mm6 \n\t"// with the next 
results - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) @@ -1615,18 +1616,18 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm5,%%mm2 \n\t"// get data from input row 1 "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "punpckhbw noval,%%mm2 \n\t"// process inptr1 + "punpckhbw %7,%%mm2 \n\t"// process inptr1 "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] "movq 8(%%esi),%%mm5 \n\t"// need to add in a byte from the next column // load next inptr1 to mm5 for future use "movq %%mm5,%%mm7 \n\t" "psllq $56,%%mm7 \n\t"// shift for MSB "paddb %%mm7,%%mm3 \n\t"// add in MSB from next input1 column - "punpckhbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by next nearest constant + "punpckhbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by next nearest constant "movq 8(%%eax),%%mm0 \n\t"// Add odd bias - "paddw bias7w,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 + "paddw %11,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 "paddw %%mm3,%%mm0 \n\t"// output to be interleaved with the "paddw %%mm2,%%mm0 \n\t"// previous results for o1 o3 o5 o7 "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1640,17 +1641,17 @@ h2v2_fancy_upsample_mmx (j_decompress_pt // Part 1 of the output - process lo data for o1 o3 o5 o7 "movq %%mm5,%%mm2 \n\t"// get data from input row 1 - "punpcklbw noval,%%mm2 \n\t"// process inptr1 + "punpcklbw %7,%%mm2 \n\t"// process inptr1 "movq %%mm2,%%mm3 \n\t"// copy inptr0 "psllq $16,%%mm3 \n\t"// shift for first column special case i1[-1] "movq %%mm1,%%mm7 \n\t" "psrlq $56,%%mm7 \n\t" "paddw %%mm7,%%mm3 \n\t" - "pmullw mul3w,%%mm2 \n\t"// multiply by special case constant + "pmullw %9,%%mm2 \n\t"// multiply by special case constant "movq 16(%%eax),%%mm6 \n\t"// Add up results for - 
"paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) @@ -1659,14 +1660,14 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm5,%%mm2 \n\t"// get data from input row 1 "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "punpcklbw noval,%%mm2 \n\t"// process inptr1 + "punpcklbw %7,%%mm2 \n\t"// process inptr1 "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] - "punpcklbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by next nearest constant + "punpcklbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by next nearest constant "movq 24(%%eax),%%mm0 \n\t"// Add up results for final o2 o4 o6 o8 "paddw %%mm3,%%mm0 \n\t"// previous results for o1 o3 o5 o7 - "paddw bias7w,%%mm0 \n\t"// Add odd bias + "paddw %11,%%mm0 \n\t"// Add odd bias "paddw %%mm2,%%mm0 \n\t"// output to be interleaved with the "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1692,13 +1693,13 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq %%mm5,%%mm2 \n\t"// get data from input row 1 "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "punpckhbw noval,%%mm2 \n\t"// process inptr1[0] + "punpckhbw %7,%%mm2 \n\t"// process inptr1[0] "psllq $8,%%mm3 \n\t"// shift for inptr1[-1] - "punpckhbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by special case constant + "punpckhbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by special case constant "movq (%%eax),%%mm6 \n\t"// with the next results - "paddw bias8w,%%mm6 \n\t"// Add even bias + "paddw %10,%%mm6 \n\t"// Add even bias "paddw %%mm2,%%mm3 \n\t"// final o1 o3 o5 o7 "paddw %%mm3,%%mm6 \n\t"// output to be interleaved "psrlw $4,%%mm6 \n\t"// convert from word to byte (truncate) @@ -1707,16 +1708,16 @@ h2v2_fancy_upsample_mmx (j_decompress_pt "movq 
%%mm5,%%mm2 \n\t"// get data from input row 1 "movq %%mm2,%%mm3 \n\t"// copy inptr1 for unpack - "punpckhbw noval,%%mm2 \n\t"// process inptr1 + "punpckhbw %7,%%mm2 \n\t"// process inptr1 "psrlq $8,%%mm3 \n\t"// shift right for i1[1][2][3][4] // load next inptr1 to mm5 for future use - "pand mask1,%%mm5 \n\t" + "pand %15,%%mm5 \n\t" "paddb %%mm5,%%mm3 \n\t"// add in MSB from next input1 column - "punpckhbw noval,%%mm3 \n\t"// process inptr1 - "pmullw mul3w,%%mm2 \n\t"// multiply by next nearest constant + "punpckhbw %7,%%mm3 \n\t"// process inptr1 + "pmullw %9,%%mm2 \n\t"// multiply by next nearest constant "movq 8(%%eax),%%mm0 \n\t"// Add odd bias - "paddw bias7w,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 + "paddw %11,%%mm3 \n\t"// Add up results for final o2 o4 o6 o8 "paddw %%mm3,%%mm0 \n\t"// output to be interleaved with the "paddw %%mm2,%%mm0 \n\t"// previous results for o1 o3 o5 o7 "psrlw $4,%%mm0 \n\t"// convert back to byte (with truncate) @@ -1730,7 +1731,8 @@ h2v2_fancy_upsample_mmx (j_decompress_pt : // no output regs // %0 %1 %2 %3 %4 %5 : "m"(dsamp), "m"(inptr0), "m"(inptr1), "m"(outptr), "m"(save_val), "m"(inptr2), - "m"(outptr2) /* %6 */ + "m"(outptr2) /* %6 */, "m"(noval), "m"(mul9ws), "m"(mul3w), "m"(bias8w), + "m"(bias7w), "m"(input0), "m"(input1), "m"(mul9w), "m"(mask1), "m"(mul3ws) /* %16 */ : "eax", "ecx", "edx", "esi", "edi", "memory", "cc", "st" ); diff -urp jpeg-mmx-old/jidctfst.c jpeg-mmx/jidctfst.c --- jpeg-mmx-old/jidctfst.c 2005-02-18 17:42:40.000000000 +0100 +++ jpeg-mmx/jidctfst.c 2005-12-22 22:01:01.000000000 +0100 @@ -397,12 +397,12 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, #endif #define __int64 long long /* This won't work for Intel compilers - tell Gernot to help fixing ! 
*/ #define int16 short /* And this won't either */ -const __int64 _fix_141 = 0x5a825a825a825a82LL; -const __int64 _fix_184n261 = 0xcf04cf04cf04cf04LL; -const __int64 _fix_184 = 0x7641764176417641LL; -const __int64 _fix_n184 = 0x896f896f896f896fLL; -const __int64 _fix_108n184 = 0xcf04cf04cf04cf04LL; -const __int64 _const_0x0080 = 0x0080008000800080LL; +static const __int64 _fix_141 = 0x5a825a825a825a82LL; +static const __int64 _fix_184n261 = 0xcf04cf04cf04cf04LL; +static const __int64 _fix_184 = 0x7641764176417641LL; +static const __int64 _fix_n184 = 0x896f896f896f896fLL; +static const __int64 _fix_108n184 = 0xcf04cf04cf04cf04LL; +static const __int64 _const_0x0080 = 0x0080008000800080LL; __inline GLOBAL(void) jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr, @@ -1444,11 +1444,6 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci #if defined(HAVE_MMX_ATT_MNEMONICS) __asm__ ( - - "pushl %%ebx\n\t" - "movl %0, %%edi \n\t" - "movl %1, %%ebx \n\t" - "movl %2, %%esi \n\t" "addl $0x07,%%esi \n\t" //align wsptr to qword "andl $0xfffffff8,%%esi \n\t" //align wsptr to qword @@ -1457,20 +1452,20 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci /* Odd part */ - "movq 8*10(%%ebx),%%mm1 \n\t" //load inptr[DCTSIZE*5] + "movq 8*10(%1),%%mm1 \n\t" //load inptr[DCTSIZE*5] "pmullw 8*10(%%edi),%%mm1 \n\t" //tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); - "movq 8*6(%%ebx),%%mm0 \n\t" //load inptr[DCTSIZE*3] + "movq 8*6(%1),%%mm0 \n\t" //load inptr[DCTSIZE*3] "pmullw 8*6(%%edi),%%mm0 \n\t" //tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); - "movq 8*2(%%ebx),%%mm3 \n\t" //load inptr[DCTSIZE*1] + "movq 8*2(%1),%%mm3 \n\t" //load inptr[DCTSIZE*1] "movq %%mm1,%%mm2 \n\t" //copy tmp6 /* phase 6 */ "pmullw 8*2(%%edi),%%mm3 \n\t" //tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); - "movq 8*14(%%ebx),%%mm4 \n\t" //load inptr[DCTSIZE*1] + "movq 8*14(%1),%%mm4 \n\t" //load inptr[DCTSIZE*1] "paddw %%mm0,%%mm1 \n\t" //z13 = tmp6 + tmp5; "pmullw 
8*14(%%edi),%%mm4 \n\t" //tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); @@ -1479,10 +1474,10 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psllw $2,%%mm2 \n\t" //shift z10 "movq %%mm2,%%mm0 \n\t" //copy z10 - "pmulhw _fix_184n261,%%mm2 \n\t" //MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ + "pmulhw %5,%%mm2 \n\t" //MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ "movq %%mm3,%%mm5 \n\t" //copy tmp4 - "pmulhw _fix_n184,%%mm0 \n\t" //MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ + "pmulhw %6,%%mm0 \n\t" //MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ "paddw %%mm4,%%mm3 \n\t" //z11 = tmp4 + tmp7; "movq %%mm3,%%mm6 \n\t" //copy z11 /* phase 5 */ @@ -1491,18 +1486,18 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psubw %%mm1,%%mm6 \n\t" //z11-z13 "psllw $2,%%mm5 \n\t" //shift z12 - "movq 8*12(%%ebx),%%mm4 \n\t" //load inptr[DCTSIZE*6], even part + "movq 8*12(%1),%%mm4 \n\t" //load inptr[DCTSIZE*6], even part "movq %%mm5,%%mm7 \n\t" //copy z12 - "pmulhw _fix_108n184,%%mm5 \n\t" //MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part + "pmulhw %7,%%mm5 \n\t" //MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part "paddw %%mm1,%%mm3 \n\t" //tmp7 = z11 + z13; /* Even part */ - "pmulhw _fix_184,%%mm7 \n\t" //MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ + "pmulhw %8,%%mm7 \n\t" //MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ "psllw $2,%%mm6 \n\t" - "movq 8*4(%%ebx),%%mm1 \n\t" //load inptr[DCTSIZE*2] + "movq 8*4(%1),%%mm1 \n\t" //load inptr[DCTSIZE*2] "pmullw 8*4(%%edi),%%mm1 \n\t" //tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); "paddw %%mm5,%%mm0 \n\t" //tmp10 @@ -1510,7 +1505,7 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "pmullw 8*12(%%edi),%%mm4 \n\t" //tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); "paddw %%mm7,%%mm2 \n\t" //tmp12 - "pmulhw _fix_141,%%mm6 \n\t" //tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ + "pmulhw %9,%%mm6 \n\t" //tmp11 = MULTIPLY(z11 - 
z13, FIX_1_414213562); /* 2*c4 */ "psubw %%mm3,%%mm2 \n\t" //tmp6 = tmp12 - tmp7 "movq %%mm1,%%mm5 \n\t" //copy tmp1 @@ -1522,14 +1517,14 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "movq %%mm1,8*0(%%esi) \n\t" //save tmp13 in workspace "psllw $2,%%mm5 \n\t" //shift tmp1-tmp3 - "movq 8*0(%%ebx),%%mm7 \n\t" //load inptr[DCTSIZE*0] + "movq 8*0(%1),%%mm7 \n\t" //load inptr[DCTSIZE*0] - "pmulhw _fix_141,%%mm5 \n\t" //MULTIPLY(tmp1 - tmp3, FIX_1_414213562) + "pmulhw %9,%%mm5 \n\t" //MULTIPLY(tmp1 - tmp3, FIX_1_414213562) "paddw %%mm6,%%mm0 \n\t" //tmp4 = tmp10 + tmp5; "pmullw 8*0(%%edi),%%mm7 \n\t" //tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); - "movq 8*8(%%ebx),%%mm4 \n\t" //load inptr[DCTSIZE*4] + "movq 8*8(%1),%%mm4 \n\t" //load inptr[DCTSIZE*4] "pmullw 8*8(%%edi),%%mm4 \n\t" //tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); "psubw %%mm1,%%mm5 \n\t" //tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ @@ -1589,7 +1584,7 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci /*****************************************************************/ "addl $8,%%edi \n\t" - "addl $8,%%ebx \n\t" + "addl $8,%1 \n\t" "addl $8,%%esi \n\t" /*****************************************************************/ @@ -1597,20 +1592,20 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci - "movq 8*10(%%ebx),%%mm1 \n\t" //load inptr[DCTSIZE*5] + "movq 8*10(%1),%%mm1 \n\t" //load inptr[DCTSIZE*5] "pmullw 8*10(%%edi),%%mm1 \n\t" //tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); - "movq 8*6(%%ebx),%%mm0 \n\t" //load inptr[DCTSIZE*3] + "movq 8*6(%1),%%mm0 \n\t" //load inptr[DCTSIZE*3] "pmullw 8*6(%%edi),%%mm0 \n\t" //tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); - "movq 8*2(%%ebx),%%mm3 \n\t" //load inptr[DCTSIZE*1] + "movq 8*2(%1),%%mm3 \n\t" //load inptr[DCTSIZE*1] "movq %%mm1,%%mm2 \n\t" //copy tmp6 /* phase 6 */ "pmullw 8*2(%%edi),%%mm3 \n\t" //tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); - "movq 8*14(%%ebx),%%mm4 \n\t" //load 
inptr[DCTSIZE*1] + "movq 8*14(%1),%%mm4 \n\t" //load inptr[DCTSIZE*1] "paddw %%mm0,%%mm1 \n\t" //z13 = tmp6 + tmp5; "pmullw 8*14(%%edi),%%mm4 \n\t" //tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); @@ -1619,10 +1614,10 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psllw $2,%%mm2 \n\t" //shift z10 "movq %%mm2,%%mm0 \n\t" //copy z10 - "pmulhw _fix_184n261,%%mm2 \n\t" //MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ + "pmulhw %5,%%mm2 \n\t" //MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ "movq %%mm3,%%mm5 \n\t" //copy tmp4 - "pmulhw _fix_n184,%%mm0 \n\t" //MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ + "pmulhw %6,%%mm0 \n\t" //MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ "paddw %%mm4,%%mm3 \n\t" //z11 = tmp4 + tmp7; "movq %%mm3,%%mm6 \n\t" //copy z11 /* phase 5 */ @@ -1631,18 +1626,18 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psubw %%mm1,%%mm6 \n\t" //z11-z13 "psllw $2,%%mm5 \n\t" //shift z12 - "movq 8*12(%%ebx),%%mm4 \n\t" //load inptr[DCTSIZE*6], even part + "movq 8*12(%1),%%mm4 \n\t" //load inptr[DCTSIZE*6], even part "movq %%mm5,%%mm7 \n\t" //copy z12 - "pmulhw _fix_108n184,%%mm5 \n\t" //MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part + "pmulhw %7,%%mm5 \n\t" //MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part "paddw %%mm1,%%mm3 \n\t" //tmp7 = z11 + z13; /* Even part */ - "pmulhw _fix_184,%%mm7 \n\t" //MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ + "pmulhw %8,%%mm7 \n\t" //MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ "psllw $2,%%mm6 \n\t" - "movq 8*4(%%ebx),%%mm1 \n\t" //load inptr[DCTSIZE*2] + "movq 8*4(%1),%%mm1 \n\t" //load inptr[DCTSIZE*2] "pmullw 8*4(%%edi),%%mm1 \n\t" //tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); "paddw %%mm5,%%mm0 \n\t" //tmp10 @@ -1650,7 +1645,7 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "pmullw 8*12(%%edi),%%mm4 \n\t" //tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); "paddw %%mm7,%%mm2 \n\t" //tmp12 - "pmulhw 
_fix_141,%%mm6 \n\t" //tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ + "pmulhw %9,%%mm6 \n\t" //tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ "psubw %%mm3,%%mm2 \n\t" //tmp6 = tmp12 - tmp7 "movq %%mm1,%%mm5 \n\t" //copy tmp1 @@ -1662,14 +1657,14 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "movq %%mm1,8*0(%%esi) \n\t" //save tmp13 in workspace "psllw $2,%%mm5 \n\t" //shift tmp1-tmp3 - "movq 8*0(%%ebx),%%mm7 \n\t" //load inptr[DCTSIZE*0] + "movq 8*0(%1),%%mm7 \n\t" //load inptr[DCTSIZE*0] "paddw %%mm6,%%mm0 \n\t" //tmp4 = tmp10 + tmp5; - "pmulhw _fix_141,%%mm5 \n\t" //MULTIPLY(tmp1 - tmp3, FIX_1_414213562) + "pmulhw %9,%%mm5 \n\t" //MULTIPLY(tmp1 - tmp3, FIX_1_414213562) "pmullw 8*0(%%edi),%%mm7 \n\t" //tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); - "movq 8*8(%%ebx),%%mm4 \n\t" //load inptr[DCTSIZE*4] + "movq 8*8(%1),%%mm4 \n\t" //load inptr[DCTSIZE*4] "pmullw 8*8(%%edi),%%mm4 \n\t" //tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); "psubw %%mm1,%%mm5 \n\t" //tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ @@ -1812,7 +1807,7 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "punpckldq %%mm4,%%mm1 \n\t" //wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] "psllw $2,%%mm6 \n\t" - "pmulhw _fix_141,%%mm6 \n\t" + "pmulhw %9,%%mm6 \n\t" "punpckldq %%mm3,%%mm0 \n\t" //wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] "punpckhdq %%mm3,%%mm2 \n\t" //wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] @@ -1945,36 +1940,36 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psllw $2,%%mm0 \n\t" - "pmulhw _fix_141,%%mm1 \n\t" //tmp21 + "pmulhw %9,%%mm1 \n\t" //tmp21 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ "psllw $2,%%mm3 \n\t" "movq %%mm0,%%mm7 \n\t" - "pmulhw _fix_n184,%%mm7 \n\t" + "pmulhw %6,%%mm7 \n\t" "movq %%mm3,%%mm6 \n\t" "movq 8*0(%%esi),%%mm2 \n\t" //tmp0,final1 - "pmulhw _fix_108n184,%%mm6 \n\t" + "pmulhw %7,%%mm6 \n\t" // tmp22 = 
MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ "movq %%mm2,%%mm4 \n\t" //final1 - "pmulhw _fix_184n261,%%mm0 \n\t" + "pmulhw %5,%%mm0 \n\t" "paddw %%mm5,%%mm2 \n\t" //tmp0+tmp7,final1 - "pmulhw _fix_184,%%mm3 \n\t" + "pmulhw %8,%%mm3 \n\t" "psubw %%mm5,%%mm4 \n\t" //tmp0-tmp7,final1 // tmp6 = tmp22 - tmp7; /* phase 2 */ "psraw $5,%%mm2 \n\t" //outptr[0,0],[1,0],[2,0],[3,0],final1 - "paddsw _const_0x0080,%%mm2 \n\t" //final1 + "paddsw %10,%%mm2 \n\t" //final1 "paddw %%mm6,%%mm7 \n\t" //tmp20 "psraw $5,%%mm4 \n\t" //outptr[0,7],[1,7],[2,7],[3,7],final1 - "paddsw _const_0x0080,%%mm4 \n\t" //final1 + "paddsw %10,%%mm4 \n\t" //final1 "paddw %%mm0,%%mm3 \n\t" //tmp22 // tmp5 = tmp21 - tmp6; @@ -2003,10 +1998,10 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psubw %%mm3,%%mm6 \n\t" //tmp1-tmp6,final2 "psraw $5,%%mm0 \n\t" //outptr[0,1],[1,1],[2,1],[3,1] - "paddsw _const_0x0080,%%mm0 \n\t" + "paddsw %10,%%mm0 \n\t" "psraw $5,%%mm6 \n\t" //outptr[0,6],[1,6],[2,6],[3,6] - "paddsw _const_0x0080,%%mm6 \n\t" //need to check this value + "paddsw %10,%%mm6 \n\t" //need to check this value "packuswb %%mm4,%%mm0 \n\t" //out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] "movq 8*2(%%esi),%%mm5 \n\t" //tmp2,final3 @@ -2024,11 +2019,11 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psraw $5,%%mm5 \n\t" //outptr[0,2],[1,2],[2,2],[3,2] - "paddsw _const_0x0080,%%mm5 \n\t" + "paddsw %10,%%mm5 \n\t" "movq 8*3(%%esi),%%mm4 \n\t" //tmp3,final4 "psraw $5,%%mm3 \n\t" //outptr[0,5],[1,5],[2,5],[3,5] - "paddsw _const_0x0080,%%mm3 \n\t" + "paddsw %10,%%mm3 \n\t" // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) @@ -2042,10 +2037,10 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psraw $5,%%mm4 \n\t" //outptr[0,4],[1,4],[2,4],[3,4] "movl (%%eax),%%ecx \n\t" - "paddsw _const_0x0080,%%mm4 \n\t" + "paddsw %10,%%mm4 \n\t" "psraw $5,%%mm6 \n\t" //outptr[0,3],[1,3],[2,3],[3,3] - "paddsw _const_0x0080,%%mm6 \n\t" + "paddsw 
%10,%%mm6 \n\t" "packuswb %%mm4,%%mm5 \n\t" //out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] "packuswb %%mm3,%%mm6 \n\t" //out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] @@ -2071,13 +2066,13 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "movq %%mm2,%%mm0 \n\t" "punpcklwd %%mm4,%%mm6 \n\t" //out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] - "movl (%%eax),%%ebx \n\t" + "movl (%%eax),%1 \n\t" "punpckldq %%mm6,%%mm2 \n\t" //out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] "addl $4,%%eax \n\t" "movq %%mm1,%%mm3 \n\t" - "addl %4, %%ebx \n\t" + "addl %4, %1 \n\t" "punpckhwd %%mm4,%%mm7 \n\t" //out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] "movq %%mm2,(%%ecx) \n\t" @@ -2087,17 +2082,17 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "addl $4,%%eax \n\t" "addl %4, %%ecx \n\t" - "movq %%mm0,(%%ebx) \n\t" + "movq %%mm0,(%1) \n\t" "punpckldq %%mm7,%%mm1 \n\t" //out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] - "movl (%%eax),%%ebx \n\t" + "movl (%%eax),%1 \n\t" - "addl %4, %%ebx \n\t" + "addl %4, %1 \n\t" "punpckhdq %%mm7,%%mm3 \n\t" //out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] "movq %%mm1,(%%ecx) \n\t" - "movq %%mm3,(%%ebx) \n\t" + "movq %%mm3,(%1) \n\t" @@ -2181,7 +2176,7 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "punpckldq %%mm4,%%mm1 \n\t" //wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] "psllw $2,%%mm6 \n\t" - "pmulhw _fix_141,%%mm6 \n\t" + "pmulhw %9,%%mm6 \n\t" "punpckldq %%mm3,%%mm0 \n\t" //wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] "punpckhdq %%mm3,%%mm2 \n\t" //wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] @@ -2314,36 +2309,36 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psllw $2,%%mm0 \n\t" - "pmulhw _fix_141,%%mm1 \n\t" //tmp21 + "pmulhw %9,%%mm1 \n\t" //tmp21 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ "psllw $2,%%mm3 \n\t" "movq %%mm0,%%mm7 \n\t" - "pmulhw _fix_n184,%%mm7 \n\t" + "pmulhw %6,%%mm7 \n\t" "movq %%mm3,%%mm6 \n\t" "movq 8*0(%%esi),%%mm2 \n\t" 
//tmp0,final1 - "pmulhw _fix_108n184,%%mm6 \n\t" + "pmulhw %7,%%mm6 \n\t" // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ "movq %%mm2,%%mm4 \n\t" //final1 - "pmulhw _fix_184n261,%%mm0 \n\t" + "pmulhw %5,%%mm0 \n\t" "paddw %%mm5,%%mm2 \n\t" //tmp0+tmp7,final1 - "pmulhw _fix_184,%%mm3 \n\t" + "pmulhw %8,%%mm3 \n\t" "psubw %%mm5,%%mm4 \n\t" //tmp0-tmp7,final1 // tmp6 = tmp22 - tmp7; /* phase 2 */ "psraw $5,%%mm2 \n\t" //outptr[0,0],[1,0],[2,0],[3,0],final1 - "paddsw _const_0x0080,%%mm2 \n\t" //final1 + "paddsw %10,%%mm2 \n\t" //final1 "paddw %%mm6,%%mm7 \n\t" //tmp20 "psraw $5,%%mm4 \n\t" //outptr[0,7],[1,7],[2,7],[3,7],final1 - "paddsw _const_0x0080,%%mm4 \n\t" //final1 + "paddsw %10,%%mm4 \n\t" //final1 "paddw %%mm0,%%mm3 \n\t" //tmp22 // tmp5 = tmp21 - tmp6; @@ -2372,10 +2367,10 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psubw %%mm3,%%mm6 \n\t" //tmp1-tmp6,final2 "psraw $5,%%mm0 \n\t" //outptr[0,1],[1,1],[2,1],[3,1] - "paddsw _const_0x0080,%%mm0 \n\t" + "paddsw %10,%%mm0 \n\t" "psraw $5,%%mm6 \n\t" //outptr[0,6],[1,6],[2,6],[3,6] - "paddsw _const_0x0080,%%mm6 \n\t" //need to check this value + "paddsw %10,%%mm6 \n\t" //need to check this value "packuswb %%mm4,%%mm0 \n\t" //out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] "movq 8*2(%%esi),%%mm5 \n\t" //tmp2,final3 @@ -2393,11 +2388,11 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psraw $5,%%mm5 \n\t" //outptr[0,2],[1,2],[2,2],[3,2] - "paddsw _const_0x0080,%%mm5 \n\t" + "paddsw %10,%%mm5 \n\t" "movq 8*3(%%esi),%%mm4 \n\t" //tmp3,final4 "psraw $5,%%mm3 \n\t" //outptr[0,5],[1,5],[2,5],[3,5] - "paddsw _const_0x0080,%%mm3 \n\t" + "paddsw %10,%%mm3 \n\t" // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) @@ -2411,10 +2406,10 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "psraw $5,%%mm4 \n\t" //outptr[0,4],[1,4],[2,4],[3,4] "movl (%%eax),%%ecx \n\t" - "paddsw _const_0x0080,%%mm4 \n\t" + "paddsw %10,%%mm4 \n\t" "psraw $5,%%mm6 \n\t" 
//outptr[0,3],[1,3],[2,3],[3,3] - "paddsw _const_0x0080,%%mm6 \n\t" + "paddsw %10,%%mm6 \n\t" "packuswb %%mm4,%%mm5 \n\t" //out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] "packuswb %%mm3,%%mm6 \n\t" //out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] @@ -2440,13 +2435,13 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "movq %%mm2,%%mm0 \n\t" "punpcklwd %%mm4,%%mm6 \n\t" //out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] - "movl (%%eax),%%ebx \n\t" + "movl (%%eax),%1 \n\t" "punpckldq %%mm6,%%mm2 \n\t" //out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] "addl $4,%%eax \n\t" "movq %%mm1,%%mm3 \n\t" - "addl %4, %%ebx \n\t" + "addl %4, %1 \n\t" "punpckhwd %%mm4,%%mm7 \n\t" //out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] "movq %%mm2,(%%ecx) \n\t" @@ -2456,25 +2451,25 @@ jpeg_idct_ifast_mmx (j_decompress_ptr ci "addl $4,%%eax \n\t" "addl %4, %%ecx \n\t" - "movq %%mm0,(%%ebx) \n\t" + "movq %%mm0,(%1) \n\t" "punpckldq %%mm7,%%mm1 \n\t" //out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] - "movl (%%eax),%%ebx \n\t" + "movl (%%eax),%1 \n\t" - "addl %4, %%ebx \n\t" + "addl %4, %1 \n\t" "punpckhdq %%mm7,%%mm3 \n\t" //out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] "movq %%mm1,(%%ecx) \n\t" - "movq %%mm3,(%%ebx) \n\t" + "movq %%mm3,(%1) \n\t" "emms \n\t" - "popl %%ebx\n\t" - : // no output regs + : "+D"(quantptr), "+d"(inptr), "+S"(wsptr) // %0 %1 %2 %3 %4 - : "m"(quantptr), "m"(inptr), "m"(wsptr), "m"(outptr), "m"(output_col) - - : "eax", "ecx", "edx", "esi", "edi", "memory", "cc", "st" + : "m"(outptr), "m"(output_col), + "m"(_fix_184n261), "m"(_fix_n184), "m"(_fix_108n184), "m"(_fix_184), + "m"(_fix_141), "m"(_const_0x0080) + : "eax", "ecx", "memory", "cc", "st" ); #endif diff -urp jpeg-mmx-old/jidctint.c jpeg-mmx/jidctint.c --- jpeg-mmx-old/jidctint.c 2005-02-27 10:25:52.000000000 +0100 +++ jpeg-mmx/jidctint.c 2005-10-29 23:04:11.000000000 +0200 @@ -2837,6 +2837,7 @@ __inline void domidct8x8llmW(short *inpt 
/************************************************************************/ "emms \n\t" + "popl %%ebx \n\t" : // %0 %1 %2 %3 %4 @@ -2857,7 +2858,7 @@ __inline void domidct8x8llmW(short *inpt "m"(fix_n196p307n256), "m"(fix_054p076), "m"(fix_054), "m"(fix_054n184), // %28 "m"(const_0x0808) - : "eax", "ebx", "ecx", "edx", "edi", "esi", "cc", "memory", "st" + : "eax", "ecx", "edx", "edi", "esi", "cc", "memory", "st" ); #endif /* ATT style assembler */ } diff -urp jpeg-mmx-old/quant_mmx.s jpeg-mmx/quant_mmx.s --- jpeg-mmx-old/quant_mmx.s 2003-08-02 06:31:28.000000000 +0200 +++ jpeg-mmx/quant_mmx.s 2005-12-22 22:39:41.000000000 +0100 @@ -135,5 +135,5 @@ return: emms ; clear mmx registers ret - +section .note.GNU-stack noalloc noexec nowrite progbits