/* * Combine src and mask */ #include #include #include //#include "fbpict.h" typedef uint32_t CARD32; typedef uint16_t CARD16; #define FASTCALL #define AVV(x...) {x} #define Alpha(x) ((x) >> 24) #define Red(x) (((x) >> 16) & 0xff) #define Green(x) (((x) >> 8) & 0xff) #define Blue(x) ((x) & 0xff) /* x_c = (x_c * a) / 255 */ #define FbByteMul(x, a) do { \ CARD32 t = ((x & 0xff00ff) * a) + 0x800080; \ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \ t &= 0xff00ff; \ \ x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \ x = (x + ((x >> 8) & 0xff00ff)); \ x &= 0xff00ff00; \ x += t; \ } while (0) #define FbByteAdd(x, y) do { \ CARD32 t; \ CARD32 r = (x & 0xff00ff) + (y & 0xff00ff); \ r |= 0x1000100 - ((r >> 8) & 0xff00ff); \ r &= 0xff00ff; \ \ t = ((x >> 8) & 0xff00ff) + ((y >> 8) & 0xff00ff); \ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \ r |= (t & 0xff00ff) << 8; \ x = r; \ } while (0) #define FbByteMulAdd(x, a, y) do { \ CARD32 t = ((x & 0xff00ff) * a) + 0x800080; \ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \ t &= 0xff00ff; \ t += y & 0xff00ff; \ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \ t &= 0xff00ff; \ \ x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \ x = (x + ((x >> 8) & 0xff00ff)) >> 8; \ x &= 0xff00ff; \ x += (y >> 8) & 0xff00ff; \ x |= 0x1000100 - ((t >> 8) & 0xff00ff); \ x &= 0xff00ff; \ x <<= 8; \ x += t; \ } while (0) #define FbByteAddMul(x, a, y, b) do { \ CARD32 t; \ CARD32 r = (x >> 24) * a + (y >> 24) * b + 0x80; \ r += (r >> 8); \ r >>= 8; \ \ t = (x & 0xff00) * a + (y & 0xff00) * b + 0x8000; \ t += (t >> 8); \ t >>= 16; \ \ t |= r << 16; \ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \ t &= 0xff00ff; \ t <<= 8; \ \ r = ((x >> 16) & 0xff) * a + ((y >> 16) & 0xff) * b + 0x80; \ r += (r >> 8); \ r >>= 8; \ \ x = (x & 0xff) * a + (y & 0xff) * b + 0x80; \ x += (x >> 8); \ x >>= 8; \ x |= r << 16; \ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \ x &= 0xff00ff; \ x |= t; \ } while (0) #define FbByteAddMul_256(x, a, y, b) do { \ CARD32 t = (x & 0xff00ff) * a + (y & 0xff00ff) * b; \ t >>= 8; \ t &= 0xff00ff; \ \ x = ((x >> 8) & 0xff00ff) * a + ((y >> 8) & 0xff00ff) * b; \ x &= 0xff00ff00; \ x += t; \ } while (0) static __inline__ vector unsigned int splat_alpha(vector unsigned int pix) { return vec_perm(pix, pix, (vector unsigned char)AVV(0x00,0x00,0x00,0x00, 0x04,0x04,0x04,0x04, 0x08,0x08,0x08,0x08, 0x0C,0x0C,0x0C,0x0C)); } static __inline__ vector unsigned int pix_multiply(vector unsigned int p, vector unsigned int a) { vector unsigned short hi, lo, mod; /* unpack to short */ hi = (vector unsigned short) vec_mergeh((vector unsigned char)AVV(0), (vector unsigned char)p); mod = (vector unsigned short) vec_mergeh((vector unsigned char)AVV(0), (vector unsigned char)a); hi = vec_mladd(hi, mod, (vector unsigned short) //AVV(0)); // hi = vec_adds(hi, (vector unsigned short) AVV(0x0080,0x0080,0x0080,0x0080, 0x0080,0x0080,0x0080,0x0080)); hi = vec_adds(hi, vec_sr(hi, vec_splat_u16(8))); hi = vec_sr(hi, vec_splat_u16(8)); /* unpack to short */ lo = (vector unsigned short) vec_mergel((vector unsigned char)AVV(0), (vector unsigned char)p); mod = (vector unsigned short) vec_mergel((vector unsigned char)AVV(0), (vector unsigned char)a); lo = vec_mladd(lo, mod, (vector unsigned short) //AVV(0)); // lo = vec_adds(lo, (vector unsigned short) AVV(0x0080,0x0080,0x0080,0x0080, 0x0080,0x0080,0x0080,0x0080)); lo = vec_adds(lo, vec_sr(lo, vec_splat_u16(8))); lo = vec_sr(lo, vec_splat_u16(8)); return vec_packsu(hi, lo); } static __inline__ vector unsigned int pix_add (vector unsigned int a, vector unsigned int b) { return vec_adds ((vector unsigned char)a, (vector unsigned char)b); } static __inline__ vector unsigned int pix_add_mul (vector unsigned int x, vector unsigned int a, vector unsigned int y, vector unsigned int b) { vector unsigned short hi, lo, mod, hiy, loy, mody; hi = (vector unsigned short) vec_mergeh((vector unsigned char)AVV(0), (vector unsigned char)x); mod = (vector unsigned short) vec_mergeh((vector unsigned char)AVV(0), (vector unsigned char)a); hiy = (vector unsigned short) vec_mergeh((vector unsigned char)AVV(0), (vector unsigned char)y); mody = (vector unsigned short) vec_mergeh((vector unsigned char)AVV(0), (vector unsigned char)b); hi = vec_mladd(hi, mod, (vector unsigned short) AVV(0x0080,0x0080,0x0080,0x0080, 0x0080,0x0080,0x0080,0x0080)); hi = vec_mladd(hiy, mody, hi); hi = vec_adds(hi, vec_sr(hi, vec_splat_u16(8))); hi = vec_sr(hi, vec_splat_u16(8)); lo = (vector unsigned short) vec_mergel((vector unsigned char)AVV(0), (vector unsigned char)x); mod = (vector unsigned short) vec_mergel((vector unsigned char)AVV(0), (vector unsigned char)a); loy = (vector unsigned short) vec_mergel((vector unsigned char)AVV(0), (vector unsigned char)y); mody = (vector unsigned short) vec_mergel((vector unsigned char)AVV(0), (vector unsigned char)b); lo = vec_mladd(lo, mod, (vector unsigned short) AVV(0x0080,0x0080,0x0080,0x0080, 0x0080,0x0080,0x0080,0x0080)); lo = vec_mladd(loy, mody, lo); lo = vec_adds(lo, vec_sr(lo, vec_splat_u16(8))); lo = vec_sr(lo, vec_splat_u16(8)); return vec_packsu(hi, lo); } static __inline__ vector unsigned int negate (vector unsigned int src) { return vec_nor (src, src); } static __inline__ vector unsigned int over (vector unsigned int src, vector unsigned int srca, vector unsigned int dest) { vector unsigned char tmp = pix_multiply(dest, negate (srca)); tmp = vec_adds((vector unsigned char)src, tmp); return tmp; } static FASTCALL void vmxCombineMaskU (CARD32 *src, const CARD32 *msk, int width) { int i; vector unsigned int vsrc, vmsk; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, src_mask, msk_mask, store_mask; src_mask = vec_lvsl(0, src); msk_mask = vec_lvsl(0, msk); store_mask = vec_lvsr(0, src); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, msk); tmp2 = vec_ld(15, msk); tmp3 = vec_ld(0, src); vmsk = vec_perm(tmp1, tmp2, msk_mask); tmp4 = vec_ld(15, src); vsrc = vec_perm(tmp3, tmp4, src_mask); vmsk = splat_alpha(vmsk); vsrc = pix_multiply(vsrc, vmsk); edges = vec_perm(tmp4, tmp3, src_mask); tmp3 = vec_perm((vector unsigned char)vsrc, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vsrc, store_mask); vec_st((vector unsigned int) tmp3, 15, src ); vec_st((vector unsigned int) tmp1, 0, src ); msk+=4; src+=4; } for (i = width%4; --i >= 0;) { CARD32 a = msk[i] >> 24; CARD32 s = src[i]; FbByteMul(s, a); src[i] = s; } } static FASTCALL void vmxCombineOverU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(vsrc); vdest = over(vsrc, tmp1, vdest); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 ia = Alpha(~s); FbByteMulAdd(d, ia, s); dest[i] = d; } } static FASTCALL void vmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { // unaligned load tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(vdest); vdest = over(vdest, tmp1, vsrc); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 ia = Alpha(~s); FbByteMulAdd(d, ia, s); dest[i] = d; } } static FASTCALL void vmxCombineInU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(vdest); vdest = pix_multiply(vsrc, tmp1); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 a = Alpha(dest[i]); FbByteMul(s, a); dest[i] = s; } } static FASTCALL void vmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(vsrc); vdest = pix_multiply(vdest, tmp1); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = dest[i]; CARD32 a = Alpha(src[i]); FbByteMul(s, a); dest[i] = s; } } static FASTCALL void vmxCombineOutU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(negate(vdest)); vdest = pix_multiply(vsrc, tmp1); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 a = Alpha(~dest[i]); FbByteMul(s, a); dest[i] = s; } } static FASTCALL void vmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(negate(vsrc)); vdest = pix_multiply(vdest, tmp1); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 a = Alpha(~dest[i]); FbByteMul(s, a); dest[i] = s; } } static FASTCALL void vmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(negate(vsrc)); vdest = pix_add_mul(vsrc, splat_alpha(vdest), vdest, tmp1); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 dest_a = Alpha(d); CARD32 src_ia = Alpha(~s); FbByteAddMul(s, dest_a, d, src_ia); dest[i] = s; } } static FASTCALL void vmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); tmp1 = splat_alpha(negate(vdest)); vdest = pix_add_mul(vdest, splat_alpha(vsrc), vsrc, tmp1); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 src_a = Alpha(s); CARD32 dest_ia = Alpha(~d); FbByteAddMul(s, dest_ia, d, src_a); dest[i] = s; } } static FASTCALL void vmxCombineXorU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); vdest = pix_add_mul(vsrc, splat_alpha(negate(vdest)), vdest, splat_alpha(negate(vsrc))); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 src_ia = Alpha(~s); CARD32 dest_ia = Alpha(~d); FbByteAddMul(s, dest_ia, d, src_ia); dest[i] = s; } } static FASTCALL void vmxCombineAddU (CARD32 *dest, const CARD32 *src, int width) { int i; vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, dest_mask, src_mask, store_mask; dest_mask = vec_lvsl(0, dest); src_mask = vec_lvsl(0, src); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, src); tmp2 = vec_ld(15, src); tmp3 = vec_ld(0, dest); vsrc = vec_perm(tmp1, tmp2, src_mask); tmp4 = vec_ld(15, dest); vdest = vec_perm(tmp3, tmp4, dest_mask); vdest = pix_add(vsrc, vdest); edges = vec_perm(tmp4, tmp3, dest_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, dest ); vec_st((vector unsigned int) tmp1, 0, dest ); src+=4; dest+=4; } for (i = width%4; --i >=0;) { CARD32 s = src[i]; CARD32 d = dest[i]; FbByteAdd(d, s); dest[i] = d; } } //static FASTCALL void //vmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) //unimplemented #if 0 static FASTCALL void vmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) { int i; vector unsigned int vsrc, vmsk, vdest; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, src_mask, msk_mask, dest_mask, store_mask; src_mask = vec_lvsl(0, src); dest_mask = vec_lvsl(0, dest); msk_mask = vec_lvsl(0, msk); store_mask = vec_lvsr(0, dest); for (i = width/4; i > 0; i--) { tmp1 = vec_ld(0, msk); tmp2 = vec_ld(15, msk); tmp3 = vec_ld(0, src); vmsk = vec_perm(tmp1, tmp2, msk_mask); tmp4 = vec_ld(15, src); vsrc = vec_perm(tmp3, tmp4, src_mask); tmp1 = vec_ld(0, dest); tmp2 = vec_ld(15, dest); vdest = pix_multiply(vsrc, vmsk); edges = vec_perm(tmp2, tmp1, src_mask); tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask); tmp4 = vec_perm(edges, (vector unsigned char)vdest, store_mask); vec_st((vector unsigned int) tmp3, 15, src ); vec_st((vector unsigned int) tmp4, 0, src ); msk+=4; src+=4; dest+=4; } fbCombineSrcC (dest, src, mask, width%4); } #endif static void fbCombineMaskU (CARD32 *src, const CARD32 *mask, int width) { int i; for (i = 0; i < width; ++i) { CARD32 a = mask[i] >> 24; CARD32 s = src[i]; FbByteMul(s, a); src[i] = s; } } static void fbCombineOverU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 ia = Alpha(~s); FbByteMulAdd(d, ia, s); dest[i] = d; } } static void fbCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 ia = Alpha(~dest[i]); FbByteMulAdd(s, ia, d); dest[i] = s; } } static void fbCombineInU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 a = Alpha(dest[i]); FbByteMul(s, a); dest[i] = s; } } static void fbCombineOutU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 a = Alpha(~dest[i]); FbByteMul(s, a); dest[i] = s; } } static FASTCALL void fbCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 d = dest[i]; CARD32 a = Alpha(~src[i]); FbByteMul(d, a); dest[i] = d; } } static FASTCALL void fbCombineAtopU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 dest_a = Alpha(d); CARD32 src_ia = Alpha(~s); FbByteAddMul(s, dest_a, d, src_ia); dest[i] = s; } } static FASTCALL void fbCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 src_a = Alpha(s); CARD32 dest_ia = Alpha(~d); FbByteAddMul(s, dest_ia, d, src_a); dest[i] = s; } } static FASTCALL void fbCombineXorU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD32 src_ia = Alpha(~s); CARD32 dest_ia = Alpha(~d); FbByteAddMul(s, dest_ia, d, src_ia); dest[i] = s; } } static FASTCALL void fbCombineAddU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; FbByteAdd(d, s); dest[i] = d; } } static FASTCALL void fbCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) { int i; for (i = 0; i < width; ++i) { CARD32 s = src[i]; CARD32 d = dest[i]; CARD16 sa, da; sa = s >> 24; da = ~d >> 24; if (sa > da) { // sa = FbIntDiv(da, sa); // FbByteMul(s, sa); } FbByteAdd(d, s); dest[i] = d; } } int main (int argc, char** argv) { int data[3][10245]; __attribute__((aligned(16))); int i; //input data for (i = 0; i<10240; i++) data[0][i] = data[1][i] = (i&0xff) * 0x01010101; // for (i = 0; i<10240; i++) data[2][i] = (i&0xff) * 0x01010101; for (i = 0; i<10240; i++) if (data[0][i] != data[1][i]) { printf ("wrong byte %d : %d != %d\n",i , data[0][i], data[1][i]); } printf ("combine \n"); fbCombineAddU (data[0], data[2], 1024); vmxCombineAddU (data[1], data[2],1024); for (i = 0; i<10240; i++) if (data[0][i] != data[1][i]) { printf ("wrong byte %0d : %0x != %0x\n",i , data[0][i], data[1][i]); } return 0; }