/*
 * Written and Copyright (C) 2006 by Luca Barbato
 *
 * AltiVec optimization for 15bit and 32bit pixels; other depths pending.
 * Contact me for questions or suggestions.
 *
 * Permission is hereby granted to Michael Jennings to license this code as
 * he sees fit.  I'd prefer the GPL, but he will choose the BSD.  The debate
 * is moot, as this is to become a part of the Eterm project, for which he
 * is the primary author.  For users of this code I ask that any
 * modifications be released back into the community, but if Michael
 * Jennings chooses the BSD license then that request has no backing in law.
 */

#include "cmod.h"
#include <stdio.h>      /* printf */

#ifdef ETERM_ALTIVEC

#include <stdint.h>     /* intptr_t, uintptr_t */
#include <altivec.h>    /* AltiVec intrinsics */

#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

#define DATA16 unsigned short

/* Beware: the variables used by these macros (tmp1, tmp2, mask, edges,
 * a, b, ev, od, hi, lo, mod) have to be declared in the enclosing block! */

#define ALTIVEC_UNALIGNED_LOAD(source, target) \
    /* load two vectors */ \
    tmp1 = vec_ld(0, source); \
    tmp2 = vec_ld(15, source); \
    \
    /* compute the mask to align */ \
    mask = vec_lvsl(0, source); \
    \
    target = vec_perm(tmp1, tmp2, mask);

#define ALTIVEC_UNALIGNED_STORE(source, target) \
    /* get the unaligned vector's edges (the mask comes from the load) */ \
    edges = vec_perm(tmp2, tmp1, mask); \
    \
    mask = vec_lvsr(0, target); \
    \
    tmp1 = vec_perm(edges, (vector unsigned char) source, mask); \
    \
    tmp2 = vec_perm((vector unsigned char) source, edges, mask); \
    /* vec_stl? */ \
    vec_st(tmp2, 15, target); \
    \
    vec_st(tmp1, 0, target);

#define SHADE15_ALTIVEC_NONSAT(p) \
    /* unpack to byte */ \
    a = vec_unpackh(p); \
    \
    /* multiply even */ \
    ev = vec_sr(vec_mule(a, mod), vec_splat_u16(5)); \
    \
    /* multiply odd */ \
    od = vec_sr(vec_mulo(a, mod), vec_splat_u16(5)); \
    \
    /* pack */ \
    a = vec_pack(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to byte */ \
    b = vec_unpackl(p); \
    \
    /* multiply even */ \
    ev = vec_sr(vec_mule(b, mod), vec_splat_u16(5)); \
    \
    /* multiply odd */ \
    od = vec_sr(vec_mulo(b, mod), vec_splat_u16(5)); \
    \
    /* pack */ \
    b = vec_pack(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to pixel */ \
    p = vec_packpx((vector unsigned int) a, \
                   (vector unsigned int) b);

#define SHADE15_ALTIVEC_SAT(p) \
    /* unpack to byte */ \
    a = vec_unpackh(p); \
    \
    /* unpack to short */ \
    hi = (vector unsigned short) vec_unpackh((vector signed char) a); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(hi, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(hi, mod), vec_splat_u32(5)); \
    \
    /* pack to short */ \
    hi = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to short */ \
    lo = (vector unsigned short) vec_unpackl((vector signed char) a); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(lo, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(lo, mod), vec_splat_u32(5)); \
    \
    /* pack to short */ \
    lo = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to byte */ \
    a = vec_packsu(hi, lo); \
    \
    /* unpack to byte */ \
    b = vec_unpackl(p); \
    \
    /* unpack to short */ \
    hi = (vector unsigned short) vec_unpackh((vector signed char) b); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(hi, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(hi, mod), vec_splat_u32(5)); \
    \
    hi = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to short */ \
    lo = (vector unsigned short) vec_unpackl((vector signed char) b); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(lo, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(lo, mod), vec_splat_u32(5)); \
    \
    lo = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to byte */ \
    b = vec_packsu(hi, lo); \
    \
    /* pack to pixel */ \
    p = vec_packpx((vector unsigned int) a, \
                   (vector unsigned int) b);

#define SHADE32_ALTIVEC_NONSAT(p) \
    /* multiply even */ \
    ev = vec_sr(vec_mule(p, mod), vec_splat_u16(8)); \
    \
    /* multiply odd */ \
    od = vec_sr(vec_mulo(p, mod), vec_splat_u16(8)); \
    \
    /* pack */ \
    p = vec_pack(vec_mergeh(ev, od), vec_mergel(ev, od));
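/*
 * For reference, a minimal scalar sketch of what SHADE32_ALTIVEC_NONSAT
 * computes: every byte lane is multiplied by the matching modifier byte
 * and shifted right by 8, exactly like the scalar leftover loops further
 * down.  vec_mule/vec_mulo produce the even and odd 16bit products, and
 * the mergeh/mergel + pack sequence reinterleaves them back into bytes.
 * The function below is a hypothetical illustration, not part of this
 * file's API, and is therefore left disabled.
 */
#if 0
static void
shade32_scalar_reference(unsigned char *px, const unsigned char mod[4], int npixels)
{
    int i;

    for (i = 0; i < npixels * 4; i++) {
        /* (value * modifier) >> 8, the same scaling the vector code does */
        px[i] = (unsigned char) ((px[i] * mod[i % 4]) >> 8);
    }
}
#endif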
#define SHADE32_ALTIVEC_SAT(p) \
    /* unpack to short */ \
    hi = (vector unsigned short) \
         vec_mergeh((vector unsigned char) AVV(0), \
                    (vector unsigned char) p); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(hi, mod), vec_splat_u32(8)); \
    od = vec_sr(vec_mulo(hi, mod), vec_splat_u32(8)); \
    \
    /* pack to short */ \
    hi = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to short */ \
    lo = (vector unsigned short) \
         vec_mergel((vector unsigned char) AVV(0), \
                    (vector unsigned char) p); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(lo, mod), vec_splat_u32(8)); \
    od = vec_sr(vec_mulo(lo, mod), vec_splat_u32(8)); \
    \
    /* pack to short */ \
    lo = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to byte */ \
    p = vec_packsu(hi, lo);

/* Scalar fallbacks: placeholders only, nothing uses them yet. */
#define SHADE15_SCALAR_NONSAT(p)
#define SHADE15_SCALAR_SAT(p)
#define SHADE32_SCALAR_NONSAT(p)
#define SHADE32_SCALAR_SAT(p)

/* RGB 15 */
void
shade_ximage_15_altivec(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    union {
        unsigned short tmp_s[8] __attribute__((aligned(16)));
        unsigned char tmp_c[16] __attribute__((aligned(16)));
        vector signed short vec;
    } argb;
    int x, y;

    /* splat the *m modifiers to a full vector */
    if ((rm <= 255) && (gm <= 255) && (bm <= 255)) {
        /* No saturation */
        vector unsigned char mod;

        printf("no saturation ");
        argb.tmp_c[0] = 1;      /* alpha */
        argb.tmp_c[1] = rm;     /* red */
        argb.tmp_c[2] = gm;     /* green */
        argb.tmp_c[3] = bm;     /* blue */
        /* we can use vec_splat */
        mod = (vector unsigned char) vec_splat((vector unsigned int) argb.vec, 0);
        if ((intptr_t) data % 16) {
            /* unaligned */
            unsigned char *ptr = data;

            printf("unalign");
            bpl -= 16 * (w / 8);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    /* 8 pixels at a time */
                    vector unsigned char a, b, mask, edges;
                    vector unsigned short ev, od;
                    vector unsigned char tmp1, tmp2;
                    vector pixel src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE15_ALTIVEC_NONSAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = w % 8; --x >= 0;) {
                    /* leftover pixels, done in scalar code */
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (b & 0x7c00) * rm;
                    g = (b & 0x3e0) * gm;
                    b = (b & 0x1f) * bm;
                    ((DATA16 *) ptr)[x] = ((r >> 8) & 0x7c00) | ((g >> 8) & 0x3e0) | ((b >> 8) & 0x1f);
                }
                ptr += bpl;     /* skip what remains of the row */
            }
        } else {
            vector pixel *ptr = data;

            bpl = bpl / 16 - w / 8;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    vector unsigned char a, b;
                    vector unsigned short ev, od;

                    SHADE15_ALTIVEC_NONSAT(*ptr)
                    ptr++;
                }
                for (x = w % 8; --x >= 0;) {
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (b & 0x7c00) * rm;
                    g = (b & 0x3e0) * gm;
                    b = (b & 0x1f) * bm;
                    ((DATA16 *) ptr)[x] = ((r >> 8) & 0x7c00) | ((g >> 8) & 0x3e0) | ((b >> 8) & 0x1f);
                }
                ptr += bpl;     /* skip what remains of the row */
            }
        }       /* align condition */
    } else {
        vector unsigned short mod;

        argb.tmp_s[0] = 1;      /* alpha */
        argb.tmp_s[1] = rm;     /* red */
        argb.tmp_s[2] = gm;     /* green */
        argb.tmp_s[3] = bm;     /* blue */
        /* permute, we cannot splat shorts across the full vector */
        mod = (vector unsigned short) vec_perm((vector unsigned char) argb.vec,
                                               (vector unsigned char) AVV(0, 0, 0, 0),
                                               (vector unsigned char) AVV(0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07,
                                                                          0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07));
        if ((intptr_t) data % 16) {
            /* unaligned */
            unsigned char *ptr = data;

            printf("unalign");
            bpl -= 16 * (w / 8);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    vector unsigned char a, b, mask, edges;
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;
                    vector unsigned char tmp1, tmp2;
                    vector pixel src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE15_ALTIVEC_SAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = w % 8; --x >= 0;) {
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (((b >> 10) & 0x001f) * rm) >> 8;
                    r = (r > 0x001f) ? 0x7c00 : (r << 10);
                    g = (((b >> 5) & 0x001f) * gm) >> 8;
                    g = (g > 0x001f) ? 0x03e0 : (g << 5);
                    b = ((b & 0x001f) * bm) >> 8;
                    b = (b > 0x001f) ? 0x001f : b;
                    ((DATA16 *) ptr)[x] = (r | g | b);
                }
                ptr += bpl;
            }
        } else {
            vector pixel *ptr = data;

            bpl = bpl / 16 - w / 8;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    vector unsigned char a, b;
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;

                    SHADE15_ALTIVEC_SAT(*ptr)
                    ptr++;
                }
                for (x = w % 8; --x >= 0;) {
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (((b >> 10) & 0x001f) * rm) >> 8;
                    r = (r > 0x001f) ? 0x7c00 : (r << 10);
                    g = (((b >> 5) & 0x001f) * gm) >> 8;
                    g = (g > 0x001f) ? 0x03e0 : (g << 5);
                    b = ((b & 0x001f) * bm) >> 8;
                    b = (b > 0x001f) ? 0x001f : b;
                    ((DATA16 *) ptr)[x] = (r | g | b);
                }
                ptr += bpl;
            }
        }       /* align condition */
    }
}
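/*
 * A minimal usage sketch, assuming an XImage-like buffer; `img`,
 * `bytes_per_line`, `width` and `height` are hypothetical names.  The
 * modifiers are 8.8 fixed point (256 leaves a channel unchanged), so
 * values above 255 brighten and take the saturating path.  Disabled, as
 * it only illustrates the calling convention.
 */
#if 0
static void
example_darken_15bpp(void *img, int bytes_per_line, int width, int height)
{
    /* halve every channel: 128/256 == 0.5 */
    shade_ximage_15_altivec(img, bytes_per_line, width, height, 128, 128, 128);
}
#endif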
#if 0
/* RGB 16 -- unfinished scalar skeleton, disabled */
void
shade_ximage_16_C(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    unsigned char *ptr = (unsigned char *) data + (w * 2);
    int x, y;

    if ((rm <= 255) && (gm <= 255) && (bm <= 255)) {
        /* No saturation */
        for (y = h; --y >= 0;) {
            for (x = -w; x < 0; x++) {
                /* int r, g, b;

                   b = ((DATA16 *) ptr)[x];
                   r = (b & 0xf800) * rm;
                   g = (b & 0x7e0) * gm;
                   b = (b & 0x1f) * bm;
                   ((DATA16 *) ptr)[x] = ((r >> 8) & 0xf800) | ((g >> 8) & 0x7e0) | ((b >> 8) & 0x1f);
                 */
            }
            ptr += bpl;
        }
    } else {
        for (y = h; --y >= 0;) {
            for (x = -w; x < 0; x++) {
                /* int r, g, b;

                   b = ((DATA16 *) ptr)[x];
                   r = ((b >> 11) * rm) >> 8;
                   r = (r > 0x001f) ? 0xf800 : (r << 11);
                   g = (((b >> 5) & 0x003f) * gm) >> 8;
                   g = (g > 0x003f) ? 0x07e0 : (g << 5);
                   b = ((b & 0x001f) * bm) >> 8;
                   b = (b > 0x001f) ? 0x001f : b;
                   ((DATA16 *) ptr)[x] = (r | g | b);
                 */
            }
            for (x = -(w % 4); x < 0; x++) {
                /* leftover pixels, still to be written */
            }
            ptr += bpl;
        }
    }
}
#endif
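/*
 * A sketch of how the non-saturating 565 path above could be completed,
 * following the 15bpp code; `shade_ximage_16_scalar` is a hypothetical
 * name, not part of this file.  Left disabled like the skeleton above.
 */
#if 0
static void
shade_ximage_16_scalar(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    unsigned char *ptr = (unsigned char *) data;
    int x, y;

    for (y = h; --y >= 0;) {
        for (x = 0; x < w; x++) {
            int r, g, b;

            b = ((DATA16 *) ptr)[x];
            /* scale each field in place; green has 6 bits in 565 */
            r = (b & 0xf800) * rm;
            g = (b & 0x07e0) * gm;
            b = (b & 0x001f) * bm;
            ((DATA16 *) ptr)[x] = ((r >> 8) & 0xf800) | ((g >> 8) & 0x07e0) | ((b >> 8) & 0x001f);
        }
        ptr += bpl;
    }
}
#endif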
/* RGB 32 */
void
shade_ximage_32_altivec(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    union {
        unsigned short tmp_s[8] __attribute__((aligned(16)));
        unsigned char tmp_c[16] __attribute__((aligned(16)));
        vector unsigned char vec;
    } argb;
    int x, y;

    if ((rm <= 255) && (gm <= 255) && (bm <= 255)) {
        /* No saturation */
        vector unsigned char mod;

        argb.tmp_c[0] = 1;      /* alpha */
        argb.tmp_c[1] = rm;     /* red */
        argb.tmp_c[2] = gm;     /* green */
        argb.tmp_c[3] = bm;     /* blue */
        printf("no saturation\n");
        /* we can use vec_splat */
        mod = (vector unsigned char) vec_splat((vector unsigned int) argb.vec, 0);
        if ((uintptr_t) data & 15) {
            /* unaligned */
            unsigned char *ptr = data;

            bpl -= 16 * (w / 4);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short ev, od;
                    vector unsigned char tmp1, tmp2, mask, edges;
                    vector unsigned char src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE32_ALTIVEC_NONSAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    /* leftover pixels, done in scalar code */
                    ptr[x + 1] = (unsigned char) ((ptr[x + 1] * rm) >> 8);
                    ptr[x + 2] = (unsigned char) ((ptr[x + 2] * gm) >> 8);
                    ptr[x + 3] = (unsigned char) ((ptr[x + 3] * bm) >> 8);
                }
                ptr += bpl;
            }
        } else {
            vector unsigned char *ptr = data;

            bpl = bpl / 16 - w / 4;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short ev, od;

                    SHADE32_ALTIVEC_NONSAT(*ptr)
                    ptr++;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    unsigned char *ptr_c = (unsigned char *) ptr;

                    ptr_c[x + 1] = (unsigned char) ((ptr_c[x + 1] * rm) >> 8);
                    ptr_c[x + 2] = (unsigned char) ((ptr_c[x + 2] * gm) >> 8);
                    ptr_c[x + 3] = (unsigned char) ((ptr_c[x + 3] * bm) >> 8);
                }
                ptr += bpl;
            }
        }
    } else {
        vector unsigned short mod;

        argb.tmp_s[0] = 1;      /* alpha */
        argb.tmp_s[1] = rm;     /* red */
        argb.tmp_s[2] = gm;     /* green */
        argb.tmp_s[3] = bm;     /* blue */
        /* permute, we cannot splat shorts across the full vector */
        mod = (vector unsigned short) vec_perm((vector unsigned char) argb.vec,
                                               (vector unsigned char) AVV(0, 0, 0, 0),
                                               (vector unsigned char) AVV(0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07,
                                                                          0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07));
        if ((uintptr_t) data & 15) {
            /* unaligned */
            unsigned char *ptr = data;

            bpl -= 16 * (w / 4);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;
                    vector unsigned char tmp1, tmp2, mask, edges;
                    vector unsigned char src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE32_ALTIVEC_SAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    int r, g, b;

                    r = (ptr[x + 1] * rm) >> 8;
                    ptr[x + 1] = r | (!(r >> 8) - 1);
                    g = (ptr[x + 2] * gm) >> 8;
                    ptr[x + 2] = g | (!(g >> 8) - 1);
                    b = (ptr[x + 3] * bm) >> 8;
                    ptr[x + 3] = b | (!(b >> 8) - 1);
                }
                ptr += bpl;
            }
        } else {
            vector unsigned char *ptr = data;

            bpl = bpl / 16 - w / 4;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;

                    SHADE32_ALTIVEC_SAT(*ptr)
                    ptr++;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    int r, g, b;
                    unsigned char *ptr_c = (unsigned char *) ptr;

                    r = (ptr_c[x + 1] * rm) >> 8;
                    ptr_c[x + 1] = r | (!(r >> 8) - 1);
                    g = (ptr_c[x + 2] * gm) >> 8;
                    ptr_c[x + 2] = g | (!(g >> 8) - 1);
                    b = (ptr_c[x + 3] * bm) >> 8;
                    ptr_c[x + 3] = b | (!(b >> 8) - 1);
                }
                ptr += bpl;
            }
        }
    }
}

#if 0
/* RGB 24 */
void
shade_ximage_24_C(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    unsigned char *ptr;
    int x, y;

    ptr = (unsigned char *) data + (w * 3);
    if ((rm <= 256) && (gm <= 256) && (bm <= 256)) {
        /* No saturation */
        for (y = h; --y >= 0;) {
            for (x = -(w * 3); x < 0; x += 3) {
# if WORDS_BIGENDIAN
                ptr[x + 0] = (ptr[x + 0] * rm) >> 8;
                ptr[x + 1] = (ptr[x + 1] * gm) >> 8;
                ptr[x + 2] = (ptr[x + 2] * bm) >> 8;
# else
                ptr[x + 2] = (ptr[x + 2] * rm) >> 8;
                ptr[x + 1] = (ptr[x + 1] * gm) >> 8;
                ptr[x + 0] = (ptr[x + 0] * bm) >> 8;
# endif
            }
            ptr += bpl;
        }
    } else {
        for (y = h; --y >= 0;) {
            for (x = -(w * 3); x < 0; x += 3) {
                int r, g, b;

# if WORDS_BIGENDIAN
                r = (ptr[x + 0] * rm) >> 8;
                ptr[x + 0] = r | (!(r >> 8) - 1);
                g = (ptr[x + 1] * gm) >> 8;
                ptr[x + 1] = g | (!(g >> 8) - 1);
                b = (ptr[x + 2] * bm) >> 8;
                ptr[x + 2] = b | (!(b >> 8) - 1);
# else
                r = (ptr[x + 2] * rm) >> 8;
                ptr[x + 2] = r | (!(r >> 8) - 1);
                g = (ptr[x + 1] * gm) >> 8;
                ptr[x + 1] = g | (!(g >> 8) - 1);
                b = (ptr[x + 0] * bm) >> 8;
                ptr[x + 0] = b | (!(b >> 8) - 1);
# endif
            }
            ptr += bpl;
        }
    }
}
#endif

#endif /* ETERM_ALTIVEC */
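/*
 * Note on the branchless clamp used in the scalar saturating loops above:
 * `v | (!(v >> 8) - 1)` is `v` when v fits in a byte (!(v >> 8) is 1, and
 * 1 - 1 == 0) and all-ones when v >= 256 (!(v >> 8) is 0, and 0 - 1 sets
 * every bit), so the assignment to an unsigned char stores 0xff.  A
 * stand-alone sketch of the same trick, disabled:
 */
#if 0
static unsigned char
clamp255(int v)
{
    /* !(v >> 8) is 1 for v in [0, 255] and 0 for anything larger */
    return (unsigned char) (v | (!(v >> 8) - 1));
}
#endif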