/*
 * Written and Copyright (C) 2006 by Luca Barbato
 *
 * AltiVec optimization for 15bit and 32bit pixels; other depths pending.
 * Contact me for questions or suggestions.
 *
 * Permission is hereby granted to Michael Jennings to license this code as
 * he sees fit.  I'd prefer the GPL, but he will choose the BSD.  The debate
 * is moot, as this is to become a part of the Eterm project, for which he
 * is the primary author.  For users of this code I ask that any
 * modifications be released back into the community, but if Michael
 * Jennings chooses the BSD license then that request has no backing in law.
 */

#include "cmod.h"
#include <stdio.h>      /* printf */

#ifdef ETERM_ALTIVEC

#include <stdint.h>     /* intptr_t, uintptr_t */
#include <altivec.h>    /* AltiVec intrinsics */

#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

#define DATA16 unsigned short

/* Beware: the variables used by these macros (tmp1, tmp2, mask, edges,
 * a, b, ev, od, hi, lo, mod) have to be declared in the enclosing block! */

#define ALTIVEC_UNALIGNED_LOAD(source, target) \
    /* load two vectors */ \
    tmp1 = vec_ld(0, source); \
    tmp2 = vec_ld(15, source); \
    \
    /* compute the mask to align */ \
    mask = vec_lvsl(0, source); \
    \
    target = vec_perm(tmp1, tmp2, mask);

#define ALTIVEC_UNALIGNED_STORE(source, target) \
    /* get the unaligned vector's edges (the mask comes from the load) */ \
    edges = vec_perm(tmp2, tmp1, mask); \
    \
    mask = vec_lvsr(0, target); \
    \
    tmp1 = vec_perm(edges, (vector unsigned char) source, mask); \
    \
    tmp2 = vec_perm((vector unsigned char) source, edges, mask); \
    /* vec_stl? */ \
    vec_st(tmp2, 15, target); \
    \
    vec_st(tmp1, 0, target);

#define SHADE15_ALTIVEC_NONSAT(p) \
    /* unpack to byte */ \
    a = vec_unpackh(p); \
    \
    /* multiply even */ \
    ev = vec_sr(vec_mule(a, mod), vec_splat_u16(5)); \
    \
    /* multiply odd */ \
    od = vec_sr(vec_mulo(a, mod), vec_splat_u16(5)); \
    \
    /* pack */ \
    a = vec_pack(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to byte */ \
    b = vec_unpackl(p); \
    \
    /* multiply even */ \
    ev = vec_sr(vec_mule(b, mod), vec_splat_u16(5)); \
    \
    /* multiply odd */ \
    od = vec_sr(vec_mulo(b, mod), vec_splat_u16(5)); \
    \
    /* pack */ \
    b = vec_pack(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to pixel */ \
    p = vec_packpx((vector unsigned int) a, \
                   (vector unsigned int) b);

#define SHADE15_ALTIVEC_SAT(p) \
    /* unpack to byte */ \
    a = vec_unpackh(p); \
    \
    /* unpack to short */ \
    hi = (vector unsigned short) vec_unpackh((vector signed char) a); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(hi, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(hi, mod), vec_splat_u32(5)); \
    \
    /* pack to short */ \
    hi = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to short */ \
    lo = (vector unsigned short) vec_unpackl((vector signed char) a); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(lo, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(lo, mod), vec_splat_u32(5)); \
    \
    /* pack to short */ \
    lo = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to byte */ \
    a = vec_packsu(hi, lo); \
    \
    /* unpack to byte */ \
    b = vec_unpackl(p); \
    \
    /* unpack to short */ \
    hi = (vector unsigned short) vec_unpackh((vector signed char) b); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(hi, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(hi, mod), vec_splat_u32(5)); \
    \
    hi = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to short */ \
    lo = (vector unsigned short) vec_unpackl((vector signed char) b); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(lo, mod), vec_splat_u32(5)); \
    od = vec_sr(vec_mulo(lo, mod), vec_splat_u32(5)); \
    \
    lo = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to byte */ \
    b = vec_packsu(hi, lo); \
    \
    /* pack to pixel */ \
    p = vec_packpx((vector unsigned int) a, \
                   (vector unsigned int) b);

#define SHADE32_ALTIVEC_NONSAT(p) \
    /* multiply even */ \
    ev = vec_sr(vec_mule(p, mod), vec_splat_u16(8)); \
    \
    /* multiply odd */ \
    od = vec_sr(vec_mulo(p, mod), vec_splat_u16(8)); \
    \
    /* pack */ \
    p = vec_pack(vec_mergeh(ev, od), vec_mergel(ev, od));
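/*
 * For reference, a minimal scalar sketch of what SHADE32_ALTIVEC_NONSAT
 * computes: every byte lane is multiplied by the matching modifier byte
 * and shifted right by 8, exactly like the scalar leftover loops further
 * down.  vec_mule/vec_mulo produce the even and odd 16bit products, and
 * the mergeh/mergel + pack sequence reinterleaves them back into bytes.
 * The function below is a hypothetical illustration, not part of this
 * file's API, and is therefore left disabled.
 */
#if 0
static void
shade32_scalar_reference(unsigned char *px, const unsigned char mod[4], int npixels)
{
    int i;

    for (i = 0; i < npixels * 4; i++) {
        /* (value * modifier) >> 8, the same scaling the vector code does */
        px[i] = (unsigned char) ((px[i] * mod[i % 4]) >> 8);
    }
}
#endif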
#define SHADE32_ALTIVEC_SAT(p) \
    /* unpack to short */ \
    hi = (vector unsigned short) \
         vec_mergeh((vector unsigned char) AVV(0), \
                    (vector unsigned char) p); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(hi, mod), vec_splat_u32(8)); \
    od = vec_sr(vec_mulo(hi, mod), vec_splat_u32(8)); \
    \
    /* pack to short */ \
    hi = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* unpack to short */ \
    lo = (vector unsigned short) \
         vec_mergel((vector unsigned char) AVV(0), \
                    (vector unsigned char) p); \
    \
    /* multiply and shift right */ \
    ev = vec_sr(vec_mule(lo, mod), vec_splat_u32(8)); \
    od = vec_sr(vec_mulo(lo, mod), vec_splat_u32(8)); \
    \
    /* pack to short */ \
    lo = vec_packsu(vec_mergeh(ev, od), vec_mergel(ev, od)); \
    \
    /* pack to byte */ \
    p = vec_packsu(hi, lo);

/* Scalar fallbacks: placeholders only, nothing uses them yet. */
#define SHADE15_SCALAR_NONSAT(p)
#define SHADE15_SCALAR_SAT(p)
#define SHADE32_SCALAR_NONSAT(p)
#define SHADE32_SCALAR_SAT(p)

/* RGB 15 */
void
shade_ximage_15_altivec(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    union {
        unsigned short tmp_s[8] __attribute__((aligned(16)));
        unsigned char tmp_c[16] __attribute__((aligned(16)));
        vector signed short vec;
    } argb;
    int x, y;

    /* splat the *m modifiers to a full vector */
    if ((rm <= 255) && (gm <= 255) && (bm <= 255)) {
        /* No saturation */
        vector unsigned char mod;

        printf("no saturation ");
        argb.tmp_c[0] = 1;      /* alpha */
        argb.tmp_c[1] = rm;     /* red */
        argb.tmp_c[2] = gm;     /* green */
        argb.tmp_c[3] = bm;     /* blue */
        /* we can use vec_splat */
        mod = (vector unsigned char) vec_splat((vector unsigned int) argb.vec, 0);
        if ((intptr_t) data % 16) {
            /* unaligned */
            unsigned char *ptr = data;

            printf("unalign");
            bpl -= 16 * (w / 8);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    /* 8 pixels at a time */
                    vector unsigned char a, b, mask, edges;
                    vector unsigned short ev, od;
                    vector unsigned char tmp1, tmp2;
                    vector pixel src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE15_ALTIVEC_NONSAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = w % 8; --x >= 0;) {
                    /* leftover pixels, done in scalar code */
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (b & 0x7c00) * rm;
                    g = (b & 0x3e0) * gm;
                    b = (b & 0x1f) * bm;
                    ((DATA16 *) ptr)[x] = ((r >> 8) & 0x7c00) | ((g >> 8) & 0x3e0) | ((b >> 8) & 0x1f);
                }
                ptr += bpl;     /* skip what remains of the row */
            }
        } else {
            vector pixel *ptr = data;

            bpl = bpl / 16 - w / 8;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    vector unsigned char a, b;
                    vector unsigned short ev, od;

                    SHADE15_ALTIVEC_NONSAT(*ptr)
                    ptr++;
                }
                for (x = w % 8; --x >= 0;) {
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (b & 0x7c00) * rm;
                    g = (b & 0x3e0) * gm;
                    b = (b & 0x1f) * bm;
                    ((DATA16 *) ptr)[x] = ((r >> 8) & 0x7c00) | ((g >> 8) & 0x3e0) | ((b >> 8) & 0x1f);
                }
                ptr += bpl;     /* skip what remains of the row */
            }
        }       /* align condition */
    } else {
        vector unsigned short mod;

        argb.tmp_s[0] = 1;      /* alpha */
        argb.tmp_s[1] = rm;     /* red */
        argb.tmp_s[2] = gm;     /* green */
        argb.tmp_s[3] = bm;     /* blue */
        /* permute, we cannot splat shorts across the full vector */
        mod = (vector unsigned short) vec_perm((vector unsigned char) argb.vec,
                                               (vector unsigned char) AVV(0, 0, 0, 0),
                                               (vector unsigned char) AVV(0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07,
                                                                          0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07));
        if ((intptr_t) data % 16) {
            /* unaligned */
            unsigned char *ptr = data;

            printf("unalign");
            bpl -= 16 * (w / 8);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    vector unsigned char a, b, mask, edges;
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;
                    vector unsigned char tmp1, tmp2;
                    vector pixel src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE15_ALTIVEC_SAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = w % 8; --x >= 0;) {
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (((b >> 10) & 0x001f) * rm) >> 8;
                    r = (r > 0x001f) ? 0x7c00 : (r << 10);
                    g = (((b >> 5) & 0x001f) * gm) >> 8;
                    g = (g > 0x001f) ? 0x03e0 : (g << 5);
                    b = ((b & 0x001f) * bm) >> 8;
                    b = (b > 0x001f) ? 0x001f : b;
                    ((DATA16 *) ptr)[x] = (r | g | b);
                }
                ptr += bpl;
            }
        } else {
            vector pixel *ptr = data;

            bpl = bpl / 16 - w / 8;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 8; x > 0; x--) {
                    vector unsigned char a, b;
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;

                    SHADE15_ALTIVEC_SAT(*ptr)
                    ptr++;
                }
                for (x = w % 8; --x >= 0;) {
                    int r, g, b;

                    b = ((DATA16 *) ptr)[x];
                    r = (((b >> 10) & 0x001f) * rm) >> 8;
                    r = (r > 0x001f) ? 0x7c00 : (r << 10);
                    g = (((b >> 5) & 0x001f) * gm) >> 8;
                    g = (g > 0x001f) ? 0x03e0 : (g << 5);
                    b = ((b & 0x001f) * bm) >> 8;
                    b = (b > 0x001f) ? 0x001f : b;
                    ((DATA16 *) ptr)[x] = (r | g | b);
                }
                ptr += bpl;
            }
        }       /* align condition */
    }
}
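/*
 * A minimal usage sketch, assuming an XImage-like buffer; `img`,
 * `bytes_per_line`, `width` and `height` are hypothetical names.  The
 * modifiers are 8.8 fixed point (256 leaves a channel unchanged), so
 * values above 255 brighten and take the saturating path.  Disabled, as
 * it only illustrates the calling convention.
 */
#if 0
static void
example_darken_15bpp(void *img, int bytes_per_line, int width, int height)
{
    /* halve every channel: 128/256 == 0.5 */
    shade_ximage_15_altivec(img, bytes_per_line, width, height, 128, 128, 128);
}
#endif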
#if 0
/* RGB 16 -- unfinished scalar skeleton, disabled */
void
shade_ximage_16_C(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    unsigned char *ptr = (unsigned char *) data + (w * 2);
    int x, y;

    if ((rm <= 255) && (gm <= 255) && (bm <= 255)) {
        /* No saturation */
        for (y = h; --y >= 0;) {
            for (x = -w; x < 0; x++) {
                /* int r, g, b;

                   b = ((DATA16 *) ptr)[x];
                   r = (b & 0xf800) * rm;
                   g = (b & 0x7e0) * gm;
                   b = (b & 0x1f) * bm;
                   ((DATA16 *) ptr)[x] = ((r >> 8) & 0xf800) | ((g >> 8) & 0x7e0) | ((b >> 8) & 0x1f);
                 */
            }
            ptr += bpl;
        }
    } else {
        for (y = h; --y >= 0;) {
            for (x = -w; x < 0; x++) {
                /* int r, g, b;

                   b = ((DATA16 *) ptr)[x];
                   r = ((b >> 11) * rm) >> 8;
                   r = (r > 0x001f) ? 0xf800 : (r << 11);
                   g = (((b >> 5) & 0x003f) * gm) >> 8;
                   g = (g > 0x003f) ? 0x07e0 : (g << 5);
                   b = ((b & 0x001f) * bm) >> 8;
                   b = (b > 0x001f) ? 0x001f : b;
                   ((DATA16 *) ptr)[x] = (r | g | b);
                 */
            }
            for (x = -(w % 4); x < 0; x++) {
                /* leftover pixels, still to be written */
            }
            ptr += bpl;
        }
    }
}
#endif
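/*
 * A sketch of how the non-saturating 565 path above could be completed,
 * following the 15bpp code; `shade_ximage_16_scalar` is a hypothetical
 * name, not part of this file.  Left disabled like the skeleton above.
 */
#if 0
static void
shade_ximage_16_scalar(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    unsigned char *ptr = (unsigned char *) data;
    int x, y;

    for (y = h; --y >= 0;) {
        for (x = 0; x < w; x++) {
            int r, g, b;

            b = ((DATA16 *) ptr)[x];
            /* scale each field in place; green has 6 bits in 565 */
            r = (b & 0xf800) * rm;
            g = (b & 0x07e0) * gm;
            b = (b & 0x001f) * bm;
            ((DATA16 *) ptr)[x] = ((r >> 8) & 0xf800) | ((g >> 8) & 0x07e0) | ((b >> 8) & 0x001f);
        }
        ptr += bpl;
    }
}
#endif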
/* RGB 32 */
void
shade_ximage_32_altivec(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    union {
        unsigned short tmp_s[8] __attribute__((aligned(16)));
        unsigned char tmp_c[16] __attribute__((aligned(16)));
        vector unsigned char vec;
    } argb;
    int x, y;

    if ((rm <= 255) && (gm <= 255) && (bm <= 255)) {
        /* No saturation */
        vector unsigned char mod;

        argb.tmp_c[0] = 1;      /* alpha */
        argb.tmp_c[1] = rm;     /* red */
        argb.tmp_c[2] = gm;     /* green */
        argb.tmp_c[3] = bm;     /* blue */
        printf("no saturation\n");
        /* we can use vec_splat */
        mod = (vector unsigned char) vec_splat((vector unsigned int) argb.vec, 0);
        if ((uintptr_t) data & 15) {
            /* unaligned */
            unsigned char *ptr = data;

            bpl -= 16 * (w / 4);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short ev, od;
                    vector unsigned char tmp1, tmp2, mask, edges;
                    vector unsigned char src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE32_ALTIVEC_NONSAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    /* leftover pixels, done in scalar code */
                    ptr[x + 1] = (unsigned char) ((ptr[x + 1] * rm) >> 8);
                    ptr[x + 2] = (unsigned char) ((ptr[x + 2] * gm) >> 8);
                    ptr[x + 3] = (unsigned char) ((ptr[x + 3] * bm) >> 8);
                }
                ptr += bpl;
            }
        } else {
            vector unsigned char *ptr = data;

            bpl = bpl / 16 - w / 4;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short ev, od;

                    SHADE32_ALTIVEC_NONSAT(*ptr)
                    ptr++;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    unsigned char *ptr_c = (unsigned char *) ptr;

                    ptr_c[x + 1] = (unsigned char) ((ptr_c[x + 1] * rm) >> 8);
                    ptr_c[x + 2] = (unsigned char) ((ptr_c[x + 2] * gm) >> 8);
                    ptr_c[x + 3] = (unsigned char) ((ptr_c[x + 3] * bm) >> 8);
                }
                ptr += bpl;
            }
        }
    } else {
        vector unsigned short mod;

        argb.tmp_s[0] = 1;      /* alpha */
        argb.tmp_s[1] = rm;     /* red */
        argb.tmp_s[2] = gm;     /* green */
        argb.tmp_s[3] = bm;     /* blue */
        /* permute, we cannot splat shorts across the full vector */
        mod = (vector unsigned short) vec_perm((vector unsigned char) argb.vec,
                                               (vector unsigned char) AVV(0, 0, 0, 0),
                                               (vector unsigned char) AVV(0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07,
                                                                          0x00, 0x01, 0x02, 0x03,
                                                                          0x04, 0x05, 0x06, 0x07));
        if ((uintptr_t) data & 15) {
            /* unaligned */
            unsigned char *ptr = data;

            bpl -= 16 * (w / 4);        /* remainder of the row in bytes */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;
                    vector unsigned char tmp1, tmp2, mask, edges;
                    vector unsigned char src;

                    ALTIVEC_UNALIGNED_LOAD(ptr, src)
                    SHADE32_ALTIVEC_SAT(src)
                    ALTIVEC_UNALIGNED_STORE(src, ptr)
                    ptr += 16;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    int r, g, b;

                    r = (ptr[x + 1] * rm) >> 8;
                    ptr[x + 1] = r | (!(r >> 8) - 1);
                    g = (ptr[x + 2] * gm) >> 8;
                    ptr[x + 2] = g | (!(g >> 8) - 1);
                    b = (ptr[x + 3] * bm) >> 8;
                    ptr[x + 3] = b | (!(b >> 8) - 1);
                }
                ptr += bpl;
            }
        } else {
            vector unsigned char *ptr = data;

            bpl = bpl / 16 - w / 4;     /* remainder of the row in vector offsets */
            for (y = h; --y >= 0;) {
                for (x = w / 4; x > 0; x--) {
                    /* 4 pixels at a time */
                    vector unsigned short hi, lo;
                    vector unsigned int ev, od;

                    SHADE32_ALTIVEC_SAT(*ptr)
                    ptr++;
                }
                for (x = (w % 4 - 1) * 4; x >= 0; x -= 4) {
                    int r, g, b;
                    unsigned char *ptr_c = (unsigned char *) ptr;

                    r = (ptr_c[x + 1] * rm) >> 8;
                    ptr_c[x + 1] = r | (!(r >> 8) - 1);
                    g = (ptr_c[x + 2] * gm) >> 8;
                    ptr_c[x + 2] = g | (!(g >> 8) - 1);
                    b = (ptr_c[x + 3] * bm) >> 8;
                    ptr_c[x + 3] = b | (!(b >> 8) - 1);
                }
                ptr += bpl;
            }
        }
    }
}

#if 0
/* RGB 24 */
void
shade_ximage_24_C(void *data, int bpl, int w, int h, int rm, int gm, int bm)
{
    unsigned char *ptr;
    int x, y;

    ptr = (unsigned char *) data + (w * 3);
    if ((rm <= 256) && (gm <= 256) && (bm <= 256)) {
        /* No saturation */
        for (y = h; --y >= 0;) {
            for (x = -(w * 3); x < 0; x += 3) {
# if WORDS_BIGENDIAN
                ptr[x + 0] = (ptr[x + 0] * rm) >> 8;
                ptr[x + 1] = (ptr[x + 1] * gm) >> 8;
                ptr[x + 2] = (ptr[x + 2] * bm) >> 8;
# else
                ptr[x + 2] = (ptr[x + 2] * rm) >> 8;
                ptr[x + 1] = (ptr[x + 1] * gm) >> 8;
                ptr[x + 0] = (ptr[x + 0] * bm) >> 8;
# endif
            }
            ptr += bpl;
        }
    } else {
        for (y = h; --y >= 0;) {
            for (x = -(w * 3); x < 0; x += 3) {
                int r, g, b;

# if WORDS_BIGENDIAN
                r = (ptr[x + 0] * rm) >> 8;
                ptr[x + 0] = r | (!(r >> 8) - 1);
                g = (ptr[x + 1] * gm) >> 8;
                ptr[x + 1] = g | (!(g >> 8) - 1);
                b = (ptr[x + 2] * bm) >> 8;
                ptr[x + 2] = b | (!(b >> 8) - 1);
# else
                r = (ptr[x + 2] * rm) >> 8;
                ptr[x + 2] = r | (!(r >> 8) - 1);
                g = (ptr[x + 1] * gm) >> 8;
                ptr[x + 1] = g | (!(g >> 8) - 1);
                b = (ptr[x + 0] * bm) >> 8;
                ptr[x + 0] = b | (!(b >> 8) - 1);
# endif
            }
            ptr += bpl;
        }
    }
}
#endif

#endif /* ETERM_ALTIVEC */
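/*
 * Note on the branchless clamp used in the scalar saturating loops above:
 * `v | (!(v >> 8) - 1)` is `v` when v fits in a byte (!(v >> 8) is 1, and
 * 1 - 1 == 0) and all-ones when v >= 256 (!(v >> 8) is 0, and 0 - 1 sets
 * every bit), so the assignment to an unsigned char stores 0xff.  A
 * stand-alone sketch of the same trick, disabled:
 */
#if 0
static unsigned char
clamp255(int v)
{
    /* !(v >> 8) is 1 for v in [0, 255] and 0 for anything larger */
    return (unsigned char) (v | (!(v >> 8) - 1));
}
#endif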