/*
 * Combine src and mask
 */

#include <altivec.h>
#include <stdio.h>
#include <inttypes.h>
//#include "fbpict.h"

/* Fixed-width integer aliases used throughout this file. */
typedef uint32_t CARD32;
typedef uint16_t CARD16;
/* Calling-convention marker; defined empty here, so it expands to nothing. */
#define FASTCALL

/* Wraps a comma-separated element list in braces; used after a vector cast
 * to spell a vector literal. */
#define AVV(x...) {x}

/*
 * Channel extraction from a packed 32-bit pixel: alpha is bits 31-24, red
 * 23-16, green 15-8, blue 7-0.  Alpha() applies no mask, so its argument is
 * expected to be an unsigned 32-bit value.
 */
#define Alpha(x) ((x) >> 24)
#define Red(x) (((x) >> 16) & 0xff)
#define Green(x) (((x) >> 8) & 0xff)
#define Blue(x) ((x) & 0xff)

/*
 * FbByteMul(x, a): scale every 8-bit channel of the packed pixel x by a/255.
 *
 *   x_c = (x_c * a) / 255      (rounded via the 0x80 bias)
 *
 * x must be a modifiable CARD32 lvalue; it is read several times and
 * receives the result.  a is expected to be in 0..255 and is expanded twice,
 * so it must not have side effects.  Both parameters are parenthesized so
 * that expression arguments (e.g. "p + q") expand correctly, and the
 * temporary has a macro-specific name so it cannot capture a caller's "t".
 *
 * Channels are processed two at a time: blue/red in the 0x00ff00ff lanes,
 * then green/alpha shifted down into the same lanes.
 */
#define FbByteMul(x, a) do {                                              \
        CARD32 fbmul_t_ = (((x) & 0xff00ff) * (a)) + 0x800080;            \
        fbmul_t_ = (fbmul_t_ + ((fbmul_t_ >> 8) & 0xff00ff)) >> 8;        \
        fbmul_t_ &= 0xff00ff;                                             \
                                                                          \
        (x) = ((((x) >> 8) & 0xff00ff) * (a)) + 0x800080;                 \
        /* no final >> 8: the result is wanted in the high bytes */       \
        (x) = ((x) + (((x) >> 8) & 0xff00ff));                            \
        (x) &= 0xff00ff00;                                                \
        (x) += fbmul_t_;                                                  \
    } while (0)

/*
 * FbByteAdd(x, y): per-channel saturating add of two packed pixels.
 *
 *   x_c = min(x_c + y_c, 255)
 *
 * x must be a modifiable CARD32 lvalue and receives the result; y is
 * expanded twice and must not have side effects.  Parameters are
 * parenthesized so expression arguments (e.g. "p | q") expand correctly.
 *
 * After adding two channels in a 16-bit lane, bit 8 of the lane is the carry.
 * "0x1000100 - carry" yields 0xff in the lanes that overflowed, which is
 * OR-ed in to clamp them.
 */
#define FbByteAdd(x, y) do {                                              \
        CARD32 fbadd_t_;                                                  \
        CARD32 fbadd_r_ = ((x) & 0xff00ff) + ((y) & 0xff00ff);            \
        fbadd_r_ |= 0x1000100 - ((fbadd_r_ >> 8) & 0xff00ff);             \
        fbadd_r_ &= 0xff00ff;                                             \
                                                                          \
        fbadd_t_ = (((x) >> 8) & 0xff00ff) + (((y) >> 8) & 0xff00ff);     \
        fbadd_t_ |= 0x1000100 - ((fbadd_t_ >> 8) & 0xff00ff);             \
        fbadd_r_ |= (fbadd_t_ & 0xff00ff) << 8;                           \
        (x) = fbadd_r_;                                                   \
    } while (0)

/*
 * FbByteMulAdd(x, a, y): per-channel multiply-then-saturating-add.
 *
 *   x_c = min((x_c * a) / 255 + y_c, 255)
 *
 * x must be a modifiable CARD32 lvalue and receives the result.  a is
 * expected to be in 0..255.  a and y are expanded more than once and must not
 * have side effects; y must not be the same object as x because y is read
 * again after x has been overwritten.
 *
 * The saturation step for the green/alpha lanes derives its carry mask from
 * x itself.  (Deriving it from the already-masked blue/red temporary always
 * gives a zero carry, which lets green and alpha wrap around on overflow.)
 */
#define FbByteMulAdd(x, a, y) do {                                        \
        CARD32 fbma_t_ = (((x) & 0xff00ff) * (a)) + 0x800080;             \
        fbma_t_ = (fbma_t_ + ((fbma_t_ >> 8) & 0xff00ff)) >> 8;           \
        fbma_t_ &= 0xff00ff;                                              \
        fbma_t_ += (y) & 0xff00ff;                                        \
        fbma_t_ |= 0x1000100 - ((fbma_t_ >> 8) & 0xff00ff);               \
        fbma_t_ &= 0xff00ff;                                              \
                                                                          \
        (x) = ((((x) >> 8) & 0xff00ff) * (a)) + 0x800080;                 \
        (x) = ((x) + (((x) >> 8) & 0xff00ff)) >> 8;                       \
        (x) &= 0xff00ff;                                                  \
        (x) += ((y) >> 8) & 0xff00ff;                                     \
        /* clamp using the carries of these lanes, not of fbma_t_ */      \
        (x) |= 0x1000100 - (((x) >> 8) & 0xff00ff);                       \
        (x) &= 0xff00ff;                                                  \
        (x) <<= 8;                                                        \
        (x) += fbma_t_;                                                   \
    } while (0)

/*
 * FbByteAddMul(x, a, y, b): per-channel weighted sum with saturation.
 *
 *   x_c = min((x_c * a + y_c * b) / 255, 255)
 *
 * x must be a modifiable CARD32 lvalue and receives the result.  a and b are
 * expected to be in 0..255.  a, y and b are expanded several times and must
 * not have side effects; y must not be the same object as x because the blue
 * channel of y is read after x has been overwritten.  Parameters are
 * parenthesized so that expression arguments expand correctly, and the
 * temporaries have macro-specific names to avoid capturing caller variables.
 *
 * Alpha and green are computed first and parked in fbam_t_ (already shifted
 * to their final byte positions); red and blue are then computed into x.
 */
#define FbByteAddMul(x, a, y, b) do {                                     \
        CARD32 fbam_t_;                                                   \
        CARD32 fbam_r_ = ((x) >> 24) * (a) + ((y) >> 24) * (b) + 0x80;    \
        fbam_r_ += (fbam_r_ >> 8);                                        \
        fbam_r_ >>= 8;                                                    \
                                                                          \
        fbam_t_ = ((x) & 0xff00) * (a) + ((y) & 0xff00) * (b) + 0x8000;   \
        fbam_t_ += (fbam_t_ >> 8);                                        \
        fbam_t_ >>= 16;                                                   \
                                                                          \
        fbam_t_ |= fbam_r_ << 16;                                         \
        fbam_t_ |= 0x1000100 - ((fbam_t_ >> 8) & 0xff00ff);               \
        fbam_t_ &= 0xff00ff;                                              \
        fbam_t_ <<= 8;                                                    \
                                                                          \
        fbam_r_ = (((x) >> 16) & 0xff) * (a)                              \
                + (((y) >> 16) & 0xff) * (b) + 0x80;                      \
        fbam_r_ += (fbam_r_ >> 8);                                        \
        fbam_r_ >>= 8;                                                    \
                                                                          \
        (x) = ((x) & 0xff) * (a) + ((y) & 0xff) * (b) + 0x80;             \
        (x) += ((x) >> 8);                                                \
        (x) >>= 8;                                                        \
        (x) |= fbam_r_ << 16;                                             \
        (x) |= 0x1000100 - (((x) >> 8) & 0xff00ff);                       \
        (x) &= 0xff00ff;                                                  \
        (x) |= fbam_t_;                                                   \
    } while (0)

/*
 * FbByteAddMul_256(x, a, y, b): per-channel weighted sum with weights
 * expressed in 1/256 units.
 *
 *   x_c = (x_c * a + y_c * b) >> 8
 *
 * No rounding and no saturation is applied, so each channel sum must fit in
 * 16 bits (presumably the caller guarantees a + b <= 256 -- confirm).
 * x must be a modifiable CARD32 lvalue and receives the result; a, y and b
 * are expanded twice and must not have side effects.  Parameters are
 * parenthesized so that expression arguments expand correctly.
 */
#define FbByteAddMul_256(x, a, y, b) do {                                 \
        CARD32 fbam256_t_ = ((x) & 0xff00ff) * (a)                        \
                          + ((y) & 0xff00ff) * (b);                       \
        fbam256_t_ >>= 8;                                                 \
        fbam256_t_ &= 0xff00ff;                                           \
                                                                          \
        (x) = (((x) >> 8) & 0xff00ff) * (a)                               \
            + (((y) >> 8) & 0xff00ff) * (b);                              \
        (x) &= 0xff00ff00;                                                \
        (x) += fbam256_t_;                                                \
    } while (0)

/*
 * Replicate byte 0 of every 32-bit element across all four bytes of that
 * element (bytes 0, 4, 8 and 12 of the vector).  Byte 0 is presumably the
 * alpha channel, i.e. big-endian ARGB pixels -- confirm against the target.
 */
static __inline__ vector unsigned int
splat_alpha(vector unsigned int pix) {
    vector unsigned char first_byte_of_each_word = (vector unsigned char)
        AVV(0x00,0x00,0x00,0x00, 0x04,0x04,0x04,0x04,
            0x08,0x08,0x08,0x08, 0x0C,0x0C,0x0C,0x0C);

    return vec_perm(pix, pix, first_byte_of_each_word);
}

/*
 * Per-byte multiply of two pixel vectors: each 8-bit channel of p is scaled
 * by the matching byte of a, computing (p * a) / 255 with the 0x80 bias and
 * the (t + (t >> 8)) >> 8 reduction.
 *
 * The 16 bytes are widened to two vectors of eight 16-bit lanes (upper and
 * lower half), processed, and packed back to bytes with saturation.
 */
static __inline__ vector unsigned int
pix_multiply(vector unsigned int p, vector unsigned int a)
{
    vector unsigned char  zero  = (vector unsigned char)AVV(0);
    vector unsigned short half  = (vector unsigned short)
                                  AVV(0x0080,0x0080,0x0080,0x0080,
                                      0x0080,0x0080,0x0080,0x0080);
    vector unsigned short eight = vec_splat_u16(8);
    vector unsigned short prod_hi, prod_lo, factor;

    /* upper eight bytes, zero-extended to shorts */
    prod_hi = (vector unsigned short) vec_mergeh(zero, (vector unsigned char)p);
    factor  = (vector unsigned short) vec_mergeh(zero, (vector unsigned char)a);

    prod_hi = vec_mladd(prod_hi, factor, half);
    prod_hi = vec_adds(prod_hi, vec_sr(prod_hi, eight));
    prod_hi = vec_sr(prod_hi, eight);

    /* lower eight bytes, zero-extended to shorts */
    prod_lo = (vector unsigned short) vec_mergel(zero, (vector unsigned char)p);
    factor  = (vector unsigned short) vec_mergel(zero, (vector unsigned char)a);

    prod_lo = vec_mladd(prod_lo, factor, half);
    prod_lo = vec_adds(prod_lo, vec_sr(prod_lo, eight));
    prod_lo = vec_sr(prod_lo, eight);

    return (vector unsigned int) vec_packsu(prod_hi, prod_lo);
}

/*
 * Per-byte saturating add of two pixel vectors.
 */
static __inline__ vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    vector unsigned char a_bytes = (vector unsigned char)a;
    vector unsigned char b_bytes = (vector unsigned char)b;

    return (vector unsigned int) vec_adds (a_bytes, b_bytes);
}

/*
 * Per-byte weighted sum of two pixel vectors: (x * a + y * b) / 255, using a
 * single 0x80 bias and the (t + (t >> 8)) >> 8 reduction, then packed back to
 * bytes with saturation.
 *
 * The products are accumulated in 16-bit lanes.
 * NOTE(review): x*a + y*b above 0xffff would wrap in vec_mladd; presumably
 * callers only pass weights/pixels for which that cannot happen -- confirm.
 */
static __inline__ vector unsigned int
pix_add_mul (vector unsigned int x, vector unsigned int a,
             vector unsigned int y, vector unsigned int b)
{
    vector unsigned char  zero  = (vector unsigned char)AVV(0);
    vector unsigned short half  = (vector unsigned short)
                                  AVV(0x0080,0x0080,0x0080,0x0080,
                                      0x0080,0x0080,0x0080,0x0080);
    vector unsigned short eight = vec_splat_u16(8);
    vector unsigned short x16, a16, y16, b16, sum_hi, sum_lo;

    /* upper eight bytes of every operand, zero-extended to shorts */
    x16 = (vector unsigned short) vec_mergeh(zero, (vector unsigned char)x);
    a16 = (vector unsigned short) vec_mergeh(zero, (vector unsigned char)a);
    y16 = (vector unsigned short) vec_mergeh(zero, (vector unsigned char)y);
    b16 = (vector unsigned short) vec_mergeh(zero, (vector unsigned char)b);

    sum_hi = vec_mladd(x16, a16, half);
    sum_hi = vec_mladd(y16, b16, sum_hi);
    sum_hi = vec_adds(sum_hi, vec_sr(sum_hi, eight));
    sum_hi = vec_sr(sum_hi, eight);

    /* lower eight bytes of every operand, zero-extended to shorts */
    x16 = (vector unsigned short) vec_mergel(zero, (vector unsigned char)x);
    a16 = (vector unsigned short) vec_mergel(zero, (vector unsigned char)a);
    y16 = (vector unsigned short) vec_mergel(zero, (vector unsigned char)y);
    b16 = (vector unsigned short) vec_mergel(zero, (vector unsigned char)b);

    sum_lo = vec_mladd(x16, a16, half);
    sum_lo = vec_mladd(y16, b16, sum_lo);
    sum_lo = vec_adds(sum_lo, vec_sr(sum_lo, eight));
    sum_lo = vec_sr(sum_lo, eight);

    return (vector unsigned int) vec_packsu(sum_hi, sum_lo);
}

/*
 * Bitwise complement of the whole vector (NOR of the value with itself),
 * which maps every byte c to 255 - c.
 */
static __inline__ vector unsigned int
negate (vector unsigned int src)
{
    vector unsigned int complemented = vec_nor (src, src);

    return complemented;
}

/*
 * "Over" for four pixels: src + dest * (255 - srca) / 255 per byte, with a
 * saturating final add.  srca carries the source alpha replicated into every
 * byte of each pixel.
 */
static __inline__ vector unsigned int
over (vector unsigned int src, vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned int attenuated = pix_multiply(dest, negate (srca));

    return (vector unsigned int) vec_adds((vector unsigned char)src,
                                          (vector unsigned char)attenuated);
}

/*
 * In-place mask: src[i] = src[i] * alpha(msk[i]) for width pixels.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineMaskU (CARD32 *src, const CARD32 *msk, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char msk_align   = vec_lvsl(0, msk);
    vector unsigned char store_align = vec_lvsr(0, src);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char m0, m1, s0, s1, edges, out_hi, out_lo;
        vector unsigned int  vmsk, vsrc;

        /* unaligned load: fetch both covering quadwords, then realign */
        m0 = (vector unsigned char) vec_ld(0, msk);
        m1 = (vector unsigned char) vec_ld(15, msk);
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);

        vmsk = (vector unsigned int) vec_perm(m0, m1, msk_align);
        vsrc = (vector unsigned int) vec_perm(s0, s1, src_align);

        vsrc = pix_multiply(vsrc, splat_alpha(vmsk));

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(s1, s0, src_align);
        out_hi = vec_perm((vector unsigned char) vsrc, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vsrc, store_align);

        vec_st((vector unsigned int) out_hi, 15, src);
        vec_st((vector unsigned int) out_lo, 0, src);

        msk += 4;
        src += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 coverage = msk[k] >> 24;
        CARD32 pixel = src[k];
        FbByteMul(pixel, coverage);
        src[k] = pixel;
    }
}

/*
 * Over: dest[i] = src[i] + dest[i] * (255 - alpha(src[i])) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteMulAdd loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineOverU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        vdest = over(vsrc, splat_alpha(vsrc), vdest);

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 above = src[k];
        CARD32 below = dest[k];
        CARD32 above_ia = Alpha(~above);

        FbByteMulAdd(below, above_ia, above);
        dest[k] = below;
    }
}

/*
 * OverReverse: dest[i] = dest[i] + src[i] * (255 - alpha(dest[i])) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by a scalar loop.  Neither pointer needs to
 * be 16-byte aligned.
 *
 * The scalar tail applies the same formula as the vector loop and as
 * fbCombineOverReverseU (source attenuated by the inverse destination
 * alpha); it previously applied the plain Over formula instead.
 */
static FASTCALL void
vmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    vector unsigned int  vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
                         dest_mask, src_mask, store_mask;

    dest_mask = vec_lvsl(0, dest);
    src_mask = vec_lvsl(0, src);
    store_mask = vec_lvsr(0, dest);

    for (i = width/4; i > 0; i--) {
        /* unaligned load: fetch both covering quadwords, then realign */
        tmp1 = (vector unsigned char) vec_ld(0, src);
        tmp2 = (vector unsigned char) vec_ld(15, src);
        tmp3 = (vector unsigned char) vec_ld(0, dest);
        tmp4 = (vector unsigned char) vec_ld(15, dest);

        vsrc = (vector unsigned int) vec_perm(tmp1, tmp2, src_mask);
        vdest = (vector unsigned int) vec_perm(tmp3, tmp4, dest_mask);

        /* dest over src: dest + src * ~alpha(dest) */
        vdest = over(vdest, splat_alpha(vdest), vsrc);

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges = vec_perm(tmp4, tmp3, dest_mask);
        tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask);
        tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask);

        vec_st((vector unsigned int) tmp3, 15, dest);
        vec_st((vector unsigned int) tmp1, 0, dest);

        src += 4;
        dest += 4;
    }

    /* scalar tail: s = s * ~alpha(d) + d */
    for (i = width%4; --i >= 0;) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD32 ia = Alpha(~d);

        FbByteMulAdd(s, ia, d);
        dest[i] = s;
    }
}

/*
 * In: dest[i] = src[i] * alpha(dest[i]) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        vdest = pix_multiply(vsrc, splat_alpha(vdest));

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 pixel = src[k];
        CARD32 dest_alpha = Alpha(dest[k]);
        FbByteMul(pixel, dest_alpha);
        dest[k] = pixel;
    }
}

/*
 * InReverse: dest[i] = dest[i] * alpha(src[i]) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        vdest = pix_multiply(vdest, splat_alpha(vsrc));

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 pixel = dest[k];
        CARD32 src_alpha = Alpha(src[k]);
        FbByteMul(pixel, src_alpha);
        dest[k] = pixel;
    }
}

/*
 * Out: dest[i] = src[i] * (255 - alpha(dest[i])) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        vdest = pix_multiply(vsrc, splat_alpha(negate(vdest)));

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 pixel = src[k];
        CARD32 dest_ialpha = Alpha(~dest[k]);
        FbByteMul(pixel, dest_ialpha);
        dest[k] = pixel;
    }
}

/*
 * OutReverse: dest[i] = dest[i] * (255 - alpha(src[i])) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by a scalar loop.  Neither pointer needs to
 * be 16-byte aligned.
 *
 * The scalar tail applies the same formula as the vector loop and as
 * fbCombineOutReverseU (destination scaled by the inverse source alpha); it
 * previously applied the Out formula (source scaled by inverse destination
 * alpha) instead.
 */
static FASTCALL void
vmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    vector unsigned int  vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
                         dest_mask, src_mask, store_mask;

    dest_mask = vec_lvsl(0, dest);
    src_mask = vec_lvsl(0, src);
    store_mask = vec_lvsr(0, dest);

    for (i = width/4; i > 0; i--) {
        /* unaligned load: fetch both covering quadwords, then realign */
        tmp1 = (vector unsigned char) vec_ld(0, src);
        tmp2 = (vector unsigned char) vec_ld(15, src);
        tmp3 = (vector unsigned char) vec_ld(0, dest);
        tmp4 = (vector unsigned char) vec_ld(15, dest);

        vsrc = (vector unsigned int) vec_perm(tmp1, tmp2, src_mask);
        vdest = (vector unsigned int) vec_perm(tmp3, tmp4, dest_mask);

        /* dest * ~alpha(src) */
        vdest = pix_multiply(vdest, splat_alpha(negate(vsrc)));

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges = vec_perm(tmp4, tmp3, dest_mask);
        tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask);
        tmp1 = vec_perm(edges, (vector unsigned char)vdest, store_mask);

        vec_st((vector unsigned int) tmp3, 15, dest);
        vec_st((vector unsigned int) tmp1, 0, dest);

        src += 4;
        dest += 4;
    }

    /* scalar tail: d = d * ~alpha(s) */
    for (i = width%4; --i >= 0;) {
        CARD32 d = dest[i];
        CARD32 a = Alpha(~src[i]);
        FbByteMul(d, a);
        dest[i] = d;
    }
}

/*
 * Atop: dest[i] = (src[i] * alpha(dest[i])
 *                  + dest[i] * (255 - alpha(src[i]))) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteAddMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest, src_ialpha;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        src_ialpha = splat_alpha(negate(vsrc));
        vdest = pix_add_mul(vsrc, splat_alpha(vdest), vdest, src_ialpha);

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 spix = src[k];
        CARD32 dpix = dest[k];
        CARD32 dpix_alpha = Alpha(dpix);
        CARD32 spix_ialpha = Alpha(~spix);

        FbByteAddMul(spix, dpix_alpha, dpix, spix_ialpha);
        dest[k] = spix;
    }
}

/*
 * AtopReverse: dest[i] = (src[i] * (255 - alpha(dest[i]))
 *                         + dest[i] * alpha(src[i])) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteAddMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest, dest_ialpha;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        dest_ialpha = splat_alpha(negate(vdest));
        vdest = pix_add_mul(vdest, splat_alpha(vsrc), vsrc, dest_ialpha);

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 spix = src[k];
        CARD32 dpix = dest[k];
        CARD32 spix_alpha = Alpha(spix);
        CARD32 dpix_ialpha = Alpha(~dpix);

        FbByteAddMul(spix, dpix_ialpha, dpix, spix_alpha);
        dest[k] = spix;
    }
}

/*
 * Xor: dest[i] = (src[i] * (255 - alpha(dest[i]))
 *                 + dest[i] * (255 - alpha(src[i]))) / 255.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteAddMul loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        vdest = pix_add_mul(vsrc, splat_alpha(negate(vdest)),
                            vdest, splat_alpha(negate(vsrc)));

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 spix = src[k];
        CARD32 dpix = dest[k];
        CARD32 spix_ialpha = Alpha(~spix);
        CARD32 dpix_ialpha = Alpha(~dpix);

        FbByteAddMul(spix, dpix_ialpha, dpix, spix_ialpha);
        dest[k] = spix;
    }
}

/*
 * Add: dest[i] = min(dest[i] + src[i], 255) per channel.
 *
 * Groups of four pixels go through the vector path; the remaining
 * width % 4 pixels are handled by the scalar FbByteAdd loop.  Neither
 * pointer needs to be 16-byte aligned.
 */
static FASTCALL void
vmxCombineAddU (CARD32 *dest, const CARD32 *src, int width)
{
    vector unsigned char src_align   = vec_lvsl(0, src);
    vector unsigned char dest_align  = vec_lvsl(0, dest);
    vector unsigned char store_align = vec_lvsr(0, dest);
    int quads = width / 4;
    int k;

    while (quads > 0) {
        vector unsigned char s0, s1, d0, d1, edges, out_hi, out_lo;
        vector unsigned int  vsrc, vdest;

        /* unaligned load: fetch both covering quadwords, then realign */
        s0 = (vector unsigned char) vec_ld(0, src);
        s1 = (vector unsigned char) vec_ld(15, src);
        d0 = (vector unsigned char) vec_ld(0, dest);
        d1 = (vector unsigned char) vec_ld(15, dest);

        vsrc  = (vector unsigned int) vec_perm(s0, s1, src_align);
        vdest = (vector unsigned int) vec_perm(d0, d1, dest_align);

        vdest = pix_add(vsrc, vdest);

        /* unaligned store: blend the result with the surrounding bytes
         * and write both quadwords back, upper one first */
        edges  = vec_perm(d1, d0, dest_align);
        out_hi = vec_perm((vector unsigned char) vdest, edges, store_align);
        out_lo = vec_perm(edges, (vector unsigned char) vdest, store_align);

        vec_st((vector unsigned int) out_hi, 15, dest);
        vec_st((vector unsigned int) out_lo, 0, dest);

        src += 4;
        dest += 4;
        quads--;
    }

    /* scalar tail, last pixel first */
    for (k = width % 4 - 1; k >= 0; k--) {
        CARD32 addend = src[k];
        CARD32 total = dest[k];
        FbByteAdd(total, addend);
        dest[k] = total;
    }
}

//static FASTCALL void
//vmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
//unimplemented
/*
 * Disabled draft of a component-alpha "Src" combiner (dest = src * mask per
 * channel).  It is compiled out and would not build as written:
 *   - the parameter is named "mask" but the body refers to "msk";
 *   - the loaded dest quadwords are merged with src_mask, and both stores
 *     go through src rather than dest;
 *   - fbCombineSrcC, used for the tail, is not defined in the visible part
 *     of this file.
 * NOTE(review): fix these before enabling the block.
 */
#if 0
static FASTCALL void
vmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    int i;
    vector unsigned int  vsrc, vmsk, vdest;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
                         src_mask, msk_mask, dest_mask, store_mask;

    src_mask = vec_lvsl(0, src);
    dest_mask = vec_lvsl(0, dest);
    msk_mask = vec_lvsl(0, msk);
    store_mask = vec_lvsr(0, dest);

    for (i = width/4; i > 0; i--) {

        tmp1 = vec_ld(0, msk);

        tmp2 = vec_ld(15, msk);

        tmp3 = vec_ld(0, src);
        vmsk = vec_perm(tmp1, tmp2, msk_mask);

        tmp4 = vec_ld(15, src);
        vsrc = vec_perm(tmp3, tmp4, src_mask);

        /* quadwords surrounding the destination, kept for the edge blend */
        tmp1 = vec_ld(0, dest);
        tmp2 = vec_ld(15, dest);

        /* full per-channel mask: no splat_alpha here */
        vdest = pix_multiply(vsrc, vmsk);

        edges = vec_perm(tmp2, tmp1, src_mask);

        tmp3 = vec_perm((vector unsigned char)vdest, edges, store_mask);

        tmp4 = vec_perm(edges, (vector unsigned char)vdest, store_mask);

        vec_st((vector unsigned int) tmp3, 15, src );

        vec_st((vector unsigned int) tmp4, 0, src );

        msk+=4;
        src+=4;
        dest+=4;
    }

    /* remaining width % 4 pixels are delegated to the scalar routine */
    fbCombineSrcC (dest, src, mask, width%4);
}

#endif

/*
 * Scalar reference for the mask step: every pixel of src is scaled in place
 * by the alpha channel of the matching mask pixel.
 */
static void
fbCombineMaskU (CARD32 *src, const CARD32 *mask, int width)
{
    for (; width > 0; --width, ++src, ++mask) {
        CARD32 coverage = Alpha(*mask);
        CARD32 pixel = *src;
        FbByteMul(pixel, coverage);
        *src = pixel;
    }
}

/*
 * Scalar reference for Over:
 *   dest = src + dest * (255 - alpha(src)) / 255
 */
static void
fbCombineOverU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 above = *src;
        CARD32 below = *dest;
        CARD32 above_ia = Alpha(~above);

        FbByteMulAdd(below, above_ia, above);
        *dest = below;
    }
}

/*
 * Scalar reference for OverReverse:
 *   dest = dest + src * (255 - alpha(dest)) / 255
 */
static void
fbCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 above = *src;
        CARD32 below = *dest;
        CARD32 below_ia = Alpha(~*dest);
        FbByteMulAdd(above, below_ia, below);
        *dest = above;
    }
}

/*
 * Scalar reference for In:
 *   dest = src * alpha(dest) / 255
 */
static void
fbCombineInU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 pixel = *src;
        CARD32 dest_alpha = Alpha(*dest);
        FbByteMul(pixel, dest_alpha);
        *dest = pixel;
    }
}

/*
 * Scalar reference for Out:
 *   dest = src * (255 - alpha(dest)) / 255
 */
static void
fbCombineOutU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 pixel = *src;
        CARD32 dest_ialpha = Alpha(~*dest);
        FbByteMul(pixel, dest_ialpha);
        *dest = pixel;
    }
}

/*
 * Scalar reference for OutReverse:
 *   dest = dest * (255 - alpha(src)) / 255
 */
static FASTCALL void
fbCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 pixel = *dest;
        CARD32 src_ialpha = Alpha(~*src);
        FbByteMul(pixel, src_ialpha);
        *dest = pixel;
    }
}

/*
 * Scalar reference for Atop:
 *   dest = (src * alpha(dest) + dest * (255 - alpha(src))) / 255
 */
static FASTCALL void
fbCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 spix = *src;
        CARD32 dpix = *dest;
        CARD32 dpix_alpha = Alpha(dpix);
        CARD32 spix_ialpha = Alpha(~spix);

        FbByteAddMul(spix, dpix_alpha, dpix, spix_ialpha);
        *dest = spix;
    }
}

/*
 * Scalar reference for AtopReverse:
 *   dest = (src * (255 - alpha(dest)) + dest * alpha(src)) / 255
 */
static FASTCALL void
fbCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 spix = *src;
        CARD32 dpix = *dest;
        CARD32 spix_alpha = Alpha(spix);
        CARD32 dpix_ialpha = Alpha(~dpix);

        FbByteAddMul(spix, dpix_ialpha, dpix, spix_alpha);
        *dest = spix;
    }
}

/*
 * Scalar reference for Xor:
 *   dest = (src * (255 - alpha(dest)) + dest * (255 - alpha(src))) / 255
 */
static FASTCALL void
fbCombineXorU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 spix = *src;
        CARD32 dpix = *dest;
        CARD32 spix_ialpha = Alpha(~spix);
        CARD32 dpix_ialpha = Alpha(~dpix);

        FbByteAddMul(spix, dpix_ialpha, dpix, spix_ialpha);
        *dest = spix;
    }
}

/*
 * Scalar reference for Add: per-channel saturating sum of src into dest.
 */
static FASTCALL void
fbCombineAddU (CARD32 *dest, const CARD32 *src, int width)
{
    for (; width > 0; --width, ++dest, ++src) {
        CARD32 addend = *src;
        CARD32 total = *dest;
        FbByteAdd(total, addend);
        *dest = total;
    }
}

/*
 * Scalar Saturate: add src to dest, first scaling src down when its alpha
 * exceeds the alpha headroom left in dest.
 *
 *   sa = alpha(src), da = 255 - alpha(dest)
 *   if (sa > da)  src = src * (da * 255 / sa) / 255
 *   dest = min(dest + src, 255) per channel
 *
 * The scaling step had been commented out (it relied on FbIntDiv from the
 * header that is no longer included), which reduced this operator to a plain
 * Add.  The division is written inline here.  sa > da >= 0 guarantees a
 * non-zero divisor, and da * 255 is at most 254 * 255, so it fits in 16 bits.
 */
static FASTCALL void
fbCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
{
    int i;
    for (i = 0; i < width; ++i) {
        CARD32 s = src[i];
        CARD32 d = dest[i];
        CARD16 sa = s >> 24;
        CARD16 da = ~d >> 24;

        if (sa > da)
        {
            /* fraction of the source that still fits, in 0..255 units */
            sa = (CARD16) ((da * 255) / sa);
            FbByteMul(s, sa);
        }
        FbByteAdd(d, s);
        dest[i] = d;
    }
}

int main (int argc, char** argv)
{

    int data[3][10245]; __attribute__((aligned(16)));
    int i;
    //input data
    for (i = 0; i<10240; i++) data[0][i] = data[1][i] = (i&0xff) * 0x01010101;
    //
    for (i = 0; i<10240; i++) data[2][i] = (i&0xff) * 0x01010101;

    for (i = 0; i<10240; i++) 
    if (data[0][i] != data[1][i]) {
        printf ("wrong byte %d : %d != %d\n",i , data[0][i],  data[1][i]);
    }

    printf ("combine \n");
    fbCombineAddU (data[0], data[2], 1024);
    vmxCombineAddU (data[1], data[2],1024);

    for (i = 0; i<10240; i++) 
        if (data[0][i] != data[1][i]) {
            printf ("wrong byte %0d : %0x != %0x\n",i , data[0][i],  data[1][i]);
        }
    return 0;
}
