/* GIMP - The GNU Image Manipulation Program
 * Copyright (C) 1995 Spencer Kimball and Peter Mattis
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/* Much of the content of this file is derivative work of David
 * Monniaux, which is Copyright (C) 1999, 2001 David Monniaux.
 * Tip-o-the-hat to David for pioneering this effort.
 *
 * All of these functions use the mmx and sse registers and expect
 * them to remain intact across multiple asm() constructs.  This may
 * not work in the future, if the compiler allocates mmx/sse registers
 * for its own use.  XXX
 */

#include "config.h"

#include <stdio.h>

#include <glib-object.h>

#include "base/base-types.h"

#include "gimp-composite.h"
#include "gimp-composite-sse.h"

#ifdef COMPILE_SSE_IS_OKAY

#include "gimp-composite-x86.h"

#define pminub(src,dst,tmp) "pminub " "%%" #src ", %%" #dst
#define pmaxub(src,dst,tmp) "pmaxub " "%%" #src ", %%" #dst

extern const guint32 rgba8_alpha_mask_64[2];
extern const guint32 rgba8_b1_64[2];
extern const guint32 rgba8_b255_64[2];
extern const guint32 rgba8_w1_64[2];
extern const guint32 rgba8_w2_64[2];
extern const guint32 rgba8_w128_64[2];
extern const guint32 rgba8_w256_64[2];
extern const guint32 rgba8_w255_64[2];

/*
 *
 */
void
gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong n_pixels = _op->n_pixels;

  asm volatile ("movq %0,%%mm0"
                : /* empty */
                : "m" (*rgba8_alpha_mask_64)
                : "%mm0");

  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile (" movq %1, %%mm2\n"
                    "\tmovq %2, %%mm3\n"
                    "\tmovq %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq %%mm0, %%mm1\n"
                    "\tpandn %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
                    "\tmovntq %%mm1, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a++;
      b++;
      d++;
    }

  if (n_pixels > 0)
    {
      asm volatile (" movd %1, %%mm2\n"
                    "\tmovd %2, %%mm3\n"
                    "\tmovq %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq %%mm0, %%mm1\n"
                    "\tpandn %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
                    "\tmovd %%mm1, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
    }

  asm("emms");
}

#if 0
void gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1,%%mm0\n" "\tmovq %2,%%mm1\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpcklbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm7) "\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq
%%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpckhbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm6) "\n" "\tmovq %5,%%mm4\n" "\tmovq %%mm4,%%mm5\n" "\tpsubusw %%mm6,%%mm4\n" "\tpsubusw %%mm7,%%mm5\n" "\tpackuswb %%mm4,%%mm5\n" "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ "\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask_64 */ "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ "\tmovq %%mm7,%0\n" : "=m" (*d) : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); d++; b++; a++; } if (n_pixels > 0) { asm volatile (" movd %1,%%mm0\n" "\tmovd %2,%%mm1\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpcklbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm7) "\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpckhbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm6) "\n" "\tmovq %5,%%mm4\n" "\tmovq %%mm4,%%mm5\n" "\tpsubusw %%mm6,%%mm4\n" "\tpsubusw %%mm7,%%mm5\n" "\tpackuswb %%mm4,%%mm5\n" "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ "\tmovq %6,%%mm7\n" "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ "\tmovd %%mm7,%0\n" : "=m" (*d) : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); } asm("emms"); } #endif void gimp_composite_darken_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1,%%mm2\n" "\tmovq %2,%%mm3\n" "\t" pminub(mm3, mm2, mm4) "\n" "\tmovntq %%mm2,%0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1, %%mm2\n" "\tmovd %2, %%mm3\n" "\t" pminub(mm3, mm2, mm4) "\n" "\tmovd %%mm2, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm2", "%mm3", "%mm4"); } asm("emms"); } void gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1, %%mm2\n" "\tmovq %2, %%mm3\n" "\tmovq %%mm2, %%mm4\n" "\tmovq %%mm3, %%mm5\n" "\tpsubusb %%mm3, %%mm4\n" "\tpsubusb %%mm2, %%mm5\n" "\tpaddb %%mm5, %%mm4\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpminub %%mm3, %%mm2\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" "\tmovntq %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1, %%mm2\n" "\tmovd %2, %%mm3\n" "\tmovq 
%%mm2, %%mm4\n" "\tmovq %%mm3, %%mm5\n" "\tpsubusb %%mm3, %%mm4\n" "\tpsubusb %%mm2, %%mm5\n" "\tpaddb %%mm5, %%mm4\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpminub %%mm3, %%mm2\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" "\tmovd %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } asm("emms"); } void gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; asm volatile (" movq %0,%%mm0\n" "\tpxor %%mm6,%%mm6\n" "\tmovq %1,%%mm7\n" : /* empty */ : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64) : "%mm0", "%mm6", "%mm7"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1,%%mm2\n" "\tmovq %2,%%mm3\n" mmx_low_bytes_to_words(mm2,mm4,mm6) mmx_low_bytes_to_words(mm3,mm5,mm6) "\tpsubw %%mm5,%%mm4\n" "\tpaddw %%mm7,%%mm4\n" "\tmovq %%mm4,%%mm1\n" mmx_high_bytes_to_words(mm2,mm4,mm6) mmx_high_bytes_to_words(mm3,mm5,mm6) "\tpsubw %%mm5,%%mm4\n" "\tpaddw %%mm7,%%mm4\n" "\tpackuswb %%mm4,%%mm1\n" "\tmovq %%mm1,%%mm4\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" "\tpminub %%mm3,%%mm2\n" "\tpand %%mm0,%%mm2\n" "\tpor %%mm2,%%mm1\n" "\tmovntq %%mm1,%0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1, %%mm2\n" "\tmovd %2, %%mm3\n" mmx_low_bytes_to_words(mm2,mm4,mm6) mmx_low_bytes_to_words(mm3,mm5,mm6) "\tpsubw %%mm5, %%mm4\n" "\tpaddw %%mm7, %%mm4\n" "\tmovq %%mm4, %%mm1\n" "\tpackuswb %%mm6, %%mm1\n" "\tmovq %%mm1, %%mm4\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpminub %%mm3, %%mm2\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" "\tmovd %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); } asm("emms"); } void gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; asm volatile ("movq %0, %%mm0\n" "pxor %%mm6, %%mm6\n" "movq %1, %%mm7\n" : /* empty */ : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64) : "%mm0", "%mm6", "%mm7"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1, %%mm2\n" "\tmovq %2, %%mm3\n" mmx_low_bytes_to_words(mm2,mm4,mm6) mmx_low_bytes_to_words(mm3,mm5,mm6) "\tpaddw %%mm5, %%mm4\n" "\tpsubw %%mm7, %%mm4\n" mmx_high_bytes_to_words(mm2,mm1,mm6) mmx_high_bytes_to_words(mm3,mm5,mm6) "\tpaddw %%mm5, %%mm1\n" "\tpsubw %%mm7, %%mm1\n" "\tpackuswb %%mm1, %%mm4\n" "\t" pminub(mm3,mm2,mm5) "\n" "\tpand %%mm0, %%mm2\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpor %%mm2, %%mm1\n" "\tmovntq %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1, %%mm2\n" "\tmovd %2, %%mm3\n" mmx_low_bytes_to_words(mm2,mm4,mm6) mmx_low_bytes_to_words(mm3,mm5,mm6) "\tpaddw %%mm5, %%mm4\n" "\tpsubw %%mm7, %%mm4\n" "\tmovq %%mm4, %%mm1\n" "\tpackuswb %%mm6, %%mm1\n" "\tmovq %%mm1, %%mm4\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpminub %%mm3, %%mm2\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" "\tmovd %%mm1, %0\n" : "+m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); } asm("emms"); } void gimp_composite_lighten_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; 
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1, %%mm2\n" "\tmovq %2, %%mm3\n" "\tmovq %%mm2, %%mm4\n" "\tpmaxub %%mm3, %%mm4\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpminub %%mm2, %%mm3\n" "\tpand %%mm0, %%mm3\n" "\tpor %%mm3, %%mm1\n" "\tmovntq %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1, %%mm2\n" "\tmovd %2, %%mm3\n" "\tmovq %%mm2, %%mm4\n" "\tpmaxub %%mm3, %%mm4\n" "\tmovq %%mm0, %%mm1\n" "\tpandn %%mm4, %%mm1\n" "\tpminub %%mm2, %%mm3\n" "\tpand %%mm0, %%mm3\n" "\tpor %%mm3, %%mm1\n" "\tmovd %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } asm("emms"); } void gimp_composite_multiply_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1, %%mm2\n" "\tmovq %2, %%mm3\n" mmx_low_bytes_to_words(mm2,mm1,mm6) mmx_low_bytes_to_words(mm3,mm5,mm6) mmx_int_mult(mm5,mm1,mm7) mmx_high_bytes_to_words(mm2,mm4,mm6) mmx_high_bytes_to_words(mm3,mm5,mm6) mmx_int_mult(mm5,mm4,mm7) "\tpackuswb %%mm4, %%mm1\n" "\tmovq %%mm0, %%mm4\n" "\tpandn %%mm1, %%mm4\n" "\tmovq %%mm4, %%mm1\n" "\t" pminub(mm3,mm2,mm4) "\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" "\tmovntq %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1, %%mm2\n" "\tmovd %2, %%mm3\n" mmx_low_bytes_to_words(mm2,mm1,mm6) mmx_low_bytes_to_words(mm3,mm5,mm6) pmulwX(mm5,mm1,mm7) "\tpackuswb %%mm6, %%mm1\n" "\tmovq %%mm0, %%mm4\n" "\tpandn %%mm1, %%mm4\n" "\tmovq %%mm4, %%mm1\n" "\t" pminub(mm3,mm2,mm4) "\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" "\tmovd %%mm1, %0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } asm("emms"); } #if 0 static void sse_op_overlay(void) { asm volatile ( /* low bytes */ mmx_low_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ "\tmovq %0,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ /* high bytes */ mmx_high_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm1,%%mm1\n" "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ "\tmovq %0,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ mmx_low_bytes_to_words(mm4,mm5,mm0) mmx_low_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm5\n" mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ mmx_high_bytes_to_words(mm4,mm1,mm0) mmx_high_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm1\n" mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ "\tpackuswb %%mm1,%%mm5\n" "\tmovq %1,%%mm0\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm5,%%mm1\n" "\t" pminub(mm2,mm3,mm4) "\n" "\tpand %%mm0,%%mm3\n" "\tpor 
%%mm3,%%mm1\n" : /* empty */ : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) ); } void xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm volatile ("pxor %%mm0,%%mm0\n" "movq %0,%%mm7" : /* empty */ : "m" (*rgba8_w128_64) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) { asm volatile (" movq %1,%%mm2\n" "\tmovq %2,%%mm3\n" /* low bytes */ mmx_low_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ /* high bytes */ mmx_high_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm1,%%mm1\n" "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ mmx_low_bytes_to_words(mm4,mm5,mm0) mmx_low_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm5\n" mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ mmx_high_bytes_to_words(mm4,mm1,mm0) mmx_high_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm1\n" mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ "\tpackuswb %%mm1,%%mm5\n" "\tmovq %4,%%mm0\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm5,%%mm1\n" "\t" pminub(mm2,mm3,mm4) "\n" "\tpand %%mm0,%%mm3\n" "\tpor %%mm3,%%mm1\n" "\tmovq %%mm1,%0\n" : "=m" (*op.D) : "m" (*op.A), "m" (*op.B), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) : "%mm1", "%mm2", "%mm3", "%mm4"); op.A += 8; op.B += 8; op.D += 8; } if (op.n_pixels) { asm volatile (" movd %1,%%mm2\n" "\tmovd %2,%%mm3\n" /* low bytes */ mmx_low_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ /* high bytes */ mmx_high_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm1,%%mm1\n" "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ mmx_low_bytes_to_words(mm4,mm5,mm0) mmx_low_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm5\n" mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ mmx_high_bytes_to_words(mm4,mm1,mm0) mmx_high_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm1\n" mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ "\tpackuswb %%mm1,%%mm5\n" "\tmovq %4,%%mm0\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm5,%%mm1\n" "\t" pminub(mm2,mm3,mm4) "\n" "\tpand %%mm0,%%mm3\n" "\tpor %%mm3,%%mm1\n" "\tmovd %%mm1,%0\n" : "=m" (*op.D) : "m" (*op.A), "m" (*op.B), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) : "%mm1", "%mm2", "%mm3", "%mm4"); } asm("emms"); } #endif void gimp_composite_scale_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; /*uint64 *b = (uint64 *) _op->B;*/ gulong n_pixels = _op->n_pixels; asm volatile 
("pxor %%mm0,%%mm0\n" "\tmovl %0,%%eax\n" "\tmovl %%eax,%%ebx\n" "\tshl $16,%%ebx\n" "\torl %%ebx,%%eax\n" "\tmovd %%eax,%%mm5\n" "\tmovd %%eax,%%mm3\n" "\tpsllq $32,%%mm5\n" "\tpor %%mm5,%%mm3\n" "\tmovq %1,%%mm7\n" : /* empty */ : "m" (_op->scale.scale), "m" (*rgba8_w128_64) : "%eax", "%ebx", "%mm0", "%mm3", "%mm5", "%mm6", "%mm7"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile ("movq %1,%%mm2\n" "\tmovq %%mm2,%%mm1\n" "\tpunpcklbw %%mm0,%%mm1\n" "\tmovq %%mm3,%%mm5\n" "\t" pmulwX(mm5,mm1,mm7) "\n" "\tmovq %%mm2,%%mm4\n" "\tpunpckhbw %%mm0,%%mm4\n" "\tmovq %%mm3,%%mm5\n" "\t" pmulwX(mm5,mm4,mm7) "\n" "\tpackuswb %%mm4,%%mm1\n" "\tmovntq %%mm1,%0\n" : "=m" (*d) : "m" (*a) : "%mm1", "%mm2", "%mm4", "%mm5", "%mm7"); a++; d++; } if (n_pixels > 0) { asm volatile (" movd %1,%%mm2\n" "\tmovq %%mm2,%%mm1\n" "\tpunpcklbw %%mm0,%%mm1\n" "\tmovq %%mm3,%%mm5\n" "\t" pmulwX(mm5,mm1,mm7) "\n" "\tpackuswb %%mm0,%%mm1\n" "\tmovd %%mm1,%0\n" : "=m" (*d) : "m" (*a) : "%mm1", "%mm2", "%mm4", "%mm5", "%mm6", "%mm7"); } asm("emms"); } void gimp_composite_screen_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; asm volatile ("pxor %%mm6,%%mm6\n" "movq %0,%%mm0\n" "movq %1,%%mm7\n" : /* empty */ : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64) : "%mm0", "%mm6", "%mm7"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1,%%mm2\n" "\tmovq %2,%%mm3\n" "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" "\tpcmpeqb %%mm5,%%mm5\n" "\tpsubb %%mm3,%%mm5\n" "\tpunpcklbw %%mm6,%%mm4\n" "\tpunpcklbw %%mm6,%%mm5\n" "\tpmullw %%mm4,%%mm5\n" "\tpaddw %%mm7,%%mm5\n" "\tmovq %%mm5,%%mm1\n" "\tpsrlw $ 8,%%mm1\n" "\tpaddw %%mm5,%%mm1\n" "\tpsrlw $ 8,%%mm1\n" "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" "\tpcmpeqb %%mm5,%%mm5\n" "\tpsubb %%mm3,%%mm5\n" "\tpunpckhbw %%mm6,%%mm4\n" "\tpunpckhbw %%mm6,%%mm5\n" "\tpmullw %%mm4,%%mm5\n" "\tpaddw %%mm7,%%mm5\n" "\tmovq %%mm5,%%mm4\n" "\tpsrlw $ 8,%%mm4\n" "\tpaddw %%mm5,%%mm4\n" "\tpsrlw $ 8,%%mm4\n" "\tpackuswb %%mm4,%%mm1\n" "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm1,%%mm4\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" "\t" pminub(mm2,mm3,mm4) "\n" "\tpand %%mm0,%%mm3\n" "\tpor %%mm3,%%mm1\n" "\tmovq %%mm1,%0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1,%%mm2\n" "\tmovd %2,%%mm3\n" "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" "\tpcmpeqb %%mm5,%%mm5\n" "\tpsubb %%mm3,%%mm5\n" "\tpunpcklbw %%mm6,%%mm4\n" "\tpunpcklbw %%mm6,%%mm5\n" "\tpmullw %%mm4,%%mm5\n" "\tpaddw %%mm7,%%mm5\n" "\tmovq %%mm5,%%mm1\n" "\tpsrlw $ 8,%%mm1\n" "\tpaddw %%mm5,%%mm1\n" "\tpsrlw $ 8,%%mm1\n" "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm2,%%mm4\n" "\tpcmpeqb %%mm5,%%mm5\n" "\tpsubb %%mm3,%%mm5\n" "\tpunpckhbw %%mm6,%%mm4\n" "\tpunpckhbw %%mm6,%%mm5\n" "\tpmullw %%mm4,%%mm5\n" "\tpaddw %%mm7,%%mm5\n" "\tmovq %%mm5,%%mm4\n" "\tpsrlw $ 8,%%mm4\n" "\tpaddw %%mm5,%%mm4\n" "\tpsrlw $ 8,%%mm4\n" "\tpackuswb %%mm4,%%mm1\n" "\tpcmpeqb %%mm4,%%mm4\n" "\tpsubb %%mm1,%%mm4\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" "\t" pminub(mm2,mm3,mm4) "\n" "\tpand %%mm0,%%mm3\n" "\tpor %%mm3,%%mm1\n" "\tmovd %%mm1,%0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } asm("emms"); } void gimp_composite_subtract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *d = (uint64 *) _op->D; uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; 
gulong n_pixels = _op->n_pixels; asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %1,%%mm2\n" "\tmovq %2,%%mm3\n" "\tmovq %%mm2,%%mm4\n" "\tpsubusb %%mm3,%%mm4\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" "\tpminub %%mm3,%%mm2\n" "\tpand %%mm0,%%mm2\n" "\tpor %%mm2,%%mm1\n" "\tmovq %%mm1,%0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); a++; b++; d++; } if (n_pixels > 0) { asm volatile (" movd %1,%%mm2\n" "\tmovd %2,%%mm3\n" "\tmovq %%mm2,%%mm4\n" "\tpsubusb %%mm3,%%mm4\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" "\tpminub %%mm3,%%mm2\n" "\tpand %%mm0,%%mm2\n" "\tpor %%mm2,%%mm1\n" "\tmovd %%mm1,%0\n" : "=m" (*d) : "m" (*a), "m" (*b) : "%mm1", "%mm2", "%mm3", "%mm4"); } asm("emms"); } void gimp_composite_swap_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { uint64 *a = (uint64 *) _op->A; uint64 *b = (uint64 *) _op->B; gulong n_pixels = _op->n_pixels; for (; n_pixels >= 2; n_pixels -= 2) { asm volatile (" movq %0,%%mm2\n" "\tmovq %1,%%mm3\n" "\tmovq %%mm3,%0\n" "\tmovq %%mm2,%1\n" : "+m" (*a), "+m" (*b) : : "%mm1", "%mm2", "%mm3", "%mm4"); a++; b++; } if (n_pixels > 0) { asm volatile (" movd %0,%%mm2\n" "\tmovd %1,%%mm3\n" "\tmovd %%mm3,%0\n" "\tmovd %%mm2,%1\n" : "+m" (*a), "+m" (*b) : /* empty */ : "%mm1", "%mm2", "%mm3", "%mm4"); } asm("emms"); } #if 0 static const guint32 v8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00}; static const guint32 v8_mul_shift[2] = { 0x00800080, 0x00800080 }; void xxxgimp_composite_addition_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .add_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".add_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("paddusb %mm3, %mm4"); asm("movq %mm0, %mm1"); asm("pandn %mm4, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .add_pixels_1a_1a_loop"); asm(".add_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .add_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("paddusb %mm3, %mm4"); asm("movq %mm0, %mm1"); asm("pandn %mm4, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".add_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .add_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("movq %mm2, %mm4"); asm("paddusb %mm3, %mm4"); asm("movq %mm0, %mm1"); asm("pandn %mm4, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".add_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void xxxgimp_composite_burn_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("movq %0,%%mm1" : : "m" (*va8_alpha_mask) : "%mm1"); for (; op.n_pixels >= 4; op.n_pixels -= 4) { asm volatile (" movq (%0),%%mm0; addl $8,%0\n" "\tmovq (%1),%%mm1; addl $8,%1\n" 
"\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpcklbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm7) "\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpckhbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm6) "\n" "\tmovq %5,%%mm4\n" "\tmovq %%mm4,%%mm5\n" "\tpsubusw %%mm6,%%mm4\n" "\tpsubusw %%mm7,%%mm5\n" "\tpackuswb %%mm4,%%mm5\n" "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ "\tmovq %6,%%mm7\n" "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ "\tmovq %%mm7,(%2); addl $8,%2\n" : "+r" (op.A), "+r" (op.B), "+r" (op.D) : "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) : "%mm1", "%mm2", "%mm3", "%mm4"); } if (op.n_pixels) { asm volatile (" movd (%0),%%mm0\n" "\tmovd (%1),%%mm1\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpcklbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm7) "\n" "\tmovq %3,%%mm2\n" "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ "\tpxor %%mm4,%%mm4\n" "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ "\tmovq %%mm1,%%mm3\n" "\tpxor %%mm5,%%mm5\n" "\tpunpckhbw %%mm5,%%mm3\n" "\tmovq %4,%%mm5\n" "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ "\t" pdivwqX(mm4,mm5,mm6) "\n" "\tmovq %5,%%mm4\n" "\tmovq %%mm4,%%mm5\n" "\tpsubusw %%mm6,%%mm4\n" "\tpsubusw %%mm7,%%mm5\n" "\tpackuswb %%mm4,%%mm5\n" "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ "\tmovq %6,%%mm7\n" "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ "\tmovd %%mm7,(%2)\n" : /* empty */ : "r" (op.A), "r" (op.B), "r" (op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); } asm("emms"); } void xxxgimp_composite_coloronly_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void gimp_composite_darken_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .darken_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".darken_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("movq %mm2, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .darken_pixels_1a_1a_loop"); asm(".darken_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .darken_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("movq %mm2, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".darken_pixels_1a_1a_last1:"); asm("test $ 1, 
%ecx"); asm("jz .darken_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("movq %mm2, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".darken_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void gimp_composite_difference_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .difference_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".difference_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("movq %mm3, %mm5"); asm("psubusb %mm3, %mm4"); asm("psubusb %mm2, %mm5"); asm("movq %mm0, %mm1"); asm("paddb %mm5, %mm4"); asm("pandn %mm4, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .difference_pixels_1a_1a_loop"); asm(".difference_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .difference_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("movq %mm3, %mm5"); asm("psubusb %mm3, %mm4"); asm("psubusb %mm2, %mm5"); asm("movq %mm0, %mm1"); asm("paddb %mm5, %mm4"); asm("pandn %mm4, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".difference_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .difference_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("movq %mm2, %mm4"); asm("movq %mm3, %mm5"); asm("psubusb %mm3, %mm4"); asm("psubusb %mm2, %mm5"); asm("movq %mm0, %mm1"); asm("paddb %mm5, %mm4"); asm("pandn %mm4, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".difference_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void xxxgimp_composite_dissolve_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_divide_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_dodge_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_grainextract_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_grainmerge_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_hardlight_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_hueonly_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_lighten_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .lighten_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".lighten_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("paddb %mm4, %mm3"); asm("movq %mm0, %mm1"); asm("pandn %mm3, %mm1"); asm("psubb %mm4, %mm2"); 
asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .lighten_pixels_1a_1a_loop"); asm(".lighten_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .lighten_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("paddb %mm4, %mm3"); asm("movq %mm0, %mm1"); asm("pandn %mm3, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".lighten_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .lighten_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("paddb %mm4, %mm3"); asm("movq %mm0, %mm1"); asm("pandn %mm3, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".lighten_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void xxxgimp_composite_multiply_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .multiply_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".multiply_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("movq %mm2, %mm1"); asm("punpcklbw %mm6, %mm1"); asm("movq %mm3, %mm5"); asm("punpcklbw %mm6, %mm5"); asm("pmullw %mm5, %mm1"); asm("paddw %mm7, %mm1"); asm("movq %mm1, %mm5"); asm("psrlw $ 8, %mm5"); asm("paddw %mm5, %mm1"); asm("psrlw $ 8, %mm1"); asm("movq %mm2, %mm4"); asm("punpckhbw %mm6, %mm4"); asm("movq %mm3, %mm5"); asm("punpckhbw %mm6, %mm5"); asm("pmullw %mm5, %mm4"); asm("paddw %mm7, %mm4"); asm("movq %mm4, %mm5"); asm("psrlw $ 8, %mm5"); asm("paddw %mm5, %mm4"); asm("psrlw $ 8, %mm4"); asm("packuswb %mm4, %mm1"); asm("movq %mm0, %mm4"); asm("pandn %mm1, %mm4"); asm("movq %mm4, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .multiply_pixels_1a_1a_loop"); asm(".multiply_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .multiply_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("movq %mm2, %mm1"); asm("punpcklbw %mm6, %mm1"); asm("movq %mm3, %mm5"); asm("punpcklbw %mm6, %mm5"); asm("pmullw %mm5, %mm1"); asm("paddw %mm7, %mm1"); asm("movq %mm1, %mm5"); asm("psrlw $ 8, %mm5"); asm("paddw %mm5, %mm1"); asm("psrlw $ 8, %mm1"); asm("movq %mm2, %mm4"); asm("punpckhbw %mm6, %mm4"); asm("movq %mm3, %mm5"); asm("punpckhbw %mm6, %mm5"); asm("pmullw %mm5, %mm4"); asm("paddw %mm7, %mm4"); asm("movq %mm4, %mm5"); asm("psrlw $ 8, %mm5"); asm("paddw %mm5, %mm4"); asm("psrlw $ 8, %mm4"); asm("packuswb %mm4, %mm1"); asm("movq %mm0, %mm4"); asm("pandn %mm1, %mm4"); asm("movq %mm4, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".multiply_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .multiply_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); 
asm("movd %ebx, %mm3"); asm("movq %mm2, %mm1"); asm("punpcklbw %mm6, %mm1"); asm("movq %mm3, %mm5"); asm("punpcklbw %mm6, %mm5"); asm("pmullw %mm5, %mm1"); asm("paddw %mm7, %mm1"); asm("movq %mm1, %mm5"); asm("psrlw $ 8, %mm5"); asm("paddw %mm5, %mm1"); asm("psrlw $ 8, %mm1"); asm("movq %mm2, %mm4"); asm("punpckhbw %mm6, %mm4"); asm("movq %mm3, %mm5"); asm("punpckhbw %mm6, %mm5"); asm("pmullw %mm5, %mm4"); asm("paddw %mm7, %mm4"); asm("movq %mm4, %mm5"); asm("psrlw $ 8, %mm5"); asm("paddw %mm5, %mm4"); asm("psrlw $ 8, %mm4"); asm("packuswb %mm4, %mm1"); asm("movq %mm0, %mm4"); asm("pandn %mm1, %mm4"); asm("movq %mm4, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".multiply_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void gimp_composite_overlay_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .overlay_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".overlay_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("call op_overlay"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .overlay_pixels_1a_1a_loop"); asm(".overlay_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .overlay_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("call op_overlay"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".overlay_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .overlay_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("call op_overlay"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".overlay_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void xxxgimp_composite_replace_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_saturationonly_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_screen_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .screen_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".screen_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("pcmpeqb %mm4, %mm4"); asm("psubb %mm2, %mm4"); asm("pcmpeqb %mm5, %mm5"); asm("psubb %mm3, %mm5"); asm("movq %mm4, %mm1"); asm("punpcklbw %mm6, %mm1"); asm("movq %mm5, %mm3"); asm("punpcklbw %mm6, %mm3"); asm("pmullw %mm3, %mm1"); asm("paddw %mm7, %mm1"); asm("movq %mm1, %mm3"); asm("psrlw $ 8, %mm3"); asm("paddw %mm3, %mm1"); asm("psrlw $ 8, %mm1"); asm("movq %mm4, %mm2"); asm("punpckhbw %mm6, %mm2"); asm("movq %mm5, %mm3"); asm("punpckhbw %mm6, %mm3"); asm("pmullw %mm3, %mm2"); asm("paddw %mm7, %mm2"); asm("movq %mm2, %mm3"); asm("psrlw $ 8, %mm3"); asm("paddw %mm3, %mm2"); asm("psrlw $ 8, %mm2"); asm("packuswb %mm2, %mm1"); asm("pcmpeqb %mm3, %mm3"); asm("psubb %mm1, %mm3"); asm("movq %mm0, %mm1"); asm("pandn %mm3, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm5, %mm2"); asm("paddb %mm2, %mm5"); asm("pcmpeqb %mm3, %mm3"); asm("psubb %mm5, %mm3"); asm("pand %mm0, %mm3"); 
asm("por %mm3, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .screen_pixels_1a_1a_loop"); asm(".screen_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .screen_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); asm("pcmpeqb %mm4, %mm4"); asm("psubb %mm2, %mm4"); asm("pcmpeqb %mm5, %mm5"); asm("psubb %mm3, %mm5"); asm("movq %mm4, %mm1"); asm("punpcklbw %mm6, %mm1"); asm("movq %mm5, %mm3"); asm("punpcklbw %mm6, %mm3"); asm("pmullw %mm3, %mm1"); asm("paddw %mm7, %mm1"); asm("movq %mm1, %mm3"); asm("psrlw $ 8, %mm3"); asm("paddw %mm3, %mm1"); asm("psrlw $ 8, %mm1"); asm("movq %mm4, %mm2"); asm("punpckhbw %mm6, %mm2"); asm("movq %mm5, %mm3"); asm("punpckhbw %mm6, %mm3"); asm("pmullw %mm3, %mm2"); asm("paddw %mm7, %mm2"); asm("movq %mm2, %mm3"); asm("psrlw $ 8, %mm3"); asm("paddw %mm3, %mm2"); asm("psrlw $ 8, %mm2"); asm("packuswb %mm2, %mm1"); asm("pcmpeqb %mm3, %mm3"); asm("psubb %mm1, %mm3"); asm("movq %mm0, %mm1"); asm("pandn %mm3, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm5, %mm2"); asm("paddb %mm2, %mm5"); asm("pcmpeqb %mm3, %mm3"); asm("psubb %mm5, %mm3"); asm("pand %mm0, %mm3"); asm("por %mm3, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".screen_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .screen_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("pcmpeqb %mm4, %mm4"); asm("psubb %mm2, %mm4"); asm("pcmpeqb %mm5, %mm5"); asm("psubb %mm3, %mm5"); asm("movq %mm4, %mm1"); asm("punpcklbw %mm6, %mm1"); asm("movq %mm5, %mm3"); asm("punpcklbw %mm6, %mm3"); asm("pmullw %mm3, %mm1"); asm("paddw %mm7, %mm1"); asm("movq %mm1, %mm3"); asm("psrlw $ 8, %mm3"); asm("paddw %mm3, %mm1"); asm("psrlw $ 8, %mm1"); asm("movq %mm4, %mm2"); asm("punpckhbw %mm6, %mm2"); asm("movq %mm5, %mm3"); asm("punpckhbw %mm6, %mm3"); asm("pmullw %mm3, %mm2"); asm("paddw %mm7, %mm2"); asm("movq %mm2, %mm3"); asm("psrlw $ 8, %mm3"); asm("paddw %mm3, %mm2"); asm("psrlw $ 8, %mm2"); asm("packuswb %mm2, %mm1"); asm("pcmpeqb %mm3, %mm3"); asm("psubb %mm1, %mm3"); asm("movq %mm0, %mm1"); asm("pandn %mm3, %mm1"); asm("movq %mm2, %mm4"); asm("psubusb %mm5, %mm2"); asm("paddb %mm2, %mm5"); asm("pcmpeqb %mm3, %mm3"); asm("psubb %mm5, %mm3"); asm("pand %mm0, %mm3"); asm("por %mm3, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".screen_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void xxxgimp_composite_softlight_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_subtract_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); asm("subl $ 4, %ecx"); asm("jl .substract_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".substract_pixels_1a_1a_loop:"); asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("movq %mm0, %mm1"); asm("pandn %mm4, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movq %mm1, (%edi)"); asm("addl %ebx, %eax"); asm("addl %ebx, %edx"); asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .substract_pixels_1a_1a_loop"); asm(".substract_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .substract_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); 
asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("movq %mm0, %mm1"); asm("pandn %mm4, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); asm(".substract_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .substract_pixels_1a_1a_end"); asm("movw (%eax), %bx"); asm("movd %ebx, %mm2"); asm("movw (%edx), %bx"); asm("movd %ebx, %mm3"); asm("movq %mm2, %mm4"); asm("psubusb %mm3, %mm4"); asm("movq %mm0, %mm1"); asm("pandn %mm4, %mm1"); asm("psubb %mm4, %mm2"); asm("pand %mm0, %mm2"); asm("por %mm2, %mm1"); asm("movd %mm1, %ebx"); asm("movw %bx, (%edi)"); asm(".substract_pixels_1a_1a_end:"); asm("emms"); asm("popl %ebx"); asm("popl %edi"); } void xxxgimp_composite_swap_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } void xxxgimp_composite_valueonly_va8_va8_va8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; } #endif #endif /* COMPILE_SSE_IS_OKAY */