Print the list of active instruction sets if the --verbose command line

* app/composite/gimp-composite.c (gimp_composite_init): Print the
list of active instruction sets if the --verbose command line
switch is ON (via be_verbose).

* app/composite/gimp-composite-x86.h: Factored code from the mmx
and sse implementations.

* app/composite/make-installer.py: Raised the number of test
iterations from 1 to 10.

* app/composite/gimp-composite-3dnow.[ch]
* app/composite/gimp-composite-3dnow-test.c
* app/composite/gimp-composite-3dnow-installer.c
* app/composite/gimp-composite-altivec.[ch]
* app/composite/gimp-composite-altivec-test.c
* app/composite/gimp-composite-altivec-installer.c
* app/composite/gimp-composite-mmx.[ch]
* app/composite/gimp-composite-mmx-test.c
* app/composite/gimp-composite-mmx-installer.c
* app/composite/gimp-composite-sse.[ch]
* app/composite/gimp-composite-sse-test.c
* app/composite/gimp-composite-sse-installer.c
* app/composite/gimp-composite-sse2.[ch]
* app/composite/gimp-composite-sse2-test.c
* app/composite/gimp-composite-sse2-installer.c
* app/composite/gimp-composite-vis.[ch]
* app/composite/gimp-composite-vis-test.c:
Regenerated sources via make-installer.py
Author: Helvetix Victorinox
Date:   2004-07-20 15:59:12 +0000
Commit: 54630be219 (parent: 03b3f8c90f)

21 changed files with 505 additions and 384 deletions

ChangeLog

@@ -1,3 +1,35 @@
+2004-07-20  Helvetix Victorinox  <helvetix@gimp.org>
+
+	* app/composite/gimp-composite.c (gimp_composite_init): Print the
+	list of active instruction sets if the --verbose command line
+	switch is ON (via be_verbose).
+
+	* app/composite/gimp-composite-x86.h: Factored code from the mmx
+	and sse implementations.
+
+	* app/composite/make-installer.py: Raised the number of test
+	iterations from 1 to 10.
+
+	* app/composite/gimp-composite-3dnow.[ch]
+	* app/composite/gimp-composite-3dnow-test.c
+	* app/composite/gimp-composite-3dnow-installer.c
+	* app/composite/gimp-composite-altivec.[ch]
+	* app/composite/gimp-composite-altivec-test.c
+	* app/composite/gimp-composite-altivec-installer.c
+	* app/composite/gimp-composite-mmx.[ch]
+	* app/composite/gimp-composite-mmx-test.c
+	* app/composite/gimp-composite-mmx-installer.c
+	* app/composite/gimp-composite-sse.[ch]
+	* app/composite/gimp-composite-sse-test.c
+	* app/composite/gimp-composite-sse-installer.c
+	* app/composite/gimp-composite-sse2.[ch]
+	* app/composite/gimp-composite-sse2-test.c
+	* app/composite/gimp-composite-sse2-installer.c
+	* app/composite/gimp-composite-vis.[ch]
+	* app/composite/gimp-composite-vis-test.c:
+	Regenerated sources via make-installer.py
+
 2004-07-20  Sven Neumann  <sven@gimp.org>
 
 	* app/app_procs.c

app/composite/gimp-composite-3dnow-test.c

@@ -19,7 +19,7 @@
 int
 gimp_composite_3dnow_test (int iterations, int n_pixels)
 {
-#if (__GNUC__ >= 3) && defined(USE_3DNOW) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_3DNOW_IS_OKAY)
   GimpCompositeContext generic_ctx;
   GimpCompositeContext special_ctx;
   double ft0;
@@ -78,7 +78,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
-  iterations = 1;
+  iterations = 10;
   n_pixels = 1048593;
   argv++, argc--;
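
Note: each generated *-test.c driver times the generic implementation
against the accelerated one over `iterations' passes, so raising the
default from 1 to 10 mainly averages out timer and cache noise. A rough
sketch of such a timing loop (illustrative only -- helper name and
structure assumed, this is not the generated code):

  #include <time.h>

  typedef struct _GimpCompositeContext GimpCompositeContext;

  /* Mean seconds per call of one composite function over `iterations' runs. */
  static double
  time_composite (void (*fn) (GimpCompositeContext *),
                  GimpCompositeContext *ctx,
                  int                   iterations)
  {
    clock_t start = clock ();
    int     i;

    for (i = 0; i < iterations; i++)
      fn (ctx);

    return ((double) (clock () - start) / CLOCKS_PER_SEC) / iterations;
  }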

app/composite/gimp-composite-3dnow.c

@@ -31,22 +31,17 @@
 #include "base/cpu-accel.h"
 #include "gimp-composite.h"
 #include "gimp-composite-3dnow.h"
-#if defined(USE_MMX)
-#if defined(ARCH_X86)
-#if __GNUC__ >= 3
-#if defined(ARCH_X86_64) || !defined(PIC)
-#endif /* ARCH_X86_64 || !PIC */
-#endif /* __GNUC__ > 3 */
-#endif /* ARCH_X86 */
-#endif /* USE_MMX */
+#ifdef COMPILE_3DNOW_IS_OKAY
+#endif
 gboolean
 gimp_composite_3dnow_init (void)
 {
-#if defined(USE_MMX) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#ifdef COMPILE_3DNOW_IS_OKAY
   if (cpu_accel () & CPU_ACCEL_X86_3DNOW)
     {
       return (TRUE);
app/composite/gimp-composite-3dnow.h

@@ -9,4 +9,16 @@ extern gboolean gimp_composite_3dnow_init (void);
  */
 extern gboolean gimp_composite_3dnow_install (void);
+
+#if !defined(__INTEL_COMPILER)
+#if defined(USE_MMX)
+#if defined(ARCH_X86)
+#if __GNUC__ >= 3
+#if defined(ARCH_X86_64) || !defined(PIC)
+#define COMPILE_3DNOW_IS_OKAY (1)
+#endif /* ARCH_X86_64 || !PIC */
+#endif /* __GNUC__ > 3 */
+#endif /* ARCH_X86 */
+#endif /* USE_MMX */
+#endif /* !defined(__INTEL_COMPILER) */
 #endif
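
The same two-level gate recurs for every backend below: the header decides
once, at compile time, whether the toolchain/target can build the
accelerated code, and the .c file adds the run-time CPU check. Condensed
into one sketch (equivalent to, but not verbatim from, the files above):

  /* Header: collapse all compile-time conditions into one macro. */
  #if !defined(__INTEL_COMPILER) && defined(USE_MMX) && defined(ARCH_X86) && \
      (__GNUC__ >= 3) && (defined(ARCH_X86_64) || !defined(PIC))
  #define COMPILE_3DNOW_IS_OKAY (1)
  #endif

  /* Implementation: compile-time gate plus run-time CPU capability check. */
  gboolean
  gimp_composite_3dnow_init (void)
  {
  #ifdef COMPILE_3DNOW_IS_OKAY
    if (cpu_accel () & CPU_ACCEL_X86_3DNOW)
      return (TRUE);
  #endif
    return (FALSE);
  }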

app/composite/gimp-composite-altivec-test.c

@@ -19,7 +19,7 @@
 int
 gimp_composite_altivec_test (int iterations, int n_pixels)
 {
-#if (__GNUC__ >= 3) && defined(USE_ALTIVEC) && defined(ARCH_PPC)
+#if defined(COMPILE_ALTIVEC_IS_OKAY)
   GimpCompositeContext generic_ctx;
   GimpCompositeContext special_ctx;
   double ft0;
@@ -78,7 +78,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
-  iterations = 1;
+  iterations = 10;
   n_pixels = 1048593;
   argv++, argc--;

app/composite/gimp-composite-altivec.c

@@ -30,16 +30,14 @@
 #include "gimp-composite.h"
 #include "gimp-composite-altivec.h"
-#ifdef ARCH_PPC
-#if __GNUC__ >= 3
-#endif /* __GNUC__ > 3 */
-#endif /* ARCH_PPC */
+#ifdef COMPILE_ALTIVEC_IS_OKAY
+#endif
 gboolean
 gimp_composite_altivec_init (void)
 {
-#ifdef ARCH_PPC
+#ifdef COMPILE_ALTIVEC_IS_OKAY
   if (cpu_accel () & CPU_ACCEL_PPC_ALTIVEC)
     {
       return (TRUE);

app/composite/gimp-composite-altivec.h

@@ -9,4 +9,10 @@ extern gboolean gimp_composite_altivec_init (void);
  */
 extern gboolean gimp_composite_altivec_install (void);
+
+#ifdef ARCH_PPC
+#if __GNUC__ >= 3
+#define COMPILE_ALTIVEC_IS_OKAY (1)
+#endif /* __GNUC__ > 3 */
+#endif /* ARCH_PPC */
 #endif

app/composite/gimp-composite-mmx-installer.c

@@ -16,7 +16,7 @@ static struct install_table {
   GimpPixelFormat D;
   void (*function)(GimpCompositeContext *);
 } _gimp_composite_mmx[] = {
-#if (__GNUC__ >= 3) && defined(USE_MMX) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_MMX_IS_OKAY)
 { GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_multiply_rgba8_rgba8_rgba8_mmx },
 { GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_screen_rgba8_rgba8_rgba8_mmx },
 { GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_mmx },

app/composite/gimp-composite-mmx-test.c

@@ -19,7 +19,7 @@
 int
 gimp_composite_mmx_test (int iterations, int n_pixels)
 {
-#if (__GNUC__ >= 3) && defined(USE_MMX) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_MMX_IS_OKAY)
   GimpCompositeContext generic_ctx;
   GimpCompositeContext special_ctx;
   double ft0;
@@ -210,7 +210,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
-  iterations = 1;
+  iterations = 10;
   n_pixels = 1048593;
   argv++, argc--;

app/composite/gimp-composite-mmx.c

@@ -62,19 +62,19 @@ debug_display_mmx(void)
   printf("--------------------------------------------\n");
 }
-static const guint32 rgba8_alpha_mask[2] = { 0xFF000000, 0xFF000000 };
-static const guint32 rgba8_b1[2] = { 0x01010101, 0x01010101 };
-static const guint32 rgba8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
-static const guint32 rgba8_w1[2] = { 0x00010001, 0x00010001 };
-static const guint32 rgba8_w2[2] = { 0x00020002, 0x00020002 };
-static const guint32 rgba8_w128[2] = { 0x00800080, 0x00800080 };
-static const guint32 rgba8_w256[2] = { 0x01000100, 0x01000100 };
-static const guint32 rgba8_w255[2] = { 0X00FF00FF, 0X00FF00FF };
-static const guint32 va8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00 };
-static const guint32 va8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
-static const guint32 va8_w1[2] = { 0x00010001, 0x00010001 };
-static const guint32 va8_w255[2] = { 0x00FF00FF, 0x00FF00FF };
+const guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 };
+const guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 };
+const guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
+const guint32 rgba8_w1_64[2] = { 0x00010001, 0x00010001 };
+const guint32 rgba8_w2_64[2] = { 0x00020002, 0x00020002 };
+const guint32 rgba8_w128_64[2] = { 0x00800080, 0x00800080 };
+const guint32 rgba8_w256_64[2] = { 0x01000100, 0x01000100 };
+const guint32 rgba8_w255_64[2] = { 0X00FF00FF, 0X00FF00FF };
+const guint32 va8_alpha_mask_64[2] = { 0xFF00FF00, 0xFF00FF00 };
+const guint32 va8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
+const guint32 va8_w1_64[2] = { 0x00010001, 0x00010001 };
+const guint32 va8_w255_64[2] = { 0x00FF00FF, 0x00FF00FF };
 /*
  *
@@ -89,13 +89,13 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
   asm volatile ("movq %0,%%mm0"
                 : /* empty */
-                : "m" (*rgba8_alpha_mask)
+                : "m" (*rgba8_alpha_mask_64)
                 : "%mm0");
   for (; n_pixels >= 2; n_pixels -= 2)
     {
      asm volatile ("  movq %1, %%mm2\n"
                    "\tmovq %2, %%mm3\n"
                    "\tmovq %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq %%mm0, %%mm1\n"
@@ -179,7 +179,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
-                   "\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask */
+                   "\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask_64 */
                    "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
                    "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
@@ -187,7 +187,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovq %%mm7,%0\n"
                    : "=m" (*d)
-                   : "m" (*a), "m" (*b), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask)
+                   : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      d++;
      b++;
@@ -241,7 +241,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovd %%mm7,%0\n"
                    : "=m" (*d)
-                   : "m" (*a), "m" (*b), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask)
+                   : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }
@@ -293,7 +293,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
   uint64 *b = (uint64 *) _op->B;
   gulong n_pixels = _op->n_pixels;
-  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
+  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
   for (; n_pixels >= 2; n_pixels -= 2)
     {
@@ -318,7 +318,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
      d++;
    }
-  if (n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("  movd %1, %%mm2\n"
                    "\tmovd %2, %%mm3\n"
@@ -352,7 +352,7 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
   asm volatile ("  movq %0, %%mm0\n"
                 "\tmovq %1, %%mm7\n"
                 :
-                : "m" (*rgba8_alpha_mask), "m" (*rgba8_w1)
+                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
                 : "%mm0", "%mm7");
   for (; n_pixels >= 2; n_pixels -= 2)
@@ -382,7 +382,7 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
                    "\t" pminub(mm0,mm1,mm3) "\n"
                    "\tmovq %3,%%mm3\n"
                    "\tmovq %%mm3,%%mm2\n"
                    "\tpandn %%mm5,%%mm3\n"
@@ -392,14 +392,14 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovq %%mm3,%0\n"
                    : "=m" (*d)
-                   : "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask)
-                   : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                   : "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask_64)
+                   : pdivwuqX_clobber, "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      a++;
      b++;
      d++;
    }
-  if (n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("  movd %1,%%mm0\n"
                    "\tmovd %2,%%mm1\n"
@@ -436,8 +436,8 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovd %%mm3,%0\n"
                    : "=m" (*d)
-                   : "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask)
-                   : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                   : "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask_64)
+                   : pdivwuqX_clobber, "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
  asm("emms");
@@ -487,14 +487,14 @@ xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovq %%mm7,%0\n"
                    : "=m" (*d)
-                   : "m" (*a), "m" (*b), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask)
+                   : "m" (*a), "m" (*b), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwuqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      a++;
      b++;
      d++;
    }
-  if (n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("  movd %0,%%mm0\n"
                    "\tmovq %1,%%mm1\n"
@@ -530,7 +530,7 @@ xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovd %%mm7,%2\n"
                    : /* empty */
-                   : "m" (*a), "m" (*b), "m" (*d), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask)
+                   : "m" (*a), "m" (*b), "m" (*d), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwuqX_clobber, "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
@@ -540,16 +540,22 @@ xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 void
 gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
-  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
-  asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6");
-  asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7");
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
+  asm volatile ("movq %0,%%mm0\n"
+                "pxor %%mm6,%%mm6\n"
+                "movq %1,%%mm7\n"
+                : /* no outputs */
+                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
+                : "%mm0", "%mm7", "%mm6");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm volatile ("  movq %0,%%mm2\n"
-                    "\tmovq %1,%%mm3\n"
+      asm volatile ("  movq %1,%%mm2\n"
+                    "\tmovq %2,%%mm3\n"
                     mmx_low_bytes_to_words(mm2,mm4,mm6)
                     mmx_low_bytes_to_words(mm3,mm5,mm6)
                     "\tpsubw %%mm5,%%mm4\n"
@@ -572,19 +578,19 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpand %%mm0,%%mm2\n"
                    "\tpor %%mm2,%%mm1\n"
-                   "\tmovq %%mm1,%2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   "\tmovq %%mm1,%0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0, %%mm2\n"
-                    "\tmovd %1, %%mm3\n"
+      asm volatile ("  movd %1, %%mm2\n"
+                    "\tmovd %2, %%mm3\n"
                     mmx_low_bytes_to_words(mm2,mm4,mm6)
                     mmx_low_bytes_to_words(mm3,mm5,mm6)
@@ -603,10 +609,10 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
-                   "\tmovd %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+                   "\tmovd %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
    }
  asm("emms");
@@ -615,19 +621,22 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 void
 gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
   asm volatile ("movq %0, %%mm0\n"
                 "pxor %%mm6, %%mm6\n"
                 "movq %1, %%mm7\n"
                 : /* empty */
-                : "m" (*rgba8_alpha_mask), "m" (*rgba8_w128)
+                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
                 : "%mm0", "%mm6", "%mm7");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm volatile ("  movq %0, %%mm2\n"
-                    "\tmovq %1, %%mm3\n"
+      asm volatile ("  movq %1, %%mm2\n"
+                    "\tmovq %2, %%mm3\n"
                     mmx_low_bytes_to_words(mm2,mm4,mm6)
                     mmx_low_bytes_to_words(mm3,mm5,mm6)
@@ -647,19 +656,19 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tmovq %%mm0, %%mm1\n"
                    "\tpandn %%mm4, %%mm1\n"
                    "\tpor %%mm2, %%mm1\n"
-                   "\tmovq %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   "\tmovq %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0, %%mm2\n"
-                    "\tmovd %1, %%mm3\n"
+      asm volatile ("  movd %1, %%mm2\n"
+                    "\tmovd %2, %%mm3\n"
                     mmx_low_bytes_to_words(mm2,mm4,mm6)
                     mmx_low_bytes_to_words(mm3,mm5,mm6)
@@ -677,10 +686,10 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
-                   "\tmovd %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+                   "\tmovd %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
    }
  asm("emms");
@@ -689,14 +698,17 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 void
 gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
-  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
+  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm volatile ("  movq %0, %%mm2\n"
-                    "\tmovq %1, %%mm3\n"
+      asm volatile ("  movq %1, %%mm2\n"
+                    "\tmovq %2, %%mm3\n"
                     "\tmovq %%mm2, %%mm4\n"
                     "\t" pmaxub(mm3,mm4,mm5) "\n"
                     "\tmovq %%mm0, %%mm1\n"
@@ -704,34 +716,34 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\t" pminub(mm2,mm3,mm4) "\n"
                    "\tpand %%mm0, %%mm3\n"
                    "\tpor %%mm3, %%mm1\n"
-                   "\tmovq %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   "\tmovq %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0, %%mm2\n"
-                    "\tmovd %1, %%mm3\n"
+      asm volatile ("  movd %1, %%mm2\n"
+                    "\tmovd %2, %%mm3\n"
                     "\tmovq %%mm2, %%mm4\n"
                     "\t" pmaxub(mm3,mm4,mm5) "\n"
                     "\tmovq %%mm0, %%mm1\n"
                     "\tpandn %%mm4, %%mm1\n"
                     "\t" pminub(mm2,mm3,mm4) "\n"
                     "\tpand %%mm0, %%mm3\n"
                     "\tpor %%mm3, %%mm1\n"
-                   "\tmovd %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                   "\tmovd %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
  asm("emms");
 }
@@ -739,16 +751,23 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 void
 gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
-  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
-  asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7");
-  asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6");
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
+  asm volatile (
+                "movq %0,%%mm0\n"
+                "movq %1,%%mm7\n"
+                "pxor %%mm6,%%mm6\n"
+                : /* empty */
+                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
+                : "%mm6", "%mm7", "%mm0");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm volatile ("  movq %0, %%mm2\n"
-                    "\tmovq %1, %%mm3\n"
+      asm volatile ("  movq %1, %%mm2\n"
+                    "\tmovq %2, %%mm3\n"
                     mmx_low_bytes_to_words(mm2,mm1,mm6)
                     mmx_low_bytes_to_words(mm3,mm5,mm6)
@@ -767,37 +786,37 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
-                   "\tmovq %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   "\tmovq %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0, %%mm2\n"
-                    "\tmovd %1, %%mm3\n"
+      asm volatile ("  movd %1, %%mm2\n"
+                    "\tmovd %2, %%mm3\n"
                     mmx_low_bytes_to_words(mm2,mm1,mm6)
                     mmx_low_bytes_to_words(mm3,mm5,mm6)
                     pmulwX(mm5,mm1,mm7)
                     "\tpackuswb %%mm6, %%mm1\n"
                     "\tmovq %%mm0, %%mm4\n"
                     "\tpandn %%mm1, %%mm4\n"
                     "\tmovq %%mm4, %%mm1\n"
                     "\t" pminub(mm3,mm2,mm4) "\n"
                     "\tpand %%mm0, %%mm2\n"
                     "\tpor %%mm2, %%mm1\n"
-                   "\tmovd %%mm1, %2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                   "\tmovd %%mm1, %0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
  asm("emms");
@@ -849,21 +868,24 @@ mmx_op_overlay(void)
                    "\tpor %%mm3,%%mm1\n"
                    : /* empty */
-                   : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
+                   : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
                    );
 }
 void
 xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
   asm volatile ("pxor %%mm0,%%mm0\n"
                 "movq %0,%%mm7"
                 : /* empty */
-                : "m" (*rgba8_w128) : "%mm0");
+                : "m" (*rgba8_w128_64) : "%mm0");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movq %0,%%mm2\n"
                    "\tmovq %1,%%mm3\n"
@@ -910,25 +932,25 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpor %%mm3,%%mm1\n"
                    "\tmovq %%mm1,%2\n"
-                   : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D)
-                   : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   : "+m" (*a), "+m" (*b), "+m" (*d)
+                   : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0,%%mm2\n"
-                    "\tmovd %1,%%mm3\n"
+      asm volatile ("  movd %1,%%mm2\n"
+                    "\tmovd %2,%%mm3\n"
                     /* low bytes */
                     mmx_low_bytes_to_words(mm3,mm5,mm0)
                     "\tpcmpeqb %%mm4,%%mm4\n"
                     "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */
                     "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
-                    "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */
+                    "\tmovq %3,%%mm6\n" /* mm6 = words of integer value 2 */
                     "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
                     mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */
@@ -937,7 +959,7 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpcmpeqb %%mm1,%%mm1\n"
                    "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */
                    "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
-                   "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */
+                   "\tmovq %3,%%mm6\n" /* mm6 = words of integer value 2 */
                    "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
                    mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */
@@ -964,10 +986,10 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpor %%mm3,%%mm1\n"
-                   "\tmovd %%mm1,%2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+                   "\tmovd %%mm1,%0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
    }
  asm("emms");
@@ -977,23 +999,25 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 void
 gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  gulong n_pixels = _op->n_pixels;
   asm volatile ("pxor %%mm0,%%mm0\n"
                 "\tmovl %0,%%eax\n"
                 "\tmovl %%eax,%%ebx\n"
                 "\tshl $16,%%ebx\n"
                 "\torl %%ebx,%%eax\n"
                 "\tmovd %%eax,%%mm5\n"
                 "\tmovd %%eax,%%mm3\n"
                 "\tpsllq $32,%%mm5\n"
                 "\tpor %%mm5,%%mm3\n"
                 "\tmovq %1,%%mm7\n"
                 : /* empty */
-                : "m" (op.scale.scale), "m" (*rgba8_w128)
-                : "%eax", "%mm0", "%mm5", "%mm6", "%mm7");
+                : "m" (_op->scale.scale), "m" (*rgba8_w128_64)
+                : "%eax", "%ebx", "%mm0", "%mm5", "%mm6", "%mm7");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("movq %1,%%mm2\n"
                    "\tmovq %%mm2,%%mm1\n"
@@ -1010,15 +1034,15 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpackuswb %%mm4,%%mm1\n"
                    "\tmovq %%mm1,%0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A)
+                   : "=m" (*d)
+                   : "m" (*a)
                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
-      op.A += 8;
-      op.D += 8;
+      a++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("movd %1,%%mm2\n"
                    "\tmovq %%mm2,%%mm1\n"
@@ -1029,8 +1053,8 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpackuswb %%mm0,%%mm1\n"
                    "\tmovd %%mm1,%0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A)
+                   : "=m" (*d)
+                   : "m" (*a)
                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }
@@ -1040,16 +1064,22 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 void
 gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
-  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
-  asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7");
-  asm volatile ("pxor %mm6, %mm6");
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
+  asm volatile ("pxor %%mm6,%%mm6\n"
+                "movq %0,%%mm0\n"
+                "movq %1,%%mm7\n"
+                : /* empty */
+                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
+                : "%mm0", "%mm6", "%mm7");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm volatile ("  movq %0,%%mm2\n"
-                    "\tmovq %1,%%mm3\n"
+      asm volatile ("  movq %1,%%mm2\n"
+                    "\tmovq %2,%%mm3\n"
                     "\tpcmpeqb %%mm4,%%mm4\n"
                     "\tpsubb %%mm2,%%mm4\n"
@@ -1092,79 +1122,82 @@ gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpor %%mm3,%%mm1\n"
-                   "\tmovq %%mm1,%2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   "\tmovq %%mm1,%0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0,%%mm2\n"
-                    "\tmovd %1,%%mm3\n"
+      asm volatile ("  movd %1,%%mm2\n"
+                    "\tmovd %2,%%mm3\n"
                     "\tpcmpeqb %%mm4,%%mm4\n"
                     "\tpsubb %%mm2,%%mm4\n"
                     "\tpcmpeqb %%mm5,%%mm5\n"
                     "\tpsubb %%mm3,%%mm5\n"
                     "\tpunpcklbw %%mm6,%%mm4\n"
                     "\tpunpcklbw %%mm6,%%mm5\n"
                     "\tpmullw %%mm4,%%mm5\n"
                     "\tpaddw %%mm7,%%mm5\n"
                     "\tmovq %%mm5,%%mm1\n"
                     "\tpsrlw $ 8,%%mm1\n"
                     "\tpaddw %%mm5,%%mm1\n"
                     "\tpsrlw $ 8,%%mm1\n"
                     "\tpcmpeqb %%mm4,%%mm4\n"
                     "\tpsubb %%mm2,%%mm4\n"
                     "\tpcmpeqb %%mm5,%%mm5\n"
                     "\tpsubb %%mm3,%%mm5\n"
                     "\tpunpckhbw %%mm6,%%mm4\n"
                     "\tpunpckhbw %%mm6,%%mm5\n"
                     "\tpmullw %%mm4,%%mm5\n"
                     "\tpaddw %%mm7,%%mm5\n"
                     "\tmovq %%mm5,%%mm4\n"
                     "\tpsrlw $ 8,%%mm4\n"
                     "\tpaddw %%mm5,%%mm4\n"
                     "\tpsrlw $ 8,%%mm4\n"
                     "\tpackuswb %%mm4,%%mm1\n"
                     "\tpcmpeqb %%mm4,%%mm4\n"
                     "\tpsubb %%mm1,%%mm4\n"
                     "\tmovq %%mm0,%%mm1\n"
                     "\tpandn %%mm4,%%mm1\n"
                     "\t" pminub(mm2,mm3,mm4) "\n"
                     "\tpand %%mm0,%%mm3\n"
                     "\tpor %%mm3,%%mm1\n"
-                   "\tmovd %%mm1,%2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                   "\tmovd %%mm1,%0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
-  asm("emms");
+  asm volatile ("emms");
 }
 void
 gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
-  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
+  asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movq %1,%%mm2\n"
                    "\tmovq %2,%%mm3\n"
@@ -1180,65 +1213,68 @@ gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                    "\tpand %%mm0,%%mm2\n"
                    "\tpor %%mm2,%%mm1\n"
                    "\tmovq %%mm1,%0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A), "m" (*op.B)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
-      asm volatile ("  movd %0,%%mm2\n"
-                    "\tmovd %1,%%mm3\n"
+      asm volatile ("  movd %1,%%mm2\n"
                    "\tmovd %2,%%mm3\n"
                     "\tmovq %%mm2,%%mm4\n"
                     "\tpsubusb %%mm3,%%mm4\n"
                     "\tmovq %%mm0,%%mm1\n"
                     "\tpandn %%mm4,%%mm1\n"
                     "\t" pminub(mm3,mm2,mm4) "\n"
                     "\tpand %%mm0,%%mm2\n"
                     "\tpor %%mm2,%%mm1\n"
-                   "\tmovd %%mm1,%2\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                   "\tmovd %%mm1,%0\n"
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
-  asm("emms");
+  asm volatile ("emms");
 }
 void
 gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movq %0,%%mm2\n"
                    "\tmovq %1,%%mm3\n"
                    "\tmovq %%mm3,%0\n"
                    "\tmovq %%mm2,%1\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B)
-                   : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4");
-      op.A += 8;
-      op.B += 8;
+                   : "+m" (*a), "+m" (*b)
+                   :
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
+      a++;
+      b++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("  movd %0,%%mm2\n"
                    "\tmovd %1,%%mm3\n"
                    "\tmovd %%mm3,%0\n"
                    "\tmovd %%mm2,%1\n"
-                   : /* empty */
-                   : "m" (*op.A), "m" (*op.B)
-                   : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4");
+                   : "+m" (*a), "+m" (*b)
+                   :
+                   : "%mm1", "%mm2", "%mm3", "%mm4");
    }
  asm("emms");
@@ -1388,7 +1424,7 @@ gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op)
                    "\tmovq %%mm7,%2\n"
                    : /* empty */
-                   : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask)
+                   : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
      op.A += 8;
      op.B += 8;
@@ -1441,7 +1477,7 @@ gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op)
                    "\tmovd %%mm7,%2\n"
                    : /* empty */
-                   : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask)
+                   : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
                    : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }

app/composite/gimp-composite-sse-installer.c

@@ -16,7 +16,7 @@ static struct install_table {
   GimpPixelFormat D;
   void (*function)(GimpCompositeContext *);
 } _gimp_composite_sse[] = {
-#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_SSE_IS_OKAY)
 { GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_multiply_rgba8_rgba8_rgba8_sse },
 { GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_screen_rgba8_rgba8_rgba8_sse },
 { GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_sse },

app/composite/gimp-composite-sse-test.c

@@ -19,7 +19,7 @@
 int
 gimp_composite_sse_test (int iterations, int n_pixels)
 {
-#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_SSE_IS_OKAY)
   GimpCompositeContext generic_ctx;
   GimpCompositeContext special_ctx;
   double ft0;
@@ -210,7 +210,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
-  iterations = 1;
+  iterations = 10;
   n_pixels = 1048593;
   argv++, argc--;

app/composite/gimp-composite-sse.c

@@ -48,6 +48,7 @@
 #define pminub(src,dst,tmp) "pminub " "%%" #src ", %%" #dst
 #define pmaxub(src,dst,tmp) "pmaxub " "%%" #src ", %%" #dst
+#if 0
 /*
  * Double-word divide. Adjusted for subsequent unsigned packing
  * (high-order bit of each word is cleared)
@@ -65,7 +66,8 @@
   "roll $16, %%eax; " \
   "btr $15, %%eax; " \
   "movd %%eax,%%" #quotient ";"
+#endif
+#if 0
 /*
  * Quadword divide. No adjustment for subsequent unsigned packing
  * (high-order bit of each word is left alone)
@@ -107,7 +109,8 @@
   "movd %%eax,%%" #divisor ";" \
   "psllq $32,%%" #divisor ";" \
   "por %%" #divisor ",%%" #quotient ";"
+#endif
+#if 0
 /* equivalent to the INT_MULT() macro in gimp-composite-generic.c */
 /*
  * opr2 = INT_MULT(opr1, opr2, t)
@@ -126,7 +129,8 @@
   "\tpsrlw $8, %%"#opr2"; " \
   "\tpaddw %%"#opr1", %%"#opr2"; " \
   "\tpsrlw $8, %%"#opr2"\n"
+#endif
+#if 0
 /* a = INT_MULT(a,b) */
 #define mmx_int_mult(a,b,w128) \
   "\tpmullw %%"#b", %%"#a"; " \
@@ -135,7 +139,9 @@
   "\tpsrlw $8, %%"#b"; " \
   "\tpaddw %%"#a", %%"#b"; " \
   "\tpsrlw $8, %%"#b"\n"
+#endif
+
+#if 0
 static const guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 };
 static const guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 };
 static const guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
@@ -149,6 +155,7 @@ static const guint32 va8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00 };
 static const guint32 va8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
 static const guint32 va8_w1[2] = { 0x00010001, 0x00010001 };
 static const guint32 va8_w255[2] = { 0x00FF00FF, 0x00FF00FF };
+#endif
 /*
  *
@@ -156,48 +163,51 @@ static const guint32 va8_w255[2] = { 0x00FF00FF, 0x00FF00FF };
 void
 gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
   asm volatile ("movq %0,%%mm0"
                 : /* empty */
                 : "m" (*rgba8_alpha_mask_64)
                 : "%mm0");
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm ("  movq %1, %%mm2\n"
+      asm volatile ("  movq %1, %%mm2\n"
                    "\tmovq %2, %%mm3\n"
                    "\tmovq %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq %%mm0, %%mm1\n"
                    "\tpandn %%mm4, %%mm1\n"
-                   "\tpminub %%mm3, %%mm2\n"
+                   "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
                    "\tmovq %%mm1, %0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A), "m" (*op.B)
-                   : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
+      a++;
+      b++;
+      d++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("  movd %1, %%mm2\n"
                    "\tmovd %2, %%mm3\n"
                    "\tmovq %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq %%mm0, %%mm1\n"
                    "\tpandn %%mm4, %%mm1\n"
-                   "\tpminub %%mm3, %%mm2\n"
+                   "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand %%mm0, %%mm2\n"
                    "\tpor %%mm2, %%mm1\n"
                    "\tmovd %%mm1, %0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A), "m" (*op.B)
-                   : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b)
+                   : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
    }
  asm("emms");
@@ -207,63 +217,66 @@ gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
 void
 gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
 {
-  GimpCompositeContext op = *_op;
+  uint64 *d = (uint64 *) _op->D;
+  uint64 *a = (uint64 *) _op->A;
+  uint64 *b = (uint64 *) _op->B;
+  gulong n_pixels = _op->n_pixels;
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
+  for (; n_pixels >= 2; n_pixels -= 2)
    {
-      asm ("  movq %1,%%mm0\n"
+      asm volatile ("  movq %1,%%mm0\n"
                    "\tmovq %2,%%mm1\n"
                    "\tmovq %3,%%mm2\n"
                    "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
                    "\tmovq %%mm1,%%mm3\n"
                    "\tpxor %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq %4,%%mm5\n"
                    "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
                    "\t" pdivwqX(mm4,mm5,mm7) "\n"
                    "\tmovq %3,%%mm2\n"
                    "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
                    "\tmovq %%mm1,%%mm3\n"
                    "\tpxor %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq %4,%%mm5\n"
                    "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
                    "\t" pdivwqX(mm4,mm5,mm6) "\n"
                    "\tmovq %5,%%mm4\n"
                    "\tmovq %%mm4,%%mm5\n"
                    "\tpsubusw %%mm6,%%mm4\n"
                    "\tpsubusw %%mm7,%%mm5\n"
                    "\tpackuswb %%mm4,%%mm5\n"
-                   "\tpminub %%mm0,%%mm1\n" /* mm1 = min(mm0,mm1) clobber mm3 */
-                   "\tmovq %6,%%mm7\n"
+                   "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
+                   "\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask_64 */
                    "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
                    "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                    "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
                    "\tmovq %%mm7,%0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
+                   : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+      d++;
+      b++;
+      a++;
    }
-  if (op.n_pixels)
+  if (n_pixels > 0)
    {
      asm volatile ("  movd %1,%%mm0\n"
                    "\tmovd %2,%%mm1\n"
@@ -300,7 +313,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                    "\tpackuswb %%mm4,%%mm5\n"
-                   "\tpminub %%mm0,%%mm1\n" /* mm1 = min(mm0,mm1) clobber mm3 */
+                   "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
                    "\tmovq %6,%%mm7\n"
                    "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
@@ -309,9 +322,9 @@ gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                    "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
                    "\tmovd %%mm7,%0\n"
-                   : "=m" (*op.D)
-                   : "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
-                   : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+                   : "=m" (*d)
+                   : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
+                   : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }
  asm("emms");

app/composite/gimp-composite-sse2-installer.c

@@ -16,7 +16,7 @@ static struct install_table {
   GimpPixelFormat D;
   void (*function)(GimpCompositeContext *);
 } _gimp_composite_sse2[] = {
-#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_SSE2_IS_OKAY)
 { GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_sse2 },
 { GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_addition_rgba8_rgba8_rgba8_sse2 },
 { GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 },

app/composite/gimp-composite-sse2-test.c

@@ -19,7 +19,7 @@
 int
 gimp_composite_sse2_test (int iterations, int n_pixels)
 {
-#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
+#if defined(COMPILE_SSE2_IS_OKAY)
   GimpCompositeContext generic_ctx;
   GimpCompositeContext special_ctx;
   double ft0;
@@ -155,7 +155,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
-  iterations = 1;
+  iterations = 10;
   n_pixels = 1048593;
   argv++, argc--;

app/composite/gimp-composite-vis-test.c

@@ -19,7 +19,7 @@
 int
 gimp_composite_vis_test (int iterations, int n_pixels)
 {
-#if (__GNUC__ >= 3) && defined(USE_VIS) && defined(ARCH_SPARC)
+#if defined(COMPILE_VIS_IS_OKAY)
   GimpCompositeContext generic_ctx;
   GimpCompositeContext special_ctx;
   double ft0;
@@ -78,7 +78,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
-  iterations = 1;
+  iterations = 10;
   n_pixels = 1048593;
   argv++, argc--;

app/composite/gimp-composite-vis.c

@@ -32,18 +32,14 @@
 #include "gimp-composite.h"
 #include "gimp-composite-vis.h"
-#if defined(USE_VIS)
-#if defined(ARCH_SPARC)
-#if __GNUC__ >= 3
-#endif /* __GNUC__ > 3 */
-#endif /* defined(ARCH_SPARC) */
-#endif /* defined(USE_VIS) */
+#ifdef COMPILE_VIS_IS_OKAY
+#endif
 gboolean
 gimp_composite_vis_init (void)
 {
-#ifdef ARCH_SPARC
+#ifdef COMPILE_VIS_IS_OKAY
   return (TRUE);
 #else
   return (FALSE);

app/composite/gimp-composite-vis.h

@@ -9,4 +9,12 @@ extern gboolean gimp_composite_vis_init (void);
  */
 extern gboolean gimp_composite_vis_install (void);
+
+#if defined(USE_VIS)
+#if defined(ARCH_SPARC)
+#if __GNUC__ >= 3
+#define COMPILE_VIS_IS_OKAY (1)
+#endif /* __GNUC__ > 3 */
+#endif /* defined(ARCH_SPARC) */
+#endif /* defined(USE_VIS) */
 #endif

app/composite/gimp-composite-x86.h

@@ -21,10 +21,16 @@
 #if __GNUC__ >= 3
+/*
+ * Convert the low 8bit byte of the src to 16bit words in dst.
+ */
 #define mmx_low_bytes_to_words(src,dst,zero) \
          "\tmovq %%"#src", %%"#dst"; " \
          "\tpunpcklbw %%"#zero", %%"#dst"\n"
+/*
+ * Convert the high 8bit byte of the src to 16bit words in dst.
+ */
 #define mmx_high_bytes_to_words(src,dst,zero) \
          "\tmovq %%"#src", %%"#dst"; " \
          "\tpunpckhbw %%"#zero", %%"#dst"\n"
@@ -231,4 +237,17 @@
 typedef unsigned long long uint64;
+
+extern const guint32 rgba8_alpha_mask_64[2];
+extern const guint32 rgba8_b1_64[2];
+extern const guint32 rgba8_b255_64[2];
+extern const guint32 rgba8_w1_64[2];
+extern const guint32 rgba8_w2_64[2];
+extern const guint32 rgba8_w128_64[2];
+extern const guint32 rgba8_w256_64[2];
+extern const guint32 rgba8_w255_64[2];
+extern const guint32 va8_alpha_mask[2];
+extern const guint32 va8_b255[2];
+extern const guint32 va8_w1[2];
+extern const guint32 va8_w255[2];
 #endif /* __GNUC__ >= 3 */
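
The two byte-to-word macros documented above wrap punpcklbw/punpckhbw
against a zeroed register: they widen four 8-bit channels to 16 bits so
that intermediate products cannot overflow. A scalar equivalent of
mmx_low_bytes_to_words (illustration only):

  /* Zero-extend the low four bytes of an 8-byte group to 16-bit words. */
  static void
  low_bytes_to_words (const unsigned char src[8], unsigned short dst[4])
  {
    int i;

    for (i = 0; i < 4; i++)
      dst[i] = src[i];   /* as punpcklbw with a zero register */
  }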

app/composite/gimp-composite.c

@@ -366,9 +366,12 @@ gimp_composite_init (gboolean be_verbose,
     gimp_composite_options.bits |= GIMP_COMPOSITE_OPTION_NOEXTENSIONS;
 #ifdef GIMP_UNSTABLE
-  g_printerr ("gimp_composite: use=%s, verbose=%s\n",
-              (gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_USE) ? "yes" : "no",
-              (gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_VERBOSE) ? "yes" : "no");
+  if (be_verbose)
+    {
+      g_printerr ("gimp_composite: use=%s, verbose=%s\n",
+                  (gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_USE) ? "yes" : "no",
+                  (gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_VERBOSE) ? "yes" : "no");
+    }
 #endif
   gimp_composite_generic_install ();
@@ -400,14 +403,17 @@ gimp_composite_init (gboolean be_verbose,
       gboolean can_use_vis = gimp_composite_vis_install ();
 #ifdef GIMP_UNSTABLE
-      g_printerr ("supported by gimp_composite: "
-                  "%cmmx %csse %csse2 %c3dnow %caltivec %cvis\n",
-                  can_use_mmx ? '+' : '-',
-                  can_use_sse ? '+' : '-',
-                  can_use_sse2 ? '+' : '-',
-                  can_use_3dnow ? '+' : '-',
-                  can_use_altivec ? '+' : '-',
-                  can_use_vis ? '+' : '-');
+      if (be_verbose)
+        {
+          g_printerr ("Processor instruction sets: "
+                      "%cmmx %csse %csse2 %c3dnow %caltivec %cvis\n",
+                      can_use_mmx ? '+' : '-',
+                      can_use_sse ? '+' : '-',
+                      can_use_sse2 ? '+' : '-',
+                      can_use_3dnow ? '+' : '-',
+                      can_use_altivec ? '+' : '-',
+                      can_use_vis ? '+' : '-');
+        }
 #endif
     }
 }
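
Both diagnostics are now emitted only when the user passed --verbose. With
MMX and SSE available but nothing else, the second line would read, for
example (illustrative output; the format string is visible above):

  Processor instruction sets: +mmx +sse -sse2 -3dnow -altivec -vis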

app/composite/make-installer.py

@@ -471,7 +471,7 @@ op.add_option('-f', '--file', action='store', type='string', dest='file',
               help='the input object file')
 op.add_option('-t', '--test', action='store_true', dest='test', default=False,
               help='generate regression testing code')
-op.add_option('-i', '--iterations', action='store', type='int', dest='iterations', default=1,
+op.add_option('-i', '--iterations', action='store', type='int', dest='iterations', default=10,
               help='number of iterations in regression tests')
 op.add_option('-n', '--n-pixels', action='store', type="int", dest='n_pixels', default=128*8192+16+1,
               help='number of pixels in each regression test iteration')
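
Note that the -n default, 128*8192+16+1 = 1048593 pixels, matches the
n_pixels value hard-coded in the test drivers above; it lands just past a
power-of-two boundary, presumably so that both the two-pixel main loops
and the one-pixel tails get exercised.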