Print the list of active instruction sets if the --verbose command line switch is on

* app/composite/gimp-composite.c (gimp_composite_init): Print the
list of active instruction sets if the --verbose command line
switch is ON (via be_verbose)

* app/composite/gimp-composite-x86.h: Factored code from the mmx
and sse implementations.

* app/composite/make-installer.py: Raised the number of test
iterations from 1 to 10.

* app/composite/gimp-composite-3dnow.[ch]
* app/composite/gimp-composite-3dnow-test.c
* app/composite/gimp-composite-3dnow-installer.c
* app/composite/gimp-composite-altivec.[ch]
* app/composite/gimp-composite-altivec-test.c
* app/composite/gimp-composite-altivec-installer.c
* app/composite/gimp-composite-mmx.[ch]
* app/composite/gimp-composite-mmx-test.c
* app/composite/gimp-composite-mmx-installer.c
* app/composite/gimp-composite-sse.[ch]
* app/composite/gimp-composite-sse-test.c
* app/composite/gimp-composite-sse-installer.c
* app/composite/gimp-composite-sse2.[ch]
* app/composite/gimp-composite-sse2-test.c
* app/composite/gimp-composite-sse2-installer.c
* app/composite/gimp-composite-vis.[ch]
* app/composite/gimp-composite-vis-test.c:
Regenerated sources via make-installer.py (see the guard-macro sketch below)
Helvetix Victorinox
2004-07-20 15:59:12 +00:00
parent 03b3f8c90f
commit 54630be219
21 changed files with 505 additions and 384 deletions
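
Before the per-file diffs, a minimal sketch of the guard-macro pattern the regenerated sources introduce: each architecture header folds its nested compiler and architecture checks into a single COMPILE_<ARCH>_IS_OKAY define, and the implementation, test, and installer files then test only that one macro. The 3DNow! case below is taken from the diffs that follow; the other architectures' guards follow the same shape with their own feature tests (the MMX and SSE guard definitions themselves are not shown in this excerpt, so their exact conditions are assumed).

/* gimp-composite-3dnow.h: define the guard once from the build flags
 * (USE_MMX, ARCH_X86, ARCH_X86_64 and PIC come from the GIMP build system). */
#if !defined(__INTEL_COMPILER)
#if defined(USE_MMX)
#if defined(ARCH_X86)
#if __GNUC__ >= 3
#if defined(ARCH_X86_64) || !defined(PIC)
#define COMPILE_3DNOW_IS_OKAY (1)
#endif /* ARCH_X86_64 || !PIC */
#endif /* __GNUC__ >= 3 */
#endif /* ARCH_X86 */
#endif /* USE_MMX */
#endif /* !__INTEL_COMPILER */

/* gimp-composite-3dnow.c, -3dnow-test.c, -3dnow-installer.c: instead of
 * repeating the long conditional, every consumer now tests the single macro. */
#ifdef COMPILE_3DNOW_IS_OKAY
/* 3DNow! code paths are compiled here */
#endif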


@ -1,3 +1,35 @@
2004-07-20 Helvetix Victorinox <helvetix@gimp.org>
* app/composite/gimp-composite.c (gimp_composite_init): Print the
list of active instruction sets if the --verbose command line
switch is ON (via be_verbose)
* app/composite/gimp-composite-x86.h: Factored code from the mmx
and sse implementations.
* app/composite/make-installer.py: Raised the number of test
iterations from 1 to 10.
* app/composite/gimp-composite-3dnow.[ch]
* app/composite/gimp-composite-3dnow-test.c
* app/composite/gimp-composite-3dnow-installer.c
* app/composite/gimp-composite-altivec.[ch]
* app/composite/gimp-composite-altivec-test.c
* app/composite/gimp-composite-altivec-installer.c
* app/composite/gimp-composite-mmx.[ch]
* app/composite/gimp-composite-mmx-test.c
* app/composite/gimp-composite-mmx-installer.c
* app/composite/gimp-composite-sse.[ch]
* app/composite/gimp-composite-sse-test.c
* app/composite/gimp-composite-sse-installer.c
* app/composite/gimp-composite-sse2.[ch]
* app/composite/gimp-composite-sse2-test.c
* app/composite/gimp-composite-sse2-installer.c
* app/composite/gimp-composite-vis.[ch]
* app/composite/gimp-composite-vis-test.c:
Regenerated sources via make-installer.py
2004-07-20 Sven Neumann <sven@gimp.org>
* app/app_procs.c


@ -19,7 +19,7 @@
int
gimp_composite_3dnow_test (int iterations, int n_pixels)
{
#if (__GNUC__ >= 3) && defined(USE_3DNOW) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_3DNOW_IS_OKAY)
GimpCompositeContext generic_ctx;
GimpCompositeContext special_ctx;
double ft0;
@ -78,7 +78,7 @@ main (int argc, char *argv[])
putenv ("GIMP_COMPOSITE=0x1");
iterations = 1;
iterations = 10;
n_pixels = 1048593;
argv++, argc--;


@ -31,22 +31,17 @@
#include "base/cpu-accel.h"
#include "gimp-composite.h"
#include "gimp-composite-3dnow.h"
#if defined(USE_MMX)
#if defined(ARCH_X86)
#if __GNUC__ >= 3
#if defined(ARCH_X86_64) || !defined(PIC)
#ifdef COMPILE_3DNOW_IS_OKAY
#endif /* ARCH_X86_64 || !PIC */
#endif /* __GNUC__ > 3 */
#endif /* ARCH_X86 */
#endif /* USE_MMX */
#endif
gboolean
gimp_composite_3dnow_init (void)
{
#if defined(USE_MMX) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#ifdef COMPILE_3DNOW_IS_OKAY
if (cpu_accel () & CPU_ACCEL_X86_3DNOW)
{
return (TRUE);


@ -9,4 +9,16 @@ extern gboolean gimp_composite_3dnow_init (void);
*/
extern gboolean gimp_composite_3dnow_install (void);
#if !defined(__INTEL_COMPILER)
#if defined(USE_MMX)
#if defined(ARCH_X86)
#if __GNUC__ >= 3
#if defined(ARCH_X86_64) || !defined(PIC)
#define COMPILE_3DNOW_IS_OKAY (1)
#endif /* ARCH_X86_64 || !PIC */
#endif /* __GNUC__ > 3 */
#endif /* ARCH_X86 */
#endif /* USE_MMX */
#endif /* !defined(__INTEL_COMPILER) */
#endif


@ -19,7 +19,7 @@
int
gimp_composite_altivec_test (int iterations, int n_pixels)
{
#if (__GNUC__ >= 3) && defined(USE_ALTIVEC) && defined(ARCH_PPC)
#if defined(COMPILE_ALTIVEC_IS_OKAY)
GimpCompositeContext generic_ctx;
GimpCompositeContext special_ctx;
double ft0;
@ -78,7 +78,7 @@ main (int argc, char *argv[])
putenv ("GIMP_COMPOSITE=0x1");
iterations = 1;
iterations = 10;
n_pixels = 1048593;
argv++, argc--;


@ -30,16 +30,14 @@
#include "gimp-composite.h"
#include "gimp-composite-altivec.h"
#ifdef ARCH_PPC
#if __GNUC__ >= 3
#ifdef COMPILE_ALTIVEC_IS_OKAY
#endif /* __GNUC__ > 3 */
#endif /* ARCH_PPC */
#endif
gboolean
gimp_composite_altivec_init (void)
{
#ifdef ARCH_PPC
#ifdef COMPILE_ALTIVEC_IS_OKAY
if (cpu_accel () & CPU_ACCEL_PPC_ALTIVEC)
{
return (TRUE);


@ -9,4 +9,10 @@ extern gboolean gimp_composite_altivec_init (void);
*/
extern gboolean gimp_composite_altivec_install (void);
#ifdef ARCH_PPC
#if __GNUC__ >= 3
#define COMPILE_ALTIVEC_IS_OKAY (1)
#endif /* __GNUC__ > 3 */
#endif /* ARCH_PPC */
#endif


@ -16,7 +16,7 @@ static struct install_table {
GimpPixelFormat D;
void (*function)(GimpCompositeContext *);
} _gimp_composite_mmx[] = {
#if (__GNUC__ >= 3) && defined(USE_MMX) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_MMX_IS_OKAY)
{ GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_multiply_rgba8_rgba8_rgba8_mmx },
{ GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_screen_rgba8_rgba8_rgba8_mmx },
{ GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_mmx },


@ -19,7 +19,7 @@
int
gimp_composite_mmx_test (int iterations, int n_pixels)
{
#if (__GNUC__ >= 3) && defined(USE_MMX) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_MMX_IS_OKAY)
GimpCompositeContext generic_ctx;
GimpCompositeContext special_ctx;
double ft0;
@ -210,7 +210,7 @@ main (int argc, char *argv[])
putenv ("GIMP_COMPOSITE=0x1");
iterations = 1;
iterations = 10;
n_pixels = 1048593;
argv++, argc--;


@ -62,19 +62,19 @@ debug_display_mmx(void)
printf("--------------------------------------------\n");
}
static const guint32 rgba8_alpha_mask[2] = { 0xFF000000, 0xFF000000 };
static const guint32 rgba8_b1[2] = { 0x01010101, 0x01010101 };
static const guint32 rgba8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
static const guint32 rgba8_w1[2] = { 0x00010001, 0x00010001 };
static const guint32 rgba8_w2[2] = { 0x00020002, 0x00020002 };
static const guint32 rgba8_w128[2] = { 0x00800080, 0x00800080 };
static const guint32 rgba8_w256[2] = { 0x01000100, 0x01000100 };
static const guint32 rgba8_w255[2] = { 0X00FF00FF, 0X00FF00FF };
const guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 };
const guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 };
const guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
const guint32 rgba8_w1_64[2] = { 0x00010001, 0x00010001 };
const guint32 rgba8_w2_64[2] = { 0x00020002, 0x00020002 };
const guint32 rgba8_w128_64[2] = { 0x00800080, 0x00800080 };
const guint32 rgba8_w256_64[2] = { 0x01000100, 0x01000100 };
const guint32 rgba8_w255_64[2] = { 0X00FF00FF, 0X00FF00FF };
static const guint32 va8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00 };
static const guint32 va8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
static const guint32 va8_w1[2] = { 0x00010001, 0x00010001 };
static const guint32 va8_w255[2] = { 0x00FF00FF, 0x00FF00FF };
const guint32 va8_alpha_mask_64[2] = { 0xFF00FF00, 0xFF00FF00 };
const guint32 va8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
const guint32 va8_w1_64[2] = { 0x00010001, 0x00010001 };
const guint32 va8_w255_64[2] = { 0x00FF00FF, 0x00FF00FF };
/*
*
@ -89,13 +89,13 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
asm volatile ("movq %0,%%mm0"
: /* empty */
: "m" (*rgba8_alpha_mask)
: "m" (*rgba8_alpha_mask_64)
: "%mm0");
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\tpaddusb %%mm3, %%mm4\n"
"\tmovq %%mm0, %%mm1\n"
@ -179,7 +179,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
"\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask */
"\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask_64 */
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
@ -187,7 +187,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm7,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask)
: "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
: pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
d++;
b++;
@ -241,7 +241,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm7,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask)
: "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
: pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
}
@ -293,7 +293,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
for (; n_pixels >= 2; n_pixels -= 2)
{
@ -318,7 +318,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
d++;
}
if (n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
@ -352,7 +352,7 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
asm volatile (" movq %0, %%mm0\n"
"\tmovq %1, %%mm7\n"
:
: "m" (*rgba8_alpha_mask), "m" (*rgba8_w1)
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
: "%mm0", "%mm7");
for (; n_pixels >= 2; n_pixels -= 2)
@ -382,7 +382,7 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
"\t" pminub(mm0,mm1,mm3) "\n"
"\tmovq %3,%%mm3\n"
"\tmovq %3,%%mm3\n"
"\tmovq %%mm3,%%mm2\n"
"\tpandn %%mm5,%%mm3\n"
@ -392,14 +392,14 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm3,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask)
: "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask_64)
: pdivwuqX_clobber, "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
}
if (n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %1,%%mm0\n"
"\tmovd %2,%%mm1\n"
@ -436,8 +436,8 @@ xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm3,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask)
: "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask_64)
: pdivwuqX_clobber, "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
@ -487,14 +487,14 @@ xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm7,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask)
: "m" (*a), "m" (*b), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
: pdivwuqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
a++;
b++;
d++;
}
if (n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0,%%mm0\n"
"\tmovq %1,%%mm1\n"
@ -530,7 +530,7 @@ xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm7,%2\n"
: /* empty */
: "m" (*a), "m" (*b), "m" (*d), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask)
: "m" (*a), "m" (*b), "m" (*d), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
: pdivwuqX_clobber, "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
@ -540,16 +540,22 @@ xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
void
gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6");
asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7");
asm volatile ("movq %0,%%mm0\n"
"pxor %%mm6,%%mm6\n"
"movq %1,%%mm7\n"
: /* no outputs */
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
: "%mm0", "%mm7", "%mm6");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0,%%mm2\n"
"\tmovq %1,%%mm3\n"
asm volatile (" movq %1,%%mm2\n"
"\tmovq %2,%%mm3\n"
mmx_low_bytes_to_words(mm2,mm4,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
"\tpsubw %%mm5,%%mm4\n"
@ -572,19 +578,19 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovq %%mm1,%2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
op.A += 8;
op.B += 8;
op.D += 8;
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0, %%mm2\n"
"\tmovd %1, %%mm3\n"
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
mmx_low_bytes_to_words(mm2,mm4,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
@ -603,10 +609,10 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovd %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@ -615,19 +621,22 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
void
gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0, %%mm0\n"
"pxor %%mm6, %%mm6\n"
"movq %1, %%mm7\n"
: /* empty */
: "m" (*rgba8_alpha_mask), "m" (*rgba8_w128)
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
: "%mm0", "%mm6", "%mm7");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0, %%mm2\n"
"\tmovq %1, %%mm3\n"
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
mmx_low_bytes_to_words(mm2,mm4,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
@ -647,19 +656,19 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
op.A += 8;
op.B += 8;
op.D += 8;
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0, %%mm2\n"
"\tmovd %1, %%mm3\n"
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
mmx_low_bytes_to_words(mm2,mm4,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
@ -677,10 +686,10 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovd %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@ -689,14 +698,17 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
void
gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0, %%mm2\n"
"\tmovq %1, %%mm3\n"
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\t" pmaxub(mm3,mm4,mm5) "\n"
"\tmovq %%mm0, %%mm1\n"
@ -704,34 +716,34 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\t" pminub(mm2,mm3,mm4) "\n"
"\tpand %%mm0, %%mm3\n"
"\tpor %%mm3, %%mm1\n"
"\tmovq %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
op.A += 8;
op.B += 8;
op.D += 8;
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0, %%mm2\n"
"\tmovd %1, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\t" pmaxub(mm3,mm4,mm5) "\n"
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\t" pmaxub(mm3,mm4,mm5) "\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\t" pminub(mm2,mm3,mm4) "\n"
"\t" pminub(mm2,mm3,mm4) "\n"
"\tpand %%mm0, %%mm3\n"
"\tpor %%mm3, %%mm1\n"
"\tmovd %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
"\tpand %%mm0, %%mm3\n"
"\tpor %%mm3, %%mm1\n"
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
@ -739,16 +751,23 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
void
gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7");
asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6");
asm volatile (
"movq %0,%%mm0\n"
"movq %1,%%mm7\n"
"pxor %%mm6,%%mm6\n"
: /* empty */
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
: "%mm6", "%mm7", "%mm0");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0, %%mm2\n"
"\tmovq %1, %%mm3\n"
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
mmx_low_bytes_to_words(mm2,mm1,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
@ -767,37 +786,37 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
op.A += 8;
op.B += 8;
op.D += 8;
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0, %%mm2\n"
"\tmovd %1, %%mm3\n"
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
mmx_low_bytes_to_words(mm2,mm1,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
pmulwX(mm5,mm1,mm7)
mmx_low_bytes_to_words(mm2,mm1,mm6)
mmx_low_bytes_to_words(mm3,mm5,mm6)
pmulwX(mm5,mm1,mm7)
"\tpackuswb %%mm6, %%mm1\n"
"\tpackuswb %%mm6, %%mm1\n"
"\tmovq %%mm0, %%mm4\n"
"\tpandn %%mm1, %%mm4\n"
"\tmovq %%mm4, %%mm1\n"
"\t" pminub(mm3,mm2,mm4) "\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm0, %%mm4\n"
"\tpandn %%mm1, %%mm4\n"
"\tmovq %%mm4, %%mm1\n"
"\t" pminub(mm3,mm2,mm4) "\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovd %%mm1, %2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
@ -849,21 +868,24 @@ mmx_op_overlay(void)
"\tpor %%mm3,%%mm1\n"
: /* empty */
: "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
: "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
);
}
void
xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("pxor %%mm0,%%mm0\n"
"movq %0,%%mm7"
: /* empty */
: "m" (*rgba8_w128) : "%mm0");
: "m" (*rgba8_w128_64) : "%mm0");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0,%%mm2\n"
"\tmovq %1,%%mm3\n"
@ -910,25 +932,25 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpor %%mm3,%%mm1\n"
"\tmovq %%mm1,%2\n"
: "+m" (*op.A), "+m" (*op.B), "+m" (*op.D)
: "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
op.A += 8;
op.B += 8;
op.D += 8;
: "+m" (*a), "+m" (*b), "+m" (*d)
: "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0,%%mm2\n"
"\tmovd %1,%%mm3\n"
asm volatile (" movd %1,%%mm2\n"
"\tmovd %2,%%mm3\n"
/* low bytes */
mmx_low_bytes_to_words(mm3,mm5,mm0)
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */
"\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
"\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */
"\tmovq %3,%%mm6\n" /* mm6 = words of integer value 2 */
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */
@ -937,7 +959,7 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpcmpeqb %%mm1,%%mm1\n"
"\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */
"\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
"\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */
"\tmovq %3,%%mm6\n" /* mm6 = words of integer value 2 */
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */
@ -964,10 +986,10 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpor %%mm3,%%mm1\n"
"\tmovd %%mm1,%2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
"\tmovd %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@ -977,23 +999,25 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
void
gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
gulong n_pixels = _op->n_pixels;
asm volatile ("pxor %%mm0,%%mm0\n"
"\tmovl %0,%%eax\n"
"\tmovl %0,%%eax\n"
"\tmovl %%eax,%%ebx\n"
"\tshl $16,%%ebx\n"
"\tshl $16,%%ebx\n"
"\torl %%ebx,%%eax\n"
"\tmovd %%eax,%%mm5\n"
"\tmovd %%eax,%%mm3\n"
"\tpsllq $32,%%mm5\n"
"\tpsllq $32,%%mm5\n"
"\tpor %%mm5,%%mm3\n"
"\tmovq %1,%%mm7\n"
"\tmovq %1,%%mm7\n"
: /* empty */
: "m" (op.scale.scale), "m" (*rgba8_w128)
: "%eax", "%mm0", "%mm5", "%mm6", "%mm7");
: "m" (_op->scale.scale), "m" (*rgba8_w128_64)
: "%eax", "%ebx", "%mm0", "%mm5", "%mm6", "%mm7");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile ("movq %1,%%mm2\n"
"\tmovq %%mm2,%%mm1\n"
@ -1010,15 +1034,15 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpackuswb %%mm4,%%mm1\n"
"\tmovq %%mm1,%0\n"
: "=m" (*op.D)
: "m" (*op.A)
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
op.A += 8;
op.D += 8;
a++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile ("movd %1,%%mm2\n"
"\tmovq %%mm2,%%mm1\n"
@ -1029,8 +1053,8 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpackuswb %%mm0,%%mm1\n"
"\tmovd %%mm1,%0\n"
: "=m" (*op.D)
: "m" (*op.A)
: "=m" (*d)
: "m" (*a)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
}
@ -1040,16 +1064,22 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
void
gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7");
asm volatile ("pxor %mm6, %mm6");
asm volatile ("pxor %%mm6,%%mm6\n"
"movq %0,%%mm0\n"
"movq %1,%%mm7\n"
: /* empty */
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
: "%mm0", "%mm6", "%mm7");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0,%%mm2\n"
"\tmovq %1,%%mm3\n"
asm volatile (" movq %1,%%mm2\n"
"\tmovq %2,%%mm3\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm2,%%mm4\n"
@ -1092,79 +1122,82 @@ gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpor %%mm3,%%mm1\n"
"\tmovq %%mm1,%2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
op.A += 8;
op.B += 8;
op.D += 8;
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0,%%mm2\n"
"\tmovd %1,%%mm3\n"
asm volatile (" movd %1,%%mm2\n"
"\tmovd %2,%%mm3\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm2,%%mm4\n"
"\tpcmpeqb %%mm5,%%mm5\n"
"\tpsubb %%mm3,%%mm5\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm2,%%mm4\n"
"\tpcmpeqb %%mm5,%%mm5\n"
"\tpsubb %%mm3,%%mm5\n"
"\tpunpcklbw %%mm6,%%mm4\n"
"\tpunpcklbw %%mm6,%%mm5\n"
"\tpmullw %%mm4,%%mm5\n"
"\tpaddw %%mm7,%%mm5\n"
"\tmovq %%mm5,%%mm1\n"
"\tpsrlw $ 8,%%mm1\n"
"\tpaddw %%mm5,%%mm1\n"
"\tpsrlw $ 8,%%mm1\n"
"\tpunpcklbw %%mm6,%%mm4\n"
"\tpunpcklbw %%mm6,%%mm5\n"
"\tpmullw %%mm4,%%mm5\n"
"\tpaddw %%mm7,%%mm5\n"
"\tmovq %%mm5,%%mm1\n"
"\tpsrlw $ 8,%%mm1\n"
"\tpaddw %%mm5,%%mm1\n"
"\tpsrlw $ 8,%%mm1\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm2,%%mm4\n"
"\tpcmpeqb %%mm5,%%mm5\n"
"\tpsubb %%mm3,%%mm5\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm2,%%mm4\n"
"\tpcmpeqb %%mm5,%%mm5\n"
"\tpsubb %%mm3,%%mm5\n"
"\tpunpckhbw %%mm6,%%mm4\n"
"\tpunpckhbw %%mm6,%%mm5\n"
"\tpmullw %%mm4,%%mm5\n"
"\tpaddw %%mm7,%%mm5\n"
"\tmovq %%mm5,%%mm4\n"
"\tpsrlw $ 8,%%mm4\n"
"\tpaddw %%mm5,%%mm4\n"
"\tpsrlw $ 8,%%mm4\n"
"\tpunpckhbw %%mm6,%%mm4\n"
"\tpunpckhbw %%mm6,%%mm5\n"
"\tpmullw %%mm4,%%mm5\n"
"\tpaddw %%mm7,%%mm5\n"
"\tmovq %%mm5,%%mm4\n"
"\tpsrlw $ 8,%%mm4\n"
"\tpaddw %%mm5,%%mm4\n"
"\tpsrlw $ 8,%%mm4\n"
"\tpackuswb %%mm4,%%mm1\n"
"\tpackuswb %%mm4,%%mm1\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm1,%%mm4\n"
"\tpcmpeqb %%mm4,%%mm4\n"
"\tpsubb %%mm1,%%mm4\n"
"\tmovq %%mm0,%%mm1\n"
"\tpandn %%mm4,%%mm1\n"
"\tmovq %%mm0,%%mm1\n"
"\tpandn %%mm4,%%mm1\n"
"\t" pminub(mm2,mm3,mm4) "\n"
"\tpand %%mm0,%%mm3\n"
"\t" pminub(mm2,mm3,mm4) "\n"
"\tpand %%mm0,%%mm3\n"
"\tpor %%mm3,%%mm1\n"
"\tpor %%mm3,%%mm1\n"
"\tmovd %%mm1,%2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
"\tmovd %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
asm volatile ("emms");
}
void
gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0");
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %1,%%mm2\n"
"\tmovq %2,%%mm3\n"
@ -1180,65 +1213,68 @@ gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovq %%mm1,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
op.A += 8;
op.B += 8;
op.D += 8;
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0,%%mm2\n"
"\tmovd %1,%%mm3\n"
asm volatile (" movd %1,%%mm2\n"
"\tmovd %2,%%mm3\n"
"\tmovq %%mm2,%%mm4\n"
"\tpsubusb %%mm3,%%mm4\n"
"\tmovq %%mm2,%%mm4\n"
"\tpsubusb %%mm3,%%mm4\n"
"\tmovq %%mm0,%%mm1\n"
"\tpandn %%mm4,%%mm1\n"
"\tmovq %%mm0,%%mm1\n"
"\tpandn %%mm4,%%mm1\n"
"\t" pminub(mm3,mm2,mm4) "\n"
"\t" pminub(mm3,mm2,mm4) "\n"
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovd %%mm1,%2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovd %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
asm volatile ("emms");
}
void
gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %0,%%mm2\n"
"\tmovq %1,%%mm3\n"
"\tmovq %%mm3,%0\n"
"\tmovq %%mm2,%1\n"
: /* empty */
: "m" (*op.A), "m" (*op.B)
: "0", "1", "%mm1", "%mm2", "%mm3", "%mm4");
op.A += 8;
op.B += 8;
: "+m" (*a), "+m" (*b)
:
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %0,%%mm2\n"
"\tmovd %1,%%mm3\n"
"\tmovd %%mm3,%0\n"
"\tmovd %%mm2,%1\n"
: /* empty */
: "m" (*op.A), "m" (*op.B)
: "0", "1", "%mm1", "%mm2", "%mm3", "%mm4");
: "+m" (*a), "+m" (*b)
:
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@ -1388,7 +1424,7 @@ gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm7,%2\n"
: /* empty */
: "+m" (*op.A), "+m" (*op.B), "+m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask)
: "+m" (*op.A), "+m" (*op.B), "+m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
op.A += 8;
op.B += 8;
@ -1441,7 +1477,7 @@ gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm7,%2\n"
: /* empty */
: "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask)
: "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
: "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
}


@ -16,7 +16,7 @@ static struct install_table {
GimpPixelFormat D;
void (*function)(GimpCompositeContext *);
} _gimp_composite_sse[] = {
#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_SSE_IS_OKAY)
{ GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_multiply_rgba8_rgba8_rgba8_sse },
{ GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_screen_rgba8_rgba8_rgba8_sse },
{ GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_sse },


@ -19,7 +19,7 @@
int
gimp_composite_sse_test (int iterations, int n_pixels)
{
#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_SSE_IS_OKAY)
GimpCompositeContext generic_ctx;
GimpCompositeContext special_ctx;
double ft0;
@ -210,7 +210,7 @@ main (int argc, char *argv[])
putenv ("GIMP_COMPOSITE=0x1");
iterations = 1;
iterations = 10;
n_pixels = 1048593;
argv++, argc--;


@ -48,6 +48,7 @@
#define pminub(src,dst,tmp) "pminub " "%%" #src ", %%" #dst
#define pmaxub(src,dst,tmp) "pmaxub " "%%" #src ", %%" #dst
#if 0
/*
* Double-word divide. Adjusted for subsequent unsigned packing
* (high-order bit of each word is cleared)
@ -65,7 +66,8 @@
"roll $16, %%eax; " \
"btr $15, %%eax; " \
"movd %%eax,%%" #quotient ";"
#endif
#if 0
/*
* Quadword divide. No adjustment for subsequent unsigned packing
* (high-order bit of each word is left alone)
@ -107,7 +109,8 @@
"movd %%eax,%%" #divisor ";" \
"psllq $32,%%" #divisor ";" \
"por %%" #divisor ",%%" #quotient ";"
#endif
#if 0
/* equivalent to the INT_MULT() macro in gimp-composite-generic.c */
/*
* opr2 = INT_MULT(opr1, opr2, t)
@ -126,7 +129,8 @@
"\tpsrlw $8, %%"#opr2"; " \
"\tpaddw %%"#opr1", %%"#opr2"; " \
"\tpsrlw $8, %%"#opr2"\n"
#endif
#if 0
/* a = INT_MULT(a,b) */
#define mmx_int_mult(a,b,w128) \
"\tpmullw %%"#b", %%"#a"; " \
@ -135,7 +139,9 @@
"\tpsrlw $8, %%"#b"; " \
"\tpaddw %%"#a", %%"#b"; " \
"\tpsrlw $8, %%"#b"\n"
#endif
#if 0
static const guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 };
static const guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 };
static const guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
@ -149,6 +155,7 @@ static const guint32 va8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00 };
static const guint32 va8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
static const guint32 va8_w1[2] = { 0x00010001, 0x00010001 };
static const guint32 va8_w255[2] = { 0x00FF00FF, 0x00FF00FF };
#endif
/*
*
@ -156,48 +163,51 @@ static const guint32 va8_w255[2] = { 0x00FF00FF, 0x00FF00FF };
void
gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0"
: /* empty */
: "m" (*rgba8_alpha_mask_64)
: "%mm0");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\tpaddusb %%mm3, %%mm4\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\tpminub %%mm3, %%mm2\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B)
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
op.A += 8;
op.B += 8;
op.D += 8;
}
if (op.n_pixels)
{
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\tpaddusb %%mm3, %%mm4\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\tpminub %%mm3, %%mm2\n"
"\t" pminub(mm3, mm2, mm4) "\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
}
if (n_pixels > 0)
{
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\tpaddusb %%mm3, %%mm4\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\t" pminub(mm3, mm2, mm4) "\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovd %%mm1, %0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B)
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@ -207,63 +217,66 @@ gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
void
gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
uint64 *d = (uint64 *) _op->D;
uint64 *a = (uint64 *) _op->A;
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
for (; op.n_pixels >= 2; op.n_pixels -= 2)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm (" movq %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
asm volatile (" movq %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tmovq %3,%%mm2\n"
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
"\tpxor %%mm4,%%mm4\n"
"\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
"\tmovq %3,%%mm2\n"
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
"\tpxor %%mm4,%%mm4\n"
"\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpcklbw %%mm5,%%mm3\n"
"\tmovq %4,%%mm5\n"
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpcklbw %%mm5,%%mm3\n"
"\tmovq %4,%%mm5\n"
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
"\t" pdivwqX(mm4,mm5,mm7) "\n"
"\t" pdivwqX(mm4,mm5,mm7) "\n"
"\tmovq %3,%%mm2\n"
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
"\tpxor %%mm4,%%mm4\n"
"\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
"\tmovq %3,%%mm2\n"
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
"\tpxor %%mm4,%%mm4\n"
"\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpckhbw %%mm5,%%mm3\n"
"\tmovq %4,%%mm5\n"
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
"\t" pdivwqX(mm4,mm5,mm6) "\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpckhbw %%mm5,%%mm3\n"
"\tmovq %4,%%mm5\n"
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
"\t" pdivwqX(mm4,mm5,mm6) "\n"
"\tmovq %5,%%mm4\n"
"\tmovq %%mm4,%%mm5\n"
"\tpsubusw %%mm6,%%mm4\n"
"\tpsubusw %%mm7,%%mm5\n"
"\tmovq %5,%%mm4\n"
"\tmovq %%mm4,%%mm5\n"
"\tpsubusw %%mm6,%%mm4\n"
"\tpsubusw %%mm7,%%mm5\n"
"\tpackuswb %%mm4,%%mm5\n"
"\tpackuswb %%mm4,%%mm5\n"
"\tpminub %%mm0,%%mm1\n" /* mm1 = min(mm0,mm1) clobber mm3 */
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
"\tmovq %6,%%mm7\n"
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
"\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask_64 */
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
"\tmovq %%mm7,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
op.A += 8;
op.B += 8;
op.D += 8;
"\tmovq %%mm7,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
: pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
d++;
b++;
a++;
}
if (op.n_pixels)
if (n_pixels > 0)
{
asm volatile (" movd %1,%%mm0\n"
"\tmovd %2,%%mm1\n"
@ -300,7 +313,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpackuswb %%mm4,%%mm5\n"
"\tpminub %%mm0,%%mm1\n" /* mm1 = min(mm0,mm1) clobber mm3 */
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
"\tmovq %6,%%mm7\n"
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
@ -309,9 +322,9 @@ gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
"\tmovd %%mm7,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
: "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
: "=m" (*d)
: "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
: pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
}
asm("emms");


@ -16,7 +16,7 @@ static struct install_table {
GimpPixelFormat D;
void (*function)(GimpCompositeContext *);
} _gimp_composite_sse2[] = {
#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_SSE2_IS_OKAY)
{ GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_sse2 },
{ GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_addition_rgba8_rgba8_rgba8_sse2 },
{ GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 },


@ -19,7 +19,7 @@
int
gimp_composite_sse2_test (int iterations, int n_pixels)
{
#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) && (defined(ARCH_X86_64) || !defined(PIC))
#if defined(COMPILE_SSE2_IS_OKAY)
GimpCompositeContext generic_ctx;
GimpCompositeContext special_ctx;
double ft0;
@ -155,7 +155,7 @@ main (int argc, char *argv[])
putenv ("GIMP_COMPOSITE=0x1");
iterations = 1;
iterations = 10;
n_pixels = 1048593;
argv++, argc--;


@ -19,7 +19,7 @@
int
gimp_composite_vis_test (int iterations, int n_pixels)
{
#if (__GNUC__ >= 3) && defined(USE_VIS) && defined(ARCH_SPARC)
#if defined(COMPILE_VIS_IS_OKAY)
GimpCompositeContext generic_ctx;
GimpCompositeContext special_ctx;
double ft0;
@ -78,7 +78,7 @@ main (int argc, char *argv[])
putenv ("GIMP_COMPOSITE=0x1");
iterations = 1;
iterations = 10;
n_pixels = 1048593;
argv++, argc--;


@ -32,18 +32,14 @@
#include "gimp-composite.h"
#include "gimp-composite-vis.h"
#if defined(USE_VIS)
#if defined(ARCH_SPARC)
#if __GNUC__ >= 3
#ifdef COMPILE_VIS_IS_OKAY
#endif /* __GNUC__ > 3 */
#endif /* defined(ARCH_SPARC) */
#endif /* defined(USE_VIS) */
#endif
gboolean
gimp_composite_vis_init (void)
{
#ifdef ARCH_SPARC
#ifdef COMPILE_VIS_IS_OKAY
return (TRUE);
#else
return (FALSE);


@ -9,4 +9,12 @@ extern gboolean gimp_composite_vis_init (void);
*/
extern gboolean gimp_composite_vis_install (void);
#if defined(USE_VIS)
#if defined(ARCH_SPARC)
#if __GNUC__ >= 3
#define COMPILE_VIS_IS_OKAY (1)
#endif /* __GNUC__ > 3 */
#endif /* defined(ARCH_SPARC) */
#endif /* defined(USE_VIS) */
#endif


@ -21,10 +21,16 @@
#if __GNUC__ >= 3
/*
* Convert the low-order bytes of src to 16-bit words in dst.
*/
#define mmx_low_bytes_to_words(src,dst,zero) \
"\tmovq %%"#src", %%"#dst"; " \
"\tpunpcklbw %%"#zero", %%"#dst"\n"
/*
* Convert the high-order bytes of src to 16-bit words in dst.
*/
#define mmx_high_bytes_to_words(src,dst,zero) \
"\tmovq %%"#src", %%"#dst"; " \
"\tpunpckhbw %%"#zero", %%"#dst"\n"
@ -231,4 +237,17 @@
typedef unsigned long long uint64;
extern const guint32 rgba8_alpha_mask_64[2];
extern const guint32 rgba8_b1_64[2];
extern const guint32 rgba8_b255_64[2];
extern const guint32 rgba8_w1_64[2];
extern const guint32 rgba8_w2_64[2];
extern const guint32 rgba8_w128_64[2];
extern const guint32 rgba8_w256_64[2];
extern const guint32 rgba8_w255_64[2];
extern const guint32 va8_alpha_mask[2];
extern const guint32 va8_b255[2];
extern const guint32 va8_w1[2];
extern const guint32 va8_w255[2];
#endif /* __GNUC__ >= 3 */
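
The extern declarations above are the header half of the factoring noted in the ChangeLog: the 64-bit constants formerly kept as static copies in each x86 file are now defined once in gimp-composite-mmx.c and shared through gimp-composite-x86.h. A minimal sketch of the split follows; the load_alpha_mask_sse() wrapper is hypothetical and stands in for any MMX/SSE routine that loads the shared mask, and guint32 comes from GLib.

/* gimp-composite-x86.h: shared declarations */
typedef unsigned long long uint64;
extern const guint32 rgba8_alpha_mask_64[2];

/* gimp-composite-mmx.c: the one definition (previously a static copy per file) */
const guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 };

/* gimp-composite-sse.c (or any other x86 implementation): load the shared
 * constant; the function name here is illustrative only. */
static void
load_alpha_mask_sse (void)
{
  asm volatile ("movq %0,%%mm0"
                : /* no outputs */
                : "m" (*rgba8_alpha_mask_64)
                : "%mm0");
}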


@ -366,9 +366,12 @@ gimp_composite_init (gboolean be_verbose,
gimp_composite_options.bits |= GIMP_COMPOSITE_OPTION_NOEXTENSIONS;
#ifdef GIMP_UNSTABLE
g_printerr ("gimp_composite: use=%s, verbose=%s\n",
(gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_USE) ? "yes" : "no",
(gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_VERBOSE) ? "yes" : "no");
if (be_verbose)
{
g_printerr ("gimp_composite: use=%s, verbose=%s\n",
(gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_USE) ? "yes" : "no",
(gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_VERBOSE) ? "yes" : "no");
}
#endif
gimp_composite_generic_install ();
@ -400,14 +403,17 @@ gimp_composite_init (gboolean be_verbose,
gboolean can_use_vis = gimp_composite_vis_install ();
#ifdef GIMP_UNSTABLE
g_printerr ("supported by gimp_composite: "
"%cmmx %csse %csse2 %c3dnow %caltivec %cvis\n",
can_use_mmx ? '+' : '-',
can_use_sse ? '+' : '-',
can_use_sse2 ? '+' : '-',
can_use_3dnow ? '+' : '-',
can_use_altivec ? '+' : '-',
can_use_vis ? '+' : '-');
if (be_verbose)
{
g_printerr ("Processor instruction sets: "
"%cmmx %csse %csse2 %c3dnow %caltivec %cvis\n",
can_use_mmx ? '+' : '-',
can_use_sse ? '+' : '-',
can_use_sse2 ? '+' : '-',
can_use_3dnow ? '+' : '-',
can_use_altivec ? '+' : '-',
can_use_vis ? '+' : '-');
}
#endif
}
}


@ -471,7 +471,7 @@ op.add_option('-f', '--file', action='store', type='string', dest='file',
help='the input object file')
op.add_option('-t', '--test', action='store_true', dest='test', default=False,
help='generate regression testing code')
op.add_option('-i', '--iterations', action='store', type='int', dest='iterations', default=1,
op.add_option('-i', '--iterations', action='store', type='int', dest='iterations', default=10,
help='number of iterations in regression tests')
op.add_option('-n', '--n-pixels', action='store', type="int", dest='n_pixels', default=128*8192+16+1,
help='number of pixels in each regression test iteration')