From aa196f4ebb87442d64b44181388f17a8731596c0 Mon Sep 17 00:00:00 2001 From: Helvetix Victorinox Date: Sun, 14 Sep 2003 00:45:26 +0000 Subject: [PATCH] regenerated with new functions. * gimp-composite-sse2-installer.c, gimp-composite-sse2-test.c: regenerated with new functions. * gimp-composite-sse.c, gimp-composite-sse2.c: Distinguish between 64bit and 128bit constants with a little faux hungarian notation. * gimp-composite-sse2.[ch]: Added implementations of addition_rgba8_rgba8_rgba8, subtract_rgba8_rgba8_rgba8, and swap_rgba8_rgba8_rgba8 * gimp-composite-generic.c: Some formating beautification --- ChangeLog | 16 + app/composite/gimp-composite-generic.c | 4 +- app/composite/gimp-composite-sse.c | 62 +-- app/composite/gimp-composite-sse2-installer.c | 21 +- app/composite/gimp-composite-sse2-test.c | 36 ++ app/composite/gimp-composite-sse2.c | 360 +++++++++++++++++- app/composite/gimp-composite-sse2.h | 3 + 7 files changed, 467 insertions(+), 35 deletions(-) diff --git a/ChangeLog b/ChangeLog index a79f2d4d59..369a7bdaaf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2003-09-13 Helvetix Victorinox + + * gimp-composite-sse2-installer.c, gimp-composite-sse2-test.c: + regenerated with new functions. + + * gimp-composite-sse.c, gimp-composite-sse2.c: + Distinguish between 64bit and 128bit constants with a little + faux hungarian notation. + + * gimp-composite-sse2.[ch]: Added implementations of + addition_rgba8_rgba8_rgba8, subtract_rgba8_rgba8_rgba8, and + swap_rgba8_rgba8_rgba8 + + * gimp-composite-generic.c: + Some formating beautification + 2003-09-13 Maurits Rijk * plug-ins/imagemap/grid.xpm: removed diff --git a/app/composite/gimp-composite-generic.c b/app/composite/gimp-composite-generic.c index 281bbb9fda..a39c38b239 100644 --- a/app/composite/gimp-composite-generic.c +++ b/app/composite/gimp-composite-generic.c @@ -1116,7 +1116,9 @@ void gimp_composite_color_erase_any_any_any_generic (GimpCompositeContext *ctx) { ctx->D = ctx->B; - ctx->combine = (gimp_composite_pixel_alphap[ctx->pixelformat_A] && gimp_composite_pixel_alphap[ctx->pixelformat_B]) ? COLOR_ERASE_INTEN : 0; + ctx->combine = (gimp_composite_pixel_alphap[ctx->pixelformat_A] && gimp_composite_pixel_alphap[ctx->pixelformat_B]) + ? 
COLOR_ERASE_INTEN + : 0; } diff --git a/app/composite/gimp-composite-sse.c b/app/composite/gimp-composite-sse.c index 89e943e3ec..2633bfed1b 100644 --- a/app/composite/gimp-composite-sse.c +++ b/app/composite/gimp-composite-sse.c @@ -146,14 +146,14 @@ "\tpunpckhbw %%"#zero", %%"#dst"\n" -const static guint32 rgba8_alpha_mask[2] = { 0xFF000000, 0xFF000000 }; -const static guint32 rgba8_b1[2] = { 0x01010101, 0x01010101 }; -const static guint32 rgba8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; -const static guint32 rgba8_w1[2] = { 0x00010001, 0x00010001 }; -const static guint32 rgba8_w2[2] = { 0x00020002, 0x00020002 }; -const static guint32 rgba8_w128[2] = { 0x00800080, 0x00800080 }; -const static guint32 rgba8_w256[2] = { 0x01000100, 0x01000100 }; -const static guint32 rgba8_w255[2] = { 0X00FF00FF, 0X00FF00FF }; +const static guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 }; +const static guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 }; +const static guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; +const static guint32 rgba8_w1_64[2] = { 0x00010001, 0x00010001 }; +const static guint32 rgba8_w2_64[2] = { 0x00020002, 0x00020002 }; +const static guint32 rgba8_w128_64[2] = { 0x00800080, 0x00800080 }; +const static guint32 rgba8_w256_64[2] = { 0x01000100, 0x01000100 }; +const static guint32 rgba8_w255_64[2] = { 0X00FF00FF, 0X00FF00FF }; const static guint32 va8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00 }; const static guint32 va8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; @@ -170,7 +170,7 @@ gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) asm volatile ("movq %0,%%mm0" : /* empty */ - : "m" (*rgba8_alpha_mask) + : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) @@ -265,7 +265,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovq %%mm7,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); op.A += 8; op.B += 8; @@ -319,7 +319,7 @@ gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovd %%mm7,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); } @@ -365,7 +365,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) { @@ -421,7 +421,7 @@ gimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) asm volatile ("movq %0, %%mm0\n" "\tmovq %1, %%mm7\n" : - : "m" (*rgba8_alpha_mask), "m" (*rgba8_w1) + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64) : "%mm0", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) @@ -461,7 +461,7 @@ gimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovq %%mm3,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" 
(*rgba8_alpha_mask_64) : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); op.A += 8; op.B += 8; @@ -506,7 +506,7 @@ gimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovd %%mm3,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask_64) : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } @@ -554,7 +554,7 @@ gimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovq %%mm7,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); op.A += 8; op.B += 8; @@ -597,7 +597,7 @@ gimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovd %%mm7,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } @@ -609,9 +609,9 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); - asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7"); + asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) { @@ -688,7 +688,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "pxor %%mm6, %%mm6\n" "movq %1, %%mm7\n" : /* empty */ - : "m" (*rgba8_alpha_mask), "m" (*rgba8_w128) + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64) : "%mm0", "%mm6", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) @@ -758,7 +758,7 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) { @@ -808,8 +808,8 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); - asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); + asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); for (; op.n_pixels >= 2; op.n_pixels -= 2) @@ -916,7 +916,7 @@ sse_op_overlay(void) "\tpor %%mm3,%%mm1\n" : /* empty */ - : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) + : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) ); } @@ -928,7 +928,7 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) asm volatile ("pxor %%mm0,%%mm0\n" "movq %0,%%mm7" : /* empty */ - : "m" (*rgba8_w128) : "%mm0"); + : "m" (*rgba8_w128_64) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) { @@ -978,7 +978,7 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovq %%mm1,%2\n" : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D) - : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) + : "m" 
(*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); op.A += 8; op.B += 8; @@ -1033,7 +1033,7 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovd %%mm1,%2\n" : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); } @@ -1057,7 +1057,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tpor %%mm5,%%mm3\n" "\tmovq %1,%%mm7\n" : /* empty */ - : "m" (op.scale.scale), "m" (*rgba8_w128) + : "m" (op.scale.scale), "m" (*rgba8_w128_64) : "%eax", "%mm0", "%mm5", "%mm6", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) @@ -1109,8 +1109,8 @@ gimp_composite_screen_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); - asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); + asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); asm volatile ("pxor %mm6, %mm6"); for (; op.n_pixels >= 2; op.n_pixels -= 2) @@ -1229,7 +1229,7 @@ gimp_composite_subtract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) { diff --git a/app/composite/gimp-composite-sse2-installer.c b/app/composite/gimp-composite-sse2-installer.c index 5a22ed0788..9ce5757eed 100644 --- a/app/composite/gimp-composite-sse2-installer.c +++ b/app/composite/gimp-composite-sse2-installer.c @@ -9,11 +9,30 @@ #include "gimp-composite-sse2.h" +static struct install_table { + GimpCompositeOperation mode; + GimpPixelFormat A; + GimpPixelFormat B; + GimpPixelFormat D; + void (*function)(GimpCompositeContext *); +} _gimp_composite_sse2[] = { +#if (__GNUC__ >= 3) && defined(USE_SSE) && defined(ARCH_X86) + { GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_addition_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_swap_rgba8_rgba8_rgba8_sse2 }, +#endif + { 0, 0, 0, 0, NULL } +}; void gimp_composite_sse2_install (void) { - /* nothing to do */ + static struct install_table *t = _gimp_composite_sse2; + + for (t = &_gimp_composite_sse2[0]; t->function != NULL; t++) { + gimp_composite_function[t->mode][t->A][t->B][t->D] = t->function; + } gimp_composite_sse2_init (); } diff --git a/app/composite/gimp-composite-sse2-test.c b/app/composite/gimp-composite-sse2-test.c index 562bec56dd..762f4676d7 100644 --- a/app/composite/gimp-composite-sse2-test.c +++ b/app/composite/gimp-composite-sse2-test.c @@ -56,6 +56,42 @@ gimp_composite_sse2_test (int iterations, int n_pixels) va8M[i].a = i; } + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, 
GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { + return (1); + } + gimp_composite_regression_timer_report ("difference", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_addition_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("addition", &generic_ctx, &special_ctx)) { + return (1); + } + gimp_composite_regression_timer_report ("addition", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_subtract_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("subtract", &generic_ctx, &special_ctx)) { + return (1); + } + gimp_composite_regression_timer_report ("subtract", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_swap_rgba8_rgba8_rgba8_sse2, &special_ctx); + if 
(gimp_composite_regression_compare_contexts ("swap", &generic_ctx, &special_ctx)) { + return (1); + } + gimp_composite_regression_timer_report ("swap", ft0, ft1); #endif return (0); } diff --git a/app/composite/gimp-composite-sse2.c b/app/composite/gimp-composite-sse2.c index 95ea4d970b..b0c99a882b 100644 --- a/app/composite/gimp-composite-sse2.c +++ b/app/composite/gimp-composite-sse2.c @@ -41,9 +41,35 @@ #define pmaxub(src,dst,tmp) "pmaxub " "%%" #src ", %%" #dst const static guint32 rgba8_alpha_mask_128[4] = { 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000 }; +const static guint32 rgba8_b1_128[4] = { 0x01010101, 0x01010101, 0x01010101, 0x01010101 }; +const static guint32 rgba8_b255_128[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const static guint32 rgba8_w1_128[4] = { 0x00010001, 0x00010001, 0x00010001, 0x00010001 }; +const static guint32 rgba8_w2_128[4] = { 0x00020002, 0x00020002, 0x00020002, 0x00020002 }; +const static guint32 rgba8_w128_128[4] = { 0x00800080, 0x00800080, 0x00800080, 0x00800080 }; +const static guint32 rgba8_w256_128[4] = { 0x01000100, 0x01000100, 0x01000100, 0x01000100 }; +const static guint32 rgba8_w255_128[4] = { 0X00FF00FF, 0X00FF00FF, 0X00FF00FF, 0X00FF00FF }; + +const static guint32 va8_alpha_mask_128[4] = { 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00 }; +const static guint32 va8_b255_128[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const static guint32 va8_w1_128[4] = { 0x00010001, 0x00010001, 0x00010001, 0x00010001 }; +const static guint32 va8_w255_128[4] = { 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF }; + +const static guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 }; +const static guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 }; +const static guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; +const static guint32 rgba8_w1_64[2] = { 0x00010001, 0x00010001 }; +const static guint32 rgba8_w2_64[2] = { 0x00020002, 0x00020002 }; +const static guint32 rgba8_w128_64[2] = { 0x00800080, 0x00800080 }; +const static guint32 rgba8_w256_64[2] = { 0x01000100, 0x01000100 }; +const static guint32 rgba8_w255_64[2] = { 0X00FF00FF, 0X00FF00FF }; + +const static guint32 va8_alpha_mask_64[2] = { 0xFF00FF00, 0xFF00FF00 }; +const static guint32 va8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; +const static guint32 va8_w1_64[2] = { 0x00010001, 0x00010001 }; +const static guint32 va8_w255_64[2] = { 0x00FF00FF, 0x00FF00FF }; void -debug_display_sse(void) +debug_display_sse (void) { #define mask32(x) ((x)& (unsigned long long) 0xFFFFFFFF) #define print128(reg) { \ @@ -61,7 +87,7 @@ debug_display_sse(void) } void -xxxgimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +gimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; @@ -91,6 +117,31 @@ xxxgimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) op.D += 16; } + asm volatile ("movq %0,%%mm0" + : /* empty */ + : "m" (*rgba8_alpha_mask_64) + : "%mm0"); + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpaddusb %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\t" pminub(mm3, mm2, mm4) "\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) { asm volatile (" movd 
(%0), %%mm2;\n" @@ -111,6 +162,311 @@ xxxgimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) asm("emms"); } + +void +xxxgimp_composite_burn_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + +} + + +void +xxxgimp_composite_darken_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + +} + +void +gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + GimpCompositeContext op = *_op; + + asm volatile (" movq %0,%%mm0\n" + "\tmovdqu %1,%%xmm0" + : + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_alpha_mask_128) + : "%mm0"); + + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %0,%%xmm2\n" + "\tmovdqu %1,%%xmm3\n" + "\tmovdqu %%xmm2,%%xmm4\n" + "\tmovdqu %%xmm3,%%xmm5\n" + "\tpsubusb %%xmm3,%%xmm4\n" + "\tpsubusb %%xmm2,%%xmm5\n" + "\tpaddb %%xmm5,%%xmm4\n" + "\tmovdqu %%xmm0,%%xmm1\n" + "\tpandn %%xmm4,%%xmm1\n" + "\tpminub %%xmm3,%%xmm2\n" + "\tpand %%xmm0,%%xmm2\n" + "\tpor %%xmm2,%%xmm1\n" + "\tmovdqu %%xmm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); + op.A += 16; + op.B += 16; + op.D += 16; + } + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tmovq %%mm3, %%mm5\n" + "\tpsubusb %%mm3, %%mm4\n" + "\tpsubusb %%mm2, %%mm5\n" + "\tpaddb %%mm5, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %0, %%mm2\n" + "\tmovd %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tmovq %%mm3, %%mm5\n" + "\tpsubusb %%mm3, %%mm4\n" + "\tpsubusb %%mm2, %%mm5\n" + "\tpaddb %%mm5, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } + + asm("emms"); +} + +void +xxxgimp_composite_divide_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + +void +xxxgimp_composite_dodge_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + +void +xxxgimp_composite_grain_extract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + +} + +void +xxxgimp_composite_grain_merge_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + +} + +void +xxxgimp_composite_lighten_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + +void +xxxgimp_composite_multiply_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + +static void +sse2_op_overlay(void) +{ +} + +void +xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + + +void +xxxgimp_composite_scale_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + +void +xxxgimp_composite_screen_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ +} + + +void +gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + GimpCompositeContext op = *_op; + + asm volatile (" movq %0,%%mm0\n" + "\tmovdqu %1,%%xmm0\n" + : /* empty */ + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_alpha_mask_128) + : "%mm0", "%xmm0"); + + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %0,%%xmm2\n" + "\tmovdqu %1,%%xmm3\n" + "\tmovdqu %%xmm2,%%xmm4\n" + 
"\tpsubusb %%xmm3,%%xmm4\n" + + "\tmovdqu %%xmm0,%%xmm1\n" + "\tpandn %%xmm4,%%xmm1\n" + "\t" pminub(xmm3,xmm2,xmm4) "\n" + "\tpand %%xmm0,%%xmm2\n" + "\tpor %%xmm2,%%xmm1\n" + "\tmovdqu %%xmm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); + op.A += 16; + op.B += 16; + op.D += 16; + } + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + + "\tmovq %%mm2,%%mm4\n" + "\tpsubusb %%mm3,%%mm4\n" + + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm3,mm2,mm4) "\n" + + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" + + "\tmovq %%mm2,%%mm4\n" + "\tpsubusb %%mm3,%%mm4\n" + + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm3,mm2,mm4) "\n" + + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovd %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + + asm("emms"); +} + +void +gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + GimpCompositeContext op = *_op; + + /* + * Inhale one whole i686 cache line at once. 64 bytes, 16 rgba8 pixels, 4 128 bit xmm registers. + */ + for (; op.n_pixels >= 16; op.n_pixels -= 16) + { + asm volatile (" movdqu %0,%%xmm0\n" + "\tmovdqu %1,%%xmm1\n" + "\tmovdqu %2,%%xmm2\n" + "\tmovdqu %3,%%xmm3\n" + "\tmovdqu %4,%%xmm4\n" + "\tmovdqu %5,%%xmm5\n" + "\tmovdqu %6,%%xmm6\n" + "\tmovdqu %7,%%xmm7\n" + + "\tmovdqu %%xmm0,%1\n" + "\tmovdqu %%xmm1,%0\n" + "\tmovdqu %%xmm2,%3\n" + "\tmovdqu %%xmm3,%2\n" + "\tmovdqu %%xmm4,%5\n" + "\tmovdqu %%xmm5,%4\n" + "\tmovdqu %%xmm6,%7\n" + "\tmovdqu %%xmm7,%6\n" + : /* empty */ + : "m" (op.A[0]), "m" (op.B[0]), + "m" (op.A[1]), "m" (op.B[1]), + "m" (op.A[2]), "m" (op.B[2]), + "m" (op.A[3]), "m" (op.B[3]) + : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"); + op.A += 64; + op.B += 64; + } + + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %0,%%xmm2\n" + "\tmovdqu %1,%%xmm3\n" + "\tmovdqu %%xmm3,%0\n" + "\tmovdqu %%xmm2,%1\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B) + : "0", "1", "%xmm1", "%xmm2", "%xmm3", "%xmm4"); + op.A += 16; + op.B += 16; + } + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + "\tmovq %%mm3,%0\n" + "\tmovq %%mm2,%1\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B) + : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" + "\tmovd %%mm3,%0\n" + "\tmovd %%mm2,%1\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B) + : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); + } + + asm("emms"); +} + #endif /* __GNUC__ > 3 */ #endif /* defined(ARCH_X86) */ #endif /* defined(USE_SSE) */ diff --git a/app/composite/gimp-composite-sse2.h b/app/composite/gimp-composite-sse2.h index ce28e679ed..8310ee6272 100644 --- a/app/composite/gimp-composite-sse2.h +++ b/app/composite/gimp-composite-sse2.h @@ -10,4 +10,7 @@ extern void gimp_composite_sse2_init (void); extern void gimp_composite_sse2_install (void); extern void gimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void 
gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); #endif
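
The SSE and SSE2 addition paths above all encode the same per-pixel rule: the colour bytes of A and B are added with unsigned saturation (paddusb), while the destination alpha is forced to the smaller of the two source alphas (pminub, masked by the rgba8_alpha_mask_* constants). The new subtract and difference variants only change the arithmetic step; the alpha handling is identical. A minimal scalar sketch of that rule follows, for reference only; the function name, argument types and the assumption that alpha is the fourth byte of each pixel (which the 0xFF000000 mask implies on little-endian x86) are illustrative, not part of the patch:

/* Reference-only sketch of what the SSE/SSE2 addition paths compute per pixel:
 * colour bytes are a saturating add of A and B, alpha is min(alpha_A, alpha_B).
 * Subtract/difference follow the same pattern with a different arithmetic step. */
static void
composite_addition_rgba8_scalar (const unsigned char *A,
                                 const unsigned char *B,
                                 unsigned char       *D,
                                 unsigned long        n_pixels)
{
  unsigned long i;

  for (i = 0; i < n_pixels; i++)
    {
      unsigned int r = A[0] + B[0];
      unsigned int g = A[1] + B[1];
      unsigned int b = A[2] + B[2];

      D[0] = (r > 255) ? 255 : r;           /* paddusb: unsigned saturation      */
      D[1] = (g > 255) ? 255 : g;
      D[2] = (b > 255) ? 255 : b;
      D[3] = (A[3] < B[3]) ? A[3] : B[3];   /* pminub under the alpha mask       */

      A += 4; B += 4; D += 4;
    }
}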
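
For readers who prefer intrinsics to inline assembly, the four-pixels-per-iteration SSE2 loop in gimp_composite_addition_rgba8_rgba8_rgba8_sse2 maps one-to-one onto <emmintrin.h>. The sketch below is an assumed rewrite for illustration, not code from the patch; the unaligned loads and stores mirror the movdqu instructions used above, and the two-pixel MMX and single-pixel tails that the patch handles separately are omitted:

/* Hypothetical intrinsics rendering of the SSE2 addition inner loop:
 *   movdqu  -> _mm_loadu_si128 / _mm_storeu_si128
 *   paddusb -> _mm_adds_epu8,  pminub -> _mm_min_epu8
 *   pand / pandn / por -> _mm_and_si128 / _mm_andnot_si128 / _mm_or_si128 */
#include <emmintrin.h>

static void
addition_rgba8_sse2_intrinsics (const unsigned char *A,
                                const unsigned char *B,
                                unsigned char       *D,
                                unsigned long        n_pixels)
{
  const __m128i alpha_mask = _mm_set1_epi32 ((int) 0xFF000000); /* rgba8_alpha_mask_128 */

  for (; n_pixels >= 4; n_pixels -= 4)
    {
      __m128i a   = _mm_loadu_si128 ((const __m128i *) A);
      __m128i b   = _mm_loadu_si128 ((const __m128i *) B);
      __m128i sum = _mm_adds_epu8 (a, b);                       /* saturating byte add */
      __m128i rgb = _mm_andnot_si128 (alpha_mask, sum);         /* keep RGB of the sum */
      __m128i alf = _mm_and_si128 (alpha_mask, _mm_min_epu8 (a, b)); /* alpha = min    */

      _mm_storeu_si128 ((__m128i *) D, _mm_or_si128 (rgb, alf));

      A += 16; B += 16; D += 16;
    }
  /* Remaining 1-3 pixels would be handled by the 64-bit MMX and single-pixel
   * code paths shown in the patch, followed by emms. */
}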