diff --git a/ChangeLog b/ChangeLog
index 4a55fa3a7d..996500474f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2003-09-15  Helvetix Victorinox
+
+	* gimp-composite-mmx-{test,installer}.c
+	* gimp-composite-sse-{test,installer}.c
+	* gimp-composite-sse2-{test,installer}.c
+	* make-installer.py: sort test output by mode name
+
+	* gimp-composite-sse2.[ch]: New compositing mode implementations
+	  darken
+	  difference
+	  grain_extract
+	  lighten
+
+	* gimp-composite-x86.h: added
+
+	* gimp-composite-{sse,mmx,sse2}.[ch]: Tightening declarations of clobbered registers.
+
 2003-09-16  Manish Singh
 
 	* app/Makefile.am: use -u to prevent garbage collection of symbols
diff --git a/app/composite/gimp-composite-3dnow-test.c b/app/composite/gimp-composite-3dnow-test.c
index d3963f8eb8..8fd48e755d 100644
--- a/app/composite/gimp-composite-3dnow-test.c
+++ b/app/composite/gimp-composite-3dnow-test.c
@@ -71,7 +71,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
 
   iterations = 1;
-  n_pixels = 1048577;
+  n_pixels = 163921;
   argv++, argc--;
   while (argc >= 2)
     {
diff --git a/app/composite/gimp-composite-altivec-test.c b/app/composite/gimp-composite-altivec-test.c
index d8c4def20e..c9739b313e 100644
--- a/app/composite/gimp-composite-altivec-test.c
+++ b/app/composite/gimp-composite-altivec-test.c
@@ -71,7 +71,7 @@ main (int argc, char *argv[])
   putenv ("GIMP_COMPOSITE=0x1");
 
   iterations = 1;
-  n_pixels = 1048577;
+  n_pixels = 163921;
   argv++, argc--;
   while (argc >= 2)
     {
diff --git a/app/composite/gimp-composite-mmx-test.c b/app/composite/gimp-composite-mmx-test.c
index 953cd2f6d2..5465affe3c 100644
--- a/app/composite/gimp-composite-mmx-test.c
+++ b/app/composite/gimp-composite-mmx-test.c
@@ -57,74 +57,52 @@ gimp_composite_mmx_test (int iterations, int n_pixels)
     }
 
-  gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2);
-  gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1);
-  ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx);
-  ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_multiply_rgba8_rgba8_rgba8_mmx, &special_ctx);
-  if (gimp_composite_regression_compare_contexts ("multiply", &generic_ctx, &special_ctx)) {
-    return (1);
-  }
-  gimp_composite_regression_timer_report ("multiply", ft0, ft1);
-
-  gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2);
-  gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1);
-  ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx);
-  ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_screen_rgba8_rgba8_rgba8_mmx, &special_ctx);
-  if
(gimp_composite_regression_compare_contexts ("screen", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("screen", ft0, ft1); - - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_mmx, &special_ctx); - if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("difference", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_addition_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("addition", &generic_ctx, &special_ctx)) { + printf("addition failed\n"); return (1); } gimp_composite_regression_timer_report ("addition", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_subtract_rgba8_rgba8_rgba8_mmx, &special_ctx); - if (gimp_composite_regression_compare_contexts ("subtract", &generic_ctx, 
&special_ctx)) { + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_burn_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("burn", &generic_ctx, &special_ctx)) { + printf("burn failed\n"); return (1); } - gimp_composite_regression_timer_report ("subtract", ft0, ft1); + gimp_composite_regression_timer_report ("burn", ft0, ft1); gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_darken_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("darken", &generic_ctx, &special_ctx)) { + printf("darken failed\n"); return (1); } gimp_composite_regression_timer_report ("darken", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_lighten_rgba8_rgba8_rgba8_mmx, &special_ctx); - if (gimp_composite_regression_compare_contexts ("lighten", &generic_ctx, &special_ctx)) { + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { + printf("difference failed\n"); return (1); } - gimp_composite_regression_timer_report ("lighten", ft0, ft1); + gimp_composite_regression_timer_report ("difference", ft0, ft1); gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIVIDE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIVIDE, 
GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_divide_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("divide", &generic_ctx, &special_ctx)) { + printf("divide failed\n"); return (1); } gimp_composite_regression_timer_report ("divide", ft0, ft1); @@ -134,24 +112,17 @@ gimp_composite_mmx_test (int iterations, int n_pixels) ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_dodge_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("dodge", &generic_ctx, &special_ctx)) { + printf("dodge failed\n"); return (1); } gimp_composite_regression_timer_report ("dodge", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_burn_rgba8_rgba8_rgba8_mmx, &special_ctx); - if (gimp_composite_regression_compare_contexts ("burn", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("burn", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("grain_extract", &generic_ctx, &special_ctx)) { + printf("grain_extract failed\n"); return (1); } gimp_composite_regression_timer_report ("grain_extract", ft0, ft1); @@ -161,27 +132,70 @@ gimp_composite_mmx_test (int iterations, int n_pixels) ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("grain_merge", &generic_ctx, &special_ctx)) { + printf("grain_merge failed\n"); return (1); } gimp_composite_regression_timer_report 
("grain_merge", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_swap_rgba8_rgba8_rgba8_mmx, &special_ctx); - if (gimp_composite_regression_compare_contexts ("swap", &generic_ctx, &special_ctx)) { + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_lighten_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("lighten", &generic_ctx, &special_ctx)) { + printf("lighten failed\n"); return (1); } - gimp_composite_regression_timer_report ("swap", ft0, ft1); + gimp_composite_regression_timer_report ("lighten", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_multiply_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("multiply", &generic_ctx, &special_ctx)) { + printf("multiply failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("multiply", ft0, ft1); gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SCALE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SCALE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, 
gimp_composite_scale_rgba8_rgba8_rgba8_mmx, &special_ctx); if (gimp_composite_regression_compare_contexts ("scale", &generic_ctx, &special_ctx)) { + printf("scale failed\n"); return (1); } gimp_composite_regression_timer_report ("scale", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_screen_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("screen", &generic_ctx, &special_ctx)) { + printf("screen failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("screen", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_subtract_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("subtract", &generic_ctx, &special_ctx)) { + printf("subtract failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("subtract", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_swap_rgba8_rgba8_rgba8_mmx, &special_ctx); + if (gimp_composite_regression_compare_contexts ("swap", &generic_ctx, &special_ctx)) { + printf("swap failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("swap", ft0, ft1); #endif return (0); } @@ -197,7 +211,7 @@ main (int argc, char *argv[]) putenv ("GIMP_COMPOSITE=0x1"); iterations = 1; - n_pixels = 1048577; + n_pixels = 163921; argv++, argc--; while (argc >= 2) { diff --git a/app/composite/gimp-composite-mmx.c b/app/composite/gimp-composite-mmx.c index 7006e9ded4..81f587bea3 100644 --- 
a/app/composite/gimp-composite-mmx.c +++ b/app/composite/gimp-composite-mmx.c @@ -1,7 +1,8 @@ -/* The GIMP -- an image manipulation program +/* -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*- + * + * The GIMP -- an image manipulation program * Copyright (C) 1995 Spencer Kimball and Peter Mattis * - * -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*- * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,109 +43,12 @@ #include "gimp-composite.h" #include "gimp-composite-mmx.h" +#include "gimp-composite-x86.h" #define pminub(src,dst,tmp) "\tmovq %%" #dst ", %%" #tmp ";" "psubusb %%" #src ", %%" #tmp ";" "psubb %%" #tmp ", %%" #dst "\n" #define pmaxub(a,b,tmp) "\tmovq %%" #a ", %%" #tmp ";" "psubusb %%" #b ", %%" #tmp ";" "paddb %%" #tmp ", %%" #b "\n" -/* - * Clobbers eax, ecx edx - */ -/* - * Double-word divide. Adjusted for subsequent unsigned packing - * (high-order bit of each word is cleared) - */ -#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \ - "movd %%" #divisor ",%%ecx; " \ - "xorl %%edx,%%edx; " \ - "divw %%cx; " \ - "roll $16, %%eax; " \ - "roll $16, %%ecx; " \ - "xorl %%edx,%%edx; " \ - "divw %%cx; " \ - "btr $15, %%eax; " \ - "roll $16, %%eax; " \ - "btr $15, %%eax; " \ - "movd %%eax,%%" #quotient ";" - -/* - * Quadword divide. No adjustment for subsequent unsigned packing - * (high-order bit of each word is left alone) - */ -#define pdivwqX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \ - "movd %%" #divisor ",%%ecx; " \ - "xorl %%edx,%%edx; " \ - "divw %%cx; " \ - "roll $16, %%eax; " \ - "roll $16, %%ecx; " \ - "xorl %%edx,%%edx; " \ - "divw %%cx; " \ - "roll $16, %%eax; " \ - "movd %%eax,%%" #quotient "; " \ - "psrlq $32,%%" #dividend ";" \ - "psrlq $32,%%" #divisor ";" \ - "movd %%" #dividend ",%%eax; " \ - "movd %%" #divisor ",%%ecx; " \ - "xorl %%edx,%%edx; " \ - "divw %%cx; " \ - "roll $16, %%eax; " \ - "roll $16, %%ecx; " \ - "xorl %%edx,%%edx; " \ - "divw %%cx; " \ - "roll $16, %%eax; " \ - "movd %%eax,%%" #divisor ";" \ - "psllq $32,%%" #divisor ";" \ - "por %%" #divisor ",%%" #quotient ";" - -/* - * Quadword divide. Adjusted for subsequent unsigned packing - * (high-order bit of each word is cleared) - */ -#define pdivwuqX(dividend,divisor,quotient) \ - pdivwX(dividend,divisor,quotient) \ - "psrlq $32,%%" #dividend ";" \ - "psrlq $32,%%" #divisor ";" \ - pdivwX(dividend,divisor,quotient) \ - "movd %%eax,%%" #divisor ";" \ - "psllq $32,%%" #divisor ";" \ - "por %%" #divisor ",%%" #quotient ";" - -/* equivalent to the INT_MULT() macro in gimp-composite-generic.c */ -/* - * opr2 = INT_MULT(opr1, opr2, t) - * - * Operates across quad-words using x86 word (16bit) value. 
- * Result is left in opr2 - * - * opr1 = opr1 * opr2 + w128 - * opr2 = opr1 - * opr2 = ((opr2 >> 8) + opr1) >> 8 - */ -#define pmulwX(opr1,opr2,w128) \ - "\tpmullw %%"#opr2", %%"#opr1"; " \ - "\tpaddw %%"#w128", %%"#opr1"; " \ - "\tmovq %%"#opr1", %%"#opr2"; " \ - "\tpsrlw $8, %%"#opr2"; " \ - "\tpaddw %%"#opr1", %%"#opr2"; " \ - "\tpsrlw $8, %%"#opr2"\n" - -/* a = INT_MULT(a,b) */ -#define mmx_int_mult(a,b,w128) \ - "\tpmullw %%"#b", %%"#a"; " \ - "\tpaddw %%"#w128", %%"#a"; " \ - "\tmovq %%"#a", %%"#b"; " \ - "\tpsrlw $8, %%"#b"; " \ - "\tpaddw %%"#a", %%"#b"; " \ - "\tpsrlw $8, %%"#b"\n" - -#define mmx_low_bytes_to_words(src,dst,zero) \ - "\tmovq %%"#src", %%"#dst"; " \ - "\tpunpcklbw %%"#zero", %%"#dst"\n" - -#define mmx_high_bytes_to_words(src,dst,zero) \ - "\tmovq %%"#src", %%"#dst"; " \ - "\tpunpckhbw %%"#zero", %%"#dst"\n" - void debug_display_mmx(void) @@ -188,40 +92,40 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) for (; op.n_pixels >= 2; op.n_pixels -= 2) { - asm (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tpaddusb %%mm3, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - op.A += 8; - op.B += 8; - op.D += 8; - } + asm (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpaddusb %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\t" pminub(mm3, mm2, mm4) "\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) { asm volatile (" movd %0, %%mm2\n" "\tmovd %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tpaddusb %%mm3, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - } + "\tmovq %%mm2, %%mm4\n" + "\tpaddusb %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\t" pminub(mm3, mm2, mm4) "\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + } asm("emms"); } @@ -232,109 +136,109 @@ gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) GimpCompositeContext op = *_op; for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm (" movq %0,%%mm0\n" - "\tmovq %1,%%mm1\n" + { + asm (" movq %0,%%mm0\n" + "\tmovq %1,%%mm1\n" - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + 
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm7) "\n" + "\t" pdivwqX(mm4,mm5,mm7) "\n" - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpckhbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm6) "\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpckhbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm6) "\n" - "\tmovq %5,%%mm4\n" - "\tmovq %%mm4,%%mm5\n" - "\tpsubusw %%mm6,%%mm4\n" - "\tpsubusw %%mm7,%%mm5\n" + "\tmovq %5,%%mm4\n" + "\tmovq %%mm4,%%mm5\n" + "\tpsubusw %%mm6,%%mm4\n" + "\tpsubusw %%mm7,%%mm5\n" - "\tpackuswb %%mm4,%%mm5\n" + "\tpackuswb %%mm4,%%mm5\n" - "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ + "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ - "\tmovq %6,%%mm7\n" - "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ + "\tmovq %6,%%mm7\n" + "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ - "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ - "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ + "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ + "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ - "\tmovq %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tmovq %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0,%%mm0\n" - "\tmovd %1,%%mm1\n" + { + asm volatile (" movd %0,%%mm0\n" + "\tmovd %1,%%mm1\n" - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm7) "\n" + "\t" pdivwqX(mm4,mm5,mm7) "\n" - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpckhbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm6) "\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpckhbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm6) "\n" - "\tmovq %5,%%mm4\n" - "\tmovq %%mm4,%%mm5\n" - "\tpsubusw 
%%mm6,%%mm4\n" - "\tpsubusw %%mm7,%%mm5\n" + "\tmovq %5,%%mm4\n" + "\tmovq %%mm4,%%mm5\n" + "\tpsubusw %%mm6,%%mm4\n" + "\tpsubusw %%mm7,%%mm5\n" - "\tpackuswb %%mm4,%%mm5\n" + "\tpackuswb %%mm4,%%mm5\n" - "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ + "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ - "\tmovq %6,%%mm7\n" - "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ + "\tmovq %6,%%mm7\n" + "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ - "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ - "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ + "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ + "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ - "\tmovd %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - } + "\tmovd %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask) + : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + } asm("emms"); } @@ -345,30 +249,30 @@ gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - for (; op.n_pixels >= 2; op.n_pixels -= 2) + for (; op.n_pixels >= 2; op.n_pixels -= 2) { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tmovq %%mm2, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + asm volatile (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" + "\t" pminub(mm3, mm2, mm4) "\n" + "\tmovq %%mm2, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tmovd %%mm2, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm2", "%mm3", "%mm4"); - } + asm volatile (" movd %0, %%mm2\n" + "\tmovd %1, %%mm3\n" + "\t" pminub(mm3, mm2, mm4) "\n" + "\tmovd %%mm2, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } @@ -381,47 +285,47 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tmovq %%mm3, %%mm5\n" - "\tpsubusb %%mm3, %%mm4\n" - "\tpsubusb %%mm2, %%mm5\n" - "\tpaddb %%mm5, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\tpminub %%mm3, %%mm2\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + { + asm volatile (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tmovq %%mm3, %%mm5\n" + "\tpsubusb %%mm3, %%mm4\n" + "\tpsubusb %%mm2, %%mm5\n" + "\tpaddb %%mm5, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", 
"%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tmovq %%mm3, %%mm5\n" - "\tpsubusb %%mm3, %%mm4\n" - "\tpsubusb %%mm2, %%mm5\n" - "\tpaddb %%mm5, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\tpminub %%mm3, %%mm2\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } + { + asm volatile (" movd %0, %%mm2\n" + "\tmovd %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tmovq %%mm3, %%mm5\n" + "\tpsubusb %%mm3, %%mm4\n" + "\tpsubusb %%mm2, %%mm5\n" + "\tpaddb %%mm5, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } @@ -432,95 +336,95 @@ gimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) GimpCompositeContext op = *_op; asm volatile ("movq %0, %%mm0\n" - "\tmovq %1, %%mm7\n" - : - : "m" (*rgba8_alpha_mask), "m" (*rgba8_w1) - : "%mm0", "%mm7"); + "\tmovq %1, %%mm7\n" + : + : "m" (*rgba8_alpha_mask), "m" (*rgba8_w1) + : "%mm0", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + { + asm volatile (" movq %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ + "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm6,%%mm6\n" - "\tpunpckhbw %%mm6,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm6,%%mm6\n" + "\tpunpckhbw %%mm6,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ + "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ - "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ + "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ - "\t" pminub(mm0,mm1,mm3) "\n" - "\tmovq %3,%%mm3\n" - "\tmovq %%mm3,%%mm2\n" + "\t" pminub(mm0,mm1,mm3) "\n" + "\tmovq %3,%%mm3\n" + "\tmovq %%mm3,%%mm2\n" - "\tpandn %%mm5,%%mm3\n" + "\tpandn %%mm5,%%mm3\n" - "\tpand %%mm2,%%mm1\n" - "\tpor %%mm1,%%mm3\n" + "\tpand %%mm2,%%mm1\n" + "\tpor %%mm1,%%mm3\n" - "\tmovq %%mm3,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask) - : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tmovq %%mm3,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask) + : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0,%%mm0\n" 
- "\tmovd %1,%%mm1\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + { + asm volatile (" movd %0,%%mm0\n" + "\tmovd %1,%%mm1\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ + "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm6,%%mm6\n" - "\tpunpckhbw %%mm6,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm6,%%mm6\n" + "\tpunpckhbw %%mm6,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ + "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ - "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ + "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ - "\t" pminub(mm0,mm1,mm3) "\n" - "\tmovq %3,%%mm3\n" - "\tmovq %%mm3,%%mm2\n" + "\t" pminub(mm0,mm1,mm3) "\n" + "\tmovq %3,%%mm3\n" + "\tmovq %%mm3,%%mm2\n" - "\tpandn %%mm5,%%mm3\n" + "\tpandn %%mm5,%%mm3\n" - "\tpand %%mm2,%%mm1\n" - "\tpor %%mm1,%%mm3\n" + "\tpand %%mm2,%%mm1\n" + "\tpor %%mm1,%%mm3\n" - "\tmovd %%mm3,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask) - : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } + "\tmovd %%mm3,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask) + : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } asm("emms"); } @@ -531,87 +435,87 @@ gimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) GimpCompositeContext op = *_op; for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm2,%%mm3\n" - "\tpunpcklbw %%mm0,%%mm2\n" + { + asm volatile (" movq %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm2,%%mm3\n" + "\tpunpcklbw %%mm0,%%mm2\n" - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" - "\t" pdivwuqX(mm2,mm4,mm5) "\n" + "\t" pdivwuqX(mm2,mm4,mm5) "\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm2,%%mm3\n" - "\tpunpckhbw %%mm0,%%mm2\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm2,%%mm3\n" + "\tpunpckhbw %%mm0,%%mm2\n" - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" - "\t" pdivwuqX(mm2,mm4,mm6) "\n" + "\t" pdivwuqX(mm2,mm4,mm6) "\n" - "\tpackuswb %%mm6,%%mm5\n" + "\tpackuswb %%mm6,%%mm5\n" - "\tmovq %4,%%mm6\n" - "\tmovq %%mm1,%%mm7\n" - "\t" pminub(mm0,mm7,mm2) "\n" - "\tpand %%mm6,%%mm7\n" - "\tpandn %%mm5,%%mm6\n" + "\tmovq %4,%%mm6\n" + "\tmovq %%mm1,%%mm7\n" + "\t" pminub(mm0,mm7,mm2) "\n" + "\tpand %%mm6,%%mm7\n" + "\tpandn %%mm5,%%mm6\n" - "\tpor %%mm6,%%mm7\n" + "\tpor %%mm6,%%mm7\n" - "\tmovq %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", 
"%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tmovq %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask) + : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm2,%%mm3\n" - "\tpunpcklbw %%mm0,%%mm2\n" + { + asm volatile (" movd %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm2,%%mm3\n" + "\tpunpcklbw %%mm0,%%mm2\n" - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" - "\t" pdivwuqX(mm2,mm4,mm5) "\n" + "\t" pdivwuqX(mm2,mm4,mm5) "\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm2,%%mm3\n" - "\tpunpckhbw %%mm0,%%mm2\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm2,%%mm3\n" + "\tpunpckhbw %%mm0,%%mm2\n" - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" - "\t" pdivwuqX(mm2,mm4,mm6) "\n" + "\t" pdivwuqX(mm2,mm4,mm6) "\n" - "\tpackuswb %%mm6,%%mm5\n" + "\tpackuswb %%mm6,%%mm5\n" - "\tmovq %4,%%mm6\n" - "\tmovq %%mm1,%%mm7\n" - "\t" pminub(mm0,mm7,mm2) "\n" - "\tpand %%mm6,%%mm7\n" - "\tpandn %%mm5,%%mm6\n" + "\tmovq %4,%%mm6\n" + "\tmovq %%mm1,%%mm7\n" + "\t" pminub(mm0,mm7,mm2) "\n" + "\tpand %%mm6,%%mm7\n" + "\tpandn %%mm5,%%mm6\n" - "\tpor %%mm6,%%mm7\n" + "\tpor %%mm6,%%mm7\n" - "\tmovd %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } + "\tmovd %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } asm("emms"); } @@ -621,72 +525,72 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128) : "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - "\tpsubw %%mm5,%%mm4\n" - "\tpaddw %%mm7,%%mm4\n" - "\tmovq %%mm4,%%mm1\n" + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + "\tpsubw %%mm5,%%mm4\n" + "\tpaddw %%mm7,%%mm4\n" + "\tmovq %%mm4,%%mm1\n" - mmx_high_bytes_to_words(mm2,mm4,mm6) - mmx_high_bytes_to_words(mm3,mm5,mm6) + mmx_high_bytes_to_words(mm2,mm4,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) - "\tpsubw %%mm5,%%mm4\n" - "\tpaddw %%mm7,%%mm4\n" + "\tpsubw %%mm5,%%mm4\n" + "\tpaddw %%mm7,%%mm4\n" - "\tpackuswb %%mm4,%%mm1\n" - "\tmovq %%mm1,%%mm4\n" + "\tpackuswb %%mm4,%%mm1\n" + "\tmovq %%mm1,%%mm4\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0,%%mm2\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm3,mm2,mm4) "\n" + "\tpand %%mm0,%%mm2\n" - "\tpor %%mm2,%%mm1\n" - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : 
"0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" + { + asm volatile (" movd %0, %%mm2\n" + "\tmovd %1, %%mm3\n" - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) - "\tpsubw %%mm5, %%mm4\n" - "\tpaddw %%mm7, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" + "\tpsubw %%mm5, %%mm4\n" + "\tpaddw %%mm7, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" - "\tpackuswb %%mm6, %%mm1\n" + "\tpackuswb %%mm6, %%mm1\n" - "\tmovq %%mm1, %%mm4\n" + "\tmovq %%mm1, %%mm4\n" - "\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n" + "\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n" - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" + "\t" pminub(mm3,mm2,mm4) "\n" + "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } @@ -704,63 +608,63 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) : "%mm0", "%mm6", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" + { + asm volatile (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - "\tpaddw %%mm5, %%mm4\n" - "\tpsubw %%mm7, %%mm4\n" + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + "\tpaddw %%mm5, %%mm4\n" + "\tpsubw %%mm7, %%mm4\n" - mmx_high_bytes_to_words(mm2,mm1,mm6) - mmx_high_bytes_to_words(mm3,mm5,mm6) - "\tpaddw %%mm5, %%mm1\n" - "\tpsubw %%mm7, %%mm1\n" + mmx_high_bytes_to_words(mm2,mm1,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) + "\tpaddw %%mm5, %%mm1\n" + "\tpsubw %%mm7, %%mm1\n" - "\tpackuswb %%mm1, %%mm4\n" + "\tpackuswb %%mm1, %%mm4\n" - "\t" pminub(mm3,mm2,mm5) "\n" - "\tpand %%mm0, %%mm2\n" + "\t" pminub(mm3,mm2,mm5) "\n" + "\tpand %%mm0, %%mm2\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" + { + asm volatile (" movd %0, %%mm2\n" + "\tmovd %1, %%mm3\n" - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - - "\tpaddw %%mm5, %%mm4\n" - "\tpsubw %%mm7, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" - "\tpackuswb %%mm6, %%mm1\n" - - "\tmovq %%mm1, %%mm4\n" - - "\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" - - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } + 
mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + + "\tpaddw %%mm5, %%mm4\n" + "\tpsubw %%mm7, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + "\tpackuswb %%mm6, %%mm1\n" + + "\tmovq %%mm1, %%mm4\n" + + "\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n" + + "\t" pminub(mm3,mm2,mm4) "\n" + "\tpand %%mm0, %%mm2\n" + + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } @@ -773,27 +677,27 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\t" pmaxub(mm3,mm4,mm5) "\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0, %%mm3\n" - "\tpor %%mm3, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } + { + asm volatile (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\t" pmaxub(mm3,mm4,mm5) "\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0, %%mm3\n" + "\tpor %%mm3, %%mm1\n" + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { + { asm volatile (" movd %0, %%mm2\n" "\tmovd %1, %%mm3\n" "\tmovq %%mm2, %%mm4\n" @@ -825,38 +729,38 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" + { + asm volatile (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" - mmx_low_bytes_to_words(mm2,mm1,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - mmx_int_mult(mm5,mm1,mm7) + mmx_low_bytes_to_words(mm2,mm1,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + mmx_int_mult(mm5,mm1,mm7) - mmx_high_bytes_to_words(mm2,mm4,mm6) - mmx_high_bytes_to_words(mm3,mm5,mm6) - mmx_int_mult(mm5,mm4,mm7) + mmx_high_bytes_to_words(mm2,mm4,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) + mmx_int_mult(mm5,mm4,mm7) - "\tpackuswb %%mm4, %%mm1\n" + "\tpackuswb %%mm4, %%mm1\n" - "\tmovq %%mm0, %%mm4\n" - "\tpandn %%mm1, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; + "\tmovq %%mm0, %%mm4\n" + "\tpandn %%mm1, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + "\t" pminub(mm3,mm2,mm4) "\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; } if (op.n_pixels) - { + { asm volatile (" movd %0, %%mm2\n" "\tmovd %1, %%mm3\n" @@ -940,114 +844,114 @@ xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) asm volatile ("pxor %%mm0,%%mm0\n" "movq %0,%%mm7" : /* empty */ - : "m" (*rgba8_w128) : "%mm0"); + : "m" (*rgba8_w128) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" 
movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - - /* low bytes */ - mmx_low_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ - "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ - "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ - mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ - - /* high bytes */ - mmx_high_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm1,%%mm1\n" - "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ - "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ - "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ - - "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ - - mmx_low_bytes_to_words(mm4,mm5,mm0) - mmx_low_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm5\n" - mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ - - mmx_high_bytes_to_words(mm4,mm1,mm0) - mmx_high_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm1\n" - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ - - "\tpackuswb %%mm1,%%mm5\n" - - "\tmovq %4,%%mm0\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm5,%%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" - - "\tpor %%mm3,%%mm1\n" - - "\tmovq %%mm1,%2\n" - : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D) - : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + + /* low bytes */ + mmx_low_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ + "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ + mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ + + /* high bytes */ + mmx_high_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm1,%%mm1\n" + "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ + "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ + + "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ + + mmx_low_bytes_to_words(mm4,mm5,mm0) + mmx_low_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm5\n" + mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ + + mmx_high_bytes_to_words(mm4,mm1,mm0) + mmx_high_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm1\n" + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ + + "\tpackuswb %%mm1,%%mm5\n" + + "\tmovq %4,%%mm0\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm5,%%mm1\n" + + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" + + "\tpor %%mm3,%%mm1\n" + + "\tmovq %%mm1,%2\n" + : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D) + : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; } if (op.n_pixels) - { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" + { + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" - /* low bytes */ - mmx_low_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ - "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ - "\tmovq %3,%%mm6\n" /* mm6 = 
words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ - mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ + /* low bytes */ + mmx_low_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ + "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ + mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ - /* high bytes */ - mmx_high_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm1,%%mm1\n" - "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ - "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ - "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ + /* high bytes */ + mmx_high_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm1,%%mm1\n" + "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ + "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ - "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ + "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ - mmx_low_bytes_to_words(mm4,mm5,mm0) - mmx_low_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm5\n" - mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ + mmx_low_bytes_to_words(mm4,mm5,mm0) + mmx_low_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm5\n" + mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ - mmx_high_bytes_to_words(mm4,mm1,mm0) - mmx_high_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm1\n" - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ + mmx_high_bytes_to_words(mm4,mm1,mm0) + mmx_high_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm1\n" + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ - "\tpackuswb %%mm1,%%mm5\n" + "\tpackuswb %%mm1,%%mm5\n" - "\tmovq %4,%%mm0\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm5,%%mm1\n" + "\tmovq %4,%%mm0\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm5,%%mm1\n" - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" - "\tpor %%mm3,%%mm1\n" + "\tpor %%mm3,%%mm1\n" - "\tmovd %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } + "\tmovd %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2), "m" (*rgba8_alpha_mask) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } @@ -1073,32 +977,32 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) : "%eax", "%mm0", "%mm5", "%mm6", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile ("movq %0,%%mm2\n" - "\tmovq %%mm2,%%mm1\n" - "\tpunpcklbw %%mm0,%%mm1\n" - "\tmovq %%mm3,%%mm5\n" + { + asm volatile ("movq %0,%%mm2\n" + "\tmovq %%mm2,%%mm1\n" + "\tpunpcklbw %%mm0,%%mm1\n" + "\tmovq %%mm3,%%mm5\n" - "\t" pmulwX(mm5,mm1,mm7) "\n" + "\t" pmulwX(mm5,mm1,mm7) "\n" - "\tmovq %%mm2,%%mm4\n" - "\tpunpckhbw %%mm0,%%mm4\n" - "\tmovq %%mm3,%%mm5\n" + "\tmovq %%mm2,%%mm4\n" + "\tpunpckhbw %%mm0,%%mm4\n" + "\tmovq %%mm3,%%mm5\n" - "\t" pmulwX(mm5,mm4,mm7) "\n" + "\t" pmulwX(mm5,mm4,mm7) "\n" - "\tpackuswb %%mm4,%%mm1\n" - - "\tmovq %%mm1,%1\n" - : /* empty 
*/ - : "m" (*op.A), "m" (*op.D) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - op.A += 8; - op.D += 8; - } + "\tpackuswb %%mm4,%%mm1\n" + + "\tmovq %%mm1,%1\n" + : /* empty */ + : "m" (*op.A), "m" (*op.D) + : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.D += 8; + } if (op.n_pixels) - { + { asm volatile ("movd %0,%%mm2\n" "\tmovq %%mm2,%%mm1\n" "\tpunpcklbw %%mm0,%%mm1\n" @@ -1126,62 +1030,62 @@ gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) asm volatile ("pxor %mm6, %mm6"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" - "\tpcmpeqb %%mm5,%%mm5\n" - "\tpsubb %%mm3,%%mm5\n" + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" + "\tpcmpeqb %%mm5,%%mm5\n" + "\tpsubb %%mm3,%%mm5\n" - "\tpunpcklbw %%mm6,%%mm4\n" - "\tpunpcklbw %%mm6,%%mm5\n" - "\tpmullw %%mm4,%%mm5\n" - "\tpaddw %%mm7,%%mm5\n" - "\tmovq %%mm5,%%mm1\n" - "\tpsrlw $ 8,%%mm1\n" - "\tpaddw %%mm5,%%mm1\n" - "\tpsrlw $ 8,%%mm1\n" + "\tpunpcklbw %%mm6,%%mm4\n" + "\tpunpcklbw %%mm6,%%mm5\n" + "\tpmullw %%mm4,%%mm5\n" + "\tpaddw %%mm7,%%mm5\n" + "\tmovq %%mm5,%%mm1\n" + "\tpsrlw $ 8,%%mm1\n" + "\tpaddw %%mm5,%%mm1\n" + "\tpsrlw $ 8,%%mm1\n" - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" - "\tpcmpeqb %%mm5,%%mm5\n" - "\tpsubb %%mm3,%%mm5\n" - - "\tpunpckhbw %%mm6,%%mm4\n" - "\tpunpckhbw %%mm6,%%mm5\n" - "\tpmullw %%mm4,%%mm5\n" - "\tpaddw %%mm7,%%mm5\n" - "\tmovq %%mm5,%%mm4\n" - "\tpsrlw $ 8,%%mm4\n" - "\tpaddw %%mm5,%%mm4\n" - "\tpsrlw $ 8,%%mm4\n" + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" + "\tpcmpeqb %%mm5,%%mm5\n" + "\tpsubb %%mm3,%%mm5\n" + + "\tpunpckhbw %%mm6,%%mm4\n" + "\tpunpckhbw %%mm6,%%mm5\n" + "\tpmullw %%mm4,%%mm5\n" + "\tpaddw %%mm7,%%mm5\n" + "\tmovq %%mm5,%%mm4\n" + "\tpsrlw $ 8,%%mm4\n" + "\tpaddw %%mm5,%%mm4\n" + "\tpsrlw $ 8,%%mm4\n" - "\tpackuswb %%mm4,%%mm1\n" + "\tpackuswb %%mm4,%%mm1\n" - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm1,%%mm4\n" + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm1,%%mm4\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" - "\tpor %%mm3,%%mm1\n" - - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; + "\tpor %%mm3,%%mm1\n" + + "\tmovq %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; } if (op.n_pixels) - { + { asm volatile (" movd %0,%%mm2\n" "\tmovd %1,%%mm3\n" @@ -1244,31 +1148,31 @@ gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" - "\tmovq %%mm2,%%mm4\n" - "\tpsubusb %%mm3,%%mm4\n" + "\tmovq %%mm2,%%mm4\n" + "\tpsubusb %%mm3,%%mm4\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - - "\tpand %%mm0,%%mm2\n" - "\tpor %%mm2,%%mm1\n" - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : 
"0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm3,mm2,mm4) "\n" + + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { + { asm volatile (" movd %0,%%mm2\n" "\tmovd %1,%%mm3\n" @@ -1297,20 +1201,20 @@ gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op) GimpCompositeContext op = *_op; for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - "\tmovq %%mm3,%0\n" - "\tmovq %%mm2,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - } + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + "\tmovq %%mm3,%0\n" + "\tmovq %%mm2,%1\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B) + : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + } if (op.n_pixels) - { + { asm volatile (" movd %0,%%mm2\n" "\tmovd %1,%%mm3\n" "\tmovd %%mm3,%0\n" @@ -1421,7 +1325,7 @@ gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op) : "%mm1"); for (; op.n_pixels >= 4; op.n_pixels -= 4) - { + { asm volatile (" movq %0,%%mm0\n" "\tmovq %1,%%mm1\n" @@ -1466,16 +1370,16 @@ gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op) "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ "\tmovq %%mm7,%2\n" - : /* empty */ + : /* empty */ : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; + op.A += 8; + op.B += 8; + op.D += 8; } if (op.n_pixels) - { + { asm volatile (" movd %0,%%mm0\n" "\tmovd %1,%%mm1\n" "\tmovq %3,%%mm2\n" @@ -2315,4 +2219,3 @@ gimp_composite_mmx_init (void) { } - diff --git a/app/composite/gimp-composite-regression.c b/app/composite/gimp-composite-regression.c index 8e49a25b05..b3835950cd 100644 --- a/app/composite/gimp-composite-regression.c +++ b/app/composite/gimp-composite-regression.c @@ -127,7 +127,6 @@ gimp_composite_regression_compare_contexts (char *operation, GimpCompositeContex #endif } - return (0); } diff --git a/app/composite/gimp-composite-sse-test.c b/app/composite/gimp-composite-sse-test.c index 88fba8945b..637f34a88f 100644 --- a/app/composite/gimp-composite-sse-test.c +++ b/app/composite/gimp-composite-sse-test.c @@ -57,74 +57,52 @@ gimp_composite_sse_test (int iterations, int n_pixels) } - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_multiply_rgba8_rgba8_rgba8_sse, &special_ctx); - if (gimp_composite_regression_compare_contexts ("multiply", &generic_ctx, &special_ctx)) { - return 
(1); - } - gimp_composite_regression_timer_report ("multiply", ft0, ft1); - - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_screen_rgba8_rgba8_rgba8_sse, &special_ctx); - if (gimp_composite_regression_compare_contexts ("screen", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("screen", ft0, ft1); - - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_sse, &special_ctx); - if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("difference", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_addition_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("addition", &generic_ctx, &special_ctx)) { + printf("addition failed\n"); return (1); } gimp_composite_regression_timer_report ("addition", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + 
gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_subtract_rgba8_rgba8_rgba8_sse, &special_ctx); - if (gimp_composite_regression_compare_contexts ("subtract", &generic_ctx, &special_ctx)) { + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_burn_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("burn", &generic_ctx, &special_ctx)) { + printf("burn failed\n"); return (1); } - gimp_composite_regression_timer_report ("subtract", ft0, ft1); + gimp_composite_regression_timer_report ("burn", ft0, ft1); gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_darken_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("darken", &generic_ctx, &special_ctx)) { + printf("darken failed\n"); return (1); } gimp_composite_regression_timer_report ("darken", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_lighten_rgba8_rgba8_rgba8_sse, 
&special_ctx); - if (gimp_composite_regression_compare_contexts ("lighten", &generic_ctx, &special_ctx)) { + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { + printf("difference failed\n"); return (1); } - gimp_composite_regression_timer_report ("lighten", ft0, ft1); + gimp_composite_regression_timer_report ("difference", ft0, ft1); gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIVIDE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIVIDE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_divide_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("divide", &generic_ctx, &special_ctx)) { + printf("divide failed\n"); return (1); } gimp_composite_regression_timer_report ("divide", ft0, ft1); @@ -134,24 +112,17 @@ gimp_composite_sse_test (int iterations, int n_pixels) ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_dodge_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("dodge", &generic_ctx, &special_ctx)) { + printf("dodge failed\n"); return (1); } gimp_composite_regression_timer_report ("dodge", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_BURN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_burn_rgba8_rgba8_rgba8_sse, &special_ctx); - if (gimp_composite_regression_compare_contexts ("burn", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("burn", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, 
gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("grain_extract", &generic_ctx, &special_ctx)) { + printf("grain_extract failed\n"); return (1); } gimp_composite_regression_timer_report ("grain_extract", ft0, ft1); @@ -161,27 +132,70 @@ gimp_composite_sse_test (int iterations, int n_pixels) ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("grain_merge", &generic_ctx, &special_ctx)) { + printf("grain_merge failed\n"); return (1); } gimp_composite_regression_timer_report ("grain_merge", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_swap_rgba8_rgba8_rgba8_sse, &special_ctx); - if (gimp_composite_regression_compare_contexts ("swap", &generic_ctx, &special_ctx)) { + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_lighten_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("lighten", &generic_ctx, &special_ctx)) { + printf("lighten failed\n"); return (1); } - gimp_composite_regression_timer_report ("swap", ft0, ft1); + gimp_composite_regression_timer_report ("lighten", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_MULTIPLY, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_multiply_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("multiply", 
&generic_ctx, &special_ctx)) { + printf("multiply failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("multiply", ft0, ft1); gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SCALE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SCALE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_scale_rgba8_rgba8_rgba8_sse, &special_ctx); if (gimp_composite_regression_compare_contexts ("scale", &generic_ctx, &special_ctx)) { + printf("scale failed\n"); return (1); } gimp_composite_regression_timer_report ("scale", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SCREEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_screen_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("screen", &generic_ctx, &special_ctx)) { + printf("screen failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("screen", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_subtract_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("subtract", &generic_ctx, &special_ctx)) { + printf("subtract failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("subtract", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char 
*) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_swap_rgba8_rgba8_rgba8_sse, &special_ctx); + if (gimp_composite_regression_compare_contexts ("swap", &generic_ctx, &special_ctx)) { + printf("swap failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("swap", ft0, ft1); #endif return (0); } @@ -197,7 +211,7 @@ main (int argc, char *argv[]) putenv ("GIMP_COMPOSITE=0x1"); iterations = 1; - n_pixels = 1048577; + n_pixels = 163921; argv++, argc--; while (argc >= 2) { diff --git a/app/composite/gimp-composite-sse.c b/app/composite/gimp-composite-sse.c index 2633bfed1b..a95eacb0b1 100644 --- a/app/composite/gimp-composite-sse.c +++ b/app/composite/gimp-composite-sse.c @@ -1,7 +1,9 @@ -/* The GIMP -- an image manipulation program +/* -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*- + * + * The GIMP -- an image manipulation program * Copyright (C) 1995 Spencer Kimball and Peter Mattis * - * -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*- + * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,17 +44,15 @@ #include "gimp-composite.h" #include "gimp-composite-sse.h" +#include "gimp-composite-x86.h" #define pminub(src,dst,tmp) "pminub " "%%" #src ", %%" #dst #define pmaxub(src,dst,tmp) "pmaxub " "%%" #src ", %%" #dst - -/* - * Clobbers eax, ecx edx - */ /* * Double-word divide. Adjusted for subsequent unsigned packing * (high-order bit of each word is cleared) + * Clobbers eax, ecx edx */ #define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \ "movd %%" #divisor ",%%ecx; " \ @@ -137,15 +137,6 @@ "\tpaddw %%"#a", %%"#b"; " \ "\tpsrlw $8, %%"#b"\n" -#define mmx_low_bytes_to_words(src,dst,zero) \ - "\tmovq %%"#src", %%"#dst"; " \ - "\tpunpcklbw %%"#zero", %%"#dst"\n" - -#define mmx_high_bytes_to_words(src,dst,zero) \ - "\tmovq %%"#src", %%"#dst"; " \ - "\tpunpckhbw %%"#zero", %%"#dst"\n" - - const static guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 }; const static guint32 rgba8_b1_64[2] = { 0x01010101, 0x01010101 }; const static guint32 rgba8_b255_64[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; @@ -174,155 +165,156 @@ gimp_composite_addition_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) : "%mm0"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tpaddusb %%mm3, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpaddusb %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" 
- "\tpaddusb %%mm3, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - } - + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpaddusb %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + } + asm("emms"); } + void gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm (" movq %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - - "\t" pdivwqX(mm4,mm5,mm7) "\n" - - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpckhbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm6) "\n" - - "\tmovq %5,%%mm4\n" - "\tmovq %%mm4,%%mm5\n" - "\tpsubusw %%mm6,%%mm4\n" - "\tpsubusw %%mm7,%%mm5\n" - - "\tpackuswb %%mm4,%%mm5\n" - - "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ - - "\tmovq %6,%%mm7\n" - "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ - - "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ - "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ - - "\tmovq %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm (" movq %1,%%mm0\n" + "\tmovq %2,%%mm1\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + + "\t" pdivwqX(mm4,mm5,mm7) "\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpckhbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm6) "\n" + + "\tmovq %5,%%mm4\n" + "\tmovq %%mm4,%%mm5\n" + "\tpsubusw %%mm6,%%mm4\n" + "\tpsubusw %%mm7,%%mm5\n" + + "\tpackuswb %%mm4,%%mm5\n" + + "\tpminub %%mm0,%%mm1\n" /* mm1 = min(mm0,mm1) clobber mm3 */ + + "\tmovq %6,%%mm7\n" + "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ + + "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ + "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ + + "\tmovq %%mm7,%0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) + : "0", 
"1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0,%%mm0\n" - "\tmovd %1,%%mm1\n" - - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - - "\t" pdivwqX(mm4,mm5,mm7) "\n" - - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpckhbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm6) "\n" - - "\tmovq %5,%%mm4\n" - "\tmovq %%mm4,%%mm5\n" - "\tpsubusw %%mm6,%%mm4\n" - "\tpsubusw %%mm7,%%mm5\n" - - "\tpackuswb %%mm4,%%mm5\n" - - "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ - - "\tmovq %6,%%mm7\n" - "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ - - "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ - "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ - - "\tmovd %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - } - + { + asm volatile (" movd %1,%%mm0\n" + "\tmovd %2,%%mm1\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + + "\t" pdivwqX(mm4,mm5,mm7) "\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpckhbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm6) "\n" + + "\tmovq %5,%%mm4\n" + "\tmovq %%mm4,%%mm5\n" + "\tpsubusw %%mm6,%%mm4\n" + "\tpsubusw %%mm7,%%mm5\n" + + "\tpackuswb %%mm4,%%mm5\n" + + "\tpminub %%mm0,%%mm1\n" /* mm1 = min(mm0,mm1) clobber mm3 */ + + "\tmovq %6,%%mm7\n" + "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ + + "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ + "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ + + "\tmovd %%mm7,%0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + } + asm("emms"); } @@ -331,32 +323,31 @@ void gimp_composite_darken_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tmovq %%mm2, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %1, %%mm2\n" + "\tpminub %2, %%mm2\n" + "\tmovq %%mm2, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if 
(op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tmovd %%mm2, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm2", "%mm3", "%mm4"); - } - + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + "\tpminub %%mm3, %%mm2\n" + "\tmovd %%mm2, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm2", "%mm3", "%mm4"); + } + asm("emms"); } @@ -364,52 +355,52 @@ void gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); - + + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tmovq %%mm3, %%mm5\n" - "\tpsubusb %%mm3, %%mm4\n" - "\tpsubusb %%mm2, %%mm5\n" - "\tpaddb %%mm5, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\tpminub %%mm3, %%mm2\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + { + asm volatile (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tmovq %%mm3, %%mm5\n" + "\tpsubusb %%mm3, %%mm4\n" + "\tpsubusb %%mm2, %%mm5\n" + "\tpaddb %%mm5, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tmovq %%mm3, %%mm5\n" - "\tpsubusb %%mm3, %%mm4\n" - "\tpsubusb %%mm2, %%mm5\n" - "\tpaddb %%mm5, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\tpminub %%mm3, %%mm2\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } - + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tmovq %%mm3, %%mm5\n" + "\tpsubusb %%mm3, %%mm4\n" + "\tpsubusb %%mm2, %%mm5\n" + "\tpaddb %%mm5, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "%mm1", "%mm2", "%mm3", "%mm4"); + } + asm("emms"); } @@ -417,99 +408,99 @@ void gimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - - asm volatile ("movq %0, %%mm0\n" - "\tmovq %1, %%mm7\n" - : - : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64) - : "%mm0", "%mm7"); + + asm volatile (" movq %0, %%mm0\n" + "\tmovq %1, %%mm7\n" + : + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64) + : "%mm0", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - - "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ - - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw 
%%mm0,%%mm2\n" /* mm2 = A*256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm6,%%mm6\n" - "\tpunpckhbw %%mm6,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - - "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ - - "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ - - "\t" pminub(mm0,mm1,mm3) "\n" - "\tmovq %3,%%mm3\n" - "\tmovq %%mm3,%%mm2\n" - - "\tpandn %%mm5,%%mm3\n" - - "\tpand %%mm2,%%mm1\n" - "\tpor %%mm1,%%mm3\n" - - "\tmovq %%mm3,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask_64) - : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + + "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ + + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm6,%%mm6\n" + "\tpunpckhbw %%mm6,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + + "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ + + "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ + + "\tpminub %%mm0,%%mm1\n" + "\tmovq %3,%%mm3\n" + "\tmovq %%mm3,%%mm2\n" + + "\tpandn %%mm5,%%mm3\n" + + "\tpand %%mm2,%%mm1\n" + "\tpor %%mm1,%%mm3\n" + + "\tmovq %%mm3,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask_64) + : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0,%%mm0\n" - "\tmovd %1,%%mm1\n" - - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - - "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ - - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm6,%%mm6\n" - "\tpunpckhbw %%mm6,%%mm3\n" - "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ - - "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ - - "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ - - "\t" pminub(mm0,mm1,mm3) "\n" - "\tmovq %3,%%mm3\n" - "\tmovq %%mm3,%%mm2\n" - - "\tpandn %%mm5,%%mm3\n" - - "\tpand %%mm2,%%mm1\n" - "\tpor %%mm1,%%mm3\n" - - "\tmovd %%mm3,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask_64) - : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } - + { + asm volatile (" movd %0,%%mm0\n" + "\tmovd %1,%%mm1\n" + + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + + "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */ + + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm6,%%mm6\n" + "\tpunpckhbw %%mm6,%%mm3\n" + "\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */ + + "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */ + + "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */ + + "\tpminub %%mm0,%%mm1\n" + "\tmovq %3,%%mm3\n" + "\tmovq %%mm3,%%mm2\n" + + "\tpandn %%mm5,%%mm3\n" + + 
"\tpand %%mm2,%%mm1\n" + "\tpor %%mm1,%%mm3\n" + + "\tmovd %%mm3,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask_64) + : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + asm("emms"); } @@ -517,165 +508,170 @@ void gimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm2,%%mm3\n" - "\tpunpcklbw %%mm0,%%mm2\n" - - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" - - "\t" pdivwuqX(mm2,mm4,mm5) "\n" - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm2,%%mm3\n" - "\tpunpckhbw %%mm0,%%mm2\n" - - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" - - "\t" pdivwuqX(mm2,mm4,mm6) "\n" - - "\tpackuswb %%mm6,%%mm5\n" - - "\tmovq %4,%%mm6\n" - "\tmovq %%mm1,%%mm7\n" - "\t" pminub(mm0,mm7,mm2) "\n" - "\tpand %%mm6,%%mm7\n" - "\tpandn %%mm5,%%mm6\n" - - "\tpor %%mm6,%%mm7\n" - - "\tmovq %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm2,%%mm3\n" + "\tpunpcklbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm5) "\n" + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm2,%%mm3\n" + "\tpunpckhbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm6) "\n" + + "\tpackuswb %%mm6,%%mm5\n" + + "\tmovq %4,%%mm6\n" + "\tmovq %%mm1,%%mm7\n" + "\t" pminub(mm0,mm7,mm2) "\n" + "\tpand %%mm6,%%mm7\n" + "\tpandn %%mm5,%%mm6\n" + + "\tpor %%mm6,%%mm7\n" + + "\tmovq %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0,%%mm0\n" - "\tmovq %1,%%mm1\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpcklbw %%mm2,%%mm3\n" - "\tpunpcklbw %%mm0,%%mm2\n" - - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" - - "\t" pdivwuqX(mm2,mm4,mm5) "\n" - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm2,%%mm2\n" - "\tpunpckhbw %%mm2,%%mm3\n" - "\tpunpckhbw %%mm0,%%mm2\n" - - "\tmovq %3,%%mm4\n" - "\tpsubw %%mm3,%%mm4\n" - - "\t" pdivwuqX(mm2,mm4,mm6) "\n" - - "\tpackuswb %%mm6,%%mm5\n" - - "\tmovq %4,%%mm6\n" - "\tmovq %%mm1,%%mm7\n" - "\t" pminub(mm0,mm7,mm2) "\n" - "\tpand %%mm6,%%mm7\n" - "\tpandn %%mm5,%%mm6\n" - - "\tpor %%mm6,%%mm7\n" - - "\tmovd %%mm7,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } - + { + asm volatile (" movd %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm2,%%mm3\n" + "\tpunpcklbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm5) "\n" + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm2,%%mm3\n" + "\tpunpckhbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" 
+ + "\t" pdivwuqX(mm2,mm4,mm6) "\n" + + "\tpackuswb %%mm6,%%mm5\n" + + "\tmovq %4,%%mm6\n" + "\tmovq %%mm1,%%mm7\n" + "\tpminub %%mm0,%%mm7\n" + "\tpand %%mm6,%%mm7\n" + "\tpandn %%mm5,%%mm6\n" + + "\tpor %%mm6,%%mm7\n" + + "\tmovd %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + asm("emms"); } void gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { - GimpCompositeContext op = *_op; - - asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); - asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); - asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); + GimpCompositeContext op = *_op; + + asm volatile (" movq %0,%%mm0\n" + "\tpxor %%mm6,%%mm6\n" + "\tmovq %1,%%mm7\n" + : /* empty */ + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64) + : "%mm0", "%mm6", "%mm7"); + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - "\tpsubw %%mm5,%%mm4\n" - "\tpaddw %%mm7,%%mm4\n" - "\tmovq %%mm4,%%mm1\n" - - mmx_high_bytes_to_words(mm2,mm4,mm6) - mmx_high_bytes_to_words(mm3,mm5,mm6) - - "\tpsubw %%mm5,%%mm4\n" - "\tpaddw %%mm7,%%mm4\n" - - "\tpackuswb %%mm4,%%mm1\n" - "\tmovq %%mm1,%%mm4\n" - - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0,%%mm2\n" - - "\tpor %%mm2,%%mm1\n" - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %1,%%mm2\n" + "\tmovq %2,%%mm3\n" + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + "\tpsubw %%mm5,%%mm4\n" + "\tpaddw %%mm7,%%mm4\n" + "\tmovq %%mm4,%%mm1\n" + + mmx_high_bytes_to_words(mm2,mm4,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) + + "\tpsubw %%mm5,%%mm4\n" + "\tpaddw %%mm7,%%mm4\n" + + "\tpackuswb %%mm4,%%mm1\n" + "\tmovq %%mm1,%%mm4\n" + + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\tpminub %%mm3,%%mm2\n" + "\tpand %%mm0,%%mm2\n" + + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - - "\tpsubw %%mm5, %%mm4\n" - "\tpaddw %%mm7, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" - - "\tpackuswb %%mm6, %%mm1\n" - - "\tmovq %%mm1, %%mm4\n" - - "\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" - - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } - + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + + "\tpsubw %%mm5, %%mm4\n" + "\tpaddw %%mm7, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + + "\tpackuswb %%mm6, %%mm1\n" + + "\tmovq %%mm1, %%mm4\n" + + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + } + 
asm("emms"); } @@ -683,73 +679,74 @@ void gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile ("movq %0, %%mm0\n" "pxor %%mm6, %%mm6\n" "movq %1, %%mm7\n" : /* empty */ : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64) : "%mm0", "%mm6", "%mm7"); - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - "\tpaddw %%mm5, %%mm4\n" - "\tpsubw %%mm7, %%mm4\n" - - mmx_high_bytes_to_words(mm2,mm1,mm6) - mmx_high_bytes_to_words(mm3,mm5,mm6) - "\tpaddw %%mm5, %%mm1\n" - "\tpsubw %%mm7, %%mm1\n" - - "\tpackuswb %%mm1, %%mm4\n" - - "\t" pminub(mm3,mm2,mm5) "\n" - "\tpand %%mm0, %%mm2\n" - - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" + + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + "\tpaddw %%mm5, %%mm4\n" + "\tpsubw %%mm7, %%mm4\n" + + mmx_high_bytes_to_words(mm2,mm1,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) + "\tpaddw %%mm5, %%mm1\n" + "\tpsubw %%mm7, %%mm1\n" + + "\tpackuswb %%mm1, %%mm4\n" + + "\t" pminub(mm3,mm2,mm5) "\n" + "\tpand %%mm0, %%mm2\n" + + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpor %%mm2, %%mm1\n" + "\tmovq %%mm1, %0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - - mmx_low_bytes_to_words(mm2,mm4,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - - "\tpaddw %%mm5, %%mm4\n" - "\tpsubw %%mm7, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" - "\tpackuswb %%mm6, %%mm1\n" - - "\tmovq %%mm1, %%mm4\n" - - "\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" - - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } - + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + + "\tpaddw %%mm5, %%mm4\n" + "\tpsubw %%mm7, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + "\tpackuswb %%mm6, %%mm1\n" + + "\tmovq %%mm1, %%mm4\n" + + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } + asm("emms"); } @@ -757,49 +754,49 @@ void gimp_composite_lighten_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\t" pmaxub(mm3,mm4,mm5) "\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0, %%mm3\n" - "\tpor %%mm3, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + 
asm volatile (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpmaxub %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm2, %%mm3\n" + "\tpand %%mm0, %%mm3\n" + "\tpor %%mm3, %%mm1\n" + "\tmovq %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\t" pmaxub(mm3,mm4,mm5) "\n" - - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - - "\tpand %%mm0, %%mm3\n" - "\tpor %%mm3, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } - + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpmaxub %%mm3, %%mm4\n" + + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + + "\tpminub %%mm2,%%mm3\n" + + "\tpand %%mm0, %%mm3\n" + "\tpor %%mm3, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + asm("emms"); } @@ -807,66 +804,66 @@ void gimp_composite_multiply_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); asm volatile ("pxor %%mm6,%%mm6" : : : "%mm6"); - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - - mmx_low_bytes_to_words(mm2,mm1,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - mmx_int_mult(mm5,mm1,mm7) - - mmx_high_bytes_to_words(mm2,mm4,mm6) - mmx_high_bytes_to_words(mm3,mm5,mm6) - mmx_int_mult(mm5,mm4,mm7) - - "\tpackuswb %%mm4, %%mm1\n" - - "\tmovq %%mm0, %%mm4\n" - "\tpandn %%mm1, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %0, %%mm2\n" + "\tmovq %1, %%mm3\n" + + mmx_low_bytes_to_words(mm2,mm1,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + mmx_int_mult(mm5,mm1,mm7) + + mmx_high_bytes_to_words(mm2,mm4,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) + mmx_int_mult(mm5,mm4,mm7) + + "\tpackuswb %%mm4, %%mm1\n" + + "\tmovq %%mm0, %%mm4\n" + "\tpandn %%mm1, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + "\t" pminub(mm3,mm2,mm4) "\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + + "\tmovq %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" - - mmx_low_bytes_to_words(mm2,mm1,mm6) - mmx_low_bytes_to_words(mm3,mm5,mm6) - pmulwX(mm5,mm1,mm7) - - "\tpackuswb %%mm6, %%mm1\n" - - "\tmovq %%mm0, %%mm4\n" - "\tpandn %%mm1, %%mm4\n" - "\tmovq %%mm4, %%mm1\n" - "\t" pminub(mm3,mm2,mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } - + { + asm volatile (" movd %0, %%mm2\n" + "\tmovd %1, %%mm3\n" + + mmx_low_bytes_to_words(mm2,mm1,mm6) + 
mmx_low_bytes_to_words(mm3,mm5,mm6) + pmulwX(mm5,mm1,mm7) + + "\tpackuswb %%mm6, %%mm1\n" + + "\tmovq %%mm0, %%mm4\n" + "\tpandn %%mm1, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + "\t" pminub(mm3,mm2,mm4) "\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + + "\tmovd %%mm1, %2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + asm("emms"); } @@ -882,7 +879,7 @@ sse_op_overlay(void) "\tmovq %0,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ - + /* high bytes */ mmx_high_bytes_to_words(mm3,mm5,mm0) "\tpcmpeqb %%mm1,%%mm1\n" @@ -891,9 +888,9 @@ sse_op_overlay(void) "\tmovq %0,%%mm6\n" /* mm6 = words of value 2 */ "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ - + "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ - + mmx_low_bytes_to_words(mm4,mm5,mm0) mmx_low_bytes_to_words(mm2,mm6,mm0) "\tpaddw %%mm6,%%mm5\n" @@ -909,12 +906,12 @@ sse_op_overlay(void) "\tmovq %1,%%mm0\n" "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm5,%%mm1\n" - + "\t" pminub(mm2,mm3,mm4) "\n" "\tpand %%mm0,%%mm3\n" - + "\tpor %%mm3,%%mm1\n" - + : /* empty */ : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) ); @@ -924,119 +921,116 @@ void xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile ("pxor %%mm0,%%mm0\n" "movq %0,%%mm7" : /* empty */ - : "m" (*rgba8_w128_64) : "%mm0"); - + : "m" (*rgba8_w128_64) : "%mm0"); + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + + /* low bytes */ + mmx_low_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ + "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ + mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ + + /* high bytes */ + mmx_high_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm1,%%mm1\n" + "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ + "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ + "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ + mmx_low_bytes_to_words(mm4,mm5,mm0) + mmx_low_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm5\n" + mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ + mmx_high_bytes_to_words(mm4,mm1,mm0) + mmx_high_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm1\n" + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ - /* low bytes */ - mmx_low_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ - "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ - "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ - mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ - - /* high bytes */ - mmx_high_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm1,%%mm1\n" - "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ - "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ - "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw 
%%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ - - "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ - - mmx_low_bytes_to_words(mm4,mm5,mm0) - mmx_low_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm5\n" - mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ - - mmx_high_bytes_to_words(mm4,mm1,mm0) - mmx_high_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm1\n" - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ - - "\tpackuswb %%mm1,%%mm5\n" - - "\tmovq %4,%%mm0\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm5,%%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" - - "\tpor %%mm3,%%mm1\n" - - "\tmovq %%mm1,%2\n" - : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D) - : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - op.D += 8; - } + "\tpackuswb %%mm1,%%mm5\n" + + "\tmovq %4,%%mm0\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm5,%%mm1\n" + + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" + + "\tpor %%mm3,%%mm1\n" + + "\tmovq %%mm1,%2\n" + : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D) + : "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" + { + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" + + /* low bytes */ + mmx_low_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ + "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ + mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ + + /* high bytes */ + mmx_high_bytes_to_words(mm3,mm5,mm0) + "\tpcmpeqb %%mm1,%%mm1\n" + "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ + "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ + "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ + "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ + + "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ + + mmx_low_bytes_to_words(mm4,mm5,mm0) + mmx_low_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm5\n" + mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ + + mmx_high_bytes_to_words(mm4,mm1,mm0) + mmx_high_bytes_to_words(mm2,mm6,mm0) + "\tpaddw %%mm6,%%mm1\n" + mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ + + "\tpackuswb %%mm1,%%mm5\n" + + "\tmovq %4,%%mm0\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm5,%%mm1\n" + + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" + + "\tpor %%mm3,%%mm1\n" + + "\tmovd %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } - /* low bytes */ - mmx_low_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */ - "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */ - "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */ - mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */ - - /* high bytes */ - mmx_high_bytes_to_words(mm3,mm5,mm0) - "\tpcmpeqb %%mm1,%%mm1\n" - "\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */ - "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */ 
- "\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */ - "\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */ - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */ - - "\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */ - - mmx_low_bytes_to_words(mm4,mm5,mm0) - mmx_low_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm5\n" - mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */ - - mmx_high_bytes_to_words(mm4,mm1,mm0) - mmx_high_bytes_to_words(mm2,mm6,mm0) - "\tpaddw %%mm6,%%mm1\n" - mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */ - - "\tpackuswb %%mm1,%%mm5\n" - - "\tmovq %4,%%mm0\n" - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm5,%%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" - - "\tpor %%mm3,%%mm1\n" - - "\tmovd %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } - asm("emms"); } @@ -1058,48 +1052,48 @@ gimp_composite_scale_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) "\tmovq %1,%%mm7\n" : /* empty */ : "m" (op.scale.scale), "m" (*rgba8_w128_64) - : "%eax", "%mm0", "%mm5", "%mm6", "%mm7"); + : "%eax", "%ebx", "%mm0", "%mm3", "%mm5", "%mm6", "%mm7"); for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile ("movq %0,%%mm2\n" - "\tmovq %%mm2,%%mm1\n" - "\tpunpcklbw %%mm0,%%mm1\n" - "\tmovq %%mm3,%%mm5\n" - - "\t" pmulwX(mm5,mm1,mm7) "\n" - - "\tmovq %%mm2,%%mm4\n" - "\tpunpckhbw %%mm0,%%mm4\n" - "\tmovq %%mm3,%%mm5\n" - - "\t" pmulwX(mm5,mm4,mm7) "\n" - - "\tpackuswb %%mm4,%%mm1\n" - - "\tmovq %%mm1,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.D) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - op.A += 8; - op.D += 8; - } - + { + asm volatile ("movq %1,%%mm2\n" + "\tmovq %%mm2,%%mm1\n" + "\tpunpcklbw %%mm0,%%mm1\n" + "\tmovq %%mm3,%%mm5\n" + + "\t" pmulwX(mm5,mm1,mm7) "\n" + + "\tmovq %%mm2,%%mm4\n" + "\tpunpckhbw %%mm0,%%mm4\n" + "\tmovq %%mm3,%%mm5\n" + + "\t" pmulwX(mm5,mm4,mm7) "\n" + + "\tpackuswb %%mm4,%%mm1\n" + + "\tmovq %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A) + : "%mm1", "%mm2", "%mm4", "%mm5", "%mm7"); + op.A += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile ("movd %0,%%mm2\n" - "\tmovq %%mm2,%%mm1\n" - "\tpunpcklbw %%mm0,%%mm1\n" - "\tmovq %%mm3,%%mm5\n" + { + asm volatile (" movd %1,%%mm2\n" + "\tmovq %%mm2,%%mm1\n" + "\tpunpcklbw %%mm0,%%mm1\n" + "\tmovq %%mm3,%%mm5\n" - "\t" pmulwX(mm5,mm1,mm7) "\n" + "\t" pmulwX(mm5,mm1,mm7) "\n" - "\tpackuswb %%mm0,%%mm1\n" - "\tmovd %%mm1,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.D) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - } + "\tpackuswb %%mm0,%%mm1\n" + "\tmovd %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A) + : "%mm1", "%mm2", "%mm4", "%mm5", "%mm6", "%mm7"); + } asm("emms"); } @@ -1108,118 +1102,118 @@ void gimp_composite_screen_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7"); asm volatile ("pxor %mm6, %mm6"); - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" - "\tpcmpeqb %%mm5,%%mm5\n" - "\tpsubb %%mm3,%%mm5\n" - - "\tpunpcklbw %%mm6,%%mm4\n" - "\tpunpcklbw %%mm6,%%mm5\n" - "\tpmullw %%mm4,%%mm5\n" - "\tpaddw %%mm7,%%mm5\n" - "\tmovq %%mm5,%%mm1\n" - "\tpsrlw $ 8,%%mm1\n" - "\tpaddw 
%%mm5,%%mm1\n" - "\tpsrlw $ 8,%%mm1\n" - - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" - "\tpcmpeqb %%mm5,%%mm5\n" - "\tpsubb %%mm3,%%mm5\n" - - "\tpunpckhbw %%mm6,%%mm4\n" - "\tpunpckhbw %%mm6,%%mm5\n" - "\tpmullw %%mm4,%%mm5\n" - "\tpaddw %%mm7,%%mm5\n" - "\tmovq %%mm5,%%mm4\n" - "\tpsrlw $ 8,%%mm4\n" - "\tpaddw %%mm5,%%mm4\n" - "\tpsrlw $ 8,%%mm4\n" - - "\tpackuswb %%mm4,%%mm1\n" - - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm1,%%mm4\n" - - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" - - "\tpor %%mm3,%%mm1\n" - - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" + "\tpcmpeqb %%mm5,%%mm5\n" + "\tpsubb %%mm3,%%mm5\n" + + "\tpunpcklbw %%mm6,%%mm4\n" + "\tpunpcklbw %%mm6,%%mm5\n" + "\tpmullw %%mm4,%%mm5\n" + "\tpaddw %%mm7,%%mm5\n" + "\tmovq %%mm5,%%mm1\n" + "\tpsrlw $ 8,%%mm1\n" + "\tpaddw %%mm5,%%mm1\n" + "\tpsrlw $ 8,%%mm1\n" + + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" + "\tpcmpeqb %%mm5,%%mm5\n" + "\tpsubb %%mm3,%%mm5\n" + + "\tpunpckhbw %%mm6,%%mm4\n" + "\tpunpckhbw %%mm6,%%mm5\n" + "\tpmullw %%mm4,%%mm5\n" + "\tpaddw %%mm7,%%mm5\n" + "\tmovq %%mm5,%%mm4\n" + "\tpsrlw $ 8,%%mm4\n" + "\tpaddw %%mm5,%%mm4\n" + "\tpsrlw $ 8,%%mm4\n" + + "\tpackuswb %%mm4,%%mm1\n" + + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm1,%%mm4\n" + + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" + + "\tpor %%mm3,%%mm1\n" + + "\tmovq %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + op.A += 8; + op.B += 8; + op.D += 8; + } if (op.n_pixels) - { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" - - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" - "\tpcmpeqb %%mm5,%%mm5\n" - "\tpsubb %%mm3,%%mm5\n" - - "\tpunpcklbw %%mm6,%%mm4\n" - "\tpunpcklbw %%mm6,%%mm5\n" - "\tpmullw %%mm4,%%mm5\n" - "\tpaddw %%mm7,%%mm5\n" - "\tmovq %%mm5,%%mm1\n" - "\tpsrlw $ 8,%%mm1\n" - "\tpaddw %%mm5,%%mm1\n" - "\tpsrlw $ 8,%%mm1\n" - - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm2,%%mm4\n" - "\tpcmpeqb %%mm5,%%mm5\n" - "\tpsubb %%mm3,%%mm5\n" - - "\tpunpckhbw %%mm6,%%mm4\n" - "\tpunpckhbw %%mm6,%%mm5\n" - "\tpmullw %%mm4,%%mm5\n" - "\tpaddw %%mm7,%%mm5\n" - "\tmovq %%mm5,%%mm4\n" - "\tpsrlw $ 8,%%mm4\n" - "\tpaddw %%mm5,%%mm4\n" - "\tpsrlw $ 8,%%mm4\n" - - "\tpackuswb %%mm4,%%mm1\n" - - "\tpcmpeqb %%mm4,%%mm4\n" - "\tpsubb %%mm1,%%mm4\n" - - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm2,mm3,mm4) "\n" - "\tpand %%mm0,%%mm3\n" - - "\tpor %%mm3,%%mm1\n" - - "\tmovd %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } + { + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" + "\tpcmpeqb %%mm5,%%mm5\n" + "\tpsubb %%mm3,%%mm5\n" + + "\tpunpcklbw %%mm6,%%mm4\n" + "\tpunpcklbw %%mm6,%%mm5\n" + "\tpmullw %%mm4,%%mm5\n" + "\tpaddw %%mm7,%%mm5\n" + "\tmovq %%mm5,%%mm1\n" + "\tpsrlw $ 8,%%mm1\n" + "\tpaddw %%mm5,%%mm1\n" + "\tpsrlw $ 8,%%mm1\n" + + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm2,%%mm4\n" + "\tpcmpeqb %%mm5,%%mm5\n" + "\tpsubb %%mm3,%%mm5\n" + + "\tpunpckhbw %%mm6,%%mm4\n" + "\tpunpckhbw %%mm6,%%mm5\n" + 
"\tpmullw %%mm4,%%mm5\n" + "\tpaddw %%mm7,%%mm5\n" + "\tmovq %%mm5,%%mm4\n" + "\tpsrlw $ 8,%%mm4\n" + "\tpaddw %%mm5,%%mm4\n" + "\tpsrlw $ 8,%%mm4\n" + + "\tpackuswb %%mm4,%%mm1\n" + + "\tpcmpeqb %%mm4,%%mm4\n" + "\tpsubb %%mm1,%%mm4\n" + + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\t" pminub(mm2,mm3,mm4) "\n" + "\tpand %%mm0,%%mm3\n" + + "\tpor %%mm3,%%mm1\n" + + "\tmovd %%mm1,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + asm("emms"); } @@ -1228,54 +1222,46 @@ void gimp_composite_subtract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - - "\tmovq %%mm2,%%mm4\n" - "\tpsubusb %%mm3,%%mm4\n" - - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - - "\tpand %%mm0,%%mm2\n" - "\tpor %%mm2,%%mm1\n" - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - op.A += 8; - op.B += 8; - op.D += 8; - } - + { + asm volatile (" movq %1,%%mm2\n" + "\tmovq %2,%%mm3\n" + "\tmovq %%mm2,%%mm4\n" + "\tpsubusb %%mm3,%%mm4\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + "\tpminub %%mm3,%%mm2\n" + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" - - "\tmovq %%mm2,%%mm4\n" - "\tpsubusb %%mm3,%%mm4\n" - - "\tmovq %%mm0,%%mm1\n" - "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - - "\tpand %%mm0,%%mm2\n" - "\tpor %%mm2,%%mm1\n" - "\tmovd %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } - + { + asm volatile (" movd %1,%%mm2\n" + "\tmovd %2,%%mm3\n" + "\tmovq %%mm2,%%mm4\n" + "\tpsubusb %%mm3,%%mm4\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + "\tpminub %%mm3,%%mm2\n" + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovd %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + } + asm("emms"); } @@ -1283,30 +1269,30 @@ void gimp_composite_swap_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + for (; op.n_pixels >= 2; op.n_pixels -= 2) - { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - "\tmovq %%mm3,%0\n" - "\tmovq %%mm2,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); - op.A += 8; - op.B += 8; - } - + { + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + "\tmovq %%mm3,%0\n" + "\tmovq %%mm2,%1\n" + : "+m" (*op.A), "+m" (*op.B) + : + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + } + if (op.n_pixels) - { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" - "\tmovd %%mm3,%0\n" - "\tmovd %%mm2,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); - } + { + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" + "\tmovd %%mm3,%0\n" + "\tmovd %%mm2,%1\n" + : "+m" (*op.A), "+m" (*op.B) + : /* empty */ + : "%mm1", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } @@ -1320,20 +1306,20 @@ void xxxgimp_composite_addition_va8_va8_va8_sse (GimpCompositeContext 
*_op) { GimpCompositeContext op = *_op; - + asm("pushl %edi"); asm("pushl %ebx"); asm("movl 12(%esp), %edi"); asm("movq v8_alpha_mask, %mm0"); - + asm("subl $ 4, %ecx"); asm("jl .add_pixels_1a_1a_last3"); asm("movl $ 8, %ebx"); asm(".add_pixels_1a_1a_loop:"); - + asm("movq (%eax), %mm2"); asm("movq (%edx), %mm3"); - + asm("movq %mm2, %mm4"); asm("paddusb %mm3, %mm4"); asm("movq %mm0, %mm1"); @@ -1349,13 +1335,13 @@ xxxgimp_composite_addition_va8_va8_va8_sse (GimpCompositeContext *_op) asm("addl %ebx, %edi"); asm("subl $ 4, %ecx"); asm("jge .add_pixels_1a_1a_loop"); - + asm(".add_pixels_1a_1a_last3:"); asm("test $ 2, %ecx"); asm("jz .add_pixels_1a_1a_last1"); asm("movd (%eax), %mm2"); asm("movd (%edx), %mm3"); - + asm("movq %mm2, %mm4"); asm("paddusb %mm3, %mm4"); asm("movq %mm0, %mm1"); @@ -1368,7 +1354,7 @@ xxxgimp_composite_addition_va8_va8_va8_sse (GimpCompositeContext *_op) asm("addl $ 4, %eax"); asm("addl $ 4, %edx"); asm("addl $ 4, %edi"); - + asm(".add_pixels_1a_1a_last1:"); asm("test $ 1, %ecx"); asm("jz .add_pixels_1a_1a_end"); @@ -1408,107 +1394,107 @@ xxxgimp_composite_burn_va8_va8_va8_sse (GimpCompositeContext *_op) : "%mm1"); for (; op.n_pixels >= 4; op.n_pixels -= 4) - { - asm volatile (" movq (%0),%%mm0; addl $8,%0\n" - "\tmovq (%1),%%mm1; addl $8,%1\n" + { + asm volatile (" movq (%0),%%mm0; addl $8,%0\n" + "\tmovq (%1),%%mm1; addl $8,%1\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm7) "\n" - "\t" pdivwqX(mm4,mm5,mm7) "\n" + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpckhbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm6) "\n" - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpckhbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm6) "\n" - - "\tmovq %5,%%mm4\n" - "\tmovq %%mm4,%%mm5\n" - "\tpsubusw %%mm6,%%mm4\n" - "\tpsubusw %%mm7,%%mm5\n" + "\tmovq %5,%%mm4\n" + "\tmovq %%mm4,%%mm5\n" + "\tpsubusw %%mm6,%%mm4\n" + "\tpsubusw %%mm7,%%mm5\n" - "\tpackuswb %%mm4,%%mm5\n" + "\tpackuswb %%mm4,%%mm5\n" - "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ - - "\tmovq %6,%%mm7\n" - "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ - - "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ - "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ - - "\tmovq %%mm7,(%2); addl $8,%2\n" - : "+r" (op.A), "+r" (op.B), "+r" (op.D) - : "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); - } + "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ + + "\tmovq %6,%%mm7\n" + "\tpand %%mm7,%%mm1\n" /* mm1 = 
mm7 & alpha_mask */ + + "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ + "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ + + "\tmovq %%mm7,(%2); addl $8,%2\n" + : "+r" (op.A), "+r" (op.B), "+r" (op.D) + : "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) + : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4"); + } if (op.n_pixels) - { - asm volatile (" movd (%0),%%mm0\n" - "\tmovd (%1),%%mm1\n" - - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpcklbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - - "\t" pdivwqX(mm4,mm5,mm7) "\n" - - "\tmovq %3,%%mm2\n" - "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ - "\tpxor %%mm4,%%mm4\n" - "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ - - "\tmovq %%mm1,%%mm3\n" - "\tpxor %%mm5,%%mm5\n" - "\tpunpckhbw %%mm5,%%mm3\n" - "\tmovq %4,%%mm5\n" - "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ - "\t" pdivwqX(mm4,mm5,mm6) "\n" - - "\tmovq %5,%%mm4\n" - "\tmovq %%mm4,%%mm5\n" - "\tpsubusw %%mm6,%%mm4\n" - "\tpsubusw %%mm7,%%mm5\n" - - "\tpackuswb %%mm4,%%mm5\n" - - "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ - - "\tmovq %6,%%mm7\n" - "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ - - "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ - "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ - - "\tmovd %%mm7,(%2)\n" - : /* empty */ - : "r" (op.A), "r" (op.B), "r" (op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); - } - + { + asm volatile (" movd (%0),%%mm0\n" + "\tmovd (%1),%%mm1\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpcklbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + + "\t" pdivwqX(mm4,mm5,mm7) "\n" + + "\tmovq %3,%%mm2\n" + "\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */ + "\tpxor %%mm4,%%mm4\n" + "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */ + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm5,%%mm5\n" + "\tpunpckhbw %%mm5,%%mm3\n" + "\tmovq %4,%%mm5\n" + "\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */ + "\t" pdivwqX(mm4,mm5,mm6) "\n" + + "\tmovq %5,%%mm4\n" + "\tmovq %%mm4,%%mm5\n" + "\tpsubusw %%mm6,%%mm4\n" + "\tpsubusw %%mm7,%%mm5\n" + + "\tpackuswb %%mm4,%%mm5\n" + + "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */ + + "\tmovq %6,%%mm7\n" + "\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */ + + "\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */ + "\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */ + + "\tmovd %%mm7,(%2)\n" + : /* empty */ + : "r" (op.A), "r" (op.B), "r" (op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask) + : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + } + asm("emms"); } diff --git a/app/composite/gimp-composite-sse2-installer.c b/app/composite/gimp-composite-sse2-installer.c index 9ce5757eed..a0de35cfde 100644 --- a/app/composite/gimp-composite-sse2-installer.c +++ b/app/composite/gimp-composite-sse2-installer.c @@ -20,6 +20,10 @@ static struct install_table { { GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_difference_rgba8_rgba8_rgba8_sse2 }, { 
GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_addition_rgba8_rgba8_rgba8_sse2 }, { GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_darken_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_lighten_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_DODGE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_dodge_rgba8_rgba8_rgba8_sse2 }, + { GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse2 }, { GIMP_COMPOSITE_SWAP, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, gimp_composite_swap_rgba8_rgba8_rgba8_sse2 }, #endif { 0, 0, 0, 0, NULL } diff --git a/app/composite/gimp-composite-sse2-test.c b/app/composite/gimp-composite-sse2-test.c index 762f4676d7..fda9fb3bdd 100644 --- a/app/composite/gimp-composite-sse2-test.c +++ b/app/composite/gimp-composite-sse2-test.c @@ -57,29 +57,72 @@ gimp_composite_sse2_test (int iterations, int n_pixels) } - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); - gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); - ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); - ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_sse2, &special_ctx); - if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { - return (1); - } - gimp_composite_regression_timer_report ("difference", ft0, ft1); - gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_ADDITION, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_addition_rgba8_rgba8_rgba8_sse2, &special_ctx); if (gimp_composite_regression_compare_contexts ("addition", &generic_ctx, &special_ctx)) { + printf("addition failed\n"); return (1); } gimp_composite_regression_timer_report ("addition", ft0, ft1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, 
(unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DARKEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_darken_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("darken", &generic_ctx, &special_ctx)) { + printf("darken failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("darken", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DIFFERENCE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_difference_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("difference", &generic_ctx, &special_ctx)) { + printf("difference failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("difference", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_DODGE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_DODGE, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_dodge_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("dodge", &generic_ctx, &special_ctx)) { + printf("dodge failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("dodge", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_GRAIN_EXTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse2, &special_ctx); 
+ if (gimp_composite_regression_compare_contexts ("grain_extract", &generic_ctx, &special_ctx)) { + printf("grain_extract failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("grain_extract", ft0, ft1); + + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); + gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_LIGHTEN, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); + ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); + ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_lighten_rgba8_rgba8_rgba8_sse2, &special_ctx); + if (gimp_composite_regression_compare_contexts ("lighten", &generic_ctx, &special_ctx)) { + printf("lighten failed\n"); + return (1); + } + gimp_composite_regression_timer_report ("lighten", ft0, ft1); + gimp_composite_context_init (&special_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D2); gimp_composite_context_init (&generic_ctx, GIMP_COMPOSITE_SUBTRACT, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, GIMP_PIXELFORMAT_RGBA8, n_pixels, (unsigned char *) rgba8A, (unsigned char *) rgba8B, (unsigned char *) rgba8B, (unsigned char *) rgba8D1); ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_subtract_rgba8_rgba8_rgba8_sse2, &special_ctx); if (gimp_composite_regression_compare_contexts ("subtract", &generic_ctx, &special_ctx)) { + printf("subtract failed\n"); return (1); } gimp_composite_regression_timer_report ("subtract", ft0, ft1); @@ -89,6 +132,7 @@ gimp_composite_sse2_test (int iterations, int n_pixels) ft0 = gimp_composite_regression_time_function (iterations, gimp_composite_dispatch, &generic_ctx); ft1 = gimp_composite_regression_time_function (iterations, gimp_composite_swap_rgba8_rgba8_rgba8_sse2, &special_ctx); if (gimp_composite_regression_compare_contexts ("swap", &generic_ctx, &special_ctx)) { + printf("swap failed\n"); return (1); } gimp_composite_regression_timer_report ("swap", ft0, ft1); @@ -107,7 +151,7 @@ main (int argc, char *argv[]) putenv ("GIMP_COMPOSITE=0x1"); iterations = 1; - n_pixels = 1048577; + n_pixels = 163921; argv++, argc--; while (argc >= 2) { diff --git a/app/composite/gimp-composite-sse2.c b/app/composite/gimp-composite-sse2.c index 07003dde96..aed14ebf11 100644 --- a/app/composite/gimp-composite-sse2.c +++ b/app/composite/gimp-composite-sse2.c @@ -1,4 +1,6 @@ -/* The GIMP -- an image manipulation program +/* -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*- + * + * The GIMP -- an image manipulation program * Copyright (C) 1995 Spencer Kimball and Peter Mattis * * -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*- @@ -34,12 +36,10 @@ #include "gimp-composite.h" #include "gimp-composite-sse2.h" +#include "gimp-composite-x86.h" #if __GNUC__ >= 3 -#define pminub(src,dst,tmp) "pminub " "%%" #src 
", %%" #dst -#define pmaxub(src,dst,tmp) "pmaxub " "%%" #src ", %%" #dst - const static guint32 rgba8_alpha_mask_128[4] = { 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000 }; const static guint32 rgba8_b1_128[4] = { 0x01010101, 0x01010101, 0x01010101, 0x01010101 }; const static guint32 rgba8_b255_128[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; @@ -90,75 +90,74 @@ void gimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - - asm volatile ("movdqu %0,%%xmm0" + + asm volatile (" movdqu %0,%%xmm0\n" + "\tmovq %1,%%mm0" : /* empty */ - : "m" (*rgba8_alpha_mask_128) - : "%xmm0"); - + : "m" (*rgba8_alpha_mask_128), "m" (*rgba8_alpha_mask_64) + : "%xmm0", "%mm0"); + for (; op.n_pixels >= 4; op.n_pixels -= 4) { - asm (" movdqu %0, %%xmm2\n" - "\tmovdqu %1, %%xmm3\n" - "\tmovdqu %%xmm2, %%xmm4\n" - "\tpaddusb %%xmm3, %%xmm4\n" - - "\tmovdqu %%xmm0, %%xmm1\n" - "\tpandn %%xmm4, %%xmm1\n" - "\t" pminub(xmm3, xmm2, xmm4) "\n" - "\tpand %%xmm0, %%xmm2\n" - "\tpor %%xmm2, %%xmm1\n" - "\tmovdqu %%xmm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"); + asm (" movdqu %1,%%xmm2\n" + "\tmovdqu %2,%%xmm3\n" + "\tmovdqu %%xmm2,%%xmm4\n" + "\tpaddusb %%xmm3,%%xmm4\n" + + "\tmovdqu %%xmm0,%%xmm1\n" + "\tpandn %%xmm4,%%xmm1\n" + "\tpminub %%xmm3,%%xmm2\n" + "\tpand %%xmm0,%%xmm2\n" + "\tpor %%xmm2,%%xmm1\n" + "\tmovdqu %%xmm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"); op.A += 16; op.B += 16; op.D += 16; } - - asm volatile ("movq %0,%%mm0" - : /* empty */ - : "m" (*rgba8_alpha_mask_64) - : "%mm0"); - + for (; op.n_pixels >= 2; op.n_pixels -= 2) { - asm (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" - "\tmovq %%mm2, %%mm4\n" - "\tpaddusb %%mm3, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + asm (" movq %1,%%mm2\n" + "\tmovq %2,%%mm3\n" + "\tmovq %%mm2,%%mm4\n" + "\tpaddusb %%mm3,%%mm4\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + "\tpminub %%mm3,%%mm2\n" + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); op.A += 8; op.B += 8; op.D += 8; } - - if (op.n_pixels) + + for (; op.n_pixels >= 1; op.n_pixels -= 1) { - asm volatile (" movd (%0), %%mm2;\n" - "\tmovd (%1), %%mm3;\n" - "\tmovq %%mm2, %%mm4\n" - "\tpaddusb %%mm3, %%mm4\n" - "\tmovq %%mm0, %%mm1\n" - "\tpandn %%mm4, %%mm1\n" - "\t" pminub(mm3, mm2, mm4) "\n" - "\tpand %%mm0, %%mm2\n" - "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, (%2);\n" - : /* empty */ - : "r" (op.A), "r" (op.B), "r" (op.D) - : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + asm volatile (" movd %1,%%mm2\n" + "\tmovd %2,%%mm3\n" + "\tmovq %%mm2,%%mm4\n" + "\tpaddusb %%mm3,%%mm4\n" + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + "\tpminub %%mm3,%%mm2\n" + "\tpand %%mm0,%%mm2\n" + "\tpor %%mm2,%%mm1\n" + "\tmovd %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 4; + op.B += 4; + op.D += 4; } - + asm("emms"); } @@ -166,31 
+165,70 @@ gimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) void xxxgimp_composite_burn_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { - + } - void -xxxgimp_composite_darken_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +gimp_composite_darken_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { - + GimpCompositeContext op = *_op; + + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %1,%%xmm2\n" + "\tmovdqu %2,%%xmm3\n" + "\tpminub %%xmm3,%%xmm2\n" + "\tmovdqu %%xmm2,%0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%xmm1", "%xmm2", "%xmm3", "%xmm4"); + op.A += 16; + op.B += 16; + op.D += 16; + } + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %1, %%mm2\n" + "\tpminub %2, %%mm2\n" + "\tmovq %%mm2, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + "\tpminub %%mm3, %%mm2\n" + "\tmovd %%mm2, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm2", "%mm3", "%mm4"); + } + + asm("emms"); } void gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile (" movq %0,%%mm0\n" "\tmovdqu %1,%%xmm0" : : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_alpha_mask_128) - : "%mm0"); - + : "%mm0", "%xmm0"); + for (; op.n_pixels >= 4; op.n_pixels -= 4) { - asm volatile (" movdqu %0,%%xmm2\n" - "\tmovdqu %1,%%xmm3\n" + asm volatile (" movdqu %1,%%xmm2\n" + "\tmovdqu %2,%%xmm3\n" "\tmovdqu %%xmm2,%%xmm4\n" "\tmovdqu %%xmm3,%%xmm5\n" "\tpsubusb %%xmm3,%%xmm4\n" @@ -201,10 +239,10 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) "\tpminub %%xmm3,%%xmm2\n" "\tpand %%xmm0,%%xmm2\n" "\tpor %%xmm2,%%xmm1\n" - "\tmovdqu %%xmm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); + "\tmovdqu %%xmm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); op.A += 16; op.B += 16; op.D += 16; @@ -212,8 +250,8 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) for (; op.n_pixels >= 2; op.n_pixels -= 2) { - asm volatile (" movq %0, %%mm2\n" - "\tmovq %1, %%mm3\n" + asm volatile (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" "\tmovq %%mm2, %%mm4\n" "\tmovq %%mm3, %%mm5\n" "\tpsubusb %%mm3, %%mm4\n" @@ -224,10 +262,10 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) "\tpminub %%mm3, %%mm2\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" - "\tmovq %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + "\tmovq %%mm1, %0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); op.A += 8; op.B += 8; op.D += 8; @@ -235,8 +273,8 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) if (op.n_pixels) { - asm volatile (" movd %0, %%mm2\n" - "\tmovd %1, %%mm3\n" + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" "\tmovq %%mm2, %%mm4\n" "\tmovq %%mm3, %%mm5\n" "\tpsubusb %%mm3, %%mm4\n" @@ -247,141 +285,398 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) "\tpminub %%mm3, %%mm2\n" "\tpand %%mm0, %%mm2\n" "\tpor %%mm2, %%mm1\n" - "\tmovd %%mm1, %2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", 
"%mm1", "%mm2", "%mm3", "%mm4"); + "\tmovd %%mm1, %0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); } + + asm("emms"); +} + +void +gimp_composite_dodge_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +{ + GimpCompositeContext op = *_op; + +#if 0 + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %0,%%xmm0\n" + "\tmovdqu %1,%%xmm1\n" + "\tmovdqu %%xmm1,%%xmm3\n" + "\tpxor %%xmm2,%%xmm2\n" + "\tpunpcklbw %%xmm2,%%xmm3\n" + "\tpunpcklbw %%xmm0,%%xmm2\n" + + "\tmovdqu %3,%%xmm4\n" + "\tpsubw %%xmm3,%%xmm4\n" + + "\t" xmm_pdivwuqX(xmm2,xmm4,xmm5,xmm6) "\n" + + "\tmovdqu %%xmm1,%%xmm3\n" + "\tpxor %%xmm2,%%xmm2\n" + "\tpunpckhbw %%xmm2,%%xmm3\n" + "\tpunpckhbw %%xmm0,%%xmm2\n" + + "\tmovdqu %3,%%xmm4\n" + "\tpsubw %%xmm3,%%xmm4\n" + + "\t" xmm_pdivwuqX(xmm2,xmm4,xmm6,xmm7) "\n" + + "\tpackuswb %%xmm6,%%xmm5\n" + + "\tmovdqu %4,%%xmm6\n" + "\tmovdqu %%xmm1,%%xmm7\n" + "\tpminub %%xmm0,%%xmm7\n" + "\tpand %%xmm6,%%xmm7\n" + "\tpandn %%xmm5,%%xmm6\n" + + "\tpor %%xmm6,%%xmm7\n" + + "\tmovdqu %%xmm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_128), "m" (*rgba8_alpha_mask_128) + : "0", "1", "2", "%eax", "%ecx", "%edx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"); + op.A += 16; + op.B += 16; + op.D += 16; + } +#endif + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm2,%%mm3\n" + "\tpunpcklbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm5) "\n" + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm2,%%mm3\n" + "\tpunpckhbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm6) "\n" + + "\tpackuswb %%mm6,%%mm5\n" + + "\tmovq %4,%%mm6\n" + "\tmovq %%mm1,%%mm7\n" + "\tpminub %%mm0,%%mm7\n" + "\tpand %%mm6,%%mm7\n" + "\tpandn %%mm5,%%mm6\n" + + "\tpor %%mm6,%%mm7\n" + + "\tmovq %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); + op.A += 8; + op.B += 8; + op.D += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %0,%%mm0\n" + "\tmovq %1,%%mm1\n" + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpcklbw %%mm2,%%mm3\n" + "\tpunpcklbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm5) "\n" + + "\tmovq %%mm1,%%mm3\n" + "\tpxor %%mm2,%%mm2\n" + "\tpunpckhbw %%mm2,%%mm3\n" + "\tpunpckhbw %%mm0,%%mm2\n" + + "\tmovq %3,%%mm4\n" + "\tpsubw %%mm3,%%mm4\n" + + "\t" pdivwuqX(mm2,mm4,mm6) "\n" + + "\tpackuswb %%mm6,%%mm5\n" + + "\tmovq %4,%%mm6\n" + "\tmovq %%mm1,%%mm7\n" + "\tpminub %%mm0,%%mm7\n" + "\tpand %%mm6,%%mm7\n" + "\tpandn %%mm5,%%mm6\n" + + "\tpor %%mm6,%%mm7\n" + + "\tmovd %%mm7,%2\n" + : /* empty */ + : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64) + : "0", "1", "2", "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + } + asm("emms"); } void -xxxgimp_composite_divide_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { + GimpCompositeContext op = *_op; + + asm volatile (" movq %0,%%mm0\n" + "\tpxor %%mm6,%%mm6\n" + "\tmovq %1,%%mm7\n" + "\tmovdqu %2,%%xmm0\n" + "\tpxor 
%%xmm6,%%xmm6\n" + "\tmovdqu %3,%%xmm7\n" + : /* empty */ + : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64), "m" (*rgba8_alpha_mask_128), "m" (*rgba8_w128_128) + : "%mm0", "%mm6", "%mm7", "%xmm0", "%xmm6", "%xmm7"); + + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %1,%%xmm2\n" + "\tmovdqu %2,%%xmm3\n" + xmm_low_bytes_to_words(xmm2,xmm4,xmm6) + xmm_low_bytes_to_words(xmm3,xmm5,xmm6) + "\tpsubw %%xmm5,%%xmm4\n" + "\tpaddw %%xmm7,%%xmm4\n" + "\tmovdqu %%xmm4,%%xmm1\n" + + xmm_high_bytes_to_words(xmm2,xmm4,xmm6) + xmm_high_bytes_to_words(xmm3,xmm5,xmm6) + + "\tpsubw %%xmm5,%%xmm4\n" + "\tpaddw %%xmm7,%%xmm4\n" + + "\tpackuswb %%xmm4,%%xmm1\n" + "\tmovdqu %%xmm1,%%xmm4\n" + + "\tmovdqu %%xmm0,%%xmm1\n" + "\tpandn %%xmm4,%%xmm1\n" + + "\tpminub %%xmm3,%%xmm2\n" + "\tpand %%xmm0,%%xmm2\n" + + "\tpor %%xmm2,%%xmm1\n" + "\tmovdqu %%xmm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%xmm1", "%xmm2", "%xmm3", "%xmm4"); + op.A += 16; + op.B += 16; + op.D += 16; + } + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %1,%%mm2\n" + "\tmovq %2,%%mm3\n" + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + "\tpsubw %%mm5,%%mm4\n" + "\tpaddw %%mm7,%%mm4\n" + "\tmovq %%mm4,%%mm1\n" + + mmx_high_bytes_to_words(mm2,mm4,mm6) + mmx_high_bytes_to_words(mm3,mm5,mm6) + + "\tpsubw %%mm5,%%mm4\n" + "\tpaddw %%mm7,%%mm4\n" + + "\tpackuswb %%mm4,%%mm1\n" + "\tmovq %%mm1,%%mm4\n" + + "\tmovq %%mm0,%%mm1\n" + "\tpandn %%mm4,%%mm1\n" + + "\tpminub %%mm3,%%mm2\n" + "\tpand %%mm0,%%mm2\n" + + "\tpor %%mm2,%%mm1\n" + "\tmovq %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %1, %%mm2\n" + "\tmovd %2, %%mm3\n" + mmx_low_bytes_to_words(mm2,mm4,mm6) + mmx_low_bytes_to_words(mm3,mm5,mm6) + "\tpsubw %%mm5, %%mm4\n" + "\tpaddw %%mm7, %%mm4\n" + "\tmovq %%mm4, %%mm1\n" + "\tpackuswb %%mm6, %%mm1\n" + "\tmovq %%mm1, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm3, %%mm2\n" + "\tpand %%mm0, %%mm2\n" + "\tpor %%mm2, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + } + + asm("emms"); } void -xxxgimp_composite_dodge_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) +gimp_composite_lighten_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { + GimpCompositeContext op = *_op; + + asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0"); + + for (; op.n_pixels >= 4; op.n_pixels -= 4) + { + asm volatile (" movdqu %1, %%xmm2\n" + "\tmovdqu %2, %%xmm3\n" + "\tmovdqu %%xmm2, %%xmm4\n" + "\tpmaxub %%xmm3, %%xmm4\n" + "\tmovdqu %%xmm0, %%xmm1\n" + "\tpandn %%xmm4, %%xmm1\n" + "\tpminub %%xmm2, %%xmm3\n" + "\tpand %%xmm0, %%xmm3\n" + "\tpor %%xmm3, %%xmm1\n" + "\tmovdqu %%xmm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%xmm1", "%xmm2", "%xmm3", "%xmm4"); + op.A += 16; + op.B += 16; + op.D += 16; + } + + for (; op.n_pixels >= 2; op.n_pixels -= 2) + { + asm volatile (" movq %1, %%mm2\n" + "\tmovq %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpmaxub %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm2, %%mm3\n" + "\tpand %%mm0, %%mm3\n" + "\tpor %%mm3, %%mm1\n" + "\tmovq %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + op.A += 8; + op.B += 8; + op.D += 8; + } + + if (op.n_pixels) + { + asm volatile (" movd %1, %%mm2\n" + 
"\tmovd %2, %%mm3\n" + "\tmovq %%mm2, %%mm4\n" + "\tpmaxub %%mm3, %%mm4\n" + "\tmovq %%mm0, %%mm1\n" + "\tpandn %%mm4, %%mm1\n" + "\tpminub %%mm2,%%mm3\n" + "\tpand %%mm0, %%mm3\n" + "\tpor %%mm3, %%mm1\n" + "\tmovd %%mm1, %0\n" + : "=m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + } + + asm("emms"); } -void -xxxgimp_composite_grain_extract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ - -} - -void -xxxgimp_composite_grain_merge_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ - -} - -void -xxxgimp_composite_lighten_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ -} - -void -xxxgimp_composite_multiply_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ -} - -void -xxxgimp_composite_overlay_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ -} - - -void -xxxgimp_composite_scale_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ -} - -void -xxxgimp_composite_screen_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) -{ -} - - void gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + asm volatile (" movq %0,%%mm0\n" "\tmovdqu %1,%%xmm0\n" : /* empty */ : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_alpha_mask_128) : "%mm0", "%xmm0"); - + for (; op.n_pixels >= 4; op.n_pixels -= 4) { - asm volatile (" movdqu %0,%%xmm2\n" - "\tmovdqu %1,%%xmm3\n" + asm volatile (" movdqu %1,%%xmm2\n" + "\tmovdqu %2,%%xmm3\n" "\tmovdqu %%xmm2,%%xmm4\n" "\tpsubusb %%xmm3,%%xmm4\n" - + "\tmovdqu %%xmm0,%%xmm1\n" "\tpandn %%xmm4,%%xmm1\n" - "\t" pminub(xmm3,xmm2,xmm4) "\n" + "\tpminub %%xmm3,%%xmm2\n" "\tpand %%xmm0,%%xmm2\n" "\tpor %%xmm2,%%xmm1\n" - "\tmovdqu %%xmm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); + "\tmovdqu %%xmm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%xmm1", "%xmm2", "%xmm3", "%xmm4"); op.A += 16; op.B += 16; op.D += 16; } - + for (; op.n_pixels >= 2; op.n_pixels -= 2) { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - + asm volatile (" movq %1,%%mm2\n" + "\tmovq %2,%%mm3\n" "\tmovq %%mm2,%%mm4\n" "\tpsubusb %%mm3,%%mm4\n" - "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - + "\tpminub %%mm3,%%mm2\n" "\tpand %%mm0,%%mm2\n" "\tpor %%mm2,%%mm1\n" - "\tmovq %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); + "\tmovq %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); op.A += 8; op.B += 8; op.D += 8; - } - + } + if (op.n_pixels) { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" - + asm volatile (" movd %1,%%mm2\n" + "\tmovd %2,%%mm3\n" "\tmovq %%mm2,%%mm4\n" "\tpsubusb %%mm3,%%mm4\n" - "\tmovq %%mm0,%%mm1\n" "\tpandn %%mm4,%%mm1\n" - - "\t" pminub(mm3,mm2,mm4) "\n" - + "\tpminub %%mm3,%%mm2\n" "\tpand %%mm0,%%mm2\n" "\tpor %%mm2,%%mm1\n" - "\tmovd %%mm1,%2\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B), "m" (*op.D) - : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5"); - } - + "\tmovd %%mm1,%0\n" + : "+m" (*op.D) + : "m" (*op.A), "m" (*op.B) + : "%mm1", "%mm2", "%mm3", "%mm4"); + } + asm("emms"); } @@ -389,7 +684,7 @@ void gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) { GimpCompositeContext op = *_op; - + /* * Inhale one whole i686 cache line at once. 64 bytes, 16 rgba8 pixels, 4 128 bit xmm registers. 
*/ @@ -403,7 +698,7 @@ gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) "\tmovdqu %5,%%xmm5\n" "\tmovdqu %6,%%xmm6\n" "\tmovdqu %7,%%xmm7\n" - + "\tmovdqu %%xmm0,%1\n" "\tmovdqu %%xmm1,%0\n" "\tmovdqu %%xmm2,%3\n" @@ -412,52 +707,53 @@ gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op) "\tmovdqu %%xmm5,%4\n" "\tmovdqu %%xmm6,%7\n" "\tmovdqu %%xmm7,%6\n" + : "+m" (op.A[0]), "+m" (op.B[0]), + "+m" (op.A[1]), "+m" (op.B[1]), + "+m" (op.A[2]), "+m" (op.B[2]), + "+m" (op.A[3]), "+m" (op.B[3]) : /* empty */ - : "m" (op.A[0]), "m" (op.B[0]), - "m" (op.A[1]), "m" (op.B[1]), - "m" (op.A[2]), "m" (op.B[2]), - "m" (op.A[3]), "m" (op.B[3]) - ); + : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); op.A += 64; op.B += 64; - } - + } + for (; op.n_pixels >= 4; op.n_pixels -= 4) { - asm volatile (" movdqu %0,%%xmm2\n" - "\tmovdqu %1,%%xmm3\n" - "\tmovdqu %%xmm3,%0\n" - "\tmovdqu %%xmm2,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B) - : "0", "1", "%xmm1", "%xmm2", "%xmm3", "%xmm4"); + asm volatile (" movdqu %0,%%xmm2\n" + "\tmovdqu %1,%%xmm3\n" + "\tmovdqu %%xmm3,%0\n" + "\tmovdqu %%xmm2,%1\n" + : "+m" (*op.A), "+m" (*op.B) + : /* empty */ + : "%xmm2", "%xmm3"); op.A += 16; op.B += 16; - } - + } + for (; op.n_pixels >= 2; op.n_pixels -= 2) { - asm volatile (" movq %0,%%mm2\n" - "\tmovq %1,%%mm3\n" - "\tmovq %%mm3,%0\n" - "\tmovq %%mm2,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); + asm volatile (" movq %0,%%mm2\n" + "\tmovq %1,%%mm3\n" + "\tmovq %%mm3,%0\n" + "\tmovq %%mm2,%1\n" + : "+m" (*op.A), "+m" (*op.B) + : /* empty */ + : "%mm2", "%mm3"); op.A += 8; op.B += 8; - } - + } + if (op.n_pixels) { - asm volatile (" movd %0,%%mm2\n" - "\tmovd %1,%%mm3\n" - "\tmovd %%mm3,%0\n" - "\tmovd %%mm2,%1\n" - : /* empty */ - : "m" (*op.A), "m" (*op.B) - : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4"); - } + asm volatile (" movd %0,%%mm2\n" + "\tmovd %1,%%mm3\n" + "\tmovd %%mm3,%0\n" + "\tmovd %%mm2,%1\n" + : "+m" (*op.A), "+m" (*op.B) + : /* empty */ + : "%mm1", "%mm2", "%mm3", "%mm4"); + } asm("emms"); } diff --git a/app/composite/gimp-composite-sse2.h b/app/composite/gimp-composite-sse2.h index 8310ee6272..d7fd192539 100644 --- a/app/composite/gimp-composite-sse2.h +++ b/app/composite/gimp-composite-sse2.h @@ -10,7 +10,11 @@ extern void gimp_composite_sse2_init (void); extern void gimp_composite_sse2_install (void); extern void gimp_composite_addition_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void gimp_composite_darken_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); extern void gimp_composite_difference_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void gimp_composite_lighten_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); extern void gimp_composite_subtract_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); extern void gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); +extern void gimp_composite_dodge_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *); #endif diff --git a/app/composite/gimp-composite-vis-test.c b/app/composite/gimp-composite-vis-test.c index c0d15e9d8f..15a4eb932e 100644 --- a/app/composite/gimp-composite-vis-test.c +++ b/app/composite/gimp-composite-vis-test.c @@ -71,7 +71,7 @@ main (int argc, char *argv[]) putenv ("GIMP_COMPOSITE=0x1"); iterations = 1; - n_pixels = 1048577; + n_pixels = 163921; argv++, argc--; while (argc >= 2) { diff --git 
a/app/composite/gimp-composite-x86.h b/app/composite/gimp-composite-x86.h new file mode 100644 index 0000000000..9d9d179fe5 --- /dev/null +++ b/app/composite/gimp-composite-x86.h @@ -0,0 +1,208 @@ +#if __GNUC__ >= 3 + +#define mmx_low_bytes_to_words(src,dst,zero) \ + "\tmovq %%"#src", %%"#dst"; " \ + "\tpunpcklbw %%"#zero", %%"#dst"\n" + +#define mmx_high_bytes_to_words(src,dst,zero) \ + "\tmovq %%"#src", %%"#dst"; " \ + "\tpunpckhbw %%"#zero", %%"#dst"\n" + +#define xmm_low_bytes_to_words(src,dst,zero) \ + "\tmovdqu %%"#src", %%"#dst"; " \ + "\tpunpcklbw %%"#zero", %%"#dst"\n" + +#define xmm_high_bytes_to_words(src,dst,zero) \ + "\tmovdqu %%"#src", %%"#dst"; " \ + "\tpunpckhbw %%"#zero", %%"#dst"\n" + +/* a = INT_MULT(a,b) */ +#define mmx_int_mult(a,b,w128) \ + "\tpmullw %%"#b", %%"#a"; " \ + "\tpaddw %%"#w128", %%"#a"; " \ + "\tmovq %%"#a", %%"#b"; " \ + "\tpsrlw $8, %%"#b"; " \ + "\tpaddw %%"#a", %%"#b"; " \ + "\tpsrlw $8, %%"#b"\n" + +#define sse2_int_mult(a,b,w128) \ + "\tpmullw %%"#b", %%"#a"; " \ + "\tpaddw %%"#w128", %%"#a"; " \ + "\tmovdqu %%"#a", %%"#b"; " \ + "\tpsrlw $8, %%"#b"; " \ + "\tpaddw %%"#a", %%"#b"; " \ + "\tpsrlw $8, %%"#b"\n" + +/* + * Double-word divide. Adjusted for subsequent unsigned packing + * (high-order bit of each word is cleared) + * Clobbers eax, ecx edx + */ +#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "btr $15, %%eax; " \ + "roll $16, %%eax; " \ + "btr $15, %%eax; " \ + "movd %%eax,%%" #quotient ";" + + + +/* + * Quadword divide. No adjustment for subsequent unsigned packing + * (high-order bit of each word is left alone) + */ +#define pdivwqX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "movd %%eax,%%" #quotient "; " \ + "psrlq $32,%%" #dividend ";" \ + "psrlq $32,%%" #divisor ";" \ + "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "movd %%eax,%%" #divisor ";" \ + "psllq $32,%%" #divisor ";" \ + "por %%" #divisor ",%%" #quotient ";" + +/* + * Quadword divide. 
Adjusted for subsequent unsigned packing + * (high-order bit of each word is cleared) + */ +#define pdivwuqX(dividend,divisor,quotient) \ + pdivwX(dividend,divisor,quotient) \ + "psrlq $32,%%" #dividend ";" \ + "psrlq $32,%%" #divisor ";" \ + pdivwX(dividend,divisor,quotient) \ + "movd %%eax,%%" #divisor ";" \ + "psllq $32,%%" #divisor ";" \ + "por %%" #divisor ",%%" #quotient ";" + +#define xmm_pdivwqX(dividend,divisor,quotient,scratch) "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "movd %%eax,%%" #quotient "; " \ + "psrlq $32,%%" #divisor ";" \ + "psrlq $32,%%" #dividend ";" \ + "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "movd %%eax,%%" #scratch ";" \ + "psllq $32,%%" #scratch ";" \ + "psrlq $32,%%" #divisor ";" \ + "psrlq $32,%%" #dividend ";" \ + "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "movd %%eax,%%" #scratch ";" \ + "psllq $64,%%" #scratch ";" \ + "psrlq $32,%%" #divisor ";" \ + "psrlq $32,%%" #dividend ";" \ + "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "movd %%eax,%%" #scratch ";" \ + "psllq $96,%%" #scratch ";" \ + "por %%" #scratch ",%%" #quotient ";" + +#define xmm_pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \ + "movd %%" #divisor ",%%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "roll $16, %%eax; " \ + "roll $16, %%ecx; " \ + "xorl %%edx,%%edx; " \ + "divw %%cx; " \ + "btr $15, %%eax; " \ + "roll $16, %%eax; " \ + "btr $15, %%eax; " \ + "movd %%eax,%%" #quotient ";" + +#define xmm_pdivwuqX(dividend,divisor,quotient,scratch) \ + xmm_pdivwX(dividend,divisor,scratch) \ + "movd %%"#scratch ",%%"#quotient ";" \ + "psrlq $32,%%"#dividend ";" \ + "psrlq $32,%%"#divisor ";" \ + xmm_pdivwX(dividend,divisor,scratch) \ + "psllq $32,%%"#scratch ";" \ + "por %%"#scratch ",%%"#quotient ";" \ + "psrlq $32,%%"#dividend ";" \ + "psrlq $32,%%"#divisor ";" \ + xmm_pdivwX(dividend,divisor,scratch) \ + "psllq $64,%%"#scratch ";" \ + "por %%"#scratch ",%%"#quotient ";" \ + "psrlq $32,%%"#dividend ";" \ + "psrlq $32,%%"#divisor ";" \ + xmm_pdivwX(dividend,divisor,scratch) \ + "psllq $96,%%"#scratch ";" \ + "por %%"#scratch ",%%"#quotient + +/* equivalent to the INT_MULT() macro in gimp-composite-generic.c */ +/* + * opr2 = INT_MULT(opr1, opr2, t) + * + * Operates across quad-words using x86 word (16bit) value. 
+ * Result is left in opr2 + * + * opr1 = opr1 * opr2 + w128 + * opr2 = opr1 + * opr2 = ((opr2 >> 8) + opr1) >> 8 + */ +#define pmulwX(opr1,opr2,w128) \ + "\tpmullw %%"#opr2", %%"#opr1"; " \ + "\tpaddw %%"#w128", %%"#opr1"; " \ + "\tmovq %%"#opr1", %%"#opr2"; " \ + "\tpsrlw $8, %%"#opr2"; " \ + "\tpaddw %%"#opr1", %%"#opr2"; " \ + "\tpsrlw $8, %%"#opr2"\n" + +#define xmm_pmulwX(opr1,opr2,w128) \ + "\tpmullw %%"#opr2", %%"#opr1"; " \ + "\tpaddw %%"#w128", %%"#opr1"; " \ + "\tmovdqu %%"#opr1", %%"#opr2"; " \ + "\tpsrlw $8, %%"#opr2"; " \ + "\tpaddw %%"#opr1", %%"#opr2"; " \ + "\tpsrlw $8, %%"#opr2"\n" + +#endif diff --git a/app/composite/make-installer.py b/app/composite/make-installer.py index 4f5cd8c001..65300165f1 100755 --- a/app/composite/make-installer.py +++ b/app/composite/make-installer.py @@ -317,6 +317,8 @@ def gimp_composite_regression(fpout, function_tables, options): #pp.pprint(function_tables) generic_table = function_tables + + composite_modes.sort(); for mode in composite_modes: for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format): @@ -368,6 +370,8 @@ def gimp_composite_regression(fpout, function_tables, options): print >>fpout, ' ft0 = gimp_composite_regression_time_function (iterations, %s, &generic_ctx);' % ("gimp_composite_dispatch") print >>fpout, ' ft1 = gimp_composite_regression_time_function (iterations, %s, &special_ctx);' % (generic_table[key][0]) print >>fpout, ' if (gimp_composite_regression_compare_contexts ("%s", &generic_ctx, &special_ctx)) {' % (mode_name(mode)) + + print >>fpout, ' printf("%s failed\\n");' % (mode_name(mode)) print >>fpout, ' return (1);' print >>fpout, ' }' print >>fpout, ' gimp_composite_regression_timer_report ("%s", ft0, ft1);' % (mode_name(mode)) @@ -518,7 +522,7 @@ op.add_option('-t', '--test', action='store_true', dest='test', help='generate regression testing code') op.add_option('-i', '--iterations', action='store', type='int', dest='iterations', default=1, help='number of iterations in regression tests') -op.add_option('-n', '--n-pixels', action='store', type="int", dest='n_pixels', default=1024*1024+1, +op.add_option('-n', '--n-pixels', action='store', type="int", dest='n_pixels', default=64*2049+16*2049+1, help='number of pixels in each regression test iteration') op.add_option('-r', '--requires', action='append', type='string', dest='requires', default=[], help='cpp #if conditionals')
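
Scalar reference for the new SSE2 modes in this patch (darken, lighten, grain_extract) and for the INT_MULT() approximation that the pmulwX / mmx_int_mult macros encode. This is an illustrative sketch only, not part of the patch; the helper names below are made up, and the byte layout assumes little-endian RGBA8, so the 0xFF000000 alpha mask selects byte 3 of each pixel.

    #include <glib.h>

    /* (a * b) / 255, approximated with a +128 bias and two shift-adds,
     * exactly as described in the pmulwX / mmx_int_mult comments above. */
    static inline guint8
    int_mult (guint8 a, guint8 b)
    {
      guint t = a * b + 128;
      return (guint8) (((t >> 8) + t) >> 8);
    }

    /* darken: per-byte minimum of A and B (pminub), alpha included. */
    static void
    darken_rgba8 (const guint8 *A, const guint8 *B, guint8 *D)
    {
      gint i;

      for (i = 0; i < 4; i++)
        D[i] = MIN (A[i], B[i]);
    }

    /* lighten: per-byte maximum on RGB (pmaxub); the alpha byte keeps the
     * minimum of the two alphas via the pand/pandn/por sequence against
     * the alpha mask. */
    static void
    lighten_rgba8 (const guint8 *A, const guint8 *B, guint8 *D)
    {
      gint i;

      for (i = 0; i < 3; i++)
        D[i] = MAX (A[i], B[i]);
      D[3] = MIN (A[3], B[3]);
    }

    /* grain_extract: A - B + 128 per RGB channel, saturated to 0..255
     * (psubw/paddw on words, then packuswb); alpha is again min(A, B). */
    static void
    grain_extract_rgba8 (const guint8 *A, const guint8 *B, guint8 *D)
    {
      gint i;

      for (i = 0; i < 3; i++)
        {
          gint v = (gint) A[i] - (gint) B[i] + 128;
          D[i] = (guint8) CLAMP (v, 0, 255);
        }
      D[3] = MIN (A[3], B[3]);
    }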