Add proposed patches to improve handling of flaky tests

With these changes, the flaky/known-failing tests are no longer installed as installed-tests at all, so remove them from the autopkgtest metadata.
2022-11-23 21:48:08 +00:00
parent baa6ae9e9c
commit 5794821592
8 changed files with 288 additions and 43 deletions
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,5 +1,8 @@
 dist/Reinstate-gdk-wayland-cursor-meson.build.patch
 Flush-drawable-surface-when-getting-a-pixbuf.patch
+testsuite-Avoid-using-should_fail.patch
+testsuite-Try-enabling-a11y-tests-other-than-those-known-.patch
+testsuite-Don-t-create-.test-files-for-flaky-or-failing-t.patch
 016_no_offscreen_widgets_grabbing.patch
 017_no_offscreen_device_grabbing.patch
 060_ignore-random-icons.patch
--- a/debian/patches/testsuite-Avoid-using-should_fail.patch
+++ b/debian/patches/testsuite-Avoid-using-should_fail.patch
@@ -0,0 +1,163 @@
+From: Simon McVittie <smcv@debian.org>
+Date: Wed, 23 Nov 2022 18:36:08 +0000
+Subject: testsuite: Avoid using should_fail
+
+There are two possible interpretations of "expected failure": either
+the test *must* fail (exactly the inverse of an ordinary test, with
+success becoming failure and failure becoming success), or the test
+*may* fail (with success intended, but failure possible in some
+environments). Autotools had the second interpretation, which seems
+more useful in practice, but Meson has the first.
+
+In GTK 3.24.35, if the environment is such that the label-sizing.ui
+reftest happens to be successful, the overall result of the test suite
+is failure. This seems unlikely to have been the intention.
+
+Instead of using should_fail, put the tests in one of two new suites:
+"flaky" is intended for tests that succeed or fail unpredictably
+according to the test environment or chance, while "failing" is for
+tests that ought to succeed but currently never do as a result of a
+bug or missing functionality. With a sufficiently new version of Meson,
+the flaky and failing tests are not run by default, but can be requested
+with a command like:
+
+    meson test --suite=flaky --suite=failing
+
+This arrangement is inspired by GNOME/glib!2987, which was contributed
+by Marco Trevisan.
+
+Signed-off-by: Simon McVittie <smcv@debian.org>
+---
+ testsuite/a11y/meson.build     |  4 ++--
+ testsuite/gtk/meson.build      | 17 +++++++++++++----
+ testsuite/meson.build          |  7 +++++++
+ testsuite/reftests/meson.build | 17 ++++++++++++++---
+ 4 files changed, 36 insertions(+), 9 deletions(-)
+
+diff --git a/testsuite/a11y/meson.build b/testsuite/a11y/meson.build
+index fa8b045..85d0f5d 100644
+--- a/testsuite/a11y/meson.build
+++ b/testsuite/a11y/meson.build
+@@ -66,7 +66,7 @@ foreach t: a11y_state_tests
+         'GSETTINGS_SCHEMA_DIR=@0@'.format(gtk_schema_build_dir),
+         'GTK_TEST_MESON=1',
+       ],
+-      suite: 'a11y',
++      suite: ['a11y', 'flaky'],
+     )
+   endif
+ endforeach
+@@ -100,7 +100,7 @@ foreach t: a11y_tests
+                'G_TEST_BUILDDIR=@0@'.format(meson.current_build_dir()),
+                'GSETTINGS_SCHEMA_DIR=@0@'.format(gtk_schema_build_dir),
+              ],
+-        suite: 'a11y')
++        suite: ['a11y', 'flaky'])
+ endforeach
+ 
+ installed_test_data = [
+diff --git a/testsuite/gtk/meson.build b/testsuite/gtk/meson.build
+index 206af0d..137f93a 100644
+--- a/testsuite/gtk/meson.build
+++ b/testsuite/gtk/meson.build
+@@ -50,7 +50,9 @@ tests = [
+   ['revealer-size'],
+ ]
+ 
+-# Tests that are expected to fail
++# Tests that are expected to fail, sometimes or always
++flaky = [
++]
+ xfail = [
+ ]
+ 
+@@ -83,7 +85,15 @@ foreach t : tests
+     install: get_option('installed_tests'),
+     install_dir: installed_test_bindir)
+ 
+-  expect_fail = xfail.contains(test_name)
++  suites = ['gtk']
++
++  if flaky.contains(test_name)
++    suites += 'flaky'
++  endif
++
++  if xfail.contains(test_name)
++    suites += 'failing'
++  endif
+ 
+   test(test_name, test_exe,
+        args: [ '--tap', '-k' ],
+@@ -97,8 +107,7 @@ foreach t : tests
+               'GSETTINGS_SCHEMA_DIR=@0@'.format(gtk_schema_build_dir),
+               'GTK_TEST_MESON=1',
+             ],
+-       suite: 'gtk',
+-       should_fail: expect_fail,
++       suite: suites,
+   )
+ endforeach
+ 
+diff --git a/testsuite/meson.build b/testsuite/meson.build
+index 289f272..032baf2 100644
+--- a/testsuite/meson.build
+++ b/testsuite/meson.build
+@@ -2,6 +2,13 @@ gtk_libexecdir = join_paths(gtk_prefix, get_option('libexecdir'))
+ installed_test_bindir = join_paths(gtk_libexecdir, 'installed-tests', 'gtk+')
+ installed_test_datadir = join_paths(gtk_datadir, 'installed-tests', 'gtk+')
+ 
++if meson.version().version_compare('>=0.57.0')
++  add_test_setup('default',
++    is_default: true,
++    exclude_suites: ['flaky', 'failing'],
++  )
++endif
++
+ subdir('gtk')
+ subdir('gdk')
+ subdir('css')
+diff --git a/testsuite/reftests/meson.build b/testsuite/reftests/meson.build
+index 2135ebb..8c3c3e8 100644
+--- a/testsuite/reftests/meson.build
+++ b/testsuite/reftests/meson.build
+@@ -419,7 +419,7 @@ test_data = [
+ ]
+ 
+ # Depending on the environment these fail, feel free to fix them
+-somehow_broken = [
++flaky = [
+   'button-wrapping.ui',
+   'cellrenderer-pixbuf-stock-rtl.ui',
+   'label-sizing.ui',
+@@ -428,9 +428,21 @@ somehow_broken = [
+   'symbolic-icon-translucent-color.ui',
+   'window-height-for-width.ui',
+ ]
++xfail = [
++]
+ 
+ foreach testname : test_data
+   if testname.endswith('.ui') and not testname.endswith('.ref.ui')
++    suites = ['reftest']
++
++    if flaky.contains(testname)
++      suites += 'flaky'
++    endif
++
++    if xfail.contains(testname)
++      suites += 'failing'
++    endif
++
+     # reftests fail when multiple windows open at the same time stealing the focus,
+     # so set is_parallel to false
+     test('reftest ' + testname, gtk_reftest,
+@@ -450,8 +462,7 @@ foreach testname : test_data
+                 'G_TEST_BUILDDIR=@0@'.format(meson.current_build_dir()),
+                 'REFTEST_MODULE_DIR=@0@'.format(meson.current_build_dir()),
+               ],
+-         suite: 'reftest',
+-         should_fail: somehow_broken.contains(testname),
++         suite: suites,
+          is_parallel: false)
+   endif
+ endforeach
--- a/debian/patches/testsuite-Don-t-create-.test-files-for-flaky-or-failing-t.patch
+++ b/debian/patches/testsuite-Don-t-create-.test-files-for-flaky-or-failing-t.patch
@@ -0,0 +1,93 @@
+From: Simon McVittie <smcv@debian.org>
+Date: Wed, 23 Nov 2022 21:26:50 +0000
+Subject: testsuite: Don't create .test files for flaky or failing tests
+
+These tests can be run manually, but are not suitable for use as an
+acceptance test, so let's not make frameworks like Debian's autopkgtest
+run these when they run ginsttest-runner in the most obvious way.
+
+a11ytests.test doesn't seem to be reliable enough to be used as a QA
+acceptance criterion, and has been disabled as a build-time test in both
+Gitlab-CI and Debian since 2019. a11ystate.test is not set up to be run
+at build time at all, and has been marked as flaky on ci.debian.net
+since 2018.
+
+The rest of the testsuite/a11y directory seems to have been
+reliable in practice, at least on ci.debian.net, so try leaving them
+enabled as installed-tests.
+
+In principle this could be made finer-grained by having a separate .test
+file and a separate Meson test() for each .ui file, but that would
+require more active maintenance of GTK 3.
+
+Signed-off-by: Simon McVittie <smcv@debian.org>
+---
+ testsuite/a11y/meson.build       | 7 ++++++-
+ testsuite/a11y/state/meson.build | 7 +++++--
+ testsuite/gtk/meson.build        | 7 ++++++-
+ 3 files changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/testsuite/a11y/meson.build b/testsuite/a11y/meson.build
+index ea6348c..2613e3c 100644
+--- a/testsuite/a11y/meson.build
+++ b/testsuite/a11y/meson.build
+@@ -181,7 +181,12 @@ installed_test_data = [
+ ]
+ 
+ a11y_installed_tests = [
+-  'a11ytests.test',
++  # This is the equivalent of a11y_state_tests above, and does not seem
++  # to be reliable enough to act as a QA gate in practice. We install the
++  # test executable and the data needed to run it, but don't hook it up
++  # to ginsttest-runner.
++  #'a11ytests.test',
++
+   'a11ychildren.test',
+   'a11ytree.test',
+   'a11yvalue.test',
+diff --git a/testsuite/a11y/state/meson.build b/testsuite/a11y/state/meson.build
+index 782649e..aa18a84 100644
+--- a/testsuite/a11y/state/meson.build
+++ b/testsuite/a11y/state/meson.build
+@@ -16,7 +16,10 @@ test_data = [
+ 
+ 
+ a11y_installed_tests = [
+-  'a11ystate.test',
++  # This is not run at build time at all, and consistently fails on
++  # Debian's CI infrastructure, so don't set it up to be run by
++  # ginsttest-runner.
++  #'a11ystate.test',
+ ]
+ 
+ if get_option('installed_tests')
+@@ -32,4 +35,4 @@ if get_option('installed_tests')
+   endforeach
+ 
+   install_data(test_data, install_dir: join_paths(installed_test_bindir, 'state'))
+-endif
+\ No newline at end of file
++endif
+diff --git a/testsuite/gtk/meson.build b/testsuite/gtk/meson.build
+index 137f93a..84a8cfc 100644
+--- a/testsuite/gtk/meson.build
+++ b/testsuite/gtk/meson.build
+@@ -146,6 +146,11 @@ endif
+ if get_option('installed_tests')
+   foreach t : tests
+     test_name = t.get(0)
++
++    if flaky.contains(test_name) or xfail.contains(test_name)
++      continue
++    endif
++
+     conf = configuration_data()
+     conf.set('testexecdir', installed_test_bindir)
+     conf.set('test', test_name)
+@@ -158,4 +163,4 @@ if get_option('installed_tests')
+   install_subdir('icons', install_dir: installed_test_bindir)
+   install_subdir('icons2', install_dir: installed_test_bindir)
+   install_subdir('ui', install_dir: installed_test_bindir)
+-endif
+\ No newline at end of file
++endif
--- a/debian/patches/testsuite-Try-enabling-a11y-tests-other-than-those-known-.patch
+++ b/debian/patches/testsuite-Try-enabling-a11y-tests-other-than-those-known-.patch
@@ -0,0 +1,29 @@
+From: Simon McVittie <smcv@debian.org>
+Date: Thu, 24 Nov 2022 12:09:12 +0000
+Subject: testsuite: Try enabling a11y tests,
+ other than those known to be unstable
+
+At least some of the tests implemented via the accessibility-dump
+executable are known to be unstable, but the tests based on separate
+executables (tree-performance.c, etc.) have been reasonably consistently
+passing on ci.debian.net for several years, so hopefully they are also
+reliable enough for upstream CI and we don't need to mark them as flaky?
+
+Signed-off-by: Simon McVittie <smcv@debian.org>
+---
+ testsuite/a11y/meson.build | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/testsuite/a11y/meson.build b/testsuite/a11y/meson.build
+index 85d0f5d..ea6348c 100644
+--- a/testsuite/a11y/meson.build
+++ b/testsuite/a11y/meson.build
+@@ -100,7 +100,7 @@ foreach t: a11y_tests
+                'G_TEST_BUILDDIR=@0@'.format(meson.current_build_dir()),
+                'GSETTINGS_SCHEMA_DIR=@0@'.format(gtk_schema_build_dir),
+              ],
+-        suite: ['a11y', 'flaky'])
++        suite: ['a11y'])
+ endforeach
+ 
+ installed_test_data = [
--- a/debian/run-tests.sh
+++ b/debian/run-tests.sh
@@ -45,7 +45,6 @@ for BACKEND in x11; do
        dbus-run-session -- \
            xvfb-run -a \
                dh_auto_test --builddirectory="$BUILDDIR" -- \
-                    --no-suite=gtk+-3.0:a11y \
                    "$@" \
    || touch "$test_data/tests-failed"

--- a/debian/tests/control
+++ b/debian/tests/control
@@ -5,7 +5,3 @@ Restrictions: allow-stderr, superficial
 Tests: installed-tests
 Depends: at-spi2-core, dbus-daemon, gnome-desktop-testing (>= 2018.1-1~), gtk-3-examples, librsvg2-common, xauth, xvfb
 Restrictions: allow-stderr
-
-Tests: installed-tests-a11ystate installed-tests-reftests
-Depends: at-spi2-core, dbus-daemon, gnome-desktop-testing (>= 2018.1-1~), gtk-3-examples, librsvg2-common, xauth, xvfb
-Restrictions: allow-stderr, flaky
--- a/debian/tests/installed-tests-a11ystate
+++ b/debian/tests/installed-tests-a11ystate
@@ -1,19 +0,0 @@
-#!/bin/sh
-# autopkgtest check: Run the installed-tests to verify GTK works correctly
-# Based on glib2.0's d/tests/installed-tests, (C) 2013 Canonical Ltd.
-
-set -e
-
-# Disable gvfs if it happens to be installed. We want to test the built-in
-# stuff
-export GIO_USE_VFS=local
-export GIO_USE_VOLUME_MONITOR=unix
-
-export XDG_RUNTIME_DIR="$AUTOPKGTEST_TMP"
-
-exec dbus-run-session -- \
-xvfb-run -a -s "-screen 0 1024x768x24" \
-gnome-desktop-testing-runner \
---report-directory="$AUTOPKGTEST_ARTIFACTS" \
---tap \
-"gtk+/a11ystate.test"
--- a/debian/tests/installed-tests-reftests
+++ b/debian/tests/installed-tests-reftests
@@ -1,19 +0,0 @@
-#!/bin/sh
-# autopkgtest check: Run the installed-tests to verify GTK works correctly
-# Based on glib2.0's d/tests/installed-tests, (C) 2013 Canonical Ltd.
-
-set -e
-
-# Disable gvfs if it happens to be installed. We want to test the built-in
-# stuff
-export GIO_USE_VFS=local
-export GIO_USE_VOLUME_MONITOR=unix
-
-export XDG_RUNTIME_DIR="$AUTOPKGTEST_TMP"
-
-exec dbus-run-session -- \
-xvfb-run -a -s "-screen 0 1024x768x24" \
-gnome-desktop-testing-runner \
---report-directory="$AUTOPKGTEST_ARTIFACTS" \
---tap \
-"gtk+/reftests"