diff options
author | Pascal Terjan <pterjan@mageia.org> | 2023-12-10 21:43:07 +0000 |
---|---|---|
committer | Pascal Terjan <pterjan@mageia.org> | 2023-12-11 23:45:04 +0000 |
commit | c53d21cf38263206af292b48d6369ee1662bd4be (patch) | |
tree | b402732dc7c0fe1268a6873791b8fc180508c202 | |
parent | 42466ac7d8bcd3fd1d1097d3dbf1f77cd1a2daa0 (diff) | |
download | iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar.gz iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar.bz2 iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar.xz iurt-c53d21cf38263206af292b48d6369ee1662bd4be.zip |
Limit retries in case of install_deps_failure
A new config option (backoff_delays) was added to ulri.
It contains a list of delays (in seconds) between retries and defaults to
[5*60, 30*60, 60*60, 120*60] (5m, 30m, 1h, 2h).
When reaching the end of the list, we fail permanently.
To never retry, set it to [].
To retry forever, set it to undef.
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | lib/Iurt/Queue.pm | 36 | ||||
-rw-r--r-- | t/queue.t | 49 | ||||
-rwxr-xr-x | ulri | 20 |
4 files changed, 100 insertions, 8 deletions
@@ -4,6 +4,9 @@ - iurt: Fix / of the chroot belonging to the user instead of root - iurt: Make generation of the chroot archive more atomic - iurt: Use the target directory when reference chroot needs to be updated +- ulri: Limit retries in case of install_deps_failure. A new config option + backoff_delays is a list of delays before retrying, and when reaching the + end of the list, we fail permanently. 0.8.2.2 - ulri: Fix a crash after build failures diff --git a/lib/Iurt/Queue.pm b/lib/Iurt/Queue.pm index 585294e..d877046 100644 --- a/lib/Iurt/Queue.pm +++ b/lib/Iurt/Queue.pm @@ -3,8 +3,9 @@ package Iurt::Queue; use base qw(Exporter); use File::Copy 'move'; use File::Path 'make_path'; +use File::stat 'stat'; use Iurt::Config qw(get_mandatory_arch get_target_arch); -use Iurt::File qw(read_line); +use Iurt::File qw(read_line create_file); use Iurt::Util qw(plog); use MDK::Common qw(cat_ find member partition); use strict; @@ -17,6 +18,7 @@ our @EXPORT = qw( load_lock_file_data record_bot_complete remove_bot_from_package + schedule_next_retry ); sub apply_to_upload_tree { @@ -234,6 +236,20 @@ sub get_upload_tree_state { $pkg_tree{$prefix}{deps} = \@deps; } + + if ($r =~ /(\d{14}\.\w+\.\w+\.\d+)_(.*)\.retry$/) { + my $prefix = $1; + my $arch = $2; + my $mtime = stat("$todo/$f/$m/$s/$r")->mtime; + my $nb_failures = cat_("$todo/$f/$m/$s/$r"); + plog('DEBUG', "$prefix failed $nb_failures times, last one was as " . 
localtime($mtime)); + if ($mtime < time) { + plog('INFO', "Too early to retry $prefix"); + $pkg_tree{$prefix}{media}{$media}{later}{$arch} = 1; + } else { + $pkg_tree{$prefix}{media}{$media}{retries}{arch}{nb_failures} = $nb_failures; + } + } } sub done_func { @@ -286,3 +302,21 @@ sub get_upload_tree_state { return %pkg_tree; } + +sub schedule_next_retry { + my ($config, $todo_dir, $prefix, $arch, $nb_failures) = @_; + + # If backoff_delays is not set, do nothing and retry forever + return 1 unless defined $config->{backoff_delays}; + + my $backoff_delays = $config->{backoff_delays}; + my $file = "$todo_dir/${prefix}_$arch.retry"; + create_file($file, $nb_failures+1); + if ($nb_failures >= scalar(@$backoff_delays)) { + plog('INFO', "$prefix failed too many times with a retriable error ($nb_failures)"); + return; + } + my $mtime = time + @$backoff_delays[$nb_failures]; + utime(time, $mtime, $file); + return 1; +} @@ -1,6 +1,11 @@ -use Test::More; use Iurt::Queue; +use Test::More; + +use Cwd; +use File::stat 'stat'; +use MDK::Common qw(cat_); + $config = {}; my $media = 'core/release'; @@ -57,4 +62,46 @@ $ent = create_ent(); remove_bot_from_package($ent, $media, 'h1', 2); is_deeply $ent->{media}{$media}{bot}, [$b1, $b3] or diag explain $ent->{media}{$media}{bot}; + +chdir 't' if -d 't'; +mkdir "tmp"; +my $dir = Cwd::cwd() . 
"/tmp"; +my $retry_file = "${dir}/test_noarch.retry"; +unlink $retry_file; + +sub verify_retry_file { + my ($test, $present, $content, $mtime) = @_; + if ($present) { + ok(-f $retry_file, "$test - $retry_file should exist"); + } else { + ok(!-f $retry_file, "$test - $retry_file should not exist"); + return; + } + is(cat_($retry_file), $content, "$test - $retry_file should contain $content"); + if ($mtime) { + my $t = stat($retry_file)->mtime; + # Allow 5s difference if running on a very slow machine + ok($t > $mtime - 5); + ok($t < $mtime + 5); + } +} + +unlink $retry_file; +ok(schedule_next_retry({'backoff_delays' => [1000, 2000]}, $dir, 'test', 'noarch', 0), "schedule_next_retry - first failure is retried"); +verify_retry_file("schedule_next_retry - first failure is retried", 1, 1, time+1000); + +unlink $retry_file; +ok(schedule_next_retry({'backoff_delays' => [1000, 2000]}, $dir, 'test', 'noarch', 1), "schedule_next_retry - one retry left is retried"); +verify_retry_file("schedule_next_retry - one retry left is retried", 1, 2, time+2000); + +unlink $retry_file; +ok(!schedule_next_retry({'backoff_delays' => [120, 1000]}, $dir, 'test', 'noarch', 2), "schedule_next_retry - no retry left is failed"); + +unlink $retry_file; +ok(!schedule_next_retry({'backoff_delays' => []}, $dir, 'test', 'noarch', 0), "schedule_next_retry - no retry is failed"); + +unlink $retry_file; +ok(schedule_next_retry({}, $dir, 'test', 'noarch', 0), "schedule_next_retry - always retry is retried"); +verify_retry_file("schedule_next_retry - always retry is retried", 0); + done_testing(); @@ -25,7 +25,7 @@ use Iurt::Config qw(config_usage get_date config_init get_author_email get_targe use Iurt::File qw(create_file); use Iurt::Mail qw(sendmail); use Iurt::Process qw(check_pid); -use Iurt::Queue qw(check_if_mandatory_arch_failed cleanup_failed_build get_upload_tree_state load_lock_file_data record_bot_complete); +use Iurt::Queue qw(check_if_mandatory_arch_failed cleanup_failed_build 
get_upload_tree_state load_lock_file_data record_bot_complete schedule_next_retry); use Iurt::RPM qw(check_arch check_noarch); use Iurt::Util qw(plog_init plog ssh_setup ssh sout sget sput); use Iurt::Ulri qw(build_package fetch_logs_and_cleanup warn_about_failure); @@ -169,6 +169,10 @@ my %config_usage = ( default => [ 'i586', 'x86_64' ], }, }, + 'backoff_delays' => { + desc => 'List of delays in seconds before retrying retriable errors. Error becomes permanent after reaching the end of the list.', + default => [5*60, 30*60, 60*60, 120*60] + }, ); config_usage(\%config_usage, $config) if $run{config_usage}; config_init(\%config_usage, $config, \%run); @@ -227,7 +231,6 @@ foreach my $prefix (keys %pkg_tree) { # TODO: Make this parallel plog('MSG', "check build bot results"); -my %later; my $something_finished; foreach my $prefix (keys %pkg_tree) { my $ent = $pkg_tree{$prefix}; @@ -324,8 +327,13 @@ foreach my $prefix (keys %pkg_tree) { plog('DEBUG', $res); if ($r eq 'install_deps_failure') { plog('FAIL', "install deps failure, rebuild later: $p"); - $later{$prefix} = 1; - $later = 1; + if (schedule_next_retry($config, $todo_dir, $prefix, $arch, $pkg_tree{$prefix}{media}{$media}{retries}{arch}{nb_failures})) { + $later = 1; + $pkg_tree{$prefix}{media}{$media}{later}{$arch} = 1; + } else { + plog('FAIL', "Too many retries due to install_deps_failure: $p"); + $fail = 1; + } } if ($r ne 'ok') { plog('FAIL', "$r: $p"); @@ -423,8 +431,6 @@ my %to_compile; # crash or just lock ulri somehow foreach my $prefix (sort keys %pkg_tree) { - next if $later{$prefix}; - my $ent = $pkg_tree{$prefix}; my $ready = 1; @@ -491,6 +497,8 @@ foreach my $prefix (sort keys %pkg_tree) { # need to find a bot for each arch foreach my $arch (@$arch_list) { + next if $pkg_tree{$prefix}{media}{$media}{later}{$arch}; + # Skip this arch if the package is already building for it or if it # should not be built on this arch or it has already failed or # succeeded. |