aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPascal Terjan <pterjan@mageia.org>2023-12-10 21:43:07 +0000
committerPascal Terjan <pterjan@mageia.org>2023-12-11 23:45:04 +0000
commitc53d21cf38263206af292b48d6369ee1662bd4be (patch)
treeb402732dc7c0fe1268a6873791b8fc180508c202
parent42466ac7d8bcd3fd1d1097d3dbf1f77cd1a2daa0 (diff)
downloadiurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar
iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar.gz
iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar.bz2
iurt-c53d21cf38263206af292b48d6369ee1662bd4be.tar.xz
iurt-c53d21cf38263206af292b48d6369ee1662bd4be.zip
Limit retries in case of install_deps_failure
A new config option (backoff_delays) was added to ulri. It contains a list of delays (in seconds) between retries and defaults to [5*60, 30*60, 60*60, 120*60] (5m, 30m, 1h, 2h). When the end of the list is reached, the build fails permanently. To never retry, set it to []. To retry forever, set it to undef.
-rw-r--r--NEWS3
-rw-r--r--lib/Iurt/Queue.pm36
-rw-r--r--t/queue.t49
-rwxr-xr-xulri20
4 files changed, 100 insertions, 8 deletions
diff --git a/NEWS b/NEWS
index e76243e..4254478 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,9 @@
- iurt: Fix / of the chroot belonging to the user instead of root
- iurt: Make generation of the chroot archive more atomic
- iurt: Use the target directory when reference chroot needs to be updated
+- ulri: Limit retries in case of install_deps_failure. A new config option
+ backoff_delays is a list of delays before retrying, and when reaching the
+ end of the list, we fail permanently.
0.8.2.2
- ulri: Fix a crash after build failures
diff --git a/lib/Iurt/Queue.pm b/lib/Iurt/Queue.pm
index 585294e..d877046 100644
--- a/lib/Iurt/Queue.pm
+++ b/lib/Iurt/Queue.pm
@@ -3,8 +3,9 @@ package Iurt::Queue;
use base qw(Exporter);
use File::Copy 'move';
use File::Path 'make_path';
+use File::stat 'stat';
use Iurt::Config qw(get_mandatory_arch get_target_arch);
-use Iurt::File qw(read_line);
+use Iurt::File qw(read_line create_file);
use Iurt::Util qw(plog);
use MDK::Common qw(cat_ find member partition);
use strict;
@@ -17,6 +18,7 @@ our @EXPORT = qw(
load_lock_file_data
record_bot_complete
remove_bot_from_package
+ schedule_next_retry
);
sub apply_to_upload_tree {
@@ -234,6 +236,20 @@ sub get_upload_tree_state {
$pkg_tree{$prefix}{deps} = \@deps;
}
+
+ if ($r =~ /(\d{14}\.\w+\.\w+\.\d+)_(.*)\.retry$/) {
+ my $prefix = $1;
+ my $arch = $2;
+ my $mtime = stat("$todo/$f/$m/$s/$r")->mtime;
+ my $nb_failures = cat_("$todo/$f/$m/$s/$r");
+ plog('DEBUG', "$prefix failed $nb_failures times, last one was as " . localtime($mtime));
+ if ($mtime < time) {
+ plog('INFO', "Too early to retry $prefix");
+ $pkg_tree{$prefix}{media}{$media}{later}{$arch} = 1;
+ } else {
+ $pkg_tree{$prefix}{media}{$media}{retries}{arch}{nb_failures} = $nb_failures;
+ }
+ }
}
sub done_func {
@@ -286,3 +302,21 @@ sub get_upload_tree_state {
return %pkg_tree;
}
+
+sub schedule_next_retry {
+ my ($config, $todo_dir, $prefix, $arch, $nb_failures) = @_;
+
+ # If backoff_delays is not set, do nothing and retry forever
+ return 1 unless defined $config->{backoff_delays};
+
+ my $backoff_delays = $config->{backoff_delays};
+ my $file = "$todo_dir/${prefix}_$arch.retry";
+ create_file($file, $nb_failures+1);
+ if ($nb_failures >= scalar(@$backoff_delays)) {
+ plog('INFO', "$prefix failed too many times with a retriable error ($nb_failures)");
+ return;
+ }
+ my $mtime = time + @$backoff_delays[$nb_failures];
+ utime(time, $mtime, $file);
+ return 1;
+}
diff --git a/t/queue.t b/t/queue.t
index d9f8056..b480e67 100644
--- a/t/queue.t
+++ b/t/queue.t
@@ -1,6 +1,11 @@
-use Test::More;
use Iurt::Queue;
+use Test::More;
+
+use Cwd;
+use File::stat 'stat';
+use MDK::Common qw(cat_);
+
$config = {};
my $media = 'core/release';
@@ -57,4 +62,46 @@ $ent = create_ent();
remove_bot_from_package($ent, $media, 'h1', 2);
is_deeply $ent->{media}{$media}{bot}, [$b1, $b3] or diag explain $ent->{media}{$media}{bot};
+
+chdir 't' if -d 't';
+mkdir "tmp";
+my $dir = Cwd::cwd() . "/tmp";
+my $retry_file = "${dir}/test_noarch.retry";
+unlink $retry_file;
+
+sub verify_retry_file {
+ my ($test, $present, $content, $mtime) = @_;
+ if ($present) {
+ ok(-f $retry_file, "$test - $retry_file should exist");
+ } else {
+ ok(!-f $retry_file, "$test - $retry_file should not exist");
+ return;
+ }
+ is(cat_($retry_file), $content, "$test - $retry_file should contain $content");
+ if ($mtime) {
+ my $t = stat($retry_file)->mtime;
+ # Allow 5s difference if running on a very slow machine
+ ok($t > $mtime - 5);
+ ok($t < $mtime + 5);
+ }
+}
+
+unlink $retry_file;
+ok(schedule_next_retry({'backoff_delays' => [1000, 2000]}, $dir, 'test', 'noarch', 0), "schedule_next_retry - first failure is retried");
+verify_retry_file("schedule_next_retry - first failure is retried", 1, 1, time+1000);
+
+unlink $retry_file;
+ok(schedule_next_retry({'backoff_delays' => [1000, 2000]}, $dir, 'test', 'noarch', 1), "schedule_next_retry - one retry left is retried");
+verify_retry_file("schedule_next_retry - one retry left is retried", 1, 2, time+2000);
+
+unlink $retry_file;
+ok(!schedule_next_retry({'backoff_delays' => [120, 1000]}, $dir, 'test', 'noarch', 2), "schedule_next_retry - no retry left is failed");
+
+unlink $retry_file;
+ok(!schedule_next_retry({'backoff_delays' => []}, $dir, 'test', 'noarch', 0), "schedule_next_retry - no retry is failed");
+
+unlink $retry_file;
+ok(schedule_next_retry({}, $dir, 'test', 'noarch', 0), "schedule_next_retry - always retry is retried");
+verify_retry_file("schedule_next_retry - always retry is retried", 0);
+
done_testing();
diff --git a/ulri b/ulri
index b814708..553fbd9 100755
--- a/ulri
+++ b/ulri
@@ -25,7 +25,7 @@ use Iurt::Config qw(config_usage get_date config_init get_author_email get_targe
use Iurt::File qw(create_file);
use Iurt::Mail qw(sendmail);
use Iurt::Process qw(check_pid);
-use Iurt::Queue qw(check_if_mandatory_arch_failed cleanup_failed_build get_upload_tree_state load_lock_file_data record_bot_complete);
+use Iurt::Queue qw(check_if_mandatory_arch_failed cleanup_failed_build get_upload_tree_state load_lock_file_data record_bot_complete schedule_next_retry);
use Iurt::RPM qw(check_arch check_noarch);
use Iurt::Util qw(plog_init plog ssh_setup ssh sout sget sput);
use Iurt::Ulri qw(build_package fetch_logs_and_cleanup warn_about_failure);
@@ -169,6 +169,10 @@ my %config_usage = (
default => [ 'i586', 'x86_64' ],
},
},
+ 'backoff_delays' => {
+ desc => 'List of delays in seconds before retrying retriable errors. Error becomes permanent after reaching the end of the list.',
+ default => [5*60, 30*60, 60*60, 120*60]
+ },
);
config_usage(\%config_usage, $config) if $run{config_usage};
config_init(\%config_usage, $config, \%run);
@@ -227,7 +231,6 @@ foreach my $prefix (keys %pkg_tree) {
# TODO: Make this parallel
plog('MSG', "check build bot results");
-my %later;
my $something_finished;
foreach my $prefix (keys %pkg_tree) {
my $ent = $pkg_tree{$prefix};
@@ -324,8 +327,13 @@ foreach my $prefix (keys %pkg_tree) {
plog('DEBUG', $res);
if ($r eq 'install_deps_failure') {
plog('FAIL', "install deps failure, rebuild later: $p");
- $later{$prefix} = 1;
- $later = 1;
+ if (schedule_next_retry($config, $todo_dir, $prefix, $arch, $pkg_tree{$prefix}{media}{$media}{retries}{arch}{nb_failures})) {
+ $later = 1;
+ $pkg_tree{$prefix}{media}{$media}{later}{$arch} = 1;
+ } else {
+ plog('FAIL', "Too many retries due to install_deps_failure: $p");
+ $fail = 1;
+ }
}
if ($r ne 'ok') {
plog('FAIL', "$r: $p");
@@ -423,8 +431,6 @@ my %to_compile;
# crash or just lock ulri somehow
foreach my $prefix (sort keys %pkg_tree) {
- next if $later{$prefix};
-
my $ent = $pkg_tree{$prefix};
my $ready = 1;
@@ -491,6 +497,8 @@ foreach my $prefix (sort keys %pkg_tree) {
# need to find a bot for each arch
foreach my $arch (@$arch_list) {
+ next if $pkg_tree{$prefix}{media}{$media}{later}{$arch};
+
# Skip this arch if the package is already building for it or if it
# should not be built on this arch or it has already failed or
# succeeded.