#!/usr/bin/perl # # Copyright (C) 2005,2006 Mandriva # # Author: Florent Villard # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # compare and rebuild packages on different architecture # # TODO # # - create a configuration file to handle the various iurt running # - get the content of the rebuild dir # - run as many iurt as machines are available and gather results # - the scheduler just take the results, launch new rebuild, and quit # - use perl ssh and try to workarround the non working timeout when the # remote machine is stalled # - use submitter as packager, not generic name # use strict; use MDK::Common qw(any cat_ if_ find); use Iurt::Config qw(config_usage get_date config_init get_author_email check_arch check_noarch get_target_arch); use Iurt::File qw(create_file); use Iurt::Process qw(check_pid); use Iurt::Queue qw(check_if_mandatory_arch_failed cleanup_failed_build get_upload_tree_state); use Iurt::Mail qw(sendmail); use Iurt::Util qw(plog_init plog ssh_setup ssh sout sget sput); use Iurt::Ulri qw(build_package warn_about_failure); use File::Copy 'move'; use File::Path 'make_path'; use File::Temp 'mktemp'; use Filesys::Df qw(df); use Data::Dumper; use File::Slurp qw(read_file); my %run; my $program_name = 'ulri'; $run{program_name} = $program_name; my $LOG; if (!$ENV{ULRI_LOG_FILE} || !open($LOG, '>>', $ENV{ULRI_LOG_FILE})) { open($LOG, ">&STDERR"); } plog_init($program_name, $LOG, 7, 1); my $HOME = $ENV{HOME}; my $configfile = "$HOME/.upload.conf"; my $sysconfigfile = "/etc/iurt/upload.conf"; my $config = {}; foreach my $f ($configfile, $sysconfigfile) { plog('DEBUG', "load config: $f"); if (-f $f) { $config = eval(cat_($f)) or die "FATAL $program_name: syntax error in $f"; last; } } my %config_usage = ( admin => { desc => 'mail address of the bot administrator', default => 'distrib-admin@mandrivalinux.org' }, 'arch_translation' => { desc => "Renaming of arch", default => { 'sparc64' => 'sparcv9' } }, bot => { desc => "List of bot able to compile the packages", default => { i586 => { localhost => { iurt => { user => 'builder', command => qq(iurt --copy_srpm --group --config local_spool /home/builder/iurt/__DIR__ --no_rsync --chrooted-urpmi -m __MEDIA__ -- http://localhost/distrib/ -p "__PACKAGER__" -r __TARGET__ __ARCH__), packages => '/home/builder/iurt/', } , }, }, }, }, media => { desc => 'Corresponding media to add given the current media', default => { default => { "core/backports" => [ "core/backports", "core/release", "core/updates" ], "core/backports_testing" => [ "core/backports", "core/backports_testing", "core/release", "core/updates" ], "core/release" => [ "core/release" ], "core/updates" => [ "core/release", "core/updates" ], "core/updates_testing" => [ "core/release", "core/updates", "core/updates_testing" ], "nonfree/backports" => [ "core/backports", "core/release", "core/updates", "nonfree/backports", "nonfree/release", "nonfree/updates" ], "nonfree/backports_testing" => [ "core/backports", "core/backports_testing", "core/release", "core/updates", "nonfree/backports", "nonfree/backports_testing", "nonfree/release", "nonfree/updates" ], "nonfree/release" => [ "core/release", "nonfree/release" ], "nonfree/updates" => [ "core/release", "core/updates", "nonfree/release", "nonfree/updates" ], "nonfree/updates_testing" => [ "core/release", "core/updates", "core/updates_testing", "nonfree/release", "nonfree/updates", "nonfree/updates_testing" ], "tainted/backports" => [ "core/backports", "core/release", "core/updates", "tainted/backports", "tainted/release", "tainted/updates" ], "tainted/backports_testing" => [ "core/backports", "core/backports_testing", "core/release", "core/updates", "tainted/backports", "tainted/backports_testing", "tainted/release", "tainted/updates" ], "tainted/release" => [ "core/release", "tainted/release" ], "tainted/updates" => [ "core/release", "core/updates", "tainted/release", "tainted/updates" ], "tainted/updates_testing" => [ "core/release", "core/updates", "core/updates_testing", "tainted/release", "tainted/updates", "tainted/updates_testing" ], }, }, }, faildelay => { desc => "Time after which the rebuild is considered as a failure", default => 36000 }, http_queue => { desc => 'Address where log can be consulted', default => 'http://kenobi.mandriva.com/queue ' }, queue => { desc => "Root of the tree where the packages to compile are located", default => "$HOME/uploads" }, tmp => { desc => "Temporary directory", default => "$HOME/tmp" }, ssh_options => { desc => "SSH options", default => "-o ConnectTimeout=5 -o BatchMode=yes -o ServerAliveInterval=5" }, packager => { desc => 'Default packager tag user by bot', default => 'Mageia Team ' }, 'arch' => { desc => 'Architectures list for each target', default => { default => [ 'i586', 'x86_64' ], }, }, ); config_usage(\%config_usage, $config) if $run{config_usage}; config_init(\%config_usage, $config, \%run); my %untranslated_arch; foreach my $k (keys %{$config->{arch_translation}}) { my $v = $config->{arch_translation}{$k}; push @{$untranslated_arch{$v}}, $k; } $run{pidfile_home} = $config->{tmp}; $run{pidfile} = $program_name; my $pidfile = check_pid(\%run); my ($fulldate, $daydate) = get_date(); $run{daydate} = $daydate; my $df = df $config->{queue}; if ($df->{per} == 100) { # FIXME should send a mail too die "FATAL $program_name: not enough space on the filesystem, only $df->{bavail} KB on $config->{queue}, full at $df->{per}%"; } ($fulldate, $daydate) = get_date(); my $todo = "$config->{queue}/todo"; my $failure = "$config->{queue}/failure"; my $done = "$config->{queue}/done"; my $reject = "$config->{queue}/reject"; # # Part 0: gather data from upload tree # plog('MSG', "check uploads tree"); plog('DEBUG', "input queue is $todo"); my %pkg_tree = get_upload_tree_state($config); # Load bot status # foreach my $prefix (keys %pkg_tree) { my $ent = $pkg_tree{$prefix}; foreach my $media (keys %{$ent->{media}}) { foreach my $bot (@{$ent->{media}{$media}{bot}}) { $run{bot}{$bot->{host}}{$bot->{bot}} = $prefix; } } } # # Part 1: get results from finished builds # # TODO: Make this parallel plog('MSG', "check build bot results"); my %later; my $something_finished; foreach my $prefix (keys %pkg_tree) { my $ent = $pkg_tree{$prefix}; foreach my $media (keys %{$ent->{media}}) { my $path = $ent->{media}{$media}{path}; my $user = $ent->{user}; # Local pathnames my $done_dir = "$done/$path"; my $todo_dir = "$todo/$path"; my $fail_dir = "$failure/$path"; bot: foreach my $bot_list (@{$ent->{media}{$media}{bot}}) { my ($bot, $host, $date, $pid, $arch, $time) = @$bot_list{qw(bot host date pid arch time)}; my $bot_conf = $config->{bot}{$arch}{$host}{$bot}; my $remote = ssh_setup($config->{ssh_options}, $bot_conf->{user}, $host); my $prefix_dir = "$bot_conf->{packages}/$path/$prefix-$arch/"; # If our build is noarch, set arch appropriately. # my $lock_file = "$todo_dir/${prefix}_$arch-noarch.$bot.$host.$date.$pid.lock"; if (-f $lock_file) { plog('DEBUG', "$prefix is noarch"); $arch = "noarch"; } else { $lock_file =~ s/-noarch//; } my $status_file = "$prefix_dir/log/status.log"; plog('INFO', "check status: $host/$arch ($bot [$pid])"); my $seconds = time()-$time; my $timeout = $seconds > $config->{faildelay}; my $status = sout($remote, "cat $status_file"); if ($? == 255) { plog('WARN', "failed to get status for $host/$arch"); # If we are already out of time, we should fail anyway next bot unless $timeout; } my $proc_state; # If we don't get a status, the build should be in progress if (!$status) { # Need to differenciate process not running with failure to ssh chomp($proc_state = sout($remote, "ps h -o state $pid || echo NOT_RUNNING")); # Reasons for failure my $zombie = $proc_state eq 'Z'; my $ended = $proc_state eq 'NOT_RUNNING'; unless ($timeout || $zombie || $ended) { # Everything is fine, build is continuing! # Kill it if that package had failed on a mandatory arch if (check_if_mandatory_arch_failed($media, $prefix, $ent, $config)) { ssh($remote, "kill -14 $pid"); } next bot; } if ($timeout) { plog('FAIL', "$bot timed out on $host/$arch ($seconds sec), killing it"); ssh($remote, "kill -14 $pid"); # Give it some time to die/cleanup if ($seconds < 1.02 * $config->{faildelay}) { next bot; } plog('FAIL', "$bot didn't die after timing out on $host/$arch, removing lock"); } else { plog('FAIL', "$bot died on $host/$arch (status $proc_state), removing lock"); } $pkg_tree{$prefix}{media}{$media}{arch}{$arch} = 0; } # Either we are done or we should kill the build plog('INFO', "delete lock file for $prefix"); unlink $lock_file; $run{bot}{$host}{$bot} = 0; if (!$status) { # TODO: fetch/clean the logs next bot; } my $fail; my $later; # Check if the build bot finished on the other side # plog('INFO', "check result: $host/$arch ($bot [$pid])"); foreach my $res (split "\n", $status) { my ($p, $r) = $res =~ /(.*):\s+(.*)/; plog('DEBUG', $res); if ($r eq 'install_deps_failure') { plog('FAIL', "install deps failure, rebuild later: $p"); $later{$prefix} = 1; $later = 1; # TODO: fetch/clean the logs } if ($r ne 'ok') { plog('FAIL', "$r: $p"); $fail = 1; } } if (!$fail && !$later) { my @list = split "\n", sout($remote, "ls $prefix_dir"); my $error; my $done; my $arch_check = join '|', $arch, if_($untranslated_arch{$arch}, @{$untranslated_arch{$arch}}); plog('MSG', "checking for $arch_check arch"); foreach my $result (@list) { $result =~ /\.(src|$arch_check|noarch)\.rpm$/ or next; # do not copy the initial src package $result =~ /^$prefix/ and next; my $result_file = "$done_dir/${prefix}_$result"; plog('OK', "build ok: $result"); if ($result =~ /\.$arch_check\.rpm$/) { $done = 1; } plog('DEBUG', "copy files to done"); make_path($done_dir); if (sget($remote, "$prefix_dir/$result", "$result_file.new")) { plog('ERROR', "copying $result from $host failed ($!)"); $error = 1; last; } if (!move("$result_file.new", $result_file)) { $error = 1; last; } # Add the package to the list of built ones, in case we fail another arch and need to cleanup push @{$ent->{rpms}}, $result_file; } next if $error; if ($done) { if (check_if_mandatory_arch_failed($media, $prefix, $ent, $config)) { # Discard this arch as another mandatory one failed cleanup_failed_build($todo_dir, $done_dir, $fail_dir, $prefix, $ent, $media, $arch, $config); } else { create_file("$done_dir/${prefix}_$arch.done", "$bot $host"); $pkg_tree{$prefix}{media}{$media}{done_arch}{$arch} = 1; make_path("$done_dir/$prefix"); sget($remote, "$prefix_dir/log/*", "$done_dir/$prefix"); $something_finished = 1; } # Either we already fetched the success logs, or don't care # as this success was discarded due to another failure. ssh($remote, "rm -rf $prefix_dir"); next bot; } } make_path($fail_dir); mkdir("$fail_dir/$prefix"); if (sget($remote, "$prefix_dir/*", "$fail_dir/$prefix")) { plog('ERROR', "copying from $host:$prefix_dir/ " . "to $fail_dir/ failed ($!)"); $pkg_tree{$prefix}{media}{$media}{arch}{$arch} = 0; } # clean the log on the compilation machine ssh($remote, "rm -rf $prefix_dir"); # We got the logs but want to retry so don't record a failure next bot if $later; cleanup_failed_build($todo_dir, $done_dir, $fail_dir, $prefix, $ent, $media, $arch, $config); plog('FAIL', "build failed"); create_file("$done_dir/${prefix}_$arch.fail", "$bot $host"); $pkg_tree{$prefix}{media}{$media}{failed_arch}{$arch} = 1; # Notify user if build failed if ($user) { warn_about_failure($config, $user, $ent, $arch, $fail_dir, $path, $prefix); } } # end bot } # end path } # end prefix # # Part 2: check queue and start new jobs if a bot is available # plog('MSG', "launching new compilations"); my %to_compile; # do not sort the keys to be able to ignore packages which makes iurt # crash or just lock ulri somehow foreach my $prefix (sort keys %pkg_tree) { next if $later{$prefix}; my $ent = $pkg_tree{$prefix}; my $ready = 1; my $failed_dep = 0; foreach my $dep (@{$ent->{deps}}) { if (glob "$done/*/*/*/$dep.upload") { plog('DEBUG', "Dependent build $dep was uploaded"); next; } # $dep was not uploaded yet, so it's too early to build this one plog('DEBUG', "Dependent build $dep not ready ($prefix)"); $ready = 0; if (glob("$reject/*/*/*/$dep.youri") || glob("$done/*/*/*/${dep}_*.fail")) { plog('ERROR', "Dependent build $dep has failed"); $failed_dep = 1; } } if ($failed_dep) { plog('DEBUG', "Dependent build(s) failed, rejecting this one"); foreach my $media (keys %{$ent->{media}}) { my $path = $ent->{media}{$media}{path}; make_path("$reject/$path"); foreach my $srpm (@{$ent->{srpms}}) { move("$todo/$path/${prefix}_$srpm", "$reject/$path/${prefix}_$srpm"); } } next; } next unless $ready; plog('DEBUG', "No missing dependent build for $prefix"); foreach my $media (keys %{$ent->{media}}) { my $path = $ent->{media}{$media}{path}; my $todo_dir = "$config->{queue}/todo/$path"; my $target = $ent->{target}; my $srpms = $ent->{srpms} or next; #plog('DEBUG', "searching a bot to compile @$srpms"); # count noarch todos only once even if searching multiple bots my $noarch_countflag = 0; # If all packages in a group are noarch, consider the entire group # as noarch my $noarch = 1; $noarch = 0 if any { !check_noarch("$todo_dir/${prefix}_$_") } @$srpms; # Skip this media if the package is already building as noarch for it or # already done. next if $pkg_tree{$prefix}{media}{$media}{arch}{noarch}; next if $pkg_tree{$prefix}{media}{$media}{done_arch}{noarch}; # Only build noarch packages on mandatory architectures # Other architectures may not be up to date my $arch_list; if ($noarch) { $arch_list = get_mandatory_arch($config, $target); } else { $arch_list = get_target_arch($config, $target); } # need to find a bot for each arch foreach my $arch (@$arch_list) { # Skip this arch if the package is already building for it or if it # should not be built on this arch or it has already failed or # succeeded. next if $pkg_tree{$prefix}{media}{$media}{arch}{$arch}; next if $pkg_tree{$prefix}{media}{$media}{arch}{noarch}; next if $pkg_tree{$prefix}{media}{$media}{done_arch}{$arch}; next if $pkg_tree{$prefix}{media}{$media}{excluded_arch}{$arch}; next if $pkg_tree{$prefix}{media}{$media}{failed_arch}{$arch}; my $excluded = any { !check_arch("$todo_dir/${prefix}_$_", $arch) } @$srpms; if ($excluded) { plog('WARN', "excluding from $arch: $prefix"); my $done_dir = "$done/$path"; make_path($done_dir); create_file("$done_dir/${prefix}_$arch.excluded", "ulri $arch excluded"); $pkg_tree{$prefix}{media}{$media}{excluded_arch}{$arch} = 1; next; } if ($noarch) { plog('DEBUG', "search any bot for @$srpms") unless $noarch_countflag; } else { plog('DEBUG', "search $arch bot for @$srpms"); } hosts: foreach my $host (keys %{$config->{bot}{$arch}}) { foreach my $bot (keys %{$config->{bot}{$arch}{$host}}) { next if $run{bot}{$host}{$bot}; my $pid = build_package($config, \%pkg_tree, $media, $prefix, $host, $arch, $bot); if ($pid) { # Register that the package is building $run{bot}{$host}{$bot} = $prefix; $pkg_tree{$prefix}{media}{$media}{arch}{$noarch ? 'noarch' : $arch} = 1; my $lock_arch = $noarch ? "$arch-noarch" : $arch; my $lock_file = "$todo_dir/${prefix}_" . "$lock_arch.$bot.$host.$fulldate.$pid.lock"; plog('DEBUG', "create lock $lock_file"); create_file($lock_file, "$program_name $$", time()); last hosts; } } } # Count packages to compile for each architecture. Count noarch # package only once. # $arch = 'noarch' if $noarch; unless ($pkg_tree{$prefix}{media}{$media}{arch}{$arch}) { $to_compile{$arch}++ if !($noarch && $noarch_countflag); } $noarch_countflag = 1 if $noarch; } } } plog('MSG', "Current status"); my %build_list; foreach my $prefix (keys %pkg_tree) { my $ent = $pkg_tree{$prefix}; foreach my $media (keys %{$ent->{media}}) { foreach my $bot (@{$ent->{media}{$media}{bot}}) { my $host = $bot->{host}; my $arch = $bot->{arch}; push @{$build_list{"$host/$arch"}}, $prefix; } } } if (keys %build_list) { plog('INFO', "currently building:"); map { plog('INFO', " $_: " . join('', @{$build_list{$_}})) } sort keys %build_list; } plog('INFO', "jobs in queue:", %to_compile ? map { sprintf("%s(%d)", $_, $to_compile{$_}) } sort keys %to_compile : "none"); unlink $pidfile; exec "emi" if $something_finished; __END__ # ulri ends here Discussion ---------- 20060802 (Warly) * I prefer creating a separate scheduler, so that it can eventually call other bots. * bots should be able to take packages by themselves. * Iurt will perform several checks, they have to be standard and usable by the maintainer, the results must be put in a visible directory or path * We can put packages either in a dir or to prefix all files with the date and uploader. Having all files in a dir will make the listing simpler. Prefixing the files could be problematic if we base the rpm name and version parsing on the filename. * ulri knows the prefix, he could ask iurt to put the packages in a dir with the same prefix. 20060806 (Warly) * All the packages are put in done, then the final youri is run to put them in queue/ 20061104 (claudio) * Instead if having configuration defaults for our environment and using ulri with the defaults, it would be nicer to have minimalistic/generic defaults and install a configuration file in kenobi * Ulri's configuration file could be renamed to .ulri.conf instead of .upload.conf. ==> ok, it's also used by emi