From 841b5d3961f31277c424a4432fc51f0ac4bf093f Mon Sep 17 00:00:00 2001 From: "mkanat%bugzilla.org" <> Date: Sun, 6 Sep 2009 22:45:51 +0000 Subject: Bug 176002: Move duplicate statistics into the db Patch by Max Kanat-Alexander r=LpSolit, a=LpSolit --- Bugzilla/DB/Schema.pm | 3 +- Bugzilla/Install/DB.pm | 4 + Bugzilla/Install/Filesystem.pm | 7 +- collectstats.pl | 80 --------- duplicates.cgi | 182 +++++++++------------ template/en/default/global/user-error.html.tmpl | 25 --- .../en/default/reports/duplicates-table.html.tmpl | 7 +- 7 files changed, 93 insertions(+), 215 deletions(-) diff --git a/Bugzilla/DB/Schema.pm b/Bugzilla/DB/Schema.pm index 583b49352..5e8c49ccf 100644 --- a/Bugzilla/DB/Schema.pm +++ b/Bugzilla/DB/Schema.pm @@ -352,7 +352,7 @@ use constant ABSTRACT_SCHEMA => { fieldid => {TYPE => 'INT3', NOTNULL => 1, REFERENCES => {TABLE => 'fielddefs', COLUMN => 'id'}}, - added => {TYPE => 'TINYTEXT'}, + added => {TYPE => 'varchar(255)'}, removed => {TYPE => 'TINYTEXT'}, ], INDEXES => [ @@ -360,6 +360,7 @@ use constant ABSTRACT_SCHEMA => { bugs_activity_who_idx => ['who'], bugs_activity_bug_when_idx => ['bug_when'], bugs_activity_fieldid_idx => ['fieldid'], + bugs_activity_added_idx => ['added'], ], }, diff --git a/Bugzilla/Install/DB.pm b/Bugzilla/Install/DB.pm index dd4b2fe1d..d39d1a770 100644 --- a/Bugzilla/Install/DB.pm +++ b/Bugzilla/Install/DB.pm @@ -574,6 +574,10 @@ sub update_table_definitions { _convert_disallownew_to_isactive(); + $dbh->bz_alter_column('bugs_activity', 'added', + { TYPE => 'varchar(255)' }); + $dbh->bz_add_index('bugs_activity', 'bugs_activity_added_idx', ['added']); + ################################################################ # New --TABLE-- changes should go *** A B O V E *** this point # ################################################################ diff --git a/Bugzilla/Install/Filesystem.pm b/Bugzilla/Install/Filesystem.pm index fe9783805..763fd85bf 100644 --- a/Bugzilla/Install/Filesystem.pm +++ b/Bugzilla/Install/Filesystem.pm @@ -157,8 +157,6 @@ sub FILESYSTEM { # Readable directories "$datadir/mining" => { files => $ws_readable, dirs => $ws_dir_readable }, - "$datadir/duplicates" => { files => $ws_readable, - dirs => $ws_dir_readable }, "$libdir/Bugzilla" => { files => $ws_readable, dirs => $ws_dir_readable }, $extlib => { files => $ws_readable, @@ -379,6 +377,11 @@ EOT unlink "$datadir/duplicates.rdf"; unlink "$datadir/duplicates-old.rdf"; } + + if (-e "$datadir/duplicates") { + print "Removing duplicates directory...\n"; + rmtree("$datadir/duplicates"); + } } # A simple helper for creating "empty" CSS files. diff --git a/collectstats.pl b/collectstats.pl index e550c1613..5f9b4eef3 100755 --- a/collectstats.pl +++ b/collectstats.pl @@ -33,12 +33,9 @@ use strict; use lib qw(. lib); -use AnyDBM_File; -use IO::Handle; use List::Util qw(first); use Cwd; - use Bugzilla; use Bugzilla::Constants; use Bugzilla::Error; @@ -160,8 +157,6 @@ my $tend = time; # Uncomment the following line for performance testing. #print "Total time taken " . delta_time($tstart, $tend) . "\n"; -&calculate_dupes(); - CollectSeriesData(); sub check_data_dir { @@ -299,81 +294,6 @@ sub get_old_data { return @data; } -sub calculate_dupes { - my $dbh = Bugzilla->dbh; - my $rows = $dbh->selectall_arrayref("SELECT dupe_of, dupe FROM duplicates"); - - my %dupes; - my %count; - my $key; - my $changed = 1; - - my $today = &today_dash; - - # Save % count here in a date-named file - # so we can read it back in to do changed counters - # First, delete it if it exists, so we don't add to the contents of an old file - my $datadir = bz_locations()->{'datadir'}; - - if (my @files = <$datadir/duplicates/dupes$today*>) { - map { trick_taint($_) } @files; - unlink @files; - } - - dbmopen(%count, "$datadir/duplicates/dupes$today", 0644) || die "Can't open DBM dupes file: $!"; - - # Create a hash with key "a bug number", value "bug which that bug is a - # direct dupe of" - straight from the duplicates table. - foreach my $row (@$rows) { - my ($dupe_of, $dupe) = @$row; - $dupes{$dupe} = $dupe_of; - } - - # Total up the number of bugs which are dupes of a given bug - # count will then have key = "bug number", - # value = "number of immediate dupes of that bug". - foreach $key (keys(%dupes)) - { - my $dupe_of = $dupes{$key}; - - if (!defined($count{$dupe_of})) { - $count{$dupe_of} = 0; - } - - $count{$dupe_of}++; - } - - # Now we collapse the dupe tree by iterating over %count until - # there is no further change. - while ($changed == 1) - { - $changed = 0; - foreach $key (keys(%count)) { - # if this bug is actually itself a dupe, and has a count... - if (defined($dupes{$key}) && $count{$key} > 0) { - # add that count onto the bug it is a dupe of, - # and zero the count; the check is to avoid - # loops - if ($count{$dupes{$key}} != 0) { - $count{$dupes{$key}} += $count{$key}; - $count{$key} = 0; - $changed = 1; - } - } - } - } - - # Remove the values for which the count is zero - foreach $key (keys(%count)) - { - if ($count{$key} == 0) { - delete $count{$key}; - } - } - - dbmclose(%count); -} - # This regenerates all statistics from the database. sub regenerate_stats { my ($dir, $product, $bug_resolution, $bug_status, $removed) = @_; diff --git a/duplicates.cgi b/duplicates.cgi index af239d632..4c0509864 100755 --- a/duplicates.cgi +++ b/duplicates.cgi @@ -18,15 +18,11 @@ # Copyright (C) 1998 Netscape Communications Corporation. All # Rights Reserved. # -# Contributor(s): Gervase Markham -# -# Generates mostfreq list from data collected by collectstats.pl. - +# Contributor(s): +# Gervase Markham +# Max Kanat-Alexander use strict; - -use AnyDBM_File; - use lib qw(. lib); use Bugzilla; @@ -34,28 +30,52 @@ use Bugzilla::Constants; use Bugzilla::Util; use Bugzilla::Error; use Bugzilla::Search; +use Bugzilla::Field; use Bugzilla::Product; +############### +# Subroutines # +############### + +# $counts is a count of exactly how many direct duplicates there are for +# each bug we're considering. $dups is a map of duplicates, from one +# bug_id to another. We go through the duplicates map ($dups) and if one bug +# in $count is a duplicate of another bug in $count, we add their counts +# together under the target bug. +sub add_indirect_dups { + my ($counts, $dups) = @_; + + foreach my $add_from (keys %$dups) { + my $add_to = walk_dup_chain($dups, $add_from); + my $add_amount = delete $counts->{$add_from} || 0; + $counts->{$add_to} += $add_amount; + } +} + +sub walk_dup_chain { + my ($dups, $from_id) = @_; + my $to_id = $dups->{$from_id}; + while (my $bug_id = $dups->{$to_id}) { + last if $bug_id == $from_id; # avoid duplicate loops + $to_id = $bug_id; + } + # Optimize for future calls to add_indirect_dups. + $dups->{$from_id} = $to_id; + return $to_id; +} + +############### +# Main Script # +############### + my $cgi = Bugzilla->cgi; my $template = Bugzilla->template; my $vars = {}; -# collectstats.pl uses duplicates.cgi to generate the RDF duplicates stats. -# However, this conflicts with requirelogin if it's enabled; so we make -# logging-in optional if we are running from the command line. -if ($::ENV{'GATEWAY_INTERFACE'} eq "cmdline") { - Bugzilla->login(LOGIN_OPTIONAL); -} -else { - Bugzilla->login(); -} +Bugzilla->login(); my $dbh = Bugzilla->switch_to_shadow_db(); -my %dbmcount; -my %count; -my %before; - # Get params from URL sub formvalue { my ($name, $default) = (@_); @@ -70,6 +90,10 @@ my $reverse = formvalue("reverse") ? 1 : 0; my @query_products = $cgi->param('product'); my $sortvisible = formvalue("sortvisible"); my @buglist = (split(/[:,]/, formvalue("bug_id"))); +detaint_natural($_) foreach @buglist; +# If we got any non-numeric items, they will now be undef. Remove them from +# the list. +@buglist = grep($_, @buglist); # Make sure all products are valid. foreach my $p (@query_products) { @@ -79,54 +103,6 @@ foreach my $p (@query_products) { # Small backwards-compatibility hack, dated 2002-04-10. $sortby = "count" if $sortby eq "dup_count"; -# Open today's record of dupes -my $today = days_ago(0); -my $yesterday = days_ago(1); - -# We don't know the exact file name, because the extension depends on the -# underlying dbm library, which could be anything. We can't glob, because -# perl < 5.6 considers if (<*>) { ... } to be tainted -# Instead, just check the return value for today's data and yesterday's, -# and ignore file not found errors - -use Errno; -use Fcntl; - -my $datadir = bz_locations()->{'datadir'}; - -if (!tie(%dbmcount, 'AnyDBM_File', "$datadir/duplicates/dupes$today", - O_RDONLY, 0644)) { - if ($!{ENOENT}) { - if (!tie(%dbmcount, 'AnyDBM_File', "$datadir/duplicates/dupes$yesterday", - O_RDONLY, 0644)) { - my $vars = { today => $today }; - if ($!{ENOENT}) { - ThrowUserError("no_dupe_stats", $vars); - } else { - $vars->{'error_msg'} = $!; - ThrowUserError("no_dupe_stats_error_yesterday", $vars); - } - } - } else { - ThrowUserError("no_dupe_stats_error_today", - { error_msg => $! }); - } -} - -# Copy hash (so we don't mess up the on-disk file when we remove entries) -%count = %dbmcount; - -# Remove all those dupes under the threshold parameter. -# We do this, before the sorting, for performance reasons. -my $threshold = Bugzilla->params->{"mostfreqthreshold"}; - -while (my ($key, $value) = each %count) { - delete $count{$key} if ($value < $threshold); - - # If there's a buglist, restrict the bugs to that list. - delete $count{$key} if $sortvisible && (lsearch(\@buglist, $key) == -1); -} - my $origmaxrows = $maxrows; detaint_natural($maxrows) || ThrowUserError("invalid_maxrows", { maxrows => $origmaxrows}); @@ -136,34 +112,45 @@ detaint_natural($changedsince) || ThrowUserError("invalid_changedsince", { changedsince => $origchangedsince }); -# Try and open the database from "changedsince" days ago -my $dobefore = 0; -my %delta; -my $whenever = days_ago($changedsince); - -if (!tie(%before, 'AnyDBM_File', "$datadir/duplicates/dupes$whenever", - O_RDONLY, 0644)) { - # Ignore file not found errors - if (!$!{ENOENT}) { - ThrowUserError("no_dupe_stats_error_whenever", - { error_msg => $!, - changedsince => $changedsince, - whenever => $whenever, - }); - } -} else { - # Calculate the deltas - ($delta{$_} = $count{$_} - ($before{$_} || 0)) foreach (keys(%count)); - $dobefore = 1; +my %total_dups = @{$dbh->selectcol_arrayref( + "SELECT dupe_of, COUNT(dupe) + FROM duplicates + GROUP BY dupe_of", {Columns => [1,2]})}; + +my %dupe_relation = @{$dbh->selectcol_arrayref( + "SELECT dupe, dupe_of FROM duplicates + WHERE dupe IN (SELECT dupe_of FROM duplicates)", + {Columns => [1,2]})}; +add_indirect_dups(\%total_dups, \%dupe_relation); + +my $reso_field_id = get_field_id('resolution'); +my %since_dups = @{$dbh->selectcol_arrayref( + "SELECT dupe_of, COUNT(dupe) + FROM duplicates INNER JOIN bugs_activity + ON bugs_activity.bug_id = duplicates.dupe + WHERE added = 'DUPLICATE' AND fieldid = ? AND " + . $dbh->sql_to_days('bug_when') . " >= (" + . $dbh->sql_to_days('NOW()') . " - ?) + GROUP BY dupe_of", {Columns=>[1,2]}, + $reso_field_id, $changedsince)}; +add_indirect_dups(\%since_dups, \%dupe_relation); + +my (@bugs, @bug_ids); + +foreach my $id (keys %total_dups) { + if ($total_dups{$id} < Bugzilla->params->{'mostfreqthreshold'}) { + delete $total_dups{$id}; + next; + } + if ($sortvisible and @buglist and !grep($_ == $id, @buglist)) { + delete $total_dups{$id}; + } } -my @bugs; -my @bug_ids; - -if (scalar(%count)) { +if (scalar %total_dups) { # use Bugzilla::Search so that we get the security checking - my $params = new Bugzilla::CGI({ 'bug_id' => [keys %count] }); + my $params = new Bugzilla::CGI({ 'bug_id' => [keys %total_dups] }); if ($openonly) { $params->param('resolution', '---'); @@ -221,8 +208,8 @@ if (scalar(%count)) { $short_desc, $bug_status, $resolution) = @$result; push (@bugs, { id => $id, - count => $count{$id}, - delta => $delta{$id}, + count => $total_dups{$id}, + delta => $since_dups{$id} || 0, component => $component, bug_severity => $bug_severity, op_sys => $op_sys, @@ -237,7 +224,6 @@ if (scalar(%count)) { $vars->{'bugs'} = \@bugs; $vars->{'bug_ids'} = \@bug_ids; -$vars->{'dobefore'} = $dobefore; $vars->{'sortby'} = $sortby; $vars->{'sortvisible'} = $sortvisible; $vars->{'changedsince'} = $changedsince; @@ -264,9 +250,3 @@ print $cgi->header( # Generate and return the UI (HTML page) from the appropriate template. $template->process($format->{'template'}, $vars) || ThrowTemplateError($template->error()); - - -sub days_ago { - my ($dom, $mon, $year) = (localtime(time - ($_[0]*24*60*60)))[3, 4, 5]; - return sprintf "%04d-%02d-%02d", 1900 + $year, ++$mon, $dom; -} diff --git a/template/en/default/global/user-error.html.tmpl b/template/en/default/global/user-error.html.tmpl index c4eefb4aa..3783e523b 100644 --- a/template/en/default/global/user-error.html.tmpl +++ b/template/en/default/global/user-error.html.tmpl @@ -1168,31 +1168,6 @@ [% title = "No Tag Selected" %] You didn't select a tag from which to remove [% terms.bugs %]. - [% ELSIF error == "no_dupe_stats" %] - [% title = "Cannot Find Duplicate Statistics" %] - [% admindocslinks = {'extraconfig.html' => 'Setting up the collecstats.pl job'} %] - There are no duplicate statistics for today ([% today FILTER html %]) - or yesterday. - - [% ELSIF error == "no_dupe_stats_error_today" %] - [% title = "Error Reading Today's Dupes File" %] - [% admindocslinks = {'extraconfig.html' => 'Setting up the collecstats.pl job'} %] - An error occurred opening today's dupes file: [% error_msg FILTER html %]. - - [% ELSIF error == "no_dupe_stats_error_whenever" %] - [% title = "Error Reading Previous Dupes File" %] - [% admindocslinks = {'extraconfig.html' => 'Setting up the collecstats.pl job'} %] - An error occurred opening [% changedsince FILTER html %] days ago - ([% whenever FILTER html %])'s dupes file: - [% error_msg FILTER html %]. - - [% ELSIF error == "no_dupe_stats_error_yesterday" %] - [% title = "Error Reading Yesterday's Dupes File" %] - [% admindocslinks = {'extraconfig.html' => 'Setting up the collecstats.pl job'} %] - There are no duplicate statistics for today ([% today FILTER html %]), - and an error - occurred opening yesterday's dupes file: [% error_msg FILTER html %]. - [% ELSIF error == "no_initial_bug_status" %] [% title = "No Initial $terms.Bug Status" %] No [% terms.bug %] status is available on [% terms.bug %] creation. diff --git a/template/en/default/reports/duplicates-table.html.tmpl b/template/en/default/reports/duplicates-table.html.tmpl index 5dbef2144..f651a7fd4 100644 --- a/template/en/default/reports/duplicates-table.html.tmpl +++ b/template/en/default/reports/duplicates-table.html.tmpl @@ -61,9 +61,6 @@ { name => "short_desc", description => "Summary" } ] %] - [%# Small hack to keep delta column out if we don't need it %] - [% NEXT IF column.name == "delta" AND NOT dobefore %] - [% bug_ids_string = bug_ids.join(',') %]