diff options
Diffstat (limited to 'phpBB')
-rw-r--r-- | phpBB/db/mysql_schema.sql | 16 | ||||
-rw-r--r-- | phpBB/develop/search_fill.php | 419 |
2 files changed, 241 insertions, 194 deletions
diff --git a/phpBB/db/mysql_schema.sql b/phpBB/db/mysql_schema.sql index 2ad2682371..4810b3e812 100644 --- a/phpBB/db/mysql_schema.sql +++ b/phpBB/db/mysql_schema.sql @@ -271,14 +271,14 @@ CREATE TABLE phpbb_search_results ( # Table structure for table `phpbb_search_wordlist` # DROP TABLE IF EXISTS phpbb_search_wordlist; -CREATE TABLE phpbb_search_wordlist ( - word_id int(11) NOT NULL auto_increment, - word_text varchar(100) NOT NULL default '', - word_weight tinyint(4) NOT NULL default '0', - PRIMARY KEY (word_id), - KEY word_text (word_text) -); - +CREATE TABLE `phpbb_search_wordlist` ( + `word_text` varchar(50) binary NOT NULL default '', + `word_id` int(11) NOT NULL auto_increment, + `word_weight` tinyint(4) NOT NULL default '0', + `word_common` tinyint(1) unsigned NOT NULL default '0', + PRIMARY KEY (`word_text`), + KEY `word_id`(`word_id`) +) # -------------------------------------------------------- # diff --git a/phpBB/develop/search_fill.php b/phpBB/develop/search_fill.php index 531598282a..42c97e64aa 100644 --- a/phpBB/develop/search_fill.php +++ b/phpBB/develop/search_fill.php @@ -1,49 +1,52 @@ <?php -function clean_words($entry, &$stopword_list, &$synonym_list) +// Clean up an entry (posting), remove HTML, BBcode, stopwords, etc +function clean_words($entry, &$search, &$replace) { - $init_match = array("^", "$", "&", "(", ")", "<", ">", "`", "'", "|", ",", "@", "_", "?", "%"); - $init_replace = array(" ", " ", " ", " ", " ", " ", " ", " ", "", " ", " ", " ", " ", " ", " "); + // Weird, $init_match doesn't work with static when double quotes (") are used... + static $init_match = array('^', '$', '&', '(', ')', '<', '>', '`', "'", '|', ',', '@', '_', '?', '%'); + static $init_replace = array(" ", " ", " ", " ", " ", " ", " ", " ", "", " ", " ", " ", " ", " ", " "); - $later_match = array("-", "~", "+", ".", "[", "]", "{", "}", ":", "\\", "/", "=", "#", "\"", ";", "*", "!"); - $later_replace = array(" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " "); + static $later_match = array("-", "~", "+", ".", "[", "]", "{", "}", ":", "\\", "/", "=", "#", "\"", ";", "*", "!"); + static $later_replace = array(" ", " ", " ", " ", " ", " ", " ", " ", " ", " " , " ", " ", " ", " ", " ", " ", " "); - $entry = " " . stripslashes(strip_tags(strtolower($entry))) . " "; + $entry = " " . strip_tags(strtolower($entry)) . " "; + // Replace line endings by a space $entry = preg_replace("/[\n\r]/is", " ", $entry); + // Remove URL's $entry = preg_replace("/\b[a-z0-9]+:\/\/[a-z0-9\.\-]+(\/[a-z0-9\?\.%_\-\+=&\/]+)?/si", " ", $entry); - $entry = str_replace($init_match, $init_replace, $entry); + // Filter out strange characters like ^, $, &, change "it's" to "its" + // str_replace with arrays is buggy in some PHP versions so traverse the arrays manually ;( + for($i = 0; $i < count($init_match); $i++) + { + $entry = str_replace($init_match[$i], $init_replace[$i], $entry); + } + // Quickly remove BBcode. $entry = preg_replace("/\[code:[0-9]+:[0-9a-z]{10,}\].*?\[\/code:[0-9]+:[0-9a-z]{10,}\]/is", " ", $entry); $entry = preg_replace("/\[img\].*?\[\/img\]/is", " ", $entry); $entry = preg_replace("/\[\/?[a-z\*=\+\-]+[0-9a-z]?(\:[a-z0-9]+)?:[a-z0-9]{10,}(\:[a-z0-9]+)?=?.*?\]/si", " ", $entry); + // URLs $entry = preg_replace("/\[\/?[a-z\*]+[=\+\-]?[0-9a-z]+?:[a-z0-9]{10,}[=.*?]?\]/si", " ", $entry); $entry = preg_replace("/\[\/?url(=.*?)?\]/si", " ", $entry); + // Numbers $entry = preg_replace("/\b[0-9]+\b/si", " ", $entry); + // HTML entities like &1234; $entry = preg_replace("/\b&[a-z]+;\b/is", " ", $entry); + // 'words' that consist of <2 or >50 characters are removed. $entry = preg_replace("/\b[a-z0-9]{1,2}?\b/si", " ", $entry); $entry = preg_replace("/\b[a-z0-9]{50,}?\b/si", " ", $entry); - $entry = str_replace($later_match, $later_replace, $entry); - - if( !empty($stopword_list) ) + // Remove some more strange characters + for($i = 0; $i < count($later_match); $i++) { - for ($j = 0; $j < count($stopword_list); $j++) - { - $filter_word = trim(strtolower($stopword_list[$j])); - $entry = preg_replace("/\b" . preg_quote($filter_word, "/") . "\b/is", " ", $entry); - } + $entry = str_replace($later_match[$i], $later_replace[$i], $entry); } - if( !empty($synonym_list) ) - { - for ($j = 0; $j < count($synonym_list); $j++) - { - list($replace_synonym, $match_synonym) = split(" ", trim(strtolower($synonym_list[$j]))); - $entry = preg_replace("/\b" . preg_quote(trim($match_synonym), "/") . "\b/is", " " . trim($replace_synonym) . " ", $entry); - } - } + // Remove stopwords + $entry = preg_replace($search, $replace, $entry); return $entry; } @@ -55,64 +58,87 @@ function split_words(&$entry) return $split_entries[1]; } -function remove_common($percent) +function remove_common($percent, $delete_common = 0) { global $db; - - $sql = "SELECT sl.word_id, SUM(sm.word_count) AS post_occur_count - FROM phpbb_search_wordlist sl, phpbb_search_wordmatch sm - WHERE sl.word_id = sm.word_id - GROUP BY sl.word_id - ORDER BY post_occur_count DESC"; + + $sql = " + SELECT + COUNT(DISTINCT post_id) as total_posts + FROM " . SEARCH_MATCH_TABLE; $result = $db->sql_query($sql); if( !$result ) { - message_die(GENERAL_ERROR, "Couldn't obtain search word sums", "", __LINE__, __FILE__, $sql); + $error = $db->sql_error(); + die("Couldn't get maximum post ID :: " . $sql . " :: " . $error['message']); } - - $post_count = $db->sql_numrows($result); - $rowset = $db->sql_fetchrowset($result); - - $sql = "SELECT COUNT(post_id) AS total_posts - FROM phpbb_posts"; + $total_posts = $db->sql_fetchrow($result); + $total_posts = $total_posts['total_posts']; + + $common_threshold = floor($total_posts * ($percent/100)); + + $sql = " + SELECT + word_id, + count(word_id) AS word_occur + FROM + ".SEARCH_MATCH_TABLE." + GROUP BY + word_id + HAVING + word_occur > $common_threshold + "; $result = $db->sql_query($sql); if( !$result ) { - message_die(GENERAL_ERROR, "Couldn't obtain post count", "", __LINE__, __FILE__, $sql); + $error = $db->sql_error(); + die("Couldn't obtain common word list :: " . $sql . " :: " . $error['message']); } + $common_words = $db->sql_numrows($result); - $row = $db->sql_fetchrow($result); + while($row = $db->sql_fetchrow($result)) + { + $common_word_ids[] = $row['word_id']; + } + $db->sql_freeresult($result); + + if(count($common_word_ids) != 0) + { + $common_word_ids = implode(',',$common_word_ids); + } + else + { + // We didn't remove any common words + return 0; + } - $words_removed = 0; + $sql = "UPDATE ". SEARCH_WORD_TABLE ." + SET word_common = 1 + WHERE word_id IN ($common_word_ids)"; + $result = $db->sql_query($sql); + if( !$result ) + { + $error = $db->sql_error(); + die("Couldn't delete word list entry :: " . $sql . " :: " . $error['message']); + } - for($i = 0; $i < $post_count; $i++) + if( $delete_common) { - if( ($rowset[$i]['post_occur_count'] / $row['total_posts'] ) >= $percent ) + $sql = "DELETE FROM phpbb_search_wordmatch + WHERE word_id IN ($common_word_ids)"; + $result = $db->sql_query($sql); + if( !$result ) { - $sql = "DELETE FROM phpbb_search_wordlist - WHERE word_id = " . $rowset[$i]['word_id']; - $result = $db->sql_query($sql); - if( !$result ) - { - message_die(GENERAL_ERROR, "Couldn't delete word list entry", "", __LINE__, __FILE__, $sql); - } - - $sql = "DELETE FROM phpbb_search_wordmatch - WHERE word_id = " . $rowset[$i]['word_id']; - $result = $db->sql_query($sql); - if( !$result ) - { - message_die(GENERAL_ERROR, "Couldn't delete word match entry", "", __LINE__, __FILE__, $sql); - } - - $words_removed++; + $error = $db->sql_error(); + die("Couldn't delete word match entry :: " . $sql . " :: " . $error['message']); } } - - return $words_removed; + + return $common_words; } -set_time_limit(2400); +set_time_limit(0); +$common_percent = 40; // Percentage of posts in which a word has to appear to be marked as common $phpbb_root_path = "../"; @@ -121,152 +147,175 @@ include($phpbb_root_path . 'config.'.$phpEx); include($phpbb_root_path . 'includes/constants.'.$phpEx); include($phpbb_root_path . 'includes/db.'.$phpEx); +print "<html>\n<body>\n"; + // // Try and load stopword and synonym files // //$stopword_array = file($phpbb_root_path . "language/lang_" . $board_config['default_lang'] . "/search_stopwords.txt"); //$synonym_array = file($phpbb_root_path . "language/lang_" . $board_config['default_lang'] . "/search_synonyms.txt"); + +// This needs fixing! Shouldn't be hardcoded to English files! $stopword_array = file($phpbb_root_path . "language/lang_english/search_stopwords.txt"); $synonym_array = file($phpbb_root_path . "language/lang_english/search_synonyms.txt"); +for ($j = 0; $j < count($stopword_array); $j++) +{ + $filter_word = trim(strtolower($stopword_array[$j])); + $search[] = "/\b" . preg_quote($filter_word, "/") . "\b/is"; + $replace[] = ''; +} + +for ($j = 0; $j < count($synonym_list); $j++) +{ + list($replace_synonym, $match_synonym) = split(" ", trim(strtolower($synonym_list[$j]))); + $search[] = "/\b" . preg_quote(trim($match_synonym), "/") . "\b/is"; + $replace[] = " " . trim($replace_synonym) . " "; +} + // -// Build search ... +// Fetch a batch of posts_text entries // -$start = ( isset($HTTP_GET_VARS['start']) ) ? $HTTP_GET_VARS['start'] : 0; - -$sql = "SELECT * - FROM " . POSTS_TEXT_TABLE . " - ORDER BY post_id ASC - LIMIT $start, 200"; -$result = $db->sql_query($sql); -if( !$result ) +$sql = " + SELECT + count(*) as total, + max(post_id) as max_post_id + FROM ". POSTS_TEXT_TABLE; +if(!$result = $db->sql_query($sql)) { $error = $db->sql_error(); - die("Couldn't select words :: " . $sql . " :: " . $error['message']); + die("Couldn't get maximum post ID :: " . $sql . " :: " . $error['message']); } +$max_post_id = $db->sql_fetchrow($result); +$totalposts = $max_post_id['total']; +$max_post_id = $max_post_id['max_post_id']; -$rowset = $db->sql_fetchrowset($result); +$postcounter = (!isset($HTTP_GET_VARS['batchstart'])) ? 0 : $HTTP_GET_VARS['batchstart']; -if( $post_rows = $db->sql_numrows($result) ) +$batchsize = 200; // Process this many posts per loop +$batchcount = 0; +for(;$postcounter <= $max_post_id; $postcounter += $batchsize) { + $batchstart = $postcounter + 1; + $batchend = $postcounter + $batchsize; + $batchcount++; + + $sql = "SELECT * + FROM " . + POSTS_TEXT_TABLE ." + WHERE + post_id BETWEEN $batchstart AND $batchend"; + if(!$posts_result = $db->sql_query($sql)) + { + $error = $db->sql_error(); + die("Couldn't get post_text :: " . $sql . " :: " . $error['message']); + } -// $sql = "LOCK TABLES phpbb_posts_text WRITE"; -// $result = $db->sql_query($sql); + $rowset = $db->sql_fetchrowset($posts_result); - for($i = 0; $i < $post_rows; $i++ ) - { - $matches = array(); + if( $post_rows = $db->sql_numrows($posts_result) ) + { - $post_id = $rowset[$i]['post_id']; - $data = $rowset[$i]['post_text']; + // $sql = "LOCK TABLES phpbb_posts_text WRITE"; + // $result = $db->sql_query($sql); + print "\n<p>\n<a href='$PHP_SELF?batchstart=$batchstart'>Restart from posting $batchstart</a><br>\n"; - $text = clean_words($data, $stopword_array, $synonym_array); - $matches = split_words($text); + // For every post in the batch: + for($post_nr = 0; $post_nr < $post_rows; $post_nr++ ) + { + + print "."; + flush(); + $matches = array(); + + $post_id = $rowset[$post_nr]['post_id']; + $data = $rowset[$post_nr]['post_text']; // Raw data + + $text = clean_words($data, $search, $replace); // Cleaned up post + $matches = split_words($text); + $num_matches = count($matches); + if($num_matches < 1) + { + // Skip this post if no words where found + continue; + } - if( count($matches) ) - { $word = array(); $word_count = array(); + $sql_in = ""; $phrase_string = $text; - $sql_in = ""; - for ($j = 0; $j < count($matches); $j++) - { + // For all words in the posting + $sql_insert = ''; + $sql_select = ''; + for($j = 0; $j < $num_matches; $j++) + { $this_word = strtolower(trim($matches[$j])); - - if( empty($word_count[$this_word]) ) - { - $word_count[$this_word] = 1; - } - - $new_word = true; - for($k = 0; $k < count($word); $k++) - { - if( $this_word == $word[$k] ) - { - $new_word = false; - $word_count[$this_word]++; - } - } - - if( $new_word ) + if($this_word != '') { - $word[] = $this_word; + $word_count[$this_word]++; + $comma = ($sql_insert != '')? ', ': ''; + + $sql_insert .= "$comma('" .$this_word. "')"; + $sql_select .= "$comma'" .$this_word. "'"; } } - - for($j = 0; $j < count($word); $j++) + if($sql_insert == '') { - if( $word[$j] ) - { - if( $sql_in != "" ) - { - $sql_in .= ", "; - } - $sql_in .= "'" . $word[$j] . "'"; - } + die("no words found"); + } + + $sql = 'INSERT IGNORE INTO '.SEARCH_WORD_TABLE." + (word_text) + VALUES $sql_insert"; + if( !$result = $db->sql_query($sql) ) + { + $error = $db->sql_error(); + die("Couldn't INSERT words :: " . $sql . " :: " . $error['message']); } - $sql = "SELECT word_id, word_text + // Get the word_id's out of the DB (to see if they are already there) + $sql = "SELECT word_id, word_text FROM phpbb_search_wordlist - WHERE word_text IN ($sql_in)"; + WHERE word_text IN ($sql_select) + GROUP BY word_text"; $result = $db->sql_query($sql); if( !$result ) { $error = $db->sql_error(); die("Couldn't select words :: " . $sql . " :: " . $error['message']); } - if( $word_check_count = $db->sql_numrows($result) ) { - $check_words = $db->sql_fetchrowset($result); + $selected_words = $db->sql_fetchrowset($result); } - - for ($j = 0; $j < count($word); $j++) - { - if( $word[$j] ) - { - $new_match = true; - - if( $word_check_count ) - { - for($k = 0; $k < $word_check_count; $k++) - { - if( $word[$j] == $check_words[$k]['word_text'] ) - { - $new_match = false; - $word_id = $check_words[$k]['word_id']; - } - } - } - - if( $new_match ) - { - $sql = "INSERT INTO phpbb_search_wordlist (word_text) - VALUES ('". addslashes($word[$j]) . "')"; - $result = $db->sql_query($sql); - if( !$result ) - { - $error = $db->sql_error(); - die("Couldn't insert new word :: " . $sql . " :: " . $error['message']); - } - - $word_id = $db->sql_nextid(); - } - - $sql = "INSERT INTO phpbb_search_wordmatch (post_id, word_id, word_count, title_match) - VALUES ($post_id, $word_id, " . $word_count[$word[$j]] . ", 0)"; - $result = $db->sql_query($sql); - if( !$result ) - { - $error = $db->sql_error(); - die("Couldn't insert new word match :: " . $sql . " :: " . $error['message']); - } - - $phrase_string = preg_replace("/\b" . preg_quote($word[$j], "/") . "\b/is", $word_id, $phrase_string); - } + else + { + print "Couldn't do sql_numrows<br>\n"; } + $db->sql_freeresult($result); + + $sql_insert = ''; + while(list($junk, $row) = each($selected_words)) + { + $comma = ($sql_insert != '')? ', ': ''; + $sql_insert .= "$comma($post_id, ".$row['word_id'].", ".$word_count[$row['word_text']]." ,0)"; + } + + $sql = "INSERT INTO ".SEARCH_MATCH_TABLE." + (post_id, word_id, word_count, title_match) + VALUES + $sql_insert + "; + $result = $db->sql_query($sql); + if( !$result ) + { + $error = $db->sql_error(); + die("Couldn't insert new word match :: " . $sql . " :: " . $error['message']); + } + /* + //$phrase_string = preg_replace("/\b" . preg_quote($word[$j], "/") . "\b/is", $word_id, $phrase_string); $phrase_string = trim(preg_replace("/ {2,}/s", " ", str_replace(array("*", "'"), " ", $phrase_string))); $sql = "INSERT INTO phpbb_search_phrasematch (post_id, phrase_list) @@ -278,34 +327,32 @@ if( $post_rows = $db->sql_numrows($result) ) die("Couldn't insert new phrase match :: " . $sql . " :: " . $error['message']); } */ - } - } + } // All posts -// $sql = "UNLOCK TABLES"; -// $result = $db->sql_query($sql); + // $sql = "UNLOCK TABLES"; + // $result = $db->sql_query($sql); -} + } + else + { + print "Couldn't get rowcount for number of posts<br>$sql<br>\n"; + } // All posts; -if( $post_rows == 200 ) -{ - header("Location: search_fill.php?start=" . ($start+200) . "&total=" . ($start + $post_rows)); + $db->sql_freeresult($posts_result); + + // Remove common words after the first 2 batches and after every 4th batch after that. + if( $batchcount % 4 == 3 ) + { + print "<br>Removing common words (words that appear in more than $common_percent of the posts)<br>\n"; + flush(); + print "Removed ". remove_common($common_percent, 1) ." words that where too common.<br>"; + } + } -?> -<html> -<body> - -<?php - -$total_rows = ( $HTTP_GET_VARS['total'] ) ? $HTTP_GET_VARS['total'] : $post_rows; - -echo "<BR><BR>Total posts = " . $total_rows . "<BR><BR>"; - -echo remove_common(0.4); - -exit; +echo "<br>Done"; ?> </body> -</html>
\ No newline at end of file +</html> |