diff options
| author | Nils Adermann <naderman@naderman.de> | 2006-01-11 18:56:07 +0000 |
|---|---|---|
| committer | Nils Adermann <naderman@naderman.de> | 2006-01-11 18:56:07 +0000 |
| commit | 0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007 (patch) | |
| tree | 5cd57f820281c20c6936433d92483cc4712a7ab7 /phpBB/develop | |
| parent | 9ea5fa1768feebfb04f2303788eb4c685161e3dd (diff) | |
| download | forums-0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007.tar forums-0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007.tar.gz forums-0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007.tar.bz2 forums-0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007.tar.xz forums-0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007.zip | |
- overhauled search system
- updated structure for search backend plugins
- better result caching using ACM
- search results are no longer session-restricted, so they can be linked to by copying the URL :)
- in-topic search
- indexing posts now uses search backend plugins
- develop/search_fill.php is working again
- fulltext_mysql is not working yet
- tiny bugfixes to ACM
git-svn-id: file:///svn/phpbb/trunk@5441 89ea8834-ac86-4346-8a33-228a782c2dd0
Diffstat (limited to 'phpBB/develop')
| -rw-r--r-- | phpBB/develop/search_fill.php | 143 |
1 file changed, 35 insertions, 108 deletions
diff --git a/phpBB/develop/search_fill.php b/phpBB/develop/search_fill.php index d8a2dfc11d..a4337245fa 100644 --- a/phpBB/develop/search_fill.php +++ b/phpBB/develop/search_fill.php @@ -13,21 +13,34 @@ // set_time_limit(0); -$phpbb_root_path = "../"; -include($phpbb_root_path . 'extension.inc'); +define('IN_PHPBB', true); +$phpbb_root_path = '../'; +$phpEx = substr(strrchr(__FILE__, '.'), 1); include($phpbb_root_path . 'common.'.$phpEx); -include($phpbb_root_path . 'includes/search.'.$phpEx); -$common_percent = 0.4; // Percentage of posts in which a word has to appear to be marked as common +// Start session management +$user->session_begin(); +$auth->acl($user->data); +$user->setup(); -print "<html>\n<body>\n"; +$search_type = $config['search_type']; -// -// Try and load stopword and synonym files -// -// This needs fixing! Shouldn't be hardcoded to English files! -$stopword_array = file($phpbb_root_path . "language/lang_english/search_stopwords.txt"); -$synonym_array = file($phpbb_root_path . "language/lang_english/search_synonyms.txt"); +if (!file_exists($phpbb_root_path . 'includes/search/' . $search_type . '.' . $phpEx)) +{ + trigger_error('NO_SUCH_SEARCH_MODULE'); +} + +require($phpbb_root_path . 'includes/search/' . $search_type . '.' . $phpEx); + +$error = false; +$search = new $search_type($error); + +if ($error) +{ + trigger_error($error); +} + +print "<html>\n<body>\n"; // // Fetch a batch of posts_text entries @@ -74,9 +87,9 @@ for(;$postcounter <= $max_post_id; $postcounter += $batchsize) if( $post_rows ) { - // $sql = "LOCK TABLES ".POST_TEXT_TABLE." WRITE"; - // $result = $db->sql_query($sql); - print "\n<p>\n<a href='$PHP_SELF?batchstart=$batchstart'>Restart from posting $batchstart</a><br>\n"; + // $sql = "LOCK TABLES ".POST_TEXT_TABLE." 
WRITE"; + // $result = $db->sql_query($sql); + print "\n<p>\n<a href='{$_SERVER['PHP_SELF']}?batchstart=$batchstart'>Restart from posting $batchstart</a><br>\n"; // For every post in the batch: for($post_nr = 0; $post_nr < $post_rows; $post_nr++ ) @@ -86,105 +99,19 @@ for(;$postcounter <= $max_post_id; $postcounter += $batchsize) $post_id = $rowset[$post_nr]['post_id']; - $matches = array(); - $matches['text'] = split_words(clean_words("post", $rowset[$post_nr]['post_text'], $stopword_array, $synonym_array)); - $matches['title'] = split_words(clean_words("post", $rowset[$post_nr]['post_subject'], $stopword_array, $synonym_array)); - - while( list($match_type, $match_ary) = @each($matches) ) - { - $title_match = ( $match_type == 'title' ) ? 1 : 0; - - $num_matches = count($match_ary); - - if ( $num_matches < 1 ) - { - // Skip this post if no words where found - continue; - } - - // For all words in the posting - $sql_in = ""; - - $sql_insert = ''; - $sql_select = ''; - - $word = array(); - $word_count = array(); - - for($j = 0; $j < $num_matches; $j++) - { - $this_word = strtolower(trim($match_ary[$j])); - if ( $this_word != '' ) - { - $word_count[$this_word] = ( isset($word_count[$this_word]) ) ? $word_count[$this_word] + 1 : 0; - $comma = ($sql_insert != '')? ', ': ''; - - $sql_insert .= "$comma('" . $this_word . "')"; - $sql_select .= "$comma'" . $this_word . "'"; - } - } - - if ( $sql_insert == '' ) - { - die("no words found"); - } - - $sql = 'INSERT IGNORE INTO ' . SEARCH_WORD_TABLE . " - (word_text) - VALUES $sql_insert"; - if ( !$result = $db->sql_query($sql) ) - { - $error = $db->sql_error(); - die("Couldn't INSERT words :: " . $sql . " :: " . $error['message']); - } - - // Get the word_id's out of the DB (to see if they are already there) - $sql = "SELECT word_id, word_text - FROM " . SEARCH_WORD_TABLE . 
" - WHERE word_text IN ($sql_select) - GROUP BY word_text"; - $result = $db->sql_query($sql); - if ( !$result ) - { - $error = $db->sql_error(); - die("Couldn't select words :: " . $sql . " :: " . $error['message']); - } - - $sql_insert = array(); - while( $row = $db->sql_fetchrow($result) ) - { - $sql_insert[] = "($post_id, " . $row['word_id'] . ", $title_match)"; - } - - $db->sql_freeresult($result); - - $sql = "INSERT INTO " . SEARCH_MATCH_TABLE . " - (post_id, word_id, title_match) - VALUES " . implode(", ", $sql_insert); - $result = $db->sql_query($sql); - if ( !$result ) - { - $error = $db->sql_error(); - die("Couldn't insert new word match :: " . $sql . " :: " . $error['message']); - } - - } // All posts + $search->index('post', $rowset[$post_nr]['post_id'], $rowset[$post_nr]['post_text'], $rowset[$post_nr]['post_subject']); } - - // $sql = "UNLOCK TABLES"; - // $result = $db->sql_query($sql); + // $sql = "UNLOCK TABLES"; + // $result = $db->sql_query($sql); } - - // Remove common words after the first 2 batches and after every 4th batch after that. - if( $batchcount % 4 == 3 ) - { - print "<br>Removing common words (words that appear in more than $common_percent of the posts)<br>\n"; - flush(); - print "Removed ". remove_common("global", $common_percent) ." words that where too common.<br>"; - } } +print "<br>Removing common words (words that appear in more than 50% of the posts)<br>\n"; +flush(); +$search->tidy(); +print "Removed words that where too common.<br>"; + echo "<br>Done"; ?> |
