From 0e0b1120fba8ed4f2ebc5d62eb29b1a34c1b1007 Mon Sep 17 00:00:00 2001 From: Nils Adermann Date: Wed, 11 Jan 2006 18:56:07 +0000 Subject: - overhauled search system - updated structure for search backend plugins - better result caching using ACM - search results no longer session restricted => link to them by copying the URL :) - in-topic search - indexing posts now uses search backend plugins - develop/search_fill.php working again - fulltext_mysql not working yet - tiny bugfixes to ACM git-svn-id: file:///svn/phpbb/trunk@5441 89ea8834-ac86-4346-8a33-228a782c2dd0 --- phpBB/includes/search/fulltext_phpbb.php | 1017 +++++++++++++++++++++++------- 1 file changed, 802 insertions(+), 215 deletions(-) (limited to 'phpBB/includes/search/fulltext_phpbb.php') diff --git a/phpBB/includes/search/fulltext_phpbb.php b/phpBB/includes/search/fulltext_phpbb.php index b4b0efb7b1..517a586579 100644 --- a/phpBB/includes/search/fulltext_phpbb.php +++ b/phpBB/includes/search/fulltext_phpbb.php @@ -8,314 +8,901 @@ * */ +/** +* @ignore +*/ +include($phpbb_root_path . 'includes/search/search.' . $phpEx); + /** * @package search * fulltext_phpbb * phpBB's own db driven fulltext search */ -class fulltext_phpbb +class fulltext_phpbb extends search_backend { - var $split_words = array(); - var $common_words = array(); - var $old_split_words = array(); - function fulltext_phpbb(&$error) { $error = false; } - function search($type, &$fields, &$terms, &$fid_ary, &$keywords, &$author, &$pid_ary, $sort_days) + /** + * Splits keywords entered by a user into an array of words stored in $this->split_words + * + * @param string $keywords Contains the keyword as entered by the user + * @param string $terms is either 'all' or 'any' + * @return false if no valid keywords were found and otherwise true + */ + function split_keywords(&$keywords, $terms) { - global $phpbb_root_path, $phpEx, $config, $db, $user, $SID; + global $db, $config; - // Are we looking for words - if ($keywords) - { - $author = ($author) ? ' AND ' . $author : ''; - - $this->split_words = $this->common_words = array(); - $drop_char_match = array('-', '^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!', '*'); - $drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' '); - - if ($fp = @fopen($user->lang_path . '/search_stopwords.txt', 'rb')) - { - $stopwords = explode("\n", str_replace("\r\n", "\n", fread($fp, filesize($user->lang_path . '/search_stopwords.txt')))); - } - fclose($fp); + $drop_char_match = array('^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!'); + $drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' '); - if ($fp = @fopen($user->lang_path . '/search_synonyms.txt', 'rb')) - { - preg_match_all('#^(.*?) (.*?)$#ms', fread($fp, filesize($user->lang_path . '/search_synonyms.txt')), $match); - $replace_synonym = &$match[1]; - $match_synonym = &$match[2]; - } - fclose($fp); + $this->get_ignore_words(); + $this->get_synonyms(); + if ($terms == 'all') + { $match = array('#\sand\s#i', '#\sor\s#i', '#\snot\s#i', '#\+#', '#-#', '#\|#'); $replace = array(' + ', ' | ', ' - ', ' + ', ' - ', ' | '); $keywords = preg_replace($match, $replace, $keywords); + } + + $match = array(); + // New lines, carriage returns + $match[] = "#[\n\r]+#"; + // NCRs like   etc. + $match[] = '#(&|&)[\#a-z0-9]+?;#i'; - $match = array(); - // Comments for hardcoded bbcode elements (urls, smilies, html) - $match[] = '#(.*?)#is'; - // New lines, carriage returns - $match[] = "#[\n\r]+#"; - // NCRs like   etc. - $match[] = '#(&|&)[\#a-z0-9]+?;#i'; - // BBcode - $match[] = '#\[\/?[a-z\*\+\-]+(=.*)?(\:?[0-9a-z]{5,})\]#'; + // Filter out as above + $keywords = preg_replace($match, ' ', strtolower(trim($keywords))); + $keywords = str_replace($drop_char_match, $drop_char_replace, $keywords); - // Filter out as above - $keywords = preg_replace($match, ' ', strtolower(trim($keywords))); - $keywords = str_replace($drop_char_match, $drop_char_replace, $keywords); + // Split words + $this->split_words = explode(' ', preg_replace('#\s+#', ' ', $keywords)); + + if (sizeof($this->ignore_words)) + { + $this->common_words = array_intersect($this->split_words, $this->ignore_words); + $this->split_words = array_diff($this->split_words, $this->ignore_words); + } - // Split words - $this->split_words = explode(' ', preg_replace('#\s+#', ' ', $keywords)); + if (sizeof($this->replace_synonym)) + { + $this->split_words = str_replace($this->replace_synonym, $this->match_synonym, $this->split_words); + } - if (sizeof($stopwords)) + $prefixes = array('+', '-', '|'); + $prefixed = false; + $in_words = ''; + foreach ($this->split_words as $i => $word) + { + if (in_array($word, $prefixes)) { - $this->common_words = array_intersect($this->split_words, $stopwords); - $this->split_words = array_diff($this->split_words, $stopwords); + $prefixed = true; + continue; } - if (sizeof($replace_synonym)) + // check word length + $clean_len = strlen(str_replace('*', '', $word)); + if (($clean_len < $config['min_search_chars']) || ($clean_len > $config['max_search_chars'])) + { + if ($prefixed) + { + $this->common_words[] = $this->split_words[$i - 1]; + unset($this->split_words[$i - 1]); + } + $this->common_words[] = $this->split_words[$i]; + unset($this->split_words[$i]); + } + else if (strpos($word, '*') === false) { - $this->split_words = str_replace($replace_synonym, $match_synonym, $this->split_words); + $in_words .= (($in_words) ? ', ' : '') . '\'' . $db->sql_escape($word) . '\''; } + + $prefixed = false; } - if ($this->old_split_words && sizeof($this->old_split_words)) + if ($in_words) { - $this->split_words = (sizeof($this->split_words)) ? array_diff($this->split_words, $this->old_split_words) : $this->old_split_words; + // identify common words and ignore them + $sql = 'SELECT word_text + FROM ' . SEARCH_WORD_TABLE . " + WHERE word_text IN ($in_words) + AND word_common = 1"; + $result = $db->sql_query($sql); + + while ($row = $db->sql_fetchrow($result)) + { + $key = array_search($row['word_text'], $this->split_words); + + if (isset($this->split_words[$key - 1]) && (in_array($this->split_words[$key - 1], $prefixes))) + { + $this->common_words[] = $this->split_words[$key - 1]; + unset($this->split_words[$key - 1]); + } + $this->common_words[] = $row['word_text']; + unset($this->split_words[$key]); + } + $db->sql_freeresult($result); } if (sizeof($this->split_words)) { - $bool = ($terms == 'all') ? 'AND' : 'OR'; - $sql_words = ''; - foreach ($this->split_words as $word) + $this->split_words = array_values($this->split_words); + return true; + } + return false; + } + + /** + * Turns text into an array of words that can be stored in the word list table + */ + function split_message($text) + { + global $config; + + static $drop_char_match, $drop_char_replace; + + $this->get_ignore_words(); + $this->get_synonyms(); + + if (!is_array($drop_char_match)) + { + $drop_char_match = array('-', '^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!', '*', '+'); + $drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' ', ' '); + } + + $match = array(); + // Comments for hardcoded bbcode elements (urls, smilies, html) + $match[] = '#(.*?)#is'; + // New lines, carriage returns + $match[] = "#[\n\r]+#"; + // NCRs like   etc. + $match[] = '#(&|&)[\#a-z0-9]+?;#i'; + // Do not index code + $match[] = '#\[code=?.*?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is'; + // BBcode + $match[] = '#\[\/?[a-z\*\+\-]+=?.*?(\:?[0-9a-z]{5,})\]#'; + // Filter out ; and # but not &#[0-9]+; + //$match[] = '#(&\#[0-9]+;)|;|\#|&#'; + + $text = preg_replace($match, ' ', ' ' . strtolower(trim($text)) . ' '); + + // Filter out non-alphabetical chars + $text = str_replace($drop_char_match, $drop_char_replace, $text); + + // Split words + $text = explode(' ', preg_replace('#\s+#', ' ', trim($text))); + + if (sizeof($this->ignore_words)) + { + $stopped_words = array_intersect($text, $this->ignore_words); + $text = array_diff($text, $this->ignore_words); + } + + if (sizeof($this->replace_synonym)) + { + $text = str_replace($this->replace_synonym, $this->match_synonym, $text); + } + + // remove too short or too long words + $text = array_values($text); + for ($i = 0, $n = sizeof($text); $i < $n; $i++) + { + $text[$i] = trim($text[$i]); + if (strlen($text[$i]) < $config['min_search_chars'] || strlen($text[$i]) > $config['max_search_chars']) { - switch ($word) - { - case '-': - $bool = 'NOT'; - continue; - case '+': - $bool = 'AND'; - continue; - case '|': - $bool = 'OR'; - continue; - default: - $bool = ($terms != 'all') ? 'OR' : $bool; - $sql_words[$bool][] = "'" . preg_replace('#\*+#', '%', trim($word)) . "'"; - $bool = ($terms == 'all') ? 'AND' : 'OR'; - } + unset($text[$i]); } + } + + return $text; + } + + /** + * Performs a search on keywords depending on display specific params. + * + * @param array $id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered + * @param int $start indicated the first index of the page + * @param int $per_page number of ids each page is supposed to contain + * @return total number of results + */ + function keyword_search($type, &$fields, &$terms, &$sort_by_sql, &$sort_key, &$sort_dir, &$sort_days, &$ex_fid_ary, &$topic_id, &$author_ary, &$id_ary, $start, $per_page) + { + global $config, $db; + + // No keywords? No posts. + if (!sizeof($this->split_words)) + { + return false; + } - // Build some display specific variable strings - $sql_author = ($author) ? 'AND p.poster_id = ' . $author : ''; - $sql_fora = (sizeof($fid_ary)) ? ' AND p.forum_id IN (' . implode(',', $fid_ary) . ')' : ''; - $sql_time = ($sort_days) ? 'AND p.post_time >= ' . ($current_time - ($sort_days * 86400)) : ''; - $sql_select = ($type == 'posts') ? 'm.post_id' : 'DISTINCT t.topic_id'; - $sql_from = ($type == 'posts') ? '' : TOPICS_TABLE . ' t, '; - $sql_topic = ($type == 'posts') ? '' : 'AND t.topic_id = p.topic_id'; - $field = ($type == 'posts') ? 'm.post_id' : 't.topic_id'; + // generate a search_key from all the options to identify the results + $search_key = md5(implode('#', array( + implode(',', $this->split_words), + $type, + $fields, + $terms, + $sort_days, + $sort_key, + $topic_id, + implode(',', $ex_fid_ary), + implode(',', $author_ary) + ))); + + // try reading the results from cache + $result_count = 0; + if ($this->obtain_ids($search_key, $result_count, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE) + { + return $result_count; + } + + $result_count = 0; + $id_ary = array(); + + // Build sql strings for sorting + $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC'); + $sql_sort_table = $sql_sort_join = ''; + switch ($sql_sort[0]) + { + case 'u': + $sql_sort_table = USERS_TABLE . ' u, '; + $sql_sort_join = 'AND u.user_id = p.poster_id '; + break; + case 't': + $sql_sort_table = ($type == 'posts') ? TOPICS_TABLE . ' t, ' : ''; + $sql_sort_join = ($type == 'posts') ? ' AND t.topic_id = p.topic_id ' : ''; + break; + case 'f': + $sql_sort_table = FORUMS_TABLE . ' f, '; + $sql_sort_join = ' AND f.forum_id = p.forum_id '; + break; + } + + // Build some display specific sql strings + switch ($fields) + { + case 'titleonly': + $sql_match = ' AND m.title_match = 1'; + break; + case 'msgonly': + $sql_match = ' AND m.title_match = 0'; + break; + default: + $sql_match = ''; + } + + $sql_select = ($type == 'posts') ? 'm.post_id' : 'DISTINCT t.topic_id'; + $sql_from = ($type == 'posts') ? '' : TOPICS_TABLE . ' t, '; + $field = ($type == 'posts') ? 'm.post_id' : 't.topic_id'; + $sql_author = (sizeof($author_ary) == 1) ? ' = ' . $author_ary[0] : 'IN (' . implode(',', $author_ary) . ')'; + + $sql_where_options = $sql_sort_join; + $sql_where_options .= ($topic_id) ? 'AND p.topic_id = ' . $topic_id : ''; + $sql_where_options .= ($type == 'posts') ? '' : 'AND t.topic_id = p.topic_id'; + $sql_where_options .= (sizeof($ex_fid_ary)) ? ' AND p.forum_id NOT IN (' . implode(',', $ex_fid_ary) . ')' : ''; + $sql_where_options .= (sizeof($author_ary)) ? 'AND p.poster_id ' . $sql_author : ''; + $sql_where_options .= ($sort_days) ? 'AND p.post_time >= ' . (time() - ($sort_days * 86400)) : ''; + $sql_where_options .= $sql_match; + + // split the words into three arrays (AND, OR, NOT) + $sql_words = array('AND' => array(), 'OR' => array(), 'NOT' => array()); + $bool = ($terms == 'all') ? 'AND' : 'OR'; - switch ($fields) + foreach ($this->split_words as $word) + { + switch ($word) { - case 'titleonly': - $sql_match = ' AND m.title_match = 1'; - break; - case 'msgonly': - $sql_match = ' AND m.title_match = 0'; - break; + case '-': + $bool = 'NOT'; + continue; + case '+': + $bool = 'AND'; + continue; + case '|': + $bool = 'OR'; + continue; default: - $sql_match = ''; + $bool = ($terms != 'all') ? 'OR' : $bool; + $sql_words[$bool][] = "'" . $db->sql_escape(preg_replace('#\*+#', '%', trim($word))) . "'"; + $bool = ($terms == 'all') ? 'AND' : 'OR'; } + } - // Are we searching within an existing search set? Yes, then include the old ids - $sql_find_in = (sizeof($pid_ary)) ? 'AND ' . (($type == 'topics') ? 't.topic_id' : 'm.post_id') . ' IN (' . implode(', ', $pid_ary) . ')' : ''; - - $result_ary = array(); - $_bool = array('AND', 'OR', 'NOT'); - foreach ($_bool as $bool) + // Select all post_ids that contain all AND-words + $result_ary= array('AND' => array(), 'OR' => array(), 'NOT' => array()); + if (sizeof($sql_words['AND'])) + { + $sql_in = ''; + foreach ($sql_words['AND'] as $word) { - if (isset($sql_words[$bool]) && is_array($sql_words[$bool])) + // first select all post ids that match a word containing a wildcard + if (strstr($word, '%')) { - switch ($bool) + $sql = "SELECT $sql_select + FROM $sql_from$sql_sort_table" . POSTS_TABLE . ' p, ' . SEARCH_MATCH_TABLE . ' m, ' . SEARCH_WORD_TABLE . " w + WHERE w.word_text LIKE $word + AND m.word_id = w.word_id + AND w.word_common <> 1 + AND p.post_id = m.post_id + $sql_where_options + GROUP BY $field + ORDER BY $sql_sort"; + $result = $db->sql_query($sql); + + if (!($row = $db->sql_fetchrow($result))) { - case 'AND': - case 'NOT': - foreach ($sql_words[$bool] as $word) - { - if (strlen($word) < 4) - { - continue; - } - - $sql_where = (strstr($word, '%')) ? "LIKE $word" : "= $word"; - - $sql_and = (isset($result_ary['AND']) && sizeof($result_ary['AND'])) ? "AND $field IN (" . implode(', ', $result_ary['AND']) . ')' : ''; - - $sql = "SELECT $sql_select - FROM $sql_from" . POSTS_TABLE . ' p, ' . SEARCH_MATCH_TABLE . ' m, ' . SEARCH_WORD_TABLE . " w - WHERE w.word_text $sql_where - AND m.word_id = w.word_id - AND w.word_common <> 1 - AND p.post_id = m.post_id - $sql_topic - $sql_fora - $sql_author - $sql_and - $sql_time - $sql_match - $sql_find_in"; - $result = $db->sql_query_limit($sql, 1000); - - if ($db->sql_numrows() > 999) - { - trigger_error($user->lang['TOO_MANY_SEARCH_RESULTS']); - } - - if (!($row = $db->sql_fetchrow($result)) && $bool == 'AND') - { - trigger_error($user->lang['NO_SEARCH_RESULTS']); - } - - if ($bool == 'AND') - { - $result_ary['AND'] = array(); - } - - do - { - $result_ary[$bool][] = ($type == 'topics') ? $row['topic_id'] : $row['post_id']; - } - while ($row = $db->sql_fetchrow($result)); - $db->sql_freeresult($result); - } - break; - - case 'OR': - $sql_where = $sql_in = ''; - foreach ($sql_words[$bool] as $word) - { - if (strlen($word) < 4) - { - continue; - } - - if (strstr($word, '%')) - { - $sql_where .= (($sql_where) ? ' OR w.word_text ' : 'w.word_text ') . "LIKE $word"; - } - else - { - $sql_in .= (($sql_in) ? ', ' : '') . $word; - } - } - $sql_where = ($sql_in) ? (($sql_where) ? ' OR ' : '') . 'w.word_text IN (' . $sql_in . ')' : $sql_where; - - $sql_and = (isset($result_ary['AND']) && sizeof($result_ary['AND'])) ? "AND $field IN (" . implode(', ', $result_ary['AND']) . ')' : ''; - $sql = "SELECT $sql_select - FROM $sql_from" . POSTS_TABLE . ' p, ' . SEARCH_MATCH_TABLE . ' m, ' . SEARCH_WORD_TABLE . " w - WHERE ($sql_where) - AND m.word_id = w.word_id - AND w.word_common <> 1 - AND p.post_id = m.post_id - $sql_topic - $sql_fora - $sql_author - $sql_and - $sql_time - $sql_match - $sql_find_in"; - $result = $db->sql_query_limit($sql, 1000); - - while ($row = $db->sql_fetchrow($result)) - { - $result_ary[$bool][] = ($type == 'topics') ? $row['topic_id'] : $row['post_id']; - } - $db->sql_freeresult($result); - break; + $id_ary = array(); + return false; } + + $ids = array(); + do + { + $ids[] = ($type == 'topics') ? $row['topic_id'] : $row['post_id']; + } + while ($row = $db->sql_fetchrow($result)); + $db->sql_freeresult($result); + + // remove ids that are not present in all AND-word results + if (sizeof($result_ary['AND'])) + { + $result_ary['AND'] = array_intersect($result_ary['AND'], $ids); + } + else + { + $result_ary['AND'] = $ids; + } + unset($ids); } else { - $sql_words[$bool] = array(); + $sql_in .= (($sql_in) ? ', ' : '') . $word; } } - if (isset($result_ary['OR']) && sizeof($result_ary['OR'])) + if ($sql_in) { - $pid_ary = (isset($result_ary['AND']) && sizeof($result_ary['AND'])) ? array_diff($result_ary['AND'], $result_ary['OR']) : $result_ary['OR']; + $sql = "SELECT $sql_select, COUNT(DISTINCT m.word_id) as matches + FROM $sql_from$sql_sort_table" . POSTS_TABLE . ' p, ' . SEARCH_MATCH_TABLE . ' m, ' . SEARCH_WORD_TABLE . " w + WHERE w.word_text IN ($sql_in) + AND m.word_id = w.word_id + AND w.word_common <> 1 + AND p.post_id = m.post_id + $sql_where_options + GROUP BY $field + ORDER BY $sql_sort"; + $result = $db->sql_query($sql); + + if (!($row = $db->sql_fetchrow($result))) + { + $id_ary = array(); + return false; + } + + // A little trick so we only need one query: using DISTINCT makes every word unique so if the + // number of all words for one post_id equals the number of AND-words it has to contain all + // AND-words + $ids = array(); + do + { + if ($row['matches'] == sizeof($sql_words['AND'])) + { + $ids[] = ($type == 'topics') ? $row['topic_id'] : $row['post_id']; + } + } + while ($row = $db->sql_fetchrow($result)); + $db->sql_freeresult($result); + + // remove ids that are not present in all AND-word results + if (sizeof($result_ary['AND'])) + { + $result_ary['AND'] = array_intersect($result_ary['AND'], $ids); + } + else + { + $result_ary['AND'] = $ids; + } + unset($ids); } - else + } + + // Select all post_ids that contain one of the OR-words + if (sizeof($sql_words['OR'])) + { + $sql_where = $sql_in = ''; + foreach ($sql_words['OR'] as $word) { - $pid_ary = (isset($result_ary['AND'])) ? $result_ary['AND'] : array(); + if (strstr($word, '%')) + { + $sql_where .= (($sql_where) ? ' OR w.word_text ' : 'w.word_text ') . "LIKE $word"; + } + else + { + $sql_in .= (($sql_in) ? ', ' : '') . $word; + } } + $sql_where = ($sql_in) ? $sql_where . (($sql_where) ? ' OR ' : '') . 'w.word_text IN (' . $sql_in . ')' : $sql_where; + + $sql_and = (sizeof($result_ary['AND'])) ? "AND $field IN (" . implode(', ', $result_ary['AND']) . ')' : ''; + $sql = "SELECT $sql_select + FROM $sql_from$sql_sort_table" . POSTS_TABLE . ' p, ' . SEARCH_MATCH_TABLE . ' m, ' . SEARCH_WORD_TABLE . " w + WHERE ($sql_where) + AND m.word_id = w.word_id + AND w.word_common <> 1 + AND p.post_id = m.post_id + $sql_where_options + ORDER BY $sql_sort"; + $result = $db->sql_query($sql); - if (isset($result_ary['NOT']) && sizeof($result_ary['NOT'])) + while ($row = $db->sql_fetchrow($result)) { - $pid_ary = (sizeof($pid_ary)) ? array_diff($pid_ary, $result_ary['NOT']) : array(); + $result_ary['OR'][] = ($type == 'topics') ? $row['topic_id'] : $row['post_id']; } - unset($result_ary); + $db->sql_freeresult($result); + } + + // remove post_ids that do not contain any OR-word + if (sizeof($result_ary['OR'])) + { + $id_ary = (sizeof($result_ary['AND'])) ? array_intersect($result_ary['AND'], $result_ary['OR']) : $result_ary['OR']; + } + else + { + $id_ary = (sizeof($result_ary['AND'])) ? $result_ary['AND'] : array(); + } - $pid_ary = array_unique($pid_ary); + unset($result_ary['AND']); + unset($result_ary['OR']); - if (!sizeof($pid_ary)) + // remove all post_ids that contain a NOT-word + if (sizeof($sql_words['NOT']) && sizeof($id_ary)) + { + $sql_where = $sql_in = ''; + foreach ($sql_words['NOT'] as $word) { - trigger_error($user->lang['NO_SEARCH_RESULTS']); + if (strstr($word, '%')) + { + $sql_where .= (($sql_where) ? ' OR w.word_text ' : 'w.word_text ') . "LIKE $word"; + } + else + { + $sql_in .= (($sql_in) ? ', ' : '') . $word; + } } - - $sql = 'SELECT word_text - FROM ' . SEARCH_WORD_TABLE . ' - WHERE word_text IN (' . implode(', ', array_unique(array_merge($sql_words['AND'], $sql_words['OR'], $sql_words['NOT']))) . ') - AND word_common = 1'; + $sql_where = ($sql_in) ? $sql_where . (($sql_where) ? ' OR ' : '') . 'w.word_text IN (' . $sql_in . ')' : $sql_where; + + $sql_and = "AND $field IN (" . implode(', ', $id_ary) . ')'; + $sql = "SELECT $sql_select + FROM $sql_from" . POSTS_TABLE . ' p, ' . SEARCH_MATCH_TABLE . ' m, ' . SEARCH_WORD_TABLE . " w + WHERE ($sql_where) + AND m.word_id = w.word_id + AND w.word_common <> 1 + AND p.post_id = m.post_id + $sql_where_options"; $result = $db->sql_query($sql); while ($row = $db->sql_fetchrow($result)) { - $common_words[] = $row['word_text']; + $result_ary['NOT'][] = ($type == 'topics') ? $row['topic_id'] : $row['post_id']; } $db->sql_freeresult($result); } - else if ($author) + + if (sizeof($result_ary['NOT'])) + { + $id_ary = (sizeof($id_ary)) ? array_diff($id_ary, $result_ary['NOT']) : array(); + } + unset($result_ary); + + if (!sizeof($id_ary)) + { + return false; + } + + $result_count = sizeof($id_ary); + + // store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page + $id_ary = array_slice($id_ary, $start); + $this->save_ids($search_key, implode(' ', $this->split_words), $author_ary, $result_count, $id_ary, $start, $sort_dir); + $id_ary = array_slice($id_ary, 0, (int) $per_page); + + return $result_count; + } + + /** + * Performs a search on an author's posts without caring about message contents. Depends on display specific params + * + * @param array $id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered + * @param int $start indicated the first index of the page + * @param int $per_page number of ids each page is supposed to contain + * @return total number of results + */ + function author_search($type, &$sort_by_sql, &$sort_key, &$sort_dir, &$sort_days, &$ex_fid_ary, &$topic_id, &$author_ary, &$id_ary, $start, $per_page) + { + global $config, $db; + + // No author? No posts. + if (!sizeof($author_ary)) + { + return 0; + } + + // generate a search_key from all the options to identify the results + $search_key = md5(implode('#', array( + '', + $type, + '', + '', + $sort_days, + $sort_key, + $topic_id, + implode(',', $ex_fid_ary), + implode(',', $author_ary) + ))); + + // try reading the results from cache + $result_count = 0; + if ($this->obtain_ids($search_key, $result_count, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE) { - $sql_author = ($author) ? 'p.poster_id = ' . $author : ''; - $sql_fora = (sizeof($fid_ary)) ? ' AND p.forum_id IN (' . implode(',', $fid_ary) . ')' : ''; + return $result_count; + } + + $id_ary = array(); + + // Create some display specific sql strings + $sql_author = 'p.poster_id ' . ((sizeof($author_ary) > 1) ? 'IN (' . implode(',', $author_ary) . ')' : '= ' . $author_ary[0]); + $sql_fora = (sizeof($ex_fid_ary)) ? ' AND p.forum_id NOT IN (' . implode(',', $ex_fid_ary) . ')' : ''; + $sql_topic_id = (sizeof($ex_fid_ary)) ? ' AND p.forum_id NOT IN (' . implode(',', $ex_fid_ary) . ')' : ''; + $sql_time = ($sort_days) ? 'AND p.post_time >= ' . (time() - ($sort_days * 86400)) : ''; + + // Build sql strings for sorting + $sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC'); + $sql_sort_table = $sql_sort_join = ''; + switch ($sql_sort[0]) + { + case 'u': + $sql_sort_table = USERS_TABLE . ' u, '; + $sql_sort_join = 'AND u.user_id = p.poster_id '; + break; + case 't': + $sql_sort_table = ($type == 'posts') ? TOPICS_TABLE . ' t, ' : ''; + $sql_sort_join = ($type == 'posts') ? ' AND t.topic_id = p.topic_id ' : ''; + break; + case 'f': + $sql_sort_table = FORUMS_TABLE . ' f, '; + $sql_sort_join = ' AND f.forum_id = p.forum_id '; + break; + } + // If the cache was completely empty count the results + if (!$result_count) + { if ($type == 'posts') { - $sql = 'SELECT p.post_id + $sql = 'SELECT COUNT(p.post_id) as result_count FROM ' . POSTS_TABLE . " p WHERE $sql_author - $sql_fora"; - $field = 'post_id'; + $sql_fora + $sql_time"; } else { - $sql = 'SELECT t.topic_id + $sql = 'SELECT COUNT(DISTINCT t.topic_id) as result_count FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p WHERE $sql_author $sql_fora AND t.topic_id = p.topic_id - GROUP BY t.topic_id"; - $field = 'topic_id'; + $sql_time"; } - $result = $db->sql_query_limit($sql, 1000); + $result = $db->sql_query($sql); - while ($row = $db->sql_fetchrow($result)) + if ($row = $db->sql_fetchrow()) { - $pid_ary[] = $row[$field]; + $result_count = $row['result_count']; } $db->sql_freeresult($result); } + // Build the query for really selecting the post_ids + if ($type == 'posts') + { + $sql = 'SELECT p.post_id + FROM ' . $sql_sort_table . POSTS_TABLE . " p + WHERE $sql_author + $sql_fora + $sql_sort_join + $sql_time + ORDER BY $sql_sort"; + $field = 'post_id'; + } + else + { + $sql = 'SELECT t.topic_id + FROM ' . $sql_sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p + WHERE $sql_author + $sql_fora + AND t.topic_id = p.topic_id + $sql_sort_join + $sql_time + GROUP BY t.topic_id + ORDER BY $sql_sort"; + $field = 'topic_id'; + } + + // Only read one block of posts from the db and then cache it + $result = $db->sql_query_limit($sql, $config['search_block_size'], $start); + + while ($row = $db->sql_fetchrow($result)) + { + $id_ary[] = $row[$field]; + } + $db->sql_freeresult($result); + + if (sizeof($id_ary)) + { + $this->save_ids($search_key, '', $author_ary, $result_count, $id_ary, $start, $sort_dir); + $id_ary = array_slice($id_ary, 0, $per_page); + + return $result_count; + } return false; } + + /** + * Updates wordlist and wordmatch tables when a message is posted or changed + * + * @param string $mode contains the post mode: edit, post, reply, quote ... + */ + function index($mode, $post_id, &$message, &$subject) + { + global $config, $db; + + // Is the fulltext indexer disabled? If yes then we need not + // carry on ... it's okay ... I know when I'm not wanted boo hoo + if (!$config['load_search_upd']) + { + return; + } + + // Split old and new post/subject to obtain array of 'words' + $split_text = $this->split_message($message); + $split_title = ($subject) ? $this->split_message($subject) : array(); + $cur_words = array('post' => array(), 'title' => array()); + + $words = array(); + if ($mode == 'edit') + { + $words['add']['post'] = array(); + $words['add']['title'] = array(); + $words['del']['post'] = array(); + $words['del']['title'] = array(); + + $sql = 'SELECT w.word_id, w.word_text, m.title_match + FROM ' . SEARCH_WORD_TABLE . ' w, ' . SEARCH_MATCH_TABLE . " m + WHERE m.post_id = $post_id + AND w.word_id = m.word_id"; + $result = $db->sql_query($sql); + + while ($row = $db->sql_fetchrow($result)) + { + $which = ($row['title_match']) ? 'title' : 'post'; + $cur_words[$which][$row['word_text']] = $row['word_id']; + } + $db->sql_freeresult($result); + + $words['add']['post'] = array_diff($split_text, array_keys($cur_words['post'])); + $words['add']['title'] = array_diff($split_title, array_keys($cur_words['title'])); + $words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text); + $words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title); + } + else + { + $words['add']['post'] = $split_text; + $words['add']['title'] = $split_title; + $words['del']['post'] = array(); + $words['del']['title'] = array(); + } + unset($split_text); + unset($split_title); + + // Get unique words from the above arrays + $unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title'])); + + // We now have unique arrays of all words to be added and removed and + // individual arrays of added and removed words for text and title. What + // we need to do now is add the new words (if they don't already exist) + // and then add (or remove) matches between the words and this post + if (sizeof($unique_add_words)) + { + $sql = 'SELECT word_id, word_text + FROM ' . SEARCH_WORD_TABLE . ' + WHERE word_text IN (' . implode(', ', preg_replace('#^(.*)$#', '\'$1\'', $unique_add_words)) . ')'; + $result = $db->sql_query($sql); + + $word_ids = array(); + while ($row = $db->sql_fetchrow($result)) + { + $word_ids[$row['word_text']] = $row['word_id']; + } + $db->sql_freeresult($result); + + $new_words = array_diff($unique_add_words, array_keys($word_ids)); + + if (sizeof($new_words)) + { + switch (SQL_LAYER) + { + case 'mysql': + $sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text) + VALUES ' . implode(', ', preg_replace('#^(.*)$#', '(\'$1\')', $new_words)); + $db->sql_query($sql); + break; + + case 'mysql4': + case 'mysqli': + case 'mssql': + case 'mssql_odbc': + case 'sqlite': + $sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text) ' . implode(' UNION ALL ', preg_replace('#^(.*)$#', "SELECT '\$1'", $new_words)); + $db->sql_query($sql); + break; + + default: + foreach ($new_words as $word) + { + $sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . " (word_text) + VALUES ('$word')"; + $db->sql_query($sql); + } + break; + } + } + unset($new_words); + } + + // now update the search match table, remove links to removed words and add links to new words + foreach ($words['del'] as $word_in => $word_ary) + { + $title_match = ($word_in == 'title') ? 1 : 0; + + if (sizeof($word_ary)) + { + $sql_in = array(); + foreach ($word_ary as $word) + { + $sql_in[] = $cur_words[$word_in][$word]; + } + + $sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . ' + WHERE word_id IN (' . implode(', ', $sql_in) . ') + AND post_id = ' . intval($post_id) . " + AND title_match = $title_match"; + $db->sql_query($sql); + unset($sql_in); + } + } + + foreach ($words['add'] as $word_in => $word_ary) + { + $title_match = ($word_in == 'title') ? 1 : 0; + + if (sizeof($word_ary)) + { + $sql = 'INSERT INTO ' . SEARCH_MATCH_TABLE . " (post_id, word_id, title_match) + SELECT $post_id, word_id, $title_match + FROM " . SEARCH_WORD_TABLE . ' + WHERE word_text IN (' . implode(', ', preg_replace('#^(.*)$#', '\'$1\'', $word_ary)) . ')'; + $db->sql_query($sql); + } + } + + // destroy cached search results containing any of the words removed or added + $this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['post']))); + + unset($unique_add_words); + unset($words); + unset($cur_words); + } + + /** + * Removes entries from the wordmatch table for the specified post_ids + */ + function index_remove($post_ids) + { + global $db; + + $sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . ' + WHERE post_id IN (' . implode(', ', $post_ids) . ')'; + $db->sql_query($sql); + + // SEARCH_WORD_TABLE will be updated by tidy() + } + + /** + * Tidy up indexes: Tag 'common words' and remove + * words no longer referenced in the match table + */ + function tidy() + { + global $db, $config; + + // Is the fulltext indexer disabled? If yes then we need not + // carry on ... it's okay ... I know when I'm not wanted boo hoo + if (!$config['load_search_upd']) + { + return; + } + + $destroy_cache_words = array(); + + // Remove common (> 50% of posts ) words + if ($config['num_posts'] >= 100) + { + $sql = 'SELECT word_id + FROM ' . SEARCH_MATCH_TABLE . ' + GROUP BY word_id + HAVING COUNT(word_id) > ' . floor($config['num_posts'] * 0.5); + $result = $db->sql_query($sql); + + if ($row = $db->sql_fetchrow($result)) + { + $sql_in = array(); + do + { + $sql_in[] = $row['word_id']; + } + while ($row = $db->sql_fetchrow($result)); + + $destroy_cache_words = $sql_in; + + $sql_in = implode(', ', $sql_in); + + $sql = 'UPDATE ' . SEARCH_WORD_TABLE . " + SET word_common = 1 + WHERE word_id IN ($sql_in)"; + $db->sql_query($sql); + + $sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . " + WHERE word_id IN ($sql_in)"; + $db->sql_query($sql); + unset($sql_in); + } + $db->sql_freeresult($result); + } + + // Remove words with no matches ... this is a potentially nasty query + $sql = 'SELECT w.word_id + FROM ' . SEARCH_WORD_TABLE . ' w + LEFT JOIN ' . SEARCH_MATCH_TABLE . ' m ON (w.word_id = m.word_id) + WHERE w.word_common = 0 AND m.word_id IS NULL + GROUP BY w.word_id'; + $result = $db->sql_query($sql); + + if ($row = $db->sql_fetchrow($result)) + { + $sql_in = array(); + do + { + $sql_in[] = $row['word_id']; + } + while ($row = $db->sql_fetchrow($result)); + + $destroy_cache_words = array_merge($destroy_cache_words, $sql_in); + + $sql = 'DELETE FROM ' . SEARCH_WORD_TABLE . ' + WHERE word_id IN (' . implode(', ', $sql_in) . ')'; + $db->sql_query($sql); + unset($sql_in); + } + $db->sql_freeresult($result); + + // destroy cached search results containing any of the words that are now common or were removed + $this->destroy_cache(array_unique($destroy_cache_words)); + } } ?> \ No newline at end of file -- cgit v1.2.1