diff options
Diffstat (limited to 'phpBB/phpbb/search/fulltext_native.php')
-rw-r--r-- | phpBB/phpbb/search/fulltext_native.php | 208 |
1 files changed, 137 insertions, 71 deletions
diff --git a/phpBB/phpbb/search/fulltext_native.php b/phpBB/phpbb/search/fulltext_native.php index 63b0b24edf..c83de75eed 100644 --- a/phpBB/phpbb/search/fulltext_native.php +++ b/phpBB/phpbb/search/fulltext_native.php @@ -18,6 +18,13 @@ namespace phpbb\search; */ class fulltext_native extends \phpbb\search\base { + const UTF8_HANGUL_FIRST = "\xEA\xB0\x80"; + const UTF8_HANGUL_LAST = "\xED\x9E\xA3"; + const UTF8_CJK_FIRST = "\xE4\xB8\x80"; + const UTF8_CJK_LAST = "\xE9\xBE\xBB"; + const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80"; + const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96"; + /** * Associative array holding index stats * @var array @@ -99,7 +106,7 @@ class fulltext_native extends \phpbb\search\base protected $user; /** - * Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded + * Initialises the fulltext_native search backend with min/max word length * * @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure * @param \phpbb\event\dispatcher_interface $phpbb_dispatcher Event dispatcher object @@ -113,15 +120,11 @@ class fulltext_native extends \phpbb\search\base $this->phpbb_dispatcher = $phpbb_dispatcher; $this->user = $user; - $this->word_length = array('min' => $this->config['fulltext_native_min_chars'], 'max' => $this->config['fulltext_native_max_chars']); + $this->word_length = array('min' => (int) $this->config['fulltext_native_min_chars'], 'max' => (int) $this->config['fulltext_native_max_chars']); /** * Load the UTF tools */ - if (!class_exists('utf_normalizer')) - { - include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext); - } if (!function_exists('utf8_decode_ncr')) { include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext); @@ -187,7 +190,7 @@ class fulltext_native extends \phpbb\search\base */ public function split_keywords($keywords, $terms) { - $tokens = '+-|()*'; + $tokens = '+-|()* '; $keywords = trim($this->cleanup($keywords, $tokens)); @@ -221,12 +224,10 @@ class fulltext_native extends \phpbb\search\base $keywords[$i] = '|'; break; case '*': - if ($i === 0 || ($keywords[$i - 1] !== '*' && strcspn($keywords[$i - 1], $tokens) === 0)) + // $i can never be 0 here since $open_bracket is initialised to false + if (strpos($tokens, $keywords[$i - 1]) !== false && ($i + 1 === $n || strpos($tokens, $keywords[$i + 1]) !== false)) { - if ($i === $n - 1 || ($keywords[$i + 1] !== '*' && strcspn($keywords[$i + 1], $tokens) === 0)) - { - $keywords = substr($keywords, 0, $i) . substr($keywords, $i + 1); - } + $keywords[$i] = '|'; } break; } @@ -261,7 +262,7 @@ class fulltext_native extends \phpbb\search\base } } - if ($open_bracket) + if ($open_bracket !== false) { $keywords .= ')'; } @@ -282,7 +283,7 @@ class fulltext_native extends \phpbb\search\base ); $keywords = preg_replace($match, $replace, $keywords); - $num_keywords = sizeof(explode(' ', $keywords)); + $num_keywords = count(explode(' ', $keywords)); // We limit the number of allowed keywords to minimize load on the database if ($this->config['max_num_search_keywords'] && $num_keywords > $this->config['max_num_search_keywords']) @@ -298,12 +299,26 @@ class fulltext_native extends \phpbb\search\base $words = array(); preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words); - if (sizeof($words[1])) + if (count($words[1])) { $keywords = '(' . implode('|', $words[1]) . ')'; } } + // Remove non trailing wildcards from each word to prevent a full table scan (it's now using the database index) + $match = '#\*(?!$|\s)#'; + $replace = '$1'; + $keywords = preg_replace($match, $replace, $keywords); + + // Only allow one wildcard in the search query to limit the database load + $match = '#\*#'; + $replace = '$1'; + $count_wildcards = substr_count($keywords, '*'); + + // Reverse the string to remove all wildcards except the first one + $keywords = strrev(preg_replace($match, $replace, strrev($keywords), $count_wildcards - 1)); + unset($count_wildcards); + // set the search_query which is shown to the user $this->search_query = $keywords; @@ -313,7 +328,7 @@ class fulltext_native extends \phpbb\search\base $common_ids = $words = array(); - if (sizeof($exact_words)) + if (count($exact_words)) { $sql = 'SELECT word_id, word_text, word_common FROM ' . SEARCH_WORDLIST_TABLE . ' @@ -349,9 +364,6 @@ class fulltext_native extends \phpbb\search\base $this->must_not_contain_ids = array(); $this->must_exclude_one_ids = array(); - $mode = ''; - $ignore_no_id = true; - foreach ($query as $word) { if (empty($word)) @@ -409,8 +421,16 @@ class fulltext_native extends \phpbb\search\base { if (strpos($word_part, '*') !== false) { - $id_words[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word_part)) . '\''; - $non_common_words[] = $word_part; + $len = utf8_strlen(str_replace('*', '', $word_part)); + if ($len >= $this->word_length['min'] && $len <= $this->word_length['max']) + { + $id_words[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word_part)) . '\''; + $non_common_words[] = $word_part; + } + else + { + $this->common_words[] = $word_part; + } } else if (isset($words[$word_part])) { @@ -426,10 +446,10 @@ class fulltext_native extends \phpbb\search\base } } } - if (sizeof($id_words)) + if (count($id_words)) { sort($id_words); - if (sizeof($id_words) > 1) + if (count($id_words) > 1) { $this->{$mode . '_ids'}[] = $id_words; } @@ -440,7 +460,7 @@ class fulltext_native extends \phpbb\search\base } } // throw an error if we shall not ignore unexistant words - else if (!$ignore_no_id && sizeof($non_common_words)) + else if (!$ignore_no_id && count($non_common_words)) { trigger_error(sprintf($this->user->lang['WORDS_IN_NO_POST'], implode($this->user->lang['COMMA_SEPARATOR'], $non_common_words))); } @@ -480,7 +500,7 @@ class fulltext_native extends \phpbb\search\base } // Return true if all words are not common words - if (sizeof($exact_words) - sizeof($this->common_words) > 0) + if (count($exact_words) - count($this->common_words) > 0) { return true; } @@ -594,7 +614,6 @@ class fulltext_native extends \phpbb\search\base $id_ary = array(); $sql_where = array(); - $group_by = false; $m_num = 0; $w_num = 0; @@ -717,7 +736,7 @@ class fulltext_native extends \phpbb\search\base } } - if (sizeof($this->must_not_contain_ids)) + if (count($this->must_not_contain_ids)) { $sql_array['LEFT_JOIN'][] = array( 'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num), @@ -827,7 +846,7 @@ class fulltext_native extends \phpbb\search\base $sql_where[] = 'p.topic_id = ' . $topic_id; } - if (sizeof($author_ary)) + if (count($author_ary)) { if ($author_name) { @@ -841,7 +860,7 @@ class fulltext_native extends \phpbb\search\base $sql_where[] = $sql_author; } - if (sizeof($ex_fid_ary)) + if (count($ex_fid_ary)) { $sql_where[] = $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true); } @@ -879,7 +898,6 @@ class fulltext_native extends \phpbb\search\base break; - case 'sqlite': case 'sqlite3': $sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id'; $sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results @@ -1012,7 +1030,7 @@ class fulltext_native extends \phpbb\search\base public function author_search($type, $firstpost_only, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $post_visibility, $topic_id, $author_ary, $author_name, &$id_ary, &$start, $per_page) { // No author? No posts - if (!sizeof($author_ary)) + if (!count($author_ary)) { return 0; } @@ -1084,7 +1102,7 @@ class fulltext_native extends \phpbb\search\base { $sql_author = $this->db->sql_in_set('p.poster_id', $author_ary); } - $sql_fora = (sizeof($ex_fid_ary)) ? ' AND ' . $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true) : ''; + $sql_fora = (count($ex_fid_ary)) ? ' AND ' . $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true) : ''; $sql_time = ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : ''; $sql_topic_id = ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : ''; $sql_firstpost = ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : ''; @@ -1186,7 +1204,7 @@ class fulltext_native extends \phpbb\search\base } else { - if ($this->db->get_sql_layer() == 'sqlite' || $this->db->get_sql_layer() == 'sqlite3') + if ($this->db->get_sql_layer() == 'sqlite3') { $sql = 'SELECT COUNT(topic_id) as total_results FROM (SELECT DISTINCT t.topic_id'; @@ -1203,7 +1221,7 @@ class fulltext_native extends \phpbb\search\base $post_visibility $sql_fora AND t.topic_id = p.topic_id - $sql_time" . (($this->db->get_sql_layer() == 'sqlite' || $this->db->get_sql_layer() == 'sqlite3') ? ')' : ''); + $sql_time" . ($this->db->get_sql_layer() == 'sqlite3' ? ')' : ''); } $result = $this->db->sql_query($sql); @@ -1291,7 +1309,7 @@ class fulltext_native extends \phpbb\search\base $this->db->sql_freeresult($result); } - if (sizeof($id_ary)) + if (count($id_ary)) { $this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir); $id_ary = array_slice($id_ary, 0, $per_page); @@ -1325,7 +1343,6 @@ class fulltext_native extends \phpbb\search\base $match[] = '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#'; $min = $this->word_length['min']; - $max = $this->word_length['max']; $isset_min = $min - 1; @@ -1361,9 +1378,9 @@ class fulltext_native extends \phpbb\search\base * Note: this could be optimized. If the codepoint is lower than Hangul's range * we know that it will also be lower than CJK ranges */ - if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0) - && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0) - && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0)) + if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0) + && (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0) + && (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0)) { $word = strtok(' '); continue; @@ -1436,6 +1453,38 @@ class fulltext_native extends \phpbb\search\base $words['del']['post'] = array(); $words['del']['title'] = array(); } + + /** + * Event to modify method arguments and words before the native search index is updated + * + * @event core.search_native_index_before + * @var string mode Contains the post mode: edit, post, reply, quote + * @var int post_id The id of the post which is modified/created + * @var string message New or updated post content + * @var string subject New or updated post subject + * @var int poster_id Post author's user id + * @var int forum_id The id of the forum in which the post is located + * @var array words Grouped lists of words added to or remove from the index + * @var array split_text Array of words from the message + * @var array split_title Array of words from the title + * @var array cur_words Array of words currently in the index for comparing to new words + * when mode is edit. Empty for other modes. + * @since 3.2.3-RC1 + */ + $vars = array( + 'mode', + 'post_id', + 'message', + 'subject', + 'poster_id', + 'forum_id', + 'words', + 'split_text', + 'split_title', + 'cur_words', + ); + extract($this->phpbb_dispatcher->trigger_event('core.search_native_index_before', compact($vars))); + unset($split_text); unset($split_title); @@ -1446,7 +1495,7 @@ class fulltext_native extends \phpbb\search\base // individual arrays of added and removed words for text and title. What // we need to do now is add the new words (if they don't already exist) // and then add (or remove) matches between the words and this post - if (sizeof($unique_add_words)) + if (count($unique_add_words)) { $sql = 'SELECT word_id, word_text FROM ' . SEARCH_WORDLIST_TABLE . ' @@ -1462,7 +1511,7 @@ class fulltext_native extends \phpbb\search\base $new_words = array_diff($unique_add_words, array_keys($word_ids)); $this->db->sql_transaction('begin'); - if (sizeof($new_words)) + if (count($new_words)) { $sql_ary = array(); @@ -1486,7 +1535,7 @@ class fulltext_native extends \phpbb\search\base { $title_match = ($word_in == 'title') ? 1 : 0; - if (sizeof($word_ary)) + if (count($word_ary)) { $sql_in = array(); foreach ($word_ary as $word) @@ -1515,7 +1564,7 @@ class fulltext_native extends \phpbb\search\base { $title_match = ($word_in == 'title') ? 1 : 0; - if (sizeof($word_ary)) + if (count($word_ary)) { $sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match) SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . ' @@ -1546,7 +1595,7 @@ class fulltext_native extends \phpbb\search\base */ public function index_remove($post_ids, $author_ids, $forum_ids) { - if (sizeof($post_ids)) + if (count($post_ids)) { $sql = 'SELECT w.word_id, w.word_text, m.title_match FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w @@ -1569,7 +1618,7 @@ class fulltext_native extends \phpbb\search\base } $this->db->sql_freeresult($result); - if (sizeof($title_word_ids)) + if (count($title_word_ids)) { $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . ' SET word_count = word_count - 1 @@ -1578,7 +1627,7 @@ class fulltext_native extends \phpbb\search\base $this->db->sql_query($sql); } - if (sizeof($message_word_ids)) + if (count($message_word_ids)) { $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . ' SET word_count = word_count - 1 @@ -1608,7 +1657,7 @@ class fulltext_native extends \phpbb\search\base // carry on ... it's okay ... I know when I'm not wanted boo hoo if (!$this->config['fulltext_native_load_upd']) { - set_config('search_last_gc', time(), true); + $this->config->set('search_last_gc', time(), false); return; } @@ -1633,7 +1682,7 @@ class fulltext_native extends \phpbb\search\base } $this->db->sql_freeresult($result); - if (sizeof($sql_in)) + if (count($sql_in)) { // Flag the words $sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . ' @@ -1643,7 +1692,7 @@ class fulltext_native extends \phpbb\search\base // by setting search_last_gc to the new time here we make sure that if a user reloads because the // following query takes too long, he won't run into it again - set_config('search_last_gc', time(), true); + $this->config->set('search_last_gc', time(), false); // Delete the matches $sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . ' @@ -1653,13 +1702,13 @@ class fulltext_native extends \phpbb\search\base unset($sql_in); } - if (sizeof($destroy_cache_words)) + if (count($destroy_cache_words)) { // destroy cached search results containing any of the words that are now common or were removed $this->destroy_cache(array_unique($destroy_cache_words)); } - set_config('search_last_gc', time(), true); + $this->config->set('search_last_gc', time(), false); } /** @@ -1667,21 +1716,43 @@ class fulltext_native extends \phpbb\search\base */ public function delete_index($acp_module, $u_action) { + $sql_queries = []; + switch ($this->db->get_sql_layer()) { - case 'sqlite': case 'sqlite3': - $this->db->sql_query('DELETE FROM ' . SEARCH_WORDLIST_TABLE); - $this->db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE); - $this->db->sql_query('DELETE FROM ' . SEARCH_RESULTS_TABLE); + $sql_queries[] = 'DELETE FROM ' . SEARCH_WORDLIST_TABLE; + $sql_queries[] = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE; + $sql_queries[] = 'DELETE FROM ' . SEARCH_RESULTS_TABLE; break; default: - $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE); - $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE); - $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE); + $sql_queries[] = 'TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE; + $sql_queries[] = 'TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE; + $sql_queries[] = 'TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE; break; } + + $stats = $this->stats; + + /** + * Event to modify SQL queries before the native search index is deleted + * + * @event core.search_native_delete_index_before + * @var array sql_queries Array with queries for deleting the search index + * @var array stats Array with statistics of the current index (read only) + * @since 3.2.3-RC1 + */ + $vars = array( + 'sql_queries', + 'stats', + ); + extract($this->phpbb_dispatcher->trigger_event('core.search_native_delete_index_before', compact($vars))); + + foreach ($sql_queries as $sql_query) + { + $this->db->sql_query($sql_query); + } } /** @@ -1689,7 +1760,7 @@ class fulltext_native extends \phpbb\search\base */ public function index_created() { - if (!sizeof($this->stats)) + if (!count($this->stats)) { $this->get_stats(); } @@ -1702,7 +1773,7 @@ class fulltext_native extends \phpbb\search\base */ public function index_stats() { - if (!sizeof($this->stats)) + if (!count($this->stats)) { $this->get_stats(); } @@ -1730,13 +1801,11 @@ class fulltext_native extends \phpbb\search\base * @param string $allowed_chars String of special chars to allow * @param string $encoding Text encoding * @return string Cleaned up text, only alphanumeric chars are left - * - * @todo \normalizer::cleanup being able to be used? */ protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8') { static $conv = array(), $conv_loaded = array(); - $words = $allow = array(); + $allow = array(); // Convert the text to UTF-8 $encoding = strtolower($encoding); @@ -1758,12 +1827,9 @@ class fulltext_native extends \phpbb\search\base $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES); /** - * Load the UTF-8 normalizer - * - * If we use it more widely, an instance of that class should be held in a - * a global variable instead + * Normalize to NFC */ - \utf_normalizer::nfc($text); + $text = \Normalizer::normalize($text); /** * The first thing we do is: @@ -1856,9 +1922,9 @@ class fulltext_native extends \phpbb\search\base $utf_char = substr($text, $pos, $utf_len); $pos += $utf_len; - if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST) - || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST) - || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST)) + if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST) + || ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST) + || ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST)) { /** * All characters within these ranges are valid |