aboutsummaryrefslogtreecommitdiffstats
path: root/phpBB/phpbb/search/fulltext_native.php
diff options
context:
space:
mode:
Diffstat (limited to 'phpBB/phpbb/search/fulltext_native.php')
-rw-r--r--phpBB/phpbb/search/fulltext_native.php208
1 files changed, 137 insertions, 71 deletions
diff --git a/phpBB/phpbb/search/fulltext_native.php b/phpBB/phpbb/search/fulltext_native.php
index 63b0b24edf..c83de75eed 100644
--- a/phpBB/phpbb/search/fulltext_native.php
+++ b/phpBB/phpbb/search/fulltext_native.php
@@ -18,6 +18,13 @@ namespace phpbb\search;
*/
class fulltext_native extends \phpbb\search\base
{
+ const UTF8_HANGUL_FIRST = "\xEA\xB0\x80";
+ const UTF8_HANGUL_LAST = "\xED\x9E\xA3";
+ const UTF8_CJK_FIRST = "\xE4\xB8\x80";
+ const UTF8_CJK_LAST = "\xE9\xBE\xBB";
+ const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80";
+ const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96";
+
/**
* Associative array holding index stats
* @var array
@@ -99,7 +106,7 @@ class fulltext_native extends \phpbb\search\base
protected $user;
/**
- * Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded
+ * Initialises the fulltext_native search backend with min/max word length
*
* @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure
* @param \phpbb\event\dispatcher_interface $phpbb_dispatcher Event dispatcher object
@@ -113,15 +120,11 @@ class fulltext_native extends \phpbb\search\base
$this->phpbb_dispatcher = $phpbb_dispatcher;
$this->user = $user;
- $this->word_length = array('min' => $this->config['fulltext_native_min_chars'], 'max' => $this->config['fulltext_native_max_chars']);
+ $this->word_length = array('min' => (int) $this->config['fulltext_native_min_chars'], 'max' => (int) $this->config['fulltext_native_max_chars']);
/**
* Load the UTF tools
*/
- if (!class_exists('utf_normalizer'))
- {
- include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext);
- }
if (!function_exists('utf8_decode_ncr'))
{
include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
@@ -187,7 +190,7 @@ class fulltext_native extends \phpbb\search\base
*/
public function split_keywords($keywords, $terms)
{
- $tokens = '+-|()*';
+ $tokens = '+-|()* ';
$keywords = trim($this->cleanup($keywords, $tokens));
@@ -221,12 +224,10 @@ class fulltext_native extends \phpbb\search\base
$keywords[$i] = '|';
break;
case '*':
- if ($i === 0 || ($keywords[$i - 1] !== '*' && strcspn($keywords[$i - 1], $tokens) === 0))
+ // $i can never be 0 here since $open_bracket is initialised to false
+ if (strpos($tokens, $keywords[$i - 1]) !== false && ($i + 1 === $n || strpos($tokens, $keywords[$i + 1]) !== false))
{
- if ($i === $n - 1 || ($keywords[$i + 1] !== '*' && strcspn($keywords[$i + 1], $tokens) === 0))
- {
- $keywords = substr($keywords, 0, $i) . substr($keywords, $i + 1);
- }
+ $keywords[$i] = '|';
}
break;
}
@@ -261,7 +262,7 @@ class fulltext_native extends \phpbb\search\base
}
}
- if ($open_bracket)
+ if ($open_bracket !== false)
{
$keywords .= ')';
}
@@ -282,7 +283,7 @@ class fulltext_native extends \phpbb\search\base
);
$keywords = preg_replace($match, $replace, $keywords);
- $num_keywords = sizeof(explode(' ', $keywords));
+ $num_keywords = count(explode(' ', $keywords));
// We limit the number of allowed keywords to minimize load on the database
if ($this->config['max_num_search_keywords'] && $num_keywords > $this->config['max_num_search_keywords'])
@@ -298,12 +299,26 @@ class fulltext_native extends \phpbb\search\base
$words = array();
preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words);
- if (sizeof($words[1]))
+ if (count($words[1]))
{
$keywords = '(' . implode('|', $words[1]) . ')';
}
}
+ // Remove non-trailing wildcards from each word to prevent a full table scan (the query can then use the database index)
+ $match = '#\*(?!$|\s)#';
+ $replace = '$1';
+ $keywords = preg_replace($match, $replace, $keywords);
+
+ // Only allow one wildcard in the search query to limit the database load
+ $match = '#\*#';
+ $replace = '$1';
+ $count_wildcards = substr_count($keywords, '*');
+
+ // Reverse the string to remove all wildcards except the first one
+ $keywords = strrev(preg_replace($match, $replace, strrev($keywords), $count_wildcards - 1));
+ unset($count_wildcards);
+
// set the search_query which is shown to the user
$this->search_query = $keywords;
@@ -313,7 +328,7 @@ class fulltext_native extends \phpbb\search\base
$common_ids = $words = array();
- if (sizeof($exact_words))
+ if (count($exact_words))
{
$sql = 'SELECT word_id, word_text, word_common
FROM ' . SEARCH_WORDLIST_TABLE . '
@@ -349,9 +364,6 @@ class fulltext_native extends \phpbb\search\base
$this->must_not_contain_ids = array();
$this->must_exclude_one_ids = array();
- $mode = '';
- $ignore_no_id = true;
-
foreach ($query as $word)
{
if (empty($word))
@@ -409,8 +421,16 @@ class fulltext_native extends \phpbb\search\base
{
if (strpos($word_part, '*') !== false)
{
- $id_words[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
- $non_common_words[] = $word_part;
+ $len = utf8_strlen(str_replace('*', '', $word_part));
+ if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
+ {
+ $id_words[] = '\'' . $this->db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
+ $non_common_words[] = $word_part;
+ }
+ else
+ {
+ $this->common_words[] = $word_part;
+ }
}
else if (isset($words[$word_part]))
{
@@ -426,10 +446,10 @@ class fulltext_native extends \phpbb\search\base
}
}
}
- if (sizeof($id_words))
+ if (count($id_words))
{
sort($id_words);
- if (sizeof($id_words) > 1)
+ if (count($id_words) > 1)
{
$this->{$mode . '_ids'}[] = $id_words;
}
@@ -440,7 +460,7 @@ class fulltext_native extends \phpbb\search\base
}
}
// throw an error if we shall not ignore unexistant words
- else if (!$ignore_no_id && sizeof($non_common_words))
+ else if (!$ignore_no_id && count($non_common_words))
{
trigger_error(sprintf($this->user->lang['WORDS_IN_NO_POST'], implode($this->user->lang['COMMA_SEPARATOR'], $non_common_words)));
}
@@ -480,7 +500,7 @@ class fulltext_native extends \phpbb\search\base
}
// Return true if all words are not common words
- if (sizeof($exact_words) - sizeof($this->common_words) > 0)
+ if (count($exact_words) - count($this->common_words) > 0)
{
return true;
}
@@ -594,7 +614,6 @@ class fulltext_native extends \phpbb\search\base
$id_ary = array();
$sql_where = array();
- $group_by = false;
$m_num = 0;
$w_num = 0;
@@ -717,7 +736,7 @@ class fulltext_native extends \phpbb\search\base
}
}
- if (sizeof($this->must_not_contain_ids))
+ if (count($this->must_not_contain_ids))
{
$sql_array['LEFT_JOIN'][] = array(
'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
@@ -827,7 +846,7 @@ class fulltext_native extends \phpbb\search\base
$sql_where[] = 'p.topic_id = ' . $topic_id;
}
- if (sizeof($author_ary))
+ if (count($author_ary))
{
if ($author_name)
{
@@ -841,7 +860,7 @@ class fulltext_native extends \phpbb\search\base
$sql_where[] = $sql_author;
}
- if (sizeof($ex_fid_ary))
+ if (count($ex_fid_ary))
{
$sql_where[] = $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true);
}
@@ -879,7 +898,6 @@ class fulltext_native extends \phpbb\search\base
break;
- case 'sqlite':
case 'sqlite3':
$sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id';
$sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results
@@ -1012,7 +1030,7 @@ class fulltext_native extends \phpbb\search\base
public function author_search($type, $firstpost_only, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $post_visibility, $topic_id, $author_ary, $author_name, &$id_ary, &$start, $per_page)
{
// No author? No posts
- if (!sizeof($author_ary))
+ if (!count($author_ary))
{
return 0;
}
@@ -1084,7 +1102,7 @@ class fulltext_native extends \phpbb\search\base
{
$sql_author = $this->db->sql_in_set('p.poster_id', $author_ary);
}
- $sql_fora = (sizeof($ex_fid_ary)) ? ' AND ' . $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true) : '';
+ $sql_fora = (count($ex_fid_ary)) ? ' AND ' . $this->db->sql_in_set('p.forum_id', $ex_fid_ary, true) : '';
$sql_time = ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : '';
$sql_topic_id = ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : '';
$sql_firstpost = ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : '';
@@ -1186,7 +1204,7 @@ class fulltext_native extends \phpbb\search\base
}
else
{
- if ($this->db->get_sql_layer() == 'sqlite' || $this->db->get_sql_layer() == 'sqlite3')
+ if ($this->db->get_sql_layer() == 'sqlite3')
{
$sql = 'SELECT COUNT(topic_id) as total_results
FROM (SELECT DISTINCT t.topic_id';
@@ -1203,7 +1221,7 @@ class fulltext_native extends \phpbb\search\base
$post_visibility
$sql_fora
AND t.topic_id = p.topic_id
- $sql_time" . (($this->db->get_sql_layer() == 'sqlite' || $this->db->get_sql_layer() == 'sqlite3') ? ')' : '');
+ $sql_time" . ($this->db->get_sql_layer() == 'sqlite3' ? ')' : '');
}
$result = $this->db->sql_query($sql);
@@ -1291,7 +1309,7 @@ class fulltext_native extends \phpbb\search\base
$this->db->sql_freeresult($result);
}
- if (sizeof($id_ary))
+ if (count($id_ary))
{
$this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir);
$id_ary = array_slice($id_ary, 0, $per_page);
@@ -1325,7 +1343,6 @@ class fulltext_native extends \phpbb\search\base
$match[] = '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#';
$min = $this->word_length['min'];
- $max = $this->word_length['max'];
$isset_min = $min - 1;
@@ -1361,9 +1378,9 @@ class fulltext_native extends \phpbb\search\base
* Note: this could be optimized. If the codepoint is lower than Hangul's range
* we know that it will also be lower than CJK ranges
*/
- if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
- && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
- && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
+ if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0)
+ && (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0)
+ && (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0))
{
$word = strtok(' ');
continue;
@@ -1436,6 +1453,38 @@ class fulltext_native extends \phpbb\search\base
$words['del']['post'] = array();
$words['del']['title'] = array();
}
+
+ /**
+ * Event to modify method arguments and words before the native search index is updated
+ *
+ * @event core.search_native_index_before
+ * @var string mode Contains the post mode: edit, post, reply, quote
+ * @var int post_id The id of the post which is modified/created
+ * @var string message New or updated post content
+ * @var string subject New or updated post subject
+ * @var int poster_id Post author's user id
+ * @var int forum_id The id of the forum in which the post is located
+ * @var array words Grouped lists of words added to or removed from the index
+ * @var array split_text Array of words from the message
+ * @var array split_title Array of words from the title
+ * @var array cur_words Array of words currently in the index for comparing to new words
+ * when mode is edit. Empty for other modes.
+ * @since 3.2.3-RC1
+ */
+ $vars = array(
+ 'mode',
+ 'post_id',
+ 'message',
+ 'subject',
+ 'poster_id',
+ 'forum_id',
+ 'words',
+ 'split_text',
+ 'split_title',
+ 'cur_words',
+ );
+ extract($this->phpbb_dispatcher->trigger_event('core.search_native_index_before', compact($vars)));
+
unset($split_text);
unset($split_title);
@@ -1446,7 +1495,7 @@ class fulltext_native extends \phpbb\search\base
// individual arrays of added and removed words for text and title. What
// we need to do now is add the new words (if they don't already exist)
// and then add (or remove) matches between the words and this post
- if (sizeof($unique_add_words))
+ if (count($unique_add_words))
{
$sql = 'SELECT word_id, word_text
FROM ' . SEARCH_WORDLIST_TABLE . '
@@ -1462,7 +1511,7 @@ class fulltext_native extends \phpbb\search\base
$new_words = array_diff($unique_add_words, array_keys($word_ids));
$this->db->sql_transaction('begin');
- if (sizeof($new_words))
+ if (count($new_words))
{
$sql_ary = array();
@@ -1486,7 +1535,7 @@ class fulltext_native extends \phpbb\search\base
{
$title_match = ($word_in == 'title') ? 1 : 0;
- if (sizeof($word_ary))
+ if (count($word_ary))
{
$sql_in = array();
foreach ($word_ary as $word)
@@ -1515,7 +1564,7 @@ class fulltext_native extends \phpbb\search\base
{
$title_match = ($word_in == 'title') ? 1 : 0;
- if (sizeof($word_ary))
+ if (count($word_ary))
{
$sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match)
SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . '
@@ -1546,7 +1595,7 @@ class fulltext_native extends \phpbb\search\base
*/
public function index_remove($post_ids, $author_ids, $forum_ids)
{
- if (sizeof($post_ids))
+ if (count($post_ids))
{
$sql = 'SELECT w.word_id, w.word_text, m.title_match
FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w
@@ -1569,7 +1618,7 @@ class fulltext_native extends \phpbb\search\base
}
$this->db->sql_freeresult($result);
- if (sizeof($title_word_ids))
+ if (count($title_word_ids))
{
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
SET word_count = word_count - 1
@@ -1578,7 +1627,7 @@ class fulltext_native extends \phpbb\search\base
$this->db->sql_query($sql);
}
- if (sizeof($message_word_ids))
+ if (count($message_word_ids))
{
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
SET word_count = word_count - 1
@@ -1608,7 +1657,7 @@ class fulltext_native extends \phpbb\search\base
// carry on ... it's okay ... I know when I'm not wanted boo hoo
if (!$this->config['fulltext_native_load_upd'])
{
- set_config('search_last_gc', time(), true);
+ $this->config->set('search_last_gc', time(), false);
return;
}
@@ -1633,7 +1682,7 @@ class fulltext_native extends \phpbb\search\base
}
$this->db->sql_freeresult($result);
- if (sizeof($sql_in))
+ if (count($sql_in))
{
// Flag the words
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
@@ -1643,7 +1692,7 @@ class fulltext_native extends \phpbb\search\base
// by setting search_last_gc to the new time here we make sure that if a user reloads because the
// following query takes too long, he won't run into it again
- set_config('search_last_gc', time(), true);
+ $this->config->set('search_last_gc', time(), false);
// Delete the matches
$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
@@ -1653,13 +1702,13 @@ class fulltext_native extends \phpbb\search\base
unset($sql_in);
}
- if (sizeof($destroy_cache_words))
+ if (count($destroy_cache_words))
{
// destroy cached search results containing any of the words that are now common or were removed
$this->destroy_cache(array_unique($destroy_cache_words));
}
- set_config('search_last_gc', time(), true);
+ $this->config->set('search_last_gc', time(), false);
}
/**
@@ -1667,21 +1716,43 @@ class fulltext_native extends \phpbb\search\base
*/
public function delete_index($acp_module, $u_action)
{
+ $sql_queries = [];
+
switch ($this->db->get_sql_layer())
{
- case 'sqlite':
case 'sqlite3':
- $this->db->sql_query('DELETE FROM ' . SEARCH_WORDLIST_TABLE);
- $this->db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE);
- $this->db->sql_query('DELETE FROM ' . SEARCH_RESULTS_TABLE);
+ $sql_queries[] = 'DELETE FROM ' . SEARCH_WORDLIST_TABLE;
+ $sql_queries[] = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE;
+ $sql_queries[] = 'DELETE FROM ' . SEARCH_RESULTS_TABLE;
break;
default:
- $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE);
- $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE);
- $this->db->sql_query('TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE);
+ $sql_queries[] = 'TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE;
+ $sql_queries[] = 'TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE;
+ $sql_queries[] = 'TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE;
break;
}
+
+ $stats = $this->stats;
+
+ /**
+ * Event to modify SQL queries before the native search index is deleted
+ *
+ * @event core.search_native_delete_index_before
+ * @var array sql_queries Array with queries for deleting the search index
+ * @var array stats Array with statistics of the current index (read only)
+ * @since 3.2.3-RC1
+ */
+ $vars = array(
+ 'sql_queries',
+ 'stats',
+ );
+ extract($this->phpbb_dispatcher->trigger_event('core.search_native_delete_index_before', compact($vars)));
+
+ foreach ($sql_queries as $sql_query)
+ {
+ $this->db->sql_query($sql_query);
+ }
}
/**
@@ -1689,7 +1760,7 @@ class fulltext_native extends \phpbb\search\base
*/
public function index_created()
{
- if (!sizeof($this->stats))
+ if (!count($this->stats))
{
$this->get_stats();
}
@@ -1702,7 +1773,7 @@ class fulltext_native extends \phpbb\search\base
*/
public function index_stats()
{
- if (!sizeof($this->stats))
+ if (!count($this->stats))
{
$this->get_stats();
}
@@ -1730,13 +1801,11 @@ class fulltext_native extends \phpbb\search\base
* @param string $allowed_chars String of special chars to allow
* @param string $encoding Text encoding
* @return string Cleaned up text, only alphanumeric chars are left
- *
- * @todo \normalizer::cleanup being able to be used?
*/
protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
{
static $conv = array(), $conv_loaded = array();
- $words = $allow = array();
+ $allow = array();
// Convert the text to UTF-8
$encoding = strtolower($encoding);
@@ -1758,12 +1827,9 @@ class fulltext_native extends \phpbb\search\base
$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
/**
- * Load the UTF-8 normalizer
- *
- * If we use it more widely, an instance of that class should be held in a
- * a global variable instead
+ * Normalize to NFC
*/
- \utf_normalizer::nfc($text);
+ $text = \Normalizer::normalize($text);
/**
* The first thing we do is:
@@ -1856,9 +1922,9 @@ class fulltext_native extends \phpbb\search\base
$utf_char = substr($text, $pos, $utf_len);
$pos += $utf_len;
- if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
- || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
- || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
+ if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST)
+ || ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST)
+ || ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST))
{
/**
* All characters within these ranges are valid