diff options
Diffstat (limited to 'phpBB/includes')
-rw-r--r-- | phpBB/includes/search/fulltext_native_improved.php | 78 | ||||
-rw-r--r-- | phpBB/includes/utf/utf_normalizer.php | 6 |
2 files changed, 48 insertions, 36 deletions
diff --git a/phpBB/includes/search/fulltext_native_improved.php b/phpBB/includes/search/fulltext_native_improved.php index 96d8be06b9..7ca5dc6a43 100644 --- a/phpBB/includes/search/fulltext_native_improved.php +++ b/phpBB/includes/search/fulltext_native_improved.php @@ -865,55 +865,56 @@ class fulltext_native_improved extends search_backend $isset_min = $min - 1; /** + * Load the UTF tools + */ + if (!function_exists('utf8_strlen')) + { + include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx); + } + + /** * Clean up the string, remove HTML tags, remove BBCodes */ $word = strtok($this->cleanup(preg_replace($match, ' ', strip_tags($text)), '', $user->lang['ENCODING']), ' '); while (isset($word[0])) { + if (isset($word[252]) + || !isset($word[$isset_min])) + { + /** + * Words longer than 252 bytes are ignored. This will have to be + * changed whenever we change the length of search_wordlist.word_text + * + * Words shorter than $isset_min bytes are ignored, too + */ + $word = strtok(' '); + continue; + } + + $len = utf8_strlen($word); + /** - * We check the length in octets to get an idea of the length - * in chars. If it greater than or equal to $min and lower than - * or equal to $max then we can safely assume they are within the - * char limits + * Test whether the word is too short to be indexed. * - * Words that take more than 255 bytes are ignored + * Note that this limit does NOT apply to CJK and Hangul */ - if (isset($word[$isset_min]) - && !isset($word[255])) + if ($len < $min) { /** - * This word does not exceed the SQL size, but we don't know - * yet if its length in chars exceed the admin-defined one + * Note: this could be optimized. If the codepoint is lower than Hangul's range + * we know that it will also be lower than CJK ranges */ - if (!isset($word[$max])) - { - /** - * No chance, its length in bytes is lower than our limit - * and a single byte can't represent two chars - */ - $words[] = $word; - } - else + if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0) + && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0) + && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0)) { - /** - * We have to find the length in chars - */ - if (!function_exists('utf8_strlen')) - { - include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx); - } - - if (utf8_strlen($word) <= $max) - { - /** - * Hurray for us, the word is the right size - */ - $words[] = $word; - } + $word = strtok(' '); + continue; } } + $words[] = $word; $word = strtok(' '); } @@ -1377,12 +1378,17 @@ class fulltext_native_improved extends search_backend $utf_char = substr($text, $pos, $utf_len); $pos += $utf_len; - if ($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST) + if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST) + || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST) + || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST)) { /** - * All characters within this range are valid + * All characters within these ranges are valid + * + * We index all the characters separately and we pad them to make them + * long enough to be indexed */ - $ret .= $utf_char; + $ret .= ' chr' . $utf_char; continue; } diff --git a/phpBB/includes/utf/utf_normalizer.php b/phpBB/includes/utf/utf_normalizer.php index c985337c64..0b567fad6b 100644 --- a/phpBB/includes/utf/utf_normalizer.php +++ b/phpBB/includes/utf/utf_normalizer.php @@ -22,6 +22,12 @@ define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF"); define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80"); define('UTF8_HANGUL_LAST', "\xED\x9E\xA3"); +define('UTF8_CJK_FIRST', "\xE4\xB8\x80"); +define('UTF8_CJK_LAST', "\xE9\xBE\xBB"); +define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80"); +define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96"); + + if (function_exists('utf8_normalize')) { |