aboutsummaryrefslogtreecommitdiffstats
path: root/phpBB/includes/search
diff options
context:
space:
mode:
author: Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> 2006-07-15 15:44:54 +0000
committer: Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> 2006-07-15 15:44:54 +0000
commit: 5f88af1a75b59125cd6675f7a76590cbd4deaaa2 (patch)
tree: d020ce119008ac519a466223ebf127a927c392fa /phpBB/includes/search
parent: 3b4944a476a696bf97cbab18aba527f44729ae85 (diff)
download: forums-5f88af1a75b59125cd6675f7a76590cbd4deaaa2.tar
forums-5f88af1a75b59125cd6675f7a76590cbd4deaaa2.tar.gz
forums-5f88af1a75b59125cd6675f7a76590cbd4deaaa2.tar.bz2
forums-5f88af1a75b59125cd6675f7a76590cbd4deaaa2.tar.xz
forums-5f88af1a75b59125cd6675f7a76590cbd4deaaa2.zip
Added: support for CJK and Hangul into the search engine
git-svn-id: file:///svn/phpbb/trunk@6182 89ea8834-ac86-4346-8a33-228a782c2dd0
Diffstat (limited to 'phpBB/includes/search')
-rw-r--r-- phpBB/includes/search/fulltext_native_improved.php | 78
1 files changed, 42 insertions, 36 deletions
diff --git a/phpBB/includes/search/fulltext_native_improved.php b/phpBB/includes/search/fulltext_native_improved.php
index 96d8be06b9..7ca5dc6a43 100644
--- a/phpBB/includes/search/fulltext_native_improved.php
+++ b/phpBB/includes/search/fulltext_native_improved.php
@@ -865,55 +865,56 @@ class fulltext_native_improved extends search_backend
$isset_min = $min - 1;
/**
+ * Load the UTF tools
+ */
+ if (!function_exists('utf8_strlen'))
+ {
+ include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
+ }
+
+ /**
* Clean up the string, remove HTML tags, remove BBCodes
*/
$word = strtok($this->cleanup(preg_replace($match, ' ', strip_tags($text)), '', $user->lang['ENCODING']), ' ');
while (isset($word[0]))
{
+ if (isset($word[252])
+ || !isset($word[$isset_min]))
+ {
+ /**
+ * Words longer than 252 bytes are ignored. This will have to be
+ * changed whenever we change the length of search_wordlist.word_text
+ *
+ * Words shorter than $isset_min bytes are ignored, too
+ */
+ $word = strtok(' ');
+ continue;
+ }
+
+ $len = utf8_strlen($word);
+
/**
- * We check the length in octets to get an idea of the length
- * in chars. If it greater than or equal to $min and lower than
- * or equal to $max then we can safely assume they are within the
- * char limits
+ * Test whether the word is too short to be indexed.
*
- * Words that take more than 255 bytes are ignored
+ * Note that this limit does NOT apply to CJK and Hangul
*/
- if (isset($word[$isset_min])
- && !isset($word[255]))
+ if ($len < $min)
{
/**
- * This word does not exceed the SQL size, but we don't know
- * yet if its length in chars exceed the admin-defined one
+ * Note: this could be optimized. If the codepoint is lower than Hangul's range
+ * we know that it will also be lower than CJK ranges
*/
- if (!isset($word[$max]))
- {
- /**
- * No chance, its length in bytes is lower than our limit
- * and a single byte can't represent two chars
- */
- $words[] = $word;
- }
- else
+ if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
+ && (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
+ && (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
{
- /**
- * We have to find the length in chars
- */
- if (!function_exists('utf8_strlen'))
- {
- include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
- }
-
- if (utf8_strlen($word) <= $max)
- {
- /**
- * Hurray for us, the word is the right size
- */
- $words[] = $word;
- }
+ $word = strtok(' ');
+ continue;
}
}
+ $words[] = $word;
$word = strtok(' ');
}
@@ -1377,12 +1378,17 @@ class fulltext_native_improved extends search_backend
$utf_char = substr($text, $pos, $utf_len);
$pos += $utf_len;
- if ($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
+ if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
+ || ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
+ || ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
{
/**
- * All characters within this range are valid
+ * All characters within these ranges are valid
+ *
+ * We index all the characters separately and we pad them to make them
+ * long enough to be indexed
*/
- $ret .= $utf_char;
+ $ret .= ' chr' . $utf_char;
continue;
}