diff options
Diffstat (limited to 'phpBB/includes/message_parser.php')
-rw-r--r-- | phpBB/includes/message_parser.php | 321 |
1 files changed, 0 insertions, 321 deletions
diff --git a/phpBB/includes/message_parser.php b/phpBB/includes/message_parser.php index 1aad064c07..cb7b898d12 100644 --- a/phpBB/includes/message_parser.php +++ b/phpBB/includes/message_parser.php @@ -1212,325 +1212,4 @@ class parse_message extends bbcode_firstpass } } -/** -* @package phpBB3 -* Parses a given message and updates/maintains the fulltext tables -* @todo replace fulltext_search in message_parser with search modules -*/ -class fulltext_search -{ - function split_words($mode, $text) - { - global $user, $config; - - static $drop_char_match, $drop_char_replace, $stopwords, $replace_synonym, $match_synonym; - - // Is the fulltext indexer disabled? If yes then we need not - // carry on ... it's okay ... I know when I'm not wanted boo hoo - if (!$config['load_search_upd']) - { - return; - } - - if (!is_array($drop_char_match)) - { - $drop_char_match = array('-', '^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!', '*'); - $drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' '); - - if ($fp = @fopen($user->lang_path . '/search_stopwords.txt', 'rb')) - { - $stopwords = explode("\n", str_replace("\r\n", "\n", fread($fp, filesize($user->lang_path . '/search_stopwords.txt')))); - } - fclose($fp); - - if ($fp = @fopen($user->lang_path . '/search_synonyms.txt', 'rb')) - { - preg_match_all('#^(.*?) (.*?)$#ms', fread($fp, filesize($user->lang_path . '/search_synonyms.txt')), $match); - $replace_synonym = &$match[1]; - $match_synonym = &$match[2]; - } - fclose($fp); - } - - $match = array(); - // Comments for hardcoded bbcode elements (urls, smilies, html) - $match[] = '#<!\-\- .* \-\->(.*?)<!\-\- .* \-\->#is'; - // New lines, carriage returns - $match[] = "#[\n\r]+#"; - // NCRs like etc. - $match[] = '#(&|&)[\#a-z0-9]+?;#i'; - // Do not index code - $match[] = '#\[code=?.*?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is'; - // BBcode - $match[] = '#\[\/?[a-z\*\+\-]+=?.*?(\:?[0-9a-z]{5,})\]#'; - // Sequences > min_search_chars & < max_search_chars -// $match[] = '#\s([\b]{1,' . $config['min_search_chars'] . '}|[\b]{' . $config['max_search_chars'] . ',})\s#is'; -// $match[] = '#\s((&\#[0-9]+;){1,' . $config['min_search_chars'] . '}|(&\#[0-9]+;){' . $config['max_search_chars'] . ',})\s#is'; - // Filter out ; and # but not &#[0-9]+; -// $match[] = '#(&\#[0-9]+;)|;|\#|&#'; - - $text = preg_replace($match, ' ', ' ' . strtolower(trim($text)) . ' '); - $text = str_replace(array(' + ', ' - '), array(' and ', ' not '), $text); - - // Filter out non-alphabetical chars - $text = str_replace($drop_char_match, $drop_char_replace, $text); - - // Split words - $text = explode(' ', preg_replace('#\s+#', ' ', trim($text))); - - if (sizeof($stopwords)) - { - $stopped_words = array_intersect($text, $stopwords); - $text = array_diff($text, $stopwords); - } - - if (sizeof($replace_synonym)) - { - $text = str_replace($replace_synonym, $match_synonym, $text); - } - - foreach ($text as $index => $word) - { - if (strlen($word) < $config['min_search_chars'] || strlen($word) > $config['max_search_chars']) - { - unset($text[$index]); - } - } - - return $text; - } - - function add($mode, $post_id, &$message, &$subject) - { - global $config, $db; - - // Is the fulltext indexer disabled? If yes then we need not - // carry on ... it's okay ... I know when I'm not wanted boo hoo - if (!$config['load_search_upd']) - { - return; - } - - // Split old and new post/subject to obtain array of 'words' - $split_text = $this->split_words('post', $message); - $split_title = ($subject) ? $this->split_words('post', $subject) : array(); - - $words = array(); - if ($mode == 'edit') - { - $words['add']['post'] = array(); - $words['add']['title'] = array(); - $words['del']['post'] = array(); - $words['del']['title'] = array(); - - $sql = 'SELECT w.word_id, w.word_text, m.title_match - FROM ' . SEARCH_WORD_TABLE . ' w, ' . SEARCH_MATCH_TABLE . " m - WHERE m.post_id = $post_id - AND w.word_id = m.word_id"; - $result = $db->sql_query($sql); - - $cur_words = array(); - while ($row = $db->sql_fetchrow($result)) - { - $which = ($row['title_match']) ? 'title' : 'post'; - $cur_words[$which][$row['word_text']] = $row['word_id']; - } - $db->sql_freeresult($result); - - $words['add']['post'] = array_diff($split_text, array_keys($cur_words['post'])); - $words['add']['title'] = array_diff($split_title, array_keys($cur_words['title'])); - $words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text); - $words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title); - } - else - { - $words['add']['post'] = $split_text; - $words['add']['title'] = $split_title; - $words['del']['post'] = array(); - $words['del']['title'] = array(); - } - unset($split_text); - unset($split_title); - - // Get unique words from the above arrays - $unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title'])); - - // We now have unique arrays of all words to be added and removed and - // individual arrays of added and removed words for text and title. What - // we need to do now is add the new words (if they don't already exist) - // and then add (or remove) matches between the words and this post - if (sizeof($unique_add_words)) - { - $sql = 'SELECT word_id, word_text - FROM ' . SEARCH_WORD_TABLE . ' - WHERE word_text IN (' . implode(', ', preg_replace('#^(.*)$#', '\'$1\'', $unique_add_words)) . ")"; - $result = $db->sql_query($sql); - - $word_ids = array(); - while ($row = $db->sql_fetchrow($result)) - { - $word_ids[$row['word_text']] = $row['word_id']; - } - $db->sql_freeresult($result); - - $new_words = array_diff($unique_add_words, array_keys($word_ids)); - unset($unique_add_words); - - if (sizeof($new_words)) - { - switch (SQL_LAYER) - { - case 'mysql': - $sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text) - VALUES ' . implode(', ', preg_replace('#^(.*)$#', '(\'$1\')', $new_words)); - $db->sql_query($sql); - break; - - case 'mysql4': - case 'mysqli': - case 'mssql': - case 'mssql_odbc': - case 'sqlite': - $sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text) ' . implode(' UNION ALL ', preg_replace('#^(.*)$#', "SELECT '\$1'", $new_words)); - $db->sql_query($sql); - break; - - default: - foreach ($new_words as $word) - { - $sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . " (word_text) - VALUES ('$word')"; - $db->sql_query($sql); - } - break; - } - } - unset($new_words); - } - - foreach ($words['del'] as $word_in => $word_ary) - { - $title_match = ($word_in == 'title') ? 1 : 0; - - if (sizeof($word_ary)) - { - $sql_in = array(); - foreach ($word_ary as $word) - { - $sql_in[] = $cur_words[$word_in][$word]; - } - - $sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . ' - WHERE word_id IN (' . implode(', ', $sql_in) . ') - AND post_id = ' . intval($post_id) . " - AND title_match = $title_match"; - $db->sql_query($sql); - unset($sql_in); - } - } - - foreach ($words['add'] as $word_in => $word_ary) - { - $title_match = ($word_in == 'title') ? 1 : 0; - - if (sizeof($word_ary)) - { - $sql = 'INSERT INTO ' . SEARCH_MATCH_TABLE . " (post_id, word_id, title_match) - SELECT $post_id, word_id, $title_match - FROM " . SEARCH_WORD_TABLE . ' - WHERE word_text IN (' . implode(', ', preg_replace('#^(.*)$#', '\'$1\'', $word_ary)) . ')'; - $db->sql_query($sql); - } - } - - unset($words); - - // Run the cleanup infrequently, once per session cleanup - if ($config['search_last_gc'] < time() - $config['search_gc']) - { - $this->search_tidy(); - } - } - - // Tidy up indexes, tag 'common words', remove - // words no longer referenced in the match table, etc. - function search_tidy() - { - global $db, $config; - - // Is the fulltext indexer disabled? If yes then we need not - // carry on ... it's okay ... I know when I'm not wanted boo hoo - if (!$config['load_search_upd']) - { - return; - } - - // Remove common (> 60% of posts ) words - $sql = 'SELECT SUM(forum_posts) AS total_posts - FROM ' . FORUMS_TABLE; - $result = $db->sql_query($sql); - - $row = $db->sql_fetchrow($result); - $db->sql_freeresult($result); - - if ($row['total_posts'] >= 100) - { - $sql = 'SELECT word_id - FROM ' . SEARCH_MATCH_TABLE . ' - GROUP BY word_id - HAVING COUNT(word_id) > ' . floor($row['total_posts'] * 0.6); - $result = $db->sql_query($sql); - - if ($row = $db->sql_fetchrow($result)) - { - $sql_in = array(); - do - { - $sql_in[] = $row['word_id']; - } - while ($row = $db->sql_fetchrow($result)); - - $sql_in = implode(', ', $sql_in); - - $sql = 'UPDATE ' . SEARCH_WORD_TABLE . " - SET word_common = 1 - WHERE word_id IN ($sql_in)"; - $db->sql_query($sql); - - $sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . " - WHERE word_id IN ($sql_in)"; - $db->sql_query($sql); - unset($sql_in); - } - $db->sql_freeresult($result); - } - - // Remove words with no matches ... this is a potentially nasty query - $sql = 'SELECT w.word_id - FROM ' . SEARCH_WORD_TABLE . ' w - LEFT JOIN ' . SEARCH_MATCH_TABLE . ' m ON (w.word_id = m.word_id) - WHERE w.word_common = 0 AND m.word_id IS NULL - GROUP BY m.word_id'; - $result = $db->sql_query($sql); - - if ($row = $db->sql_fetchrow($result)) - { - $sql_in = array(); - do - { - $sql_in[] = $row['word_id']; - } - while ($row = $db->sql_fetchrow($result)); - - $sql = 'DELETE FROM ' . SEARCH_WORD_TABLE . ' - WHERE word_id IN (' . implode(', ', $sql_in) . ')'; - $db->sql_query($sql); - unset($sql_in); - } - $db->sql_freeresult($result); - - set_config('search_last_gc', time()); - } -} - ?>
\ No newline at end of file |