diff options
| author | Nils Adermann <naderman@naderman.de> | 2010-03-02 01:05:34 +0100 |
|---|---|---|
| committer | Nils Adermann <naderman@naderman.de> | 2010-03-02 01:05:34 +0100 |
| commit | 89b37954f994a7cd517553d2d16686f91dcaae72 (patch) | |
| tree | b20e25768bc55be250454c439ffee08ce2981031 /phpBB/includes/utf | |
| parent | 07633a66e8c9bbb2b288a286bfbea6f562eeca4d (diff) | |
| parent | 80d429a02d26da1f00777e62a0268d83f581f598 (diff) | |
| download | forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar.gz forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar.bz2 forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar.xz forums-89b37954f994a7cd517553d2d16686f91dcaae72.zip | |
Merge commit 'release-3.0-B4'
Diffstat (limited to 'phpBB/includes/utf')
| -rw-r--r-- | phpBB/includes/utf/utf_normalizer.php | 104 | ||||
| -rw-r--r-- | phpBB/includes/utf/utf_tools.php | 112 |
2 files changed, 105 insertions, 111 deletions
diff --git a/phpBB/includes/utf/utf_normalizer.php b/phpBB/includes/utf/utf_normalizer.php index 0d1d74539a..542c1aeeb8 100644 --- a/phpBB/includes/utf/utf_normalizer.php +++ b/phpBB/includes/utf/utf_normalizer.php @@ -67,10 +67,10 @@ class utf_normalizer * The ultimate convenience function! Clean up invalid UTF-8 sequences, * and convert to Normal Form C, canonical composition. * - * @param string $str The dirty string + * @param string &$str The dirty string * @return string The same string, all shiny and cleaned-up */ - function cleanup($str) + function cleanup(&$str) { // The string below is the list of all autorized characters, sorted by frequency in latin text $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D"); @@ -79,7 +79,7 @@ class utf_normalizer if ($pos == $len) { // ASCII strings with no special chars return immediately - return $str; + return; } // Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together @@ -91,23 +91,22 @@ class utf_normalizer // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char - return utf_normalizer::recompose( - strtr( - $str, - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F", - "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" - ), - $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp'] + $str = strtr( + $str, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F", + "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" ); + + $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']); } /** * Validate and normalize a UTF string to NFC * - * @param string $str Unchecked UTF string + * @param string &$str Unchecked UTF string * @return string The string, validated and in normal form */ - function nfc($str) + function nfc(&$str) { $pos = strspn($str, UTF8_ASCII_RANGE); $len = strlen($str); @@ -115,7 +114,7 @@ class utf_normalizer if ($pos == $len) { // ASCII strings return immediately - return $str; + return; } if (!isset($GLOBALS['utf_nfc_qc'])) @@ -124,16 +123,16 @@ class utf_normalizer include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx); } - return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']); + $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']); } /** * Validate and normalize a UTF string to NFKC * - * @param string $str Unchecked UTF string + * @param string &$str Unchecked UTF string * @return string The string, validated and in normal form */ - function nfkc($str) + function nfkc(&$str) { $pos = strspn($str, UTF8_ASCII_RANGE); $len = strlen($str); @@ -141,7 +140,7 @@ class utf_normalizer if ($pos == $len) { // ASCII strings return immediately - return $str; + return; } if (!isset($GLOBALS['utf_nfkc_qc'])) @@ -156,16 +155,16 @@ class utf_normalizer include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx); } - return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']); + $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']); } /** * Validate and normalize a UTF string to NFD * - * @param string $str Unchecked UTF string + * @param string &$str Unchecked UTF string * @return string The string, validated and in normal form */ - function nfd($str) + function nfd(&$str) { $pos = strspn($str, UTF8_ASCII_RANGE); $len = strlen($str); @@ -173,7 +172,7 @@ class utf_normalizer if ($pos == $len) { // ASCII strings return immediately - return $str; + return; } if (!isset($GLOBALS['utf_canonical_decomp'])) @@ -182,16 +181,16 @@ class utf_normalizer include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx); } - return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']); + $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']); } /** * Validate and normalize a UTF string to NFKD * - * @param string $str Unchecked UTF string + * @param string &$str Unchecked UTF string * @return string The string, validated and in normal form */ - function nfkd($str) + function nfkd(&$str) { $pos = strspn($str, UTF8_ASCII_RANGE); $len = strlen($str); @@ -199,7 +198,7 @@ class utf_normalizer if ($pos == $len) { // ASCII strings return immediately - return $str; + return; } if (!isset($GLOBALS['utf_compatibility_decomp'])) @@ -208,19 +207,19 @@ class utf_normalizer include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx); } - return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']); + $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']); } /** * Recompose a UTF string * - * @param string $str Unchecked UTF string - * @param integer $pos Position of the first UTF char (in bytes) - * @param integer $len Length of the string (in bytes) - * @param array $qc Quick-check array, passed by reference but never modified - * @param array $decomp_map Decomposition mapping, passed by reference but never modified - * @return string The string, validated and recomposed + * @param string $str Unchecked UTF string + * @param integer $pos Position of the first UTF char (in bytes) + * @param integer $len Length of the string (in bytes) + * @param array &$qc Quick-check array, passed by reference but never modified + * @param array &$decomp_map Decomposition mapping, passed by reference but never modified + * @return string The string, validated and recomposed * * @access private */ @@ -239,14 +238,7 @@ class utf_normalizer $tmp = ''; $i = $tmp_pos = $last_cc = 0; - if ($pos) - { - $buffer = array(++$i => $str[$pos - 1]); - } - else - { - $buffer = array(); - } + $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array(); // UTF char length array // This array is used to determine the length of a UTF character. @@ -327,16 +319,11 @@ class utf_normalizer // has been encoded in a five- or six- byte sequence if ($utf_char[0] >= "\xF8") { - if ($utf_char[0] < "\xF8") - { - $trailing_bytes = 3; - } - else if ($utf_char[0] < "\xFC") + if ($utf_char[0] < "\xFC") { $trailing_bytes = 4; } - - if ($utf_char[0] > "\xFD") + else if ($utf_char[0] > "\xFD") { $trailing_bytes = 0; } @@ -923,17 +910,17 @@ class utf_normalizer /** * Decompose a UTF string * - * @param string $str UTF string - * @param integer $pos Position of the first UTF char (in bytes) - * @param integer $len Length of the string (in bytes) - * @param array $decomp_map Decomposition mapping, passed by reference but never modified - * @return string The string, decomposed and sorted canonically + * @param string $str UTF string + * @param integer $pos Position of the first UTF char (in bytes) + * @param integer $len Length of the string (in bytes) + * @param array &$decomp_map Decomposition mapping, passed by reference but never modified + * @return string The string, decomposed and sorted canonically * * @access private */ function decompose($str, $pos, $len, &$decomp_map) { - global $utf_combining_class, $utf_canonical_decomp, $phpbb_root_path; + global $utf_combining_class, $phpbb_root_path; // Load some commonly-used tables if (!isset($utf_combining_class)) @@ -1011,7 +998,7 @@ class utf_normalizer ksort($utf_sort); } - foreach($utf_sort as $utf_chars) + foreach ($utf_sort as $utf_chars) { $tmp .= implode('', $utf_chars); } @@ -1365,17 +1352,17 @@ class utf_normalizer // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase). // // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte - if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT) + if ($t_index = $idx % UNICODE_HANGUL_TCOUNT) { - if ($tIndex < 25) + if ($t_index < 25) { $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00"; - $utf_char[8] = chr(0xA7 + $tIndex); + $utf_char[8] = chr(0xA7 + $t_index); } else { $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00"; - $utf_char[8] = chr(0x67 + $tIndex); + $utf_char[8] = chr(0x67 + $t_index); } } else @@ -1478,7 +1465,6 @@ class utf_normalizer } return $tmp; - } else if ($tmp_pos) { diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php index b91fd51c20..4c6c26909a 100644 --- a/phpBB/includes/utf/utf_tools.php +++ b/phpBB/includes/utf/utf_tools.php @@ -7,9 +7,8 @@ * @license http://opensource.org/licenses/gpl-license.php GNU Public License * * @todo make sure the replacements are called correctly -* already done: strtolower, strtoupper, ucfirst, str_split, strrpos, strlen (hopefully!), strpos, substr -* remaining: clean_username, htmlentities (no longer needed for internal data?), htmlspecialchars (using charset) -* strspn, chr, ord +* already done: strtolower, strtoupper, ucfirst, str_split, strrpos, strlen (hopefully!), strpos, substr, htmlspecialchars +* remaining: strspn, chr, ord */ /** @@ -63,7 +62,7 @@ if (!extension_loaded('xml')) /** * Implementation of PHP's native utf8_decode for people without XML support * - * @param string $string UTF-8 encoded data + * @param string $str UTF-8 encoded data * @return string ISO-8859-1 encoded data */ function utf8_decode($str) @@ -126,7 +125,14 @@ if (extension_loaded('mbstring')) return false; } - return mb_strrpos($str, $search); + if (is_null($offset)) + { + return mb_strrpos($str, $needle); + } + else + { + return mb_strrpos($str, $needle, $offset); + } } } else @@ -138,7 +144,7 @@ if (extension_loaded('mbstring')) function utf8_strrpos($str, $needle, $offset = null) { // offset for mb_strrpos was added in 5.2.0 - if ($offset === false) + if (is_null($offset)) { // Emulate behaviour of strrpos rather than raising warning if (empty($str)) @@ -146,7 +152,7 @@ if (extension_loaded('mbstring')) return false; } - return mb_strrpos($str, $search); + return mb_strrpos($str, $needle); } else { @@ -158,7 +164,7 @@ if (extension_loaded('mbstring')) $str = mb_substr($str, $offset); - if (false !== ($pos = mb_strrpos($str, $search))) + if (false !== ($pos = mb_strrpos($str, $needle))) { return $pos + $offset; } @@ -174,7 +180,7 @@ if (extension_loaded('mbstring')) */ function utf8_strpos($str, $needle, $offset = null) { - if ($offset === false) + if (is_null($offset)) { return mb_strpos($str, $needle); } @@ -206,9 +212,9 @@ if (extension_loaded('mbstring')) * UTF-8 aware alternative to substr * @ignore */ - function utf8_substr($str, $offset, $length = null) + function utf8_substr($str, $offset, $length = null) { - if ($length === false) + if (is_null($length)) { return mb_substr($str, $offset); } @@ -234,9 +240,9 @@ else * Find position of last occurrence of a char in a string * * @author Harry Fuecks - * @param string haystack - * @param string needle - * @param integer (optional) offset (from left) + * @param string $str haystack + * @param string $needle needle + * @param integer $offset (optional) offset (from left) * @return mixed integer position or FALSE on failure */ function utf8_strrpos($str, $needle, $offset = null) @@ -279,9 +285,9 @@ else * Find position of first occurrence of a string * * @author Harry Fuecks - * @param string haystack - * @param string needle - * @param integer offset in characters (from left) + * @param string $str haystack + * @param string $needle needle + * @param integer $offset offset in characters (from left) * @return mixed integer position or FALSE on failure */ function utf8_strpos($str, $needle, $offset = null) @@ -482,9 +488,9 @@ else * necessary. It isn't necessary for +ve offsets and no specified length * * @author Chris Smith<chris@jalakai.co.uk> - * @param string - * @param integer number of UTF-8 characters offset (from left) - * @param integer (optional) length in UTF-8 characters from offset + * @param string $str + * @param integer $offset number of UTF-8 characters offset (from left) + * @param integer $length (optional) length in UTF-8 characters from offset * @return mixed string or FALSE if failure */ function utf8_substr($str, $offset, $length = NULL) @@ -624,8 +630,8 @@ else * Convert a string to an array * * @author Harry Fuecks -* @param string UTF-8 encoded -* @param int number to characters to split string by +* @param string $str UTF-8 encoded +* @param int $split_len number to characters to split string by * @return string characters in string reverses */ function utf8_str_split($str, $split_len = 1) @@ -650,8 +656,6 @@ function utf8_str_split($str, $split_len = 1) * Find length of initial segment not matching mask * * @author Harry Fuecks -* @param string -* @return int */ function utf8_strspn($str, $mask, $start = null, $length = null) { @@ -831,8 +835,8 @@ function utf8_ord($chr) /** * Converts an NCR to a UTF-8 char * -* @param integer $cp UNICODE code point -* @return string UTF-8 char +* @param int $cp UNICODE code point +* @return string UTF-8 char */ function utf8_chr($cp) { @@ -858,9 +862,8 @@ function utf8_chr($cp) * Convert Numeric Character References to UTF-8 chars * * Notes: -* - we do not convert NCRs recursively, if you pass &#38; it will return & -* - we DO NOT check for the existence of the Unicode characters, therefore an entity -* may be converted to an inexistent codepoint +* - we do not convert NCRs recursively, if you pass &#38; it will return & +* - we DO NOT check for the existence of the Unicode characters, therefore an entity may be converted to an inexistent codepoint * * @param string $text String to convert, encoded in UTF-8 (no normal form required) * @return string UTF-8 string where NCRs have been replaced with the actual chars @@ -890,9 +893,9 @@ function utf8_decode_ncr_callback($m) * Takes an array of ints representing the Unicode characters and returns * a UTF-8 string. * -* @param string $text text to be case folded -* @param string $option determines how we will fold the cases -* @return string case folded text +* @param string $text text to be case folded +* @param string $option determines how we will fold the cases +* @return string case folded text */ function utf8_case_fold($text, $option = 'full') { @@ -933,30 +936,35 @@ function utf8_case_fold($text, $option = 'full') * A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings * to be in NFC (Normalization Form Composition). * -* @param mixed $strings Either an array of references to strings, a reference to an array of strings or a reference to a single string +* @param mixed $strings a string or an array of strings to normalize +* @return mixed the normalized content, preserving array keys if array given. */ function utf8_normalize_nfc($strings) { - if (!is_array($strings) || (sizeof($strings) > 0)) - { - if (!class_exists('utf_normalizer')) - { - global $phpbb_root_path, $phpEx; - include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx); - } + if (empty($strings)) + { + return $strings; + } - if (is_array($strings)) - { - foreach ($strings as $key => $string) - { - $strings[$key] = utf_normalizer::nfc($strings[$key]); - } - } - else + if (!class_exists('utf_normalizer')) + { + global $phpbb_root_path, $phpEx; + include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx); + } + + if (!is_array($strings)) + { + utf_normalizer::nfc($strings); + } + else if (is_array($strings)) + { + foreach ($strings as $key => $string) { - $strings = utf_normalizer::nfc($strings); + utf_normalizer::nfc($strings[$key]); } } + + return $strings; } /** @@ -969,8 +977,8 @@ function utf8_normalize_nfc($strings) * functions used here you need to rebuild/update the username_clean column in the users table. And all other * columns that store a clean string otherwise you will break this functionality. * -* @param $text An unclean string, mabye user input (has to be valid UTF-8!) -* @return Cleaned up version of the input string +* @param string $text An unclean string, mabye user input (has to be valid UTF-8!) +* @return string Cleaned up version of the input string */ function utf8_clean_string($text) { @@ -982,7 +990,7 @@ function utf8_clean_string($text) include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx); } - $text = utf_normalizer::nfc($text); + utf_normalizer::nfc($text); static $homographs = array( // cyrllic |
