diff options
-rw-r--r-- | phpBB/includes/utf/utf_tools.php | 216 |
1 files changed, 187 insertions, 29 deletions
diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php index 1e7e25c43f..2f7c8de69a 100644 --- a/phpBB/includes/utf/utf_tools.php +++ b/phpBB/includes/utf/utf_tools.php @@ -17,13 +17,15 @@ * @package phpBB3 */ -// huge chunks of this code belong to the PHP UTF-8 project -// TODO: document the functions! - -// utf8_encode and utf8_decode are both XML functions if (!extension_loaded('xml')) { - // This function exploits some nice things that ISO-8859-1 and UTF-8 have in common + /** + * Implementation of PHP's native utf8_encode for people without XML support + * This function exploits some nice things that ISO-8859-1 and UTF-8 have in common + * + * @param string $str ISO-8859-1 encoded data + * @return string UTF-8 encoded data + */ function utf8_encode($str) { $out = ''; @@ -48,7 +50,13 @@ if (!extension_loaded('xml')) return $out; } - // "borrowed" from getID3 + /** + * Implementation of PHP's native utf8_decode for people without XML support + * + * @author GetID3() + * @param string $string UTF-8 encoded data + * @return string ISO-8859-1 encoded data + */ function utf8_decode($string) { $newcharstring = ''; @@ -106,6 +114,16 @@ if (!extension_loaded('xml')) // if mbstring is not loaded, we go into native mode. 
if (extension_loaded('mbstring')) { + /** + * UTF-8 aware alternative to strrpos + * Find position of last occurrence of a char in a string + * + * @author Harry Fuecks + * @param string haystack + * @param string needle + * @param integer (optional) offset (from left) + * @return mixed integer position or FALSE on failure + */ function utf8_strrpos($str, $needle, $offset = null) { // offset for mb_strrpos was added in 5.2.0 @@ -137,6 +155,16 @@ if (extension_loaded('mbstring')) } } + /** + * UTF-8 aware alternative to strpos + * Find position of first occurrence of a string + * + * @author Harry Fuecks + * @param string haystack + * @param string needle + * @param integer offset in characters (from left) + * @return mixed integer position or FALSE on failure + */ function utf8_strpos($str, $needle, $offset = null) { if ($offset === false) @@ -149,16 +177,50 @@ if (extension_loaded('mbstring')) } } + /** + * UTF-8 aware alternative to strtolower + * Make a string lowercase + * Note: The concept of a character's "case" only exists in some alphabets + * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does + * not exist in the Chinese alphabet, for example. See Unicode Standard + * Annex #21: Case Mappings + * + * @author Andreas Gohr <andi@splitbrain.org> + * @param string + * @return mixed either string in lowercase or FALSE if UTF-8 invalid + */ function utf8_strtolower($str) { return mb_strtolower($str); } + /** + * UTF-8 aware alternative to strtoupper + * Make a string uppercase + * Note: The concept of a character's "case" only exists in some alphabets + * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does + * not exist in the Chinese alphabet, for example. 
See Unicode Standard + * Annex #21: Case Mappings + * + * @author Andreas Gohr <andi@splitbrain.org> + * @param string + * @return mixed either string in uppercase or FALSE if UTF-8 invalid + */ function utf8_strtoupper($str) { return mb_strtoupper($str); } + /** + * UTF-8 aware alternative to substr + * Return part of a string given character offset (and optionally length) + * + * @author Harry Fuecks + * @param string + * @param integer number of UTF-8 characters offset (from left) + * @param integer (optional) length in UTF-8 characters from offset + * @return mixed string or FALSE if failure + */ function utf8_substr($str, $offset, $length = null) { if ($length === false) @@ -170,9 +232,30 @@ if (extension_loaded('mbstring')) return mb_substr($str, $offset, $length); } } + + /** + * Return the length (in characters) of a UTF-8 string + * + * @param string $text UTF-8 string + * @return integer Length (in chars) of given string + */ + function utf8_strlen($text) + { + return mb_strlen($text, 'utf-8'); + } } else { + /** + * UTF-8 aware alternative to strrpos + * Find position of last occurrence of a char in a string + * + * @author Harry Fuecks + * @param string haystack + * @param string needle + * @param integer (optional) offset (from left) + * @return mixed integer position or FALSE on failure + */ function utf8_strrpos($str, $needle, $offset = null) { if (is_null($offset)) @@ -207,6 +290,16 @@ else } } + /** + * UTF-8 aware alternative to strpos + * Find position of first occurrence of a string + * + * @author Harry Fuecks + * @param string haystack + * @param string needle + * @param integer offset in characters (from left) + * @return mixed integer position or FALSE on failure + */ function utf8_strpos($str, $needle, $offset = null) { // native @@ -330,6 +423,18 @@ $UTF8_LOWER_TO_UPPER = array( 0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122, ); + /** + * UTF-8 aware alternative to strtolower + * Make a string lowercase + * Note: The concept of a 
character's "case" only exists in some alphabets + * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does + * not exist in the Chinese alphabet, for example. See Unicode Standard + * Annex #21: Case Mappings + * + * @author Andreas Gohr <andi@splitbrain.org> + * @param string + * @return mixed either string in lowercase or FALSE if UTF-8 invalid + */ function utf8_strtolower($string) { global $UTF8_UPPER_TO_LOWER; @@ -351,6 +456,18 @@ $UTF8_LOWER_TO_UPPER = array( return utf8_from_unicode($uni); } + /** + * UTF-8 aware alternative to strtoupper + * Make a string uppercase + * Note: The concept of a character's "case" only exists in some alphabets + * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does + * not exist in the Chinese alphabet, for example. See Unicode Standard + * Annex #21: Case Mappings + * + * @author Andreas Gohr <andi@splitbrain.org> + * @param string + * @return mixed either string in uppercase or FALSE if UTF-8 invalid + */ function utf8_strtoupper($str) { global $UTF8_LOWER_TO_UPPER; @@ -372,6 +489,16 @@ $UTF8_LOWER_TO_UPPER = array( return utf8_from_unicode($uni); } + /** + * UTF-8 aware alternative to substr + * Return part of a string given character offset (and optionally length) + * + * @author Harry Fuecks + * @param string + * @param integer number of UTF-8 characters offset (from left) + * @param integer (optional) length in UTF-8 characters from offset + * @return mixed string or FALSE if failure + */ function utf8_substr($str, $offset, $length = null) { if ($offset >= 0 && $length >= 0) @@ -436,8 +563,30 @@ $UTF8_LOWER_TO_UPPER = array( } } } + + /** + * Return the length (in characters) of a UTF-8 string + * + * @param string $text UTF-8 string + * @return integer Length (in chars) of given string + */ + function utf8_strlen($text) + { + // Since utf8_decode is replacing multibyte characters to ? 
strlen works fine + return strlen(utf8_decode($text)); + } + } +/** +* UTF-8 aware alternative to str_split +* Convert a string to an array +* +* @author Harry Fuecks +* @param string UTF-8 encoded +* @param int number of characters to split string by +* @return array the string split into chunks of the given number of characters +*/ function utf8_str_split($str, $split_len = 1) { if (!preg_match('/^[0-9]+$/', $split_len) || $split_len < 1) @@ -455,6 +604,14 @@ function utf8_str_split($str, $split_len = 1) return $ar[0]; } +/** +* UTF-8 aware alternative to strspn +* Find length of initial segment matching mask +* +* @author Harry Fuecks +* @param string +* @return int +*/ function utf8_strspn($str, $mask, $start = null, $length = null) { $mask = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $mask); @@ -474,6 +631,14 @@ function utf8_strspn($str, $mask, $start = null, $length = null) return 0; } +/** +* UTF-8 aware alternative to ucfirst +* Make a string's first character uppercase +* +* @author Harry Fuecks +* @param string +* @return string with first character as upper case (if applicable) +*/ function utf8_ucfirst($str) { switch (utf8_strlen($str)) @@ -494,28 +659,6 @@ function utf8_ucfirst($str) } /** -* Return the length (in characters) of a UTF-8 string -* -* @param string $text UTF-8 string -* @return integer Length (in chars) of given string -*/ -function utf8_strlen($text) -{ - if (function_exists('iconv_strlen')) - { - return iconv_strlen($text, 'utf-8'); - } - - if (function_exists('mb_strlen')) - { - return mb_strlen($text, 'utf-8'); - } - - // Since utf8_decode is replacing multibyte characters to ? strlen works fine - return strlen(utf8_decode($text)); -} - -/** * Recode a string to UTF-8 * * If the encoding is not supported, the string is returned as-is @@ -614,6 +757,12 @@ function utf8_encode_ncr_callback($m) return '&#' . utf8_ord($m[0]) . ';'; } +/** + * Returns the UNICODE code point of a UTF-8 char 
+ * + * @param string $chr UTF-8 char + * @return integer UNICODE code point + */ function utf8_ord($chr) { switch (strlen($chr)) @@ -639,6 +788,12 @@ function utf8_ord($chr) } } +/** + * Converts an NCR to a UTF-8 char + * + * @param integer $cp UNICODE code point + * @return string UTF-8 char + */ function utf8_chr($cp) { if ($cp > 0xFFFF) @@ -694,7 +849,9 @@ function utf8_decode_ncr_callback($m) /** * Takes an UTF-8 string and returns an array of ints representing the * Unicode characters. + * * @param string UTF-8 encoded string + * @return array array of UNICODE code points */ function utf8_to_unicode($string) { @@ -752,7 +909,8 @@ function utf8_to_unicode($string) * Takes an array of ints representing the Unicode characters and returns * a UTF-8 string. * - * @param array of unicode code points representing a string + * @param array $array array of unicode code points representing a string + * @return string UTF-8 character string */ function utf8_from_unicode($array) { |