aboutsummaryrefslogtreecommitdiffstats
path: root/phpBB/includes/utf
diff options
context:
space:
mode:
authorNils Adermann <naderman@naderman.de>2010-03-02 01:05:34 +0100
committerNils Adermann <naderman@naderman.de>2010-03-02 01:05:34 +0100
commit89b37954f994a7cd517553d2d16686f91dcaae72 (patch)
treeb20e25768bc55be250454c439ffee08ce2981031 /phpBB/includes/utf
parent07633a66e8c9bbb2b288a286bfbea6f562eeca4d (diff)
parent80d429a02d26da1f00777e62a0268d83f581f598 (diff)
downloadforums-89b37954f994a7cd517553d2d16686f91dcaae72.tar
forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar.gz
forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar.bz2
forums-89b37954f994a7cd517553d2d16686f91dcaae72.tar.xz
forums-89b37954f994a7cd517553d2d16686f91dcaae72.zip
Merge commit 'release-3.0-B4'
Diffstat (limited to 'phpBB/includes/utf')
-rw-r--r--phpBB/includes/utf/utf_normalizer.php104
-rw-r--r--phpBB/includes/utf/utf_tools.php112
2 files changed, 105 insertions, 111 deletions
diff --git a/phpBB/includes/utf/utf_normalizer.php b/phpBB/includes/utf/utf_normalizer.php
index 0d1d74539a..542c1aeeb8 100644
--- a/phpBB/includes/utf/utf_normalizer.php
+++ b/phpBB/includes/utf/utf_normalizer.php
@@ -67,10 +67,10 @@ class utf_normalizer
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
* and convert to Normal Form C, canonical composition.
*
- * @param string $str The dirty string
+ * @param string &$str The dirty string
* @return string The same string, all shiny and cleaned-up
*/
- function cleanup($str)
+ function cleanup(&$str)
{
// The string below is the list of all autorized characters, sorted by frequency in latin text
$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
@@ -79,7 +79,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings with no special chars return immediately
- return $str;
+ return;
}
// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
@@ -91,23 +91,22 @@ class utf_normalizer
// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
- return utf_normalizer::recompose(
- strtr(
- $str,
- "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
- "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
- ),
- $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']
+ $str = strtr(
+ $str,
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
+ "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
);
+
+ $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
/**
* Validate and normalize a UTF string to NFC
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfc($str)
+ function nfc(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -115,7 +114,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_nfc_qc']))
@@ -124,16 +123,16 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
}
- return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
+ $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
/**
* Validate and normalize a UTF string to NFKC
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfkc($str)
+ function nfkc(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -141,7 +140,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_nfkc_qc']))
@@ -156,16 +155,16 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
}
- return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
+ $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
}
/**
* Validate and normalize a UTF string to NFD
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfd($str)
+ function nfd(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -173,7 +172,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_canonical_decomp']))
@@ -182,16 +181,16 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
}
- return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
+ $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
}
/**
* Validate and normalize a UTF string to NFKD
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfkd($str)
+ function nfkd(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -199,7 +198,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_compatibility_decomp']))
@@ -208,19 +207,19 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
}
- return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
+ $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
}
/**
* Recompose a UTF string
*
- * @param string $str Unchecked UTF string
- * @param integer $pos Position of the first UTF char (in bytes)
- * @param integer $len Length of the string (in bytes)
- * @param array $qc Quick-check array, passed by reference but never modified
- * @param array $decomp_map Decomposition mapping, passed by reference but never modified
- * @return string The string, validated and recomposed
+ * @param string $str Unchecked UTF string
+ * @param integer $pos Position of the first UTF char (in bytes)
+ * @param integer $len Length of the string (in bytes)
+ * @param array &$qc Quick-check array, passed by reference but never modified
+ * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
+ * @return string The string, validated and recomposed
*
* @access private
*/
@@ -239,14 +238,7 @@ class utf_normalizer
$tmp = '';
$i = $tmp_pos = $last_cc = 0;
- if ($pos)
- {
- $buffer = array(++$i => $str[$pos - 1]);
- }
- else
- {
- $buffer = array();
- }
+ $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
// UTF char length array
// This array is used to determine the length of a UTF character.
@@ -327,16 +319,11 @@ class utf_normalizer
// has been encoded in a five- or six- byte sequence
if ($utf_char[0] >= "\xF8")
{
- if ($utf_char[0] < "\xF8")
- {
- $trailing_bytes = 3;
- }
- else if ($utf_char[0] < "\xFC")
+ if ($utf_char[0] < "\xFC")
{
$trailing_bytes = 4;
}
-
- if ($utf_char[0] > "\xFD")
+ else if ($utf_char[0] > "\xFD")
{
$trailing_bytes = 0;
}
@@ -923,17 +910,17 @@ class utf_normalizer
/**
* Decompose a UTF string
*
- * @param string $str UTF string
- * @param integer $pos Position of the first UTF char (in bytes)
- * @param integer $len Length of the string (in bytes)
- * @param array $decomp_map Decomposition mapping, passed by reference but never modified
- * @return string The string, decomposed and sorted canonically
+ * @param string $str UTF string
+ * @param integer $pos Position of the first UTF char (in bytes)
+ * @param integer $len Length of the string (in bytes)
+ * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
+ * @return string The string, decomposed and sorted canonically
*
* @access private
*/
function decompose($str, $pos, $len, &$decomp_map)
{
- global $utf_combining_class, $utf_canonical_decomp, $phpbb_root_path;
+ global $utf_combining_class, $phpbb_root_path;
// Load some commonly-used tables
if (!isset($utf_combining_class))
@@ -1011,7 +998,7 @@ class utf_normalizer
ksort($utf_sort);
}
- foreach($utf_sort as $utf_chars)
+ foreach ($utf_sort as $utf_chars)
{
$tmp .= implode('', $utf_chars);
}
@@ -1365,17 +1352,17 @@ class utf_normalizer
// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
//
// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
- if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT)
+ if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
{
- if ($tIndex < 25)
+ if ($t_index < 25)
{
$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
- $utf_char[8] = chr(0xA7 + $tIndex);
+ $utf_char[8] = chr(0xA7 + $t_index);
}
else
{
$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
- $utf_char[8] = chr(0x67 + $tIndex);
+ $utf_char[8] = chr(0x67 + $t_index);
}
}
else
@@ -1478,7 +1465,6 @@ class utf_normalizer
}
return $tmp;
-
}
else if ($tmp_pos)
{
diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php
index b91fd51c20..4c6c26909a 100644
--- a/phpBB/includes/utf/utf_tools.php
+++ b/phpBB/includes/utf/utf_tools.php
@@ -7,9 +7,8 @@
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
*
* @todo make sure the replacements are called correctly
-* already done: strtolower, strtoupper, ucfirst, str_split, strrpos, strlen (hopefully!), strpos, substr
-* remaining: clean_username, htmlentities (no longer needed for internal data?), htmlspecialchars (using charset)
-* strspn, chr, ord
+* already done: strtolower, strtoupper, ucfirst, str_split, strrpos, strlen (hopefully!), strpos, substr, htmlspecialchars
+* remaining: strspn, chr, ord
*/
/**
@@ -63,7 +62,7 @@ if (!extension_loaded('xml'))
/**
* Implementation of PHP's native utf8_decode for people without XML support
*
- * @param string $string UTF-8 encoded data
+ * @param string $str UTF-8 encoded data
* @return string ISO-8859-1 encoded data
*/
function utf8_decode($str)
@@ -126,7 +125,14 @@ if (extension_loaded('mbstring'))
return false;
}
- return mb_strrpos($str, $search);
+ if (is_null($offset))
+ {
+ return mb_strrpos($str, $needle);
+ }
+ else
+ {
+ return mb_strrpos($str, $needle, $offset);
+ }
}
}
else
@@ -138,7 +144,7 @@ if (extension_loaded('mbstring'))
function utf8_strrpos($str, $needle, $offset = null)
{
// offset for mb_strrpos was added in 5.2.0
- if ($offset === false)
+ if (is_null($offset))
{
// Emulate behaviour of strrpos rather than raising warning
if (empty($str))
@@ -146,7 +152,7 @@ if (extension_loaded('mbstring'))
return false;
}
- return mb_strrpos($str, $search);
+ return mb_strrpos($str, $needle);
}
else
{
@@ -158,7 +164,7 @@ if (extension_loaded('mbstring'))
$str = mb_substr($str, $offset);
- if (false !== ($pos = mb_strrpos($str, $search)))
+ if (false !== ($pos = mb_strrpos($str, $needle)))
{
return $pos + $offset;
}
@@ -174,7 +180,7 @@ if (extension_loaded('mbstring'))
*/
function utf8_strpos($str, $needle, $offset = null)
{
- if ($offset === false)
+ if (is_null($offset))
{
return mb_strpos($str, $needle);
}
@@ -206,9 +212,9 @@ if (extension_loaded('mbstring'))
* UTF-8 aware alternative to substr
* @ignore
*/
- function utf8_substr($str, $offset, $length = null)
+ function utf8_substr($str, $offset, $length = null)
{
- if ($length === false)
+ if (is_null($length))
{
return mb_substr($str, $offset);
}
@@ -234,9 +240,9 @@ else
* Find position of last occurrence of a char in a string
*
* @author Harry Fuecks
- * @param string haystack
- * @param string needle
- * @param integer (optional) offset (from left)
+ * @param string $str haystack
+ * @param string $needle needle
+ * @param integer $offset (optional) offset (from left)
* @return mixed integer position or FALSE on failure
*/
function utf8_strrpos($str, $needle, $offset = null)
@@ -279,9 +285,9 @@ else
* Find position of first occurrence of a string
*
* @author Harry Fuecks
- * @param string haystack
- * @param string needle
- * @param integer offset in characters (from left)
+ * @param string $str haystack
+ * @param string $needle needle
+ * @param integer $offset offset in characters (from left)
* @return mixed integer position or FALSE on failure
*/
function utf8_strpos($str, $needle, $offset = null)
@@ -482,9 +488,9 @@ else
* necessary. It isn't necessary for +ve offsets and no specified length
*
* @author Chris Smith<chris@jalakai.co.uk>
- * @param string
- * @param integer number of UTF-8 characters offset (from left)
- * @param integer (optional) length in UTF-8 characters from offset
+ * @param string $str
+ * @param integer $offset number of UTF-8 characters offset (from left)
+ * @param integer $length (optional) length in UTF-8 characters from offset
* @return mixed string or FALSE if failure
*/
function utf8_substr($str, $offset, $length = NULL)
@@ -624,8 +630,8 @@ else
* Convert a string to an array
*
* @author Harry Fuecks
-* @param string UTF-8 encoded
-* @param int number to characters to split string by
+* @param string $str UTF-8 encoded
+* @param int $split_len number of characters to split string by
* @return string characters in string reverses
*/
function utf8_str_split($str, $split_len = 1)
@@ -650,8 +656,6 @@ function utf8_str_split($str, $split_len = 1)
* Find length of initial segment not matching mask
*
* @author Harry Fuecks
-* @param string
-* @return int
*/
function utf8_strspn($str, $mask, $start = null, $length = null)
{
@@ -831,8 +835,8 @@ function utf8_ord($chr)
/**
* Converts an NCR to a UTF-8 char
*
-* @param integer $cp UNICODE code point
-* @return string UTF-8 char
+* @param int $cp UNICODE code point
+* @return string UTF-8 char
*/
function utf8_chr($cp)
{
@@ -858,9 +862,8 @@ function utf8_chr($cp)
* Convert Numeric Character References to UTF-8 chars
*
* Notes:
-* - we do not convert NCRs recursively, if you pass &#38;#38; it will return &#38;
-* - we DO NOT check for the existence of the Unicode characters, therefore an entity
-* may be converted to an inexistent codepoint
+* - we do not convert NCRs recursively, if you pass &#38;#38; it will return &#38;
+* - we DO NOT check for the existence of the Unicode characters, therefore an entity may be converted to an inexistent codepoint
*
* @param string $text String to convert, encoded in UTF-8 (no normal form required)
* @return string UTF-8 string where NCRs have been replaced with the actual chars
@@ -890,9 +893,9 @@ function utf8_decode_ncr_callback($m)
* Takes an array of ints representing the Unicode characters and returns
* a UTF-8 string.
*
-* @param string $text text to be case folded
-* @param string $option determines how we will fold the cases
-* @return string case folded text
+* @param string $text text to be case folded
+* @param string $option determines how we will fold the cases
+* @return string case folded text
*/
function utf8_case_fold($text, $option = 'full')
{
@@ -933,30 +936,35 @@ function utf8_case_fold($text, $option = 'full')
* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
* to be in NFC (Normalization Form Composition).
*
-* @param mixed $strings Either an array of references to strings, a reference to an array of strings or a reference to a single string
+* @param mixed $strings a string or an array of strings to normalize
+* @return mixed the normalized content, preserving array keys if array given.
*/
function utf8_normalize_nfc($strings)
{
- if (!is_array($strings) || (sizeof($strings) > 0))
- {
- if (!class_exists('utf_normalizer'))
- {
- global $phpbb_root_path, $phpEx;
- include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
- }
+ if (empty($strings))
+ {
+ return $strings;
+ }
- if (is_array($strings))
- {
- foreach ($strings as $key => $string)
- {
- $strings[$key] = utf_normalizer::nfc($strings[$key]);
- }
- }
- else
+ if (!class_exists('utf_normalizer'))
+ {
+ global $phpbb_root_path, $phpEx;
+ include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
+ }
+
+ if (!is_array($strings))
+ {
+ utf_normalizer::nfc($strings);
+ }
+ else if (is_array($strings))
+ {
+ foreach ($strings as $key => $string)
{
- $strings = utf_normalizer::nfc($strings);
+ utf_normalizer::nfc($strings[$key]);
}
}
+
+ return $strings;
}
/**
@@ -969,8 +977,8 @@ function utf8_normalize_nfc($strings)
* functions used here you need to rebuild/update the username_clean column in the users table. And all other
* columns that store a clean string otherwise you will break this functionality.
*
-* @param $text An unclean string, mabye user input (has to be valid UTF-8!)
-* @return Cleaned up version of the input string
+* @param string $text An unclean string, maybe user input (has to be valid UTF-8!)
+* @return string Cleaned up version of the input string
*/
function utf8_clean_string($text)
{
@@ -982,7 +990,7 @@ function utf8_clean_string($text)
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
}
- $text = utf_normalizer::nfc($text);
+ utf_normalizer::nfc($text);
static $homographs = array(
// cyrllic