aboutsummaryrefslogtreecommitdiffstats
path: root/phpBB/includes/utf/utf_normalizer.php
diff options
context:
space:
mode:
author Meik Sievertsen <acydburn@phpbb.com> 2006-11-15 15:35:50 +0000
committer Meik Sievertsen <acydburn@phpbb.com> 2006-11-15 15:35:50 +0000
commit 548cc2c10b56cc9e5c71c2f87356947939abe888 (patch)
tree 82a2ceac1eb474aad83281f5d5b4fe94b0ad4d92 /phpBB/includes/utf/utf_normalizer.php
parent 979e36077fa6ae9bbee81bacaaef029aa13c6df0 (diff)
download forums-548cc2c10b56cc9e5c71c2f87356947939abe888.tar
forums-548cc2c10b56cc9e5c71c2f87356947939abe888.tar.gz
forums-548cc2c10b56cc9e5c71c2f87356947939abe888.tar.bz2
forums-548cc2c10b56cc9e5c71c2f87356947939abe888.tar.xz
forums-548cc2c10b56cc9e5c71c2f87356947939abe888.zip
- fixes for the following bugs:
  #5326 #5318 #5304 #5290 #5288 #5278 #5276 #5272 #5266
- also fixed the "Call-time pass-by-reference" bug #5252
- within this step changed the normalize calls to require references.
- added captcha size variables to the class scope (suggestion was posted at area51)

git-svn-id: file:///svn/phpbb/trunk@6584 89ea8834-ac86-4346-8a33-228a782c2dd0
Diffstat (limited to 'phpBB/includes/utf/utf_normalizer.php')
-rw-r--r-- phpBB/includes/utf/utf_normalizer.php 77
1 files changed, 37 insertions, 40 deletions
diff --git a/phpBB/includes/utf/utf_normalizer.php b/phpBB/includes/utf/utf_normalizer.php
index 0d1d74539a..62923ccbe4 100644
--- a/phpBB/includes/utf/utf_normalizer.php
+++ b/phpBB/includes/utf/utf_normalizer.php
@@ -67,10 +67,10 @@ class utf_normalizer
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
* and convert to Normal Form C, canonical composition.
*
- * @param string $str The dirty string
+ * @param string &$str The dirty string
* @return string The same string, all shiny and cleaned-up
*/
- function cleanup($str)
+ function cleanup(&$str)
{
// The string below is the list of all autorized characters, sorted by frequency in latin text
$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
@@ -79,7 +79,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings with no special chars return immediately
- return $str;
+ return;
}
// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
@@ -91,23 +91,22 @@ class utf_normalizer
// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
- return utf_normalizer::recompose(
- strtr(
- $str,
- "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
- "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
- ),
- $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']
+ $str = strtr(
+ $str,
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
+ "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
);
+
+ $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
/**
* Validate and normalize a UTF string to NFC
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfc($str)
+ function nfc(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -115,7 +114,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_nfc_qc']))
@@ -124,16 +123,16 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
}
- return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
+ $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
}
/**
* Validate and normalize a UTF string to NFKC
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfkc($str)
+ function nfkc(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -141,7 +140,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_nfkc_qc']))
@@ -156,16 +155,16 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
}
- return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
+ $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
}
/**
* Validate and normalize a UTF string to NFD
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfd($str)
+ function nfd(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -173,7 +172,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_canonical_decomp']))
@@ -182,16 +181,16 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
}
- return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
+ $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
}
/**
* Validate and normalize a UTF string to NFKD
*
- * @param string $str Unchecked UTF string
+ * @param string &$str Unchecked UTF string
* @return string The string, validated and in normal form
*/
- function nfkd($str)
+ function nfkd(&$str)
{
$pos = strspn($str, UTF8_ASCII_RANGE);
$len = strlen($str);
@@ -199,7 +198,7 @@ class utf_normalizer
if ($pos == $len)
{
// ASCII strings return immediately
- return $str;
+ return;
}
if (!isset($GLOBALS['utf_compatibility_decomp']))
@@ -208,7 +207,7 @@ class utf_normalizer
include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
}
- return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
+ $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
}
@@ -239,14 +238,7 @@ class utf_normalizer
$tmp = '';
$i = $tmp_pos = $last_cc = 0;
- if ($pos)
- {
- $buffer = array(++$i => $str[$pos - 1]);
- }
- else
- {
- $buffer = array();
- }
+ $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
// UTF char length array
// This array is used to determine the length of a UTF character.
@@ -325,6 +317,9 @@ class utf_normalizer
{
// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
// has been encoded in a five- or six- byte sequence
+ /**
+ * @todo $trailing_bytes always == 5?
+ */
if ($utf_char[0] >= "\xF8")
{
if ($utf_char[0] < "\xF8")
@@ -421,6 +416,9 @@ class utf_normalizer
default:
// Five- and six- byte sequences do not need being checked for here anymore
+ /**
+ * @todo $trailing_bytes always == 5?
+ */
if ($utf_char > UTF8_MAX)
{
// Out of the Unicode range
@@ -1011,7 +1009,7 @@ class utf_normalizer
ksort($utf_sort);
}
- foreach($utf_sort as $utf_chars)
+ foreach ($utf_sort as $utf_chars)
{
$tmp .= implode('', $utf_chars);
}
@@ -1365,17 +1363,17 @@ class utf_normalizer
// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
//
// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
- if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT)
+ if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
{
- if ($tIndex < 25)
+ if ($t_index < 25)
{
$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
- $utf_char[8] = chr(0xA7 + $tIndex);
+ $utf_char[8] = chr(0xA7 + $t_index);
}
else
{
$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
- $utf_char[8] = chr(0x67 + $tIndex);
+ $utf_char[8] = chr(0x67 + $t_index);
}
}
else
@@ -1478,7 +1476,6 @@ class utf_normalizer
}
return $tmp;
-
}
else if ($tmp_pos)
{