1 files changed, 45 insertions, 59 deletions
diff --git a/phpBB/includes/utf/utf_normalizer.php b/phpBB/includes/utf/utf_normalizer.php
index 0d1d74539a..542c1aeeb8 100644
--- a/phpBB/includes/utf/utf_normalizer.php
+++ b/phpBB/includes/utf/utf_normalizer.php
@@ -67,10 +67,10 @@ class utf_normalizer
 	* The ultimate convenience function! Clean up invalid UTF-8 sequences,
 	* and convert to Normal Form C, canonical composition.
 	*
-	* @param	string	$str	The dirty string
+	* @param	string	&$str	The dirty string
-	* @return	string			The same string, all shiny and cleaned-up
+	* @return	void			Nothing is returned; $str is cleaned up in place
 	*/
-	function cleanup($str)
+	function cleanup(&$str)
 	{
 		// The string below is the list of all autorized characters, sorted by frequency in latin text
 		$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
@@ -79,7 +79,7 @@ class utf_normalizer
 		if ($pos == $len)
 		{
 			// ASCII strings with no special chars return immediately
-			return $str;
+			return;
 		}
 
 		// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
@@ -91,23 +91,22 @@ class utf_normalizer
 
 		// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
 		// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
-		return utf_normalizer::recompose(
-			strtr(
-				$str,
-				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
-				"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
-			),
-			$pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']
+		$str = strtr(
+			$str,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
+			"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
 		);
+
+		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
 	}
 
 	/**
 	* Validate and normalize a UTF string to NFC
 	*
-	* @param	string	$str	Unchecked UTF string
+	* @param	string	&$str	Unchecked UTF string
-	* @return	string			The string, validated and in normal form
+	* @return	void			Nothing is returned; $str is validated and normalized in place
 	*/
-	function nfc($str)
+	function nfc(&$str)
 	{
 		$pos = strspn($str, UTF8_ASCII_RANGE);
 		$len = strlen($str);
@@ -115,7 +114,7 @@ class utf_normalizer
 		if ($pos == $len)
 		{
 			// ASCII strings return immediately
-			return $str;
+			return;
 		}
 
 		if (!isset($GLOBALS['utf_nfc_qc']))
@@ -124,16 +123,16 @@ class utf_normalizer
 			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
 		}
 
-		return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
+		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
 	}
 
 	/**
 	* Validate and normalize a UTF string to NFKC
 	*
-	* @param	string	$str	Unchecked UTF string
+	* @param	string	&$str	Unchecked UTF string
-	* @return	string			The string, validated and in normal form
+	* @return	void			Nothing is returned; $str is validated and normalized in place
 	*/
-	function nfkc($str)
+	function nfkc(&$str)
 	{
 		$pos = strspn($str, UTF8_ASCII_RANGE);
 		$len = strlen($str);
@@ -141,7 +140,7 @@ class utf_normalizer
 		if ($pos == $len)
 		{
 			// ASCII strings return immediately
-			return $str;
+			return;
 		}
 
 		if (!isset($GLOBALS['utf_nfkc_qc']))
@@ -156,16 +155,16 @@ class utf_normalizer
 			include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
 		}
 
-		return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
+		$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
 	}
 
 	/**
 	* Validate and normalize a UTF string to NFD
 	*
-	* @param	string	$str	Unchecked UTF string
+	* @param	string	&$str	Unchecked UTF string
-	* @return	string			The string, validated and in normal form
+	* @return	void			Nothing is returned; $str is validated and normalized in place
 	*/
-	function nfd($str)
+	function nfd(&$str)
 	{
 		$pos = strspn($str, UTF8_ASCII_RANGE);
 		$len = strlen($str);
@@ -173,7 +172,7 @@ class utf_normalizer
 		if ($pos == $len)
 		{
 			// ASCII strings return immediately
-			return $str;
+			return;
 		}
 
 		if (!isset($GLOBALS['utf_canonical_decomp']))
@@ -182,16 +181,16 @@ class utf_normalizer
 			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
 		}
 
-		return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
+		$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
 	}
 
 	/**
 	* Validate and normalize a UTF string to NFKD
 	*
-	* @param	string	$str	Unchecked UTF string
+	* @param	string	&$str	Unchecked UTF string
-	* @return	string			The string, validated and in normal form
+	* @return	void			Nothing is returned; $str is validated and normalized in place
 	*/
-	function nfkd($str)
+	function nfkd(&$str)
 	{
 		$pos = strspn($str, UTF8_ASCII_RANGE);
 		$len = strlen($str);
@@ -199,7 +198,7 @@ class utf_normalizer
 		if ($pos == $len)
 		{
 			// ASCII strings return immediately
-			return $str;
+			return;
 		}
 
 		if (!isset($GLOBALS['utf_compatibility_decomp']))
@@ -208,19 +207,19 @@ class utf_normalizer
 			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
 		}
 
-		return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
+		$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
 	}
 
 
 	/**
 	* Recompose a UTF string
 	*
-	* @param	string	$str		Unchecked UTF string
-	* @param	integer	$pos		Position of the first UTF char (in bytes)
-	* @param	integer	$len		Length of the string (in bytes)
-	* @param	array	$qc			Quick-check array, passed by reference but never modified
-	* @param	array	$decomp_map	Decomposition mapping, passed by reference but never modified
-	* @return	string				The string, validated and recomposed
+	* @param	string	$str			Unchecked UTF string
+	* @param	integer	$pos			Position of the first UTF char (in bytes)
+	* @param	integer	$len			Length of the string (in bytes)
+	* @param	array	&$qc			Quick-check array, passed by reference but never modified
+	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified
+	* @return	string					The string, validated and recomposed
 	*
 	* @access	private
 	*/
@@ -239,14 +238,7 @@ class utf_normalizer
 		$tmp = '';
 		$i = $tmp_pos = $last_cc = 0;
 
-		if ($pos)
-		{
-			$buffer = array(++$i => $str[$pos - 1]);
-		}
-		else
-		{
-			$buffer = array();
-		}
+		$buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
 
 		// UTF char length array
 		// This array is used to determine the length of a UTF character.
@@ -327,16 +319,11 @@ class utf_normalizer
 							// has been encoded in a five- or six- byte sequence
 							if ($utf_char[0] >= "\xF8")
 							{
-								if ($utf_char[0] < "\xF8")
-								{
-									$trailing_bytes = 3;
-								}
-								else if ($utf_char[0] < "\xFC")
+								if ($utf_char[0] < "\xFC")
 								{
 									$trailing_bytes = 4;
 								}
-
-								if ($utf_char[0] > "\xFD")
+								else if ($utf_char[0] > "\xFD")
 								{
 									$trailing_bytes = 0;
 								}
@@ -923,17 +910,17 @@ class utf_normalizer
 	/**
 	* Decompose a UTF string
 	*
-	* @param	string	$str		UTF string
-	* @param	integer	$pos		Position of the first UTF char (in bytes)
-	* @param	integer	$len		Length of the string (in bytes)
-	* @param	array	$decomp_map	Decomposition mapping, passed by reference but never modified
-	* @return	string				The string, decomposed and sorted canonically
+	* @param	string	$str			UTF string
+	* @param	integer	$pos			Position of the first UTF char (in bytes)
+	* @param	integer	$len			Length of the string (in bytes)
+	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified
+	* @return	string					The string, decomposed and sorted canonically
 	*
 	* @access	private
 	*/
 	function decompose($str, $pos, $len, &$decomp_map)
 	{
-		global $utf_combining_class, $utf_canonical_decomp, $phpbb_root_path;
+		global $utf_combining_class, $phpbb_root_path;
 
 		// Load some commonly-used tables
 		if (!isset($utf_combining_class))
@@ -1011,7 +998,7 @@ class utf_normalizer
 								ksort($utf_sort);
 							}
 
-							foreach($utf_sort as $utf_chars)
+							foreach ($utf_sort as $utf_chars)
 							{
 								$tmp .= implode('', $utf_chars);
 							}
@@ -1365,17 +1352,17 @@ class utf_normalizer
 						// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
 						//
 						// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
-						if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT)
+						if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
 						{
-							if ($tIndex < 25)
+							if ($t_index < 25)
 							{
 								$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
-								$utf_char[8] = chr(0xA7 + $tIndex);
+								$utf_char[8] = chr(0xA7 + $t_index);
 							}
 							else
 							{
 								$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
-								$utf_char[8] = chr(0x67 + $tIndex);
+								$utf_char[8] = chr(0x67 + $t_index);
 							}
 						}
 						else
@@ -1478,7 +1465,6 @@ class utf_normalizer
 			}
 
 			return $tmp;
-
 		}
 		else if ($tmp_pos)
 		{