array(
                'c2' => array('c1', 'c2', 'c3'),
                'c4' => array('c4', 'c5')
            ),

            /**
             * NFD
             *   c3 == NFD(c1) == NFD(c2) == NFD(c3)
             *   c5 == NFD(c4) == NFD(c5)
             */
            'NFD' => array(
                'c3' => array('c1', 'c2', 'c3'),
                'c5' => array('c4', 'c5')
            ),

            /**
             * NFKC
             *   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
             */
            'NFKC' => array(
                'c4' => array('c1', 'c2', 'c3', 'c4', 'c5')
            ),

            /**
             * NFKD
             *   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
             */
            'NFKD' => array(
                'c5' => array('c1', 'c2', 'c3', 'c4', 'c5')
            )
        );

        $tested_chars = array();

        $fp = fopen(self::$data_dir . '/NormalizationTest.txt', 'rb');
        while (!feof($fp)) {
            $line = fgets($fp);

            if ($line[0] == '@') {
                continue;
            }

            // The haystack starts with a space so that every hex digit sits at
            // an offset >= 1; a result of 0 or false both mean "skip this line"
            if (!strpos(' 0123456789ABCDEF', $line[0])) {
                continue;
            }

            list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);

            if (!strpos($c1, ' ')) {
                /**
                 * We are currently testing a single character, we add it to the list of
                 * characters we have processed so that we can exclude it when testing
                 * for invariants
                 */
                $tested_chars[$c1] = 1;
            }

            foreach ($test_suite as $form => $serie) {
                foreach ($serie as $expected => $tests) {
                    $hex_expected = ${$expected};
                    $utf_expected = $this->hexseq_to_utf($hex_expected);

                    foreach ($tests as $test) {
                        // NOTE(review): $test only appears in the failure message below;
                        // the normalizer is fed $utf_expected, not ${$test} - confirm
                        // whether that is intended
                        $utf_result = $utf_expected;
                        call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

                        $hex_result = $this->utf_to_hexseq($utf_result);
                        $this->assertEquals($utf_expected, $utf_result, "$expected == $form($test) ($hex_expected != $hex_result)");
                    }
                }
            }
        }
        fclose($fp);

        return $tested_chars;
    }

    /**
     * Check that code points listed in UnicodeData.txt which were not already
     * recorded by test_normalizer come back unchanged from each of the four
     * normalization forms. Lone surrogates are expected to come back as
     * UTF8_REPLACEMENT instead.
     *
     * @depends test_normalizer
     *
     * @param array $tested_chars Characters to skip, keyed by their hex codepoint
     *                            as returned by test_normalizer
     */
    public function test_invariants(array $tested_chars)
    {
        $path = self::$data_dir . '/UnicodeData.txt';
        $fp = fopen($path, 'rb');

        if ($fp === false) {
            // Reading from an invalid handle never reaches EOF on older PHP
            // versions, so fail loudly instead of looping forever
            $this->fail("Unable to open $path for reading");
        }

        while (($line = fgets($fp, 1024)) !== false) {
            $pos = strpos($line, ';');

            if (!$pos) {
                // No separator, or nothing in front of it: not a data line
                continue;
            }

            $hex_tested = $hex_expected = substr($line, 0, $pos);

            if (isset($tested_chars[$hex_tested])) {
                continue;
            }

            $utf_expected = $this->hex_to_utf($hex_expected);

            if ($utf_expected >= UTF8_SURROGATE_FIRST
                && $utf_expected <= UTF8_SURROGATE_LAST) {
                /**
                 * Surrogates are illegal on their own, we expect the normalizer
                 * to return a replacement char
                 */
                $utf_expected = UTF8_REPLACEMENT;
                $hex_expected = $this->utf_to_hexseq($utf_expected);
            }

            foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form) {
                $utf_result = $utf_expected;
                call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

                $hex_result = $this->utf_to_hexseq($utf_result);
                $this->assertEquals($utf_expected, $utf_result, "$hex_expected == $form($hex_tested) ($hex_expected != $hex_result)");
            }
        }
        fclose($fp);
    }

    /**
     * Convert a UTF-8 string to a sequence of codepoints in hexadecimal
     *
     * Each codepoint is written in uppercase hex, left-padded with zeros to at
     * least four digits, and the codepoints are separated by a single space.
     *
     * @param string $str UTF-8 string
     * @return string Unicode codepoints in hex, separated with a space
     */
    protected function utf_to_hexseq($str)
    {
        $pos = 0;
        $len = strlen($str);
        $ret = array();

        while ($pos < $len) {
            $c = $str[$pos];

            // The high nibble of the lead byte gives the length of the sequence
            switch ($c & "\xF0") {
                case "\xC0":
                case "\xD0":
                    $utf_char = substr($str, $pos, 2);
                    $pos += 2;
                    break;

                case "\xE0":
                    $utf_char = substr($str, $pos, 3);
                    $pos += 3;
                    break;

                case "\xF0":
                    $utf_char = substr($str, $pos, 4);
                    $pos += 4;
                    break;

                default:
                    $utf_char = $c;
                    ++$pos;
            }

            $ret[] = str_pad(dechex($this->utf_to_cp($utf_char)), 4, '0', STR_PAD_LEFT);
        }

        // strtr() rather than strtoupper() keeps the result locale-independent
        return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
    }

    /**
     * Convert a UTF-8 char to its codepoint
     *
     * @param string $utf_char UTF-8 char
     * @return integer Unicode codepoint
     * @throws RuntimeException If the given string is not 1 to 4 bytes long
     */
    protected function utf_to_cp($utf_char)
    {
        switch (strlen($utf_char)) {
            case 1:
                return ord($utf_char);

            case 2:
                return ((ord($utf_char[0]) & 0x1F) << 6)
                    | (ord($utf_char[1]) & 0x3F);

            case 3:
                return ((ord($utf_char[0]) & 0x0F) << 12)
                    | ((ord($utf_char[1]) & 0x3F) << 6)
                    | (ord($utf_char[2]) & 0x3F);

            case 4:
                return ((ord($utf_char[0]) & 0x07) << 18)
                    | ((ord($utf_char[1]) & 0x3F) << 12)
                    | ((ord($utf_char[2]) & 0x3F) << 6)
                    | (ord($utf_char[3]) & 0x3F);

            default:
                throw new RuntimeException('UTF-8 chars can only be 1-4 bytes long');
        }
    }

    /**
     * Return a UTF-8 string formed from a sequence of codepoints in hexadecimal
     *
     * @param string $seq Sequence of codepoints, separated with a space
     * @return string UTF-8 string
     */
    protected function hexseq_to_utf($seq)
    {
        return implode('', array_map(array($this, 'hex_to_utf'), explode(' ', $seq)));
    }

    /**
     * Convert a codepoint in hexadecimal to a UTF-8 char
     *
     * @param string $hex Codepoint, in hexadecimal
     * @return string UTF-8 char
     */
    protected function hex_to_utf($hex)
    {
        return $this->cp_to_utf(hexdec($hex));
    }

    /**
     * Convert a codepoint to a UTF-8 char
     *
     * The number of bytes produced (1 to 4) depends on the size of the codepoint.
     *
     * @param integer $cp Unicode codepoint
     * @return string UTF-8 string
     */
    protected function cp_to_utf($cp)
    {
        if ($cp > 0xFFFF) {
            return chr(0xF0 | ($cp >> 18))
                . chr(0x80 | (($cp >> 12) & 0x3F))
                . chr(0x80 | (($cp >> 6) & 0x3F))
                . chr(0x80 | ($cp & 0x3F));
        } else if ($cp > 0x7FF) {
            return chr(0xE0 | ($cp >> 12))
                . chr(0x80 | (($cp >> 6) & 0x3F))
                . chr(0x80 | ($cp & 0x3F));
        } else if ($cp > 0x7F) {
            return chr(0xC0 | ($cp >> 6))
                . chr(0x80 | ($cp & 0x3F));
        } else {
            return chr($cp);
        }
    }

    /**
     * Chunked download helper
     *
     * Copies $url into the directory $to, under the URL's basename, reading it
     * in fixed-size chunks. Does nothing when the target file already exists.
     * Failures to open either side are reported on standard output and the
     * method returns without raising.
     *
     * @param string $url Location of the file to fetch
     * @param string $to  Directory the file is saved into
     * @return void
     */
    static protected function download($url, $to)
    {
        $target = $to . '/' . basename($url);

        if (file_exists($target)) {
            return;
        }

        if (!$fpr = fopen($url, 'rb')) {
            echo "Failed to download $url\n";
            return;
        }

        if (!$fpw = fopen($target, 'wb')) {
            // Do not leave the source stream open when bailing out
            fclose($fpr);
            echo "Failed to open $target for writing\n";
            return;
        }

        $chunk = 32768;

        while (!feof($fpr)) {
            $data = fread($fpr, $chunk);

            if ($data === false) {
                // A failed read may never flag EOF; stop instead of spinning
                break;
            }

            fwrite($fpw, $data);
        }

        fclose($fpr);
        fclose($fpw);
    }
}