about | summary | refs | log | tree | commit | diff | stats
path: root/phpBB/develop/generate_utf_confusables.php
diff options
context:
space:
mode:
author: David M <davidmj@users.sourceforge.net> 2007-07-19 20:38:08 +0000
committer: David M <davidmj@users.sourceforge.net> 2007-07-19 20:38:08 +0000
commit: 870991c0608caf7467018245a0a4e1f1d55efd12 (patch)
tree: 95a81e7c78ba914d9970e3e6de8dd1a82d31b534 /phpBB/develop/generate_utf_confusables.php
parent: e9b908174cbeed0c780c56d6d74b761e40d8b594 (diff)
download: forums-870991c0608caf7467018245a0a4e1f1d55efd12.tar
forums-870991c0608caf7467018245a0a4e1f1d55efd12.tar.gz
forums-870991c0608caf7467018245a0a4e1f1d55efd12.tar.bz2
forums-870991c0608caf7467018245a0a4e1f1d55efd12.tar.xz
forums-870991c0608caf7467018245a0a4e1f1d55efd12.zip
let's remove 350+ impossible entries
git-svn-id: file:///svn/phpbb/trunk@7908 89ea8834-ac86-4346-8a33-228a782c2dd0
Diffstat (limited to 'phpBB/develop/generate_utf_confusables.php')
-rw-r--r-- phpBB/develop/generate_utf_confusables.php | 18
1 files changed, 17 insertions, 1 deletions
diff --git a/phpBB/develop/generate_utf_confusables.php b/phpBB/develop/generate_utf_confusables.php
index e2fd5bbaa3..908ebbf6f4 100644
--- a/phpBB/develop/generate_utf_confusables.php
+++ b/phpBB/develop/generate_utf_confusables.php
@@ -30,15 +30,22 @@ $phpEx = substr(strrchr(__FILE__, '.'), 1);
echo "Checking for required files\n";
download('http://unicode.org/reports/tr39/data/confusables.txt');
+download('http://unicode.org/Public/UNIDATA/CaseFolding.txt');
echo "\n";
/**
-* Load the CaseFolding table
+* Load the confusables table
*/
echo "Loading confusables\n";
$unidata = file_get_contents('confusables.txt');
+/**
+* Load the CaseFolding table
+*/
+echo "Loading CaseFolding\n";
+$casefolds = file_get_contents('CaseFolding.txt');
+
function utf8_chr($cp)
{
@@ -61,6 +68,7 @@ function utf8_chr($cp)
}
preg_match_all('/^([0-9A-F]+) ;\s((?:[0-9A-F]+ )*);.*?$/im', $unidata, $array, PREG_SET_ORDER);
+preg_match_all('/^([0-9A-F]+); ([CFS]); ([0-9A-F]+(?: [0-9A-F]+)*);/im', $casefolds, $casefold_array);
// some that we defined ourselves
$uniarray = array(
@@ -136,6 +144,14 @@ foreach ($array as $value)
$temp_hold = str_replace(utf8_chr(0x0031), utf8_chr(0x006C), $temp_hold);
}
}
+
+ // uppercased chars that were folded do not exist in this universe,
+ // no amount of normalization could ever "trick" this into not working
+ if (in_array($value[1], $casefold_array[1]))
+ {
+ continue;
+ }
+
$uniarray[utf8_chr(hexdec((string)$value[1]))] = $temp_hold;
}