aboutsummaryrefslogtreecommitdiffstats
path: root/phpBB/develop/generate_utf_tables.php
diff options
context:
space:
mode:
author: Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> 2006-07-12 16:29:42 +0000
committer: Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> 2006-07-12 16:29:42 +0000
commit: 2cc250947314e27de739c3a1002356c7d5da8959 (patch)
tree: 92b77d0adc0264da20f2a8092a1ce00231bf3908 /phpBB/develop/generate_utf_tables.php
parent: 9c844b15cee3de0c50804aabe6e7545a12cc4277 (diff)
download: forums-2cc250947314e27de739c3a1002356c7d5da8959.tar
forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.gz
forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.bz2
forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.xz
forums-2cc250947314e27de739c3a1002356c7d5da8959.zip
Changed: split the tables used by the search indexer in order to load them on a need-to-use basis and preserve memory
git-svn-id: file:///svn/phpbb/trunk@6172 89ea8834-ac86-4346-8a33-228a782c2dd0
Diffstat (limited to 'phpBB/develop/generate_utf_tables.php')
-rw-r--r-- phpBB/develop/generate_utf_tables.php 46
1 file changed, 36 insertions, 10 deletions
diff --git a/phpBB/develop/generate_utf_tables.php b/phpBB/develop/generate_utf_tables.php
index 0f84e7b584..1d7fbc1d67 100644
--- a/phpBB/develop/generate_utf_tables.php
+++ b/phpBB/develop/generate_utf_tables.php
@@ -277,6 +277,8 @@ echo "\n*** UTF-8 normalization tables done\n\n";
/**
* Now we'll generate the files needed by the search indexer
*/
+echo "Generating search indexer tables\n";
+
$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
$map = array();
@@ -289,9 +291,11 @@ while ($line = fgets($fp, 1024))
$m = explode(';', $line);
/**
+ * @var integer $cp Current char codepoint
* @var string $utf_char UTF-8 representation of current char
*/
- $utf_char = hex_to_utf($m[0]);
+ $cp = hexdec($m[0]);
+ $utf_char = cp_to_utf($cp);
/**
* $m[2] holds the "General Category" of the character
@@ -314,21 +318,21 @@ while ($line = fgets($fp, 1024))
* @todo Note that ligatures with combining marks such as U+01E2 are
* not supported at this time
*/
- $map[$utf_char] = strtolower($capture[1]);
+ $map[$cp] = strtolower($capture[1]);
}
elseif (isset($m[13][0]))
{
/**
* If the letter has a lowercased form, use it
*/
- $map[$utf_char] = hex_to_utf($m[13]);
+ $map[$cp] = hex_to_utf($m[13]);
}
else
{
/**
* In all other cases, map the letter to itself
*/
- $map[$utf_char] = $utf_char;
+ $map[$cp] = $utf_char;
}
break;
@@ -336,7 +340,7 @@ while ($line = fgets($fp, 1024))
/**
* We allow all marks, they are mapped to themselves
*/
- $map[$utf_char] = $utf_char;
+ $map[$cp] = $utf_char;
break;
case 'N':
@@ -348,7 +352,7 @@ while ($line = fgets($fp, 1024))
* like "1/2", with a slash. However, "1/2" entered in ASCII is converted
* to "1 2". This will have to be fixed.
*/
- $map[$utf_char] = (isset($m[8][0])) ? $m[8] : $utf_char;
+ $map[$cp] = (isset($m[8][0])) ? $m[8] : $utf_char;
break;
default:
@@ -369,11 +373,33 @@ $cheats = array(
'00F6' => 'oe', # Small O with diaeresis
);
-echo count($map);
+/**
+* Add our "cheat replacements" to the map
+*/
+foreach ($cheats as $hex => $map_to)
+{
+ $map[hexdec($hex)] = $map_to;
+}
+
+/**
+* Split the map into smaller blocks
+*/
+$file_contents = array();
+foreach ($map as $cp => $map_to)
+{
+ $file_contents[$cp >> 11][cp_to_utf($cp)] = $map_to;
+}
+unset($map);
+
+foreach ($file_contents as $idx => $contents)
+{
+ echo "Writing to search_indexer_$idx.$phpEx\n";
+ $fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx, 'wb');
+ fwrite($fp, '<?php return ' . my_var_export($contents) . ';');
+ fclose($fp);
+}
+echo "\n*** Search indexer tables done\n\n";
-$fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer.php', 'wb');
-fwrite($fp, '<?php return ' . my_var_export($map) . ';');
-fclose($fp);
die("\nAll done!\n");