diff options
author | Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> | 2006-07-12 16:29:42 +0000 |
---|---|---|
committer | Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> | 2006-07-12 16:29:42 +0000 |
commit | 2cc250947314e27de739c3a1002356c7d5da8959 (patch) | |
tree | 92b77d0adc0264da20f2a8092a1ce00231bf3908 /phpBB/develop/generate_utf_tables.php | |
parent | 9c844b15cee3de0c50804aabe6e7545a12cc4277 (diff) | |
download | forums-2cc250947314e27de739c3a1002356c7d5da8959.tar forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.gz forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.bz2 forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.xz forums-2cc250947314e27de739c3a1002356c7d5da8959.zip |
Changed: split the tables used by the search indexer in order to load them on a need-to-use basis and preserve memory
git-svn-id: file:///svn/phpbb/trunk@6172 89ea8834-ac86-4346-8a33-228a782c2dd0
Diffstat (limited to 'phpBB/develop/generate_utf_tables.php')
-rw-r--r-- | phpBB/develop/generate_utf_tables.php | 46 |
1 files changed, 36 insertions, 10 deletions
```diff
diff --git a/phpBB/develop/generate_utf_tables.php b/phpBB/develop/generate_utf_tables.php
index 0f84e7b584..1d7fbc1d67 100644
--- a/phpBB/develop/generate_utf_tables.php
+++ b/phpBB/develop/generate_utf_tables.php
@@ -277,6 +277,8 @@ echo "\n*** UTF-8 normalization tables done\n\n";
 /**
 * Now we'll generate the files needed by the search indexer
 */
+echo "Generating search indexer tables\n";
+
 $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
 
 $map = array();
@@ -289,9 +291,11 @@ while ($line = fgets($fp, 1024))
 	$m = explode(';', $line);
 
 	/**
+	* @var integer $cp Current char codepoint
 	* @var string $utf_char UTF-8 representation of current char
 	*/
-	$utf_char = hex_to_utf($m[0]);
+	$cp = hexdec($m[0]);
+	$utf_char = cp_to_utf($cp);
 
 	/**
 	* $m[2] holds the "General Category" of the character
@@ -314,21 +318,21 @@ while ($line = fgets($fp, 1024))
 				* @todo Note that ligatures with combining marks such as U+01E2 are
 				* not supported at this time
 				*/
-				$map[$utf_char] = strtolower($capture[1]);
+				$map[$cp] = strtolower($capture[1]);
 			}
 			elseif (isset($m[13][0]))
 			{
 				/**
 				* If the letter has a lowercased form, use it
 				*/
-				$map[$utf_char] = hex_to_utf($m[13]);
+				$map[$cp] = hex_to_utf($m[13]);
 			}
 			else
 			{
 				/**
 				* In all other cases, map the letter to itself
 				*/
-				$map[$utf_char] = $utf_char;
+				$map[$cp] = $utf_char;
 			}
 			break;
 
@@ -336,7 +340,7 @@ while ($line = fgets($fp, 1024))
 			/**
 			* We allow all marks, they are mapped to themselves
 			*/
-			$map[$utf_char] = $utf_char;
+			$map[$cp] = $utf_char;
 			break;
 
 		case 'N':
@@ -348,7 +352,7 @@ while ($line = fgets($fp, 1024))
 			* like "1/2", with a slash. However, "1/2" entered in ASCII is converted
 			* to "1 2". This will have to be fixed.
 			*/
-			$map[$utf_char] = (isset($m[8][0])) ? $m[8] : $utf_char;
+			$map[$cp] = (isset($m[8][0])) ? $m[8] : $utf_char;
 			break;
 
 		default:
@@ -369,11 +373,33 @@ $cheats = array(
 	'00F6' => 'oe', # Small O with diaeresis
 );
 
-echo count($map);
+/**
+* Add our "cheat replacements" to the map
+*/
+foreach ($cheats as $hex => $map_to)
+{
+	$map[hexdec($hex)] = $map_to;
+}
+
+/**
+* Split the map into smaller blocks
+*/
+$file_contents = array();
+foreach ($map as $cp => $map_to)
+{
+	$file_contents[$cp >> 11][cp_to_utf($cp)] = $map_to;
+}
+unset($map);
+
+foreach ($file_contents as $idx => $contents)
+{
+	echo "Writing to search_indexer_$idx.$phpEx\n";
+	$fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx, 'wb');
+	fwrite($fp, '<?php return ' . my_var_export($contents) . ';');
+	fclose($fp);
+}
+echo "\n*** Search indexer tables done\n\n";
 
-$fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer.php', 'wb');
-fwrite($fp, '<?php return ' . my_var_export($map) . ';');
-fclose($fp);
 
 die("\nAll done!\n");
 
```