Changed: split the tables used by the search indexer into smaller files so they can be loaded on an as-needed basis, reducing memory usage

git-svn-id: file:///svn/phpbb/trunk@6172 89ea8834-ac86-4346-8a33-228a782c2dd0
author: Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> 2006-07-12 16:29:42 +0000
committer: Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net> 2006-07-12 16:29:42 +0000
commit: 2cc250947314e27de739c3a1002356c7d5da8959 (patch)
tree: 92b77d0adc0264da20f2a8092a1ce00231bf3908 /phpBB/develop/generate_utf_tables.php
parent: 9c844b15cee3de0c50804aabe6e7545a12cc4277 (diff)
download: forums-2cc250947314e27de739c3a1002356c7d5da8959.tar
forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.gz
forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.bz2
forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.xz
forums-2cc250947314e27de739c3a1002356c7d5da8959.zip
1 files changed, 36 insertions, 10 deletions
diff --git a/phpBB/develop/generate_utf_tables.php b/phpBB/develop/generate_utf_tables.php
index 0f84e7b584..1d7fbc1d67 100644
--- a/phpBB/develop/generate_utf_tables.php
+++ b/phpBB/develop/generate_utf_tables.php
@@ -277,6 +277,8 @@ echo "\n*** UTF-8 normalization tables done\n\n";
 /**
 * Now we'll generate the files needed by the search indexer
 */
+echo "Generating search indexer tables\n";
+
 $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
 
 $map = array();
@@ -289,9 +291,11 @@ while ($line = fgets($fp, 1024))
 	$m = explode(';', $line);
 
 	/**
+	* @var	integer	$cp			Current char codepoint
 	* @var	string	$utf_char	UTF-8 representation of current char
 	*/
-	$utf_char = hex_to_utf($m[0]);
+	$cp = hexdec($m[0]);
+	$utf_char = cp_to_utf($cp);
 
 	/**
 	* $m[2] holds the "General Category" of the character
@@ -314,21 +318,21 @@ while ($line = fgets($fp, 1024))
 				* @todo Note that ligatures with combining marks such as U+01E2 are
 				* not supported at this time
 				*/
-				$map[$utf_char] = strtolower($capture[1]);
+				$map[$cp] = strtolower($capture[1]);
 			}
 			elseif (isset($m[13][0]))
 			{
 				/**
 				* If the letter has a lowercased form, use it
 				*/
-				$map[$utf_char] = hex_to_utf($m[13]);
+				$map[$cp] = hex_to_utf($m[13]);
 			}
 			else
 			{
 				/**
 				* In all other cases, map the letter to itself
 				*/
-				$map[$utf_char] = $utf_char;
+				$map[$cp] = $utf_char;
 			}
 			break;
 
@@ -336,7 +340,7 @@ while ($line = fgets($fp, 1024))
 			/**
 			* We allow all marks, they are mapped to themselves
 			*/
-			$map[$utf_char] = $utf_char;
+			$map[$cp] = $utf_char;
 			break;
 
 		case 'N':
@@ -348,7 +352,7 @@ while ($line = fgets($fp, 1024))
 			* like "1/2", with a slash. However, "1/2" entered in ASCII is converted
 			* to "1 2". This will have to be fixed.
 			*/
-			$map[$utf_char] = (isset($m[8][0])) ? $m[8] : $utf_char;
+			$map[$cp] = (isset($m[8][0])) ? $m[8] : $utf_char;
 			break;
 
 		default:
@@ -369,11 +373,33 @@ $cheats = array(
 	'00F6'	=>	'oe',		#	Small O with diaeresis
 );
 
-echo count($map);
+/**
+* Add our "cheat replacements" to the map
+*/
+foreach ($cheats as $hex => $map_to)
+{
+	$map[hexdec($hex)] = $map_to;
+}
+
+/**
+* Split the map into blocks of 2048 codepoints each (block index = $cp >> 11); each block is written to its own file
+*/
+$file_contents = array();
+foreach ($map as $cp => $map_to)
+{
+	$file_contents[$cp >> 11][cp_to_utf($cp)] = $map_to;
+}
+unset($map);
+
+foreach ($file_contents as $idx => $contents)
+{
+	echo "Writing to search_indexer_$idx.$phpEx\n";
+	$fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx, 'wb');
+	fwrite($fp, '<?php return ' . my_var_export($contents) . ';');
+	fclose($fp);
+}
+echo "\n*** Search indexer tables done\n\n";
 
-$fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer.php', 'wb');
-fwrite($fp, '<?php return ' . my_var_export($map) . ';');
-fclose($fp);
 
 die("\nAll done!\n");
author	Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net>	2006-07-12 16:29:42 +0000
committer	Ludovic Arnaud <ludovic_arnaud@users.sourceforge.net>	2006-07-12 16:29:42 +0000
commit	2cc250947314e27de739c3a1002356c7d5da8959 (patch)
tree	92b77d0adc0264da20f2a8092a1ce00231bf3908 /phpBB/develop/generate_utf_tables.php
parent	9c844b15cee3de0c50804aabe6e7545a12cc4277 (diff)
download	forums-2cc250947314e27de739c3a1002356c7d5da8959.tar forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.gz forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.bz2 forums-2cc250947314e27de739c3a1002356c7d5da8959.tar.xz forums-2cc250947314e27de739c3a1002356c7d5da8959.zip