#!/usr/bin/perl -w # # The Intltool Message Merger # # Copyright (C) 2000, 2002 Free Software Foundation. # Copyright (C) 2000, 2001 Eazel, Inc # # Intltool is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # version 2 published by the Free Software Foundation. # # Intltool is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # # Authors: Maciej Stachowiak <mjs@noisehavoc.org> # Kenneth Christiansen <kenneth@gnu.org> # Darin Adler <darin@bentspoon.com> # # Proper XML UTF-8'ification written by Cyrille Chepelov <chepelov@calixo.net> # ## Release information my $PROGRAM = "intltool-merge"; my $PACKAGE = "intltool"; my $VERSION = "0.22"; ## Loaded modules use strict; use Getopt::Long; ## Scalars used by the option stuff my $HELP_ARG = 0; my $VERSION_ARG = 0; my $BA_STYLE_ARG = 0; my $XML_STYLE_ARG = 0; my $KEYS_STYLE_ARG = 0; my $DESKTOP_STYLE_ARG = 0; my $SCHEMAS_STYLE_ARG = 0; my $QUIET_ARG = 0; my $PASS_THROUGH_ARG = 0; my $UTF8_ARG = 0; my $cache_file; ## Handle options GetOptions ( "help" => \$HELP_ARG, "version" => \$VERSION_ARG, "quiet|q" => \$QUIET_ARG, "oaf-style|o" => \$BA_STYLE_ARG, ## for compatibility "ba-style|b" => \$BA_STYLE_ARG, "xml-style|x" => \$XML_STYLE_ARG, "keys-style|k" => \$KEYS_STYLE_ARG, "desktop-style|d" => \$DESKTOP_STYLE_ARG, "schemas-style|s" => \$SCHEMAS_STYLE_ARG, "pass-through|p" => \$PASS_THROUGH_ARG, "utf8|u" => \$UTF8_ARG, "cache|c=s" => \$cache_file ) or &error; my $PO_DIR; my $FILE; my $OUTFILE; my %po_files_by_lang = (); my %translations = (); # Use this instead of \w for XML files to handle more possible characters. my $w = "[-A-Za-z0-9._:]"; # XML quoted string contents my $q = "[^\\\"]*"; ## Check for options. if ($VERSION_ARG) { &print_version; } elsif ($HELP_ARG) { &print_help; } elsif ($BA_STYLE_ARG && @ARGV > 2) { &preparation; &print_message; &ba_merge_translations; &finalize; } elsif ($XML_STYLE_ARG && @ARGV > 2) { &utf8_sanity_check; &preparation; &print_message; &xml_merge_translations; &finalize; } elsif ($KEYS_STYLE_ARG && @ARGV > 2) { &utf8_sanity_check; &preparation; &print_message; &keys_merge_translations; &finalize; } elsif ($DESKTOP_STYLE_ARG && @ARGV > 2) { &preparation; &print_message; &desktop_merge_translations; &finalize; } elsif ($SCHEMAS_STYLE_ARG && @ARGV > 2) { &preparation; &print_message; &schemas_merge_translations; &finalize; } else { &print_help; } exit; ## Sub for printing release information sub print_version { print "${PROGRAM} (${PACKAGE}) ${VERSION}\n"; print "Written by Maciej Stachowiak, Darin Adler and Kenneth Christiansen.\n\n"; print "Copyright (C) 2000-2002 Free Software Foundation, Inc.\n"; print "Copyright (C) 2000-2001 Eazel, Inc.\n"; print "This is free software; see the source for copying conditions. There is NO\n"; print "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"; exit; } ## Sub for printing usage information sub print_help { print "Usage: ${PROGRAM} [OPTIONS] PO_DIRECTORY FILENAME OUTPUT_FILE\n"; print "Generates an output file that includes translated versions of some attributes,\n"; print "from an untranslated source and a po directory that includes translations.\n\n"; print " -b, --ba-style includes translations in the bonobo-activation style\n"; print " -d, --desktop-style includes translations in the desktop style\n"; print " -k, --keys-style includes translations in the keys style\n"; print " -s, --schemas-style includes translations in the schemas style\n"; print " -x, --xml-style includes translations in the standard xml style\n"; print " -u, --utf8 convert all strings to UTF-8 before merging\n"; print " -p, --pass-through use strings as found in .po files, without\n"; print " conversion (STRONGLY unrecommended with -x)\n"; print " -q, --quiet suppress most messages\n"; print " --help display this help and exit\n"; print " --version output version information and exit\n"; print "\nReport bugs to bugzilla.gnome.org, module intltool, or contact us through \n"; print "<xml-i18n-tools-list\@gnome.org>.\n"; exit; } ## Sub for printing error messages sub print_error { print "Try `${PROGRAM} --help' for more information.\n"; exit; } sub print_message { print "Merging translations into $OUTFILE.\n" unless $QUIET_ARG; } sub preparation { $PO_DIR = $ARGV[0]; $FILE = $ARGV[1]; $OUTFILE = $ARGV[2]; &gather_po_files; &get_translation_database; } # General-purpose code for looking up translations in .po files sub po_file2lang { my ($tmp) = @_; $tmp =~ s/^.*\/(.*)\.po$/$1/; return $tmp; } sub gather_po_files { for my $po_file (glob "$PO_DIR/*.po") { $po_files_by_lang{po_file2lang($po_file)} = $po_file; } } sub get_po_encoding { my ($in_po_file) = @_; my $encoding = ""; open IN_PO_FILE, $in_po_file or die; while (<IN_PO_FILE>) { ## example: "Content-Type: text/plain; charset=ISO-8859-1\n" if (/Content-Type\:.*charset=([-a-zA-Z0-9]+)\\n/) { $encoding = $1; last; } } close IN_PO_FILE; if (!$encoding) { print "Warning: no encoding found in $in_po_file. Assuming ISO-8859-1\n"; $encoding = "ISO-8859-1"; } return $encoding } sub utf8_sanity_check { if (!$UTF8_ARG) { if (!$PASS_THROUGH_ARG) { $PASS_THROUGH_ARG="1"; } } } sub get_translation_database { if ($cache_file) { &get_cached_translation_database; } else { &create_translation_database; } } sub get_newest_po_age { my $newest_age; foreach my $file (values %po_files_by_lang) { my $file_age = -M $file; $newest_age = $file_age if !$newest_age || $file_age < $newest_age; } return $newest_age; } sub create_cache { print "Generating and caching the translation database\n" unless $QUIET_ARG; &create_translation_database; open CACHE, ">$cache_file" || die; print CACHE join "\x01", %translations; close CACHE; } sub load_cache { print "Found cached translation database\n" unless $QUIET_ARG; my $contents; open CACHE, "<$cache_file" || die; { local $/; $contents = <CACHE>; } close CACHE; %translations = split "\x01", $contents; } sub get_cached_translation_database { my $cache_file_age = -M $cache_file; if (defined $cache_file_age) { if ($cache_file_age <= &get_newest_po_age) { &load_cache; return; } print "Found too-old cached translation database\n" unless $QUIET_ARG; } &create_cache; } sub create_translation_database { for my $lang (keys %po_files_by_lang) { my $po_file = $po_files_by_lang{$lang}; if ($UTF8_ARG) { my $encoding = get_po_encoding ($po_file); if (lc $encoding eq "utf-8") { open PO_FILE, "<$po_file"; } else { my $iconv = $ENV{"INTLTOOL_ICONV"} || "iconv"; open PO_FILE, "$iconv -f $encoding -t UTF-8 $po_file|"; } } else { open PO_FILE, "<$po_file"; } my $nextfuzzy = 0; my $inmsgid = 0; my $inmsgstr = 0; my $msgid = ""; my $msgstr = ""; while (<PO_FILE>) { $nextfuzzy = 1 if /^#, fuzzy/; if (/^msgid "((\\.|[^\\])*)"/ ) { $translations{$lang, $msgid} = $msgstr if $inmsgstr && $msgid && $msgstr; $msgid = ""; $msgstr = ""; if ($nextfuzzy) { $inmsgid = 0; } else { $msgid = unescape_po_string($1); $inmsgid = 1; } $inmsgstr = 0; $nextfuzzy = 0; } if (/^msgstr "((\\.|[^\\])*)"/) { $msgstr = unescape_po_string($1); $inmsgstr = 1; $inmsgid = 0; } if (/^"((\\.|[^\\])*)"/) { $msgid .= unescape_po_string($1) if $inmsgid; $msgstr .= unescape_po_string($1) if $inmsgstr; } } $translations{$lang, $msgid} = $msgstr if $inmsgstr && $msgid && $msgstr; } } sub finalize { } sub unescape_one_sequence { my ($sequence) = @_; return "\\" if $sequence eq "\\\\"; return "\"" if $sequence eq "\\\""; # gettext also handles \n, \t, \b, \r, \f, \v, \a, \xxx (octal), # \xXX (hex) and has a comment saying they want to handle \u and \U. return $sequence; } sub unescape_po_string { my ($string) = @_; $string =~ s/(\\.)/unescape_one_sequence($1)/eg; return $string; } sub entity_decode { local ($_) = @_; s/'/'/g; # ' s/"/"/g; # " s/&/&/g; return $_; } sub entity_encode { my ($pre_encoded) = @_; my @list_of_chars = unpack ('C*', $pre_encoded); if ($PASS_THROUGH_ARG) { return join ('', map (&entity_encode_int_even_high_bit, @list_of_chars)); } else { return join ('', map (&entity_encode_int_minimalist, @list_of_chars)); } } sub entity_encode_int_minimalist { return """ if $_ == 34; return "&" if $_ == 38; return "'" if $_ == 39; return chr $_; } sub entity_encode_int_even_high_bit { if ($_ > 127 || $_ == 34 || $_ == 38 || $_ == 39) { # the ($_ > 127) should probably be removed return "&#" . $_ . ";"; } else { return chr $_; } } sub entity_encoded_translation { my ($lang, $string) = @_; my $translation = $translations{$lang, $string}; return $string if !$translation; return entity_encode ($translation); } ## XML (bonobo-activation specific) merge code sub ba_merge_translations { my $source; { local $/; # slurp mode open INPUT, "<$FILE" or die "can't open $FILE: $!"; $source = <INPUT>; close INPUT; } open OUTPUT, ">$OUTFILE" or die "can't open $OUTFILE: $!"; while ($source =~ s|^(.*?)([ \t]*<\s*$w+\s+($w+\s*=\s*"$q"\s*)+/?>)([ \t]*\n)?||s) { print OUTPUT $1; my $node = $2 . "\n"; my @strings = (); $_ = $node; while (s/(\s)_($w+\s*=\s*"($q)")/$1$2/s) { push @strings, entity_decode($3); } print OUTPUT; my %langs; for my $string (@strings) { for my $lang (keys %po_files_by_lang) { $langs{$lang} = 1 if $translations{$lang, $string}; } } for my $lang (sort keys %langs) { $_ = $node; s/(\sname\s*=\s*)"($q)"/$1"$2-$lang"/s; s/(\s)_($w+\s*=\s*")($q)"/$1 . $2 . entity_encoded_translation($lang, $3) . '"'/seg; print OUTPUT; } } print OUTPUT $source; close OUTPUT; } ## XML (non-bonobo-activation) merge code sub xml_merge_translations { my $source; { local $/; # slurp mode open INPUT, "<$FILE" or die "can't open $FILE: $!"; $source = <INPUT>; close INPUT; } open OUTPUT, ">$OUTFILE" or die; # FIXME: support attribute translations # Empty nodes never need translation, so unmark all of them. # For example, <_foo/> is just replaced by <foo/>. $source =~ s|<\s*_($w+)\s*/>|<$1/>|g; # Support for <_foo>blah</_foo> style translations. while ($source =~ s|^(.*?)([ \t]*)<\s*_($w+)\s*>(.*?)<\s*/_\3\s*>([ \t]*\n)?||s) { print OUTPUT $1; my $spaces = $2; my $tag = $3; my $string = $4; print OUTPUT "$spaces<$tag>$string</$tag>\n"; $string =~ s/\s+/ /g; $string =~ s/^ //; $string =~ s/ $//; $string = entity_decode($string); for my $lang (sort keys %po_files_by_lang) { my $translation = $translations{$lang, $string}; next if !$translation; $translation = entity_encode($translation); print OUTPUT "$spaces<$tag xml:lang=\"$lang\">$translation</$tag>\n"; } } print OUTPUT $source; close OUTPUT; } sub keys_merge_translations { open INPUT, "<${FILE}" or die; open OUTPUT, ">${OUTFILE}" or die; while (<INPUT>) { if (s/^(\s*)_(\w+=(.*))/$1$2/) { my $string = $3; print OUTPUT; my $non_translated_line = $_; for my $lang (sort keys %po_files_by_lang) { my $translation = $translations{$lang, $string}; next if !$translation; $_ = $non_translated_line; s/(\w+)=.*/[$lang]$1=$translation/; print OUTPUT; } } else { print OUTPUT; } } close OUTPUT; close INPUT; } sub desktop_merge_translations { open INPUT, "<${FILE}" or die; open OUTPUT, ">${OUTFILE}" or die; while (<INPUT>) { if (s/^(\s*)_(\w+=(.*))/$1$2/) { my $string = $3; print OUTPUT; my $non_translated_line = $_; for my $lang (sort keys %po_files_by_lang) { my $translation = $translations{$lang, $string}; next if !$translation; $_ = $non_translated_line; s/(\w+)=.*/${1}[$lang]=$translation/; print OUTPUT; } } else { print OUTPUT; } } close OUTPUT; close INPUT; } sub schemas_merge_translations { my $source; { local $/; # slurp mode open INPUT, "<$FILE" or die "can't open $FILE: $!"; $source = <INPUT>; close INPUT; } open OUTPUT, ">$OUTFILE" or die; # FIXME: support attribute translations # Empty nodes never need translation, so unmark all of them. # For example, <_foo/> is just replaced by <foo/>. $source =~ s|<\s*_($w+)\s*/>|<$1/>|g; # Support for <_foo>blah</_foo> style translations. my $regex_start = "^(.*?)([ \t]*)<locale name=\"C\">"; my $regex_short = "([ \t\n]*)<short>(.*?)</short>"; my $regex_long = "([ \t\n]*)<long>(.*?)</long>"; my $regex_end = "([ \t\n]*)</locale>"; while ($source =~ s|$regex_start$regex_short$regex_long$regex_end||s) { print OUTPUT $1; my $locale_start_spaces = $2; my $locale_end_spaces = $7; my $short_spaces = $3; my $short_string = $4; my $long_spaces = $5; my $long_string = $6; # English first print OUTPUT "$locale_start_spaces<locale name=\"C\">"; print OUTPUT "$short_spaces<short>$short_string</short>"; print OUTPUT "$long_spaces<long>$long_string</long>"; print OUTPUT "$locale_end_spaces</locale>"; $short_string =~ s/\s+/ /g; $short_string =~ s/^ //; $short_string =~ s/ $//; $short_string = entity_decode($short_string); $long_string =~ s/\s+/ /g; $long_string =~ s/^ //; $long_string =~ s/ $//; $long_string = entity_decode($long_string); for my $lang (sort keys %po_files_by_lang) { my $short_translation = $translations{$lang, $short_string}; my $long_translation = $translations{$lang, $long_string}; next if (!$short_translation && !$long_translation); print OUTPUT "\n$locale_start_spaces<locale name=\"$lang\">"; if ($short_translation) { $short_translation = entity_encode($short_translation); print OUTPUT "$short_spaces<short>$short_translation</short>"; } if ($long_translation) { $long_translation = entity_encode($long_translation); print OUTPUT "$long_spaces<long>$long_translation</long>"; } print OUTPUT "$locale_end_spaces</locale>"; } } print OUTPUT $source; close OUTPUT; }