diff options
Diffstat (limited to 'data/intltool-merge')
-rwxr-xr-x | data/intltool-merge | 657 |
1 files changed, 657 insertions, 0 deletions
diff --git a/data/intltool-merge b/data/intltool-merge new file mode 100755 index 00000000..06637146 --- /dev/null +++ b/data/intltool-merge @@ -0,0 +1,657 @@ +#!/usr/bin/perl -w + +# +# The Intltool Message Merger +# +# Copyright (C) 2000, 2002 Free Software Foundation. +# Copyright (C) 2000, 2001 Eazel, Inc +# +# Intltool is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# version 2 published by the Free Software Foundation. +# +# Intltool is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. +# +# Authors: Maciej Stachowiak <mjs@noisehavoc.org> +# Kenneth Christiansen <kenneth@gnu.org> +# Darin Adler <darin@bentspoon.com> +# +# Proper XML UTF-8'ification written by Cyrille Chepelov <chepelov@calixo.net> +# + +## Release information +my $PROGRAM = "intltool-merge"; +my $PACKAGE = "intltool"; +my $VERSION = "0.22"; + +## Loaded modules +use strict; +use Getopt::Long; + +## Scalars used by the option stuff +my $HELP_ARG = 0; +my $VERSION_ARG = 0; +my $BA_STYLE_ARG = 0; +my $XML_STYLE_ARG = 0; +my $KEYS_STYLE_ARG = 0; +my $DESKTOP_STYLE_ARG = 0; +my $SCHEMAS_STYLE_ARG = 0; +my $QUIET_ARG = 0; +my $PASS_THROUGH_ARG = 0; +my $UTF8_ARG = 0; +my $cache_file; + +## Handle options +GetOptions +( + "help" => \$HELP_ARG, + "version" => \$VERSION_ARG, + "quiet|q" => \$QUIET_ARG, + "oaf-style|o" => \$BA_STYLE_ARG, ## for compatibility + "ba-style|b" => \$BA_STYLE_ARG, + "xml-style|x" => \$XML_STYLE_ARG, + "keys-style|k" => \$KEYS_STYLE_ARG, + "desktop-style|d" => \$DESKTOP_STYLE_ARG, + "schemas-style|s" => \$SCHEMAS_STYLE_ARG, + "pass-through|p" => \$PASS_THROUGH_ARG, + "utf8|u" => \$UTF8_ARG, + "cache|c=s" => \$cache_file + ) or &error; + +my $PO_DIR; +my $FILE; +my $OUTFILE; + +my %po_files_by_lang = (); +my %translations = (); + +# Use this instead of \w for XML files to handle more possible characters. +my $w = "[-A-Za-z0-9._:]"; + +# XML quoted string contents +my $q = "[^\\\"]*"; + +## Check for options. + +if ($VERSION_ARG) { + &print_version; +} elsif ($HELP_ARG) { + &print_help; +} elsif ($BA_STYLE_ARG && @ARGV > 2) { + &preparation; + &print_message; + &ba_merge_translations; + &finalize; +} elsif ($XML_STYLE_ARG && @ARGV > 2) { + &utf8_sanity_check; + &preparation; + &print_message; + &xml_merge_translations; + &finalize; +} elsif ($KEYS_STYLE_ARG && @ARGV > 2) { + &utf8_sanity_check; + &preparation; + &print_message; + &keys_merge_translations; + &finalize; +} elsif ($DESKTOP_STYLE_ARG && @ARGV > 2) { + &preparation; + &print_message; + &desktop_merge_translations; + &finalize; +} elsif ($SCHEMAS_STYLE_ARG && @ARGV > 2) { + &preparation; + &print_message; + &schemas_merge_translations; + &finalize; +} else { + &print_help; +} + +exit; + +## Sub for printing release information +sub print_version +{ + print "${PROGRAM} (${PACKAGE}) ${VERSION}\n"; + print "Written by Maciej Stachowiak, Darin Adler and Kenneth Christiansen.\n\n"; + print "Copyright (C) 2000-2002 Free Software Foundation, Inc.\n"; + print "Copyright (C) 2000-2001 Eazel, Inc.\n"; + print "This is free software; see the source for copying conditions. There is NO\n"; + print "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"; + exit; +} + +## Sub for printing usage information +sub print_help +{ + print "Usage: ${PROGRAM} [OPTIONS] PO_DIRECTORY FILENAME OUTPUT_FILE\n"; + print "Generates an output file that includes translated versions of some attributes,\n"; + print "from an untranslated source and a po directory that includes translations.\n\n"; + print " -b, --ba-style includes translations in the bonobo-activation style\n"; + print " -d, --desktop-style includes translations in the desktop style\n"; + print " -k, --keys-style includes translations in the keys style\n"; + print " -s, --schemas-style includes translations in the schemas style\n"; + print " -x, --xml-style includes translations in the standard xml style\n"; + print " -u, --utf8 convert all strings to UTF-8 before merging\n"; + print " -p, --pass-through use strings as found in .po files, without\n"; + print " conversion (STRONGLY unrecommended with -x)\n"; + print " -q, --quiet suppress most messages\n"; + print " --help display this help and exit\n"; + print " --version output version information and exit\n"; + print "\nReport bugs to bugzilla.gnome.org, module intltool, or contact us through \n"; + print "<xml-i18n-tools-list\@gnome.org>.\n"; + exit; +} + + +## Sub for printing error messages +sub print_error +{ + print "Try `${PROGRAM} --help' for more information.\n"; + exit; +} + + +sub print_message +{ + print "Merging translations into $OUTFILE.\n" unless $QUIET_ARG; +} + + +sub preparation +{ + $PO_DIR = $ARGV[0]; + $FILE = $ARGV[1]; + $OUTFILE = $ARGV[2]; + + &gather_po_files; + &get_translation_database; +} + +# General-purpose code for looking up translations in .po files + +sub po_file2lang +{ + my ($tmp) = @_; + $tmp =~ s/^.*\/(.*)\.po$/$1/; + return $tmp; +} + +sub gather_po_files +{ + for my $po_file (glob "$PO_DIR/*.po") { + $po_files_by_lang{po_file2lang($po_file)} = $po_file; + } +} + +sub get_po_encoding +{ + my ($in_po_file) = @_; + my $encoding = ""; + + open IN_PO_FILE, $in_po_file or die; + while (<IN_PO_FILE>) { + ## example: "Content-Type: text/plain; charset=ISO-8859-1\n" + if (/Content-Type\:.*charset=([-a-zA-Z0-9]+)\\n/) { + $encoding = $1; + last; + } + } + close IN_PO_FILE; + + if (!$encoding) { + print "Warning: no encoding found in $in_po_file. Assuming ISO-8859-1\n"; + $encoding = "ISO-8859-1"; + } + return $encoding +} + +sub utf8_sanity_check +{ + if (!$UTF8_ARG) { + if (!$PASS_THROUGH_ARG) { + $PASS_THROUGH_ARG="1"; + } + } +} + +sub get_translation_database +{ + if ($cache_file) { + &get_cached_translation_database; + } else { + &create_translation_database; + } +} + +sub get_newest_po_age +{ + my $newest_age; + + foreach my $file (values %po_files_by_lang) { + my $file_age = -M $file; + $newest_age = $file_age if !$newest_age || $file_age < $newest_age; + } + + return $newest_age; +} + +sub create_cache +{ + print "Generating and caching the translation database\n" unless $QUIET_ARG; + + &create_translation_database; + + open CACHE, ">$cache_file" || die; + print CACHE join "\x01", %translations; + close CACHE; +} + +sub load_cache +{ + print "Found cached translation database\n" unless $QUIET_ARG; + + my $contents; + open CACHE, "<$cache_file" || die; + { + local $/; + $contents = <CACHE>; + } + close CACHE; + %translations = split "\x01", $contents; +} + +sub get_cached_translation_database +{ + my $cache_file_age = -M $cache_file; + if (defined $cache_file_age) { + if ($cache_file_age <= &get_newest_po_age) { + &load_cache; + return; + } + print "Found too-old cached translation database\n" unless $QUIET_ARG; + } + + &create_cache; +} + +sub create_translation_database +{ + for my $lang (keys %po_files_by_lang) { + my $po_file = $po_files_by_lang{$lang}; + + if ($UTF8_ARG) { + my $encoding = get_po_encoding ($po_file); + if (lc $encoding eq "utf-8") { + open PO_FILE, "<$po_file"; + } else { + my $iconv = $ENV{"INTLTOOL_ICONV"} || "iconv"; + open PO_FILE, "$iconv -f $encoding -t UTF-8 $po_file|"; + } + } else { + open PO_FILE, "<$po_file"; + } + + my $nextfuzzy = 0; + my $inmsgid = 0; + my $inmsgstr = 0; + my $msgid = ""; + my $msgstr = ""; + while (<PO_FILE>) { + $nextfuzzy = 1 if /^#, fuzzy/; + if (/^msgid "((\\.|[^\\])*)"/ ) { + $translations{$lang, $msgid} = $msgstr if $inmsgstr && $msgid && $msgstr; + $msgid = ""; + $msgstr = ""; + + if ($nextfuzzy) { + $inmsgid = 0; + } else { + $msgid = unescape_po_string($1); + $inmsgid = 1; + } + $inmsgstr = 0; + $nextfuzzy = 0; + } + if (/^msgstr "((\\.|[^\\])*)"/) { + $msgstr = unescape_po_string($1); + $inmsgstr = 1; + $inmsgid = 0; + } + if (/^"((\\.|[^\\])*)"/) { + $msgid .= unescape_po_string($1) if $inmsgid; + $msgstr .= unescape_po_string($1) if $inmsgstr; + } + } + $translations{$lang, $msgid} = $msgstr if $inmsgstr && $msgid && $msgstr; + } +} + +sub finalize +{ +} + +sub unescape_one_sequence +{ + my ($sequence) = @_; + + return "\\" if $sequence eq "\\\\"; + return "\"" if $sequence eq "\\\""; + + # gettext also handles \n, \t, \b, \r, \f, \v, \a, \xxx (octal), + # \xXX (hex) and has a comment saying they want to handle \u and \U. + + return $sequence; +} + +sub unescape_po_string +{ + my ($string) = @_; + + $string =~ s/(\\.)/unescape_one_sequence($1)/eg; + + return $string; +} + +sub entity_decode +{ + local ($_) = @_; + + s/'/'/g; # ' + s/"/"/g; # " + s/&/&/g; + + return $_; +} + +sub entity_encode +{ + my ($pre_encoded) = @_; + + my @list_of_chars = unpack ('C*', $pre_encoded); + + if ($PASS_THROUGH_ARG) { + return join ('', map (&entity_encode_int_even_high_bit, @list_of_chars)); + } else { + return join ('', map (&entity_encode_int_minimalist, @list_of_chars)); + } +} + +sub entity_encode_int_minimalist +{ + return """ if $_ == 34; + return "&" if $_ == 38; + return "'" if $_ == 39; + return chr $_; +} + +sub entity_encode_int_even_high_bit +{ + if ($_ > 127 || $_ == 34 || $_ == 38 || $_ == 39) { + # the ($_ > 127) should probably be removed + return "&#" . $_ . ";"; + } else { + return chr $_; + } +} + +sub entity_encoded_translation +{ + my ($lang, $string) = @_; + + my $translation = $translations{$lang, $string}; + return $string if !$translation; + return entity_encode ($translation); +} + +## XML (bonobo-activation specific) merge code + +sub ba_merge_translations +{ + my $source; + + { + local $/; # slurp mode + open INPUT, "<$FILE" or die "can't open $FILE: $!"; + $source = <INPUT>; + close INPUT; + } + + open OUTPUT, ">$OUTFILE" or die "can't open $OUTFILE: $!"; + + while ($source =~ s|^(.*?)([ \t]*<\s*$w+\s+($w+\s*=\s*"$q"\s*)+/?>)([ \t]*\n)?||s) { + print OUTPUT $1; + + my $node = $2 . "\n"; + + my @strings = (); + $_ = $node; + while (s/(\s)_($w+\s*=\s*"($q)")/$1$2/s) { + push @strings, entity_decode($3); + } + print OUTPUT; + + my %langs; + for my $string (@strings) { + for my $lang (keys %po_files_by_lang) { + $langs{$lang} = 1 if $translations{$lang, $string}; + } + } + + for my $lang (sort keys %langs) { + $_ = $node; + s/(\sname\s*=\s*)"($q)"/$1"$2-$lang"/s; + s/(\s)_($w+\s*=\s*")($q)"/$1 . $2 . entity_encoded_translation($lang, $3) . '"'/seg; + print OUTPUT; + } + } + + print OUTPUT $source; + + close OUTPUT; +} + + +## XML (non-bonobo-activation) merge code + +sub xml_merge_translations +{ + my $source; + + { + local $/; # slurp mode + open INPUT, "<$FILE" or die "can't open $FILE: $!"; + $source = <INPUT>; + close INPUT; + } + + open OUTPUT, ">$OUTFILE" or die; + + # FIXME: support attribute translations + + # Empty nodes never need translation, so unmark all of them. + # For example, <_foo/> is just replaced by <foo/>. + $source =~ s|<\s*_($w+)\s*/>|<$1/>|g; + + # Support for <_foo>blah</_foo> style translations. + while ($source =~ s|^(.*?)([ \t]*)<\s*_($w+)\s*>(.*?)<\s*/_\3\s*>([ \t]*\n)?||s) { + print OUTPUT $1; + + my $spaces = $2; + my $tag = $3; + my $string = $4; + + print OUTPUT "$spaces<$tag>$string</$tag>\n"; + + $string =~ s/\s+/ /g; + $string =~ s/^ //; + $string =~ s/ $//; + $string = entity_decode($string); + + for my $lang (sort keys %po_files_by_lang) { + my $translation = $translations{$lang, $string}; + next if !$translation; + $translation = entity_encode($translation); + print OUTPUT "$spaces<$tag xml:lang=\"$lang\">$translation</$tag>\n"; + } + } + + print OUTPUT $source; + + close OUTPUT; +} + +sub keys_merge_translations +{ + open INPUT, "<${FILE}" or die; + open OUTPUT, ">${OUTFILE}" or die; + + while (<INPUT>) { + if (s/^(\s*)_(\w+=(.*))/$1$2/) { + my $string = $3; + + print OUTPUT; + + my $non_translated_line = $_; + + for my $lang (sort keys %po_files_by_lang) { + my $translation = $translations{$lang, $string}; + next if !$translation; + + $_ = $non_translated_line; + s/(\w+)=.*/[$lang]$1=$translation/; + print OUTPUT; + } + } else { + print OUTPUT; + } + } + + close OUTPUT; + close INPUT; +} + +sub desktop_merge_translations +{ + open INPUT, "<${FILE}" or die; + open OUTPUT, ">${OUTFILE}" or die; + + while (<INPUT>) { + if (s/^(\s*)_(\w+=(.*))/$1$2/) { + my $string = $3; + + print OUTPUT; + + my $non_translated_line = $_; + + for my $lang (sort keys %po_files_by_lang) { + my $translation = $translations{$lang, $string}; + next if !$translation; + + $_ = $non_translated_line; + s/(\w+)=.*/${1}[$lang]=$translation/; + print OUTPUT; + } + } else { + print OUTPUT; + } + } + + close OUTPUT; + close INPUT; +} + +sub schemas_merge_translations +{ + my $source; + + { + local $/; # slurp mode + open INPUT, "<$FILE" or die "can't open $FILE: $!"; + $source = <INPUT>; + close INPUT; + } + + open OUTPUT, ">$OUTFILE" or die; + + # FIXME: support attribute translations + + # Empty nodes never need translation, so unmark all of them. + # For example, <_foo/> is just replaced by <foo/>. + $source =~ s|<\s*_($w+)\s*/>|<$1/>|g; + + # Support for <_foo>blah</_foo> style translations. + + my $regex_start = "^(.*?)([ \t]*)<locale name=\"C\">"; + my $regex_short = "([ \t\n]*)<short>(.*?)</short>"; + my $regex_long = "([ \t\n]*)<long>(.*?)</long>"; + my $regex_end = "([ \t\n]*)</locale>"; + + while ($source =~ s|$regex_start$regex_short$regex_long$regex_end||s) { + print OUTPUT $1; + + my $locale_start_spaces = $2; + my $locale_end_spaces = $7; + my $short_spaces = $3; + my $short_string = $4; + my $long_spaces = $5; + my $long_string = $6; + + # English first + + print OUTPUT "$locale_start_spaces<locale name=\"C\">"; + print OUTPUT "$short_spaces<short>$short_string</short>"; + print OUTPUT "$long_spaces<long>$long_string</long>"; + print OUTPUT "$locale_end_spaces</locale>"; + + $short_string =~ s/\s+/ /g; + $short_string =~ s/^ //; + $short_string =~ s/ $//; + $short_string = entity_decode($short_string); + + $long_string =~ s/\s+/ /g; + $long_string =~ s/^ //; + $long_string =~ s/ $//; + $long_string = entity_decode($long_string); + + for my $lang (sort keys %po_files_by_lang) { + my $short_translation = $translations{$lang, $short_string}; + my $long_translation = $translations{$lang, $long_string}; + + next if (!$short_translation && !$long_translation); + + print OUTPUT "\n$locale_start_spaces<locale name=\"$lang\">"; + + if ($short_translation) + { + $short_translation = entity_encode($short_translation); + print OUTPUT "$short_spaces<short>$short_translation</short>"; + } + + if ($long_translation) + { + $long_translation = entity_encode($long_translation); + print OUTPUT "$long_spaces<long>$long_translation</long>"; + } + + print OUTPUT "$locale_end_spaces</locale>"; + } + } + + print OUTPUT $source; + + close OUTPUT; +} |