hgbook

diff tools/po4a/lib/Locale/Po4a/TransTractor.pm @ 631:f7d674e6e736
Revert to original hgbook.css
author: Dongsheng Song <dongsheng.song@gmail.com>
date: Thu Mar 12 17:43:30 2009 +0800 (2009-03-12)
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/tools/po4a/lib/Locale/Po4a/TransTractor.pm	Thu Mar 12 17:43:30 2009 +0800
     1.3 @@ -0,0 +1,1100 @@
     1.4 +#!/usr/bin/perl -w
     1.5 +
     1.6 +require Exporter;
     1.7 +
     1.8 +package Locale::Po4a::TransTractor;
     1.9 +use DynaLoader;
    1.10 +
    1.11 +use 5.006;
    1.12 +use strict;
    1.13 +use warnings;
    1.14 +
    1.15 +use subs qw(makespace);
    1.16 +use vars qw($VERSION @ISA @EXPORT);
    1.17 +$VERSION="0.36";
    1.18 +@ISA = qw(DynaLoader);
    1.19 +@EXPORT = qw(new process translate 
    1.20 +             read write readpo writepo
    1.21 +             getpoout setpoout);
    1.22 +
    1.23 +# Try to use a C extension if present.
    1.24 +eval("bootstrap Locale::Po4a::TransTractor $VERSION");
    1.25 +
    1.26 +use Carp qw(croak);
    1.27 +use Locale::Po4a::Po;
    1.28 +use Locale::Po4a::Common;
    1.29 +
    1.30 +use File::Path; # mkdir before write
    1.31 +
    1.32 +use Encode;
    1.33 +use Encode::Guess;
    1.34 +
    1.35 +=head1 NAME
    1.36 +
    1.37 +Locale::Po4a::TransTractor - Generic trans(lator ex)tractor.
    1.38 +
    1.39 +=head1 DESCRIPTION
    1.40 +
    1.41 +The po4a (po for anything) project goal is to ease translations (and more
    1.42 +interestingly, the maintenance of translations) using gettext tools on
    1.43 +areas where they were not expected like documentation.
    1.44 +
    1.45 +This class is the ancestor of every po4a parsers used to parse a document to
    1.46 +search translatable strings, extract them to a po file and replace them by
    1.47 +their translation in the output document. 
    1.48 +
    1.49 +More formally, it takes the following arguments as input:
    1.50 +
    1.51 +=over 2
    1.52 +
    1.53 +=item -
    1.54 +
    1.55 +a document to translate ;
    1.56 +
    1.57 +=item -
    1.58 +
    1.59 +a po file containing the translations to use.
    1.60 +
    1.61 +=back
    1.62 +
    1.63 +As output, it produces:
    1.64 +
    1.65 +=over 2
    1.66 +
    1.67 +=item -
    1.68 +
    1.69 +another po file, resulting of the extraction of translatable strings from
    1.70 +the input document ;
    1.71 +
    1.72 +=item -
    1.73 +
    1.74 +a translated document, with the same structure than the one in input, but
    1.75 +with all translatable strings replaced with the translations found in the
    1.76 +po file provided in input.
    1.77 +
    1.78 +=back
    1.79 +
    1.80 +Here is a graphical representation of this:
    1.81 +
    1.82 +   Input document --\                             /---> Output document
    1.83 +                     \                           /       (translated)
    1.84 +                      +-> parse() function -----+
    1.85 +                     /                           \
    1.86 +   Input po --------/                             \---> Output po
    1.87 +                                                         (extracted)
    1.88 +
    1.89 +=head1 FUNCTIONS YOUR PARSER SHOULD OVERRIDE
    1.90 +
    1.91 +=over 4
    1.92 +
    1.93 +=item parse()
    1.94 +
    1.95 +This is where all the work takes place: the parsing of input documents, the
    1.96 +generation of output, and the extraction of the translatable strings. This
    1.97 +is pretty simple using the provided functions presented in the section
    1.98 +"INTERNAL FUNCTIONS" below. See also the synopsis, which present an
    1.99 +example.
   1.100 +
   1.101 +This function is called by the process() function bellow, but if you choose
   1.102 +to use the new() function, and to add content manually to your document,
   1.103 +you will have to call this function yourself.
   1.104 +
   1.105 +=item docheader()
   1.106 +
   1.107 +This function returns the header we should add to the produced document,
   1.108 +quoted properly to be a comment in the target language.  See the section
   1.109 +"Educating developers about translations", from L<po4a(7)|po4a.7>, for what
   1.110 +it is good for.
   1.111 +
   1.112 +=back
   1.113 +
   1.114 +=cut
   1.115 +
   1.116 +sub docheader {}
   1.117 +
   1.118 +sub parse {}
   1.119 +
   1.120 +=head1 SYNOPSIS
   1.121 +
   1.122 +The following example parses a list of paragraphs beginning with "<p>". For the sake
   1.123 +of simplicity, we assume that the document is well formatted, i.e. that '<p>'
   1.124 +tags are the only tags present, and that this tag is at the very beginning
   1.125 +of each paragraph.
   1.126 +
   1.127 + sub parse {
   1.128 +   my $self = shift;
   1.129 +
   1.130 +   PARAGRAPH: while (1) {
   1.131 +       my ($paragraph,$pararef)=("","");
   1.132 +       my $first=1;
   1.133 +       my ($line,$lref)=$self->shiftline();
   1.134 +       while (defined($line)) {
   1.135 +	   if ($line =~ m/<p>/ && !$first--; ) {
   1.136 +	       # Not the first time we see <p>. 
   1.137 +	       # Reput the current line in input,
   1.138 +	       #  and put the built paragraph to output
   1.139 +	       $self->unshiftline($line,$lref);
   1.140 +	      
   1.141 +	       # Now that the document is formed, translate it:
   1.142 +	       #   - Remove the leading tag
   1.143 +	       $paragraph =~ s/^<p>//s;
   1.144 +
   1.145 +	       #   - push to output the leading tag (untranslated) and the
   1.146 +	       #     rest of the paragraph (translated)
   1.147 +	       $self->pushline(  "<p>"
   1.148 +                               . $document->translate($paragraph,$pararef)
   1.149 +                               );
   1.150 +
   1.151 + 	       next PARAGRAPH;
   1.152 +	   } else {
   1.153 +	       # Append to the paragraph
   1.154 +	       $paragraph .= $line;
   1.155 +	       $pararef = $lref unless(length($pararef));
   1.156 +	   }
   1.157 +
   1.158 +           # Reinit the loop
   1.159 +           ($line,$lref)=$self->shiftline();
   1.160 +       }
   1.161 +       # Did not get a defined line? End of input file.
   1.162 +       return;
   1.163 +   }
   1.164 + } 
   1.165 +
   1.166 +Once you've implemented the parse function, you can use your document
   1.167 +class, using the public interface presented in the next section.
   1.168 +
   1.169 +=head1 PUBLIC INTERFACE for scripts using your parser
   1.170 +
   1.171 +=head2 Constructor
   1.172 +
   1.173 +=over 4
   1.174 +
   1.175 +=item process(%)
   1.176 +
   1.177 +This function can do all you need to do with a po4a document in one
   1.178 +invocation. Its arguments must be packed as a hash. ACTIONS:
   1.179 +
   1.180 +=over 3
   1.181 +
   1.182 +=item a.
   1.183 +
   1.184 +Reads all the po files specified in po_in_name
   1.185 +
   1.186 +=item b.
   1.187 +
   1.188 +Reads all original documents specified in file_in_name
   1.189 +
   1.190 +=item c.
   1.191 +
   1.192 +Parses the document
   1.193 +
   1.194 +=item d.
   1.195 +
   1.196 +Reads and applies all the addenda specified
   1.197 +
   1.198 +=item e.
   1.199 +
   1.200 +Writes the translated document to file_out_name (if given)
   1.201 +
   1.202 +=item f.
   1.203 +
   1.204 +Writes the extracted po file to po_out_name (if given)
   1.205 +
   1.206 +=back
   1.207 +
   1.208 +ARGUMENTS, beside the ones accepted by new() (with expected type):
   1.209 +
   1.210 +=over 4
   1.211 +
   1.212 +=item file_in_name (@)
   1.213 +
   1.214 +List of filenames where we should read the input document.
   1.215 +
   1.216 +=item file_in_charset ($)
   1.217 +
   1.218 +Charset used in the input document (if it isn't specified, it will try
   1.219 +to detect it from the input document).
   1.220 +
   1.221 +=item file_out_name ($)
   1.222 +
   1.223 +Filename where we should write the output document.
   1.224 +
   1.225 +=item file_out_charset ($)
   1.226 +
   1.227 +Charset used in the output document (if it isn't specified, it will use
   1.228 +the po file charset).
   1.229 +
   1.230 +=item po_in_name (@)
   1.231 +
   1.232 +List of filenames where we should read the input po files from, containing
   1.233 +the translation which will be used to translate the document.
   1.234 +
   1.235 +=item po_out_name ($)
   1.236 +
   1.237 +Filename where we should write the output po file, containing the strings
   1.238 +extracted from the input document.
   1.239 +
   1.240 +=item addendum (@)
   1.241 +
   1.242 +List of filenames where we should read the addenda from.
   1.243 +
   1.244 +=item addendum_charset ($)
   1.245 +
   1.246 +Charset for the addenda.
   1.247 +
   1.248 +=back
   1.249 +
   1.250 +=item new(%)
   1.251 +
   1.252 +Create a new Po4a document. Accepted options (but be in a hash):
   1.253 +
   1.254 +=over 4
   1.255 +
   1.256 +=item verbose ($)
   1.257 +
   1.258 +Sets the verbosity.
   1.259 +
   1.260 +=item debug ($)
   1.261 +
   1.262 +Sets the debugging.
   1.263 +
   1.264 +=back
   1.265 +
   1.266 +=cut
   1.267 +
   1.268 +sub process {
   1.269 +    ## Determine if we were called via an object-ref or a classname
   1.270 +    my $self = shift;
   1.271 +
   1.272 +    ## Any remaining arguments are treated as initial values for the
   1.273 +    ## hash that is used to represent this object.
   1.274 +    my %params = @_;
   1.275 +    
   1.276 +    # Build the args for new()
   1.277 +    my %newparams = ();
   1.278 +    foreach (keys %params) {
   1.279 +	next if ($_ eq 'po_in_name' ||
   1.280 +		 $_ eq 'po_out_name' ||
   1.281 +		 $_ eq 'file_in_name' ||
   1.282 +		 $_ eq 'file_in_charset' ||
   1.283 +		 $_ eq 'file_out_name' ||
   1.284 +		 $_ eq 'file_out_charset' ||
   1.285 +		 $_ eq 'addendum' ||
   1.286 +		 $_ eq 'addendum_charset');
   1.287 +	$newparams{$_}=$params{$_};
   1.288 +    }
   1.289 +
   1.290 +    $self->detected_charset($params{'file_in_charset'});
   1.291 +    $self->{TT}{'file_out_charset'}=$params{'file_out_charset'};
   1.292 +    if (defined($self->{TT}{'file_out_charset'}) and
   1.293 +	length($self->{TT}{'file_out_charset'})) {
   1.294 +	$self->{TT}{'file_out_encoder'} = find_encoding($self->{TT}{'file_out_charset'});
   1.295 +    }
   1.296 +    $self->{TT}{'addendum_charset'}=$params{'addendum_charset'};
   1.297 +
   1.298 +    foreach my $file (@{$params{'po_in_name'}}) {
   1.299 +	print STDERR "readpo($file)... " if $self->debug();
   1.300 +	$self->readpo($file);
   1.301 +	print STDERR "done.\n" if $self->debug()
   1.302 +    }
   1.303 +    foreach my $file (@{$params{'file_in_name'}}) {
   1.304 +	print STDERR "read($file)..." if $self->debug();
   1.305 +	$self->read($file);
   1.306 +	print STDERR "done.\n"  if $self->debug();
   1.307 +    }
   1.308 +    print STDERR "parse..." if $self->debug();
   1.309 +    $self->parse();
   1.310 +    print STDERR "done.\n" if $self->debug();
   1.311 +    foreach my $file (@{$params{'addendum'}}) {
   1.312 +	print STDERR "addendum($file)..." if $self->debug();
   1.313 +	$self->addendum($file) || die "An addendum failed\n";
   1.314 +	print STDERR "done.\n" if $self->debug();
   1.315 +    }
   1.316 +    if (defined $params{'file_out_name'}) {
   1.317 +	print STDERR "write(".$params{'file_out_name'}.")... " 
   1.318 +	    if $self->debug();
   1.319 +	$self->write($params{'file_out_name'});
   1.320 +	print STDERR "done.\n" if $self->debug();
   1.321 +    }
   1.322 +    if (defined $params{'po_out_name'}) {
   1.323 +	print STDERR "writepo(".$params{'po_out_name'}.")... "
   1.324 +	     if $self->debug();
   1.325 +	$self->writepo($params{'po_out_name'});
   1.326 +	print STDERR "done.\n" if $self->debug();
   1.327 +    }
   1.328 +    return $self;
   1.329 +}
   1.330 +
   1.331 +sub new {
   1.332 +    ## Determine if we were called via an object-ref or a classname
   1.333 +    my $this = shift;
   1.334 +    my $class = ref($this) || $this;
   1.335 +    my $self = { };
   1.336 +    my %options=@_;
   1.337 +    ## Bless ourselves into the desired class and perform any initialization
   1.338 +    bless $self, $class;
   1.339 +    
   1.340 +    ## initialize the plugin
   1.341 +    # prevent the plugin from croaking on the options intended for Po.pm
   1.342 +    $self->{options}{'porefs'} = '';
   1.343 +    # let the plugin parse the options and such
   1.344 +    $self->initialize(%options);
   1.345 +
   1.346 +    ## Create our private data
   1.347 +    my %po_options;
   1.348 +    $po_options{'porefs'} = $self->{options}{'porefs'};
   1.349 +    
   1.350 +    # private data
   1.351 +    $self->{TT}=(); 
   1.352 +    $self->{TT}{po_in}=Locale::Po4a::Po->new();
   1.353 +    $self->{TT}{po_out}=Locale::Po4a::Po->new(\%po_options);
   1.354 +    # Warning, this is an array of array:
   1.355 +    #  The document is splited on lines, and for each
   1.356 +    #  [0] is the line content, [1] is the reference [2] the type
   1.357 +    $self->{TT}{doc_in}=();
   1.358 +    $self->{TT}{doc_out}=();
   1.359 +    if (defined $options{'verbose'}) {
   1.360 +	$self->{TT}{verbose}  =  $options{'verbose'};
   1.361 +    }
   1.362 +    if (defined $options{'debug'}) {
   1.363 +	$self->{TT}{debug}  =  $options{'debug'};
   1.364 +    }
   1.365 +    # Input document is in ascii until we prove the opposite (in read())
   1.366 +    $self->{TT}{ascii_input}=1;
   1.367 +    # We try not to use utf unless it's forced from the outside (in case the
   1.368 +    # document isn't in ascii)
   1.369 +    $self->{TT}{utf_mode}=0;
   1.370 +
   1.371 +    
   1.372 +    return $self;
   1.373 +}
   1.374 +
   1.375 +=back
   1.376 +
   1.377 +=head2 Manipulating document files
   1.378 +
   1.379 +=over 4
   1.380 +
   1.381 +=item read($)
   1.382 +
   1.383 +Add another input document at the end of the existing one. The argument is
   1.384 +the filename to read. 
   1.385 +
   1.386 +Please note that it does not parse anything. You should use the parse()
   1.387 +function when you're done with packing input files into the document. 
   1.388 +
   1.389 +=cut
   1.390 +
   1.391 +#'
   1.392 +sub read() {
   1.393 +    my $self=shift;
   1.394 +    my $filename=shift
   1.395 +	or croak wrap_msg(dgettext("po4a", "Can't read from file without having a filename"));
   1.396 +    my $linenum=0;
   1.397 +
   1.398 +    open INPUT,"<$filename" 
   1.399 +	or croak wrap_msg(dgettext("po4a", "Can't read from %s: %s"), $filename, $!);
   1.400 +    while (defined (my $textline = <INPUT>)) {
   1.401 +	$linenum++;
   1.402 +	my $ref="$filename:$linenum";
   1.403 +	my @entry=($textline,$ref);
   1.404 +	push @{$self->{TT}{doc_in}}, @entry;
   1.405 +
   1.406 +	if (!defined($self->{TT}{'file_in_charset'})) {
   1.407 +	    # Detect if this file has non-ascii characters
   1.408 +	    if($self->{TT}{ascii_input}) {
   1.409 +		my $decoder = guess_encoding($textline);
   1.410 +		if (!ref($decoder) or $decoder !~ /Encode::XS=/) {
   1.411 +		    # We have detected a non-ascii line
   1.412 +		    $self->{TT}{ascii_input} = 0;
   1.413 +		    # Save the reference for future error message
   1.414 +		    $self->{TT}{non_ascii_ref} ||= $ref;
   1.415 +		}
   1.416 +	    }
   1.417 +	}
   1.418 +    }
   1.419 +    close INPUT 
   1.420 +	or croak wrap_msg(dgettext("po4a", "Can't close %s after reading: %s"), $filename, $!);
   1.421 +
   1.422 +}
   1.423 +
   1.424 +=item write($)
   1.425 +
   1.426 +Write the translated document to the given filename.
   1.427 +
   1.428 +=cut
   1.429 +
   1.430 +sub write {
   1.431 +    my $self=shift;
   1.432 +    my $filename=shift
   1.433 +	or croak wrap_msg(dgettext("po4a", "Can't write to a file without filename"));
   1.434 +
   1.435 +    my $fh;
   1.436 +    if ($filename eq '-') {
   1.437 +	$fh=\*STDOUT;
   1.438 +    } else {
   1.439 +	# make sure the directory in which we should write the localized file exists
   1.440 +	my $dir = $filename;
   1.441 +	if ($dir =~ m|/|) {
   1.442 +	    $dir =~ s|/[^/]*$||;
   1.443 +	
   1.444 +	    File::Path::mkpath($dir, 0, 0755) # Croaks on error
   1.445 +	      if (length ($dir) && ! -e $dir);
   1.446 +	}
   1.447 +	open $fh,">$filename"
   1.448 +	    or croak wrap_msg(dgettext("po4a", "Can't write to %s: %s"), $filename, $!);
   1.449 +    }
   1.450 +    
   1.451 +    map { print $fh $_ } $self->docheader();
   1.452 +    map { print $fh $_ } @{$self->{TT}{doc_out}};
   1.453 +
   1.454 +    if ($filename ne '-') {
   1.455 +	close $fh or croak wrap_msg(dgettext("po4a", "Can't close %s after writing: %s"), $filename, $!);
   1.456 +    }
   1.457 +
   1.458 +}
   1.459 +
   1.460 +=back
   1.461 +
   1.462 +=head2 Manipulating po files
   1.463 +
   1.464 +=over 4 
   1.465 +
   1.466 +=item readpo($)
   1.467 +
   1.468 +Add the content of a file (which name is passed in argument) to the
   1.469 +existing input po. The old content is not discarded.
   1.470 +
   1.471 +=item writepo($)
   1.472 +
   1.473 +Write the extracted po file to the given filename.
   1.474 +
   1.475 +=item stats()
   1.476 +
   1.477 +Returns some statistics about the translation done so far. Please note that
   1.478 +it's not the same statistics than the one printed by msgfmt
   1.479 +--statistic. Here, it's stats about recent usage of the po file, while
   1.480 +msgfmt reports the status of the file. It is a wrapper to the
   1.481 +Locale::Po4a::Po::stats_get function applied to the input po file. Example
   1.482 +of use:
   1.483 +
   1.484 +    [normal use of the po4a document...]
   1.485 +
   1.486 +    ($percent,$hit,$queries) = $document->stats();
   1.487 +    print "We found translations for $percent\%  ($hit from $queries) of strings.\n";
   1.488 +
   1.489 +=back
   1.490 +
   1.491 +=cut
   1.492 +
   1.493 +sub getpoout {
   1.494 +    return $_[0]->{TT}{po_out};
   1.495 +}
   1.496 +sub setpoout {
   1.497 +    $_[0]->{TT}{po_out} = $_[1];
   1.498 +}
   1.499 +sub readpo  { 
   1.500 +    $_[0]->{TT}{po_in}->read($_[1]);        
   1.501 +}
   1.502 +sub writepo { 
   1.503 +    $_[0]->{TT}{po_out}->write( $_[1] );    
   1.504 +}
   1.505 +sub stats   { 
   1.506 +    return $_[0]->{TT}{po_in}->stats_get(); 
   1.507 +}
   1.508 +
   1.509 +=head2 Manipulating addenda
   1.510 +
   1.511 +=over 4
   1.512 +
   1.513 +=item addendum($)
   1.514 +
   1.515 +Please refer to L<po4a(7)|po4a.7> for more information on what addenda are,
   1.516 +and how translators should write them. To apply an addendum to the translated
   1.517 +document, simply pass its filename to this function and you are done ;)
   1.518 +
   1.519 +This function returns a non-null integer on error.
   1.520 +
   1.521 +=cut
   1.522 +
   1.523 +# Internal function to read the header.
   1.524 +sub addendum_parse {
   1.525 +    my ($filename,$header)=shift;
   1.526 +
   1.527 +    my ($errcode,$mode,$position,$boundary,$bmode,$content)=
   1.528 +	(1,"","","","","");
   1.529 +
   1.530 +    unless (open (INS, "<$filename")) {
   1.531 +	warn wrap_msg(dgettext("po4a", "Can't read from %s: %s"), $filename, $!);
   1.532 +	goto END_PARSE_ADDFILE;
   1.533 +    } 
   1.534 +
   1.535 +    unless (defined ($header=<INS>) && $header)  {
   1.536 +	warn wrap_msg(dgettext("po4a", "Can't read Po4a header from %s."), $filename);
   1.537 +	goto END_PARSE_ADDFILE;
   1.538 +    }
   1.539 +
   1.540 +    unless ($header =~ s/PO4A-HEADER://i) {
   1.541 +	warn wrap_msg(dgettext("po4a", "First line of %s does not look like a Po4a header."), $filename);
   1.542 +	goto END_PARSE_ADDFILE;
   1.543 +    }
   1.544 +    foreach my $part (split(/;/,$header)) {
   1.545 +	unless ($part =~ m/^\s*([^=]*)=(.*)$/) {
   1.546 +	    warn wrap_msg(dgettext("po4a", "Syntax error in Po4a header of %s, near \"%s\""), $filename, $part);
   1.547 +	    goto END_PARSE_ADDFILE;
   1.548 +	}
   1.549 +	my ($key,$value)=($1,$2);
   1.550 +	$key=lc($key);
   1.551 +  	     if ($key eq 'mode')     {  $mode=lc($value);
   1.552 +	} elsif ($key eq 'position') {  $position=$value;
   1.553 +	} elsif ($key eq 'endboundary') {  
   1.554 +	    $boundary=$value;
   1.555 +	    $bmode='after';
   1.556 +	} elsif ($key eq 'beginboundary') {  
   1.557 +	    $boundary=$value;
   1.558 +	    $bmode='before';
   1.559 +	} else { 
   1.560 +	    warn wrap_msg(dgettext("po4a", "Invalid argument in the Po4a header of %s: %s"), $filename, $key);
   1.561 +	    goto END_PARSE_ADDFILE;
   1.562 +	}
   1.563 +    }
   1.564 +
   1.565 +    unless (length($mode)) {
   1.566 +	warn wrap_msg(dgettext("po4a", "The Po4a header of %s does not define the mode."), $filename);
   1.567 +	goto END_PARSE_ADDFILE;
   1.568 +    }
   1.569 +    unless ($mode eq "before" || $mode eq "after") {
   1.570 +	warn wrap_msg(dgettext("po4a", "Mode invalid in the Po4a header of %s: should be 'before' or 'after' not %s."), $filename, $mode);
   1.571 +	goto END_PARSE_ADDFILE;
   1.572 +    }
   1.573 +
   1.574 +    unless (length($position)) {
   1.575 +	warn wrap_msg(dgettext("po4a", "The Po4a header of %s does not define the position."), $filename);
   1.576 +	goto END_PARSE_ADDFILE;
   1.577 +    }
   1.578 +    unless ($mode eq "before" || length($boundary)) {
   1.579 +    	warn wrap_msg(dgettext("po4a", "No ending boundary given in the Po4a header, but mode=after."));
   1.580 +	goto END_PARSE_ADDFILE;
   1.581 +    }
   1.582 +
   1.583 +    while (defined(my $line = <INS>)) {
   1.584 +	$content .= $line;
   1.585 +    }
   1.586 +    close INS;
   1.587 +
   1.588 +    $errcode=0;
   1.589 +  END_PARSE_ADDFILE: 
   1.590 +      return ($errcode,$mode,$position,$boundary,$bmode,$content);
   1.591 +}
   1.592 +
   1.593 +sub mychomp {
   1.594 +    my ($str) = shift;
   1.595 +    chomp($str);
   1.596 +    return $str;
   1.597 +}
   1.598 +
   1.599 +sub addendum {
   1.600 +    my ($self,$filename) = @_;
   1.601 +
   1.602 +    print STDERR "Apply addendum $filename..." if $self->debug();
   1.603 +    unless ($filename) {
   1.604 +	warn wrap_msg(dgettext("po4a",
   1.605 +	    "Can't apply addendum when not given the filename"));
   1.606 +	return 0;
   1.607 +    }
   1.608 +    die wrap_msg(dgettext("po4a", "Addendum %s does not exist."), $filename)
   1.609 +      unless -e $filename;
   1.610 +  
   1.611 +    my ($errcode,$mode,$position,$boundary,$bmode,$content)=
   1.612 +	addendum_parse($filename);
   1.613 +    return 0 if ($errcode);
   1.614 +
   1.615 +    print STDERR "mode=$mode;pos=$position;bound=$boundary;bmode=$bmode;ctn=$content\n"
   1.616 +      if $self->debug();
   1.617 +    
   1.618 +    # We only recode the addendum if an origin charset is specified, else we
   1.619 +    # suppose it's already in the output document's charset
   1.620 +    if (defined($self->{TT}{'addendum_charset'}) &&
   1.621 +        length($self->{TT}{'addendum_charset'})) {
   1.622 +	Encode::from_to($content,$self->{TT}{'addendum_charset'},
   1.623 +	    $self->get_out_charset);
   1.624 +    }
   1.625 +
   1.626 +    my $found = scalar grep { /$position/ } @{$self->{TT}{doc_out}};
   1.627 +    if ($found == 0) {
   1.628 +	warn wrap_msg(dgettext("po4a",
   1.629 +	    "No candidate position for the addendum %s."), $filename);
   1.630 +	return 0;
   1.631 +    }
   1.632 +    if ($found > 1) {
   1.633 +	warn wrap_msg(dgettext("po4a",
   1.634 +	    "More than one candidate position found for the addendum %s."), $filename);
   1.635 +	return 0;
   1.636 +    }
   1.637 +
   1.638 +    if ($mode eq "before") {
   1.639 +	if ($self->verbose() > 1 || $self->debug() ) {
   1.640 +	    map { print STDERR wrap_msg(dgettext("po4a", "Addendum '%s' applied before this line: %s"), $filename, $_) if (/$position/);
   1.641 + 	        } @{$self->{TT}{doc_out}};
   1.642 +	}
   1.643 +	@{$self->{TT}{doc_out}} = map { /$position/ ? ($content,$_) : $_ 
   1.644 +                                        }  @{$self->{TT}{doc_out}};
   1.645 +    } else {
   1.646 +	my @newres=();
   1.647 +
   1.648 +	do {
   1.649 +	    # make sure it doesnt whine on empty document
   1.650 +	    my $line = scalar @{$self->{TT}{doc_out}} ? shift @{$self->{TT}{doc_out}} : "";
   1.651 +	    push @newres,$line;
   1.652 +	    my $outline=mychomp($line);
   1.653 +	    $outline =~ s/^[ \t]*//;
   1.654 +	      
   1.655 +	    if ($line =~ m/$position/) {
   1.656 +		while ($line=shift @{$self->{TT}{doc_out}}) {
   1.657 +		    last if ($line=~/$boundary/);
   1.658 +		    push @newres,$line;
   1.659 +		}
   1.660 +		if (defined $line) {
   1.661 +		    if ($bmode eq 'before') {
   1.662 +			print wrap_msg(dgettext("po4a",
   1.663 +			    "Addendum '%s' applied before this line: %s"),
   1.664 +			    $filename, $outline)
   1.665 +			  if ($self->verbose() > 1 || $self->debug());
   1.666 +			push @newres,$content;
   1.667 +			push @newres,$line;
   1.668 +		    } else {
   1.669 +			print wrap_msg(dgettext("po4a",
   1.670 +			    "Addendum '%s' applied after the line: %s."),
   1.671 +			    $filename, $outline)
   1.672 +			  if ($self->verbose() > 1 || $self->debug());
   1.673 +			push @newres,$line;
   1.674 +			push @newres,$content;
   1.675 +		    }
   1.676 +		} else {
   1.677 +		    print wrap_msg(dgettext("po4a", "Addendum '%s' applied at the end of the file."), $filename)
   1.678 +		      if ($self->verbose() > 1 || $self->debug());
   1.679 +		    push @newres,$content;
   1.680 +		}
   1.681 +	    }
   1.682 +	} while (scalar @{$self->{TT}{doc_out}});
   1.683 +	@{$self->{TT}{doc_out}} = @newres;
   1.684 +    }
   1.685 +    print STDERR "done.\n" if $self->debug();
   1.686 +    return 1;
   1.687 +}
   1.688 +
   1.689 +=back
   1.690 +
   1.691 +=head1 INTERNAL FUNCTIONS used to write derivated parsers
   1.692 +
   1.693 +=head2 Getting input, providing output
   1.694 +
   1.695 +Four functions are provided to get input and return output. They are very
   1.696 +similar to shift/unshift and push/pop. The first pair is about input, while
   1.697 +the second is about output. Mnemonic: in input, you are interested in the
   1.698 +first line, what shift gives, and in output you want to add your result at
   1.699 +the end, like push does.
   1.700 +
   1.701 +=over 4
   1.702 +
   1.703 +=item shiftline()
   1.704 +
   1.705 +This function returns the next line of the doc_in to be parsed and its
   1.706 +reference (packed as an array).
   1.707 +
   1.708 +=item unshiftline($$)
   1.709 +
   1.710 +Unshifts a line of the input document and its reference. 
   1.711 +
   1.712 +=item pushline($)
   1.713 +
   1.714 +Push a new line to the doc_out.
   1.715 +
   1.716 +=item popline()
   1.717 +
   1.718 +Pop the last pushed line from the doc_out.
   1.719 +
   1.720 +=back
   1.721 +
   1.722 +=cut
   1.723 +
   1.724 +sub shiftline   {  
   1.725 +    my ($line,$ref)=(shift @{$_[0]->{TT}{doc_in}},
   1.726 +		     shift @{$_[0]->{TT}{doc_in}}); 
   1.727 +    return ($line,$ref);
   1.728 +}
   1.729 +sub unshiftline {
   1.730 +	my $self = shift;
   1.731 +	unshift @{$self->{TT}{doc_in}},@_;
   1.732 +}
   1.733 +
   1.734 +sub pushline    {  push @{$_[0]->{TT}{doc_out}}, $_[1] if defined $_[1]; }
   1.735 +sub popline     {  return pop @{$_[0]->{TT}{doc_out}};            }
   1.736 +
   1.737 +=head2 Marking strings as translatable
   1.738 +
   1.739 +One function is provided to handle the text which should be translated. 
   1.740 +
   1.741 +=over 4
   1.742 +
   1.743 +=item translate($$$)
   1.744 +
   1.745 +Mandatory arguments:
   1.746 +
   1.747 +=over 2
   1.748 +
   1.749 +=item -
   1.750 +
   1.751 +A string to translate
   1.752 +
   1.753 +=item -
   1.754 +
   1.755 +The reference of this string (ie, position in inputfile)
   1.756 +
   1.757 +=item -
   1.758 +
   1.759 +The type of this string (ie, the textual description of its structural role
   1.760 +; used in Locale::Po4a::Po::gettextization() ; see also L<po4a(7)|po4a.7>,
   1.761 +section I<Gettextization: how does it work?>)
   1.762 +
   1.763 +=back
   1.764 +
   1.765 +This function can also take some extra arguments. They must be organized as
   1.766 +a hash. For example:
   1.767 +
   1.768 +  $self->translate("string","ref","type",
   1.769 +		   'wrap' => 1);
   1.770 +
   1.771 +=over
   1.772 +
   1.773 +=item wrap
   1.774 +
   1.775 +boolean indicating whether we can consider that whitespaces in string are
   1.776 +not important. If yes, the function canonizes the string before looking for
   1.777 +a translation or extracting it, and wraps the translation.
   1.778 +
   1.779 +=item wrapcol
   1.780 +
   1.781 +The column at which we should wrap (default: 76).
   1.782 +
   1.783 +=item comment
   1.784 +
   1.785 +An extra comment to add to the entry.
   1.786 +
   1.787 +=back
   1.788 +
   1.789 +Actions:
   1.790 +
   1.791 +=over 2
   1.792 +
   1.793 +=item -
   1.794 +
   1.795 +Pushes the string, reference and type to po_out.
   1.796 +
   1.797 +=item -
   1.798 +
   1.799 +Returns the translation of the string (as found in po_in) so that the
   1.800 +parser can build the doc_out.
   1.801 +
   1.802 +=item -
   1.803 +
   1.804 +Handles the charsets to recode the strings before sending them to
   1.805 +po_out and before returning the translations.
   1.806 +
   1.807 +=back
   1.808 +
   1.809 +=back
   1.810 +
   1.811 +=cut
   1.812 +
   1.813 +sub translate {
   1.814 +    my $self=shift;
   1.815 +    my ($string,$ref,$type)=(shift,shift,shift);
   1.816 +    my (%options)=@_;
   1.817 +
   1.818 +    # my $validoption="wrap wrapcol";
   1.819 +    # my %validoption;
   1.820 +
   1.821 +    return "" unless defined($string) && length($string);
   1.822 +
   1.823 +    # map { $validoption{$_}=1 } (split(/ /,$validoption));
   1.824 +    # foreach (keys %options) {
   1.825 +    #	Carp::confess "internal error: translate() called with unknown arg $_. Valid options: $validoption"
   1.826 +    #	    unless $validoption{$_};
   1.827 +    # }
   1.828 +
   1.829 +    my $in_charset;
   1.830 +    if ($self->{TT}{ascii_input}) {
   1.831 +	$in_charset = "ascii";
   1.832 +    } else {
   1.833 +	if (defined($self->{TT}{'file_in_charset'}) and
   1.834 +	    length($self->{TT}{'file_in_charset'}) and
   1.835 +	    $self->{TT}{'file_in_charset'} !~ m/ascii/i) {
   1.836 +	    $in_charset=$self->{TT}{'file_in_charset'};
   1.837 +	} else {
   1.838 +	    # FYI, the document charset have to be determined *before* we see the first
   1.839 +	    # string to recode.
   1.840 +	    die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ascii char at %s)"), $self->{TT}{non_ascii_ref})
   1.841 +	}
   1.842 +    }
   1.843 +
   1.844 +    if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
   1.845 +	$string = encode_from_to($string,
   1.846 +	                         $self->{TT}{'file_in_encoder'},
   1.847 +	                         $self->{TT}{po_in}{encoder});
   1.848 +    }
   1.849 +
   1.850 +    if (defined $options{'wrapcol'} && $options{'wrapcol'} < 0) {
   1.851 +# FIXME: should be the parameter given with --width
   1.852 +        $options{'wrapcol'} = 76 + $options{'wrapcol'};
   1.853 +    }
   1.854 +    my $transstring = $self->{TT}{po_in}->gettext($string,
   1.855 +					'wrap'      => $options{'wrap'}||0,
   1.856 +					'wrapcol'   => $options{'wrapcol'});
   1.857 +
   1.858 +    if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
   1.859 +	my $out_encoder = $self->{TT}{'file_out_encoder'};
   1.860 +	unless (defined $out_encoder) {
   1.861 +	    $out_encoder = find_encoding($self->get_out_charset)
   1.862 +	}
   1.863 +	$transstring = encode_from_to($transstring,
   1.864 +	                              $self->{TT}{po_in}{encoder},
   1.865 +	                              $out_encoder);
   1.866 +    }
   1.867 +
   1.868 +    # If the input document isn't completely in ascii, we should see what to
   1.869 +    # do with the current string
   1.870 +    unless ($self->{TT}{ascii_input}) {
   1.871 +        my $out_charset = $self->{TT}{po_out}->get_charset;
   1.872 +	# We set the output po charset 
   1.873 +        if ($out_charset eq "CHARSET") {
   1.874 +	    if ($self->{TT}{utf_mode}) {
   1.875 +		$out_charset="utf-8";
   1.876 +	    } else {
   1.877 +		$out_charset=$in_charset;
   1.878 +	    }
   1.879 +	    $self->{TT}{po_out}->set_charset($out_charset);
   1.880 +	}
   1.881 +	if ( $in_charset !~ /^$out_charset$/i ) {
   1.882 +	    Encode::from_to($string,$in_charset,$out_charset);
   1.883 +	    if (defined($options{'comment'}) and length($options{'comment'})) {
   1.884 +		Encode::from_to($options{'comment'},$in_charset,$out_charset);
   1.885 +	    }
   1.886 +	}
   1.887 +    }
   1.888 +
   1.889 +    # the comments provided by the modules are automatic comments from the PO point of view
   1.890 +    $self->{TT}{po_out}->push('msgid'     => $string,
   1.891 +			      'reference' => $ref,
   1.892 +			      'type'      => $type,
   1.893 +	                      'automatic' => $options{'comment'},
   1.894 +			      'wrap'      => $options{'wrap'}||0,
   1.895 +			      'wrapcol'   => $options{'wrapcol'});
   1.896 +
   1.897 +#    if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
   1.898 +#	Encode::from_to($transstring,$self->{TT}{po_in}->get_charset,
   1.899 +#	    $self->get_out_charset);
   1.900 +#    }
   1.901 +
   1.902 +    if ($options{'wrap'}||0) {
   1.903 +        $transstring =~ s/( *)$//s;
   1.904 +        my $trailing_spaces = $1||"";
   1.905 +        $transstring =~ s/ *$//gm;
   1.906 +        $transstring .= $trailing_spaces;
   1.907 +    }
   1.908 +
   1.909 +    return $transstring;
   1.910 +}
   1.911 +
   1.912 +=head2 Misc functions
   1.913 +
   1.914 +=over 4
   1.915 +
   1.916 +=item verbose()
   1.917 +
   1.918 +Returns if the verbose option was passed during the creation of the
   1.919 +TransTractor.
   1.920 +
   1.921 +=cut
   1.922 +
   1.923 +sub verbose {
   1.924 +    if (defined $_[1]) {
   1.925 +	$_[0]->{TT}{verbose} = $_[1];
   1.926 +    } else {
   1.927 +	return $_[0]->{TT}{verbose} || 0; # undef and 0 have the same meaning, but one generates warnings
   1.928 +    }
   1.929 +}
   1.930 +
   1.931 +=item debug()
   1.932 +
   1.933 +Returns if the debug option was passed during the creation of the
   1.934 +TransTractor.
   1.935 +
   1.936 +=cut
   1.937 +
   1.938 +sub debug {
   1.939 +    return $_[0]->{TT}{debug};
   1.940 +}
   1.941 +
   1.942 +=item detected_charset($)
   1.943 +
   1.944 +This tells TransTractor that a new charset (the first argument) has been
   1.945 +detected from the input document. It can usually be read from the document
   1.946 +header. Only the first charset will remain, coming either from the
   1.947 +process() arguments or detected from the document.
   1.948 +
   1.949 +=cut
   1.950 +
   1.951 +sub detected_charset {
   1.952 +    my ($self,$charset)=(shift,shift);
   1.953 +    unless (defined($self->{TT}{'file_in_charset'}) and
   1.954 +            length($self->{TT}{'file_in_charset'}) ) {
   1.955 +        $self->{TT}{'file_in_charset'}=$charset;
   1.956 +        if (defined $charset) {
   1.957 +            $self->{TT}{'file_in_encoder'}=find_encoding($charset);
   1.958 +        }
   1.959 +    }
   1.960 +
   1.961 +    if (defined $self->{TT}{'file_in_charset'} and
   1.962 +        length $self->{TT}{'file_in_charset'} and
   1.963 +        $self->{TT}{'file_in_charset'} !~ m/ascii/i) {
   1.964 +	$self->{TT}{ascii_input}=0;
   1.965 +    }
   1.966 +}
   1.967 +
   1.968 +=item get_out_charset()
   1.969 +
   1.970 +This function will return the charset that should be used in the output
   1.971 +document (usually useful to substitute the input document's detected charset
   1.972 +where it has been found).
   1.973 +
   1.974 +It will use the output charset specified in the command line. If it wasn't
   1.975 +specified, it will use the input po's charset, and if the input po has the
   1.976 +default "CHARSET", it will return the input document's charset, so that no
   1.977 +encoding is performed.
   1.978 +
   1.979 +=cut
   1.980 +
   1.981 +sub get_out_charset {
   1.982 +    my $self=shift;
   1.983 +    my $charset;
   1.984 +
   1.985 +    # Use the value specified at the command line
   1.986 +    if (defined($self->{TT}{'file_out_charset'}) and
   1.987 +	length($self->{TT}{'file_out_charset'})) {
   1.988 +	$charset=$self->{TT}{'file_out_charset'};
   1.989 +    } else {
   1.990 +	if ($self->{TT}{utf_mode} && $self->{TT}{ascii_input}) {
   1.991 +	    $charset="utf-8";
   1.992 +	} else {
   1.993 +	    $charset=$self->{TT}{po_in}->get_charset;
   1.994 +	    $charset=$self->{TT}{'file_in_charset'}
   1.995 +		if $charset eq "CHARSET" and
   1.996 +		    defined($self->{TT}{'file_in_charset'}) and
   1.997 +		    length($self->{TT}{'file_in_charset'});
   1.998 +	    $charset="ascii"
   1.999 +		if $charset eq "CHARSET";
  1.1000 +	}
  1.1001 +    }
  1.1002 +    return $charset;
  1.1003 +}
  1.1004 +
  1.1005 +=item recode_skipped_text($)
  1.1006 +
  1.1007 +This function returns the recoded text passed as argument, from the input
  1.1008 +document's charset to the output document's one. This isn't needed when
  1.1009 +translating a string (translate() recodes everything itself), but it is when
  1.1010 +you skip a string from the input document and you want the output document to
  1.1011 +be consistent with the global encoding.
  1.1012 +
  1.1013 +=cut
  1.1014 +
  1.1015 +sub recode_skipped_text {
  1.1016 +    my ($self,$text)=(shift,shift);
  1.1017 +    unless ($self->{TT}{'ascii_input'}) {
  1.1018 +	if(defined($self->{TT}{'file_in_charset'}) and
  1.1019 +	    length($self->{TT}{'file_in_charset'}) ) {
  1.1020 +	    $text = encode_from_to($text,
  1.1021 +	                           $self->{TT}{'file_in_encoder'},
  1.1022 +	                           find_encoding($self->get_out_charset));
  1.1023 +	} else {
  1.1024 +	    die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ascii char at %s)"), $self->{TT}{non_ascii_ref})
  1.1025 +	}
  1.1026 +    }
  1.1027 +    return $text;
  1.1028 +}
  1.1029 +
  1.1030 +
  1.1031 +# encode_from_to($,$,$)
  1.1032 +#
  1.1033 +# Encode the given text from one encoding to another one.
  1.1034 +# It differs from Encode::from_to because it does not take the name of the
  1.1035 +# encoding in argument, but the encoders (as returned by the
  1.1036 +# Encode::find_encoding(<name>) method). Thus it permits to save a bunch
  1.1037 +# of call to find_encoding.
  1.1038 +#
  1.1039 +# If the "from" encoding is undefined, it is considered as UTF-8 (or
  1.1040 +# ascii).
  1.1041 +# If the "to" encoding is undefined, it is considered as UTF-8.
  1.1042 +#
  1.1043 +sub encode_from_to {
  1.1044 +    my ($text,$from,$to) = (shift,shift,shift);
  1.1045 +
  1.1046 +    if (not defined $from) {
  1.1047 +        # for ascii and UTF-8, no conversion needed to get an utf-8
  1.1048 +        # string.
  1.1049 +    } else {
  1.1050 +        $text = $from->decode($text, 0);
  1.1051 +    }
  1.1052 +
  1.1053 +    if (not defined $to) {
  1.1054 +        # Already in UTF-8, no conversion needed
  1.1055 +    } else {
  1.1056 +        $text = $to->encode($text, 0);
  1.1057 +    }
  1.1058 +
  1.1059 +    return $text;
  1.1060 +}
  1.1061 +
  1.1062 +=back
  1.1063 +
  1.1064 +=head1 FUTURE DIRECTIONS
  1.1065 +
  1.1066 +One shortcoming of the current TransTractor is that it can't handle
  1.1067 +translated document containing all languages, like debconf templates, or
  1.1068 +.desktop files.
  1.1069 +
  1.1070 +To address this problem, the only interface changes needed are:
  1.1071 +
  1.1072 +=over 2
  1.1073 +
  1.1074 +=item -
  1.1075 +
  1.1076 +take a hash as po_in_name (a list per language)
  1.1077 +
  1.1078 +=item -
  1.1079 +
  1.1080 +add an argument to translate to indicate the target language
  1.1081 +
  1.1082 +=item -
  1.1083 +
  1.1084 +make a pushline_all function, which would make pushline of its content for
  1.1085 +all language, using a map-like syntax:
  1.1086 +
  1.1087 +    $self->pushline_all({ "Description[".$langcode."]=".
  1.1088 +			  $self->translate($line,$ref,$langcode) 
  1.1089 +		        });
  1.1090 +
  1.1091 +=back
  1.1092 +
  1.1093 +Will see if it's enough ;)
  1.1094 +
  1.1095 +=head1 AUTHORS
  1.1096 +
  1.1097 + Denis Barbier <barbier@linuxfr.org>
  1.1098 + Martin Quinson (mquinson#debian.org)
  1.1099 + Jordi Vilalta <jvprat@gmail.com>
  1.1100 +
  1.1101 +=cut
  1.1102 +
  1.1103 +1;
author	Dongsheng Song <dongsheng.song@gmail.com>
date	Thu Mar 12 17:43:30 2009 +0800 (2009-03-12)
parents
children