hgbook

diff tools/po4a/lib/Locale/Po4a/Xml.pm @ 623:082bb76417f1
Add Po4a 0.37-dev(2009-03-08)
author: Dongsheng Song <dongsheng.song@gmail.com>
date: Thu Mar 12 15:43:56 2009 +0800 (2009-03-12)
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/tools/po4a/lib/Locale/Po4a/Xml.pm	Thu Mar 12 15:43:56 2009 +0800
     1.3 @@ -0,0 +1,1973 @@
     1.4 +#!/usr/bin/perl
     1.5 +
     1.6 +# Po4a::Xml.pm 
     1.7 +# 
     1.8 +# extract and translate translatable strings from XML documents.
     1.9 +# 
    1.10 +# This code extracts plain text from tags and attributes from generic
    1.11 +# XML documents, and it can be used as a base to build modules for
    1.12 +# XML-based documents.
    1.13 +#
    1.14 +# Copyright (c) 2004 by Jordi Vilalta  <jvprat@gmail.com>
    1.15 +# Copyright (c) 2008-2009 by Nicolas François  <nicolas.francois@centraliens.net>
    1.16 +#
    1.17 +# This program is free software; you can redistribute it and/or modify
    1.18 +# it under the terms of the GNU General Public License as published by
    1.19 +# the Free Software Foundation; either version 2 of the License, or
    1.20 +# (at your option) any later version.
    1.21 +#
    1.22 +# This program is distributed in the hope that it will be useful,
    1.23 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.24 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    1.25 +# GNU General Public License for more details.
    1.26 +#
    1.27 +# You should have received a copy of the GNU General Public License
    1.28 +# along with this program; if not, write to the Free Software
    1.29 +# Foundation, Inc.,
    1.30 +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.31 +#
    1.32 +########################################################################
    1.33 +
    1.34 +=head1 NAME
    1.35 +
    1.36 +Locale::Po4a::Xml - Convert XML documents and derivates from/to PO files
    1.37 +
    1.38 +=head1 DESCRIPTION
    1.39 +
    1.40 +The po4a (po for anything) project goal is to ease translations (and more
    1.41 +interestingly, the maintenance of translations) using gettext tools on
    1.42 +areas where they were not expected like documentation.
    1.43 +
    1.44 +Locale::Po4a::Xml is a module to help the translation of XML documents into
    1.45 +other [human] languages. It can also be used as a base to build modules for
    1.46 +XML-based documents.
    1.47 +
    1.48 +=cut
    1.49 +
    1.50 +package Locale::Po4a::Xml;
    1.51 +
    1.52 +use 5.006;
    1.53 +use strict;
    1.54 +use warnings;
    1.55 +
    1.56 +require Exporter;
    1.57 +use vars qw(@ISA @EXPORT);
    1.58 +@ISA = qw(Locale::Po4a::TransTractor);
    1.59 +@EXPORT = qw(new initialize @tag_types);
    1.60 +
    1.61 +use Locale::Po4a::TransTractor;
    1.62 +use Locale::Po4a::Common;
    1.63 +use Carp qw(croak);
    1.64 +use File::Basename;
    1.65 +use File::Spec;
    1.66 +
    1.67 +#It will mantain the path from the root tag to the current one
    1.68 +my @path;
    1.69 +
    1.70 +#It will contain a list of external entities and their attached paths
    1.71 +my %entities;
    1.72 +
    1.73 +my @comments;
    1.74 +
    1.75 +sub shiftline {
    1.76 +    my $self = shift;
    1.77 +    # call Transtractor's shiftline
    1.78 +    my ($line,$ref) = $self->SUPER::shiftline();
    1.79 +    return ($line,$ref) if (not defined $line);
    1.80 +
    1.81 +    for my $k (keys %entities) {
    1.82 +        if ($line =~ m/^(.*?)&$k;(.*)$/s) {
    1.83 +            my ($before, $after) = ($1, $2);
    1.84 +            my $linenum=0;
    1.85 +            my @textentries;
    1.86 +
    1.87 +            open (my $in, $entities{$k})
    1.88 +                or croak wrap_mod("po4a::xml",
    1.89 +                                  dgettext("po4a", "Can't read from %s: %s"),
    1.90 +                                  $entities{$k}, $!);
    1.91 +            while (defined (my $textline = <$in>)) {
    1.92 +                $linenum++;
    1.93 +                my $textref=$entities{$k}.":$linenum";
    1.94 +                push @textentries, ($textline,$textref);
    1.95 +            }
    1.96 +            close $in
    1.97 +                or croak wrap_mod("po4a::xml",
    1.98 +                          dgettext("po4a", "Can't close %s after reading: %s"),
    1.99 +                                  $entities{$k}, $!);
   1.100 +
   1.101 +            push @textentries, ($after, $ref);
   1.102 +            $line = $before.(shift @textentries);
   1.103 +            $ref .= " ".(shift @textentries);
   1.104 +            $self->unshiftline(@textentries);
   1.105 +        }
   1.106 +    }
   1.107 +
   1.108 +    return ($line,$ref);
   1.109 +}
   1.110 +
   1.111 +sub read {
   1.112 +	my ($self,$filename)=@_;
   1.113 +	push @{$self->{DOCPOD}{infile}}, $filename;
   1.114 +	$self->Locale::Po4a::TransTractor::read($filename);
   1.115 +}
   1.116 +
   1.117 +sub parse {
   1.118 +	my $self=shift;
   1.119 +	map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}};
   1.120 +}
   1.121 +
   1.122 +# @save_holders is a stack of references to ('paragraph', 'translation',
   1.123 +# 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where:
   1.124 +# paragraph         is a reference to an array (see paragraph in the
   1.125 +#                   treat_content() subroutine) of strings followed by
   1.126 +#                   references.  It contains the @paragraph array as it was
   1.127 +#                   before the processing was interrupted by a tag instroducing
   1.128 +#                   a placeholder.
   1.129 +# translation       is the translation of this level up to now
   1.130 +# sub_translations  is a reference to an array of strings containing the
   1.131 +#                   translations which must replace the placeholders.
   1.132 +# open              is the tag which opened the placeholder.
   1.133 +# close             is the tag which closed the placeholder.
   1.134 +# folded_attributes is an hash of tags with their attributes (<tag attrs=...>
   1.135 +#                   strings), referenced by the folded tag id, which should
   1.136 +#                   replace the <tag po4a-id=id> strings in the current
   1.137 +#                   translation.
   1.138 +#
   1.139 +# If @save_holders only has 1 holder, then we are not processing the
   1.140 +# content of an holder, we are translating the document.
   1.141 +my @save_holders;
   1.142 +
   1.143 +
   1.144 +# If we are at the bottom of the stack and there is no <placeholder ...> in
   1.145 +# the current translation, we can push the translation in the translated
   1.146 +# document.
   1.147 +# Otherwise, we keep the translation in the current holder.
   1.148 +sub pushline {
   1.149 +	my ($self, $line) = (shift, shift);
   1.150 +
   1.151 +	my $holder = $save_holders[$#save_holders];
   1.152 +	my $translation = $holder->{'translation'};
   1.153 +	$translation .= $line;
   1.154 +
   1.155 +	while (    %{$holder->{folded_attributes}}
   1.156 +	       and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) {
   1.157 +		my $begin = $1;
   1.158 +		my $tag = $2;
   1.159 +		my $id = $3;
   1.160 +		my $end = $4;
   1.161 +		if (defined $holder->{folded_attributes}->{$id}) {
   1.162 +			# TODO: check if the tag is the same
   1.163 +			$translation = $begin.$holder->{folded_attributes}->{$id}.$end;
   1.164 +			delete $holder->{folded_attributes}->{$id};
   1.165 +		} else {
   1.166 +			# TODO: It will be hard to identify the location.
   1.167 +			#       => find a way to retrieve the reference.
   1.168 +			die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id);
   1.169 +		}
   1.170 +	}
   1.171 +# TODO: check that %folded_attributes is empty at some time
   1.172 +# => in translate_paragraph?
   1.173 +
   1.174 +	if (   ($#save_holders > 0)
   1.175 +	    or ($translation =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s)) {
   1.176 +		$holder->{'translation'} = $translation;
   1.177 +	} else {
   1.178 +		$self->SUPER::pushline($translation);
   1.179 +		$holder->{'translation'} = '';
   1.180 +	}
   1.181 +}
   1.182 +
   1.183 +=head1 TRANSLATING WITH PO4A::XML
   1.184 +
   1.185 +This module can be used directly to handle generic XML documents.  This will
   1.186 +extract all tag's content, and no attributes, since it's where the text is
   1.187 +written in most XML based documents.
   1.188 +
   1.189 +There are some options (described in the next section) that can customize
   1.190 +this behavior.  If this doesn't fit to your document format you're encouraged
   1.191 +to write your own module derived from this, to describe your format's details.
   1.192 +See the section "Writing derivate modules" below, for the process description.
   1.193 +
   1.194 +=cut
   1.195 +
   1.196 +#
   1.197 +# Parse file and translate it
   1.198 +#
   1.199 +sub parse_file {
   1.200 +	my ($self,$filename) = @_;
   1.201 +	my $eof = 0;
   1.202 +
   1.203 +	while (!$eof) {
   1.204 +		# We get all the text until the next breaking tag (not
   1.205 +		# inline) and translate it
   1.206 +		$eof = $self->treat_content;
   1.207 +		if (!$eof) {
   1.208 +			# And then we treat the following breaking tag
   1.209 +			$eof = $self->treat_tag;
   1.210 +		}
   1.211 +	}
   1.212 +}
   1.213 +
   1.214 +=head1 OPTIONS ACCEPTED BY THIS MODULE
   1.215 +
   1.216 +The global debug option causes this module to show the excluded strings, in
   1.217 +order to see if it skips something important.
   1.218 +
   1.219 +These are this module's particular options:
   1.220 +
   1.221 +=over 4
   1.222 +
   1.223 +=item B<nostrip>
   1.224 +
   1.225 +Prevents it to strip the spaces around the extracted strings.
   1.226 +
   1.227 +=item B<wrap>
   1.228 +
   1.229 +Canonizes the string to translate, considering that whitespaces are not
   1.230 +important, and wraps the translated document. This option can be overridden
   1.231 +by custom tag options. See the "tags" option below.
   1.232 +
   1.233 +=item B<caseinsensitive>
   1.234 +
   1.235 +It makes the tags and attributes searching to work in a case insensitive
   1.236 +way.  If it's defined, it will treat E<lt>BooKE<gt>laNG and E<lt>BOOKE<gt>Lang as E<lt>bookE<gt>lang.
   1.237 +
   1.238 +=item B<includeexternal>
   1.239 +
   1.240 +When defined, external entities are included in the generated (translated)
   1.241 +document, and for the extraction of strings.  If it's not defined, you
   1.242 +will have to translate external entities separately as independent
   1.243 +documents.
   1.244 +
   1.245 +=item B<ontagerror>
   1.246 +
   1.247 +This option defines the behavior of the module when it encounter a invalid
   1.248 +Xml syntax (a closing tag which does not match the last opening tag, or a
   1.249 +tag's attribute without value).
   1.250 +It can take the following values:
   1.251 +
   1.252 +=over
   1.253 +
   1.254 +=item I<fail>
   1.255 +
   1.256 +This is the default value.
   1.257 +The module will exit with an error.
   1.258 +
   1.259 +=item I<warn>
   1.260 +
   1.261 +The module will continue, and will issue a warning.
   1.262 +
   1.263 +=item I<silent>
   1.264 +
   1.265 +The module will continue without any warnings.
   1.266 +
   1.267 +=back
   1.268 +
   1.269 +Be careful when using this option.
   1.270 +It is generally recommended to fix the input file.
   1.271 +
   1.272 +=item B<tagsonly>
   1.273 +
   1.274 +Extracts only the specified tags in the "tags" option.  Otherwise, it
   1.275 +will extract all the tags except the ones specified.
   1.276 +
   1.277 +Note: This option is deprecated.
   1.278 +
   1.279 +=item B<doctype>
   1.280 +
   1.281 +String that will try to match with the first line of the document's doctype
   1.282 +(if defined). If it doesn't, a warning will indicate that the document
   1.283 +might be of a bad type.
   1.284 +
   1.285 +=item B<tags>
   1.286 +
   1.287 +Space-separated list of tags you want to translate or skip.  By default,
   1.288 +the specified tags will be excluded, but if you use the "tagsonly" option,
   1.289 +the specified tags will be the only ones included.  The tags must be in the
   1.290 +form E<lt>aaaE<gt>, but you can join some (E<lt>bbbE<gt>E<lt>aaaE<gt>) to say that the content of
   1.291 +the tag E<lt>aaaE<gt> will only be translated when it's into a E<lt>bbbE<gt> tag.
   1.292 +
   1.293 +You can also specify some tag options putting some characters in front of
   1.294 +the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
   1.295 +to override the default behavior specified by the global "wrap" option.
   1.296 +
   1.297 +Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
   1.298 +
   1.299 +Note: This option is deprecated.
   1.300 +You should use the B<translated> and B<untranslated> options instead.
   1.301 +
   1.302 +=item B<attributes>
   1.303 +
   1.304 +Space-separated list of tag's attributes you want to translate.  You can
   1.305 +specify the attributes by their name (for example, "lang"), but you can
   1.306 +prefix it with a tag hierarchy, to specify that this attribute will only be
   1.307 +translated when it's into the specified tag. For example: E<lt>bbbE<gt>E<lt>aaaE<gt>lang
   1.308 +specifies that the lang attribute will only be translated if it's into an
   1.309 +E<lt>aaaE<gt> tag, and it's into a E<lt>bbbE<gt> tag.
   1.310 +
   1.311 +=item B<foldattributes>
   1.312 +
   1.313 +Do not translate attributes in inline tags.
   1.314 +Instead, replace all attributes of a tag by po4a-id=<id>.
   1.315 +
   1.316 +This is useful when attributes shall not be translated, as this simplifies the
   1.317 +strings for translators, and avoids typos.
   1.318 +
   1.319 +=item B<break>
   1.320 +
   1.321 +Space-separated list of tags which should break the sequence.
   1.322 +By default, all tags break the sequence.
   1.323 +
   1.324 +The tags must be in the form <aaa>, but you can join some
   1.325 +(<bbb><aaa>), if a tag (<aaa>) should only be considered 
   1.326 +when it's into another tag (<bbb>).
   1.327 +
   1.328 +=item B<inline>
   1.329 +
   1.330 +Space-separated list of tags which should be treated as inline.
   1.331 +By default, all tags break the sequence.
   1.332 +
   1.333 +The tags must be in the form <aaa>, but you can join some
   1.334 +(<bbb><aaa>), if a tag (<aaa>) should only be considered 
   1.335 +when it's into another tag (<bbb>).
   1.336 +
   1.337 +=item B<placeholder>
   1.338 +
   1.339 +Space-separated list of tags which should be treated as placeholders.
   1.340 +Placeholders do not break the sequence, but the content of placeholders is
   1.341 +translated separately.
   1.342 +
   1.343 +The location of the placeholder in its blocks will be marked with a string
   1.344 +similar to:
   1.345 +
   1.346 +  <placeholder type=\"footnote\" id=\"0\"/>
   1.347 +
   1.348 +The tags must be in the form <aaa>, but you can join some
   1.349 +(<bbb><aaa>), if a tag (<aaa>) should only be considered 
   1.350 +when it's into another tag (<bbb>).
   1.351 +
   1.352 +=item B<nodefault>
   1.353 +
   1.354 +Space separated list of tags that the module should not try to set by
   1.355 +default in any category.
   1.356 +
   1.357 +=item B<cpp>
   1.358 +
   1.359 +Support C preprocessor directives.
   1.360 +When this option is set, po4a will consider preprocessor directives as
   1.361 +paragraph separators.
   1.362 +This is important if the XML file must be preprocessed because otherwise
   1.363 +the directives may be inserted in the middle of lines if po4a consider it
   1.364 +belong to the current paragraph, and they won't be recognized by the
   1.365 +preprocessor.
   1.366 +Note: the preprocessor directives must only appear between tags
   1.367 +(they must not break a tag).
   1.368 +
   1.369 +=item B<translated>
   1.370 +
   1.371 +Space-separated list of tags you want to translate.
   1.372 +
   1.373 +The tags must be in the form <aaa>, but you can join some
   1.374 +(<bbb><aaa>), if a tag (<aaa>) should only be considered 
   1.375 +when it's into another tag (<bbb>).
   1.376 +
   1.377 +You can also specify some tag options putting some characters in front of
   1.378 +the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
   1.379 +to overide the default behavior specified by the global "wrap" option.
   1.380 +
   1.381 +Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
   1.382 +
   1.383 +=item B<untranslated>
   1.384 +
   1.385 +Space-separated list of tags you do not want to translate.
   1.386 +
   1.387 +The tags must be in the form <aaa>, but you can join some
   1.388 +(<bbb><aaa>), if a tag (<aaa>) should only be considered 
   1.389 +when it's into another tag (<bbb>).
   1.390 +
   1.391 +=item B<defaulttranslateoption>
   1.392 +
   1.393 +The default categories for tags that are not in any of the translated,
   1.394 +untranslated, break, inline, or placeholder.
   1.395 +
   1.396 +This is a set of letters:
   1.397 +
   1.398 +=over
   1.399 +
   1.400 +=item I<w>
   1.401 +
   1.402 +Tags should be translated and content can be re-wrapped.
   1.403 +
   1.404 +=item I<W>
   1.405 +
   1.406 +Tags should be translated and content should not be re-wrapped.
   1.407 +
   1.408 +=item I<i>
   1.409 +
   1.410 +Tags should be translated inline.
   1.411 +
   1.412 +=item I<p>
   1.413 +
   1.414 +Tags should be translated as placeholders.
   1.415 +
   1.416 +=back
   1.417 +
   1.418 +=back
   1.419 +
   1.420 +=cut
   1.421 +# TODO: defaulttranslateoption
   1.422 +# w => indicate that it is only valid for translatable tags and do not
   1.423 +#      care about inline/break/placeholder?
   1.424 +# ...
   1.425 +
   1.426 +sub initialize {
   1.427 +	my $self = shift;
   1.428 +	my %options = @_;
   1.429 +
   1.430 +	# Reset the path
   1.431 +	@path = ();
   1.432 +
   1.433 +	# Initialize the stack of holders
   1.434 +	my @paragraph = ();
   1.435 +	my @sub_translations = ();
   1.436 +	my %folded_attributes;
   1.437 +	my %holder = ('paragraph' => \@paragraph,
   1.438 +	              'translation' => "",
   1.439 +	              'sub_translations' => \@sub_translations,
   1.440 +	              'folded_attributes' => \%folded_attributes);
   1.441 +	@save_holders = (\%holder);
   1.442 +
   1.443 +	$self->{options}{'nostrip'}=0;
   1.444 +	$self->{options}{'wrap'}=0;
   1.445 +	$self->{options}{'caseinsensitive'}=0;
   1.446 +	$self->{options}{'tagsonly'}=0;
   1.447 +	$self->{options}{'tags'}='';
   1.448 +	$self->{options}{'break'}='';
   1.449 +	$self->{options}{'translated'}='';
   1.450 +	$self->{options}{'untranslated'}='';
   1.451 +	$self->{options}{'defaulttranslateoption'}='';
   1.452 +	$self->{options}{'attributes'}='';
   1.453 +	$self->{options}{'foldattributes'}=0;
   1.454 +	$self->{options}{'inline'}='';
   1.455 +	$self->{options}{'placeholder'}='';
   1.456 +	$self->{options}{'doctype'}='';
   1.457 +	$self->{options}{'nodefault'}='';
   1.458 +	$self->{options}{'includeexternal'}=0;
   1.459 +	$self->{options}{'ontagerror'}="fail";
   1.460 +	$self->{options}{'cpp'}=0;
   1.461 +
   1.462 +	$self->{options}{'verbose'}='';
   1.463 +	$self->{options}{'debug'}='';
   1.464 +
   1.465 +	foreach my $opt (keys %options) {
   1.466 +		if ($options{$opt}) {
   1.467 +			die wrap_mod("po4a::xml",
   1.468 +				dgettext("po4a", "Unknown option: %s"), $opt)
   1.469 +				unless exists $self->{options}{$opt};
   1.470 +			$self->{options}{$opt} = $options{$opt};
   1.471 +		}
   1.472 +	}
   1.473 +	# Default options set by modules. Forbidden for users.
   1.474 +	$self->{options}{'_default_translated'}='';
   1.475 +	$self->{options}{'_default_untranslated'}='';
   1.476 +	$self->{options}{'_default_break'}='';
   1.477 +	$self->{options}{'_default_inline'}='';
   1.478 +	$self->{options}{'_default_placeholder'}='';
   1.479 +	$self->{options}{'_default_attributes'}='';
   1.480 +
   1.481 +	#It will maintain the list of the translatable tags
   1.482 +	$self->{tags}=();
   1.483 +	$self->{translated}=();
   1.484 +	$self->{untranslated}=();
   1.485 +	#It will maintain the list of the translatable attributes
   1.486 +	$self->{attributes}=();
   1.487 +	#It will maintain the list of the breaking tags
   1.488 +	$self->{break}=();
   1.489 +	#It will maintain the list of the inline tags
   1.490 +	$self->{inline}=();
   1.491 +	#It will maintain the list of the placeholder tags
   1.492 +	$self->{placeholder}=();
   1.493 +	#list of the tags that must not be set in the tags or inline category
   1.494 +	#by this module or sub-module (unless specified in an option)
   1.495 +	$self->{nodefault}=();
   1.496 +
   1.497 +	$self->treat_options;
   1.498 +}
   1.499 +
   1.500 +=head1 WRITING DERIVATE MODULES
   1.501 +
   1.502 +=head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE
   1.503 +
   1.504 +The simplest customization is to define which tags and attributes you want
   1.505 +the parser to translate.  This should be done in the initialize function.
   1.506 +First you should call the main initialize, to get the command-line options,
   1.507 +and then, append your custom definitions to the options hash.  If you want
   1.508 +to treat some new options from command line, you should define them before
   1.509 +calling the main initialize:
   1.510 +
   1.511 +  $self->{options}{'new_option'}='';
   1.512 +  $self->SUPER::initialize(%options);
   1.513 +  $self->{options}{'_default_translated'}.=' <p> <head><title>';
   1.514 +  $self->{options}{'attributes'}.=' <p>lang id';
   1.515 +  $self->{options}{'_default_inline'}.=' <br>';
   1.516 +  $self->treat_options;
   1.517 +
   1.518 +You should use the B<_default_inline>, B<_default_break>,
   1.519 +B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>,
   1.520 +and B<_default_attributes> options in derivated modules. This allow users
   1.521 +to override the default behavior defined in your module with command line
   1.522 +options.
   1.523 +
   1.524 +=head2 OVERRIDING THE found_string FUNCTION
   1.525 +
   1.526 +Another simple step is to override the function "found_string", which
   1.527 +receives the extracted strings from the parser, in order to translate them.
   1.528 +There you can control which strings you want to translate, and perform
   1.529 +transformations to them before or after the translation itself.
   1.530 +
   1.531 +It receives the extracted text, the reference on where it was, and a hash
   1.532 +that contains extra information to control what strings to translate, how
   1.533 +to translate them and to generate the comment.
   1.534 +
   1.535 +The content of these options depends on the kind of string it is (specified in an 
   1.536 +entry of this hash):
   1.537 +
   1.538 +=over
   1.539 +
   1.540 +=item type="tag"
   1.541 +
   1.542 +The found string is the content of a translatable tag. The entry "tag_options"
   1.543 +contains the option characters in front of the tag hierarchy in the module
   1.544 +"tags" option.
   1.545 +
   1.546 +=item type="attribute"
   1.547 +
   1.548 +Means that the found string is the value of a translatable attribute. The
   1.549 +entry "attribute" has the name of the attribute.
   1.550 +
   1.551 +=back
   1.552 +
   1.553 +It must return the text that will replace the original in the translated
   1.554 +document. Here's a basic example of this function:
   1.555 +
   1.556 +  sub found_string {
   1.557 +    my ($self,$text,$ref,$options)=@_;
   1.558 +    $text = $self->translate($text,$ref,"type ".$options->{'type'},
   1.559 +      'wrap'=>$self->{options}{'wrap'});
   1.560 +    return $text;
   1.561 +  }
   1.562 +
   1.563 +There's another simple example in the new Dia module, which only filters
   1.564 +some strings.
   1.565 +
   1.566 +=cut
   1.567 +
   1.568 +sub found_string {
   1.569 +	my ($self,$text,$ref,$options)=@_;
   1.570 +
   1.571 +	if ($text =~ m/^\s*$/s) {
   1.572 +		return $text;
   1.573 +	}
   1.574 +
   1.575 +	my $comment;
   1.576 +	my $wrap = $self->{options}{'wrap'};
   1.577 +
   1.578 +	if ($options->{'type'} eq "tag") {
   1.579 +		$comment = "Content of: ".$self->get_path;
   1.580 +
   1.581 +		if($options->{'tag_options'} =~ /w/) {
   1.582 +			$wrap = 1;
   1.583 +		}
   1.584 +		if($options->{'tag_options'} =~ /W/) {
   1.585 +			$wrap = 0;
   1.586 +		}
   1.587 +	} elsif ($options->{'type'} eq "attribute") {
   1.588 +		$comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path;
   1.589 +	} elsif ($options->{'type'} eq "CDATA") {
   1.590 +		$comment = "CDATA";
   1.591 +		$wrap = 0;
   1.592 +	} else {
   1.593 +		die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'});
   1.594 +	}
   1.595 +	$text = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'});
   1.596 +	return $text;
   1.597 +}
   1.598 +
   1.599 +=head2 MODIFYING TAG TYPES (TODO)
   1.600 +
   1.601 +This is a more complex one, but it enables a (almost) total customization.
   1.602 +It's based in a list of hashes, each one defining a tag type's behavior. The
   1.603 +list should be sorted so that the most general tags are after the most
   1.604 +concrete ones (sorted first by the beginning and then by the end keys). To
   1.605 +define a tag type you'll have to make a hash with the following keys:
   1.606 +
   1.607 +=over 4
   1.608 +
   1.609 +=item beginning
   1.610 +
   1.611 +Specifies the beginning of the tag, after the "E<lt>".
   1.612 +
   1.613 +=item end
   1.614 +
   1.615 +Specifies the end of the tag, before the "E<gt>".
   1.616 +
   1.617 +=item breaking
   1.618 +
   1.619 +It says if this is a breaking tag class.  A non-breaking (inline) tag is one
   1.620 +that can be taken as part of the content of another tag.  It can take the
   1.621 +values false (0), true (1) or undefined.  If you leave this undefined, you'll
   1.622 +have to define the f_breaking function that will say whether a concrete tag of
   1.623 +this class is a breaking tag or not.
   1.624 +
   1.625 +=item f_breaking
   1.626 +
   1.627 +It's a function that will tell if the next tag is a breaking one or not.  It
   1.628 +should be defined if the "breaking" option is not.
   1.629 +
   1.630 +=item f_extract
   1.631 +
   1.632 +If you leave this key undefined, the generic extraction function will have to
   1.633 +extract the tag itself.  It's useful for tags that can have other tags or
   1.634 +special structures in them, so that the main parser doesn't get mad.  This
   1.635 +function receives a boolean that says if the tag should be removed from the
   1.636 +input stream or not.
   1.637 +
   1.638 +=item f_translate
   1.639 +
   1.640 +This function receives the tag (in the get_string_until() format) and returns
   1.641 +the translated tag (translated attributes or all needed transformations) as a
   1.642 +single string.
   1.643 +
   1.644 +=back
   1.645 +
   1.646 +=cut
   1.647 +
   1.648 +##### Generic XML tag types #####' 
   1.649 +
   1.650 +our @tag_types = ( 
   1.651 +	{	beginning	=> "!--#",
   1.652 +		end		=> "--",
   1.653 +		breaking	=> 0,
   1.654 +		f_extract	=> \&tag_extract_comment,
   1.655 +		f_translate	=> \&tag_trans_comment},
   1.656 +	{	beginning	=> "!--",
   1.657 +		end		=> "--",
   1.658 +		breaking	=> 0,
   1.659 +		f_extract	=> \&tag_extract_comment,
   1.660 +		f_translate	=> \&tag_trans_comment},
   1.661 +	{	beginning	=> "?xml",
   1.662 +		end		=> "?",
   1.663 +		breaking	=> 1,
   1.664 +		f_translate	=> \&tag_trans_xmlhead},
   1.665 +	{	beginning	=> "?",
   1.666 +		end		=> "?",
   1.667 +		breaking	=> 1,
   1.668 +		f_translate	=> \&tag_trans_procins},
   1.669 +	{	beginning	=> "!DOCTYPE",
   1.670 +		end		=> "",
   1.671 +		breaking	=> 1,
   1.672 +		f_extract	=> \&tag_extract_doctype,
   1.673 +		f_translate	=> \&tag_trans_doctype},
   1.674 +	{	beginning	=> "![CDATA[",
   1.675 +		end		=> "",
   1.676 +		breaking	=> 1,
   1.677 +		f_extract	=> \&CDATA_extract,
   1.678 +		f_translate	=> \&CDATA_trans},
   1.679 +	{	beginning	=> "/",
   1.680 +		end		=> "",
   1.681 +		f_breaking	=> \&tag_break_close,
   1.682 +		f_translate	=> \&tag_trans_close},
   1.683 +	{	beginning	=> "",
   1.684 +		end		=> "/",
   1.685 +		f_breaking	=> \&tag_break_alone,
   1.686 +		f_translate	=> \&tag_trans_alone},
   1.687 +	{	beginning	=> "",
   1.688 +		end		=> "",
   1.689 +		f_breaking	=> \&tag_break_open,
   1.690 +		f_translate	=> \&tag_trans_open}
   1.691 +);
   1.692 +
   1.693 +sub tag_extract_comment {
   1.694 +	my ($self,$remove)=(shift,shift);
   1.695 +	my ($eof,@tag)=$self->get_string_until('-->',{include=>1,remove=>$remove});
   1.696 +	return ($eof,@tag);
   1.697 +}
   1.698 +
   1.699 +sub tag_trans_comment {
   1.700 +	my ($self,@tag)=@_;
   1.701 +	return $self->join_lines(@tag);
   1.702 +}
   1.703 +
   1.704 +sub tag_trans_xmlhead {
   1.705 +	my ($self,@tag)=@_;
   1.706 +
   1.707 +	# We don't have to translate anything from here: throw away references
   1.708 +	my $tag = $self->join_lines(@tag);
   1.709 +	$tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s;
   1.710 +	my $in_charset=$3;
   1.711 +	$self->detected_charset($in_charset);
   1.712 +	my $out_charset=$self->get_out_charset;
   1.713 +
   1.714 +	if (defined $in_charset) {
   1.715 +		$tag =~ s/$in_charset/$out_charset/;
   1.716 +	} else {
   1.717 +		if ($tag =~ m/standalone/) {
   1.718 +			$tag =~ s/(standalone)/encoding="$out_charset" $1/;
   1.719 +		} else {
   1.720 +			$tag.= " encoding=\"$out_charset\"";
   1.721 +		}
   1.722 +	}
   1.723 +
   1.724 +	return $tag;
   1.725 +}
   1.726 +
   1.727 +sub tag_trans_procins {
   1.728 +	my ($self,@tag)=@_;
   1.729 +	return $self->join_lines(@tag);
   1.730 +}
   1.731 +
   1.732 +sub tag_extract_doctype {
   1.733 +	my ($self,$remove)=(shift,shift);
   1.734 +
   1.735 +	# Check if there is an internal subset (between []).
   1.736 +	my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1});
   1.737 +	my $parity = 0;
   1.738 +	my $paragraph = "";
   1.739 +	map { $parity = 1 - $parity; $paragraph.= $parity?$_:""; } @tag;
   1.740 +	my $found = 0;
   1.741 +	if ($paragraph =~ m/<.*\[.*</s) {
   1.742 +		$found = 1
   1.743 +	}
   1.744 +
   1.745 +	if (not $found) {
   1.746 +		($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1});
   1.747 +	} else {
   1.748 +		($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1});
   1.749 +	}
   1.750 +	return ($eof,@tag);
   1.751 +}
   1.752 +
   1.753 +sub tag_trans_doctype {
   1.754 +# This check is not really reliable.  There are system and public
   1.755 +# identifiers.  Only the public one could be checked reliably.
   1.756 +	my ($self,@tag)=@_;
   1.757 +	if (defined $self->{options}{'doctype'} ) {
   1.758 +		my $doctype = $self->{options}{'doctype'};
   1.759 +		if ( $tag[0] !~ /\Q$doctype\E/i ) {
   1.760 +			warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype);
   1.761 +		}
   1.762 +	}
   1.763 +	my $i = 0;
   1.764 +	my $basedir = $tag[1];
   1.765 +	$basedir =~ s/:[0-9]+$//;
   1.766 +	$basedir = dirname($basedir);
   1.767 +
   1.768 +	while ( $i < $#tag ) {
   1.769 +		my $t = $tag[$i];
   1.770 +		my $ref = $tag[$i+1];
   1.771 +		if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) {
   1.772 +			my $part1 = $1;
   1.773 +			my $part2 = $2;
   1.774 +			my $includenow = 0;
   1.775 +			my $file = 0;
   1.776 +			my $name = "";
   1.777 +			if ($part2 =~ /^(%\s+)(.*)$/s ) {
   1.778 +				$part1.= $1;
   1.779 +				$part2 = $2;
   1.780 +				$includenow = 1;
   1.781 +			}
   1.782 +			$part2 =~ /^(\S+)(\s+)(.*)$/s;
   1.783 +			$name = $1;
   1.784 +			$part1.= $1.$2;
   1.785 +			$part2 = $3;
   1.786 +			if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) {
   1.787 +				$part1.= $1;
   1.788 +				$part2 = $2;
   1.789 +				$file = 1;
   1.790 +				if ($self->{options}{'includeexternal'}) {
   1.791 +					$entities{$name} = $part2;
   1.792 +					$entities{$name} =~ s/^"?(.*?)".*$/$1/s;
   1.793 +					$entities{$name} = File::Spec->catfile($basedir, $entities{$name});
   1.794 +				}
   1.795 +			}
   1.796 +			if ((not $file) and (not $includenow)) {
   1.797 +			    if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) {
   1.798 +				my $comment = "Content of the $name entity";
   1.799 +				my $quote = $1;
   1.800 +				my $text = $2;
   1.801 +				$part2 = $3;
   1.802 +				$text = $self->translate($text,
   1.803 +				                         $ref,
   1.804 +				                         $comment,
   1.805 +				                         'wrap'=>1);
   1.806 +				$t = $part1."$quote$text$quote$part2";
   1.807 +			    }
   1.808 +			}
   1.809 +#			print $part1."\n";
   1.810 +#			print $name."\n";
   1.811 +#			print $part2."\n";
   1.812 +		}
   1.813 +		$tag[$i] = $t;
   1.814 +		$i += 2;
   1.815 +	}
   1.816 +	return $self->join_lines(@tag);
   1.817 +}
   1.818 +
   1.819 +sub tag_break_close {
   1.820 +	my ($self,@tag)=@_;
   1.821 +	my $struct = $self->get_path;
   1.822 +	my $options = $self->get_translate_options($struct);
   1.823 +	if ($options =~ m/[ip]/) {
   1.824 +		return 0;
   1.825 +	} else {
   1.826 +		return 1;
   1.827 +	}
   1.828 +}
   1.829 +
   1.830 +sub tag_trans_close {
   1.831 +	my ($self,@tag)=@_;
   1.832 +	my $name = $self->get_tag_name(@tag);
   1.833 +
   1.834 +	my $test = pop @path;
   1.835 +	if (!defined($test) || $test ne $name ) {
   1.836 +		my $ontagerror = $self->{options}{'ontagerror'};
   1.837 +		if ($ontagerror eq "warn") {
   1.838 +			warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong.  Continuing..."), $name);
   1.839 +		} elsif ($ontagerror ne "silent") {
   1.840 +			die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
   1.841 +		}
   1.842 +	}
   1.843 +	return $self->join_lines(@tag);
   1.844 +}
   1.845 +
   1.846 +sub CDATA_extract {
   1.847 +	my ($self,$remove)=(shift,shift);
   1.848 +        my ($eof, @tag) = $self->get_string_until(']]>',{include=>1,unquoted=>0,remove=>$remove});
   1.849 +
   1.850 +	return ($eof, @tag);
   1.851 +}
   1.852 +
   1.853 +sub CDATA_trans {
   1.854 +	my ($self,@tag)=@_;
   1.855 +	return $self->found_string($self->join_lines(@tag),
   1.856 +	                           $tag[1],
   1.857 +	                           {'type' => "CDATA"});
   1.858 +}
   1.859 +
   1.860 +sub tag_break_alone {
   1.861 +	my ($self,@tag)=@_;
   1.862 +	my $struct = $self->get_path($self->get_tag_name(@tag));
   1.863 +	if ($self->get_translate_options($struct) =~ m/i/) {
   1.864 +		return 0;
   1.865 +	} else {
   1.866 +		return 1;
   1.867 +	}
   1.868 +}
   1.869 +
   1.870 +sub tag_trans_alone {
   1.871 +	my ($self,@tag)=@_;
   1.872 +	my $name = $self->get_tag_name(@tag);
   1.873 +	push @path, $name;
   1.874 +
   1.875 +	$name = $self->treat_attributes(@tag);
   1.876 +
   1.877 +	pop @path;
   1.878 +	return $name;
   1.879 +}
   1.880 +
   1.881 +sub tag_break_open {
   1.882 +	my ($self,@tag)=@_;
   1.883 +	my $struct = $self->get_path($self->get_tag_name(@tag));
   1.884 +	my $options = $self->get_translate_options($struct);
   1.885 +	if ($options =~ m/[ip]/) {
   1.886 +		return 0;
   1.887 +	} else {
   1.888 +		return 1;
   1.889 +	}
   1.890 +}
   1.891 +
   1.892 +sub tag_trans_open {
   1.893 +	my ($self,@tag)=@_;
   1.894 +	my $name = $self->get_tag_name(@tag);
   1.895 +	push @path, $name;
   1.896 +
   1.897 +	$name = $self->treat_attributes(@tag);
   1.898 +
   1.899 +	return $name;
   1.900 +}
   1.901 +
   1.902 +##### END of Generic XML tag types #####
   1.903 +
   1.904 +=head1 INTERNAL FUNCTIONS used to write derivated parsers
   1.905 +
   1.906 +=head2 WORKING WITH TAGS
   1.907 +
   1.908 +=over 4
   1.909 +
   1.910 +=item get_path()
   1.911 +
   1.912 +This function returns the path to the current tag from the document's root,
   1.913 +in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>.
   1.914 +
   1.915 +An additional array of tags (without brackets) can be passed in argument.
   1.916 +These path elements are added to the end of the current path.
   1.917 +
   1.918 +=cut
   1.919 +
   1.920 +sub get_path {
   1.921 +	my $self = shift;
   1.922 +	my @add = @_;
   1.923 +	if ( @path > 0 or @add > 0 ) {
   1.924 +		return "<".join("><",@path,@add).">";
   1.925 +	} else {
   1.926 +		return "outside any tag (error?)";
   1.927 +	}
   1.928 +}
   1.929 +
   1.930 +=item tag_type()
   1.931 +
   1.932 +This function returns the index from the tag_types list that fits to the next
   1.933 +tag in the input stream, or -1 if it's at the end of the input file.
   1.934 +
   1.935 +=cut
   1.936 +
   1.937 +sub tag_type {
   1.938 +	my $self = shift;
   1.939 +	my ($line,$ref) = $self->shiftline();
   1.940 +	my ($match1,$match2);
   1.941 +	my $found = 0;
   1.942 +	my $i = 0;
   1.943 +
   1.944 +	if (!defined($line)) { return -1; }
   1.945 +
   1.946 +	$self->unshiftline($line,$ref);
   1.947 +	my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1});
   1.948 +	my $line2 = $self->join_lines(@lines);
   1.949 +	while (!$found && $i < @tag_types) {
   1.950 +		($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end});
   1.951 +		if ($line =~ /^<\Q$match1\E/) {
   1.952 +			if (!defined($tag_types[$i]->{f_extract})) {
   1.953 +#print substr($line2,length($line2)-1-length($match2),1+length($match2))."\n";
   1.954 +				if (defined($line2) and $line2 =~ /\Q$match2\E>$/) {
   1.955 +					$found = 1;
   1.956 +#print "YES: <".$match1." ".$match2.">\n";
   1.957 +				} else {
   1.958 +#print "NO: <".$match1." ".$match2.">\n";
   1.959 +					$i++;
   1.960 +				}
   1.961 +			} else {
   1.962 +				$found = 1;
   1.963 +			}
   1.964 +		} else {
   1.965 +			$i++;
   1.966 +		}
   1.967 +	}
   1.968 +	if (!$found) {
   1.969 +		#It should never enter here, unless you undefine the most
   1.970 +		#general tags (as <...>)
   1.971 +		die "po4a::xml: Unknown tag type: ".$line."\n";
   1.972 +	} else {
   1.973 +		return $i;
   1.974 +	}
   1.975 +}
   1.976 +
   1.977 +=item extract_tag($$)
   1.978 +
   1.979 +This function returns the next tag from the input stream without the beginning
   1.980 +and end, in an array form, to maintain the references from the input file.  It
   1.981 +has two parameters: the type of the tag (as returned by tag_type) and a
   1.982 +boolean, that indicates if it should be removed from the input stream.
   1.983 +
   1.984 +=cut
   1.985 +
   1.986 +sub extract_tag {
   1.987 +	my ($self,$type,$remove) = (shift,shift,shift);
   1.988 +	my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
   1.989 +	my ($eof,@tag);
   1.990 +	if (defined($tag_types[$type]->{f_extract})) {
   1.991 +		($eof,@tag) = &{$tag_types[$type]->{f_extract}}($self,$remove);
   1.992 +	} else {
   1.993 +		($eof,@tag) = $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1});
   1.994 +	}
   1.995 +	$tag[0] =~ /^<\Q$match1\E(.*)$/s;
   1.996 +	$tag[0] = $1;
   1.997 +	$tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s;
   1.998 +	$tag[$#tag-1] = $1;
   1.999 +	return ($eof,@tag);
  1.1000 +}
  1.1001 +
  1.1002 +=item get_tag_name(@)
  1.1003 +
  1.1004 +This function returns the name of the tag passed as an argument, in the array
  1.1005 +form returned by extract_tag.
  1.1006 +
  1.1007 +=cut
  1.1008 +
  1.1009 +sub get_tag_name {
  1.1010 +	my ($self,@tag)=@_;
  1.1011 +	$tag[0] =~ /^(\S*)/;
  1.1012 +	return $1;
  1.1013 +}
  1.1014 +
  1.1015 +=item breaking_tag()
  1.1016 +
  1.1017 +This function returns a boolean that says if the next tag in the input stream
  1.1018 +is a breaking tag or not (inline tag).  It leaves the input stream intact.
  1.1019 +
  1.1020 +=cut
  1.1021 +
  1.1022 +sub breaking_tag {
  1.1023 +	my $self = shift;
  1.1024 +	my $break;
  1.1025 +
  1.1026 +	my $type = $self->tag_type;
  1.1027 +	if ($type == -1) { return 0; }
  1.1028 +
  1.1029 +#print "TAG TYPE = ".$type."\n";
  1.1030 +	$break = $tag_types[$type]->{breaking};
  1.1031 +	if (!defined($break)) {
  1.1032 +		# This tag's breaking depends on its content
  1.1033 +		my ($eof,@lines) = $self->extract_tag($type,0);
  1.1034 +		$break = &{$tag_types[$type]->{f_breaking}}($self,@lines);
  1.1035 +	}
  1.1036 +#print "break = ".$break."\n";
  1.1037 +	return $break;
  1.1038 +}
  1.1039 +
  1.1040 +=item treat_tag()
  1.1041 +
  1.1042 +This function translates the next tag from the input stream.  Using each
  1.1043 +tag type's custom translation functions.
  1.1044 +
  1.1045 +=cut
  1.1046 +
  1.1047 +sub treat_tag {
  1.1048 +	my $self = shift;
  1.1049 +	my $type = $self->tag_type;
  1.1050 +
  1.1051 +	my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
  1.1052 +	my ($eof,@lines) = $self->extract_tag($type,1);
  1.1053 +
  1.1054 +	$lines[0] =~ /^(\s*)(.*)$/s;
  1.1055 +	my $space1 = $1;
  1.1056 +	$lines[0] = $2;
  1.1057 +	$lines[$#lines-1] =~ /^(.*?)(\s*)$/s;
  1.1058 +	my $space2 = $2;
  1.1059 +	$lines[$#lines-1] = $1;
  1.1060 +
  1.1061 +	# Calling this tag type's specific handling (translation of
  1.1062 +	# attributes...)
  1.1063 +	my $line = &{$tag_types[$type]->{f_translate}}($self,@lines);
  1.1064 +	$self->pushline("<".$match1.$space1.$line.$space2.$match2.">");
  1.1065 +	return $eof;
  1.1066 +}
  1.1067 +
  1.1068 +=item tag_in_list($@)
  1.1069 +
  1.1070 +This function returns a string value that says if the first argument (a tag
  1.1071 +hierarchy) matches any of the tags from the second argument (a list of tags
  1.1072 +or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the
  1.1073 +matched tag's options (the characters in front of the tag) or 1 (if that tag
  1.1074 +doesn't have options).
  1.1075 +
  1.1076 +=back
  1.1077 +
  1.1078 +=cut
  1.1079 +sub tag_in_list ($$$) {
  1.1080 +	my ($self,$path,$list) = @_;
  1.1081 +	if ($self->{options}{'caseinsensitive'}) {
  1.1082 +		$path = lc $path;
  1.1083 +	}
  1.1084 +
  1.1085 +	while (1) {
  1.1086 +		if (defined $list->{$path}) {
  1.1087 +			if (length $list->{$path}) {
  1.1088 +				return $list->{$path};
  1.1089 +			} else {
  1.1090 +				return 1;
  1.1091 +			}
  1.1092 +		}
  1.1093 +		last unless ($path =~ m/</);
  1.1094 +		$path =~ s/^<.*?>//;
  1.1095 +	} 
  1.1096 +
  1.1097 +	return 0;
  1.1098 +}
  1.1099 +
  1.1100 +=head2 WORKING WITH ATTRIBUTES
  1.1101 +
  1.1102 +=over 4
  1.1103 +
  1.1104 +=item treat_attributes(@)
  1.1105 +
  1.1106 +This function handles the translation of the tags' attributes. It receives the tag
  1.1107 +without the beginning / end marks, and then it finds the attributes, and it
  1.1108 +translates the translatable ones (specified by the module option "attributes").
  1.1109 +This returns a plain string with the translated tag.
  1.1110 +
  1.1111 +=back
  1.1112 +
  1.1113 +=cut
  1.1114 +
  1.1115 +sub treat_attributes {
  1.1116 +	my ($self,@tag)=@_;
  1.1117 +
  1.1118 +	$tag[0] =~ /^(\S*)(.*)/s;
  1.1119 +	my $text = $1;
  1.1120 +	$tag[0] = $2;
  1.1121 +
  1.1122 +	while (@tag) {
  1.1123 +		my $complete = 1;
  1.1124 +
  1.1125 +		$text .= $self->skip_spaces(\@tag);
  1.1126 +		if (@tag) {
  1.1127 +			# Get the attribute's name
  1.1128 +			$complete = 0;
  1.1129 +
  1.1130 +			$tag[0] =~ /^([^\s=]+)(.*)/s;
  1.1131 +			my $name = $1;
  1.1132 +			my $ref = $tag[1];
  1.1133 +			$tag[0] = $2;
  1.1134 +			$text .= $name;
  1.1135 +			$text .= $self->skip_spaces(\@tag);
  1.1136 +			if (@tag) {
  1.1137 +				# Get the '='
  1.1138 +				if ($tag[0] =~ /^=(.*)/s) {
  1.1139 +					$tag[0] = $1;
  1.1140 +					$text .= "=";
  1.1141 +					$text .= $self->skip_spaces(\@tag);
  1.1142 +					if (@tag) {
  1.1143 +						# Get the value
  1.1144 +						my $value="";
  1.1145 +						$ref=$tag[1];
  1.1146 +						my $quot=substr($tag[0],0,1);
  1.1147 +						if ($quot ne "\"" and $quot ne "'") {
  1.1148 +							# Unquoted value
  1.1149 +							$quot="";
  1.1150 +							$tag[0] =~ /^(\S+)(.*)/s;
  1.1151 +							$value = $1;
  1.1152 +							$tag[0] = $2;
  1.1153 +						} else {
  1.1154 +							# Quoted value
  1.1155 +							$text .= $quot;
  1.1156 +							$tag[0] =~ /^\Q$quot\E(.*)/s;
  1.1157 +							$tag[0] = $1;
  1.1158 +							while ($tag[0] !~ /\Q$quot\E/) {
  1.1159 +								$value .= $tag[0];
  1.1160 +								shift @tag;
  1.1161 +								shift @tag;
  1.1162 +							}
  1.1163 +							$tag[0] =~ /^(.*?)\Q$quot\E(.*)/s;
  1.1164 +							$value .= $1;
  1.1165 +							$tag[0] = $2;
  1.1166 +						}
  1.1167 +						$complete = 1;
  1.1168 +						if ($self->tag_in_list($self->get_path.$name,$self->{attributes})) {
  1.1169 +							$text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name });
  1.1170 +						} else {
  1.1171 +							print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value)
  1.1172 +							       if $self->debug();
  1.1173 +							$text .= $self->recode_skipped_text($value);
  1.1174 +						}
  1.1175 +						$text .= $quot;
  1.1176 +					}
  1.1177 +				}
  1.1178 +			}
  1.1179 +          
  1.1180 +			unless ($complete) {
  1.1181 +				my $ontagerror = $self->{options}{'ontagerror'};
  1.1182 +				if ($ontagerror eq "warn") {
  1.1183 +					warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax.  Continuing..."));
  1.1184 +				} elsif ($ontagerror ne "silent") {
  1.1185 +					die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax"));
  1.1186 +				}
  1.1187 +			}
  1.1188 +		}
  1.1189 +	}
  1.1190 +	return $text;
  1.1191 +}
  1.1192 +
  1.1193 +# Returns an empty string if the content in the $path should not be
  1.1194 +# translated.
  1.1195 +#
  1.1196 +# Otherwise, returns the set of options for translation:
  1.1197 +#   w: the content shall be re-wrapped
  1.1198 +#   W: the content shall not be re-wrapped
  1.1199 +#   i: the tag shall be inlined
  1.1200 +#   p: a placeholder shall replace the tag (and its content)
  1.1201 +#
  1.1202 +# A translatable inline tag in an untranslated tag is treated as a translatable breaking tag.
  1.1203 +my %translate_options_cache;
  1.1204 +sub get_translate_options {
  1.1205 +	my $self = shift;
  1.1206 +	my $path = shift;
  1.1207 +
  1.1208 +	if (defined $translate_options_cache{$path}) {
  1.1209 +		return $translate_options_cache{$path};
  1.1210 +	}
  1.1211 +
  1.1212 +	my $options = "";
  1.1213 +	my $translate = 0;
  1.1214 +	my $usedefault = 1;
  1.1215 +
  1.1216 +	my $inlist = 0;
  1.1217 +	my $tag = $self->get_tag_from_list($path, $self->{tags});
  1.1218 +	if (defined $tag) {
  1.1219 +		$inlist = 1;
  1.1220 +	}
  1.1221 +	if ($self->{options}{'tagsonly'} eq $inlist) {
  1.1222 +		$usedefault = 0;
  1.1223 +		if (defined $tag) {
  1.1224 +			$options = $tag;
  1.1225 +			$options =~ s/<.*$//;
  1.1226 +		} else {
  1.1227 +			if ($self->{options}{'wrap'}) {
  1.1228 +				$options = "w";
  1.1229 +			} else {
  1.1230 +				$options = "W";
  1.1231 +			}
  1.1232 +		}
  1.1233 +		$translate = 1;
  1.1234 +	}
  1.1235 +
  1.1236 +# TODO: a less precise set of tags should not override a more precise one
  1.1237 +	# The tags and tagsonly options are deprecated.
  1.1238 +	# The translated and untranslated options have an higher priority.
  1.1239 +	$tag = $self->get_tag_from_list($path, $self->{translated});
  1.1240 +	if (defined $tag) {
  1.1241 +		$usedefault = 0;
  1.1242 +		$options = $tag;
  1.1243 +		$options =~ s/<.*$//;
  1.1244 +		$translate = 1;
  1.1245 +	}
  1.1246 +
  1.1247 +	if ($translate and $options !~ m/w/i) {
  1.1248 +		$options .= ($self->{options}{'wrap'})?"w":"W";
  1.1249 +	}
  1.1250 +
  1.1251 +	if (not defined $tag) {
  1.1252 +		$tag = $self->get_tag_from_list($path, $self->{untranslated});
  1.1253 +		if (defined $tag) {
  1.1254 +			$usedefault = 0;
  1.1255 +			$options = "";
  1.1256 +			$translate = 0;
  1.1257 +		}
  1.1258 +	}
  1.1259 +
  1.1260 +	$tag = $self->get_tag_from_list($path, $self->{inline});
  1.1261 +	if (defined $tag) {
  1.1262 +		$usedefault = 0;
  1.1263 +		$options .= "i";
  1.1264 +	} else {
  1.1265 +		$tag = $self->get_tag_from_list($path, $self->{placeholder});
  1.1266 +		if (defined $tag) {
  1.1267 +			$usedefault = 0;
  1.1268 +			$options .= "p";
  1.1269 +		}
  1.1270 +	}
  1.1271 +
  1.1272 +	if ($usedefault) {
  1.1273 +		$options = $self->{options}{'defaulttranslateoption'};
  1.1274 +	}
  1.1275 +
  1.1276 +	# A translatable inline tag in an untranslated tag is treated as a
  1.1277 +	# translatable breaking tag.
  1.1278 +	if ($options =~ m/i/) {
  1.1279 +		my $ppath = $path;
  1.1280 +		$ppath =~ s/<[^>]*>$//;
  1.1281 +		my $poptions = $self->get_translate_options ($ppath);
  1.1282 +		if ($poptions eq "") {
  1.1283 +			$options =~ s/i//;
  1.1284 +		}
  1.1285 +	}
  1.1286 +
  1.1287 +	if ($options =~ m/i/ and $self->{options}{'foldattributes'}) {
  1.1288 +		$options .= "f";
  1.1289 +	}
  1.1290 +
  1.1291 +	$translate_options_cache{$path} = $options;
  1.1292 +	return $options;
  1.1293 +}
  1.1294 +
  1.1295 +
  1.1296 +# Return the tag (or biggest set of tags) of a list which matches with the
  1.1297 +# given path.
  1.1298 +#
  1.1299 +# The tag (or set of tags) is returned with its options.
  1.1300 +#
  1.1301 +# If no tags could match the path, undef is returned.
  1.1302 +sub get_tag_from_list ($$$) {
  1.1303 +	my ($self,$path,$list) = @_;
  1.1304 +	if ($self->{options}{'caseinsensitive'}) {
  1.1305 +		$path = lc $path;
  1.1306 +	}
  1.1307 +
  1.1308 +	while (1) {
  1.1309 +		if (defined $list->{$path}) {
  1.1310 +			return $list->{$path}.$path;
  1.1311 +		}
  1.1312 +		last unless ($path =~ m/</);
  1.1313 +		$path =~ s/^<.*?>//;
  1.1314 +	}
  1.1315 +
  1.1316 +	return undef;
  1.1317 +}
  1.1318 +
  1.1319 +
  1.1320 +
  1.1321 +sub treat_content {
  1.1322 +	my $self = shift;
  1.1323 +	my $blank="";
  1.1324 +	# Indicates if the paragraph will have to be translated
  1.1325 +	my $translate = "";
  1.1326 +
  1.1327 +	my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1});
  1.1328 +
  1.1329 +	while (!$eof and !$self->breaking_tag) {
  1.1330 +	NEXT_TAG:
  1.1331 +		my @text;
  1.1332 +		my $type = $self->tag_type;
  1.1333 +		my $f_extract = $tag_types[$type]->{'f_extract'};
  1.1334 +		if (    defined($f_extract)
  1.1335 +		    and $f_extract eq \&tag_extract_comment) {
  1.1336 +			# Remove the content of the comments
  1.1337 +			($eof, @text) = $self->extract_tag($type,1);
  1.1338 +			$text[$#text-1] .= "\0";
  1.1339 +			if ($tag_types[$type]->{'beginning'} eq "!--#") {
  1.1340 +				$text[0] = "#".$text[0];
  1.1341 +			}
  1.1342 +			push @comments, @text;
  1.1343 +		} else {
  1.1344 +			my ($tmpeof, @tag) = $self->extract_tag($type,0);
  1.1345 +			# Append the found inline tag
  1.1346 +			($eof,@text)=$self->get_string_until('>',
  1.1347 +			                                     {include=>1,
  1.1348 +			                                      remove=>1,
  1.1349 +			                                      unquoted=>1});
  1.1350 +			# Append or remove the opening/closing tag from
  1.1351 +			# the tag path
  1.1352 +			if ($tag_types[$type]->{'end'} eq "") {
  1.1353 +				if ($tag_types[$type]->{'beginning'} eq "") {
  1.1354 +					# Opening inline tag
  1.1355 +					my $cur_tag_name = $self->get_tag_name(@tag);
  1.1356 +					my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name));
  1.1357 +					if ($t_opts =~ m/p/) {
  1.1358 +						# We enter a new holder.
  1.1359 +						# Append a <placeholder ...> tag to the current
  1.1360 +						# paragraph, and save the @paragraph in the
  1.1361 +						# current holder.
  1.1362 +						my $last_holder = $save_holders[$#save_holders];
  1.1363 +						my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>";
  1.1364 +						push @paragraph, ($placeholder_str, $text[1]);
  1.1365 +						my @saved_paragraph = @paragraph;
  1.1366 +
  1.1367 +						$last_holder->{'paragraph'} = \@saved_paragraph;
  1.1368 +
  1.1369 +						# Then we must push a new holder
  1.1370 +						my @new_paragraph = ();
  1.1371 +						my @sub_translations = ();
  1.1372 +						my %folded_attributes;
  1.1373 +						my %new_holder = ('paragraph' => \@new_paragraph,
  1.1374 +						                  'open' => $text[0],
  1.1375 +						                  'translation' => "",
  1.1376 +						                  'close' => undef,
  1.1377 +						                  'sub_translations' => \@sub_translations,
  1.1378 +						                  'folded_attributes' => \%folded_attributes);
  1.1379 +						push @save_holders, \%new_holder;
  1.1380 +						@text = ();
  1.1381 +
  1.1382 +						# The current @paragraph
  1.1383 +						# (for the current holder)
  1.1384 +						# is empty.
  1.1385 +						@paragraph = ();
  1.1386 +					} elsif ($t_opts =~ m/f/) {
  1.1387 +						my $tag_full = $self->join_lines(@text);
  1.1388 +						my $tag_ref = $text[1];
  1.1389 +						if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) {
  1.1390 +							my $holder = $save_holders[$#save_holders];
  1.1391 +							my $id = 0;
  1.1392 +							foreach (keys %{$holder->{folded_attributes}}) {
  1.1393 +								$id = $_ + 1 if ($_ >= $id);
  1.1394 +							}
  1.1395 +							$holder->{folded_attributes}->{$id} = $tag_full;
  1.1396 +
  1.1397 +							@text = ("<$cur_tag_name po4a-id=$id>", $tag_ref);
  1.1398 +						}
  1.1399 +					}
  1.1400 +					push @path, $cur_tag_name;
  1.1401 +				} elsif ($tag_types[$type]->{'beginning'} eq "/") {
  1.1402 +					# Closing inline tag
  1.1403 +
  1.1404 +					# Check if this is closing the
  1.1405 +					# last opening tag we detected.
  1.1406 +					my $test = pop @path;
  1.1407 +					my $name = $self->get_tag_name(@tag);
  1.1408 +					if (!defined($test) ||
  1.1409 +					    $test ne $name ) {
  1.1410 +						my $ontagerror = $self->{options}{'ontagerror'};
  1.1411 +						if ($ontagerror eq "warn") {
  1.1412 +							warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong.  Continuing..."), $name);
  1.1413 +						} elsif ($ontagerror ne "silent") {
  1.1414 +							die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
  1.1415 +						}
  1.1416 +					}
  1.1417 +
  1.1418 +					if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) {
  1.1419 +						# This closes the current holder.
  1.1420 +
  1.1421 +						push @path, $self->get_tag_name(@tag);
  1.1422 +						# Now translate this paragraph if needed.
  1.1423 +						# This will call pushline and append the
  1.1424 +						# translation to the current holder's translation.
  1.1425 +						$self->translate_paragraph(@paragraph);
  1.1426 +						pop @path;
  1.1427 +
  1.1428 +						# Now that this holder is closed, we can remove
  1.1429 +						# the holder from the stack.
  1.1430 +						my $holder = pop @save_holders;
  1.1431 +						# We need to keep the translation of this holder
  1.1432 +						my $translation = $holder->{'open'}.$holder->{'translation'}.$text[0];
  1.1433 +						# FIXME: @text could be multilines.
  1.1434 +
  1.1435 +						@text = ();
  1.1436 +
  1.1437 +						# Then we store the translation in the previous
  1.1438 +						# holder's sub_translations array
  1.1439 +						my $previous_holder = $save_holders[$#save_holders];
  1.1440 +						push @{$previous_holder->{'sub_translations'}}, $translation;
  1.1441 +						# We also need to restore the @paragraph array, as
  1.1442 +						# it was before we encountered the holder.
  1.1443 +						@paragraph = @{$previous_holder->{'paragraph'}};
  1.1444 +					}
  1.1445 +				}
  1.1446 +			}
  1.1447 +			push @paragraph, @text;
  1.1448 +		}
  1.1449 +
  1.1450 +		# Next tag
  1.1451 +		($eof,@text)=$self->get_string_until('<',{remove=>1});
  1.1452 +		if ($#text > 0) {
  1.1453 +			# Check if text (extracted after the inline tag)
  1.1454 +			# has to be translated
  1.1455 +			push @paragraph, @text;
  1.1456 +		}
  1.1457 +	}
  1.1458 +
  1.1459 +	# This strips the extracted strings
  1.1460 +	# (only if you don't specify the 'nostrip' option, and if the
  1.1461 +	# paragraph can be re-wrapped)
  1.1462 +	$translate = $self->get_translate_options($self->get_path);
  1.1463 +	if (!$self->{options}{'nostrip'} and $translate !~ m/W/) {
  1.1464 +		my $clean = 0;
  1.1465 +		# Clean the beginning
  1.1466 +		while (!$clean and $#paragraph > 0) {
  1.1467 +			$paragraph[0] =~ /^(\s*)(.*)/s;
  1.1468 +			my $match = $1;
  1.1469 +			if ($paragraph[0] eq $match) {
  1.1470 +				if ($match ne "") {
  1.1471 +					$self->pushline($match);
  1.1472 +				}
  1.1473 +				shift @paragraph;
  1.1474 +				shift @paragraph;
  1.1475 +			} else {
  1.1476 +				$paragraph[0] = $2;
  1.1477 +				if ($match ne "") {
  1.1478 +					$self->pushline($match);
  1.1479 +				}
  1.1480 +				$clean = 1;
  1.1481 +			}
  1.1482 +		}
  1.1483 +		$clean = 0;
  1.1484 +		# Clean the end
  1.1485 +		while (!$clean and $#paragraph > 0) {
  1.1486 +			$paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s;
  1.1487 +			my $match = $2;
  1.1488 +			if ($paragraph[$#paragraph-1] eq $match) {
  1.1489 +				if ($match ne "") {
  1.1490 +					$blank = $match.$blank;
  1.1491 +				}
  1.1492 +				pop @paragraph;
  1.1493 +				pop @paragraph;
  1.1494 +			} else {
  1.1495 +				$paragraph[$#paragraph-1] = $1;
  1.1496 +				if ($match ne "") {
  1.1497 +					$blank = $match.$blank;
  1.1498 +				}
  1.1499 +				$clean = 1;
  1.1500 +			}
  1.1501 +		}
  1.1502 +	}
  1.1503 +
  1.1504 +	# Translate the string when needed
  1.1505 +	# This will either push the translation in the translated document or
  1.1506 +	# in the current holder translation.
  1.1507 +	$self->translate_paragraph(@paragraph);
  1.1508 +
  1.1509 +	# Push the trailing blanks
  1.1510 +	if ($blank ne "") {
  1.1511 +		$self->pushline($blank);
  1.1512 +	}
  1.1513 +	return $eof;
  1.1514 +}
  1.1515 +
  1.1516 +# Translate a @paragraph array of (string, reference).
  1.1517 +# The $translate argument indicates if the strings must be translated or
  1.1518 +# just pushed
  1.1519 +sub translate_paragraph {
  1.1520 +	my $self = shift;
  1.1521 +	my @paragraph = @_;
  1.1522 +	my $translate = $self->get_translate_options($self->get_path);
  1.1523 +
  1.1524 +	while (    (scalar @paragraph)
  1.1525 +	       and ($paragraph[0] =~ m/^\s*\n/s)) {
  1.1526 +		$self->pushline($paragraph[0]);
  1.1527 +		shift @paragraph;
  1.1528 +		shift @paragraph;
  1.1529 +	}
  1.1530 +
  1.1531 +	my $comments;
  1.1532 +	while (@comments) {
  1.1533 +		my ($comment,$eoc);
  1.1534 +		do {
  1.1535 +			my ($t,$l) = (shift @comments, shift @comments);
  1.1536 +			$t =~ s/\n?(\0)?$//;
  1.1537 +			$eoc = $1;
  1.1538 +			$comment .= "\n" if defined $comment;
  1.1539 +			$comment .= $t;
  1.1540 +		} until ($eoc);
  1.1541 +		$comments .= "\n" if defined $comments;
  1.1542 +		$comments .= $comment;
  1.1543 +		$self->pushline("<!--".$comment."-->\n") if defined $comment;
  1.1544 +	}
  1.1545 +	@comments = ();
  1.1546 +
  1.1547 +	if ($self->{options}{'cpp'}) {
  1.1548 +		my @tmp = @paragraph;
  1.1549 +		@paragraph = ();
  1.1550 +		while (@tmp) {
  1.1551 +			my ($t,$l) = (shift @tmp, shift @tmp);
  1.1552 +			# #include can be followed by a filename between
  1.1553 +			# <> brackets. In that case, the argument won't be
  1.1554 +			# handled in the same call to translate_paragraph.
  1.1555 +			# Thus do not try to match "include ".
  1.1556 +			if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) {
  1.1557 +				if (@paragraph) {
  1.1558 +					$self->translate_paragraph(@paragraph);
  1.1559 +					@paragraph = ();
  1.1560 +					$self->pushline("\n");
  1.1561 +				}
  1.1562 +				$self->pushline($t);
  1.1563 +			} else {
  1.1564 +				push @paragraph, ($t,$l);
  1.1565 +			}
  1.1566 +		}
  1.1567 +	}
  1.1568 +
  1.1569 +	my $para = $self->join_lines(@paragraph);
  1.1570 +	if ( length($para) > 0 ) {
  1.1571 +		if ($translate ne "") {
  1.1572 +			# This tag should be translated
  1.1573 +			$self->pushline($self->found_string(
  1.1574 +				$para,
  1.1575 +				$paragraph[1], {
  1.1576 +					type=>"tag",
  1.1577 +					tag_options=>$translate,
  1.1578 +					comments=>$comments
  1.1579 +				}));
  1.1580 +		} else {
  1.1581 +			# Inform that this tag isn't translated in debug mode
  1.1582 +			print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para)
  1.1583 +			       if $self->debug();
  1.1584 +			$self->pushline($self->recode_skipped_text($para));
  1.1585 +		}
  1.1586 +	}
  1.1587 +	# Now the paragraph is fully translated.
  1.1588 +	# If we have all the holders' translation, we can replace the
  1.1589 +	# placeholders by their translations.
  1.1590 +	# We must wait to have all the translations because the holders are
  1.1591 +	# numbered.
  1.1592 +	{
  1.1593 +		my $holder = $save_holders[$#save_holders];
  1.1594 +		my $translation = $holder->{'translation'};
  1.1595 +
  1.1596 +		# Count the number of <placeholder ...> in $translation
  1.1597 +		my $count = 0;
  1.1598 +		my $str = $translation;
  1.1599 +		while (    (defined $str)
  1.1600 +		       and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) {
  1.1601 +			$count += 1;
  1.1602 +			$str = $2;
  1.1603 +			if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) {
  1.1604 +				$count = -1;
  1.1605 +				last;
  1.1606 +			}
  1.1607 +		}
  1.1608 +
  1.1609 +		if (    (defined $translation)
  1.1610 +		    and (scalar(@{$holder->{'sub_translations'}}) == $count)) {
  1.1611 +			# OK, all the holders of the current paragraph are
  1.1612 +			# closed (and translated).
  1.1613 +			# Replace them by their translation.
  1.1614 +			while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) {
  1.1615 +				# FIXME: we could also check that
  1.1616 +				#          * the holder exists
  1.1617 +				#          * all the holders are used
  1.1618 +				$translation = $1.$holder->{'sub_translations'}->[$2].$3;
  1.1619 +			}
  1.1620 +			# We have our translation
  1.1621 +			$holder->{'translation'} = $translation;
  1.1622 +			# And there is no need for any holder in it.
  1.1623 +			my @sub_translations = ();
  1.1624 +			$holder->{'sub_translations'} = \@sub_translations;
  1.1625 +		}
  1.1626 +	}
  1.1627 +
  1.1628 +}
  1.1629 +
  1.1630 +
  1.1631 +
  1.1632 +=head2 WORKING WITH THE MODULE OPTIONS
  1.1633 +
  1.1634 +=over 4
  1.1635 +
  1.1636 +=item treat_options()
  1.1637 +
  1.1638 +This function fills the internal structures that contain the tags, attributes
  1.1639 +and inline data with the options of the module (specified in the command-line
  1.1640 +or in the initialize function).
  1.1641 +
  1.1642 +=back
  1.1643 +
  1.1644 +=cut
  1.1645 +
  1.1646 +sub treat_options {
  1.1647 +	my $self = shift;
  1.1648 +
  1.1649 +	if ($self->{options}{'caseinsensitive'}) {
  1.1650 +		$self->{options}{'nodefault'}             = lc $self->{options}{'nodefault'};
  1.1651 +		$self->{options}{'tags'}                  = lc $self->{options}{'tags'};
  1.1652 +		$self->{options}{'break'}                 = lc $self->{options}{'break'};
  1.1653 +		$self->{options}{'_default_break'}        = lc $self->{options}{'_default_break'};
  1.1654 +		$self->{options}{'translated'}            = lc $self->{options}{'translated'};
  1.1655 +		$self->{options}{'_default_translated'}   = lc $self->{options}{'_default_translated'};
  1.1656 +		$self->{options}{'untranslated'}          = lc $self->{options}{'untranslated'};
  1.1657 +		$self->{options}{'_default_untranslated'} = lc $self->{options}{'_default_untranslated'};
  1.1658 +		$self->{options}{'attributes'}            = lc $self->{options}{'attributes'};
  1.1659 +		$self->{options}{'_default_attributes'}   = lc $self->{options}{'_default_attributes'};
  1.1660 +		$self->{options}{'inline'}                = lc $self->{options}{'inline'};
  1.1661 +		$self->{options}{'_default_inline'}       = lc $self->{options}{'_default_inline'};
  1.1662 +		$self->{options}{'placeholder'}           = lc $self->{options}{'placeholder'};
  1.1663 +		$self->{options}{'_default_placeholder'}  = lc $self->{options}{'_default_placeholder'};
  1.1664 +	}
  1.1665 +
  1.1666 +	$self->{options}{'nodefault'} =~ /^\s*(.*)\s*$/s;
  1.1667 +	my %list_nodefault;
  1.1668 +	foreach (split(/\s+/s,$1)) {
  1.1669 +		$list_nodefault{$_} = 1;
  1.1670 +	}
  1.1671 +	$self->{nodefault} = \%list_nodefault;
  1.1672 +
  1.1673 +	$self->{options}{'tags'} =~ /^\s*(.*)\s*$/s;
  1.1674 +	if (length $self->{options}{'tags'}) {
  1.1675 +		warn wrap_mod("po4a::xml",
  1.1676 +		             dgettext("po4a",
  1.1677 +		                      "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags");
  1.1678 +	}
  1.1679 +	foreach (split(/\s+/s,$1)) {
  1.1680 +		$_ =~ m/^(.*?)(<.*)$/;
  1.1681 +		$self->{tags}->{$2} = $1 || "";
  1.1682 +	}
  1.1683 +
  1.1684 +	if ($self->{options}{'tagsonly'}) {
  1.1685 +		warn wrap_mod("po4a::xml",
  1.1686 +		             dgettext("po4a",
  1.1687 +		                      "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly");
  1.1688 +	}
  1.1689 +
  1.1690 +	$self->{options}{'break'} =~ /^\s*(.*)\s*$/s;
  1.1691 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1692 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1693 +		$self->{break}->{$2} = $1 || "";
  1.1694 +	}
  1.1695 +	$self->{options}{'_default_break'} =~ /^\s*(.*)\s*$/s;
  1.1696 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1697 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1698 +		$self->{break}->{$2} = $1 || ""
  1.1699 +			unless    $list_nodefault{$2}
  1.1700 +			       or defined $self->{break}->{$2};
  1.1701 +	}
  1.1702 +
  1.1703 +	$self->{options}{'translated'} =~ /^\s*(.*)\s*$/s;
  1.1704 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1705 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1706 +		$self->{translated}->{$2} = $1 || "";
  1.1707 +	}
  1.1708 +	$self->{options}{'_default_translated'} =~ /^\s*(.*)\s*$/s;
  1.1709 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1710 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1711 +		$self->{translated}->{$2} = $1 || ""
  1.1712 +			unless    $list_nodefault{$2}
  1.1713 +			       or defined $self->{translated}->{$2};
  1.1714 +	}
  1.1715 +
  1.1716 +	$self->{options}{'untranslated'} =~ /^\s*(.*)\s*$/s;
  1.1717 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1718 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1719 +		$self->{untranslated}->{$2} = $1 || "";
  1.1720 +	}
  1.1721 +	$self->{options}{'_default_untranslated'} =~ /^\s*(.*)\s*$/s;
  1.1722 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1723 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1724 +		$self->{untranslated}->{$2} = $1 || ""
  1.1725 +			unless    $list_nodefault{$2}
  1.1726 +			       or defined $self->{untranslated}->{$2};
  1.1727 +	}
  1.1728 +
  1.1729 +	$self->{options}{'attributes'} =~ /^\s*(.*)\s*$/s;
  1.1730 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1731 +		if ($tag =~ m/^(.*?)(<.*)$/) {
  1.1732 +			$self->{attributes}->{$2} = $1 || "";
  1.1733 +		} else {
  1.1734 +			$self->{attributes}->{$tag} = "";
  1.1735 +		}
  1.1736 +	}
  1.1737 +	$self->{options}{'_default_attributes'} =~ /^\s*(.*)\s*$/s;
  1.1738 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1739 +		if ($tag =~ m/^(.*?)(<.*)$/) {
  1.1740 +			$self->{attributes}->{$2} = $1 || ""
  1.1741 +				unless    $list_nodefault{$2}
  1.1742 +				       or defined $self->{attributes}->{$2};
  1.1743 +		} else {
  1.1744 +			$self->{attributes}->{$tag} = ""
  1.1745 +				unless    $list_nodefault{$tag}
  1.1746 +				       or defined $self->{attributes}->{$tag};
  1.1747 +		}
  1.1748 +	}
  1.1749 +
  1.1750 +	my @list_inline;
  1.1751 +	$self->{options}{'inline'} =~ /^\s*(.*)\s*$/s;
  1.1752 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1753 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1754 +		$self->{inline}->{$2} = $1 || "";
  1.1755 +	}
  1.1756 +	$self->{options}{'_default_inline'} =~ /^\s*(.*)\s*$/s;
  1.1757 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1758 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1759 +		$self->{inline}->{$2} = $1 || ""
  1.1760 +			unless    $list_nodefault{$2}
  1.1761 +			       or defined $self->{inline}->{$2};
  1.1762 +	}
  1.1763 +
  1.1764 +	$self->{options}{'placeholder'} =~ /^\s*(.*)\s*$/s;
  1.1765 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1766 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1767 +		$self->{placeholder}->{$2} = $1 || "";
  1.1768 +	}
  1.1769 +	$self->{options}{'_default_placeholder'} =~ /^\s*(.*)\s*$/s;
  1.1770 +	foreach my $tag (split(/\s+/s,$1)) {
  1.1771 +		$tag =~ m/^(.*?)(<.*)$/;
  1.1772 +		$self->{placeholder}->{$2} = $1 || ""
  1.1773 +			unless    $list_nodefault{$2}
  1.1774 +			       or defined $self->{placeholder}->{$2};
  1.1775 +	}
  1.1776 +
  1.1777 +	# There should be no translated and untranslated tags
  1.1778 +	foreach my $tag (keys %{$self->{translated}}) {
  1.1779 +		die wrap_mod("po4a::xml",
  1.1780 +		             dgettext("po4a",
  1.1781 +		                      "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated")
  1.1782 +			if defined $self->{untranslated}->{$tag};
  1.1783 +	}
  1.1784 +	# There should be no inline, break, and placeholder tags
  1.1785 +	foreach my $tag (keys %{$self->{inline}}) {
  1.1786 +		die wrap_mod("po4a::xml",
  1.1787 +		             dgettext("po4a",
  1.1788 +		                      "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break")
  1.1789 +			if defined $self->{break}->{$tag};
  1.1790 +		die wrap_mod("po4a::xml",
  1.1791 +		             dgettext("po4a",
  1.1792 +		                      "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder")
  1.1793 +			if defined $self->{placeholder}->{$tag};
  1.1794 +	}
  1.1795 +	foreach my $tag (keys %{$self->{break}}) {
  1.1796 +		die wrap_mod("po4a::xml",
  1.1797 +		             dgettext("po4a",
  1.1798 +		                      "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder")
  1.1799 +			if defined $self->{placeholder}->{$tag};
  1.1800 +	}
  1.1801 +}
  1.1802 +
  1.1803 +=head2 GETTING TEXT FROM THE INPUT DOCUMENT
  1.1804 +
  1.1805 +=over
  1.1806 +
  1.1807 +=item get_string_until($%)
  1.1808 +
  1.1809 +This function returns an array with the lines (and references) from the input
  1.1810 +document until it finds the first argument.  The second argument is an options
  1.1811 +hash. Value 0 means disabled (the default) and 1, enabled.
  1.1812 +
  1.1813 +The valid options are:
  1.1814 +
  1.1815 +=over 4
  1.1816 +
  1.1817 +=item include
  1.1818 +
  1.1819 +This makes the returned array to contain the searched text
  1.1820 +
  1.1821 +=item remove
  1.1822 +
  1.1823 +This removes the returned stream from the input
  1.1824 +
  1.1825 +=item unquoted
  1.1826 +
  1.1827 +This ensures that the searched text is outside any quotes
  1.1828 +
  1.1829 +=back
  1.1830 +
  1.1831 +=cut
  1.1832 +
  1.1833 +sub get_string_until {
  1.1834 +	my ($self,$search) = (shift,shift);
  1.1835 +	my $options = shift;
  1.1836 +	my ($include,$remove,$unquoted, $regex) = (0,0,0,0);
  1.1837 +
  1.1838 +	if (defined($options->{include})) { $include = $options->{include}; }
  1.1839 +	if (defined($options->{remove})) { $remove = $options->{remove}; }
  1.1840 +	if (defined($options->{unquoted})) { $unquoted = $options->{unquoted}; }
  1.1841 +	if (defined($options->{regex})) { $regex = $options->{regex}; }
  1.1842 +
  1.1843 +	my ($line,$ref) = $self->shiftline();
  1.1844 +	my (@text,$paragraph);
  1.1845 +	my ($eof,$found) = (0,0);
  1.1846 +
  1.1847 +	$search = "\Q$search\E" unless $regex;
  1.1848 +	while (defined($line) and !$found) {
  1.1849 +		push @text, ($line,$ref);
  1.1850 +		$paragraph .= $line;
  1.1851 +		if ($unquoted) {
  1.1852 +			if ( $paragraph =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s ) {
  1.1853 +				$found = 1;
  1.1854 +			}
  1.1855 +		} else {
  1.1856 +			if ( $paragraph =~ /$search/s ) {
  1.1857 +				$found = 1;
  1.1858 +			}
  1.1859 +		}
  1.1860 +		if (!$found) {
  1.1861 +			($line,$ref)=$self->shiftline();
  1.1862 +		}
  1.1863 +	}
  1.1864 +
  1.1865 +	if (!defined($line)) { $eof = 1; }
  1.1866 +
  1.1867 +	if ( $found ) {
  1.1868 +		$line = "";
  1.1869 +		if($unquoted) {
  1.1870 +			$paragraph =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s;
  1.1871 +			$line = $1;
  1.1872 +			$text[$#text-1] =~ s/\Q$line\E$//s;
  1.1873 +		} else {
  1.1874 +			$paragraph =~ /$search(.*)$/s;
  1.1875 +			$line = $1;
  1.1876 +			$text[$#text-1] =~ s/\Q$line\E$//s;
  1.1877 +		}
  1.1878 +		if(!$include) {
  1.1879 +			$text[$#text-1] =~ /^(.*)($search.*)$/s;
  1.1880 +			$text[$#text-1] = $1;
  1.1881 +			$line = $2.$line;
  1.1882 +		}
  1.1883 +		if (defined($line) and ($line ne "")) {
  1.1884 +			$self->unshiftline ($line,$text[$#text]);
  1.1885 +		}
  1.1886 +	}
  1.1887 +	if (!$remove) {
  1.1888 +		$self->unshiftline (@text);
  1.1889 +	}
  1.1890 +
  1.1891 +	#If we get to the end of the file, we return the whole paragraph
  1.1892 +	return ($eof,@text);
  1.1893 +}
  1.1894 +
  1.1895 +=item skip_spaces(\@)
  1.1896 +
  1.1897 +This function receives as argument the reference to a paragraph (in the format
  1.1898 +returned by get_string_until), skips his heading spaces and returns them as
  1.1899 +a simple string.
  1.1900 +
  1.1901 +=cut
  1.1902 +
  1.1903 +sub skip_spaces {
  1.1904 +	my ($self,$pstring)=@_;
  1.1905 +	my $space="";
  1.1906 +
  1.1907 +	while (@$pstring and (@$pstring[0] =~ /^(\s+)(.*)$/s or @$pstring[0] eq "")) {
  1.1908 +		if (@$pstring[0] ne "") {
  1.1909 +			$space .= $1;
  1.1910 +			@$pstring[0] = $2;
  1.1911 +		}
  1.1912 +
  1.1913 +		if (@$pstring[0] eq "") {
  1.1914 +			shift @$pstring;
  1.1915 +			shift @$pstring;
  1.1916 +		}
  1.1917 +	}
  1.1918 +	return $space;
  1.1919 +}
  1.1920 +
  1.1921 +=item join_lines(@)
  1.1922 +
  1.1923 +This function returns a simple string with the text from the argument array
  1.1924 +(discarding the references).
  1.1925 +
  1.1926 +=cut
  1.1927 +
  1.1928 +sub join_lines {
  1.1929 +	my ($self,@lines)=@_;
  1.1930 +	my ($line,$ref);
  1.1931 +	my $text = "";
  1.1932 +	while ($#lines > 0) {
  1.1933 +		($line,$ref) = (shift @lines,shift @lines);
  1.1934 +		$text .= $line;
  1.1935 +	}
  1.1936 +	return $text;
  1.1937 +}
  1.1938 +
  1.1939 +=back
  1.1940 +
  1.1941 +=head1 STATUS OF THIS MODULE
  1.1942 +
  1.1943 +This module can translate tags and attributes.
  1.1944 +
  1.1945 +=head1 TODO LIST
  1.1946 +
  1.1947 +DOCTYPE (ENTITIES)
  1.1948 +
  1.1949 +There is a minimal support for the translation of entities. They are
  1.1950 +translated as a whole, and tags are not taken into account. Multilines
  1.1951 +entities are not supported and entities are always rewrapped during the
  1.1952 +translation.
  1.1953 +
  1.1954 +MODIFY TAG TYPES FROM INHERITED MODULES
  1.1955 +(move the tag_types structure inside the $self hash?)
  1.1956 +
  1.1957 +=head1 SEE ALSO
  1.1958 +
  1.1959 +L<po4a(7)|po4a.7>, L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>.
  1.1960 +
  1.1961 +=head1 AUTHORS
  1.1962 +
  1.1963 + Jordi Vilalta <jvprat@gmail.com>
  1.1964 + Nicolas François <nicolas.francois@centraliens.net>
  1.1965 +
  1.1966 +=head1 COPYRIGHT AND LICENSE
  1.1967 +
  1.1968 + Copyright (c) 2004 by Jordi Vilalta  <jvprat@gmail.com>
  1.1969 + Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
  1.1970 +
  1.1971 +This program is free software; you may redistribute it and/or modify it
  1.1972 +under the terms of GPL (see the COPYING file).
  1.1973 +
  1.1974 +=cut
  1.1975 +
  1.1976 +1;
author	Dongsheng Song <dongsheng.song@gmail.com>
date	Thu Mar 12 15:43:56 2009 +0800 (2009-03-12)
parents
children